From 3b6d9bdac6f4318e80970317a05ba8d01449cf12 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 09:58:11 +0200 Subject: [PATCH 01/55] docs: add pro runtime hardening workboard --- README.md | 13 +- docs/pro-runtime-hardening-workboard.md | 506 ++++++++++++++++++++++++ 2 files changed, 517 insertions(+), 2 deletions(-) create mode 100644 docs/pro-runtime-hardening-workboard.md diff --git a/README.md b/README.md index 91fe5cd..7cdf68d 100644 --- a/README.md +++ b/README.md @@ -93,12 +93,21 @@ internal/ rds/ RDS encoder (IEC 62106, CRC, differential, group scheduler) stereo/ stereo encoder (19 kHz pilot, 38 kHz DSB-SC) docs/ - config.sample.json default config - config.plutosdr.json PlutoSDR-specific config + config.sample.json default config + config.plutosdr.json PlutoSDR-specific config + pro-runtime-hardening-workboard.md living workboard for pro runtime hardening scripts/ examples/ ``` +## Planning / workboard + +For the current pro-runtime-hardening track, see: + +- `docs/pro-runtime-hardening-workboard.md` + +This document is the detailed working board for status tracking, confirmed findings, open decisions, verification notes, and implementation progress. + ## Legal note This project is intended only for lawful use within relevant license and regulatory constraints. diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md new file mode 100644 index 0000000..1f46138 --- /dev/null +++ b/docs/pro-runtime-hardening-workboard.md @@ -0,0 +1,506 @@ +# Pro Runtime Hardening Workboard + +Status: living document +Branch: `feature/pro-runtime-hardening` + +Dieses Dokument ist das **Arbeitsdokument** zur schrittweisen Umsetzung des Konzepts aus `fm-rds-tx_pro_runtime_hardening_concept.json`. 
+ +Ziel ist **nicht** nur eine hübsche Roadmap, sondern ein Ort, an dem wir konkret markieren können: +- **wo** wir im Code stehen, +- **welche Lücken** bestätigt sind, +- **welche Entscheidungen** gefallen sind, +- **welche Arbeiten** offen / in Arbeit / erledigt sind, +- **welche Risiken** noch bestehen, +- **welche Akzeptanzkriterien** wirklich nachgewiesen wurden. + +--- + +## 1. Arbeitsregeln für dieses Dokument + +### Statuswerte +- `TODO` → noch nicht begonnen +- `IN PROGRESS` → aktiv in Arbeit +- `BLOCKED` → sinnvoll erkannt, aber blockiert +- `DONE` → umgesetzt +- `VERIFIED` → umgesetzt **und** sinnvoll geprüft +- `DEFERRED` → bewusst nach hinten verschoben +- `REJECTED` → bewusst verworfen + +### Nachweispflicht +Ein Punkt gilt erst als wirklich fertig, wenn eingetragen ist: +1. **Code-Ort(e)** +2. **Was geändert wurde** +3. **Wie verifiziert wurde** +4. **Welche Restrisiken bleiben** + +### Update-Regel +Wenn wir an einem Workstream arbeiten, soll dieses Dokument mitgezogen werden. +Kein „ist im Kopf klar“. Der Stand kommt hier rein. + +--- + +## 2. 
Gesamtüberblick + +## Gesamtstatus +- Projektphase: `Planung / Strukturierung` +- Technischer Fokus aktuell: `noch offen` +- Nächster sinnvoller Startpunkt laut Konzept: `WS-03 Semantische Korrektheit und harte Config-/Runtime-Konsistenz` + +## Repo-bezogene bestätigte Ausgangslage + +| Thema | Status | Notiz | +|---|---|---| +| TX-Engine aktuell als synchroner Single-Loop | CONFIRMED | `internal/app/engine.go` | +| Persistenter DSP-Zustand im Generator vorhanden | CONFIRMED | `internal/offline/generator.go` | +| HTTP-Control vorhanden | CONFIRMED | `internal/control/control.go` | +| Config-Validation vorhanden, aber nicht überall semantisch konsistent | CONFIRMED | `internal/config/config.go` + Runtime-Pfade | +| Device/Capability-Modell vorhanden, aber noch nicht streng genug | CONFIRMED | `internal/platform/soapy.go` | +| Lock-freier SPSC-Audio-Ringbuffer vorhanden | CONFIRMED | `internal/audio/stream.go` | + +## Bereits bekannte bestätigte Inkonsistenzen + +| ID | Status | Beschreibung | Ort | +|---|---|---|---| +| CFG-SEM-001 | CONFIRMED | `fm.outputDrive` wird in Validation und Runtime nicht konsistent behandelt | `internal/config/config.go`, `internal/app/engine.go` | +| CTL-UX-001 | CONFIRMED | `handleAudioStream()` referenziert `--audio-http`, was CLI-seitig überprüft werden sollte | `internal/control/control.go` | + +--- + +## 3. Prioritätenmodell + +| Priorität | Bedeutung | +|---|---| +| P0 | Technische Perfektion und Determinismus | +| P1 | Betriebssicherheit und Fehlerbeherrschung | +| P2 | Hardware-Wahrheit und RF-Qualität | +| P3 | Sichere und saubere Runtime-Steuerung | +| P4 | Deployment-, Release- und Service-Reife | + +--- + +## 4. Umsetzungstracker nach Workstream + +# WS-03 — Semantische Korrektheit und harte Config-/Runtime-Konsistenz +**Priorität:** P0 +**Gesamtstatus:** TODO + +## Ziel +Ein einziger, eindeutig definierter Parameterraum. 
Jeder Wert hat exakt eine Bedeutung und identische Constraints in Config, HTTP-API, Runtime und Telemetrie. + +## Warum dieser Workstream zuerst +Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere Runtime- und Fault-Mechanismen auf unstabilem Fundament. + +## Aufgaben + +### WS-03-T1 — Parameterinventar erstellen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/config/config.go` + - `internal/app/engine.go` + - `internal/control/control.go` + - ggf. weitere betroffene Pakete +- **Ziel:** + Alle öffentlich und intern verwendeten Parameter inventarisieren mit: + - Name + - Typ + - Einheit + - Bereich + - Default + - hot-reload-fähig ja/nein + - safety class + - Telemetrie-Name +- **Offene Fragen:** + - Wo leben heute implizite Parameter, die nicht sauber dokumentiert sind? + - Welche Runtime-Werte sind abgeleitet statt direkt konfigurierbar? +- **Nachweis:** + - Parameterinventar im Repo vorhanden + - referenzierbar für Config/API/Runtime +- **Restrisiken:** + - versteckte Semantik in Helper-Funktionen übersehen + +### WS-03-T2 — Validation vereinheitlichen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/config/config.go` + - `internal/app/engine.go` + - `internal/control/control.go` +- **Ziel:** + `Config.Validate()`, Runtime-Update-Pfade und API-Patch-Validierung dürfen nicht divergieren. +- **Bereits bekannter Startpunkt:** + - `fm.outputDrive` +- **Nachweis:** + - bekannte Inkonsistenzen beseitigt + - Tests für gemeinsame Grenzwerte vorhanden +- **Restrisiken:** + - weitere Inkonsistenzen erst beim Inventar sichtbar + +### WS-03-T3 — DesiredConfig / AppliedConfig einführen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/app/engine.go` + - `internal/control/control.go` + - ggf. 
Config-/Statusmodelle +- **Ziel:** + API und Runtime sollen trennen zwischen: + - gewünschter Konfiguration + - tatsächlich angewandter Konfiguration + - aktuellem Runtime-Zustand +- **Nachweis:** + - API kann beide Sichten getrennt ausgeben + - partielle oder abgelehnte Übernahmen werden sichtbar +- **Restrisiken:** + - unsaubere Migration bestehender Statusantworten + +## WS-03 Entscheidungslog +- Noch leer + +## WS-03 Verifikation +- Noch leer + +--- + +# WS-01 — Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer +**Priorität:** P0 +**Gesamtstatus:** TODO + +## Ziel +Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, kontrolliertem Frame-Puffer betrieben. + +## Aktueller Stand +- Der TX-Pfad ist laut Konzept aktuell noch synchron gekoppelt: + `GenerateFrame -> optional FMUpsampler.Process -> driver.Write` +- Das ist elegant, aber nicht pro-level-hart gegenüber Write-Spikes und Blockaden. + +## Aufgaben + +### WS-01-T1 — FrameQueue einführen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/app/engine.go` + - ggf. neues internes Queue-Modul + - `internal/output/*` +- **Ziel:** + Bounded Queue mit fester Kapazität, sichtbarem Füllstand und Countern. +- **Zu entscheiden:** + - Puffern vor oder nach Upsampling? + - Referenzentscheidung im Konzept: eher Device-Frame-Ebene +- **Akzeptanzpunkte:** + - keine unbounded queue + - Fill-Level live sichtbar + - Drop/Repeat/Mute niemals ohne Counter/Log + +### WS-01-T2 — Writer-Worker einführen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/app/engine.go` + - `internal/platform/*` +- **Ziel:** + Nur noch ein dedizierter Worker besitzt `driver.Write()`. 
+- **Akzeptanzpunkte:** + - Write-Latenz pro Frame messbar + - Timinginteraktionen klar isoliert + +### WS-01-T3 — Supervisor-Schicht einführen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/app/engine.go` +- **Ziel:** + Queue-Füllstand, Late-Rate und Fehlerhäufigkeit überwachen und in Runtime-Zustände überführen. +- **Akzeptanzpunkte:** + - State-Entscheidungen sind explizit + - kein implizites Weiterwursteln bei Schieflage + +## Offene Architekturfragen +- Ist `capacity_frames = 3` ein guter Startwert oder nur Konzept-Default? +- Sollte im Fault-Fall `repeat last safe frame` erlaubt sein oder von Anfang an nur `mute`? +- Wie eng koppeln wir WS-01 mit WS-02, ohne Overengineering zu erzeugen? + +## WS-01 Entscheidungslog +- Noch leer + +## WS-01 Verifikation +- Noch leer + +--- + +# WS-02 — Explizite Runtime-State-Maschine und Fault-Handling +**Priorität:** P0 +**Gesamtstatus:** TODO + +## Ziel +Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständen. + +## Zielzustände laut Konzept +- `idle` +- `arming` +- `prebuffering` +- `running` +- `degraded` +- `muted` +- `faulted` +- `stopping` + +## Aufgaben + +### WS-02-T1 — Fault-Klassifikation definieren +- **Status:** TODO +- **Owner:** offen +- **Beispiele:** + - Treiberfehler + - Write-Time-Budget überschritten + - Queue leer + - Queue dauerhaft kritisch + - Selbsttest fehlgeschlagen + - unerlaubtes Live-Update + +### WS-02-T2 — Reaktionsstrategie definieren +- **Status:** TODO +- **Owner:** offen +- **Ziel:** + Pro Fehlerklasse klar definieren: + - warn only + - degraded + - muted + - faulted + +### WS-02-T3 — Fault-Historie und Event-Log einführen +- **Status:** TODO +- **Owner:** offen +- **Ziel:** + Zustandswechsel und Faults auditierbar machen. + +## Offene Designfragen +- Wie fein granular darf die State-Maschine werden, ohne unwartbar zu werden? +- Welche Transitionen sind wirklich produktiv relevant und welche nur „theoretisch schön“? 
+ +## WS-02 Entscheidungslog +- Noch leer + +## WS-02 Verifikation +- Noch leer + +--- + +# WS-04 — Observability, Telemetrie und Diagnosefähigkeit +**Priorität:** P1 +**Gesamtstatus:** TODO + +## Ziel +Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests und API-Aktivität schaffen. + +## Aufgaben + +### WS-04-T1 — Strukturiertes Logging +- **Status:** TODO +- **Owner:** offen + +### WS-04-T2 — Prometheus-/Metrics-Schicht +- **Status:** TODO +- **Owner:** offen + +### WS-04-T3 — Debug-/Profiling-Endpunkte +- **Status:** TODO +- **Owner:** offen + +## Gewünschte Beispielmetriken +- `engine_chunks_generated_total` +- `engine_late_buffers_total` +- `engine_fault_transitions_total` +- `writer_write_duration_seconds` +- `queue_fill_ratio` +- `queue_dropped_frames_total` +- `queue_muted_frames_total` +- `driver_write_errors_total` +- `audio_stream_underruns_total` +- `audio_stream_overflows_total` +- `rf_selftest_pilot_db` +- `rf_selftest_rds_57k_db` + +## WS-04 Entscheidungslog +- Noch leer + +## WS-04 Verifikation +- Noch leer + +--- + +# WS-05 — Sichere und erwachsene Control-Plane +**Priorität:** P1 / P3-nah +**Gesamtstatus:** TODO + +## Ziel +API transport- und anwendungsseitig härten, state-aware machen und auditierbar gestalten. 
+ +## Aufgaben + +### WS-05-T1 — Auth und Deploy-Modi definieren +- **Status:** TODO +- **Owner:** offen +- **Zielmodi:** + - localhost-only + - trusted-lan + - secured-remote + +### WS-05-T2 — HTTP-Server härten +- **Status:** TODO +- **Owner:** offen +- **Mindestpunkte:** + - ReadTimeout + - WriteTimeout + - IdleTimeout + - ReadHeaderTimeout + - Body-Size-Limits + - Content-Type-Validierung + - Method Enforcement + +### WS-05-T3 — API semantisch aufräumen +- **Status:** TODO +- **Owner:** offen +- **Ziel:** + - DesiredConfig vs AppliedConfig vs RuntimeState + - idempotente Start/Stop-Endpunkte + - transaktionsartige Apply-/Reject-Antworten + - Audit-Log pro Eingriff + +## Frühe Quick-Wins +Diese Punkte könnten ggf. vorgezogen werden, auch wenn WS-05 formal nach WS-01/02 kommt: +- HTTP-Timeouts +- Body-Limits +- sicherer Standard-Bind-Modus + +## WS-05 Entscheidungslog +- Noch leer + +## WS-05 Verifikation +- Noch leer + +--- + +# WS-06 — Hardware-in-the-loop und externe RF-Wahrheitsprüfung +**Priorität:** P2 +**Gesamtstatus:** TODO + +## Ziel +Nicht nur intern richtig rechnen, sondern extern nachweisen, dass tatsächlich korrekt gesendet wird. + +## Status +- Konzept vorhanden +- noch kein eingetragener HIL-Arbeitsstand in diesem Dokument + +## Offene Kernfragen +- Welches Referenz-Setup wird verbindlich? +- Welche Testfrequenz / Standarddauer / Schutzmaßnahmen gelten? +- Welcher externe Decoder / Empfänger gilt als Referenz? + +--- + +# WS-07 — Device-aware Capability- und Kalibrierungsmodell +**Priorität:** P2 +**Gesamtstatus:** TODO + +## Ziel +Fähigkeiten und Kalibrierungen nicht implizit, sondern explizit pro Device modellieren. + +## Noch offen +- Capability-Schema konkretisieren +- Kalibrierungsprofil definieren +- Device-aware Validation einbauen + +--- + +# WS-08 — Signal-Selbstüberwachung im Betrieb +**Priorität:** P2 +**Gesamtstatus:** TODO + +## Ziel +Pilot, Stereo, RDS und Composite-Anomalien im Betrieb erkennen. 
+ +## Noch offen +- Goertzel/FFT-Strategie festlegen +- Schwellwerte definieren +- in Fault-Logik einspeisen + +--- + +# WS-09 — Teststrategie erweitern +**Priorität:** P3/P4-nah +**Gesamtstatus:** TODO + +## Ziel +Von Unit-Tests zu echter Qualitätsabsicherung: Golden Vectors, Long-Run, Race, Fuzzing, API-Mutation, HIL. + +## Noch offen +- Testpyramide konkretisieren +- Nightly-/CI-Fähigkeit bestimmen + +--- + +# WS-10 — Service-Reife, Packaging und Reproduzierbarkeit +**Priorität:** P4 +**Gesamtstatus:** TODO + +## Ziel +Build-, Release- und Betriebsartefakte reproduzierbar und teamtauglich machen. + +## Noch offen +- Build-Metadaten +- Service-Units +- Config-Versionierung / Migration + +--- + +## 5. Übergreifende Regeln + +### Musts +- Jeder neue Runtime-Zustand muss per API und Telemetrie sichtbar sein. +- Jede Recovery-, Drop- oder Mute-Strategie braucht Counter, Logs und Tests. +- Keine neue Config-Option ohne klaren Typ, Bereich, Einheit, Default und Hot-Reload-Klassifikation. +- Hardware-nahe Änderungen brauchen mindestens Simulations- und HIL-Validierung. +- Alle Faults müssen eine maschinenlesbare Ursache und eine menschenlesbare Zusammenfassung haben. + +### Must Not +- Keine unbounded Queues. +- Keine stillen Fallbacks ohne Telemetrie. +- Keine teilweise angewandten Live-Config-Änderungen ohne explizite Rückmeldung. +- Keine unterschiedlichen Grenzwerte zwischen Config, API und Runtime. +- Keine sicherheitsrelevanten HTTP-Endpunkte ohne Härtung im Remote-Betrieb. + +--- + +## 6. Aktuelle offene Entscheidungen + +| ID | Status | Frage | Notiz | +|---|---|---|---| +| DEC-001 | OPEN | Puffern wir auf CompositeFrame- oder DeviceFrame-Ebene? | Konzept empfiehlt Device-Frame-Ebene | +| DEC-002 | OPEN | Fault-Recovery zuerst mit `mute`, `repeat last safe frame` oder beidem? | Muss technisch und RF-seitig sauber bewertet werden | +| DEC-003 | OPEN | Ziehen wir minimale WS-05-Basis-Härtungen vor? | Timeouts/Body-Limits evtl. 
früher sinnvoll | +| DEC-004 | OPEN | Wie gross/simpel halten wir die erste State-Maschine? | Gefahr von Overengineering | + +--- + +## 7. Nächste sinnvolle Schritte + +### Empfohlener Start +1. **WS-03-T1 Parameterinventar erstellen** +2. **bekannte Inkonsistenzen (CFG-SEM-001, CTL-UX-001) konkret verifizieren** +3. **DesiredConfig / AppliedConfig / RuntimeState Zielmodell grob skizzieren** +4. Danach Architekturarbeit an **WS-01 + WS-02** starten + +### Vor dem ersten grossen Umbau klären +- Was ist „minimal sinnvoll“ für Milestone 1? +- Welche Dinge sind harte Must-haves und welche nur spätere Veredelung? +- Wo wollen wir bewusst nicht sofort maximal abstrahieren? + +--- + +## 8. Änderungsprotokoll + +| Datum | Änderung | Person / Agent | +|---|---|---| +| 2026-04-05 | Initiales Arbeitsdokument aus `fm-rds-tx_pro_runtime_hardening_concept.json` erstellt | Alfred | From bd3ddb86cde19b10ab6370ae8ae60da3a6bea9c6 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 11:16:16 +0200 Subject: [PATCH 02/55] docs: refresh README for current runtime and control surface --- README.md | 316 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 259 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 7cdf68d..d2df8ad 100644 --- a/README.md +++ b/README.md @@ -1,101 +1,290 @@ # fm-rds-tx -Go-based FM stereo transmitter with RDS. Supports ADALM-Pluto (PlutoSDR) and any SoapySDR-compatible TX device. +Go-based FM stereo transmitter with RDS. Supports ADALM-Pluto (PlutoSDR) and SoapySDR-compatible TX devices. 
-## Status: v0.7.0-pre — hardware bring-up milestone +## Status -### What works -- Complete DSP chain: pre-emphasis → stereo encoding → RDS (IEC 62106) → MPX → limiter → FM modulation -- Real hardware TX via SoapySDR CGO binding (PlutoSDR tested) -- Continuous TX engine with Start/Stop/Stats -- IQ resampling (composite rate → device rate) -- HTTP control plane with /tx/start, /tx/stop, /runtime -- 82 passing tests including spectral verification +**Current status:** `v0.7.0-pre` — hardware bring-up milestone -### Signal path -``` -Audio Source → PreEmphasis(50µs) → StereoEncoder(19k+38k) → RDS(57k) -→ MPX Combiner → Limiter → FM Modulator(±75kHz) -→ IQ Resample(228k→528k) → SoapySDR → PlutoSDR RF +What is already in place: +- complete DSP chain: audio -> pre-emphasis -> stereo encoding -> RDS -> MPX -> limiter -> FM modulation +- real hardware TX paths for PlutoSDR / SoapySDR backends +- continuous TX engine with runtime telemetry +- dry-run, offline generation, and simulated TX modes +- HTTP control plane with live config patching and runtime/status endpoints +- browser UI on `/` +- live audio ingestion via stdin or HTTP stream input + +## Signal path + +```text +Audio Source -> PreEmphasis(50us/75us/off) -> StereoEncoder(19k + 38k DSB-SC) +-> RDS(57k BPSK) -> MPX Combiner -> Limiter -> FM Modulator(+/-75kHz) +-> optional split-rate FM upsampling -> SDR backend -> RF output ``` +For deeper DSP details, see: +- `docs/DSP-CHAIN.md` + +## Prerequisites + +### Go +- Go version from `go.mod` (currently Go 1.22) + +### Native SDR dependencies +Depending on backend, native libraries are required: + +- **SoapySDR backend** + - build with `-tags soapy` + - requires SoapySDR native library (`SoapySDR.dll` / `libSoapySDR.so` / `libSoapySDR.dylib`) + - on Windows, PothosSDR is the expected setup + +- **Pluto backend** + - uses native `libiio` + - Windows expects `libiio.dll` + - Linux build/runtime expects `pkg-config` + `libiio` + +### Hardware / legal +- validate RF output, 
deviation, filtering, and power with proper measurement equipment +- use only within applicable legal and regulatory constraints + +## Quick start + ## Build ```powershell -# Without hardware (simulation/offline only): +# Build CLI tools without hardware-specific build tags: go build ./cmd/fmrtx go build ./cmd/offline -# With SoapySDR hardware support (requires PothosSDR installed): +# Build fmrtx with SoapySDR support: go build -tags soapy ./cmd/fmrtx ``` -## Usage +## Quick verification + +```powershell +# Print effective config +go run ./cmd/fmrtx -print-config + +# Run tests +go test ./... + +# Basic dry-run summary +go run ./cmd/fmrtx --dry-run --dry-output build/dryrun/frame.json +``` + +For additional build/test commands, see: +- `docs/README.md` + +## Common usage flows + +### 1) List available SDR devices -### List available SDR devices ```powershell .\fmrtx.exe --list-devices ``` -### Offline IQ file generation +### 2) Dry-run / config verification + ```powershell .\fmrtx.exe --dry-run --dry-output build/dryrun/frame.json + +# Write dry-run JSON to stdout +.\fmrtx.exe --dry-run --dry-output - +``` + +### 3) Offline IQ/composite generation + +```powershell go run ./cmd/offline -duration 2s -output build/offline/composite.iqf32 + +# Optional output rate override +go run ./cmd/offline -duration 500ms -output build/offline/composite.iqf32 -output-rate 228000 +``` + +### 4) Simulated transmit path + +```powershell +go run ./cmd/fmrtx --simulate-tx --simulate-output build/sim/simulated-soapy.iqf32 --simulate-duration 250ms ``` -### Real TX (PlutoSDR) +### 5) Real TX with config file + ```powershell -# Start with manual TX control via HTTP: +# Start TX service with manual start over HTTP .\fmrtx.exe --tx --config docs/config.plutosdr.json -# Start with auto-TX on launch: +# Start and begin transmitting immediately .\fmrtx.exe --tx --tx-auto-start --config docs/config.plutosdr.json ``` -### HTTP control +### 6) Live audio via stdin + +```powershell +ffmpeg -i 
"http://svabi.ch:8443/stream" -f s16le -ar 44100 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --config docs/config.plutosdr.json +``` + +### 7) Custom audio input rate + +```powershell +ffmpeg -i source.wav -f s16le -ar 48000 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --audio-rate 48000 --config docs/config.plutosdr.json +``` + +## CLI overview + +## `fmrtx` +Important runtime modes and flags include: +- `--tx` +- `--tx-auto-start` +- `--dry-run` +- `--dry-output ` +- `--simulate-tx` +- `--simulate-output ` +- `--simulate-duration ` +- `--config ` +- `--print-config` +- `--list-devices` +- `--audio-stdin` +- `--audio-rate ` + +## `offline` +Useful flags include: +- `-duration ` +- `-output ` +- `-output-rate ` + +If the README is too high-level for the exact CLI surface, check: +- `cmd/fmrtx/main.go` +- `cmd/offline/main.go` + +## HTTP control plane + +Base URL: `http://{listenAddress}` (default typically `127.0.0.1:8088`) + +### Main endpoints + +```text +GET / browser UI +GET /healthz health check +GET /status current config/status snapshot +GET /runtime live engine/driver/audio telemetry +GET /config full config +POST /config patch config / live updates +GET /dry-run synthetic frame summary +POST /tx/start start transmission +POST /tx/stop stop transmission +POST /audio/stream push raw S16LE stereo PCM into live stream buffer ``` -POST http://localhost:8088/tx/start → start transmission -POST http://localhost:8088/tx/stop → stop transmission -GET http://localhost:8088/runtime → engine + driver telemetry -GET http://localhost:8088/status → config status -GET http://localhost:8088/config → full config -POST http://localhost:8088/config → patch config (freq, RDS, etc.) 
-GET http://localhost:8088/dry-run → dry-run summary -GET http://localhost:8088/healthz → health check + +### What the control plane covers +- TX start / stop +- runtime status and driver telemetry +- config inspection +- live patching of selected parameters +- dry-run inspection +- browser-accessible control UI +- optional HTTP audio ingest + +### Live config notes +`POST /config` supports live updates for selected fields such as: +- frequency +- stereo enable/disable +- pilot / RDS injection levels +- RDS enable/disable +- limiter settings +- PS / RadioText + +Some parameters are saved but not live-applied and require restart. + +For the full API contract, examples, live-patch semantics, and `/audio/stream` details, see: +- `docs/API.md` + +## Configuration + +Sample configs: +- `docs/config.sample.json` +- `docs/config.plutosdr.json` +- `docs/config.orangepi-pluto-soapy.json` + +Important config areas include: +- `fm.*` +- `rds.*` +- `audio.*` +- `backend.*` +- `control.*` + +Examples of relevant fields you may want to inspect: +- `fm.outputDrive` +- `fm.mpxGain` +- `fm.bs412Enabled` +- `fm.bs412ThresholdDBr` +- `fm.fmModulationEnabled` +- `backend.kind` +- `backend.driver` +- `backend.deviceArgs` +- `backend.uri` +- `backend.deviceSampleRateHz` +- `backend.outputPath` +- `control.listenAddress` + +For deeper config/API behavior, refer to: +- `internal/config/config.go` +- `docs/API.md` +- `docs/config.sample.json` + +## Development and testing + +Useful commands: + +```powershell +go test ./... 
+go run ./cmd/fmrtx -print-config +go run ./cmd/fmrtx -config docs/config.sample.json +go run ./cmd/fmrtx --dry-run --dry-output build/dryrun/frame.json +go run ./cmd/fmrtx --simulate-tx --simulate-output build/sim/simulated-soapy.iqf32 --simulate-duration 250ms +go run ./cmd/offline -duration 500ms -output build/offline/composite.iqf32 ``` -## PlutoSDR notes +See also: +- `docs/README.md` -- Device rate: 528 kHz (PlutoSDR minimum ~521 kHz) -- IQ format: CF32 (float32 interleaved I/Q) -- Gain range: 0–89 dB (`outputDrive` 0..1 maps to 0..89 dB) -- SoapySDR driver name: `plutosdr` -- Requires: PothosSDR or SoapySDR + SoapyPlutoSDR plugin installed +## PlutoSDR / backend notes + +- PlutoSDR commonly runs with a device-side sample rate above composite rate, so split-rate mode may be used automatically +- SoapySDR backend is suitable for Soapy-compatible TX hardware +- backend/device settings are selected through config rather than hardcoded paths +- runtime telemetry should be used to inspect effective TX state during operation ## Repository layout ```text cmd/ - fmrtx/ main CLI (--tx, --dry-run, --simulate-tx, --list-devices) - offline/ offline IQ file generator + fmrtx/ main CLI + offline/ offline generator internal/ - app/ TX engine (continuous chunk loop) + simulated transmit - audio/ sample types, WAV loader, resampler, tone generator - config/ config schema, validation, PI parsing - control/ HTTP control plane (/tx/start, /tx/stop, /runtime) - dryrun/ JSON dry-run summaries - dsp/ oscillator, pre-emphasis, FM modulator, limiter, Goertzel, IQ resampler - mpx/ MPX combiner - offline/ offline composite generation (full DSP chain) - output/ backend abstractions (file, dummy) - platform/ SoapyDriver interface, SoapyBackend, SimulatedDriver - platform/soapysdr/ CGO SoapySDR native binding (build tag: soapy) - rds/ RDS encoder (IEC 62106, CRC, differential, group scheduler) - stereo/ stereo encoder (19 kHz pilot, 38 kHz DSB-SC) + app/ TX engine + runtime state + audio/ 
audio input, resampling, tone generation, stream buffering + config/ config schema and validation + control/ HTTP control plane + browser UI + dryrun/ dry-run JSON summaries + dsp/ DSP primitives + mpx/ MPX combiner + offline/ full offline composite generation + output/ output/backend abstractions + platform/ backend abstractions and device/runtime stats + platform/soapysdr/ CGO SoapySDR binding + platform/plutosdr/ Pluto/libiio backend code + rds/ RDS encoder + stereo/ stereo encoder docs/ - config.sample.json default config - config.plutosdr.json PlutoSDR-specific config - pro-runtime-hardening-workboard.md living workboard for pro runtime hardening + API.md + DSP-CHAIN.md + README.md + config.sample.json + config.plutosdr.json + config.orangepi-pluto-soapy.json + pro-runtime-hardening-workboard.md scripts/ examples/ ``` @@ -103,10 +292,23 @@ examples/ ## Planning / workboard For the current pro-runtime-hardening track, see: - - `docs/pro-runtime-hardening-workboard.md` -This document is the detailed working board for status tracking, confirmed findings, open decisions, verification notes, and implementation progress. 
+This is the living workboard for: +- status tracking +- confirmed findings +- open technical decisions +- verification notes +- implementation progress + +## Release / project docs + +Additional project docs: +- `CHANGELOG.md` +- `RELEASE.md` +- `docs/README.md` +- `docs/API.md` +- `docs/DSP-CHAIN.md` ## Legal note From 9c70795fe24243b2bce415ff2e673f5281fd7edb Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 11:24:24 +0200 Subject: [PATCH 03/55] docs: add pro runtime hardening concept --- fm-rds-tx_pro_runtime_hardening_concept.json | 831 +++++++++++++++++++ 1 file changed, 831 insertions(+) create mode 100644 fm-rds-tx_pro_runtime_hardening_concept.json diff --git a/fm-rds-tx_pro_runtime_hardening_concept.json b/fm-rds-tx_pro_runtime_hardening_concept.json new file mode 100644 index 0000000..c757a2d --- /dev/null +++ b/fm-rds-tx_pro_runtime_hardening_concept.json @@ -0,0 +1,831 @@ +{ + "document_type": "technical_concept", + "language": "de", + "audience": [ + "AI-Entwicklerteam", + "Reviewer", + "Maintainer" + ], + "project": { + "name": "fm-rds-tx", + "goal": "Aus dem bestehenden FM-Stereo/RDS-TX-System ein technisch sauberes, deterministisches, messbares und betriebsfestes Pro-Level-System machen.", + "primary_priority": "Technische Perfektion", + "secondary_priority": "Sinnvolle Umsetzungsreihenfolge mit maximalem Risikoabbau zuerst" + }, + "executive_summary": { + "current_strengths": [ + "Saubere Modultrennung zwischen Generator, DSP, Control, Audio, Backend und Plattform.", + "Persistenter DSP-Zustand ist bereits vorhanden, insbesondere im Generator und im FM-Upsampler.", + "Live-Updates werden bereits über atomare Snapshots bzw. Pointer modelliert.", + "Es existieren bereits viele Unit-Tests sowie spektrale Blackbox-Tests für 19 kHz, 38 kHz und 57 kHz." 
+ ], + "current_limitations": [ + "Der TX-Pfad in internal/app/engine.go ist noch ein einzelner synchroner Generate/Upsample/Write-Loop ohne entkoppelten Echtzeitpuffer.", + "Die Runtime-Recovery ist schwach: bei Fehlern wird nur gezählt, geloggt und gewartet; es gibt keinen expliziten Fault-State mit deterministischem Fallback.", + "Die Control-Plane in internal/control/control.go ist funktional, aber nicht hart genug: keine Authentisierung, keine Transporthärtung, keine Request-Limits, keine Timeouts, keine Audit-Trails.", + "Validation und Runtime-Semantik sind nicht überall deckungsgleich; Beispiel: fm.outputDrive wird in config.Validate() bis 10 akzeptiert, in Engine.UpdateConfig() aber nur bis 3.", + "Device-Abstraktion ist brauchbar, aber noch nicht streng genug capability- und kalibrierungsgetrieben.", + "Observability ist noch zu schwach für echten Dauerbetrieb und reproduzierbare Fehleranalyse." + ], + "core_statement": "Der DSP-Kern ist nah an ernsthaft brauchbar. Der Abstand zu Pro-Level liegt primär in Betriebssicherheit, Observability, Hardwarevalidierung und strenger Runtime-Kontrolle." + }, + "repo_grounding": { + "confirmed_code_touchpoints": [ + { + "path": "internal/app/engine.go", + "observation": "TX läuft aktuell in einer einzelnen Goroutine als synchroner Zyklus: GenerateFrame -> optional FMUpsampler.Process -> driver.Write. Kein echter Producer/Consumer-Puffer zwischen Generator und Hardwarewriter." + }, + { + "path": "internal/offline/generator.go", + "observation": "Generator hat bereits persistenten Zustand, LiveParams per atomic.Pointer und sinnvolle DSP-Kette inklusive optionalem BS.412-Limiter." + }, + { + "path": "internal/control/control.go", + "observation": "HTTP-Control existiert bereits, aber ohne sichtbare Authentisierung, ohne Server-Timeout-Konfiguration und ohne harte API-Grenzen." 
+ }, + { + "path": "internal/config/config.go", + "observation": "Validation ist vorhanden, aber semantisch nicht überall konsistent mit Laufzeitregeln." + }, + { + "path": "internal/platform/soapy.go", + "observation": "Capabilities und RuntimeStats existieren als Ansatz, reichen aber noch nicht für eine wirklich harte device-aware Steuerung." + }, + { + "path": "internal/audio/stream.go", + "observation": "Lock-freier SPSC-Ringbuffer für Live-Audio ist bereits vorhanden und kann als Referenz für deterministische Buffer-Designs dienen." + } + ], + "confirmed_inconsistencies": [ + { + "id": "CFG-SEM-001", + "description": "fm.outputDrive wird in Config.Validate() bis 10 akzeptiert, die Fehlermeldung spricht aber von 0..3, und Engine.UpdateConfig() erzwingt tatsächlich 0..3." + }, + { + "id": "CTL-UX-001", + "description": "handleAudioStream() nennt in der Fehlermeldung '--audio-http', im CLI ist dieser Schalter nicht als gleichwertiger offensichtlicher Bedienpfad bestätigt." + } + ] + }, + "design_principles": [ + "Kein verstecktes Glück: Jeder relevante Echtzeit- oder RF-Pfad muss deterministisch, messbar und reproduzierbar sein.", + "Fail-safe statt fail-weird: Bei Unsicherheit oder Überlast lieber definiert muten oder faulten als kaputt weiterzusenden.", + "Eine Runtime-Wahrheit: Konfiguration, Live-State und tatsächlich angewandte Hardware-/DSP-Parameter dürfen nicht auseinanderlaufen.", + "Hardware ist Wahrheit: IQ-Dateien und Unit-Tests reichen nicht; es braucht Hardware-in-the-loop und externe Decoder-/Messvalidierung.", + "Observability ist Pflicht, nicht Luxus: Kein Pro-Level ohne Metriken, strukturierte Logs, Fault-Telemetrie und reproduzierbare Regressionen.", + "Keine implizite Semantik: Alle Parameter müssen in Config, API, Runtime und Telemetrie exakt dasselbe bedeuten." 
+ ], + "priority_model": { + "P0": "Technische Perfektion und Determinismus", + "P1": "Betriebssicherheit und Fehlerbeherrschung", + "P2": "Hardware-Wahrheit und RF-Qualität", + "P3": "Sichere und saubere Runtime-Steuerung", + "P4": "Deployment-, Release- und Service-Reife" + }, + "workstreams": [ + { + "id": "WS-01", + "priority": "P0", + "title": "Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer", + "why": "Der aktuelle Single-Loop ist elegant, aber anfällig gegen Write-Spikes, Scheduling-Jitter und hardwarebedingte Blockaden. Pro-Level verlangt Entkopplung zwischen Erzeugung und Ausgabe.", + "objective": "Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, kontrolliertem Frame-Puffer betrieben. Der Writer darf kurzfristige Timing-Jitter absorbieren, ohne sofort den gesamten TX-Zyklus zu ruinieren.", + "target_architecture": { + "pipeline": [ + "control-plane", + "runtime supervisor", + "generator worker", + "optional upsampler worker", + "bounded frame queue", + "writer worker", + "driver" + ], + "queue_policy": { + "type": "bounded ring queue", + "capacity_frames": 3, + "default_behavior": "Producer füllt vor, Writer sendet in Echtzeit, Supervisor überwacht Queue-Füllstand", + "allowed_strategies": [ + "block producer kurzzeitig", + "repeat last safe frame im Fault-Recovery-Modus", + "mute frame im Sicherheitsmodus" + ], + "forbidden_strategies": [ + "unbounded buffering", + "still und heimlich alte Frames verwerfen ohne Counter/Log", + "dynamisches Verhalten ohne Telemetrie" + ] + } + }, + "implementation_tasks": [ + { + "id": "WS-01-T1", + "title": "FrameQueue einführen", + "details": [ + "Neue interne Queue-Struktur für CompositeFrame oder DeviceFrame einführen.", + "Explizit festlegen, ob vor oder nach FMUpsampler gepuffert wird. 
Empfehlung: Puffern auf Device-Frame-Ebene, damit der Writer nur noch sendet.", + "FrameQueue muss feste Kapazität, Füllstand, High-Watermark, Low-Watermark und Drop-/Mute-/Repeat-Counter liefern." + ] + }, + { + "id": "WS-01-T2", + "title": "Writer-Worker einführen", + "details": [ + "Writer läuft in eigener Goroutine und besitzt alleinige Ownership über driver.Write().", + "Nur der Writer darf Write- und Tune-nahe Timinginteraktionen mit dem Treiber koordinieren.", + "Write-Dauer, Blockzeiten und Late-Events werden pro Frame gemessen." + ] + }, + { + "id": "WS-01-T3", + "title": "Supervisor-Schicht einführen", + "details": [ + "Supervisor bewertet Queue-Füllstand, Late-Rate, Fehlerhäufigkeit und entscheidet über Normal/Fault/Recovery.", + "Supervisor ist nicht nur Logik, sondern Runtime-State-Maschine." + ] + } + ], + "acceptance_criteria": [ + "Keine direkte synchrone Generate->Write-Kopplung mehr im Hauptpfad.", + "Queue-Füllstand und Write-Latenz sind live sichtbar.", + "Kurzzeitige Write-Spikes führen nicht sofort zu hörbaren Aussetzern oder unkontrolliertem Timing-Kollaps.", + "Langlauf mit 6h Testdauer zeigt keine wachsende Drift im Kontrollpfad und keine ungebremste Fehlereskalation." + ], + "affected_files": [ + "internal/app/engine.go", + "internal/output/backend.go", + "internal/platform/soapy.go" + ], + "example_interfaces": { + "frame_queue_contract": "type FrameQueue interface { Push(frame *output.CompositeFrame) error; Pop(ctx context.Context) (*output.CompositeFrame, error); FillLevel() float64; Depth() int; Capacity() int; Stats() QueueStats }", + "queue_stats_example": { + "capacity": 3, + "depth": 2, + "fillLevel": 0.6667, + "pushTimeouts": 0, + "popTimeouts": 0, + "droppedFrames": 0, + "repeatedFrames": 0, + "mutedFrames": 0 + } + } + }, + { + "id": "WS-02", + "priority": "P0", + "title": "Explizite Runtime-State-Maschine und Fault-Handling", + "why": "Aktuell existieren im Engine-State nur idle, running, stopping. 
Das reicht nicht für professionelles Fehlermanagement im Sendebetrieb.", + "objective": "Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständen. Jeder kritische Fehlerpfad endet in definiertem Verhalten.", + "target_state_machine": { + "states": [ + "idle", + "arming", + "prebuffering", + "running", + "degraded", + "muted", + "faulted", + "stopping" + ], + "transition_examples": [ + { + "from": "idle", + "to": "arming", + "trigger": "StartTX angefordert und Grundvalidierung erfolgreich" + }, + { + "from": "arming", + "to": "prebuffering", + "trigger": "Driver gestartet, Queue erstellt, Generator bereit" + }, + { + "from": "prebuffering", + "to": "running", + "trigger": "Queue-Minimum erreicht" + }, + { + "from": "running", + "to": "degraded", + "trigger": "Late-Rate oberhalb Schwellwert oder Queue-Füllstand wiederholt kritisch" + }, + { + "from": "degraded", + "to": "muted", + "trigger": "Writer kann keine sichere Ausgabe mehr garantieren" + }, + { + "from": "muted", + "to": "faulted", + "trigger": "Persistenter Treiberfehler oder Recovery gescheitert" + }, + { + "from": "muted", + "to": "running", + "trigger": "Queue und Treiber wieder stabil" + } + ] + }, + "implementation_tasks": [ + { + "id": "WS-02-T1", + "title": "Fault-Klassifikation definieren", + "details": [ + "Treiberfehler", + "Write-Time-Budget überschritten", + "Queue leer", + "Queue permanent überfüllt", + "Signal-Selbsttest fehlgeschlagen", + "unerlaubte Live-Konfigurationsänderung" + ] + }, + { + "id": "WS-02-T2", + "title": "Reaktionsstrategie pro Fehlerklasse definieren", + "details": [ + "Warnen ohne Zustandswechsel", + "degraded mit Countern", + "muted mit stiller Trägerstrategie oder kompletter TX-Stille", + "faulted mit manueller oder automatischer Recovery" + ] + }, + { + "id": "WS-02-T3", + "title": "Event-Log und Fault-Historie einführen", + "details": [ + "Jeder Zustandswechsel ist auditierbar.", + "Faults enthalten Ursache, Timestamp, Metriken und letzten 
bekannten Runtime-Kontext." + ] + } + ], + "acceptance_criteria": [ + "Jede kritische Fehlersituation führt in einen expliziten Zustand statt in implizites Weiterlaufen.", + "Der aktuelle Runtime-State ist über API und Telemetrie jederzeit sichtbar.", + "Degraded/Muted/Faulted lassen sich in Tests gezielt triggern und verifizieren." + ], + "affected_files": [ + "internal/app/engine.go", + "internal/control/control.go" + ], + "example_fault_policy": { + "late_buffer_threshold_per_60s": 10, + "queue_critical_fill_below": 0.1, + "writer_error_burst": 3, + "policy": [ + "Bei 1-2 Einzelereignissen nur Counter erhöhen.", + "Ab Burstschwelle auf degraded schalten.", + "Wenn degraded > 5s anhält oder Write wiederholt fehlschlägt -> muted.", + "Wenn muted nicht innerhalb definierter Frist stabilisiert werden kann -> faulted." + ] + } + }, + { + "id": "WS-03", + "priority": "P0", + "title": "Semantische Korrektheit und harte Config-/Runtime-Konsistenz", + "why": "Technische Perfektion scheitert oft nicht am DSP, sondern an stillen Semantikabweichungen zwischen Config, API, Live-Update und tatsächlicher Laufzeit.", + "objective": "Ein einziger, eindeutig definierter Parameterraum. Jeder Wert hat exakt eine Bedeutung und identische Constraints in Config, HTTP-API, Runtime und Telemetrie.", + "implementation_tasks": [ + { + "id": "WS-03-T1", + "title": "Parameterinventar erstellen", + "details": [ + "Alle öffentlich und intern verwendeten Parameter inventarisieren.", + "Für jeden Parameter: Typ, Einheit, Bereich, Default, Hot-Reload-Fähigkeit, Safety-Relevanz, Telemetrie-Name." + ] + }, + { + "id": "WS-03-T2", + "title": "Validation vereinheitlichen", + "details": [ + "Config.Validate(), Engine.UpdateConfig() und API-Patch-Validierung dürfen nicht divergieren.", + "Beispiel: outputDrive muss an allen Stellen denselben Bereich haben." 
+ ] + }, + { + "id": "WS-03-T3", + "title": "AppliedConfig einführen", + "details": [ + "Neben DesiredConfig muss es eine AppliedConfig geben.", + "API-Antworten sollen nicht nur sagen, was gewünscht wurde, sondern was tatsächlich übernommen wurde." + ] + } + ], + "acceptance_criteria": [ + "Kein Parameter hat an zwei Stellen unterschiedliche Grenzwerte oder Einheiten.", + "API kann DesiredConfig und AppliedConfig getrennt zurückgeben.", + "Ungültige Hot-Updates werden deterministisch abgelehnt und nicht teilweise angewandt." + ], + "affected_files": [ + "internal/config/config.go", + "internal/app/engine.go", + "internal/control/control.go" + ], + "example_parameter_schema": { + "name": "fm.outputDrive", + "unit": "logical composite drive factor", + "type": "float64", + "range": { + "min": 0.0, + "max": 3.0 + }, + "default": 1.0, + "hot_reloadable": true, + "safety_class": "medium", + "notes": "Darf nicht mit hardware gain oder RF gain verwechselt werden." + } + }, + { + "id": "WS-04", + "priority": "P1", + "title": "Observability, Telemetrie und Diagnosefähigkeit", + "why": "Ohne harte Telemetrie bleibt jedes Timing- oder RF-Problem ratenbasiert. Pro-Level braucht Metriken, strukturierte Logs und Diagnostik-Endpunkte.", + "objective": "Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests und API-Aktivität schaffen.", + "implementation_tasks": [ + { + "id": "WS-04-T1", + "title": "Strukturiertes Logging einführen", + "details": [ + "Einheitliches Logging-Backend nutzen.", + "Keine verstreuten Printf-only Pfade als primäre Diagnose.", + "Jeder Fault, State-Change und API-Eingriff wird strukturiert geloggt." 
+ ] + }, + { + "id": "WS-04-T2", + "title": "Prometheus-kompatible Metriken einführen", + "details": [ + "Engine-Metriken", + "Writer-Metriken", + "Queue-Metriken", + "RDS-/Pilot-Selbsttest-Metriken", + "Audio-Stream-Metriken", + "Control-Plane-Metriken" + ] + }, + { + "id": "WS-04-T3", + "title": "Debug- und Profiling-Endpunkte", + "details": [ + "pprof optional aktivierbar", + "Build-Info, Git-Commit, Build-Tags, Backend, Plattform und Runtime-Version ausgeben" + ] + } + ], + "acceptance_criteria": [ + "Jeder relevante Betriebsaspekt ist per Runtime-Endpunkt oder Metrics-Endpunkt sichtbar.", + "Fehlerfälle sind anhand von Logs und Countern nachvollziehbar, ohne den Code erneut zu lesen.", + "Langlaufprobleme lassen sich zeitlich korrelieren." + ], + "example_metrics": [ + "engine_chunks_generated_total", + "engine_late_buffers_total", + "engine_fault_transitions_total", + "writer_write_duration_seconds", + "queue_fill_ratio", + "queue_dropped_frames_total", + "queue_muted_frames_total", + "driver_write_errors_total", + "audio_stream_underruns_total", + "audio_stream_overflows_total", + "rf_selftest_pilot_db", + "rf_selftest_rds_57k_db" + ], + "affected_files": [ + "internal/app/engine.go", + "internal/control/control.go", + "internal/platform/soapy.go", + "internal/audio/stream.go" + ] + }, + { + "id": "WS-05", + "priority": "P1", + "title": "Sichere und erwachsene Control-Plane", + "why": "Sobald TX start/stop, Frequenz, RDS-Text oder Live-Audio per HTTP steuerbar sind, ist die Control-Plane ein sicherheitsrelevanter Teil des Systems.", + "objective": "API transport- und anwendungsseitig härten, state-aware machen und auditierbar gestalten.", + "implementation_tasks": [ + { + "id": "WS-05-T1", + "title": "Auth und Deploy-Modi definieren", + "details": [ + "Mindestens Token-Auth für Remote-Betrieb.", + "Optionale mTLS-Unterstützung für geschützte Infrastrukturen.", + "Explizite Betriebsmodi: localhost-only, trusted-lan, secured-remote." 
+ ] + }, + { + "id": "WS-05-T2", + "title": "HTTP-Server härten", + "details": [ + "ReadTimeout", + "WriteTimeout", + "IdleTimeout", + "ReadHeaderTimeout", + "Body-Size-Limits", + "Content-Type-Validierung", + "Method enforcement" + ] + }, + { + "id": "WS-05-T3", + "title": "API semantisch aufräumen", + "details": [ + "DesiredConfig vs AppliedConfig vs RuntimeState", + "Idempotente Start/Stop-Endpunkte", + "Transaktionsartige Apply-/Reject-Antworten für Patches", + "Audit-Log pro wirksamem Eingriff" + ] + } + ], + "acceptance_criteria": [ + "Kein ungeschützter Remote-TX-Betrieb im Standardmodus.", + "API liefert deterministische und vollständige Antworten.", + "Große oder falsche Requests können das System nicht unkontrolliert stressen." + ], + "affected_files": [ + "internal/control/control.go", + "cmd/fmrtx/main.go" + ], + "example_patch_response": { + "ok": true, + "requestedChangeId": "cfg-2026-04-05T12:00:00Z-0001", + "desired": { + "frequencyMHz": 100.2, + "ps": "JAN FM" + }, + "applied": { + "frequencyMHz": 100.2, + "ps": "JAN FM" + }, + "state": "running", + "warnings": [] + } + }, + { + "id": "WS-06", + "priority": "P2", + "title": "Hardware-in-the-loop und externe RF-Wahrheitsprüfung", + "why": "Ein DSP-System ist erst dann pro-tauglich, wenn es auf echter Hardware und mit externem Decoder/Messergebnis wiederholt korrekt ist.", + "objective": "Nightly- oder manuell triggerbare Hardware-Regressionen etablieren, die nicht nur intern, sondern extern prüfen, was tatsächlich gesendet wird.", + "implementation_tasks": [ + { + "id": "WS-06-T1", + "title": "Loopback-/Capture-Setup definieren", + "details": [ + "Referenz-SDR oder definierter externer Empfänger als Capture-Seite.", + "Leistungsschutz und Dummy-Load-Konzept klar dokumentieren.", + "Standard-Testfrequenz, Standard-Device-Rate und Standard-Testdauer festlegen." 
+ ] + }, + { + "id": "WS-06-T2", + "title": "Automatisierte Analyse bauen", + "details": [ + "Pilotenergie 19 kHz", + "Stereoenergie 38 kHz", + "RDS-Energie 57 kHz", + "Deviation/Composite-Level", + "Trägergenauigkeit", + "RDS-Decodierbarkeit und Fehlerrate", + "Langlaufdrift" + ] + }, + { + "id": "WS-06-T3", + "title": "Externen Decoder in Regression einbinden", + "details": [ + "PS und RT müssen extern decodierbar und stabil sein.", + "Nicht nur intern erzeugte Bits prüfen." + ] + } + ], + "acceptance_criteria": [ + "Definierte Testsignale werden extern reproduzierbar korrekt empfangen.", + "Pilot-, Stereo- und RDS-Komponenten liegen mit stabilen Pegeln an den erwarteten Frequenzen.", + "Langlauftests zeigen keine schleichende Entwertung der Sendekette." + ], + "affected_files": [ + "internal/offline/spectral_test.go", + "internal/platform/soapy.go", + "scripts/*" + ], + "example_hil_report": { + "device": "pluto", + "testDurationMinutes": 30, + "pilot19kDetected": true, + "stereo38kDetected": true, + "rds57kDetected": true, + "rdsDecodeSuccessRate": 0.998, + "maxCarrierErrorHz": 12.0, + "maxLateBuffers": 0, + "result": "pass" + } + }, + { + "id": "WS-07", + "priority": "P2", + "title": "Device-aware Capability- und Kalibrierungsmodell", + "why": "Ein generisches Backend-Modell reicht nicht, wenn unterschiedliche SDRs unterschiedliche Raten, Gains, Latenzen und Abweichungen haben.", + "objective": "Pro Gerät bzw. 
Gerätetyp bekannte Fähigkeiten und Kalibrierungen explizit abbilden.", + "implementation_tasks": [ + { + "id": "WS-07-T1", + "title": "Capabilities ausbauen", + "details": [ + "Sample-Rate-Sets oder Bereiche", + "Gain-Semantik", + "Frequenzraster", + "MTU-/Buffer-Empfehlungen", + "Minimale stabile Chunkgrößen", + "Tune-Latenzverhalten" + ] + }, + { + "id": "WS-07-T2", + "title": "Kalibrierungsprofil einführen", + "details": [ + "frequencyOffsetHz", + "deviationScale", + "mpxGainCalibration", + "driverChunkRecommendation", + "safeDefaultGain" + ] + }, + { + "id": "WS-07-T3", + "title": "Device-aware Validation", + "details": [ + "Config nicht nur gegen generische Regeln validieren, sondern gegen bekannte Device-Fähigkeiten.", + "Nicht unterstützte Raten oder gefährliche Kombinationen früh blockieren." + ] + } + ], + "acceptance_criteria": [ + "Treiber und Runtime wissen explizit, was das konkrete Device sicher kann.", + "Kalibrierte Geräte liefern reproduzierbarere RF-Ergebnisse.", + "Fehlkonfigurationen werden vor TX-Beginn erkannt." + ], + "affected_files": [ + "internal/platform/soapy.go", + "internal/config/config.go" + ], + "example_calibration_profile": { + "deviceId": "pluto-serial-1234", + "sampleRateHz": 1000000, + "frequencyOffsetHz": -18.0, + "deviationScale": 0.97, + "mpxGainCalibration": 1.03, + "safeDefaultGainDb": -20.0, + "preferredChunkMs": 50 + } + }, + { + "id": "WS-08", + "priority": "P2", + "title": "Signal-Selbstüberwachung im Betrieb", + "why": "Nur Post-mortem-Messungen reichen nicht. 
Das System soll im Betrieb merken, wenn Pilot, RDS oder Composite auffällig werden.", + "objective": "Leichtgewichtige In-Band-Selbsttests auf Chunk-Basis oder in Intervallen ausführen und in Fault-Logik einspeisen.", + "implementation_tasks": [ + { + "id": "WS-08-T1", + "title": "Chunk-basierte RF-Checks definieren", + "details": [ + "Goertzel oder kleine FFT für 19 kHz, 38 kHz und 57 kHz", + "Composite-Clipping- und Pegelindikatoren", + "Optional Deviation-Schätzer" + ] + }, + { + "id": "WS-08-T2", + "title": "Anomalieerkennung definieren", + "details": [ + "Pilot fehlt", + "RDS ungewöhnlich schwach", + "unerwartete Composite-Energieverteilung", + "langsame Drift" + ] + } + ], + "acceptance_criteria": [ + "Runtime kann relevante RF-Anomalien erkennen und melden.", + "Selbsttests sind ausreichend billig, um die Echtzeitfähigkeit nicht zu gefährden." + ], + "affected_files": [ + "internal/dsp/goertzel.go", + "internal/offline/generator.go", + "internal/app/engine.go" + ] + }, + { + "id": "WS-09", + "priority": "P3", + "title": "Teststrategie von Unit-Tests zu echter Qualitätsabsicherung erweitern", + "why": "Viele Tests sind gut. Die richtigen Testklassen sind besser. Pro-Level verlangt deterministische Regressionen, Fuzzing und Concurrency-Tests.", + "objective": "Testpyramide so ausbauen, dass Signal, Runtime und API gleichermaßen abgesichert sind.", + "implementation_tasks": [ + { + "id": "WS-09-T1", + "title": "Golden-Vector-Tests", + "details": [ + "Definierte Inputsignale und erwartete Analysewerte festschreiben.", + "Nicht nur boolsche Pass/Fail-Aussagen, sondern tolerierte numerische Fenster." + ] + }, + { + "id": "WS-09-T2", + "title": "Long-run Regressionen", + "details": [ + "Tausende Chunks durchlaufen lassen.", + "Boundary-Continuity, Drift, Queue-Stabilität, Writer-Stabilität prüfen." 
+ ] + }, + { + "id": "WS-09-T3", + "title": "Race Detector, Fuzzing und API-Mutation-Tests", + "details": [ + "Konfigurationspatches", + "RDS-Texte", + "Audio-Ingest", + "Start/Stop-Rennen", + "gleichzeitige Live-Updates" + ] + } + ], + "acceptance_criteria": [ + "Es gibt Regressionen für DSP, Runtime, API und HIL.", + "Race Detector und Fuzzing finden keine bekannten kritischen Pfade mehr.", + "Nightly-Regressions geben maschinenlesbare Berichte aus." + ] + }, + { + "id": "WS-10", + "priority": "P4", + "title": "Service-Reife, Packaging und Reproduzierbarkeit", + "why": "Ein System ist nicht professionell, wenn nur der Autor es zuverlässig starten kann.", + "objective": "Saubere Build-, Release- und Betriebsartefakte bereitstellen.", + "implementation_tasks": [ + { + "id": "WS-10-T1", + "title": "Reproduzierbare Builds", + "details": [ + "Build-Flags, Version, Commit und Tags ins Binary schreiben.", + "Artefakte pro Zielplattform konsistent erzeugen." + ] + }, + { + "id": "WS-10-T2", + "title": "Service-Units und Beispiel-Deployments", + "details": [ + "systemd unit", + "EnvironmentFile-Unterstützung", + "Beispielkonfigurationen pro Backend" + ] + }, + { + "id": "WS-10-T3", + "title": "Migrations- und Schema-Versionierung", + "details": [ + "Config-Versionen einführen.", + "Klare Migrationsstrategie für ältere JSON-Konfigurationen." + ] + } + ], + "acceptance_criteria": [ + "Standardisierte Start- und Betriebswege existieren.", + "Build- und Runtime-Version sind eindeutig sichtbar.", + "Alte Konfigurationen können kontrolliert migriert werden." + ] + } + ], + "implementation_sequence": [ + { + "order": 1, + "workstreams": [ + "WS-03", + "WS-01", + "WS-02" + ], + "reason": "Erst muss die Semantik stimmen, dann die deterministische Pipeline, dann das Fault-Modell. Sonst baut man saubere Infrastruktur auf falschen Annahmen." 
+ }, + { + "order": 2, + "workstreams": [ + "WS-04", + "WS-05" + ], + "reason": "Sobald Runtime-Struktur sauber ist, müssen Sichtbarkeit und sichere Steuerung folgen. Sonst ist der neue Kern schwer überprüfbar und riskant bedienbar." + }, + { + "order": 3, + "workstreams": [ + "WS-06", + "WS-07", + "WS-08" + ], + "reason": "Jetzt wird die Hardware-Wahrheit und RF-Qualität fest verdrahtet: HIL, Kalibrierung und laufende Signalkontrolle." + }, + { + "order": 4, + "workstreams": [ + "WS-09", + "WS-10" + ], + "reason": "Zum Schluss werden Regressionstiefe, Packaging und Service-Reife maximal professionalisiert." + } + ], + "cross_cutting_rules": { + "musts": [ + "Jeder neue Runtime-Zustand muss per API und Telemetrie sichtbar sein.", + "Jede neue Recovery- oder Drop-/Mute-Strategie braucht Counter, Logs und Tests.", + "Keine neue Konfigurationsoption ohne klaren Typ, Bereich, Einheit, Default und Hot-Reload-Klassifikation.", + "Hardware-nahe Änderungen brauchen mindestens Simulations- und HIL-Validierung.", + "Alle Faults müssen eine maschinenlesbare Ursache und eine menschenlesbare Zusammenfassung haben." + ], + "must_not": [ + "Keine unbounded Queues.", + "Keine stillen Fallbacks ohne Telemetrie.", + "Keine teilweise angewandten Live-Config-Änderungen ohne explizite Rückmeldung.", + "Keine unterschiedlichen Grenzwerte zwischen Config, API und Runtime.", + "Keine sicherheitsrelevanten HTTP-Endpunkte ohne Härtung im Remote-Betrieb." 
+ ] + }, + "concrete_examples": { + "example_runtime_status": { + "state": "degraded", + "substate": "writer_backpressure", + "engine": { + "chunksProduced": 128443, + "lateBuffers": 7, + "underruns": 0, + "maxCycleMs": 51.12, + "maxWriteMs": 49.91 + }, + "queue": { + "capacity": 3, + "depth": 1, + "fillLevel": 0.3333, + "droppedFrames": 0, + "repeatedFrames": 2, + "mutedFrames": 0 + }, + "driver": { + "txEnabled": true, + "streamActive": true, + "samplesWritten": 64221500, + "slowWrites": 4 + }, + "lastFault": { + "code": "WRITER-LATE-BURST", + "at": "2026-04-05T12:34:56Z", + "message": "Write-Latenz wiederholt über Chunk-Budget" + } + }, + "example_fault_event": { + "eventId": "fault-000184", + "severity": "error", + "stateBefore": "running", + "stateAfter": "muted", + "code": "QUEUE-EMPTY-RECOVERY-FAILED", + "message": "Queue blieb trotz Recovery leer; Ausgang wurde auf Mute gesetzt", + "metrics": { + "queueFillLevel": 0.0, + "lateBuffersLast60s": 14, + "writerErrorsLast60s": 3 + } + }, + "example_rollout_plan": { + "milestone_1": "Semantik und State-Maschine stabil", + "milestone_2": "Entkoppelter Writer mit Telemetrie", + "milestone_3": "Sichere API und Audit-Log", + "milestone_4": "HIL-Regression und Kalibrierung", + "milestone_5": "Nightly-Qualitätsgates und reproduzierbare Releases" + } + }, + "definition_of_done": { + "technical": [ + "TX-Pfad ist deterministisch entkoppelt und fault-tolerant.", + "DesiredConfig, AppliedConfig und RuntimeState sind sauber getrennt.", + "Alle kritischen Fehler führen in explizite Zustände.", + "RF-Signalqualität ist intern und extern nachweisbar.", + "Metriken, strukturierte Logs und Diagnosepfade sind vollständig." + ], + "operational": [ + "System läuft im Langlauftest stabil.", + "Remote-Bedienung ist gehärtet und auditierbar.", + "Gerätespezifische Fähigkeiten und Kalibrierungen sind modelliert.", + "Builds und Deployments sind reproduzierbar." 
+ ], + "quality_gates": [ + "Unit- und Integrationssuite grün", + "Race Detector grün", + "Fuzzing ohne kritische Findings", + "HIL-Regression grün", + "Soak-Test grün" + ] + }, + "final_instruction_to_ai_team": { + "summary": "Nicht blind Features ergänzen. Zuerst die Semantik und den Runtime-Kern hart machen. Dann Observability und sichere Steuerung. Danach Hardware-Wahrheit, Kalibrierung und Nightly-Regressions. Alles, was nicht deterministisch, messbar und fault-beherrscht ist, ist noch nicht pro-level.", + "first_step_now": [ + "WS-03 beginnen: Parameterinventar und semantische Vereinheitlichung.", + "Danach direkt WS-01 und WS-02 in einem Architektur-Branch umsetzen." + ] + } +} From 959252cdf0d730a874ed1fbd366a52717cc0e966 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 11:43:00 +0200 Subject: [PATCH 04/55] docs: polish README structure and control-plane notes --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d2df8ad..ca0a03b 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,15 @@ What is already in place: - browser UI on `/` - live audio ingestion via stdin or HTTP stream input +Current engineering focus: +- deterministic runtime behavior +- fault handling / recovery +- observability and runtime telemetry +- hardware-validated signal quality + +For the active runtime-hardening track, see: +- `docs/pro-runtime-hardening-workboard.md` + ## Signal path ```text @@ -162,13 +171,16 @@ If the README is too high-level for the exact CLI surface, check: Base URL: `http://{listenAddress}` (default typically `127.0.0.1:8088`) +Security note: +- keep the control plane bound locally unless you intentionally place it behind a trusted and hardened access layer + ### Main endpoints ```text GET / browser UI GET /healthz health check GET /status current config/status snapshot -GET /runtime live engine/driver/audio telemetry +GET /runtime live engine / driver / audio telemetry GET 
/config full config POST /config patch config / live updates GET /dry-run synthetic frame summary @@ -291,7 +303,7 @@ examples/ ## Planning / workboard -For the current pro-runtime-hardening track, see: +For the current runtime-hardening / professionalization track, see: - `docs/pro-runtime-hardening-workboard.md` This is the living workboard for: @@ -309,6 +321,7 @@ Additional project docs: - `docs/README.md` - `docs/API.md` - `docs/DSP-CHAIN.md` +- `docs/NOTES.md` ## Legal note From c888186e30dc3a688e404714ec12d8d79f8285c4 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 11:51:23 +0200 Subject: [PATCH 05/55] feat: document WS-03 parameters and align outputDrive validation --- docs/pro-runtime-hardening-workboard.md | 30 ++++--- docs/ws-03-parameter-inventory.md | 108 ++++++++++++++++++++++++ internal/app/engine.go | 4 +- internal/app/engine_test.go | 2 +- 4 files changed, 129 insertions(+), 15 deletions(-) create mode 100644 docs/ws-03-parameter-inventory.md diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 1f46138..d1902a6 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -82,7 +82,7 @@ Kein „ist im Kopf klar“. Der Stand kommt hier rein. # WS-03 — Semantische Korrektheit und harte Config-/Runtime-Konsistenz **Priorität:** P0 -**Gesamtstatus:** TODO +**Gesamtstatus:** IN PROGRESS ## Ziel Ein einziger, eindeutig definierter Parameterraum. Jeder Wert hat exakt eine Bedeutung und identische Constraints in Config, HTTP-API, Runtime und Telemetrie. @@ -93,13 +93,13 @@ Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere R ## Aufgaben ### WS-03-T1 — Parameterinventar erstellen -- **Status:** TODO -- **Owner:** offen +- **Status:** VERIFIED +- **Owner:** Builder A - **Code-Orte:** - `internal/config/config.go` - `internal/app/engine.go` - `internal/control/control.go` - - ggf. 
weitere betroffene Pakete + - `internal/offline/generator.go` - **Ziel:** Alle öffentlich und intern verwendeten Parameter inventarisieren mit: - Name @@ -114,25 +114,27 @@ Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere R - Wo leben heute implizite Parameter, die nicht sauber dokumentiert sind? - Welche Runtime-Werte sind abgeleitet statt direkt konfigurierbar? - **Nachweis:** - - Parameterinventar im Repo vorhanden - - referenzierbar für Config/API/Runtime + - `docs/ws-03-parameter-inventory.md` enthält das inventarisierte Parameter-Tableau und referenziert Config/Control/Engine. + - Live-Nutzung über `internal/control/control.go` → `LivePatch` dokumentiert. - **Restrisiken:** - versteckte Semantik in Helper-Funktionen übersehen ### WS-03-T2 — Validation vereinheitlichen -- **Status:** TODO -- **Owner:** offen +- **Status:** VERIFIED +- **Owner:** Builder A - **Code-Orte:** - `internal/config/config.go` - `internal/app/engine.go` + - `internal/app/engine_test.go` - `internal/control/control.go` - **Ziel:** `Config.Validate()`, Runtime-Update-Pfade und API-Patch-Validierung dürfen nicht divergieren. - **Bereits bekannter Startpunkt:** - `fm.outputDrive` - **Nachweis:** - - bekannte Inkonsistenzen beseitigt - - Tests für gemeinsame Grenzwerte vorhanden + - CFG-SEM-001: `outputDrive`-Validation in `Engine.UpdateConfig` jetzt 0..10 (wie `Config.Validate`). + - Tests (`go test ./...`) fangen neue Range ab und besitzen aktualisierten `engine_test`-Check. + - Live-Patch fließt durch `txBridge` und `LivePatch` (control) → `LiveConfigUpdate`. 
- **Restrisiken:** - weitere Inkonsistenzen erst beim Inventar sichtbar @@ -155,10 +157,14 @@ Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere R - unsaubere Migration bestehender Statusantworten ## WS-03 Entscheidungslog -- Noch leer +| Datum | Entscheidung | Notiz | +|---|---|---| +| 2026-04-05 | CFG-SEM-001: `fm.outputDrive` | Live-Validierung auf 0..10 angeglichen, Tests angepasst, Parameterinventar dokumentiert. | ## WS-03 Verifikation -- Noch leer +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | `go test ./...` | ✅ Bestätigt `Engine.UpdateConfig`, `LivePatch` und Parameter-Range sowie Inventar-Dokumentation. | --- diff --git a/docs/ws-03-parameter-inventory.md b/docs/ws-03-parameter-inventory.md new file mode 100644 index 0000000..6f64feb --- /dev/null +++ b/docs/ws-03-parameter-inventory.md @@ -0,0 +1,108 @@ +# WS-03 Parameterinventar — Semantik & Runtime-Konsistenz + +> Repo-grounded Übersicht der öffentlich sichtbaren und runtime-relevanten Parameter aus `internal/config`, `internal/control`, `internal/app/engine` und dem HTTP-API-Stack. + +## Ziel + +Dieses Dokument liefert einen festen Referenzpunkt für WS-03: +1. Welche Parameter konfiguriert werden können (JSON + CLI + HTTP). +2. Welche Wertebereiche und Einheiten sie haben. +3. Welche davon live per HTTP-Patch übernommen werden. +4. Wo im Code die Validierung, Anwendung und Telemetrie lebt. + +Alle Angaben beziehen sich direkt auf die `Config`-Definition (`internal/config/config.go`), den Control-Server (`internal/control/control.go`) und die Engine-Live-Updates (`internal/app/engine.go`, `internal/offline/generator.go`). + +--- + +## 1. 
Control-Plane & Backend (requires restart) + +| Parameter | Typ | Default | Range / Einheit | Hot reload | Beschreibung & Code-Referenzen | +|---|---|---|---|---|---| +| `control.listenAddress` | `string` | `"127.0.0.1:8088"` | `:` | ❌ (Server-Neustart) | HTTP-Server-Bindadresse, `cmd/fmrtx/main.go` startet Listen mit `cfg.Control.ListenAddress`. | +| `backend.kind` | `string` | `"file"` | `file` / `pluto` / `soapy` | ❌ | Wahl des TX-Backends; `selectDriver` (cmd/fmrtx/main.go) entscheidet darauf basierend. | +| `backend.device` | `string` | `""` | SoapySDR/Pluto device string | ❌ | Wird an `platform.SoapyConfig.Device` weitergegeben. | +| `backend.deviceSampleRateHz` | `float64` | `0` | >0 Hz (0 = fallback auf `fm.compositeRateHz`) | ❌ | Treibt `cfg.EffectiveDeviceRate()` und damit Treiber-Konfiguration (`cmd/fmrtx/main.go`). | +| `backend.uri` / `deviceArgs` | `string` / `map[string]string` | `""` / `nil` | Driver-spezifisch | ❌ | Zusätzliche Soapy-Parameter, weitergereicht an `platform.SoapyConfig`. | + +> `backend.*` dürfen zur Konfiguration gepatcht werden, gelten aber erst nach Neustart des TX-Modus. + +--- + +## 2. Audio-Quelle (reload requires restart) + +| Parameter | Typ | Default | Range | Hot reload | Referenzen | +|---|---|---|---|---|---| +| `audio.inputPath` | `string` | `""` | Pfad zu WAV-Dateien | ❌ | `offline/generator.sourceFor` entscheidet, ob WAV oder interne Töne genutzt werden; `audio.LoadWAVSource`. | +| `audio.gain` | `float64` | `1.0` | `0..4` | ❌ | Verstärkt vor Pre-Emphasis (`generator.NewPreEmphasizedSource`, `docs/DSP-CHAIN.md`). | +| `audio.toneLeftHz`, `toneRightHz` | `float64` | `1000`, `1600` | >0 Hz | ❌ | Fallback-Tonquelle (`audio.NewConfiguredToneSource`). | +| `audio.toneAmplitude` | `float64` | `0.4` | `0..1` | ❌ | Amplitude der internen Töne, skaliert vor DSP. | + +Diese Parameter sind nur im JSON/HTTP-Config sichtbar, aber nicht live per `LiveConfigUpdate` (keine `LivePatch`-Felder). Ein Restart der TX-Engine ist nötig. 
+ +--- + +## 3. FM-DSP Parameter (häufig hot-reloadable) + +| Parameter | Typ | Default | Range / Einheit | Hot reload | Beschreibung & Code-Referenzen | +|---|---|---|---|---|---| +| `fm.frequencyMHz` | `float64` | `100.0` | `65..110` MHz | ✅ (LivePatch → Engine.UpdateConfig) | Ruft `Engine.pendingFreq` auf, `driver.Tune` wird zwischen Chunks ausgeführt (`internal/app/engine.go`, `control.LivePatch`). | +| `fm.outputDrive` | `float64` | `0.5` | `0..10` (empfohlen `1..4`) | ✅ | Multiplikator vor Limiter/Klipps (`generator.GenerateFrame`, `docs/DSP-CHAIN.md`). Validierung: `internal/config/config.go` + `Engine.UpdateConfig` (CFG-SEM-001 behoben, nun 0..10). | +| `fm.stereoEnabled` | `bool` | `true` | — | ✅ | Schaltet Stereo-Encode und Pilot (Intern `offpkg.Generator`). | +| `fm.pilotLevel` | `float64` | `0.09` | `0..0.2` (9% ±75 kHz) | ✅ | Pilot-Addition nach Composite-Clipper (`generator.GenerateFrame`). | +| `fm.rdsInjection` | `float64` | `0.04` | `0..0.15` | ✅ | RDS-Träger am Ende der Kette (`generator.GenerateFrame`). | +| `fm.preEmphasisTauUS` | `float64` | `50` | `0` / `50` / `75` µs | ❌ | Pre-Emphasis-Filter vor Tonquelle (`NewPreEmphasizedSource`). | +| `fm.limiterEnabled` | `bool` | `true` | — | ✅ | Aktiviert StereoLimiter (`dsp.NewStereoLimiter`). | +| `fm.limiterCeiling` | `float64` | `1.0` | `0..2` | ✅ | Maximalwert für Clips und Composite Sättigung. | +| `fm.bs412Enabled` | `bool` | `false` | — | ❌ | Optionaler ITU-R BS.412 MPX Power Limiter (`dsp.NewBS412Limiter`). | +| `fm.bs412ThresholdDBr` | `float64` | `0` | beliebig (dBr) | ❌ | Grenzwert für BS.412-Limiter. | +| `fm.mpxGain` | `float64` | `1.0` | `0.1..5` | ❌ | Hardware-Calibration für effective Deviation (`generator.init`, `FMModulator`). | +| `fm.maxDeviationHz` | `float64` | `75000` | `0..150000` Hz | ❌ | Steuert `FMModulator.MaxDeviation`. | +| `fm.compositeRateHz` | `int` | `228000` | — | ❌ | Setzt DSP-Sample-Rate, beeinflusst `generator` + `Engine` (`cfg.EffectiveDeviceRate`). 
|
+| `fm.fmModulationEnabled` | `bool` | `true` | — | ❌ | Schaltet `dsp.FMModulator`; beim Split-Rate-Modus wird es automatisch deaktiviert. |
+
+> Hot-reload-fähige Felder kommen in `LiveConfigUpdate`. Parameter wie `preEmphasisTauUS`, `bs412*`, `mpxGain` bleiben nur nach Neustart gültig und können via `/config` gepatched werden, aber nicht live übernommen.
+
+---
+
+## 4. RDS & Telemetrie
+
+| Parameter | Typ | Default | Range | Hot reload | Beschreibung |
+|---|---|---|---|---|---|
+| `rds.enabled` | `bool` | `true` | — | ✅ | Aktiviert Encoder und Telemetrie (`generator.init`). |
+| `rds.pi` | `string` | `"1234"` | Hex, 4 Zeichen | ❌ | Validierung `ParsePI`. |
+| `rds.ps` | `string` | `"FMRTX"` | max 8 Zeichen | ✅ | Realtime-Update via `rdsp.UpdateText`. |
+| `rds.radioText` | `string` | `"fm-rds-tx"` | max 64 Zeichen | ✅ | Text wird an Encoder weitergereicht. |
+| `rds.pty` | `int` | `0` | `0..31` | ❌ | Wird nur bei Init in Encoder gesetzt. |
+
+Telemetrie: `/status` (control) meldet `rdsEnabled`, `pilotLevel`, `limiterEnabled` u.a. (`internal/control/control.go`).
+
+---
+
+## 5. Hot-Update-Fluss
+
+1. `POST /config` (`internal/control/control.go`) aktualisiert das Snapshot-Config und validiert mit `Config.Validate()`.
+2. Für die Live-fähigen Parameter (⇓) wird ein `LivePatch` erstellt.
+3. `TXController.UpdateConfig` (z.B. `txBridge`) übersetzt in `LiveConfigUpdate` und ruft `Engine.UpdateConfig`.
+4. `Engine` validiert identische Bereiche (jetzt 0..10 für `outputDrive`) und schreibt in Generator-Live-Params.
+5. Änderungen werden zwischen Chunks angewendet (`pendingFreq`, `generator.UpdateLive`).
+
+| Live-Feld | Code-Quellen |
+|---|---|
+| `frequencyMHz` | `LiveConfigUpdate`, `Engine.pendingFreq`, `driver.Tune` |
+| `outputDrive` | `Generator.LiveParams.OutputDrive`, `CFG-SEM-001 fix` |
+| `stereoEnabled`, `pilotLevel`, `rdsInjection` | `generator.GenerateFrame` |
+| `rdsEnabled`, `limiterEnabled`, `limiterCeiling` | `LiveParams`, `Engine.UpdateConfig` |
+| `PS`, `RadioText` | `generator.RDSEncoder().UpdateText` |
+
+Dieses Inventar ist Referenz für WS-03-T1/T2 und bildet die Basis für Tests und Telemetrie.
+
+---
+
+## 6. Weiteres Nachweis-Tracking
+
+- Parameterwerte validiert über `config.Config.Validate()` (`internal/config/config.go`).
+- CFG-SEM-001 (`fm.outputDrive`) wird sowohl von Config als auch von Live-Update begrenzt (nun 0..10).
+- Dokumentation: `docs/DSP-CHAIN.md` beschreibt die grafische Signalverkettung und damit die Bedeutung von `outputDrive`, `limiterCeiling`, `pilotLevel` und `rdsInjection`.
+- Runtime-Exposition: `/status` und `/runtime` melden Sample-, Driver- und Engine-Stats (control handler).
+
+Diese Datei gehört ab sofort zu WS-03 und sollte bei weiteren Änderungen an öffentlichen Parametern gepflegt werden.
diff --git a/internal/app/engine.go b/internal/app/engine.go index 8928be0..bd13134 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -202,8 +202,8 @@ func (e *Engine) UpdateConfig(u LiveConfigUpdate) error { } } if u.OutputDrive != nil { - if *u.OutputDrive < 0 || *u.OutputDrive > 3 { - return fmt.Errorf("outputDrive out of range (0-3)") + if *u.OutputDrive < 0 || *u.OutputDrive > 10 { + return fmt.Errorf("outputDrive out of range (0-10)") } } if u.PilotLevel != nil { diff --git a/internal/app/engine_test.go b/internal/app/engine_test.go index 95a111b..64e350a 100644 --- a/internal/app/engine_test.go +++ b/internal/app/engine_test.go @@ -238,7 +238,7 @@ func TestEngineLiveUpdateValidation(t *testing.T) { } // Out of range drive - badDrive := 10.0 + badDrive := 11.0 if err := eng.UpdateConfig(LiveConfigUpdate{OutputDrive: &badDrive}); err == nil { t.Fatal("expected validation error for bad drive") } From 19716e26eeeef5c9dcfbb10e1356fe7c8d9ccbaf Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 12:24:31 +0200 Subject: [PATCH 06/55] WS-03: align outputDrive and desired/applied state --- docs/API.md | 4 +- docs/pro-runtime-hardening-workboard.md | 13 +++--- internal/config/config.go | 2 +- internal/control/control.go | 35 +++++++------- internal/control/control_test.go | 62 +++++++++++++++++++++++++ 5 files changed, 90 insertions(+), 26 deletions(-) diff --git a/docs/API.md b/docs/API.md index b29d676..78e0122 100644 --- a/docs/API.md +++ b/docs/API.md @@ -77,6 +77,8 @@ Full current configuration (all fields, including non-patchable). **Live parameter update.** Changes are applied to the running TX engine immediately — no restart required. Only include fields you want to change (PATCH semantics). +The control snapshot (GET /config) only reflects new values once they pass validation and, if the TX engine is running, after the live update succeeded. That keeps the API from reporting desired values that were rejected or still pending. 
+ **Request body:** JSON with any subset of patchable fields. **Response:** @@ -92,7 +94,7 @@ Full current configuration (all fields, including non-patchable). | Field | Type | Range | Description | |---|---|---|---| | `frequencyMHz` | float | 65–110 | TX center frequency. Tunes hardware LO live. | -| `outputDrive` | float | 0–3 | Composite output level multiplier. | +| `outputDrive` | float | 0–10 | Composite output level multiplier (empfohlen 1..4). | | `stereoEnabled` | bool | | Enable/disable stereo (pilot + 38kHz subcarrier). | | `pilotLevel` | float | 0–0.2 | 19 kHz pilot injection level. | | `rdsInjection` | float | 0–0.15 | 57 kHz RDS subcarrier injection level. | diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index d1902a6..1a3866a 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -139,8 +139,8 @@ Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere R - weitere Inkonsistenzen erst beim Inventar sichtbar ### WS-03-T3 — DesiredConfig / AppliedConfig einführen -- **Status:** TODO -- **Owner:** offen +- **Status:** IN PROGRESS +- **Owner:** Lead Coderaffe - **Code-Orte:** - `internal/app/engine.go` - `internal/control/control.go` @@ -151,20 +151,21 @@ Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere R - tatsächlich angewandter Konfiguration - aktuellem Runtime-Zustand - **Nachweis:** - - API kann beide Sichten getrennt ausgeben - - partielle oder abgelehnte Übernahmen werden sichtbar + - `internal/control/control.go` wartet mit Snapshot-Updates, bis LivePatch erfolgreich war. + - `internal/control/control_test.go` deckt ab, dass abgelehnte Live-Updates keine neue `GET /config`-Ansicht schreiben. - **Restrisiken:** - - unsaubere Migration bestehender Statusantworten + - Die API liefert noch nicht beide Sichten gleichzeitig; weitere Workstreams müssen Desired/Applied explizit zurückgeben. 
## WS-03 Entscheidungslog | Datum | Entscheidung | Notiz | |---|---|---| | 2026-04-05 | CFG-SEM-001: `fm.outputDrive` | Live-Validierung auf 0..10 angeglichen, Tests angepasst, Parameterinventar dokumentiert. | +| 2026-04-05 | WS-03-T3: Desired/Applied-Gate | Control-API zeigt Snapshots nur noch, wenn LivePatch erfolgreich angewendet wurde; Tests verhindern irreführende Wunschwerte. | ## WS-03 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| -| 2026-04-05 | `go test ./...` | ✅ Bestätigt `Engine.UpdateConfig`, `LivePatch` und Parameter-Range sowie Inventar-Dokumentation. | +| 2026-04-05 | `go test ./...` | ✅ Bestätigt `Engine.UpdateConfig`, `LivePatch` und Parameter-Range sowie Inventar-Dokumentation. Neue Control-Tests sichern Desired/Applied-Gate. | --- diff --git a/internal/config/config.go b/internal/config/config.go index 768a40a..7654d17 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -136,7 +136,7 @@ func (c Config) Validate() error { return fmt.Errorf("fm.rdsInjection out of range") } if c.FM.OutputDrive < 0 || c.FM.OutputDrive > 10 { - return fmt.Errorf("fm.outputDrive out of range (0..3)") + return fmt.Errorf("fm.outputDrive out of range (0..10)") } if c.FM.CompositeRateHz < 96000 || c.FM.CompositeRateHz > 1520000 { return fmt.Errorf("fm.compositeRateHz out of range") diff --git a/internal/control/control.go b/internal/control/control.go index 7c74e7f..9f3420a 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -283,32 +283,31 @@ func (s *Server) handleConfig(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusBadRequest) return } - s.cfg = next + lp := LivePatch{ + FrequencyMHz: patch.FrequencyMHz, + OutputDrive: patch.OutputDrive, + StereoEnabled: patch.StereoEnabled, + PilotLevel: patch.PilotLevel, + RDSInjection: patch.RDSInjection, + RDSEnabled: patch.RDSEnabled, + LimiterEnabled: patch.LimiterEnabled, + LimiterCeiling: patch.LimiterCeiling, + PS: patch.PS, + 
RadioText: patch.RadioText, + } tx := s.tx - s.mu.Unlock() - - // Forward live-patchable params to running engine (if active) if tx != nil { - lp := LivePatch{ - FrequencyMHz: patch.FrequencyMHz, - OutputDrive: patch.OutputDrive, - StereoEnabled: patch.StereoEnabled, - PilotLevel: patch.PilotLevel, - RDSInjection: patch.RDSInjection, - RDSEnabled: patch.RDSEnabled, - LimiterEnabled: patch.LimiterEnabled, - LimiterCeiling: patch.LimiterCeiling, - PS: patch.PS, - RadioText: patch.RadioText, - } if err := tx.UpdateConfig(lp); err != nil { + s.mu.Unlock() http.Error(w, err.Error(), http.StatusBadRequest) return } } - + s.cfg = next + live := tx != nil + s.mu.Unlock() w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(map[string]any{"ok": true, "live": tx != nil}) + _ = json.NewEncoder(w).Encode(map[string]any{"ok": true, "live": live}) default: http.Error(w, "method not allowed", http.StatusMethodNotAllowed) } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 6172102..fc01438 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -3,6 +3,7 @@ package control import ( "bytes" "encoding/json" + "errors" "net/http" "net/http/httptest" "testing" @@ -59,3 +60,64 @@ func TestTXStartWithoutController(t *testing.T) { srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/tx/start", nil)) if rec.Code != http.StatusServiceUnavailable { t.Fatalf("expected 503, got %d", rec.Code) } } + +func TestConfigPatchUpdatesSnapshot(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{}) + + rec := httptest.NewRecorder() + body := []byte(`{"outputDrive":1.2}`) + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body))) + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } + var resp map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("unmarshal 
response: %v", err) + } + if live, ok := resp["live"].(bool); !ok || !live { + t.Fatalf("expected live true, got %v", resp["live"]) + } + + rec = httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/config", nil)) + var cfg cfgpkg.Config + if err := json.NewDecoder(rec.Body).Decode(&cfg); err != nil { + t.Fatalf("decode config: %v", err) + } + if cfg.FM.OutputDrive != 1.2 { + t.Fatalf("expected snapshot to reflect new drive, got %v", cfg.FM.OutputDrive) + } +} + +func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{updateErr: errors.New("boom")}) + + body := []byte(`{"outputDrive":2.2}`) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body))) + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400, got %d", rec.Code) + } + + rec = httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/config", nil)) + var cfg cfgpkg.Config + if err := json.NewDecoder(rec.Body).Decode(&cfg); err != nil { + t.Fatalf("decode config: %v", err) + } + if cfg.FM.OutputDrive != cfgpkg.Default().FM.OutputDrive { + t.Fatalf("expected snapshot untouched, got %v", cfg.FM.OutputDrive) + } +} + +type fakeTXController struct { + updateErr error +} + +func (f *fakeTXController) StartTX() error { return nil } +func (f *fakeTXController) StopTX() error { return nil } +func (f *fakeTXController) TXStats() map[string]any { return map[string]any{} } +func (f *fakeTXController) UpdateConfig(_ LivePatch) error { return f.updateErr } + From b2fa1d9c23e193082b3b1e64f0eebb65125df188 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 12:36:41 +0200 Subject: [PATCH 07/55] feat: add runtime frame queue --- docs/pro-runtime-hardening-workboard.md | 49 ++++-- internal/app/engine.go | 67 ++++++-- internal/config/config.go | 27 ++- 
internal/output/frame_queue.go | 211 ++++++++++++++++++++++++ internal/output/frame_queue_test.go | 82 +++++++++ 5 files changed, 395 insertions(+), 41 deletions(-) create mode 100644 internal/output/frame_queue.go create mode 100644 internal/output/frame_queue_test.go diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 1a3866a..d8aee65 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -42,9 +42,10 @@ Kein „ist im Kopf klar“. Der Stand kommt hier rein. ## 2. Gesamtüberblick ## Gesamtstatus -- Projektphase: `Planung / Strukturierung` -- Technischer Fokus aktuell: `noch offen` -- Nächster sinnvoller Startpunkt laut Konzept: `WS-03 Semantische Korrektheit und harte Config-/Runtime-Konsistenz` +- Projektphase: `Umsetzung (WS-01)` +- Technischer Fokus aktuell: `Entkoppelter TX-Pfad (FrameQueue + Writer)` +- Nächster sinnvoller Startpunkt laut Konzept: `WS-01 Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer` +- Vorangegangene Workstreams: `WS-03 Semantische Korrektheit und konsistent angewandte Config` (abgeschlossen) ## Repo-bezogene bestätigte Ausgangslage @@ -171,7 +172,7 @@ Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere R # WS-01 — Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer **Priorität:** P0 -**Gesamtstatus:** TODO +**Gesamtstatus:** IN PROGRESS ## Ziel Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, kontrolliertem Frame-Puffer betrieben. @@ -184,21 +185,28 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, ## Aufgaben ### WS-01-T1 — FrameQueue einführen -- **Status:** TODO -- **Owner:** offen +- **Status:** VERIFIED +- **Owner:** Lead Coderaffe - **Code-Orte:** + - `internal/output/frame_queue.go` + - `internal/output/frame_queue_test.go` - `internal/app/engine.go` - - ggf. 
neues internes Queue-Modul - - `internal/output/*` - **Ziel:** - Bounded Queue mit fester Kapazität, sichtbarem Füllstand und Countern. + Bounded Queue mit fester Kapazität, sichtbarem Füllstand, Counter- / Statistikzugriff und klarer Trennung zwischen Generator und Writer. - **Zu entscheiden:** - - Puffern vor oder nach Upsampling? - - Referenzentscheidung im Konzept: eher Device-Frame-Ebene + - Puffern vor oder nach Upsampling → Device-Frame-Ebene (Queue lebt nach dem Upsampler) für Writer-Simplifizierung. + - Referenzkapazität: `runtime.frameQueueCapacity` (default 3) bleibt konfigurierbar. - **Akzeptanzpunkte:** - - keine unbounded queue - - Fill-Level live sichtbar - - Drop/Repeat/Mute niemals ohne Counter/Log + - Keine unbounded Queue. + - Fill-Level (High/Low) ist aus `QueueStats` sichtbar. + - Drop/Repeat/Mute-Counter sind vorhanden und testbar. +- **Nachweis:** + - `FrameQueue`-Implementierung (`internal/output/frame_queue.go`) liefert kapazitätsgesteuerte Push/Pop-Logik und Counters. + - Engine-Run nutzt Queue vor dem Writer und zeigt `QueueStats` in `EngineStats`. + - Tests (`internal/output/frame_queue_test.go` + `go test ./...`) decken Push/Pop, Timeout-Counters und Stats ab. +- **Restrisiken:** + - Die Queue wird aktuell synchron getrieben; ein dedizierter Writer-Worker fehlt noch. + - Queue-Close erwartet, dass Generator/Writer vor dem Schließen stoppen, sonst droht Panik beim Schreiben. ### WS-01-T2 — Writer-Worker einführen - **Status:** TODO @@ -229,10 +237,14 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, - Wie eng koppeln wir WS-01 mit WS-02, ohne Overengineering zu erzeugen? ## WS-01 Entscheidungslog -- Noch leer +| Datum | Entscheidung | Notiz | +|---|---|---| +| 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. 
| ## WS-01 Verifikation -- Noch leer +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | FrameQueue + Engine integration | ✅ `go test ./...` (im `internal`-Modul incl. `frame_queue_test.go`) | --- @@ -484,7 +496,7 @@ Build-, Release- und Betriebsartefakte reproduzierbar und teamtauglich machen. | ID | Status | Frage | Notiz | |---|---|---|---| -| DEC-001 | OPEN | Puffern wir auf CompositeFrame- oder DeviceFrame-Ebene? | Konzept empfiehlt Device-Frame-Ebene | +| DEC-001 | RESOLVED | Puffern wir auf CompositeFrame- oder DeviceFrame-Ebene? | Queue lebt nach dem Upsampler (DeviceFrame-Ebene) gemäß `internal/app/engine.go`-Integrationsschleife. | | DEC-002 | OPEN | Fault-Recovery zuerst mit `mute`, `repeat last safe frame` oder beidem? | Muss technisch und RF-seitig sauber bewertet werden | | DEC-003 | OPEN | Ziehen wir minimale WS-05-Basis-Härtungen vor? | Timeouts/Body-Limits evtl. früher sinnvoll | | DEC-004 | OPEN | Wie gross/simpel halten wir die erste State-Maschine? | Gefahr von Overengineering | @@ -494,10 +506,11 @@ Build-, Release- und Betriebsartefakte reproduzierbar und teamtauglich machen. ## 7. Nächste sinnvolle Schritte ### Empfohlener Start -1. **WS-03-T1 Parameterinventar erstellen** +1. **WS-03-T1 Parameterinventar erstellen** *(abgeschlossen)* 2. **bekannte Inkonsistenzen (CFG-SEM-001, CTL-UX-001) konkret verifizieren** 3. **DesiredConfig / AppliedConfig / RuntimeState Zielmodell grob skizzieren** 4. Danach Architekturarbeit an **WS-01 + WS-02** starten +5. **Aktuell:** WS-01-T2 Writer-Worker einführen (Queue → Driver), danach WS-01-T3 Supervisor + WS-02 Runtime-State. ### Vor dem ersten grossen Umbau klären - Was ist „minimal sinnvoll“ für Milestone 1? 
diff --git a/internal/app/engine.go b/internal/app/engine.go index bd13134..6904ec9 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -2,6 +2,7 @@ package app import ( "context" + "errors" "fmt" "log" "sync" @@ -12,6 +13,7 @@ import ( cfgpkg "github.com/jan/fm-rds-tx/internal/config" "github.com/jan/fm-rds-tx/internal/dsp" offpkg "github.com/jan/fm-rds-tx/internal/offline" + "github.com/jan/fm-rds-tx/internal/output" "github.com/jan/fm-rds-tx/internal/platform" ) @@ -54,17 +56,18 @@ func durationMs(ns uint64) float64 { } type EngineStats struct { - State string `json:"state"` - ChunksProduced uint64 `json:"chunksProduced"` - TotalSamples uint64 `json:"totalSamples"` - Underruns uint64 `json:"underruns"` - LateBuffers uint64 `json:"lateBuffers,omitempty"` - LastError string `json:"lastError,omitempty"` - UptimeSeconds float64 `json:"uptimeSeconds"` - MaxCycleMs float64 `json:"maxCycleMs,omitempty"` - MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` - MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` - MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + State string `json:"state"` + ChunksProduced uint64 `json:"chunksProduced"` + TotalSamples uint64 `json:"totalSamples"` + Underruns uint64 `json:"underruns"` + LateBuffers uint64 `json:"lateBuffers,omitempty"` + LastError string `json:"lastError,omitempty"` + UptimeSeconds float64 `json:"uptimeSeconds"` + MaxCycleMs float64 `json:"maxCycleMs,omitempty"` + MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` + MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` + MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + Queue output.QueueStats `json:"queue"` } // Engine is the continuous TX loop. 
It generates composite IQ in chunks, @@ -79,6 +82,7 @@ type Engine struct { upsampler *dsp.FMUpsampler // nil = same-rate, non-nil = split-rate chunkDuration time.Duration deviceRate float64 + frameQueue *output.FrameQueue mu sync.Mutex state EngineState @@ -168,6 +172,7 @@ func NewEngine(cfg cfgpkg.Config, driver platform.SoapyDriver) *Engine { chunkDuration: 50 * time.Millisecond, deviceRate: deviceRate, state: EngineIdle, + frameQueue: output.NewFrameQueue(cfg.Runtime.FrameQueueCapacity), } } @@ -346,6 +351,7 @@ func (e *Engine) Stats() EngineStats { MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), MaxWriteMs: durationMs(e.maxWriteNs.Load()), + Queue: e.frameQueue.Stats(), } } @@ -372,13 +378,45 @@ func (e *Engine) run(ctx context.Context) { frame = e.upsampler.Process(frame) } t2 := time.Now() - n, err := e.driver.Write(ctx, frame) + + if err := e.frameQueue.Push(ctx, frame); err != nil { + if ctx.Err() != nil { + return + } + if errors.Is(err, output.ErrFrameQueueClosed) { + return + } + e.lastError.Store(err.Error()) + e.underruns.Add(1) + select { + case <-time.After(e.chunkDuration): + case <-ctx.Done(): + return + } + continue + } + + popFrame, err := e.frameQueue.Pop(ctx) + if err != nil { + if ctx.Err() != nil { + return + } + if errors.Is(err, output.ErrFrameQueueClosed) { + return + } + e.lastError.Store(err.Error()) + e.underruns.Add(1) + continue + } + t3 := time.Now() + n, err := e.driver.Write(ctx, popFrame) + t4 := time.Now() genDur := t1.Sub(t0) upDur := t2.Sub(t1) - writeDur := t3.Sub(t2) - cycleDur := t3.Sub(t0) + writeDur := t4.Sub(t3) + cycleDur := t4.Sub(t0) updateMaxDuration(&e.maxGenerateNs, genDur) updateMaxDuration(&e.maxUpsampleNs, upDur) @@ -399,7 +437,6 @@ func (e *Engine) run(ctx context.Context) { } e.lastError.Store(err.Error()) e.underruns.Add(1) - // Back off to avoid pegging CPU on persistent errors select { case <-time.After(e.chunkDuration): case <-ctx.Done(): diff --git 
a/internal/config/config.go b/internal/config/config.go index 7654d17..6c73382 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,6 +14,7 @@ type Config struct { FM FMConfig `json:"fm"` Backend BackendConfig `json:"backend"` Control ControlConfig `json:"control"` + Runtime RuntimeConfig `json:"runtime"` } type AudioConfig struct { @@ -35,18 +36,18 @@ type RDSConfig struct { type FMConfig struct { FrequencyMHz float64 `json:"frequencyMHz"` StereoEnabled bool `json:"stereoEnabled"` - PilotLevel float64 `json:"pilotLevel"` // fraction of ±75kHz deviation (0.09 = 9%, ITU standard) - RDSInjection float64 `json:"rdsInjection"` // fraction of ±75kHz deviation (0.04 = 4%, typical) - PreEmphasisTauUS float64 `json:"preEmphasisTauUS"` // time constant in µs: 50 (EU) or 75 (US), 0=off + PilotLevel float64 `json:"pilotLevel"` // fraction of ±75kHz deviation (0.09 = 9%, ITU standard) + RDSInjection float64 `json:"rdsInjection"` // fraction of ±75kHz deviation (0.04 = 4%, typical) + PreEmphasisTauUS float64 `json:"preEmphasisTauUS"` // time constant in µs: 50 (EU) or 75 (US), 0=off OutputDrive float64 `json:"outputDrive"` - CompositeRateHz int `json:"compositeRateHz"` // internal DSP/MPX sample rate + CompositeRateHz int `json:"compositeRateHz"` // internal DSP/MPX sample rate MaxDeviationHz float64 `json:"maxDeviationHz"` LimiterEnabled bool `json:"limiterEnabled"` LimiterCeiling float64 `json:"limiterCeiling"` FMModulationEnabled bool `json:"fmModulationEnabled"` - MpxGain float64 `json:"mpxGain"` // hardware calibration: scales entire composite output (default 1.0) - BS412Enabled bool `json:"bs412Enabled"` // ITU-R BS.412 MPX power limiter (EU requirement) - BS412ThresholdDBr float64 `json:"bs412ThresholdDBr"` // power limit in dBr (0 = standard, +3 = relaxed) + MpxGain float64 `json:"mpxGain"` // hardware calibration: scales entire composite output (default 1.0) + BS412Enabled bool `json:"bs412Enabled"` // ITU-R BS.412 MPX power limiter (EU 
requirement) + BS412ThresholdDBr float64 `json:"bs412ThresholdDBr"` // power limit in dBr (0 = standard, +3 = relaxed) } type BackendConfig struct { @@ -63,6 +64,10 @@ type ControlConfig struct { ListenAddress string `json:"listenAddress"` } +type RuntimeConfig struct { + FrameQueueCapacity int `json:"frameQueueCapacity"` +} + func Default() Config { return Config{ Audio: AudioConfig{Gain: 1.0, ToneLeftHz: 1000, ToneRightHz: 1600, ToneAmplitude: 0.4}, @@ -83,6 +88,7 @@ func Default() Config { }, Backend: BackendConfig{Kind: "file", OutputPath: "build/out/composite.f32"}, Control: ControlConfig{ListenAddress: "127.0.0.1:8088"}, + Runtime: RuntimeConfig{FrameQueueCapacity: 3}, } } @@ -150,7 +156,9 @@ func (c Config) Validate() error { if c.FM.LimiterCeiling < 0 || c.FM.LimiterCeiling > 2 { return fmt.Errorf("fm.limiterCeiling out of range") } - if c.FM.MpxGain == 0 { c.FM.MpxGain = 1.0 } // default if omitted from JSON + if c.FM.MpxGain == 0 { + c.FM.MpxGain = 1.0 + } // default if omitted from JSON if c.FM.MpxGain < 0.1 || c.FM.MpxGain > 5 { return fmt.Errorf("fm.mpxGain out of range (0.1..5)") } @@ -163,6 +171,9 @@ func (c Config) Validate() error { if c.Control.ListenAddress == "" { return fmt.Errorf("control.listenAddress is required") } + if c.Runtime.FrameQueueCapacity <= 0 { + return fmt.Errorf("runtime.frameQueueCapacity must be > 0") + } // Fail-loud PI validation if c.RDS.Enabled { if _, err := ParsePI(c.RDS.PI); err != nil { diff --git a/internal/output/frame_queue.go b/internal/output/frame_queue.go new file mode 100644 index 0000000..22b6d38 --- /dev/null +++ b/internal/output/frame_queue.go @@ -0,0 +1,211 @@ +package output + +import ( + "context" + "errors" + "sync" +) + +// ErrFrameQueueClosed is returned when a queue operation is attempted after the queue +// has been closed. +var ErrFrameQueueClosed = errors.New("frame queue closed") + +// QueueStats exposes the runtime state of a frame queue. 
+type QueueStats struct { + Capacity int `json:"capacity"` + Depth int `json:"depth"` + FillLevel float64 `json:"fillLevel"` + HighWaterMark int `json:"highWaterMark"` + LowWaterMark int `json:"lowWaterMark"` + PushTimeouts uint64 `json:"pushTimeouts"` + PopTimeouts uint64 `json:"popTimeouts"` + DroppedFrames uint64 `json:"droppedFrames"` + RepeatedFrames uint64 `json:"repeatedFrames"` + MutedFrames uint64 `json:"mutedFrames"` +} + +// FrameQueue is a bounded ring that holds CompositeFrame instances between the +// generator and the writer. Push blocks when the queue is full until space +// becomes available or the provided context is cancelled. Pop blocks when the +// queue is empty until a new frame arrives or the context is cancelled. +type FrameQueue struct { + capacity int + ch chan *CompositeFrame + + mu sync.Mutex + depth int + highWaterMark int + lowWaterMark int + pushTimeouts uint64 + popTimeouts uint64 + dropped uint64 + repeated uint64 + muted uint64 + closed bool + + closeOnce sync.Once +} + +// NewFrameQueue builds a bounded queue that holds up to capacity frames. +func NewFrameQueue(capacity int) *FrameQueue { + if capacity <= 0 { + capacity = 1 + } + fq := &FrameQueue{ + capacity: capacity, + ch: make(chan *CompositeFrame, capacity), + lowWaterMark: capacity, + } + fq.trackDepth(0) + return fq +} + +// Capacity returns the fixed frame capacity of the queue. +func (q *FrameQueue) Capacity() int { + return q.capacity +} + +// FillLevel reports the current occupancy as a fraction of capacity. +func (q *FrameQueue) FillLevel() float64 { + q.mu.Lock() + depth := q.depth + q.mu.Unlock() + if q.capacity == 0 { + return 0 + } + return float64(depth) / float64(q.capacity) +} + +// Depth returns the current number of frames in the queue. +func (q *FrameQueue) Depth() int { + q.mu.Lock() + depth := q.depth + q.mu.Unlock() + return depth +} + +// Stats returns a snapshot of the queue metrics. 
+func (q *FrameQueue) Stats() QueueStats { + q.mu.Lock() + stats := QueueStats{ + Capacity: q.capacity, + Depth: q.depth, + FillLevel: q.fillLevelLocked(), + HighWaterMark: q.highWaterMark, + LowWaterMark: q.lowWaterMark, + PushTimeouts: q.pushTimeouts, + PopTimeouts: q.popTimeouts, + DroppedFrames: q.dropped, + RepeatedFrames: q.repeated, + MutedFrames: q.muted, + } + q.mu.Unlock() + return stats +} + +// Push enqueues a frame, blocking until space is available or ctx is done. +func (q *FrameQueue) Push(ctx context.Context, frame *CompositeFrame) error { + if frame == nil { + return errors.New("frame required") + } + if q.isClosed() { + return ErrFrameQueueClosed + } + + select { + case q.ch <- frame: + q.updateDepth(+1) + return nil + case <-ctx.Done(): + q.recordPushTimeout() + return ctx.Err() + } +} + +// Pop removes a frame, blocking until one is available or ctx signals done. +func (q *FrameQueue) Pop(ctx context.Context) (*CompositeFrame, error) { + select { + case frame, ok := <-q.ch: + if !ok { + return nil, ErrFrameQueueClosed + } + q.updateDepth(-1) + return frame, nil + case <-ctx.Done(): + q.recordPopTimeout() + return nil, ctx.Err() + } +} + +// Close marks the queue as closed and wakes up blocked callers. +func (q *FrameQueue) Close() { + q.closeOnce.Do(func() { + q.mu.Lock() + q.closed = true + q.mu.Unlock() + close(q.ch) + }) +} + +// RecordDrop increments the drop counter for instrumentation. +func (q *FrameQueue) RecordDrop() { + q.mu.Lock() + q.dropped++ + q.mu.Unlock() +} + +// RecordRepeat increments the repeat counter for instrumentation. +func (q *FrameQueue) RecordRepeat() { + q.mu.Lock() + q.repeated++ + q.mu.Unlock() +} + +// RecordMute increments the mute counter for instrumentation. 
+func (q *FrameQueue) RecordMute() { + q.mu.Lock() + q.muted++ + q.mu.Unlock() +} + +func (q *FrameQueue) isClosed() bool { + q.mu.Lock() + closed := q.closed + q.mu.Unlock() + return closed +} + +func (q *FrameQueue) updateDepth(delta int) { + q.mu.Lock() + q.depth += delta + q.trackDepth(q.depth) + q.mu.Unlock() +} + +func (q *FrameQueue) trackDepth(depth int) { + if depth > q.highWaterMark { + q.highWaterMark = depth + } + if depth < q.lowWaterMark { + q.lowWaterMark = depth + } +} + +func (q *FrameQueue) fillLevelLocked() float64 { + if q.capacity == 0 { + return 0 + } + return float64(q.depth) / float64(q.capacity) +} + +func (q *FrameQueue) recordPushTimeout() { + q.mu.Lock() + q.pushTimeouts++ + q.mu.Unlock() +} + +func (q *FrameQueue) recordPopTimeout() { + q.mu.Lock() + q.popTimeouts++ + q.mu.Unlock() +} diff --git a/internal/output/frame_queue_test.go b/internal/output/frame_queue_test.go new file mode 100644 index 0000000..90f3460 --- /dev/null +++ b/internal/output/frame_queue_test.go @@ -0,0 +1,82 @@ +package output + +import ( + "context" + "testing" + "time" +) + +func TestFrameQueuePushPop(t *testing.T) { + q := NewFrameQueue(2) + ctx := context.Background() + + frame := &CompositeFrame{Sequence: 1} + if err := q.Push(ctx, frame); err != nil { + t.Fatalf("push failed: %v", err) + } + if got := q.Depth(); got != 1 { + t.Fatalf("expected depth 1, got %d", got) + } + if got := q.FillLevel(); got <= 0 || got >= 1 { + t.Fatalf("unexpected fill level: %f", got) + } + + popped, err := q.Pop(ctx) + if err != nil { + t.Fatalf("pop failed: %v", err) + } + if popped != frame { + t.Fatal("popped frame differs from pushed frame") + } + if q.Depth() != 0 { + t.Fatalf("expected depth 0 after pop, got %d", q.Depth()) + } + + stats := q.Stats() + if stats.HighWaterMark == 0 { + t.Fatal("expected high water mark to track push") + } + if stats.LowWaterMark != 0 { + t.Fatalf("expected low water mark 0, got %d", stats.LowWaterMark) + } +} + +func 
TestFrameQueuePushTimeout(t *testing.T) { + q := NewFrameQueue(1) + ctx := context.Background() + frame := &CompositeFrame{Sequence: 42} + + if err := q.Push(ctx, frame); err != nil { + t.Fatalf("initial push: %v", err) + } + + shortCtx, cancel := context.WithTimeout(ctx, 5*time.Millisecond) + defer cancel() + if err := q.Push(shortCtx, frame); err == nil { + t.Fatalf("expected timeout when pushing into full queue") + } + stats := q.Stats() + if stats.PushTimeouts == 0 { + t.Fatalf("expected push timeout counter to increment, got %d", stats.PushTimeouts) + } + + _, _ = q.Pop(ctx) +} + +func TestFrameQueueCounters(t *testing.T) { + q := NewFrameQueue(1) + q.RecordDrop() + q.RecordRepeat() + q.RecordMute() + + stats := q.Stats() + if stats.DroppedFrames != 1 { + t.Fatalf("expected 1 drop, got %d", stats.DroppedFrames) + } + if stats.RepeatedFrames != 1 { + t.Fatalf("expected 1 repeat, got %d", stats.RepeatedFrames) + } + if stats.MutedFrames != 1 { + t.Fatalf("expected 1 mute, got %d", stats.MutedFrames) + } +} From 43cb4ad747fb9a6f4174f803d98cc13265919e22 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 15:16:50 +0200 Subject: [PATCH 08/55] feat: introduce writer worker --- docs/pro-runtime-hardening-workboard.md | 20 +++++--- internal/app/engine.go | 62 +++++++++++++++++++------ internal/dsp/fmupsample.go | 3 +- internal/dsp/iqresample.go | 1 + internal/dsp/upsample.go | 1 + internal/output/backend.go | 1 + 6 files changed, 67 insertions(+), 21 deletions(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index d8aee65..a1cd7d3 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -209,16 +209,22 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, - Queue-Close erwartet, dass Generator/Writer vor dem Schließen stoppen, sonst droht Panik beim Schreiben. 
### WS-01-T2 — Writer-Worker einführen -- **Status:** TODO -- **Owner:** offen +- **Status:** VERIFIED +- **Owner:** Lead Coderaffe - **Code-Orte:** - - `internal/app/engine.go` - - `internal/platform/*` + - `internal/app/engine.go` (run loop, `writerLoop`, `cloneFrame`, Stats) + - `internal/dsp/*` (FMUpsampler / Resampler copy `GeneratedAt` für Cycle-Metriken) - **Ziel:** - Nur noch ein dedizierter Worker besitzt `driver.Write()`. + Generator/Upsampler liefern Frames in die FrameQueue, `driver.Write()` läuft nur noch im dedizierten Writer. - **Akzeptanzpunkte:** - - Write-Latenz pro Frame messbar - - Timinginteraktionen klar isoliert + - `writerLoop()` ist die einzige Stelle mit `driver.Write()` und zieht aus der Queue. + - FrameQueue ist ein echter Puffer (Generator klont Frames, Writer poppt) und `EngineStats.Queue` zeigt den Füllstand. + - Write- und Cycle-Latenzen plus `LateBuffers` bleiben in `EngineStats` sichtbar (`MaxWriteMs`, `LateBuffers`, `MaxCycleMs`). +- **Nachweis:** + - `go test ./...` (Engine + Queue + DSP) läuft erfolgreich. + - `EngineStats` berichtet weiterhin über Queue-/Writer-Metriken. +- **Restrisiken:** + - Frame-Klonierung pro Chunk erhöht Heap-Pressure; spätere Workstreams sollten Pooling / Zero-Copy prüfen. 
### WS-01-T3 — Supervisor-Schicht einführen - **Status:** TODO diff --git a/internal/app/engine.go b/internal/app/engine.go index 6904ec9..f1920d1 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -356,7 +356,10 @@ func (e *Engine) Stats() EngineStats { } func (e *Engine) run(ctx context.Context) { + e.wg.Add(1) + go e.writerLoop(ctx) defer e.wg.Done() + for { if ctx.Err() != nil { return @@ -373,13 +376,27 @@ func (e *Engine) run(ctx context.Context) { t0 := time.Now() frame := e.generator.GenerateFrame(e.chunkDuration) + frame.GeneratedAt = t0 t1 := time.Now() if e.upsampler != nil { frame = e.upsampler.Process(frame) + frame.GeneratedAt = t0 } t2 := time.Now() - if err := e.frameQueue.Push(ctx, frame); err != nil { + genDur := t1.Sub(t0) + upDur := t2.Sub(t1) + updateMaxDuration(&e.maxGenerateNs, genDur) + updateMaxDuration(&e.maxUpsampleNs, upDur) + + enqueued := cloneFrame(frame) + if enqueued == nil { + e.lastError.Store("engine: frame clone failed") + e.underruns.Add(1) + continue + } + + if err := e.frameQueue.Push(ctx, enqueued); err != nil { if ctx.Err() != nil { return } @@ -395,8 +412,13 @@ func (e *Engine) run(ctx context.Context) { } continue } + } +} - popFrame, err := e.frameQueue.Pop(ctx) +func (e *Engine) writerLoop(ctx context.Context) { + defer e.wg.Done() + for { + frame, err := e.frameQueue.Pop(ctx) if err != nil { if ctx.Err() != nil { return @@ -409,25 +431,23 @@ func (e *Engine) run(ctx context.Context) { continue } - t3 := time.Now() - n, err := e.driver.Write(ctx, popFrame) - t4 := time.Now() + writeStart := time.Now() + n, err := e.driver.Write(ctx, frame) + writeDur := time.Since(writeStart) - genDur := t1.Sub(t0) - upDur := t2.Sub(t1) - writeDur := t4.Sub(t3) - cycleDur := t4.Sub(t0) + cycleDur := writeDur + if !frame.GeneratedAt.IsZero() { + cycleDur = time.Since(frame.GeneratedAt) + } - updateMaxDuration(&e.maxGenerateNs, genDur) - updateMaxDuration(&e.maxUpsampleNs, upDur) updateMaxDuration(&e.maxWriteNs, writeDur) 
updateMaxDuration(&e.maxCycleNs, cycleDur) if cycleDur > e.chunkDuration { late := e.lateBuffers.Add(1) if late <= 5 || late%20 == 0 { - log.Printf("TX LATE: cycle=%s budget=%s gen=%s up=%s write=%s over=%s", - cycleDur, e.chunkDuration, genDur, upDur, writeDur, cycleDur-e.chunkDuration) + log.Printf("TX LATE: cycle=%s budget=%s write=%s over=%s", + cycleDur, e.chunkDuration, writeDur, cycleDur-e.chunkDuration) } } @@ -444,7 +464,23 @@ func (e *Engine) run(ctx context.Context) { } continue } + e.chunksProduced.Add(1) e.totalSamples.Add(uint64(n)) } } + +func cloneFrame(src *output.CompositeFrame) *output.CompositeFrame { + if src == nil { + return nil + } + samples := make([]output.IQSample, len(src.Samples)) + copy(samples, src.Samples) + return &output.CompositeFrame{ + Samples: samples, + SampleRateHz: src.SampleRateHz, + Timestamp: src.Timestamp, + GeneratedAt: src.GeneratedAt, + Sequence: src.Sequence, + } +} diff --git a/internal/dsp/fmupsample.go b/internal/dsp/fmupsample.go index c4bcf6c..998d556 100644 --- a/internal/dsp/fmupsample.go +++ b/internal/dsp/fmupsample.go @@ -147,7 +147,7 @@ func (u *FMUpsampler) Process(frame *output.CompositeFrame) *output.CompositeFra pos := u.srcPos n := 0 for pos < float64(srcLen) && n < maxOut { - vi := int(pos) // virtual index (integer part) + vi := int(pos) // virtual index (integer part) frac := pos - float64(vi) pA := phaseAt(vi) @@ -171,6 +171,7 @@ func (u *FMUpsampler) Process(frame *output.CompositeFrame) *output.CompositeFra u.outFrame.SampleRateHz = u.dstRate u.outFrame.Timestamp = frame.Timestamp u.outFrame.Sequence = frame.Sequence + u.outFrame.GeneratedAt = frame.GeneratedAt return &u.outFrame } diff --git a/internal/dsp/iqresample.go b/internal/dsp/iqresample.go index 4d1c044..a3d565e 100644 --- a/internal/dsp/iqresample.go +++ b/internal/dsp/iqresample.go @@ -54,6 +54,7 @@ func ResampleIQ(frame *output.CompositeFrame, targetRateHz float64) *output.Comp Samples: dst, SampleRateHz: targetRateHz, Timestamp: 
frame.Timestamp, + GeneratedAt: frame.GeneratedAt, Sequence: frame.Sequence, } } diff --git a/internal/dsp/upsample.go b/internal/dsp/upsample.go index b45b1ea..9f81549 100644 --- a/internal/dsp/upsample.go +++ b/internal/dsp/upsample.go @@ -76,6 +76,7 @@ func (u *FMPhaseUpsampler) Process(frame *output.CompositeFrame) *output.Composi Samples: dst, SampleRateHz: u.dstRate, Timestamp: frame.Timestamp, + GeneratedAt: frame.GeneratedAt, Sequence: frame.Sequence, } } diff --git a/internal/output/backend.go b/internal/output/backend.go index 38e5b82..bbc0171 100644 --- a/internal/output/backend.go +++ b/internal/output/backend.go @@ -19,6 +19,7 @@ type CompositeFrame struct { Samples []IQSample SampleRateHz float64 Timestamp time.Time + GeneratedAt time.Time Sequence uint64 } From d62e8fae24c7139bc178ce975ff75da69e1518b4 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 16:55:30 +0200 Subject: [PATCH 09/55] Add queue health indicator --- docs/pro-runtime-hardening-workboard.md | 5 ++- internal/output/frame_queue.go | 51 +++++++++++++++++++------ internal/output/frame_queue_test.go | 41 ++++++++++++++++++++ 3 files changed, 84 insertions(+), 13 deletions(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index a1cd7d3..202f7a8 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -199,11 +199,12 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, - **Akzeptanzpunkte:** - Keine unbounded Queue. - Fill-Level (High/Low) ist aus `QueueStats` sichtbar. + - Queue-Health-Indikator (`queue.health`) liefert `critical`, `low` oder `normal` aus dem Fill-Level. EngineStats.`queue` zeigt den Status ebenfalls. - Drop/Repeat/Mute-Counter sind vorhanden und testbar. - **Nachweis:** - `FrameQueue`-Implementierung (`internal/output/frame_queue.go`) liefert kapazitätsgesteuerte Push/Pop-Logik und Counters. 
- Engine-Run nutzt Queue vor dem Writer und zeigt `QueueStats` in `EngineStats`. - - Tests (`internal/output/frame_queue_test.go` + `go test ./...`) decken Push/Pop, Timeout-Counters und Stats ab. + - Tests (`internal/output/frame_queue_test.go` + `go test ./...`) decken Push/Pop, Timeout-Counters, Stats und den neuen Queue-Health-Indikator ab. - **Restrisiken:** - Die Queue wird aktuell synchron getrieben; ein dedizierter Writer-Worker fehlt noch. - Queue-Close erwartet, dass Generator/Writer vor dem Schließen stoppen, sonst droht Panik beim Schreiben. @@ -246,11 +247,13 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, | Datum | Entscheidung | Notiz | |---|---|---| | 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. | +| 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurck und ist ber `EngineStats.Queue` im Runtime-Endpunkt sichtbar. | ## WS-01 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | FrameQueue + Engine integration | ✅ `go test ./...` (im `internal`-Modul incl. `frame_queue_test.go`) | +| 2026-04-05 | Queue-Health-Indikator | go test ./... deckt `TestFrameQueueHealthIndicator` und `queue.health` ab. | --- diff --git a/internal/output/frame_queue.go b/internal/output/frame_queue.go index 22b6d38..e3db114 100644 --- a/internal/output/frame_queue.go +++ b/internal/output/frame_queue.go @@ -12,17 +12,31 @@ var ErrFrameQueueClosed = errors.New("frame queue closed") // QueueStats exposes the runtime state of a frame queue. 
type QueueStats struct { - Capacity int `json:"capacity"` - Depth int `json:"depth"` - FillLevel float64 `json:"fillLevel"` - HighWaterMark int `json:"highWaterMark"` - LowWaterMark int `json:"lowWaterMark"` - PushTimeouts uint64 `json:"pushTimeouts"` - PopTimeouts uint64 `json:"popTimeouts"` - DroppedFrames uint64 `json:"droppedFrames"` - RepeatedFrames uint64 `json:"repeatedFrames"` - MutedFrames uint64 `json:"mutedFrames"` -} + Capacity int `json:"capacity"` + Depth int `json:"depth"` + FillLevel float64 `json:"fillLevel"` + Health QueueHealth `json:"health"` + HighWaterMark int `json:"highWaterMark"` + LowWaterMark int `json:"lowWaterMark"` + PushTimeouts uint64 `json:"pushTimeouts"` + PopTimeouts uint64 `json:"popTimeouts"` + DroppedFrames uint64 `json:"droppedFrames"` + RepeatedFrames uint64 `json:"repeatedFrames"` + MutedFrames uint64 `json:"mutedFrames"` +} + +type QueueHealth string + +const ( + QueueHealthCritical QueueHealth = "critical" + QueueHealthLow QueueHealth = "low" + QueueHealthNormal QueueHealth = "normal" +) + +const ( + queueHealthCriticalThreshold = 0.2 + queueHealthLowThreshold = 0.5 +) // FrameQueue is a bounded ring that holds CompositeFrame instances between the // generator and the writer. Push blocks when the queue is full until space @@ -87,10 +101,12 @@ func (q *FrameQueue) Depth() int { // Stats returns a snapshot of the queue metrics. 
func (q *FrameQueue) Stats() QueueStats { q.mu.Lock() + fill := q.fillLevelLocked() stats := QueueStats{ Capacity: q.capacity, Depth: q.depth, - FillLevel: q.fillLevelLocked(), + FillLevel: fill, + Health: queueHealthFromFill(fill), HighWaterMark: q.highWaterMark, LowWaterMark: q.lowWaterMark, PushTimeouts: q.pushTimeouts, @@ -209,3 +225,14 @@ func (q *FrameQueue) recordPopTimeout() { q.popTimeouts++ q.mu.Unlock() } + +func queueHealthFromFill(fill float64) QueueHealth { + switch { + case fill <= queueHealthCriticalThreshold: + return QueueHealthCritical + case fill <= queueHealthLowThreshold: + return QueueHealthLow + default: + return QueueHealthNormal + } +} diff --git a/internal/output/frame_queue_test.go b/internal/output/frame_queue_test.go index 90f3460..86c0e8e 100644 --- a/internal/output/frame_queue_test.go +++ b/internal/output/frame_queue_test.go @@ -80,3 +80,44 @@ func TestFrameQueueCounters(t *testing.T) { t.Fatalf("expected 1 mute, got %d", stats.MutedFrames) } } + +func TestFrameQueueHealthIndicator(t *testing.T) { + q := NewFrameQueue(4) + ctx := context.Background() + + stats := q.Stats() + if stats.Health != QueueHealthCritical { + t.Fatalf("expected initial health critical, got %s", stats.Health) + } + + push := func(seq int) { + frame := &CompositeFrame{Sequence: seq} + if err := q.Push(ctx, frame); err != nil { + t.Fatalf("push %d failed: %v", seq, err) + } + } + + push(1) + stats = q.Stats() + if stats.Health != QueueHealthLow { + t.Fatalf("expected low after one frame, got %s", stats.Health) + } + + push(2) + stats = q.Stats() + if stats.Health != QueueHealthLow { + t.Fatalf("expected low at 50%% fill, got %s", stats.Health) + } + + push(3) + stats = q.Stats() + if stats.Health != QueueHealthNormal { + t.Fatalf("expected normal once queue has ~75%% fill, got %s", stats.Health) + } + + for q.Depth() > 0 { + if _, err := q.Pop(ctx); err != nil { + t.Fatalf("cleanup pop failed: %v", err) + } + } +} From 64fa67e4d86a72d3b92ddce5ec37864db56878d8 
Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 17:08:01 +0200 Subject: [PATCH 10/55] Expose queue health in runtime stats --- cmd/fmrtx/main.go | 1 + cmd/fmrtx/main_test.go | 36 +++++++++++++++++++++++++ docs/pro-runtime-hardening-workboard.md | 3 ++- 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 cmd/fmrtx/main_test.go diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index 9e7a5fa..573070b 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -254,6 +254,7 @@ func (b *txBridge) TXStats() map[string]any { "maxGenerateMs": s.MaxGenerateMs, "maxUpsampleMs": s.MaxUpsampleMs, "maxWriteMs": s.MaxWriteMs, + "queue": s.Queue, } } func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error { diff --git a/cmd/fmrtx/main_test.go b/cmd/fmrtx/main_test.go new file mode 100644 index 0000000..b2bd2dd --- /dev/null +++ b/cmd/fmrtx/main_test.go @@ -0,0 +1,36 @@ +package main + +import ( + "testing" + + apppkg "github.com/jan/fm-rds-tx/internal/app" + cfgpkg "github.com/jan/fm-rds-tx/internal/config" + "github.com/jan/fm-rds-tx/internal/output" + "github.com/jan/fm-rds-tx/internal/platform" +) + +func TestTxBridgeExportsQueueStats(t *testing.T) { + cfg := cfgpkg.Default() + driver := platform.NewSimulatedDriver(nil) + engine := apppkg.NewEngine(cfg, driver) + bridge := &txBridge{engine: engine} + stats := bridge.TXStats() + + raw, ok := stats["queue"] + if !ok { + t.Fatalf("expected queue stats in tx stats") + } + + queue, ok := raw.(output.QueueStats) + if !ok { + t.Fatalf("queue stats type mismatch: %T", raw) + } + + if queue.Capacity != cfg.Runtime.FrameQueueCapacity { + t.Fatalf("unexpected queue capacity: want %d got %d", cfg.Runtime.FrameQueueCapacity, queue.Capacity) + } + + if queue.Health != output.QueueHealthCritical { + t.Fatalf("queue health should be critical with empty queue, got %s", queue.Health) + } +} diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 202f7a8..f1eb60e 
100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -247,13 +247,14 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, | Datum | Entscheidung | Notiz | |---|---|---| | 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. | -| 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurck und ist ber `EngineStats.Queue` im Runtime-Endpunkt sichtbar. | +| 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurück und `txBridge` leitet `EngineStats.Queue` ins `/runtime`-JSON. | ## WS-01 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | FrameQueue + Engine integration | ✅ `go test ./...` (im `internal`-Modul incl. `frame_queue_test.go`) | | 2026-04-05 | Queue-Health-Indikator | go test ./... deckt `TestFrameQueueHealthIndicator` und `queue.health` ab. | +| 2026-04-05 | Runtime API queue health | ✅ `/runtime` liefert jetzt `engine.queue.health` dank `txBridge.TXStats`. 
| --- From 601b03b21debf568a79dbe0bf30d0bf7ebc6901d Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 17:12:21 +0200 Subject: [PATCH 11/55] test: fix queue health test type mismatch --- internal/output/frame_queue_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/output/frame_queue_test.go b/internal/output/frame_queue_test.go index 86c0e8e..cb467ec 100644 --- a/internal/output/frame_queue_test.go +++ b/internal/output/frame_queue_test.go @@ -90,7 +90,7 @@ func TestFrameQueueHealthIndicator(t *testing.T) { t.Fatalf("expected initial health critical, got %s", stats.Health) } - push := func(seq int) { + push := func(seq uint64) { frame := &CompositeFrame{Sequence: seq} if err := q.Push(ctx, frame); err != nil { t.Fatalf("push %d failed: %v", seq, err) From 58364659e9597e8a5edc935273b22955277d94e1 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 17:30:32 +0200 Subject: [PATCH 12/55] Add runtime indicator based on queue signals --- cmd/fmrtx/main.go | 25 ++++----- cmd/fmrtx/main_test.go | 12 +++++ docs/pro-runtime-hardening-workboard.md | 2 + internal/app/engine.go | 71 ++++++++++++++++--------- internal/app/runtime_indicator_test.go | 57 ++++++++++++++++++++ 5 files changed, 131 insertions(+), 36 deletions(-) create mode 100644 internal/app/runtime_indicator_test.go diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index 573070b..7c2a37b 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -243,18 +243,19 @@ func (b *txBridge) StopTX() error { return b.engine.Stop(context.Background()) func (b *txBridge) TXStats() map[string]any { s := b.engine.Stats() return map[string]any{ - "state": s.State, - "chunksProduced": s.ChunksProduced, - "totalSamples": s.TotalSamples, - "underruns": s.Underruns, - "lateBuffers": s.LateBuffers, - "lastError": s.LastError, - "uptimeSeconds": s.UptimeSeconds, - "maxCycleMs": s.MaxCycleMs, - "maxGenerateMs": s.MaxGenerateMs, - "maxUpsampleMs": s.MaxUpsampleMs, - 
"maxWriteMs": s.MaxWriteMs, - "queue": s.Queue, + "state": s.State, + "chunksProduced": s.ChunksProduced, + "totalSamples": s.TotalSamples, + "underruns": s.Underruns, + "lateBuffers": s.LateBuffers, + "lastError": s.LastError, + "uptimeSeconds": s.UptimeSeconds, + "maxCycleMs": s.MaxCycleMs, + "maxGenerateMs": s.MaxGenerateMs, + "maxUpsampleMs": s.MaxUpsampleMs, + "maxWriteMs": s.MaxWriteMs, + "queue": s.Queue, + "runtimeIndicator": s.RuntimeIndicator, } } func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error { diff --git a/cmd/fmrtx/main_test.go b/cmd/fmrtx/main_test.go index b2bd2dd..43bc67f 100644 --- a/cmd/fmrtx/main_test.go +++ b/cmd/fmrtx/main_test.go @@ -33,4 +33,16 @@ func TestTxBridgeExportsQueueStats(t *testing.T) { if queue.Health != output.QueueHealthCritical { t.Fatalf("queue health should be critical with empty queue, got %s", queue.Health) } + + indicatorRaw, ok := stats["runtimeIndicator"] + if !ok { + t.Fatalf("expected runtimeIndicator in tx stats") + } + indicator, ok := indicatorRaw.(apppkg.RuntimeIndicator) + if !ok { + t.Fatalf("runtimeIndicator type mismatch: %T", indicatorRaw) + } + if indicator != apppkg.RuntimeIndicatorQueueCritical { + t.Fatalf("runtime indicator should be queueCritical, got %s", indicator) + } } diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index f1eb60e..f1618a3 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -248,12 +248,14 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, |---|---|---| | 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. | | 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurück und `txBridge` leitet `EngineStats.Queue` ins `/runtime`-JSON. 
| +| 2026-04-05 | Runtime-Indikator | `EngineStats.RuntimeIndicator` kombiniert `queue.health` + `lateBuffers`, `/runtime` zeigt `engine.runtimeIndicator`. | ## WS-01 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | FrameQueue + Engine integration | ✅ `go test ./...` (im `internal`-Modul incl. `frame_queue_test.go`) | | 2026-04-05 | Queue-Health-Indikator | go test ./... deckt `TestFrameQueueHealthIndicator` und `queue.health` ab. | +| 2026-04-05 | Runtime-Indikator | OK `go test ./...` deckt `runtimeIndicator` sowie `/runtime`-Exposition von `engine.runtimeIndicator`. | | 2026-04-05 | Runtime API queue health | ✅ `/runtime` liefert jetzt `engine.queue.health` dank `txBridge.TXStats`. | --- diff --git a/internal/app/engine.go b/internal/app/engine.go index f1920d1..4395b46 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -56,20 +56,29 @@ func durationMs(ns uint64) float64 { } type EngineStats struct { - State string `json:"state"` - ChunksProduced uint64 `json:"chunksProduced"` - TotalSamples uint64 `json:"totalSamples"` - Underruns uint64 `json:"underruns"` - LateBuffers uint64 `json:"lateBuffers,omitempty"` - LastError string `json:"lastError,omitempty"` - UptimeSeconds float64 `json:"uptimeSeconds"` - MaxCycleMs float64 `json:"maxCycleMs,omitempty"` - MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` - MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` - MaxWriteMs float64 `json:"maxWriteMs,omitempty"` - Queue output.QueueStats `json:"queue"` + State string `json:"state"` + ChunksProduced uint64 `json:"chunksProduced"` + TotalSamples uint64 `json:"totalSamples"` + Underruns uint64 `json:"underruns"` + LateBuffers uint64 `json:"lateBuffers,omitempty"` + LastError string `json:"lastError,omitempty"` + UptimeSeconds float64 `json:"uptimeSeconds"` + MaxCycleMs float64 `json:"maxCycleMs,omitempty"` + MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` + MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` + 
MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + Queue output.QueueStats `json:"queue"` + RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` } +type RuntimeIndicator string + +const ( + RuntimeIndicatorNormal RuntimeIndicator = "normal" + RuntimeIndicatorDegraded RuntimeIndicator = "degraded" + RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical" +) + // Engine is the continuous TX loop. It generates composite IQ in chunks, // resamples to device rate, and pushes to hardware in a tight loop. // The hardware buffer_push call is blocking — it returns when the hardware @@ -339,19 +348,33 @@ func (e *Engine) Stats() EngineStats { } errVal, _ := e.lastError.Load().(string) + queue := e.frameQueue.Stats() + lateBuffers := e.lateBuffers.Load() return EngineStats{ - State: state.String(), - ChunksProduced: e.chunksProduced.Load(), - TotalSamples: e.totalSamples.Load(), - Underruns: e.underruns.Load(), - LateBuffers: e.lateBuffers.Load(), - LastError: errVal, - UptimeSeconds: uptime, - MaxCycleMs: durationMs(e.maxCycleNs.Load()), - MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), - MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), - MaxWriteMs: durationMs(e.maxWriteNs.Load()), - Queue: e.frameQueue.Stats(), + State: state.String(), + ChunksProduced: e.chunksProduced.Load(), + TotalSamples: e.totalSamples.Load(), + Underruns: e.underruns.Load(), + LateBuffers: lateBuffers, + LastError: errVal, + UptimeSeconds: uptime, + MaxCycleMs: durationMs(e.maxCycleNs.Load()), + MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), + MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), + MaxWriteMs: durationMs(e.maxWriteNs.Load()), + Queue: queue, + RuntimeIndicator: runtimeIndicator(queue.Health, lateBuffers), + } +} + +func runtimeIndicator(queueHealth output.QueueHealth, lateBuffers uint64) RuntimeIndicator { + switch { + case queueHealth == output.QueueHealthCritical: + return RuntimeIndicatorQueueCritical + case queueHealth == output.QueueHealthLow || 
lateBuffers > 0: + return RuntimeIndicatorDegraded + default: + return RuntimeIndicatorNormal } } diff --git a/internal/app/runtime_indicator_test.go b/internal/app/runtime_indicator_test.go new file mode 100644 index 0000000..b90aca2 --- /dev/null +++ b/internal/app/runtime_indicator_test.go @@ -0,0 +1,57 @@ +package app + +import ( + "testing" + + "github.com/jan/fm-rds-tx/internal/output" +) + +func TestRuntimeIndicator(t *testing.T) { + cases := []struct { + name string + queueHealth output.QueueHealth + lateBuffers uint64 + want RuntimeIndicator + }{ + { + name: "normal", + queueHealth: output.QueueHealthNormal, + lateBuffers: 0, + want: RuntimeIndicatorNormal, + }, + { + name: "degradedLateBuffers", + queueHealth: output.QueueHealthNormal, + lateBuffers: 1, + want: RuntimeIndicatorDegraded, + }, + { + name: "degradedQueueLow", + queueHealth: output.QueueHealthLow, + lateBuffers: 0, + want: RuntimeIndicatorDegraded, + }, + { + name: "queueCritical", + queueHealth: output.QueueHealthCritical, + lateBuffers: 0, + want: RuntimeIndicatorQueueCritical, + }, + { + name: "criticalLateBuffers", + queueHealth: output.QueueHealthCritical, + lateBuffers: 3, + want: RuntimeIndicatorQueueCritical, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + if got := runtimeIndicator(tc.queueHealth, tc.lateBuffers); got != tc.want { + t.Fatalf("runtime indicator mismatch: queue=%s late=%d want=%s got=%s", + tc.queueHealth, tc.lateBuffers, tc.want, got) + } + }) + } +} From d39d59f1eddd1a1f96bb5b328fe28e2bc59359f4 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 17:33:51 +0200 Subject: [PATCH 13/55] Expose runtime indicator in status --- docs/pro-runtime-hardening-workboard.md | 2 + internal/control/control.go | 76 ++++++++++++++++++------- internal/control/control_test.go | 64 ++++++++++++++++----- 3 files changed, 110 insertions(+), 32 deletions(-) diff --git a/docs/pro-runtime-hardening-workboard.md 
b/docs/pro-runtime-hardening-workboard.md index f1618a3..6854400 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -249,6 +249,7 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, | 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. | | 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurück und `txBridge` leitet `EngineStats.Queue` ins `/runtime`-JSON. | | 2026-04-05 | Runtime-Indikator | `EngineStats.RuntimeIndicator` kombiniert `queue.health` + `lateBuffers`, `/runtime` zeigt `engine.runtimeIndicator`. | +| 2026-04-05 | /status runtime indicator | `/status` reuses `txBridge.TXStats()` and now reports `runtimeIndicator` alongside the config snapshot for quick ops. | ## WS-01 Verifikation | Datum | Fokus | Ergebnis | @@ -257,6 +258,7 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, | 2026-04-05 | Queue-Health-Indikator | go test ./... deckt `TestFrameQueueHealthIndicator` und `queue.health` ab. | | 2026-04-05 | Runtime-Indikator | OK `go test ./...` deckt `runtimeIndicator` sowie `/runtime`-Exposition von `engine.runtimeIndicator`. | | 2026-04-05 | Runtime API queue health | ✅ `/runtime` liefert jetzt `engine.queue.health` dank `txBridge.TXStats`. | +| 2026-04-05 | /status runtime indicator | ✅ `/status` gibt jetzt `runtimeIndicator` aus (`control_test` deckt den neuen Key). 
| --- diff --git a/internal/control/control.go b/internal/control/control.go index 9f3420a..9e750ff 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -44,8 +44,8 @@ type Server struct { mu sync.RWMutex cfg config.Config tx TXController - drv platform.SoapyDriver // optional, for runtime stats - streamSrc *audio.StreamSource // optional, for live audio ingest + drv platform.SoapyDriver // optional, for runtime stats + streamSrc *audio.StreamSource // optional, for live audio ingest } type ConfigPatch struct { @@ -119,10 +119,10 @@ func (s *Server) handleUI(w http.ResponseWriter, r *http.Request) { func (s *Server) handleStatus(w http.ResponseWriter, _ *http.Request) { s.mu.RLock() cfg := s.cfg + tx := s.tx s.mu.RUnlock() - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(map[string]any{ + status := map[string]any{ "service": "fm-rds-tx", "backend": cfg.Backend.Kind, "frequencyMHz": cfg.FM.FrequencyMHz, @@ -131,7 +131,17 @@ func (s *Server) handleStatus(w http.ResponseWriter, _ *http.Request) { "preEmphasisTauUS": cfg.FM.PreEmphasisTauUS, "limiterEnabled": cfg.FM.LimiterEnabled, "fmModulationEnabled": cfg.FM.FMModulationEnabled, - }) + } + if tx != nil { + if stats := tx.TXStats(); stats != nil { + if ri, ok := stats["runtimeIndicator"]; ok { + status["runtimeIndicator"] = ri + } + } + } + + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(status) } func (s *Server) handleRuntime(w http.ResponseWriter, _ *http.Request) { @@ -264,20 +274,48 @@ func (s *Server) handleConfig(w http.ResponseWriter, r *http.Request) { // Update the server's config snapshot (for GET /config and /status) s.mu.Lock() next := s.cfg - if patch.FrequencyMHz != nil { next.FM.FrequencyMHz = *patch.FrequencyMHz } - if patch.OutputDrive != nil { next.FM.OutputDrive = *patch.OutputDrive } - if patch.ToneLeftHz != nil { next.Audio.ToneLeftHz = *patch.ToneLeftHz } - if patch.ToneRightHz != nil { 
next.Audio.ToneRightHz = *patch.ToneRightHz } - if patch.ToneAmplitude != nil { next.Audio.ToneAmplitude = *patch.ToneAmplitude } - if patch.PS != nil { next.RDS.PS = *patch.PS } - if patch.RadioText != nil { next.RDS.RadioText = *patch.RadioText } - if patch.PreEmphasisTauUS != nil { next.FM.PreEmphasisTauUS = *patch.PreEmphasisTauUS } - if patch.StereoEnabled != nil { next.FM.StereoEnabled = *patch.StereoEnabled } - if patch.LimiterEnabled != nil { next.FM.LimiterEnabled = *patch.LimiterEnabled } - if patch.LimiterCeiling != nil { next.FM.LimiterCeiling = *patch.LimiterCeiling } - if patch.RDSEnabled != nil { next.RDS.Enabled = *patch.RDSEnabled } - if patch.PilotLevel != nil { next.FM.PilotLevel = *patch.PilotLevel } - if patch.RDSInjection != nil { next.FM.RDSInjection = *patch.RDSInjection } + if patch.FrequencyMHz != nil { + next.FM.FrequencyMHz = *patch.FrequencyMHz + } + if patch.OutputDrive != nil { + next.FM.OutputDrive = *patch.OutputDrive + } + if patch.ToneLeftHz != nil { + next.Audio.ToneLeftHz = *patch.ToneLeftHz + } + if patch.ToneRightHz != nil { + next.Audio.ToneRightHz = *patch.ToneRightHz + } + if patch.ToneAmplitude != nil { + next.Audio.ToneAmplitude = *patch.ToneAmplitude + } + if patch.PS != nil { + next.RDS.PS = *patch.PS + } + if patch.RadioText != nil { + next.RDS.RadioText = *patch.RadioText + } + if patch.PreEmphasisTauUS != nil { + next.FM.PreEmphasisTauUS = *patch.PreEmphasisTauUS + } + if patch.StereoEnabled != nil { + next.FM.StereoEnabled = *patch.StereoEnabled + } + if patch.LimiterEnabled != nil { + next.FM.LimiterEnabled = *patch.LimiterEnabled + } + if patch.LimiterCeiling != nil { + next.FM.LimiterCeiling = *patch.LimiterCeiling + } + if patch.RDSEnabled != nil { + next.RDS.Enabled = *patch.RDSEnabled + } + if patch.PilotLevel != nil { + next.FM.PilotLevel = *patch.PilotLevel + } + if patch.RDSInjection != nil { + next.FM.RDSInjection = *patch.RDSInjection + } if err := next.Validate(); err != nil { s.mu.Unlock() http.Error(w, 
err.Error(), http.StatusBadRequest) diff --git a/internal/control/control_test.go b/internal/control/control_test.go index fc01438..2f92406 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -15,28 +15,55 @@ func TestHealthz(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/healthz", nil)) - if rec.Code != 200 { t.Fatalf("status: %d", rec.Code) } + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } } func TestStatus(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/status", nil)) - if rec.Code != 200 { t.Fatalf("status: %d", rec.Code) } + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } + var body map[string]any + json.Unmarshal(rec.Body.Bytes(), &body) + if body["service"] != "fm-rds-tx" { + t.Fatal("missing service") + } + if _, ok := body["preEmphasisTauUS"]; !ok { + t.Fatal("missing preEmphasisTauUS") + } +} + +func TestStatusReportsRuntimeIndicator(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{stats: map[string]any{"runtimeIndicator": "degraded"}}) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/status", nil)) + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } var body map[string]any json.Unmarshal(rec.Body.Bytes(), &body) - if body["service"] != "fm-rds-tx" { t.Fatal("missing service") } - if _, ok := body["preEmphasisTauUS"]; !ok { t.Fatal("missing preEmphasisTauUS") } + if body["runtimeIndicator"] != "degraded" { + t.Fatalf("expected runtimeIndicator degraded, got %v", body["runtimeIndicator"]) + } } func TestDryRunEndpoint(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/dry-run", nil)) - if rec.Code 
!= 200 { t.Fatalf("status: %d", rec.Code) } + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } var body map[string]any json.Unmarshal(rec.Body.Bytes(), &body) - if body["mode"] != "dry-run" { t.Fatal("wrong mode") } + if body["mode"] != "dry-run" { + t.Fatal("wrong mode") + } } func TestConfigPatch(t *testing.T) { @@ -44,21 +71,27 @@ func TestConfigPatch(t *testing.T) { body := []byte(`{"toneLeftHz":900,"radioText":"hello world","preEmphasisTauUS":75}`) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body))) - if rec.Code != 200 { t.Fatalf("status: %d body=%s", rec.Code, rec.Body.String()) } + if rec.Code != 200 { + t.Fatalf("status: %d body=%s", rec.Code, rec.Body.String()) + } } func TestRuntimeWithoutDriver(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/runtime", nil)) - if rec.Code != 200 { t.Fatalf("status: %d", rec.Code) } + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } } func TestTXStartWithoutController(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/tx/start", nil)) - if rec.Code != http.StatusServiceUnavailable { t.Fatalf("expected 503, got %d", rec.Code) } + if rec.Code != http.StatusServiceUnavailable { + t.Fatalf("expected 503, got %d", rec.Code) + } } func TestConfigPatchUpdatesSnapshot(t *testing.T) { @@ -114,10 +147,15 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) { type fakeTXController struct { updateErr error + stats map[string]any } -func (f *fakeTXController) StartTX() error { return nil } -func (f *fakeTXController) StopTX() error { return nil } -func (f *fakeTXController) TXStats() map[string]any { return map[string]any{} } +func (f *fakeTXController) StartTX() error { return nil } +func (f *fakeTXController) StopTX() 
error { return nil } +func (f *fakeTXController) TXStats() map[string]any { + if f.stats != nil { + return f.stats + } + return map[string]any{} +} func (f *fakeTXController) UpdateConfig(_ LivePatch) error { return f.updateErr } - From b3e9f7bf458cb60a5452b424af4ba4a4bf2d6ae1 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 17:43:24 +0200 Subject: [PATCH 14/55] feat: add runtime health alert --- cmd/fmrtx/main.go | 1 + internal/app/engine.go | 18 +++++++++- internal/app/runtime_indicator_test.go | 50 ++++++++++++++++++++++++++ internal/control/control.go | 3 ++ internal/control/control_test.go | 5 ++- 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index 7c2a37b..d839d65 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -256,6 +256,7 @@ func (b *txBridge) TXStats() map[string]any { "maxWriteMs": s.MaxWriteMs, "queue": s.Queue, "runtimeIndicator": s.RuntimeIndicator, + "runtimeAlert": s.RuntimeAlert, } } func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error { diff --git a/internal/app/engine.go b/internal/app/engine.go index 4395b46..008616d 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -69,6 +69,7 @@ type EngineStats struct { MaxWriteMs float64 `json:"maxWriteMs,omitempty"` Queue output.QueueStats `json:"queue"` RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` + RuntimeAlert string `json:"runtimeAlert,omitempty"` } type RuntimeIndicator string @@ -350,6 +351,7 @@ func (e *Engine) Stats() EngineStats { queue := e.frameQueue.Stats() lateBuffers := e.lateBuffers.Load() + ri := runtimeIndicator(queue.Health, lateBuffers) return EngineStats{ State: state.String(), ChunksProduced: e.chunksProduced.Load(), @@ -363,7 +365,8 @@ func (e *Engine) Stats() EngineStats { MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), MaxWriteMs: durationMs(e.maxWriteNs.Load()), Queue: queue, - RuntimeIndicator: runtimeIndicator(queue.Health, lateBuffers), + RuntimeIndicator: 
ri, + RuntimeAlert: runtimeAlert(queue.Health, lateBuffers), } } @@ -378,6 +381,19 @@ func runtimeIndicator(queueHealth output.QueueHealth, lateBuffers uint64) Runtim } } +func runtimeAlert(queueHealth output.QueueHealth, lateBuffers uint64) string { + switch { + case queueHealth == output.QueueHealthCritical: + return "queue health critical" + case lateBuffers > 0: + return "late buffers" + case queueHealth == output.QueueHealthLow: + return "queue health low" + default: + return "" + } +} + func (e *Engine) run(ctx context.Context) { e.wg.Add(1) go e.writerLoop(ctx) diff --git a/internal/app/runtime_indicator_test.go b/internal/app/runtime_indicator_test.go index b90aca2..44a825e 100644 --- a/internal/app/runtime_indicator_test.go +++ b/internal/app/runtime_indicator_test.go @@ -55,3 +55,53 @@ func TestRuntimeIndicator(t *testing.T) { }) } } + +func TestRuntimeAlert(t *testing.T) { + cases := []struct { + name string + queueHealth output.QueueHealth + lateBuffers uint64 + want string + }{ + { + name: "normal", + queueHealth: output.QueueHealthNormal, + lateBuffers: 0, + want: "", + }, + { + name: "lateBuffers", + queueHealth: output.QueueHealthNormal, + lateBuffers: 1, + want: "late buffers", + }, + { + name: "queueLow", + queueHealth: output.QueueHealthLow, + lateBuffers: 0, + want: "queue health low", + }, + { + name: "queueCritical", + queueHealth: output.QueueHealthCritical, + lateBuffers: 0, + want: "queue health critical", + }, + { + name: "criticalLateBuffers", + queueHealth: output.QueueHealthCritical, + lateBuffers: 5, + want: "queue health critical", + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + if got := runtimeAlert(tc.queueHealth, tc.lateBuffers); got != tc.want { + t.Fatalf("runtime alert mismatch: queue=%s late=%d want=%q got=%q", + tc.queueHealth, tc.lateBuffers, tc.want, got) + } + }) + } +} diff --git a/internal/control/control.go b/internal/control/control.go index 9e750ff..278ec6a 100644 --- 
a/internal/control/control.go +++ b/internal/control/control.go @@ -137,6 +137,9 @@ func (s *Server) handleStatus(w http.ResponseWriter, _ *http.Request) { if ri, ok := stats["runtimeIndicator"]; ok { status["runtimeIndicator"] = ri } + if alert, ok := stats["runtimeAlert"]; ok { + status["runtimeAlert"] = alert + } } } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 2f92406..1a70684 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -39,7 +39,7 @@ func TestStatus(t *testing.T) { func TestStatusReportsRuntimeIndicator(t *testing.T) { srv := NewServer(cfgpkg.Default()) - srv.SetTXController(&fakeTXController{stats: map[string]any{"runtimeIndicator": "degraded"}}) + srv.SetTXController(&fakeTXController{stats: map[string]any{"runtimeIndicator": "degraded", "runtimeAlert": "late buffers"}}) rec := httptest.NewRecorder() srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/status", nil)) if rec.Code != 200 { @@ -50,6 +50,9 @@ func TestStatusReportsRuntimeIndicator(t *testing.T) { if body["runtimeIndicator"] != "degraded" { t.Fatalf("expected runtimeIndicator degraded, got %v", body["runtimeIndicator"]) } + if body["runtimeAlert"] != "late buffers" { + t.Fatalf("expected runtimeAlert late buffers, got %v", body["runtimeAlert"]) + } } func TestDryRunEndpoint(t *testing.T) { From b56012ab15090e73c1e876fcdd1ddde804b7eec9 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 17:48:20 +0200 Subject: [PATCH 15/55] Show runtime indicator in control UI --- internal/control/ui.html | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/internal/control/ui.html b/internal/control/ui.html index 8445eca..ff13122 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1077,6 +1077,8 @@ input.input-error {
HTTP
--
Runtime
--
+
Runtime Signal
--
+
Runtime Alert
--
Audio Buffer
--
Last Update
--
@@ -1714,7 +1716,7 @@ function render() { updateText('info-fmmod', fmtBool(cfg.fm?.fmModulationEnabled)); updateText('info-live', engine.state ? `${String(engine.state).toUpperCase()} / ${state.server.runtimeOk ? 'runtime ok' : 'runtime pending'}` : (state.server.configOk ? 'config only' : '--')); - updateHealth(audioStream); + updateHealth(engine, audioStream); updateMeters(engine, driver, audioStream); drawSparkline('spark-audio', state.charts.audio, 'good', 1); drawSparkline('spark-underruns', state.charts.underruns, underruns > 0 ? 'err' : 'warn'); @@ -1731,7 +1733,8 @@ function renderToggle(key, toggleId, labelId) { updateText(labelId, busy ? '...' : (on ? 'ON' : 'OFF')); } -function updateHealth(audioStream) { +function updateHealth(engine, audioStream) { + engine = engine || {}; updateText('health-http', state.server.configOk ? 'OK' : 'OFFLINE'); $('health-http').className = 'val ' + (state.server.configOk ? 'good' : 'err'); @@ -1739,6 +1742,31 @@ function updateHealth(audioStream) { updateText('health-runtime', runtimeState); $('health-runtime').className = 'val ' + (state.server.runtimeOk ? 'good' : 'warn'); + const runtimeIndicator = engine.runtimeIndicator; + const indicatorLabels = { + normal: 'Normal', + degraded: 'Degraded', + queueCritical: 'Queue critical', + }; + const indicatorText = indicatorLabels[runtimeIndicator] || (runtimeIndicator ? runtimeIndicator : '--'); + let indicatorSeverity = ''; + if (runtimeIndicator === 'queueCritical') indicatorSeverity = 'err'; + else if (runtimeIndicator === 'degraded') indicatorSeverity = 'warn'; + else if (runtimeIndicator === 'normal') indicatorSeverity = 'good'; + const indicatorEl = $('health-indicator'); + if (indicatorEl) { + indicatorEl.className = 'val' + (indicatorSeverity ? 
' ' + indicatorSeverity : ''); + } + updateText('health-indicator', indicatorText); + + const runtimeAlertRaw = (engine.runtimeAlert || '').trim(); + const hasAlert = !!runtimeAlertRaw; + const alertEl = $('health-alert'); + if (alertEl) { + alertEl.className = 'val ' + (hasAlert ? 'warn' : 'good'); + } + updateText('health-alert', hasAlert ? runtimeAlertRaw : 'None'); + let audioLabel = 'N/A'; let audioClass = 'val'; if (audioStream) { From fb21dec0ed0bff50db7248be626e38e819967ecf Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:00:59 +0200 Subject: [PATCH 16/55] Expose queue stats via status endpoint --- docs/API.md | 12 +++++++-- docs/pro-runtime-hardening-workboard.md | 2 ++ internal/control/control.go | 3 +++ internal/control/control_test.go | 36 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/docs/API.md b/docs/API.md index 78e0122..bd51eb2 100644 --- a/docs/API.md +++ b/docs/API.md @@ -19,7 +19,7 @@ Health check. ### `GET /status` -Current transmitter status (read-only snapshot). +Current transmitter status (read-only snapshot). Runtime indicator, alert, and queue stats from the running TX controller are mirrored here for quick health checks. **Response:** ```json @@ -31,7 +31,15 @@ Current transmitter status (read-only snapshot). 
"rdsEnabled": true, "preEmphasisTauUS": 50, "limiterEnabled": true, - "fmModulationEnabled": true + "fmModulationEnabled": true, + "runtimeIndicator": "normal", + "runtimeAlert": "", + "queue": { + "capacity": 3, + "depth": 1, + "fillLevel": 0.33, + "health": "low" + } } ``` diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 6854400..e2cd6af 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -250,6 +250,7 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, | 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurück und `txBridge` leitet `EngineStats.Queue` ins `/runtime`-JSON. | | 2026-04-05 | Runtime-Indikator | `EngineStats.RuntimeIndicator` kombiniert `queue.health` + `lateBuffers`, `/runtime` zeigt `engine.runtimeIndicator`. | | 2026-04-05 | /status runtime indicator | `/status` reuses `txBridge.TXStats()` and now reports `runtimeIndicator` alongside the config snapshot for quick ops. | +| 2026-04-05 | /status queue stats | `/status` spiegelt das `queue`-Objekt aus `txBridge.TXStats()` für schnelle Queue-Checks, API-Doku und `TestStatusReportsQueueStats` fangen den neuen Key ab. | ## WS-01 Verifikation | Datum | Fokus | Ergebnis | @@ -259,6 +260,7 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, | 2026-04-05 | Runtime-Indikator | OK `go test ./...` deckt `runtimeIndicator` sowie `/runtime`-Exposition von `engine.runtimeIndicator`. | | 2026-04-05 | Runtime API queue health | ✅ `/runtime` liefert jetzt `engine.queue.health` dank `txBridge.TXStats`. | | 2026-04-05 | /status runtime indicator | ✅ `/status` gibt jetzt `runtimeIndicator` aus (`control_test` deckt den neuen Key). | +| 2026-04-05 | /status queue stats | ✅ `TestStatusReportsQueueStats` plus `docs/API.md` zeigen, dass `queue` korrekt durchgereicht wird. 
| --- diff --git a/internal/control/control.go b/internal/control/control.go index 278ec6a..823a8af 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -140,6 +140,9 @@ func (s *Server) handleStatus(w http.ResponseWriter, _ *http.Request) { if alert, ok := stats["runtimeAlert"]; ok { status["runtimeAlert"] = alert } + if queue, ok := stats["queue"]; ok { + status["queue"] = queue + } } } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 1a70684..93fc508 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -9,6 +9,7 @@ import ( "testing" cfgpkg "github.com/jan/fm-rds-tx/internal/config" + "github.com/jan/fm-rds-tx/internal/output" ) func TestHealthz(t *testing.T) { @@ -55,6 +56,41 @@ func TestStatusReportsRuntimeIndicator(t *testing.T) { } } +func TestStatusReportsQueueStats(t *testing.T) { + cfg := cfgpkg.Default() + queueStats := output.QueueStats{ + Capacity: cfg.Runtime.FrameQueueCapacity, + Depth: 1, + FillLevel: 0.25, + Health: output.QueueHealthLow, + } + srv := NewServer(cfg) + srv.SetTXController(&fakeTXController{stats: map[string]any{"queue": queueStats}}) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/status", nil)) + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal queue stats: %v", err) + } + queueRaw, ok := body["queue"] + if !ok { + t.Fatalf("missing queue in status") + } + queueMap, ok := queueRaw.(map[string]any) + if !ok { + t.Fatalf("queue stats type mismatch: %T", queueRaw) + } + if queueMap["capacity"] != float64(queueStats.Capacity) { + t.Fatalf("queue capacity mismatch: want %v got %v", queueStats.Capacity, queueMap["capacity"]) + } + if queueMap["health"] != string(queueStats.Health) { + t.Fatalf("queue health mismatch: want %s got %v", queueStats.Health, 
queueMap["health"]) + } +} + func TestDryRunEndpoint(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() From f388a9153af3048fee55b61a83e2a4b212c07b2d Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:07:48 +0200 Subject: [PATCH 17/55] test: cover runtime indicator and document semantics --- docs/API.md | 2 + internal/app/runtime_indicator_test.go | 112 ++++++++----------------- 2 files changed, 35 insertions(+), 79 deletions(-) diff --git a/docs/API.md b/docs/API.md index bd51eb2..b8513f3 100644 --- a/docs/API.md +++ b/docs/API.md @@ -43,6 +43,8 @@ Current transmitter status (read-only snapshot). Runtime indicator, alert, and q } ``` +`runtimeIndicator` is derived from the engine queue health plus any late buffers and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", otherwise it stays empty. + --- ### `GET /runtime` diff --git a/internal/app/runtime_indicator_test.go b/internal/app/runtime_indicator_test.go index 44a825e..22ca93b 100644 --- a/internal/app/runtime_indicator_test.go +++ b/internal/app/runtime_indicator_test.go @@ -6,101 +6,55 @@ import ( "github.com/jan/fm-rds-tx/internal/output" ) -func TestRuntimeIndicator(t *testing.T) { +func TestRuntimeIndicatorAndAlert(t *testing.T) { cases := []struct { - name string - queueHealth output.QueueHealth - lateBuffers uint64 - want RuntimeIndicator + name string + health output.QueueHealth + lateBuffers uint64 + wantIndicator RuntimeIndicator + wantAlert string }{ { - name: "normal", - queueHealth: output.QueueHealthNormal, - lateBuffers: 0, - want: RuntimeIndicatorNormal, + name: "queue critical", + health: output.QueueHealthCritical, + lateBuffers: 0, + wantIndicator: RuntimeIndicatorQueueCritical, + wantAlert: "queue health critical", }, { - name: "degradedLateBuffers", - queueHealth: output.QueueHealthNormal, - lateBuffers: 1, - want: 
RuntimeIndicatorDegraded, + name: "queue low", + health: output.QueueHealthLow, + lateBuffers: 0, + wantIndicator: RuntimeIndicatorDegraded, + wantAlert: "queue health low", }, { - name: "degradedQueueLow", - queueHealth: output.QueueHealthLow, - lateBuffers: 0, - want: RuntimeIndicatorDegraded, + name: "late buffers", + health: output.QueueHealthNormal, + lateBuffers: 2, + wantIndicator: RuntimeIndicatorDegraded, + wantAlert: "late buffers", }, { - name: "queueCritical", - queueHealth: output.QueueHealthCritical, - lateBuffers: 0, - want: RuntimeIndicatorQueueCritical, - }, - { - name: "criticalLateBuffers", - queueHealth: output.QueueHealthCritical, - lateBuffers: 3, - want: RuntimeIndicatorQueueCritical, + name: "normal", + health: output.QueueHealthNormal, + lateBuffers: 0, + wantIndicator: RuntimeIndicatorNormal, + wantAlert: "", }, } for _, tc := range cases { tc := tc t.Run(tc.name, func(t *testing.T) { - if got := runtimeIndicator(tc.queueHealth, tc.lateBuffers); got != tc.want { - t.Fatalf("runtime indicator mismatch: queue=%s late=%d want=%s got=%s", - tc.queueHealth, tc.lateBuffers, tc.want, got) + t.Parallel() + got := runtimeIndicator(tc.health, tc.lateBuffers) + if got != tc.wantIndicator { + t.Fatalf("indicator: expected %s, got %s", tc.wantIndicator, got) } - }) - } -} - -func TestRuntimeAlert(t *testing.T) { - cases := []struct { - name string - queueHealth output.QueueHealth - lateBuffers uint64 - want string - }{ - { - name: "normal", - queueHealth: output.QueueHealthNormal, - lateBuffers: 0, - want: "", - }, - { - name: "lateBuffers", - queueHealth: output.QueueHealthNormal, - lateBuffers: 1, - want: "late buffers", - }, - { - name: "queueLow", - queueHealth: output.QueueHealthLow, - lateBuffers: 0, - want: "queue health low", - }, - { - name: "queueCritical", - queueHealth: output.QueueHealthCritical, - lateBuffers: 0, - want: "queue health critical", - }, - { - name: "criticalLateBuffers", - queueHealth: output.QueueHealthCritical, - 
lateBuffers: 5, - want: "queue health critical", - }, - } - - for _, tc := range cases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - if got := runtimeAlert(tc.queueHealth, tc.lateBuffers); got != tc.want { - t.Fatalf("runtime alert mismatch: queue=%s late=%d want=%q got=%q", - tc.queueHealth, tc.lateBuffers, tc.want, got) + alert := runtimeAlert(tc.health, tc.lateBuffers) + if alert != tc.wantAlert { + t.Fatalf("alert: expected %q, got %q", tc.wantAlert, alert) } }) } From 38a6cf3d70e654c8ca30a6a927d1035e9ddd40d8 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:19:33 +0200 Subject: [PATCH 18/55] Make runtime indicator drop stale late alerts --- docs/API.md | 2 +- internal/app/engine.go | 37 +++++++++++++++----------- internal/app/runtime_indicator_test.go | 18 ++++++++----- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/docs/API.md b/docs/API.md index b8513f3..97742f3 100644 --- a/docs/API.md +++ b/docs/API.md @@ -43,7 +43,7 @@ Current transmitter status (read-only snapshot). Runtime indicator, alert, and q } ``` -`runtimeIndicator` is derived from the engine queue health plus any late buffers and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", otherwise it stays empty. +`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", but late-buffer alerts expire after a few seconds once cycle times settle so the signal doesn't stay stuck on degraded. The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start for post-mortem diagnosis. 
--- diff --git a/internal/app/engine.go b/internal/app/engine.go index 008616d..a4836e9 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -80,6 +80,8 @@ const ( RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical" ) +const lateBufferIndicatorWindow = 5 * time.Second + // Engine is the continuous TX loop. It generates composite IQ in chunks, // resamples to device rate, and pushes to hardware in a tight loop. // The hardware buffer_push call is blocking — it returns when the hardware @@ -100,15 +102,16 @@ type Engine struct { startedAt time.Time wg sync.WaitGroup - chunksProduced atomic.Uint64 - totalSamples atomic.Uint64 - underruns atomic.Uint64 - lateBuffers atomic.Uint64 - maxCycleNs atomic.Uint64 - maxGenerateNs atomic.Uint64 - maxUpsampleNs atomic.Uint64 - maxWriteNs atomic.Uint64 - lastError atomic.Value // string + chunksProduced atomic.Uint64 + totalSamples atomic.Uint64 + underruns atomic.Uint64 + lateBuffers atomic.Uint64 + lateBufferAlertAt atomic.Uint64 + maxCycleNs atomic.Uint64 + maxGenerateNs atomic.Uint64 + maxUpsampleNs atomic.Uint64 + maxWriteNs atomic.Uint64 + lastError atomic.Value // string // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] @@ -351,7 +354,10 @@ func (e *Engine) Stats() EngineStats { queue := e.frameQueue.Stats() lateBuffers := e.lateBuffers.Load() - ri := runtimeIndicator(queue.Health, lateBuffers) + now := time.Now() + lateAlertAt := e.lateBufferAlertAt.Load() + hasRecentLateBuffers := lateAlertAt > 0 && now.Sub(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow + ri := runtimeIndicator(queue.Health, hasRecentLateBuffers) return EngineStats{ State: state.String(), ChunksProduced: e.chunksProduced.Load(), @@ -366,26 +372,26 @@ func (e *Engine) Stats() EngineStats { MaxWriteMs: durationMs(e.maxWriteNs.Load()), Queue: queue, RuntimeIndicator: ri, - RuntimeAlert: runtimeAlert(queue.Health, lateBuffers), + RuntimeAlert: 
runtimeAlert(queue.Health, hasRecentLateBuffers), } } -func runtimeIndicator(queueHealth output.QueueHealth, lateBuffers uint64) RuntimeIndicator { +func runtimeIndicator(queueHealth output.QueueHealth, recentLateBuffers bool) RuntimeIndicator { switch { case queueHealth == output.QueueHealthCritical: return RuntimeIndicatorQueueCritical - case queueHealth == output.QueueHealthLow || lateBuffers > 0: + case queueHealth == output.QueueHealthLow || recentLateBuffers: return RuntimeIndicatorDegraded default: return RuntimeIndicatorNormal } } -func runtimeAlert(queueHealth output.QueueHealth, lateBuffers uint64) string { +func runtimeAlert(queueHealth output.QueueHealth, recentLateBuffers bool) string { switch { case queueHealth == output.QueueHealthCritical: return "queue health critical" - case lateBuffers > 0: + case recentLateBuffers: return "late buffers" case queueHealth == output.QueueHealthLow: return "queue health low" @@ -484,6 +490,7 @@ func (e *Engine) writerLoop(ctx context.Context) { if cycleDur > e.chunkDuration { late := e.lateBuffers.Add(1) + e.lateBufferAlertAt.Store(uint64(time.Now().UnixNano())) if late <= 5 || late%20 == 0 { log.Printf("TX LATE: cycle=%s budget=%s write=%s over=%s", cycleDur, e.chunkDuration, writeDur, cycleDur-e.chunkDuration) diff --git a/internal/app/runtime_indicator_test.go b/internal/app/runtime_indicator_test.go index 22ca93b..c27eb63 100644 --- a/internal/app/runtime_indicator_test.go +++ b/internal/app/runtime_indicator_test.go @@ -10,35 +10,39 @@ func TestRuntimeIndicatorAndAlert(t *testing.T) { cases := []struct { name string health output.QueueHealth - lateBuffers uint64 + recentLate bool wantIndicator RuntimeIndicator wantAlert string }{ { name: "queue critical", health: output.QueueHealthCritical, - lateBuffers: 0, wantIndicator: RuntimeIndicatorQueueCritical, wantAlert: "queue health critical", }, { name: "queue low", health: output.QueueHealthLow, - lateBuffers: 0, wantIndicator: RuntimeIndicatorDegraded, wantAlert: 
"queue health low", }, { name: "late buffers", health: output.QueueHealthNormal, - lateBuffers: 2, + recentLate: true, + wantIndicator: RuntimeIndicatorDegraded, + wantAlert: "late buffers", + }, + { + name: "late buffers override queue low", + health: output.QueueHealthLow, + recentLate: true, wantIndicator: RuntimeIndicatorDegraded, wantAlert: "late buffers", }, { name: "normal", health: output.QueueHealthNormal, - lateBuffers: 0, wantIndicator: RuntimeIndicatorNormal, wantAlert: "", }, @@ -48,11 +52,11 @@ func TestRuntimeIndicatorAndAlert(t *testing.T) { tc := tc t.Run(tc.name, func(t *testing.T) { t.Parallel() - got := runtimeIndicator(tc.health, tc.lateBuffers) + got := runtimeIndicator(tc.health, tc.recentLate) if got != tc.wantIndicator { t.Fatalf("indicator: expected %s, got %s", tc.wantIndicator, got) } - alert := runtimeAlert(tc.health, tc.lateBuffers) + alert := runtimeAlert(tc.health, tc.recentLate) if alert != tc.wantAlert { t.Fatalf("alert: expected %q, got %q", tc.wantAlert, alert) } From 44ff130d230779ffd8d4359f16e9473e534a89b7 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:26:27 +0200 Subject: [PATCH 19/55] feat: add explicit HTTP audio ingest mode --- README.md | 11 ++++++- cmd/fmrtx/main.go | 35 ++++++++++++-------- docs/API.md | 4 +-- docs/pro-runtime-hardening-workboard.md | 2 +- internal/control/control_test.go | 43 +++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index ca0a03b..ad73b7c 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,14 @@ ffmpeg -i "http://svabi.ch:8443/stream" -f s16le -ar 44100 -ac 2 - | .\fmrtx.exe ffmpeg -i source.wav -f s16le -ar 48000 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --audio-rate 48000 --config docs/config.plutosdr.json ``` +### 8) HTTP audio ingest + +Start the control plane with `--audio-http` to accept raw PCM pushes on `/audio/stream` and feed them into the live encoder: + +```powershell +ffmpeg -i 
music.mp3 -f s16le -ar 44100 -ac 2 - | curl -X POST --data-binary @- http://localhost:8088/audio/stream +``` + ## CLI overview ## `fmrtx` @@ -156,6 +164,7 @@ Important runtime modes and flags include: - `--list-devices` - `--audio-stdin` - `--audio-rate ` +- `--audio-http` ## `offline` Useful flags include: @@ -196,7 +205,7 @@ POST /audio/stream push raw S16LE stereo PCM into live stream buffer - live patching of selected parameters - dry-run inspection - browser-accessible control UI -- optional HTTP audio ingest +- optional HTTP audio ingest (enable with `--audio-http`) ### Live config notes `POST /config` supports live updates for selected fields such as: diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index d839d65..a7abed9 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -34,6 +34,7 @@ func main() { listDevices := flag.Bool("list-devices", false, "enumerate SoapySDR devices and exit") audioStdin := flag.Bool("audio-stdin", false, "read S16LE stereo PCM audio from stdin") audioRate := flag.Int("audio-rate", 44100, "sample rate of stdin audio input (Hz)") + audioHTTP := flag.Bool("audio-http", false, "enable HTTP audio ingest via /audio/stream") flag.Parse() // --- list-devices (SoapySDR) --- @@ -102,7 +103,7 @@ func main() { if driver == nil { log.Fatal("no hardware driver available — build with -tags pluto (or -tags soapy)") } - runTXMode(cfg, driver, *txAutoStart, *audioStdin, *audioRate) + runTXMode(cfg, driver, *txAutoStart, *audioStdin, *audioRate, *audioHTTP) return } @@ -145,7 +146,7 @@ func selectDriver(cfg cfgpkg.Config) platform.SoapyDriver { return nil } -func runTXMode(cfg cfgpkg.Config, driver platform.SoapyDriver, autoStart bool, audioStdin bool, audioRate int) { +func runTXMode(cfg cfgpkg.Config, driver platform.SoapyDriver, autoStart bool, audioStdin bool, audioRate int, audioHTTP bool) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -185,20 +186,28 @@ func runTXMode(cfg cfgpkg.Config, driver 
platform.SoapyDriver, autoStart bool, a // Live audio stream source (optional) var streamSrc *audio.StreamSource - if audioStdin { + if audioStdin || audioHTTP { // Buffer: 2 seconds at input rate — enough to absorb jitter - streamSrc = audio.NewStreamSource(audioRate*2, audioRate) + bufferFrames := audioRate * 2 + if bufferFrames <= 0 { + bufferFrames = 1 + } + streamSrc = audio.NewStreamSource(bufferFrames, audioRate) engine.SetStreamSource(streamSrc) - // Stdin ingest goroutine - go func() { - log.Printf("audio: reading S16LE stereo PCM from stdin at %d Hz", audioRate) - if err := audio.IngestReader(os.Stdin, streamSrc); err != nil { - log.Printf("audio: stdin ingest ended: %v", err) - } else { - log.Println("audio: stdin EOF") - } - }() + if audioStdin { + go func() { + log.Printf("audio: reading S16LE stereo PCM from stdin at %d Hz", audioRate) + if err := audio.IngestReader(os.Stdin, streamSrc); err != nil { + log.Printf("audio: stdin ingest ended: %v", err) + } else { + log.Println("audio: stdin EOF") + } + }() + } + if audioHTTP { + log.Printf("audio: HTTP ingest enabled on /audio/stream (rate=%dHz, buffer=%d frames)", audioRate, streamSrc.Stats().Capacity) + } } // Control plane diff --git a/docs/API.md b/docs/API.md index 97742f3..57301bb 100644 --- a/docs/API.md +++ b/docs/API.md @@ -237,7 +237,7 @@ These cannot be hot-reloaded (they affect DSP pipeline structure): Push raw audio data into the live stream buffer. Format: **S16LE stereo PCM** at the configured `--audio-rate` (default 44100 Hz). -Requires `--audio-stdin` or a configured stream source. +Requires `--audio-stdin`, `--audio-http`, or another configured stream source to feed the buffer. **Request:** Binary body, `application/octet-stream`, raw S16LE stereo PCM bytes. @@ -300,7 +300,7 @@ ffmpeg -i source.wav -f s16le -ar 48000 -ac 2 - | \ ### HTTP audio push -Push audio from a remote machine via the HTTP API: +Push audio from a remote machine via the HTTP API. 
Run the server with `--audio-http` (and typically `--tx`/`--tx-auto-start`) so the `/audio/stream` endpoint is available. ```bash # From another machine on the network diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index e2cd6af..694a7cf 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -63,7 +63,7 @@ Kein „ist im Kopf klar“. Der Stand kommt hier rein. | ID | Status | Beschreibung | Ort | |---|---|---|---| | CFG-SEM-001 | CONFIRMED | `fm.outputDrive` wird in Validation und Runtime nicht konsistent behandelt | `internal/config/config.go`, `internal/app/engine.go` | -| CTL-UX-001 | CONFIRMED | `handleAudioStream()` referenziert `--audio-http`, was CLI-seitig überprüft werden sollte | `internal/control/control.go` | +| CTL-UX-001 | RESOLVED | `handleAudioStream()` beschreibt `--audio-http`; der CLI-Schalter ist nun vorhanden und setzt den Stream-Puffer für `/audio/stream` direkt. | `internal/control/control.go`, `cmd/fmrtx/main.go` | --- diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 93fc508..9c07f76 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -9,6 +9,7 @@ import ( "testing" cfgpkg "github.com/jan/fm-rds-tx/internal/config" + "github.com/jan/fm-rds-tx/internal/audio" "github.com/jan/fm-rds-tx/internal/output" ) @@ -124,6 +125,48 @@ func TestRuntimeWithoutDriver(t *testing.T) { } } +func TestAudioStreamRequiresSource(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader(nil)) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusServiceUnavailable { + t.Fatalf("expected 503 when audio stream missing, got %d", rec.Code) + } +} + +func TestAudioStreamPushesPCM(t *testing.T) { + cfg := cfgpkg.Default() + srv := NewServer(cfg) + stream := audio.NewStreamSource(256, 44100) + 
srv.SetStreamSource(stream) + pcm := []byte{0, 0, 0, 0} + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader(pcm)) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != 200 { + t.Fatalf("expected 200, got %d", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal response: %v", err) + } + if ok, _ := body["ok"].(bool); !ok { + t.Fatalf("expected ok true, got %v", body["ok"]) + } + frames, _ := body["frames"].(float64) + if frames != 1 { + t.Fatalf("expected 1 frame, got %v", frames) + } + stats, ok := body["stats"].(map[string]any) + if !ok { + t.Fatalf("missing stats: %v", body["stats"]) + } + if avail, _ := stats["available"].(float64); avail < 1 { + t.Fatalf("expected stats.available >= 1, got %v", avail) + } +} + func TestTXStartWithoutController(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() From 5227a86f2e5a41da9fba44eaa89135c8b05f39d1 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:29:04 +0200 Subject: [PATCH 20/55] Ensure audio stream handler requires POST --- docs/pro-runtime-hardening-workboard.md | 4 +++- internal/control/control_test.go | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 694a7cf..b825bb4 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -413,7 +413,9 @@ Diese Punkte könnten ggf. 
vorgezogen werden, auch wenn WS-05 formal nach WS-01/ - Noch leer ## WS-05 Verifikation -- Noch leer +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | `/audio/stream` rejects non-POST requests | `TestAudioStreamRejectsNonPost` enforces POST-only access to `/audio/stream` before a stream source is configured | --- diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 9c07f76..d810666 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -167,6 +167,16 @@ func TestAudioStreamPushesPCM(t *testing.T) { } } +func TestAudioStreamRejectsNonPost(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/audio/stream", nil) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusMethodNotAllowed { + t.Fatalf("expected 405 for audio stream GET, got %d", rec.Code) + } +} + func TestTXStartWithoutController(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() From 1dbe150675681ee78472dcaf89d88d89d94224c5 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:42:31 +0200 Subject: [PATCH 21/55] feat: track runtime states --- docs/API.md | 4 +++ docs/pro-runtime-hardening-workboard.md | 5 ++- internal/app/engine.go | 47 +++++++++++++++++++++---- internal/app/runtime_state_test.go | 26 ++++++++++++++ 4 files changed, 74 insertions(+), 8 deletions(-) create mode 100644 internal/app/runtime_state_test.go diff --git a/docs/API.md b/docs/API.md index 57301bb..c8807d4 100644 --- a/docs/API.md +++ b/docs/API.md @@ -15,6 +15,8 @@ Health check. {"ok": true} ``` +`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. + --- ### `GET /status` @@ -73,6 +75,8 @@ Live engine and driver telemetry. Only populated when TX is active. 
} ``` +`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. + --- ### `GET /config` diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index b825bb4..e993968 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -266,11 +266,14 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, # WS-02 — Explizite Runtime-State-Maschine und Fault-Handling **Priorität:** P0 -**Gesamtstatus:** TODO +**Gesamtstatus:** IN PROGRESS ## Ziel Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständen. +## Fortschritt +- EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und schafft eine beobachtbare Baseline für die nächste Fault-Maschine. + ## Zielzustände laut Konzept - `idle` - `arming` diff --git a/internal/app/engine.go b/internal/app/engine.go index a4836e9..2736766 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -38,6 +38,19 @@ func (s EngineState) String() string { } } +type RuntimeState string + +const ( + RuntimeStateIdle RuntimeState = "idle" + RuntimeStateArming RuntimeState = "arming" + RuntimeStatePrebuffering RuntimeState = "prebuffering" + RuntimeStateRunning RuntimeState = "running" + RuntimeStateDegraded RuntimeState = "degraded" + RuntimeStateMuted RuntimeState = "muted" + RuntimeStateFaulted RuntimeState = "faulted" + RuntimeStateStopping RuntimeState = "stopping" +) + func updateMaxDuration(dst *atomic.Uint64, d time.Duration) { v := uint64(d) for { @@ -96,11 +109,12 @@ type Engine struct { deviceRate float64 frameQueue *output.FrameQueue - mu sync.Mutex - state EngineState - cancel context.CancelFunc - startedAt time.Time - wg sync.WaitGroup + mu sync.Mutex + state EngineState + cancel context.CancelFunc + startedAt 
time.Time + wg sync.WaitGroup + runtimeState atomic.Value chunksProduced atomic.Uint64 totalSamples atomic.Uint64 @@ -177,7 +191,7 @@ func NewEngine(cfg cfgpkg.Config, driver platform.SoapyDriver) *Engine { log.Printf("engine: same-rate mode — DSP@%dHz", cfg.FM.CompositeRateHz) } - return &Engine{ + engine := &Engine{ cfg: cfg, driver: driver, generator: offpkg.NewGenerator(cfg), @@ -187,6 +201,8 @@ func NewEngine(cfg cfgpkg.Config, driver platform.SoapyDriver) *Engine { state: EngineIdle, frameQueue: output.NewFrameQueue(cfg.Runtime.FrameQueueCapacity), } + engine.setRuntimeState(RuntimeStateIdle) + return engine } func (e *Engine) SetChunkDuration(d time.Duration) { @@ -306,6 +322,7 @@ func (e *Engine) Start(ctx context.Context) error { runCtx, cancel := context.WithCancel(ctx) e.cancel = cancel e.state = EngineRunning + e.setRuntimeState(RuntimeStateArming) e.startedAt = time.Now() e.wg.Add(1) e.mu.Unlock() @@ -321,6 +338,7 @@ func (e *Engine) Stop(ctx context.Context) error { return nil } e.state = EngineStopping + e.setRuntimeState(RuntimeStateStopping) e.cancel() e.mu.Unlock() @@ -336,6 +354,7 @@ func (e *Engine) Stop(ctx context.Context) error { e.mu.Lock() e.state = EngineIdle + e.setRuntimeState(RuntimeStateIdle) e.mu.Unlock() return nil } @@ -359,7 +378,7 @@ func (e *Engine) Stats() EngineStats { hasRecentLateBuffers := lateAlertAt > 0 && now.Sub(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow ri := runtimeIndicator(queue.Health, hasRecentLateBuffers) return EngineStats{ - State: state.String(), + State: string(e.currentRuntimeState()), ChunksProduced: e.chunksProduced.Load(), TotalSamples: e.totalSamples.Load(), Underruns: e.underruns.Load(), @@ -401,6 +420,7 @@ func runtimeAlert(queueHealth output.QueueHealth, recentLateBuffers bool) string } func (e *Engine) run(ctx context.Context) { + e.setRuntimeState(RuntimeStateRunning) e.wg.Add(1) go e.writerLoop(ctx) defer e.wg.Done() @@ -530,3 +550,16 @@ func cloneFrame(src 
*output.CompositeFrame) *output.CompositeFrame { Sequence: src.Sequence, } } + +func (e *Engine) setRuntimeState(state RuntimeState) { + e.runtimeState.Store(state) +} + +func (e *Engine) currentRuntimeState() RuntimeState { + if v := e.runtimeState.Load(); v != nil { + if rs, ok := v.(RuntimeState); ok { + return rs + } + } + return RuntimeStateIdle +} diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go new file mode 100644 index 0000000..9ef04be --- /dev/null +++ b/internal/app/runtime_state_test.go @@ -0,0 +1,26 @@ +package app + +import ( + "testing" + + cfgpkg "github.com/jan/fm-rds-tx/internal/config" + "github.com/jan/fm-rds-tx/internal/platform" +) + +func TestEngineRuntimeStateReporting(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + + if got := e.Stats().State; got != string(RuntimeStateIdle) { + t.Fatalf("expected initial state idle, got %s", got) + } + + e.setRuntimeState(RuntimeStatePrebuffering) + if got := e.Stats().State; got != string(RuntimeStatePrebuffering) { + t.Fatalf("expected prebuffering, got %s", got) + } + + e.setRuntimeState(RuntimeStateRunning) + if got := e.currentRuntimeState(); got != RuntimeStateRunning { + t.Fatalf("currentRuntimeState mismatch: %s", got) + } +} From 9885e449623412743249002893b9c258d8d88a34 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 18:52:22 +0200 Subject: [PATCH 22/55] feat: add runtime supervisor transitions --- docs/pro-runtime-hardening-workboard.md | 2 +- internal/app/engine.go | 48 ++++++++++++++++++++++--- internal/app/runtime_state_test.go | 31 ++++++++++++++++ 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index e993968..053239f 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -272,7 +272,7 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit 
kleinem, Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständen. ## Fortschritt -- EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und schafft eine beobachtbare Baseline für die nächste Fault-Maschine. +- EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. ## Zielzustände laut Konzept - `idle` diff --git a/internal/app/engine.go b/internal/app/engine.go index 2736766..2ef1963 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -94,6 +94,7 @@ const ( ) const lateBufferIndicatorWindow = 5 * time.Second +const queueCriticalStreakThreshold = 3 // Engine is the continuous TX loop. It generates composite IQ in chunks, // resamples to device rate, and pushes to hardware in a tight loop. @@ -121,6 +122,7 @@ type Engine struct { underruns atomic.Uint64 lateBuffers atomic.Uint64 lateBufferAlertAt atomic.Uint64 + criticalStreak atomic.Uint64 maxCycleNs atomic.Uint64 maxGenerateNs atomic.Uint64 maxUpsampleNs atomic.Uint64 @@ -373,9 +375,7 @@ func (e *Engine) Stats() EngineStats { queue := e.frameQueue.Stats() lateBuffers := e.lateBuffers.Load() - now := time.Now() - lateAlertAt := e.lateBufferAlertAt.Load() - hasRecentLateBuffers := lateAlertAt > 0 && now.Sub(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow + hasRecentLateBuffers := e.hasRecentLateBuffers() ri := runtimeIndicator(queue.Health, hasRecentLateBuffers) return EngineStats{ State: string(e.currentRuntimeState()), @@ -420,7 +420,7 @@ func runtimeAlert(queueHealth output.QueueHealth, recentLateBuffers bool) string } func (e *Engine) run(ctx context.Context) { - e.setRuntimeState(RuntimeStateRunning) + e.setRuntimeState(RuntimeStatePrebuffering) e.wg.Add(1) go e.writerLoop(ctx) defer e.wg.Done() @@ -477,6 +477,8 @@ func (e 
*Engine) run(ctx context.Context) { } continue } + queueStats := e.frameQueue.Stats() + e.evaluateRuntimeState(queueStats, e.hasRecentLateBuffers()) } } @@ -507,6 +509,8 @@ func (e *Engine) writerLoop(ctx context.Context) { updateMaxDuration(&e.maxWriteNs, writeDur) updateMaxDuration(&e.maxCycleNs, cycleDur) + queueStats := e.frameQueue.Stats() + e.evaluateRuntimeState(queueStats, e.hasRecentLateBuffers()) if cycleDur > e.chunkDuration { late := e.lateBuffers.Add(1) @@ -563,3 +567,39 @@ func (e *Engine) currentRuntimeState() RuntimeState { } return RuntimeStateIdle } + +func (e *Engine) hasRecentLateBuffers() bool { + lateAlertAt := e.lateBufferAlertAt.Load() + if lateAlertAt == 0 { + return false + } + return time.Since(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow +} + +func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bool) { + state := e.currentRuntimeState() + switch state { + case RuntimeStateStopping, RuntimeStateFaulted: + return + } + if state == RuntimeStatePrebuffering { + if queue.Depth >= 1 { + e.setRuntimeState(RuntimeStateRunning) + } + return + } + critical := queue.Health == output.QueueHealthCritical + if critical { + if e.criticalStreak.Add(1) >= queueCriticalStreakThreshold { + e.setRuntimeState(RuntimeStateDegraded) + return + } + } else { + e.criticalStreak.Store(0) + } + if hasLateBuffers { + e.setRuntimeState(RuntimeStateDegraded) + return + } + e.setRuntimeState(RuntimeStateRunning) +} diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index 9ef04be..b253183 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -4,6 +4,7 @@ import ( "testing" cfgpkg "github.com/jan/fm-rds-tx/internal/config" + "github.com/jan/fm-rds-tx/internal/output" "github.com/jan/fm-rds-tx/internal/platform" ) @@ -24,3 +25,33 @@ func TestEngineRuntimeStateReporting(t *testing.T) { t.Fatalf("currentRuntimeState mismatch: %s", got) } } + +func 
TestEngineRuntimeStateTransitions(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStatePrebuffering) + + queue := output.QueueStats{Depth: 1, FillLevel: 0.75, Health: output.QueueHealthNormal} + e.evaluateRuntimeState(queue, false) + if got := e.currentRuntimeState(); got != RuntimeStateRunning { + t.Fatalf("expected running after full buffer, got %s", got) + } + + queue.Health = output.QueueHealthCritical + for i := 0; i < queueCriticalStreakThreshold; i++ { + e.evaluateRuntimeState(queue, false) + } + if got := e.currentRuntimeState(); got != RuntimeStateDegraded { + t.Fatalf("expected degraded on queue critical streak, got %s", got) + } + + queue.Health = output.QueueHealthNormal + e.evaluateRuntimeState(queue, false) + if got := e.currentRuntimeState(); got != RuntimeStateRunning { + t.Fatalf("expected running once queue healthy, got %s", got) + } + + e.evaluateRuntimeState(queue, true) + if got := e.currentRuntimeState(); got != RuntimeStateDegraded { + t.Fatalf("expected degraded when late buffers seen, got %s", got) + } +} From 766ce79826ae9447129bd790190186fe620bdda9 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 20:05:28 +0200 Subject: [PATCH 23/55] WS-02: Add fault classification foundation --- internal/app/engine.go | 84 +++++++++++++++++++++++++++++++++++++- internal/app/fault.go | 47 +++++++++++++++++++++ internal/app/fault_test.go | 72 ++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 internal/app/fault.go create mode 100644 internal/app/fault_test.go diff --git a/internal/app/engine.go b/internal/app/engine.go index 2ef1963..a62c11c 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -83,6 +83,7 @@ type EngineStats struct { Queue output.QueueStats `json:"queue"` RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` RuntimeAlert string `json:"runtimeAlert,omitempty"` + LastFault *FaultEvent 
`json:"lastFault,omitempty"` } type RuntimeIndicator string @@ -93,8 +94,12 @@ const ( RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical" ) -const lateBufferIndicatorWindow = 5 * time.Second -const queueCriticalStreakThreshold = 3 +const ( + lateBufferIndicatorWindow = 5 * time.Second + queueCriticalStreakThreshold = 3 + faultRepeatWindow = 1 * time.Second + faultHistoryCapacity = 8 +) // Engine is the continuous TX loop. It generates composite IQ in chunks, // resamples to device rate, and pushes to hardware in a tight loop. @@ -128,6 +133,9 @@ type Engine struct { maxUpsampleNs atomic.Uint64 maxWriteNs atomic.Uint64 lastError atomic.Value // string + lastFault atomic.Value // *FaultEvent + faultHistoryMu sync.Mutex + faultHistory []FaultEvent // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] @@ -202,6 +210,7 @@ func NewEngine(cfg cfgpkg.Config, driver platform.SoapyDriver) *Engine { deviceRate: deviceRate, state: EngineIdle, frameQueue: output.NewFrameQueue(cfg.Runtime.FrameQueueCapacity), + faultHistory: make([]FaultEvent, 0, faultHistoryCapacity), } engine.setRuntimeState(RuntimeStateIdle) return engine @@ -377,6 +386,7 @@ func (e *Engine) Stats() EngineStats { lateBuffers := e.lateBuffers.Load() hasRecentLateBuffers := e.hasRecentLateBuffers() ri := runtimeIndicator(queue.Health, hasRecentLateBuffers) + lastFault := e.lastFaultEvent() return EngineStats{ State: string(e.currentRuntimeState()), ChunksProduced: e.chunksProduced.Load(), @@ -392,6 +402,7 @@ func (e *Engine) Stats() EngineStats { Queue: queue, RuntimeIndicator: ri, RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), + LastFault: lastFault, } } @@ -576,6 +587,71 @@ func (e *Engine) hasRecentLateBuffers() bool { return time.Since(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow } +func (e *Engine) lastFaultEvent() *FaultEvent { + return copyFaultEvent(e.loadLastFault()) +} + +// LastFault exposes the most 
recent captured fault, if any. +func (e *Engine) LastFault() *FaultEvent { + return e.lastFaultEvent() +} + +func (e *Engine) FaultHistory() []FaultEvent { + e.faultHistoryMu.Lock() + defer e.faultHistoryMu.Unlock() + history := make([]FaultEvent, len(e.faultHistory)) + copy(history, e.faultHistory) + return history +} + +func (e *Engine) recordFault(reason FaultReason, severity FaultSeverity, message string) { + if reason == "" { + reason = FaultReasonUnknown + } + now := time.Now() + if last := e.loadLastFault(); last != nil { + if last.Reason == reason && last.Severity == severity && now.Sub(last.Time) < faultRepeatWindow { + return + } + } + ev := &FaultEvent{ + Time: now, + Reason: reason, + Severity: severity, + Message: message, + } + e.lastFault.Store(ev) + e.appendFaultHistory(ev) +} + +func (e *Engine) loadLastFault() *FaultEvent { + if v := e.lastFault.Load(); v != nil { + if ev, ok := v.(*FaultEvent); ok { + return ev + } + } + return nil +} + +func copyFaultEvent(source *FaultEvent) *FaultEvent { + if source == nil { + return nil + } + copy := *source + return &copy +} + +func (e *Engine) appendFaultHistory(ev *FaultEvent) { + e.faultHistoryMu.Lock() + defer e.faultHistoryMu.Unlock() + if len(e.faultHistory) >= faultHistoryCapacity { + copy(e.faultHistory, e.faultHistory[1:]) + e.faultHistory[len(e.faultHistory)-1] = *ev + return + } + e.faultHistory = append(e.faultHistory, *ev) +} + func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo @@ -591,6 +667,8 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo critical := queue.Health == output.QueueHealthCritical if critical { if e.criticalStreak.Add(1) >= queueCriticalStreakThreshold { + e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, + fmt.Sprintf("queue health critical (depth=%d)", queue.Depth)) e.setRuntimeState(RuntimeStateDegraded) return } @@ -598,6 +676,8 @@ func (e *Engine) 
evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo e.criticalStreak.Store(0) } if hasLateBuffers { + e.recordFault(FaultReasonLateBuffers, FaultSeverityWarn, + fmt.Sprintf("late buffers detected (health=%s)", queue.Health)) e.setRuntimeState(RuntimeStateDegraded) return } diff --git a/internal/app/fault.go b/internal/app/fault.go new file mode 100644 index 0000000..efdecb3 --- /dev/null +++ b/internal/app/fault.go @@ -0,0 +1,47 @@ +package app + +import "time" + +type FaultSeverity int + +const ( + FaultSeverityWarn FaultSeverity = iota + FaultSeverityDegraded + FaultSeverityMuted + FaultSeverityFaulted +) + +var faultSeverityNames = []string{"warn", "degraded", "muted", "faulted"} + +func (s FaultSeverity) String() string { + if int(s) < 0 || int(s) >= len(faultSeverityNames) { + return "unknown" + } + return faultSeverityNames[s] +} + +// MarshalText implements encoding.TextMarshaler so that FaultSeverity +// renders as a human-friendly string in JSON and other text contexts. +func (s FaultSeverity) MarshalText() ([]byte, error) { + return []byte(s.String()), nil +} + +type FaultReason string + +const ( + FaultReasonUnknown FaultReason = "unknown" + FaultReasonQueueCritical FaultReason = "queueCritical" + FaultReasonLateBuffers FaultReason = "lateBuffers" + FaultReasonWriteTimeout FaultReason = "writeTimeout" + FaultReasonQueueEmpty FaultReason = "queueEmpty" +) + +// FaultEvent captures a single fault observation along with its severity and +// optional human-readable hint. Fault history and last-fault exposure rely on +// this struct so operators can reason about runtime behavior. 
+type FaultEvent struct { + Time time.Time `json:"time"` + Reason FaultReason `json:"reason"` + Severity FaultSeverity `json:"severity"` + Message string `json:"message,omitempty"` +} diff --git a/internal/app/fault_test.go b/internal/app/fault_test.go new file mode 100644 index 0000000..4637e25 --- /dev/null +++ b/internal/app/fault_test.go @@ -0,0 +1,72 @@ +package app + +import ( + "testing" + + cfgpkg "github.com/jan/fm-rds-tx/internal/config" + "github.com/jan/fm-rds-tx/internal/output" + "github.com/jan/fm-rds-tx/internal/platform" +) + +func TestFaultSeverityString(t *testing.T) { + cases := []struct { + severity FaultSeverity + want string + }{ + {FaultSeverityWarn, "warn"}, + {FaultSeverityDegraded, "degraded"}, + {FaultSeverityMuted, "muted"}, + {FaultSeverityFaulted, "faulted"}, + {FaultSeverity(99), "unknown"}, + } + for _, tc := range cases { + t.Run(tc.want, func(t *testing.T) { + if got := tc.severity.String(); got != tc.want { + t.Fatalf("expected %s, got %s", tc.want, got) + } + if txt, _ := tc.severity.MarshalText(); string(txt) != tc.want { + t.Fatalf("MarshalText mismatch: want %s, got %s", tc.want, txt) + } + }) + } +} + +func TestEngineRecordsQueueCriticalFault(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStateRunning) + + queue := output.QueueStats{Depth: 3, Health: output.QueueHealthCritical} + for i := 0; i < queueCriticalStreakThreshold; i++ { + e.evaluateRuntimeState(queue, false) + } + + last := e.LastFault() + if last == nil { + t.Fatal("expected fault recorded, got nil") + } + if last.Reason != FaultReasonQueueCritical { + t.Fatalf("expected queue critical reason, got %s", last.Reason) + } + if last.Severity != FaultSeverityDegraded { + t.Fatalf("expected degraded severity, got %s", last.Severity) + } +} + +func TestEngineRecordsLateBufferFault(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStateRunning) + 
+ queue := output.QueueStats{Depth: 5, Health: output.QueueHealthNormal} + e.evaluateRuntimeState(queue, true) + + last := e.LastFault() + if last == nil { + t.Fatal("expected fault recorded for late buffers") + } + if last.Reason != FaultReasonLateBuffers { + t.Fatalf("expected late buffer reason, got %s", last.Reason) + } + if last.Severity != FaultSeverityWarn { + t.Fatalf("expected warn severity, got %s", last.Severity) + } +} From 5d60f20f91a580d1a0f92e9d18e4fb2879d840c7 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 20:19:16 +0200 Subject: [PATCH 24/55] Add muted transition for persistent queue-critical --- docs/pro-runtime-hardening-workboard.md | 1 + internal/app/engine.go | 10 +++++++++- internal/app/runtime_state_test.go | 25 +++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 053239f..0a46e29 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -273,6 +273,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ ## Fortschritt - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. +- `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. 
## Zielzustände laut Konzept - `idle` diff --git a/internal/app/engine.go b/internal/app/engine.go index a62c11c..a33edc7 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -97,6 +97,7 @@ const ( const ( lateBufferIndicatorWindow = 5 * time.Second queueCriticalStreakThreshold = 3 + queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 faultRepeatWindow = 1 * time.Second faultHistoryCapacity = 8 ) @@ -666,7 +667,14 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo } critical := queue.Health == output.QueueHealthCritical if critical { - if e.criticalStreak.Add(1) >= queueCriticalStreakThreshold { + count := e.criticalStreak.Add(1) + if count >= queueMutedStreakThreshold { + e.recordFault(FaultReasonQueueCritical, FaultSeverityMuted, + fmt.Sprintf("queue health critical for %d consecutive checks (depth=%d)", count, queue.Depth)) + e.setRuntimeState(RuntimeStateMuted) + return + } + if count >= queueCriticalStreakThreshold { e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, fmt.Sprintf("queue health critical (depth=%d)", queue.Depth)) e.setRuntimeState(RuntimeStateDegraded) diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index b253183..744a36b 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -55,3 +55,28 @@ func TestEngineRuntimeStateTransitions(t *testing.T) { t.Fatalf("expected degraded when late buffers seen, got %s", got) } } + +func TestEngineRuntimeStateMuteOnPersistentQueueCritical(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStateRunning) + + queue := output.QueueStats{Depth: 1, Health: output.QueueHealthCritical} + for i := 0; i < queueMutedStreakThreshold; i++ { + e.evaluateRuntimeState(queue, false) + } + + if got := e.currentRuntimeState(); got != RuntimeStateMuted { + t.Fatalf("expected muted after prolonged queue critical, got %s", got) + } 
+ + last := e.LastFault() + if last == nil { + t.Fatal("expected fault recorded for the mute transition") + } + if last.Reason != FaultReasonQueueCritical { + t.Fatalf("expected queue critical reason, got %s", last.Reason) + } + if last.Severity != FaultSeverityMuted { + t.Fatalf("expected muted severity, got %s", last.Severity) + } +} From d80e4dca8449704384c24165d3d325434264bcc2 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 20:27:26 +0200 Subject: [PATCH 25/55] Add muted recovery logic --- docs/pro-runtime-hardening-workboard.md | 1 + internal/app/engine.go | 42 ++++++++++++++++--------- internal/app/runtime_state_test.go | 41 ++++++++++++++++++++---- 3 files changed, 64 insertions(+), 20 deletions(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 0a46e29..9b393ae 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -274,6 +274,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ ## Fortschritt - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. - `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. +- `evaluateRuntimeState` now waits for a short healthy streak before leaving `muted`, logging a degraded-severity recovery event once the queue settles. 
## Zielzustände laut Konzept - `idle` diff --git a/internal/app/engine.go b/internal/app/engine.go index a33edc7..d294ca4 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -98,6 +98,7 @@ const ( lateBufferIndicatorWindow = 5 * time.Second queueCriticalStreakThreshold = 3 queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 + queueMutedRecoveryThreshold = queueCriticalStreakThreshold faultRepeatWindow = 1 * time.Second faultHistoryCapacity = 8 ) @@ -123,20 +124,21 @@ type Engine struct { wg sync.WaitGroup runtimeState atomic.Value - chunksProduced atomic.Uint64 - totalSamples atomic.Uint64 - underruns atomic.Uint64 - lateBuffers atomic.Uint64 - lateBufferAlertAt atomic.Uint64 - criticalStreak atomic.Uint64 - maxCycleNs atomic.Uint64 - maxGenerateNs atomic.Uint64 - maxUpsampleNs atomic.Uint64 - maxWriteNs atomic.Uint64 - lastError atomic.Value // string - lastFault atomic.Value // *FaultEvent - faultHistoryMu sync.Mutex - faultHistory []FaultEvent + chunksProduced atomic.Uint64 + totalSamples atomic.Uint64 + underruns atomic.Uint64 + lateBuffers atomic.Uint64 + lateBufferAlertAt atomic.Uint64 + criticalStreak atomic.Uint64 + mutedRecoveryStreak atomic.Uint64 + maxCycleNs atomic.Uint64 + maxGenerateNs atomic.Uint64 + maxUpsampleNs atomic.Uint64 + maxWriteNs atomic.Uint64 + lastError atomic.Value // string + lastFault atomic.Value // *FaultEvent + faultHistoryMu sync.Mutex + faultHistory []FaultEvent // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] @@ -658,6 +660,18 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo switch state { case RuntimeStateStopping, RuntimeStateFaulted: return + case RuntimeStateMuted: + if queue.Health == output.QueueHealthNormal && !hasLateBuffers { + if count := e.mutedRecoveryStreak.Add(1); count >= queueMutedRecoveryThreshold { + e.mutedRecoveryStreak.Store(0) + e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, + 
fmt.Sprintf("queue healthy for %d checks after mute", count)) + e.setRuntimeState(RuntimeStateDegraded) + } + } else { + e.mutedRecoveryStreak.Store(0) + } + return } if state == RuntimeStatePrebuffering { if queue.Depth >= 1 { diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index 744a36b..36d8e6f 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -69,14 +69,43 @@ func TestEngineRuntimeStateMuteOnPersistentQueueCritical(t *testing.T) { t.Fatalf("expected muted after prolonged queue critical, got %s", got) } - last := e.LastFault() - if last == nil { + muteFault := e.LastFault() + if muteFault == nil { t.Fatal("expected fault recorded for the mute transition") } - if last.Reason != FaultReasonQueueCritical { - t.Fatalf("expected queue critical reason, got %s", last.Reason) + if muteFault.Reason != FaultReasonQueueCritical { + t.Fatalf("expected queue critical reason, got %s", muteFault.Reason) } - if last.Severity != FaultSeverityMuted { - t.Fatalf("expected muted severity, got %s", last.Severity) + if muteFault.Severity != FaultSeverityMuted { + t.Fatalf("expected muted severity, got %s", muteFault.Severity) + } + + queue.Health = output.QueueHealthNormal + for i := 0; i < queueMutedRecoveryThreshold-1; i++ { + e.evaluateRuntimeState(queue, false) + if got := e.currentRuntimeState(); got != RuntimeStateMuted { + t.Fatalf("expected still muted while recovery window builds, got %s", got) + } + } + + e.evaluateRuntimeState(queue, false) + if got := e.currentRuntimeState(); got != RuntimeStateDegraded { + t.Fatalf("expected degrade once mute recovery threshold reached, got %s", got) + } + + recoveryFault := e.LastFault() + if recoveryFault == nil { + t.Fatal("expected recovery fault entry after leaving mute") + } + if recoveryFault.Severity != FaultSeverityDegraded { + t.Fatalf("expected degraded severity for recovery event, got %s", recoveryFault.Severity) + } + if recoveryFault.Reason != 
FaultReasonQueueCritical { + t.Fatalf("expected queue critical reason for recovery event, got %s", recoveryFault.Reason) + } + + e.evaluateRuntimeState(queue, false) + if got := e.currentRuntimeState(); got != RuntimeStateRunning { + t.Fatalf("expected running after recovery, got %s", got) } } From e8b3c2804156bd4ac8cd42af562b61633b7251a6 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 20:46:03 +0200 Subject: [PATCH 26/55] feat: add faulted runtime counters --- cmd/fmrtx/main.go | 5 ++ docs/pro-runtime-hardening-workboard.md | 10 +++- internal/app/engine.go | 39 ++++++++++++++ internal/app/runtime_state_test.go | 70 +++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 2 deletions(-) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index a7abed9..46d9113 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -266,6 +266,11 @@ func (b *txBridge) TXStats() map[string]any { "queue": s.Queue, "runtimeIndicator": s.RuntimeIndicator, "runtimeAlert": s.RuntimeAlert, + "degradedTransitions": s.DegradedTransitions, + "mutedTransitions": s.MutedTransitions, + "faultedTransitions": s.FaultedTransitions, + "faultCount": s.FaultCount, + "lastFault": s.LastFault, } } func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error { diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 9b393ae..67219b7 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -275,6 +275,8 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. 
- `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. - `evaluateRuntimeState` now waits for a short healthy streak before leaving `muted`, logging a degraded-severity recovery event once the queue settles. +- Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable. +- `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`. ## Zielzustände laut Konzept - `idle` @@ -320,10 +322,14 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - Welche Transitionen sind wirklich produktiv relevant und welche nur „theoretisch schön“? ## WS-02 Entscheidungslog -- Noch leer +| Datum | Entscheidung | Notiz | +|---|---|---| +| 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. | ## WS-02 Verifikation -- Noch leer +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. 
| --- diff --git a/internal/app/engine.go b/internal/app/engine.go index d294ca4..cc8d8b8 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -84,6 +84,10 @@ type EngineStats struct { RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` RuntimeAlert string `json:"runtimeAlert,omitempty"` LastFault *FaultEvent `json:"lastFault,omitempty"` + DegradedTransitions uint64 `json:"degradedTransitions"` + MutedTransitions uint64 `json:"mutedTransitions"` + FaultedTransitions uint64 `json:"faultedTransitions"` + FaultCount uint64 `json:"faultCount"` } type RuntimeIndicator string @@ -99,6 +103,7 @@ const ( queueCriticalStreakThreshold = 3 queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 queueMutedRecoveryThreshold = queueCriticalStreakThreshold + queueFaultedStreakThreshold = queueCriticalStreakThreshold faultRepeatWindow = 1 * time.Second faultHistoryCapacity = 8 ) @@ -131,6 +136,7 @@ type Engine struct { lateBufferAlertAt atomic.Uint64 criticalStreak atomic.Uint64 mutedRecoveryStreak atomic.Uint64 + mutedFaultStreak atomic.Uint64 maxCycleNs atomic.Uint64 maxGenerateNs atomic.Uint64 maxUpsampleNs atomic.Uint64 @@ -140,6 +146,11 @@ type Engine struct { faultHistoryMu sync.Mutex faultHistory []FaultEvent + degradedTransitions atomic.Uint64 + mutedTransitions atomic.Uint64 + faultedTransitions atomic.Uint64 + faultEvents atomic.Uint64 + // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] @@ -406,6 +417,10 @@ func (e *Engine) Stats() EngineStats { RuntimeIndicator: ri, RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), LastFault: lastFault, + DegradedTransitions: e.degradedTransitions.Load(), + MutedTransitions: e.mutedTransitions.Load(), + FaultedTransitions: e.faultedTransitions.Load(), + FaultCount: e.faultEvents.Load(), } } @@ -570,6 +585,17 @@ func cloneFrame(src *output.CompositeFrame) *output.CompositeFrame { } func (e *Engine) setRuntimeState(state RuntimeState) { + prev := 
e.currentRuntimeState() + if prev != state { + switch state { + case RuntimeStateDegraded: + e.degradedTransitions.Add(1) + case RuntimeStateMuted: + e.mutedTransitions.Add(1) + case RuntimeStateFaulted: + e.faultedTransitions.Add(1) + } + } e.runtimeState.Store(state) } @@ -625,6 +651,7 @@ func (e *Engine) recordFault(reason FaultReason, severity FaultSeverity, message } e.lastFault.Store(ev) e.appendFaultHistory(ev) + e.faultEvents.Add(1) } func (e *Engine) loadLastFault() *FaultEvent { @@ -661,9 +688,21 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo case RuntimeStateStopping, RuntimeStateFaulted: return case RuntimeStateMuted: + if queue.Health == output.QueueHealthCritical { + if count := e.mutedFaultStreak.Add(1); count >= queueFaultedStreakThreshold { + e.mutedFaultStreak.Store(0) + e.recordFault(FaultReasonQueueCritical, FaultSeverityFaulted, + fmt.Sprintf("queue health critical for %d checks while muted (depth=%d)", count, queue.Depth)) + e.setRuntimeState(RuntimeStateFaulted) + return + } + } else { + e.mutedFaultStreak.Store(0) + } if queue.Health == output.QueueHealthNormal && !hasLateBuffers { if count := e.mutedRecoveryStreak.Add(1); count >= queueMutedRecoveryThreshold { e.mutedRecoveryStreak.Store(0) + e.mutedFaultStreak.Store(0) e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, fmt.Sprintf("queue healthy for %d checks after mute", count)) e.setRuntimeState(RuntimeStateDegraded) diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index 36d8e6f..edc333f 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -109,3 +109,73 @@ func TestEngineRuntimeStateMuteOnPersistentQueueCritical(t *testing.T) { t.Fatalf("expected running after recovery, got %s", got) } } + +func TestEngineFaultsAfterMutedCriticalStreak(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStateRunning) + + 
queue := output.QueueStats{Depth: 1, Health: output.QueueHealthCritical} + for i := 0; i < queueMutedStreakThreshold; i++ { + e.evaluateRuntimeState(queue, false) + } + if got := e.currentRuntimeState(); got != RuntimeStateMuted { + t.Fatalf("expected muted after draining critical streak, got %s", got) + } + + triggered := false + for i := 0; i < queueFaultedStreakThreshold; i++ { + e.evaluateRuntimeState(queue, false) + if e.currentRuntimeState() == RuntimeStateFaulted { + triggered = true + break + } + } + if !triggered { + t.Fatalf("expected faulted after %d extra critical checks", queueFaultedStreakThreshold) + } + if got := e.currentRuntimeState(); got != RuntimeStateFaulted { + t.Fatalf("expected faulted state, got %s", got) + } + + fault := e.LastFault() + if fault == nil { + t.Fatal("expected recorded fault") + } + if fault.Severity != FaultSeverityFaulted { + t.Fatalf("expected faulted severity, got %s", fault.Severity) + } + if fault.Reason != FaultReasonQueueCritical { + t.Fatalf("expected queue critical reason, got %s", fault.Reason) + } +} + +func TestRuntimeTransitionCounters(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + + if got := e.Stats().DegradedTransitions; got != 0 { + t.Fatalf("expected zero transitions initially, got %d", got) + } + if got := e.Stats().FaultCount; got != 0 { + t.Fatalf("expected zero faults initially, got %d", got) + } + + e.setRuntimeState(RuntimeStateDegraded) + if got := e.Stats().DegradedTransitions; got != 1 { + t.Fatalf("expected one degraded transition, got %d", got) + } + + e.setRuntimeState(RuntimeStateMuted) + if got := e.Stats().MutedTransitions; got != 1 { + t.Fatalf("expected one mute transition, got %d", got) + } + + e.setRuntimeState(RuntimeStateFaulted) + if got := e.Stats().FaultedTransitions; got != 1 { + t.Fatalf("expected one faulted transition, got %d", got) + } + + e.recordFault(FaultReasonQueueCritical, FaultSeverityWarn, "audit") + if got := 
e.Stats().FaultCount; got != 1 { + t.Fatalf("expected one recorded fault, got %d", got) + } +} From 91225157bf268f0446a34d3c2edf866ce6056790 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 23:28:08 +0200 Subject: [PATCH 27/55] ui: show fault telemetry in control health panel --- docs/pro-runtime-hardening-workboard.md | 2 ++ internal/control/ui.html | 42 ++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 67219b7..362031b 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -277,6 +277,8 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - `evaluateRuntimeState` now waits for a short healthy streak before leaving `muted`, logging a degraded-severity recovery event once the queue settles. - Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable. - `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`. +- Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs. + ## Zielzustände laut Konzept - `idle` diff --git a/internal/control/ui.html b/internal/control/ui.html index ff13122..59558e8 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1079,6 +1079,9 @@ input.input-error {
Runtime
--
Runtime Signal
--
Runtime Alert
--
+
Transitions (D/M/F)
--
+
Fault Count
--
+
Last Fault
--
Audio Buffer
--
Last Update
--
@@ -1782,6 +1785,43 @@ function updateHealth(engine, audioStream) { const last = Math.max(state.server.lastConfigAt || 0, state.server.lastRuntimeAt || 0); updateText('health-last', ageString(last)); + + const transitionsAvailable = engine.degradedTransitions != null || engine.mutedTransitions != null || engine.faultedTransitions != null; + const transitionsText = transitionsAvailable ? `${Number(engine.degradedTransitions ?? 0)} / ${Number(engine.mutedTransitions ?? 0)} / ${Number(engine.faultedTransitions ?? 0)}` : '--'; + updateText('health-transitions', transitionsText); + + const faultCountValue = engine.faultCount != null ? Number(engine.faultCount) : 0; + const hasFaultCount = engine.faultCount != null; + updateText('health-fault-count', hasFaultCount ? String(faultCountValue) : '--'); + const faultCountEl = $('health-fault-count'); + if (faultCountEl) { + faultCountEl.className = 'val' + (hasFaultCount ? (faultCountValue > 0 ? ' warn' : ' good') : ''); + } + + const lastFaultEl = $('health-last-fault'); + const lastFault = engine.lastFault; + if (lastFaultEl) { + if (lastFault) { + const severity = String(lastFault.severity || '').toLowerCase(); + const severityClass = severity === 'faulted' ? 'err' : 'warn'; + const severityLabel = (lastFault.severity || 'Fault').toUpperCase(); + const reasonLabel = lastFault.reason ? ` ${lastFault.reason}` : ''; + const messageLabel = lastFault.message ? 
` - ${lastFault.message}` : ''; + let whenLabel = ''; + if (lastFault.time) { + const parsed = new Date(lastFault.time); + if (!Number.isNaN(parsed.getTime())) { + whenLabel = ` @ ${parsed.toLocaleTimeString()}`; + } + } + const title = `${severityLabel}${reasonLabel}`; + updateText('health-last-fault', `${title}${messageLabel}${whenLabel}`); + lastFaultEl.className = 'val ' + severityClass; + } else { + lastFaultEl.className = 'val good'; + updateText('health-last-fault', 'None'); + } + } } function updateMeters(engine, driver, audioStream) { @@ -2004,4 +2044,4 @@ async function init() { init(); - \ No newline at end of file + From f275e125a71a6e5ddddc6f46d8f8b027538b2fe6 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 23:38:26 +0200 Subject: [PATCH 28/55] feat: add runtime fault reset path --- cmd/fmrtx/main.go | 4 ++ docs/API.md | 16 ++++++++ docs/pro-runtime-hardening-workboard.md | 3 ++ internal/app/engine.go | 14 +++++++ internal/app/runtime_state_test.go | 34 +++++++++++++++++ internal/control/control.go | 22 +++++++++++ internal/control/control_test.go | 51 +++++++++++++++++++++++++ 7 files changed, 144 insertions(+) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index 46d9113..9a466fb 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -287,3 +287,7 @@ func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error { RadioText: lp.RadioText, }) } + +func (b *txBridge) ResetFault() error { + return b.engine.ResetFault() +} diff --git a/docs/API.md b/docs/API.md index c8807d4..dd9da0c 100644 --- a/docs/API.md +++ b/docs/API.md @@ -79,6 +79,22 @@ Live engine and driver telemetry. Only populated when TX is active. --- +### `POST /runtime/fault/reset` + +Manually acknowledge a `faulted` runtime state so the supervisor can re-enter the recovery path (the engine moves back to `degraded` once the reset succeeds). 
+ +**Response:** +```json +{"ok": true} +``` + +**Errors:** +- `405 Method Not Allowed` if the request is not a POST +- `503 Service Unavailable` when no TX controller is attached (`--tx` mode not active) +- `409 Conflict` when the engine is not currently faulted or the reset was rejected (e.g. still throttled) + +--- + ### `GET /config` Full current configuration (all fields, including non-patchable). diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 362031b..f84e4db 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -278,6 +278,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable. - `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`. - Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs. +- Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path. ## Zielzustände laut Konzept @@ -327,11 +328,13 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ | Datum | Entscheidung | Notiz | |---|---|---| | 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. | +| 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. 
| ## WS-02 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. | +| 2026-04-05 | Runtime fault reset API | `go test ./...` now runs `TestRuntimeFaultReset*`, verifying the new HTTP path and controller error scenarios. | --- diff --git a/internal/app/engine.go b/internal/app/engine.go index cc8d8b8..9a40ae7 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -744,3 +744,17 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo } e.setRuntimeState(RuntimeStateRunning) } + +// ResetFault attempts to move the engine out of the faulted state. +func (e *Engine) ResetFault() error { + state := e.currentRuntimeState() + if state != RuntimeStateFaulted { + return fmt.Errorf("engine not in faulted state (current=%s)", state) + } + + e.criticalStreak.Store(0) + e.mutedRecoveryStreak.Store(0) + e.mutedFaultStreak.Store(0) + e.setRuntimeState(RuntimeStateDegraded) + return nil +} diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index edc333f..018913f 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -179,3 +179,37 @@ func TestRuntimeTransitionCounters(t *testing.T) { t.Fatalf("expected one recorded fault, got %d", got) } } + + +func TestEngineResetFaultRequiresFaultedState(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + if err := e.ResetFault(); err == nil { + t.Fatal("expected error when resetting non-faulted state") + } +} + +func TestEngineResetFaultTransitionsToDegraded(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.criticalStreak.Store(7) + 
e.mutedRecoveryStreak.Store(3) + e.mutedFaultStreak.Store(1) + e.setRuntimeState(RuntimeStateFaulted) + if err := e.ResetFault(); err != nil { + t.Fatalf("reset fault failed: %v", err) + } + if got := e.currentRuntimeState(); got != RuntimeStateDegraded { + t.Fatalf("expected degraded after reset, got %s", got) + } + if e.criticalStreak.Load() != 0 { + t.Fatalf("expected critical streak reset, got %d", e.criticalStreak.Load()) + } + if e.mutedRecoveryStreak.Load() != 0 { + t.Fatalf("expected mute recovery streak reset, got %d", e.mutedRecoveryStreak.Load()) + } + if e.mutedFaultStreak.Load() != 0 { + t.Fatalf("expected mute fault streak reset, got %d", e.mutedFaultStreak.Load()) + } + if err := e.ResetFault(); err == nil { + t.Fatal("expected error when resetting after recovery") + } +} diff --git a/internal/control/control.go b/internal/control/control.go index 823a8af..5509199 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -23,6 +23,7 @@ type TXController interface { StopTX() error TXStats() map[string]any UpdateConfig(patch LivePatch) error + ResetFault() error } // LivePatch mirrors the patchable fields from ConfigPatch for the engine. 
@@ -95,6 +96,7 @@ func (s *Server) Handler() http.Handler { mux.HandleFunc("/dry-run", s.handleDryRun) mux.HandleFunc("/config", s.handleConfig) mux.HandleFunc("/runtime", s.handleRuntime) + mux.HandleFunc("/runtime/fault/reset", s.handleRuntimeFaultReset) mux.HandleFunc("/tx/start", s.handleTXStart) mux.HandleFunc("/tx/stop", s.handleTXStop) mux.HandleFunc("/audio/stream", s.handleAudioStream) @@ -171,6 +173,26 @@ func (s *Server) handleRuntime(w http.ResponseWriter, _ *http.Request) { _ = json.NewEncoder(w).Encode(result) } +func (s *Server) handleRuntimeFaultReset(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + s.mu.RLock() + tx := s.tx + s.mu.RUnlock() + if tx == nil { + http.Error(w, "tx controller not available", http.StatusServiceUnavailable) + return + } + if err := tx.ResetFault(); err != nil { + http.Error(w, err.Error(), http.StatusConflict) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{"ok": true}) +} + // handleAudioStream accepts raw S16LE stereo PCM via HTTP POST and pushes // it into the live audio ring buffer. 
Use with: // curl -X POST --data-binary @- http://host:8088/audio/stream < audio.raw diff --git a/internal/control/control_test.go b/internal/control/control_test.go index d810666..a42ca51 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -125,6 +125,55 @@ func TestRuntimeWithoutDriver(t *testing.T) { } } +func TestRuntimeFaultResetRejectsGet(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/runtime/fault/reset", nil) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusMethodNotAllowed { + t.Fatalf("expected 405 for fault reset GET, got %d", rec.Code) + } +} + +func TestRuntimeFaultResetRequiresController(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusServiceUnavailable { + t.Fatalf("expected 503 without controller, got %d", rec.Code) + } +} + +func TestRuntimeFaultResetControllerError(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{resetErr: errors.New("boom")}) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusConflict { + t.Fatalf("expected 409 when controller rejects, got %d", rec.Code) + } +} + +func TestRuntimeFaultResetSuccess(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{}) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != 200 { + t.Fatalf("expected 200 on success, got %d", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal response: %v", err) + } + if ok, _ := 
body["ok"].(bool); !ok { + t.Fatalf("expected ok true, got %v", body["ok"]) + } +} + func TestAudioStreamRequiresSource(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() @@ -239,6 +288,7 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) { type fakeTXController struct { updateErr error + resetErr error stats map[string]any } @@ -251,3 +301,4 @@ func (f *fakeTXController) TXStats() map[string]any { return map[string]any{} } func (f *fakeTXController) UpdateConfig(_ LivePatch) error { return f.updateErr } +func (f *fakeTXController) ResetFault() error { return f.resetErr } From c2a27e71ec22c65574e21703a9bca28691cff886 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 23:45:54 +0200 Subject: [PATCH 29/55] ui: add manual fault reset action --- docs/pro-runtime-hardening-workboard.md | 2 ++ internal/control/ui.html | 29 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index f84e4db..c0e2653 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -279,6 +279,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`. - Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs. - Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path. +- Control-plane UI now also offers a Danger Zone `Reset Fault` button that calls the same endpoint so operators can acknowledge faults from the dashboard. 
## Zielzustände laut Konzept @@ -329,6 +330,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ |---|---|---| | 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. | | 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. | +| 2026-04-05 | Fault-reset UI shortcut | Danger Zone now hosts a Reset Fault button wired to `/runtime/fault/reset` so operators get an in-app acknowledgement path without manual HTTP calls. | ## WS-02 Verifikation | Datum | Fokus | Ergebnis | diff --git a/internal/control/ui.html b/internal/control/ui.html index 59558e8..156d1e1 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1121,6 +1121,7 @@ input.input-error {
+
@@ -1175,6 +1176,7 @@ const state = { dirty: new Set(), pendingRequests: 0, txBusy: false, + faultResetBusy: false, toggleBusy: {}, pollersStarted: false, mobilePanelsApplied: false, @@ -1489,6 +1491,26 @@ async function txAction(action) { } } +async function resetFaultAction() { + if (state.faultResetBusy) return; + state.faultResetBusy = true; + render(); + beginRequest(); + try { + await api('/runtime/fault/reset', { method: 'POST' }); + toast('Fault reset', 'ok'); + log('Fault reset request accepted', 'ok'); + await loadRuntime({ silent: true }); + } catch (error) { + toast(error.message, 'err'); + log('Fault reset failed: ' + error.message, 'err'); + } finally { + state.faultResetBusy = false; + endRequest(); + render(); + } +} + function fmt(n) { if (n == null) return '--'; if (n >= 1e9) return (n / 1e9).toFixed(2) + 'G'; @@ -1686,6 +1708,12 @@ function render() { $('btn-refresh').disabled = state.pendingRequests > 0; $('danger-stop').disabled = stopDisabled; $('danger-refresh').disabled = state.pendingRequests > 0; + const resetFaultBtn = $('danger-reset-fault'); + if (resetFaultBtn) { + const resetDisabled = state.faultResetBusy || !state.server.runtimeOk; + resetFaultBtn.disabled = resetDisabled; + resetFaultBtn.textContent = state.faultResetBusy ? 'Resetting…' : 'Reset Fault'; + } syncDirtyInput('freq-slider', 'frequencyMHz', (v) => typeof v === 'number' ? v.toFixed(1) : '100.0'); syncDirtyInput('freq-num', 'frequencyMHz', (v) => typeof v === 'number' ? 
v.toFixed(1) : '100.0'); @@ -1901,6 +1929,7 @@ function bindInputs() { $('danger-stop').addEventListener('click', () => txAction('stop')); $('btn-refresh').addEventListener('click', manualRefresh); $('danger-refresh').addEventListener('click', manualRefresh); + $('danger-reset-fault').addEventListener('click', () => resetFaultAction()); document.querySelectorAll('.toggle[data-toggle]').forEach((toggle) => { const key = toggle.dataset.toggle; From 9fbe4e5bf914b2a8e1677cbad401255d3422245e Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Sun, 5 Apr 2026 23:54:57 +0200 Subject: [PATCH 30/55] ui: show runtime state in control health --- internal/control/control.go | 3 +++ internal/control/control_test.go | 21 ++++++++++++++++++-- internal/control/ui.html | 33 +++++++++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/internal/control/control.go b/internal/control/control.go index 5509199..5ec9a97 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -145,6 +145,9 @@ func (s *Server) handleStatus(w http.ResponseWriter, _ *http.Request) { if queue, ok := stats["queue"]; ok { status["queue"] = queue } + if runtimeState, ok := stats["state"]; ok { + status["runtimeState"] = runtimeState + } } } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index a42ca51..f7e1c4d 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -8,8 +8,8 @@ import ( "net/http/httptest" "testing" - cfgpkg "github.com/jan/fm-rds-tx/internal/config" "github.com/jan/fm-rds-tx/internal/audio" + cfgpkg "github.com/jan/fm-rds-tx/internal/config" "github.com/jan/fm-rds-tx/internal/output" ) @@ -92,6 +92,23 @@ func TestStatusReportsQueueStats(t *testing.T) { } } +func TestStatusReportsRuntimeState(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{stats: map[string]any{"state": "faulted"}}) + rec := httptest.NewRecorder() + 
srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/status", nil)) + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal runtime state: %v", err) + } + if body["runtimeState"] != "faulted" { + t.Fatalf("expected runtimeState faulted, got %v", body["runtimeState"]) + } +} + func TestDryRunEndpoint(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() @@ -301,4 +318,4 @@ func (f *fakeTXController) TXStats() map[string]any { return map[string]any{} } func (f *fakeTXController) UpdateConfig(_ LivePatch) error { return f.updateErr } -func (f *fakeTXController) ResetFault() error { return f.resetErr } +func (f *fakeTXController) ResetFault() error { return f.resetErr } diff --git a/internal/control/ui.html b/internal/control/ui.html index 156d1e1..374948d 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1764,14 +1764,41 @@ function renderToggle(key, toggleId, labelId) { updateText(labelId, busy ? '...' : (on ? 'ON' : 'OFF')); } +function runtimeStateClass(engineState) { + const normalized = String(engineState || '').toLowerCase(); + if (!normalized) { + return 'warn'; + } + switch (normalized) { + case 'faulted': + return 'err'; + case 'muted': + case 'degraded': + case 'prebuffering': + case 'arming': + case 'stopping': + case 'idle': + case 'unknown': + return 'warn'; + default: + return 'good'; + } +} + function updateHealth(engine, audioStream) { engine = engine || {}; updateText('health-http', state.server.configOk ? 'OK' : 'OFFLINE'); $('health-http').className = 'val ' + (state.server.configOk ? 'good' : 'err'); - const runtimeState = state.server.runtimeOk ? 'OK' : 'WAITING'; - updateText('health-runtime', runtimeState); - $('health-runtime').className = 'val ' + (state.server.runtimeOk ? 
'good' : 'warn'); + let runtimeLabel = 'WAITING'; + let runtimeClass = 'warn'; + if (state.server.runtimeOk) { + const engineStateName = String(engine.state || 'unknown'); + runtimeLabel = engineStateName.toUpperCase(); + runtimeClass = runtimeStateClass(engineStateName); + } + updateText('health-runtime', runtimeLabel); + $('health-runtime').className = 'val ' + runtimeClass; const runtimeIndicator = engine.runtimeIndicator; const indicatorLabels = { From 14501a3925c41dda7a30bc981e6e62d285cceb69 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 00:05:48 +0200 Subject: [PATCH 31/55] feat: highlight runtime state transitions --- docs/pro-runtime-hardening-workboard.md | 4 +++ internal/control/ui.html | 37 +++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index c0e2653..7be3c93 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -281,6 +281,8 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path. - Control-plane UI now also offers a Danger Zone `Reset Fault` button that calls the same endpoint so operators can acknowledge faults from the dashboard. +- Control-plane UI now posts an ops toast/log entry whenever the runtime state shifts so escalations and manual acknowledgements are immediately visible. + ## Zielzustände laut Konzept - `idle` @@ -331,12 +333,14 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ | 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. 
| | 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. | | 2026-04-05 | Fault-reset UI shortcut | Danger Zone now hosts a Reset Fault button wired to `/runtime/fault/reset` so operators get an in-app acknowledgement path without manual HTTP calls. | +| 2026-04-06 | Runtime transition visibility cue | Control UI now posts toast/log entries for runtime state shifts so ops instantly sees escalations and manual reset acknowledgements. | ## WS-02 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. | | 2026-04-05 | Runtime fault reset API | `go test ./...` now runs `TestRuntimeFaultReset*`, verifying the new HTTP path and controller error scenarios. | +| 2026-04-06 | Runtime transition visibility | ✅ `go test ./...`; manual UI smoke verification still pending to ensure the toast/log flow shows every runtime shift. | --- diff --git a/internal/control/ui.html b/internal/control/ui.html index 374948d..d105404 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1163,6 +1163,7 @@ const state = { configOk: false, runtimeOk: false, }, + lastRuntimeState: '', draft: { frequencyMHz: undefined, ps: undefined, @@ -1338,6 +1339,7 @@ async function loadRuntime({ silent = true } = {}) { state.server.runtime = runtime; state.server.runtimeOk = true; state.server.lastRuntimeAt = nowTs(); + notifyRuntimeTransition(runtime.engine); pushHistory(runtime); setConnection(true, state.pendingRequests > 0 ? 
'busy' : 'connected'); render(); @@ -1785,6 +1787,41 @@ function runtimeStateClass(engineState) { } } + + +function normalizeRuntimeState(stateName) { + const normalized = (typeof stateName === 'string' ? stateName.trim().toLowerCase() : ''); + return normalized || 'idle'; +} + +function runtimeStateSeverity(stateName) { + const normalized = normalizeRuntimeState(stateName); + switch (normalized) { + case 'running': + return 'ok'; + case 'degraded': + case 'muted': + return 'warn'; + case 'faulted': + return 'err'; + default: + return 'info'; + } +} + +function notifyRuntimeTransition(engine) { + if (!engine) return; + const next = normalizeRuntimeState(engine.state); + const prev = state.lastRuntimeState; + state.lastRuntimeState = next; + if (!prev || prev === next) return; + const message = `Runtime ${prev.toUpperCase()} → ${next.toUpperCase()}`; + const severity = runtimeStateSeverity(next); + const logLevel = severity === 'err' ? 'err' : (severity === 'warn' ? 'warn' : 'info'); + toast(message, severity); + log(message, logLevel); +} + function updateHealth(engine, audioStream) { engine = engine || {}; updateText('health-http', state.server.configOk ? 
'OK' : 'OFFLINE'); From 24adbff8f27b8394360a28353c3b507611649557 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 00:23:28 +0200 Subject: [PATCH 32/55] feat: expose fault history in runtime and UI --- cmd/fmrtx/main.go | 29 +++++----- cmd/fmrtx/main_test.go | 7 +++ docs/API.md | 18 +++++- internal/app/engine.go | 78 ++++++++++++------------- internal/control/control_test.go | 33 +++++++++++ internal/control/ui.html | 98 ++++++++++++++++++++++++++++++++ 6 files changed, 210 insertions(+), 53 deletions(-) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index 9a466fb..ddc7e49 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -252,24 +252,25 @@ func (b *txBridge) StopTX() error { return b.engine.Stop(context.Background()) func (b *txBridge) TXStats() map[string]any { s := b.engine.Stats() return map[string]any{ - "state": s.State, - "chunksProduced": s.ChunksProduced, - "totalSamples": s.TotalSamples, - "underruns": s.Underruns, - "lateBuffers": s.LateBuffers, - "lastError": s.LastError, - "uptimeSeconds": s.UptimeSeconds, - "maxCycleMs": s.MaxCycleMs, - "maxGenerateMs": s.MaxGenerateMs, - "maxUpsampleMs": s.MaxUpsampleMs, - "maxWriteMs": s.MaxWriteMs, - "queue": s.Queue, - "runtimeIndicator": s.RuntimeIndicator, - "runtimeAlert": s.RuntimeAlert, + "state": s.State, + "chunksProduced": s.ChunksProduced, + "totalSamples": s.TotalSamples, + "underruns": s.Underruns, + "lateBuffers": s.LateBuffers, + "lastError": s.LastError, + "uptimeSeconds": s.UptimeSeconds, + "maxCycleMs": s.MaxCycleMs, + "maxGenerateMs": s.MaxGenerateMs, + "maxUpsampleMs": s.MaxUpsampleMs, + "maxWriteMs": s.MaxWriteMs, + "queue": s.Queue, + "runtimeIndicator": s.RuntimeIndicator, + "runtimeAlert": s.RuntimeAlert, "degradedTransitions": s.DegradedTransitions, "mutedTransitions": s.MutedTransitions, "faultedTransitions": s.FaultedTransitions, "faultCount": s.FaultCount, + "faultHistory": s.FaultHistory, "lastFault": s.LastFault, } } diff --git a/cmd/fmrtx/main_test.go 
b/cmd/fmrtx/main_test.go index 43bc67f..cb68607 100644 --- a/cmd/fmrtx/main_test.go +++ b/cmd/fmrtx/main_test.go @@ -45,4 +45,11 @@ func TestTxBridgeExportsQueueStats(t *testing.T) { if indicator != apppkg.RuntimeIndicatorQueueCritical { t.Fatalf("runtime indicator should be queueCritical, got %s", indicator) } + if historyRaw, ok := stats["faultHistory"]; !ok { + t.Fatalf("expected faultHistory in tx stats") + } else if history, ok := historyRaw.([]apppkg.FaultEvent); !ok { + t.Fatalf("faultHistory type mismatch: %T", historyRaw) + } else if len(history) != 0 { + t.Fatalf("expected no faults yet, got %d", len(history)) + } } diff --git a/docs/API.md b/docs/API.md index dd9da0c..5ad9fa6 100644 --- a/docs/API.md +++ b/docs/API.md @@ -17,6 +17,7 @@ Health check. `engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. + --- ### `GET /status` @@ -62,7 +63,22 @@ Live engine and driver telemetry. Only populated when TX is active. 
"totalSamples": 1408950000, "underruns": 0, "lastError": "", - "uptimeSeconds": 3614.2 + "uptimeSeconds": 3614.2, + "faultCount": 2, + "lastFault": { + "time": "2026-04-06T00:00:00Z", + "reason": "queueCritical", + "severity": "faulted", + "message": "queue health critical for 5 checks" + }, + "faultHistory": [ + { + "time": "2026-04-06T00:00:00Z", + "reason": "queueCritical", + "severity": "faulted", + "message": "queue health critical for 5 checks" + } + ] }, "driver": { "txEnabled": true, diff --git a/internal/app/engine.go b/internal/app/engine.go index 9a40ae7..d492094 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -69,25 +69,26 @@ func durationMs(ns uint64) float64 { } type EngineStats struct { - State string `json:"state"` - ChunksProduced uint64 `json:"chunksProduced"` - TotalSamples uint64 `json:"totalSamples"` - Underruns uint64 `json:"underruns"` - LateBuffers uint64 `json:"lateBuffers,omitempty"` - LastError string `json:"lastError,omitempty"` - UptimeSeconds float64 `json:"uptimeSeconds"` - MaxCycleMs float64 `json:"maxCycleMs,omitempty"` - MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` - MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` - MaxWriteMs float64 `json:"maxWriteMs,omitempty"` - Queue output.QueueStats `json:"queue"` - RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` - RuntimeAlert string `json:"runtimeAlert,omitempty"` - LastFault *FaultEvent `json:"lastFault,omitempty"` - DegradedTransitions uint64 `json:"degradedTransitions"` - MutedTransitions uint64 `json:"mutedTransitions"` - FaultedTransitions uint64 `json:"faultedTransitions"` - FaultCount uint64 `json:"faultCount"` + State string `json:"state"` + ChunksProduced uint64 `json:"chunksProduced"` + TotalSamples uint64 `json:"totalSamples"` + Underruns uint64 `json:"underruns"` + LateBuffers uint64 `json:"lateBuffers,omitempty"` + LastError string `json:"lastError,omitempty"` + UptimeSeconds float64 `json:"uptimeSeconds"` + MaxCycleMs float64 
`json:"maxCycleMs,omitempty"` + MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` + MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` + MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + Queue output.QueueStats `json:"queue"` + RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` + RuntimeAlert string `json:"runtimeAlert,omitempty"` + LastFault *FaultEvent `json:"lastFault,omitempty"` + DegradedTransitions uint64 `json:"degradedTransitions"` + MutedTransitions uint64 `json:"mutedTransitions"` + FaultedTransitions uint64 `json:"faultedTransitions"` + FaultCount uint64 `json:"faultCount"` + FaultHistory []FaultEvent `json:"faultHistory,omitempty"` } type RuntimeIndicator string @@ -146,10 +147,10 @@ type Engine struct { faultHistoryMu sync.Mutex faultHistory []FaultEvent - degradedTransitions atomic.Uint64 - mutedTransitions atomic.Uint64 - faultedTransitions atomic.Uint64 - faultEvents atomic.Uint64 + degradedTransitions atomic.Uint64 + mutedTransitions atomic.Uint64 + faultedTransitions atomic.Uint64 + faultEvents atomic.Uint64 // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] @@ -402,25 +403,26 @@ func (e *Engine) Stats() EngineStats { ri := runtimeIndicator(queue.Health, hasRecentLateBuffers) lastFault := e.lastFaultEvent() return EngineStats{ - State: string(e.currentRuntimeState()), - ChunksProduced: e.chunksProduced.Load(), - TotalSamples: e.totalSamples.Load(), - Underruns: e.underruns.Load(), - LateBuffers: lateBuffers, - LastError: errVal, - UptimeSeconds: uptime, - MaxCycleMs: durationMs(e.maxCycleNs.Load()), - MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), - MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), - MaxWriteMs: durationMs(e.maxWriteNs.Load()), - Queue: queue, - RuntimeIndicator: ri, - RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), - LastFault: lastFault, + State: string(e.currentRuntimeState()), + ChunksProduced: e.chunksProduced.Load(), + TotalSamples: 
e.totalSamples.Load(), + Underruns: e.underruns.Load(), + LateBuffers: lateBuffers, + LastError: errVal, + UptimeSeconds: uptime, + MaxCycleMs: durationMs(e.maxCycleNs.Load()), + MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), + MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), + MaxWriteMs: durationMs(e.maxWriteNs.Load()), + Queue: queue, + RuntimeIndicator: ri, + RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), + LastFault: lastFault, DegradedTransitions: e.degradedTransitions.Load(), MutedTransitions: e.mutedTransitions.Load(), FaultedTransitions: e.faultedTransitions.Load(), FaultCount: e.faultEvents.Load(), + FaultHistory: e.FaultHistory(), } } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index f7e1c4d..8656643 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -142,6 +142,39 @@ func TestRuntimeWithoutDriver(t *testing.T) { } } +func TestRuntimeReportsFaultHistory(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + history := []map[string]any{ + { + "time": "2026-04-06T00:00:00Z", + "reason": "queueCritical", + "severity": "faulted", + "message": "queue critical", + }, + } + srv.SetTXController(&fakeTXController{stats: map[string]any{"faultHistory": history}}) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/runtime", nil)) + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal runtime: %v", err) + } + engineRaw, ok := body["engine"].(map[string]any) + if !ok { + t.Fatalf("runtime engine missing") + } + histRaw, ok := engineRaw["faultHistory"].([]any) + if !ok { + t.Fatalf("faultHistory missing or wrong type: %T", engineRaw["faultHistory"]) + } + if len(histRaw) != len(history) { + t.Fatalf("faultHistory length mismatch: want %d got %d", len(history), len(histRaw)) + } +} + func 
TestRuntimeFaultResetRejectsGet(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() diff --git a/internal/control/ui.html b/internal/control/ui.html index d105404..ba94f0c 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -771,6 +771,50 @@ input.input-error { .health-line .val.warn { color: var(--amber); } .health-line .val.err { color: var(--accent); } +.fault-history { + margin-top: 12px; + padding: 10px; + border: 1px solid var(--border); + border-radius: 6px; + background: var(--surface1); + font-size: 11px; + max-height: 180px; + overflow-y: auto; + line-height: 1.3; +} +.fault-history-entry { + display: flex; + justify-content: space-between; + gap: 10px; + padding: 4px 0; + border-bottom: 1px solid rgba(255, 255, 255, 0.08); +} +.fault-history-entry:last-child { + border-bottom: none; +} +.fault-history-entry .fault-history-time { + color: var(--text-dim); +} +.fault-history-entry.ok { color: var(--green); } +.fault-history-entry.warn { color: var(--amber); } +.fault-history-entry.err { color: var(--accent); } +.fault-history-desc { + font-size: 10px; + flex: 1; + text-transform: uppercase; + letter-spacing: 0.5px; +} +.fault-history-empty { + padding: 6px 0; + color: var(--text-muted); + font-size: 11px; +} +.section-note.reset-hint { + font-size: 11px; + color: var(--text-dim); + margin-top: 10px; +} + .log { background: var(--bg); border: 1px solid var(--border); @@ -1122,6 +1166,24 @@ input.input-error { + + +
+ Reset Fault moves the runtime back to DEGRADED while the queue settles before running again. +
+ + + +
+
+

Fault History

+
recent faults
+ +
+
+
Recent fault events for quick ops situational awareness.
+
+
No faults yet.
@@ -1750,6 +1812,8 @@ function render() { updateText('info-live', engine.state ? `${String(engine.state).toUpperCase()} / ${state.server.runtimeOk ? 'runtime ok' : 'runtime pending'}` : (state.server.configOk ? 'config only' : '--')); updateHealth(engine, audioStream); + updateFaultHistory(engine); + updateResetHint(engine); updateMeters(engine, driver, audioStream); drawSparkline('spark-audio', state.charts.audio, 'good', 1); drawSparkline('spark-underruns', state.charts.underruns, underruns > 0 ? 'err' : 'warn'); @@ -1916,6 +1980,40 @@ function updateHealth(engine, audioStream) { } } + +function updateFaultHistory(engine) { + const container = $('fault-history'); + if (!container) return; + const history = Array.isArray(engine?.faultHistory) ? engine.faultHistory : []; + if (!history.length) { + container.innerHTML = '
No faults recorded yet.
'; + return; + } + const rows = history.slice().reverse().map((entry) => { + const when = entry?.time ? new Date(entry.time) : null; + const timeLabel = when && !Number.isNaN(when.getTime()) ? when.toLocaleTimeString() : '--:--'; + const severity = String(entry?.severity || 'warn').toLowerCase(); + const severityLabel = String(entry?.severity || 'Fault').toUpperCase(); + const reasonLabel = entry?.reason ? ` ${entry.reason}` : ''; + const messageLabel = entry?.message ? ` · ${entry.message}` : ''; + return `
${timeLabel}${severityLabel}${reasonLabel}${messageLabel}
`; + }); + container.innerHTML = rows.join(''); +} + +function updateResetHint(engine) { + const hint = $('reset-hint'); + if (!hint) return; + const stateName = String(engine?.state || '').toLowerCase(); + let text = 'Manual fault reset drops runtime to DEGRADED while the queue recovers.'; + if (stateName === 'faulted') { + text = 'Faulted: reset moves runtime back to DEGRADED until the queue settles.'; + } else if (stateName === 'muted' || stateName === 'degraded') { + text = 'Reset Fault keeps the runtime in DEGRADED so the queue can recover before running again.'; + } + hint.textContent = text; +} + function updateMeters(engine, driver, audioStream) { if (audioStream && typeof audioStream.buffered === 'number') { const ratio = Math.max(0, Math.min(1, audioStream.buffered)); From 051d5f2de580bdd07e197a0f1961bec2a7926afd Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 00:40:20 +0200 Subject: [PATCH 33/55] feat: add transition history visibility --- docs/pro-runtime-hardening-workboard.md | 2 + internal/control/ui.html | 105 +++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 7be3c93..8fb6a52 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -282,6 +282,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ - Control-plane UI now also offers a Danger Zone `Reset Fault` button that calls the same endpoint so operators can acknowledge faults from the dashboard. - Control-plane UI now posts an ops toast/log entry whenever the runtime state shifts so escalations and manual acknowledgements are immediately visible. +- Control-plane UI now keeps a compact Transition History panel beside the Fault History so operators can see recent runtime shifts without scrolling the activity log. 
## Zielzustände laut Konzept @@ -334,6 +335,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ | 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. | | 2026-04-05 | Fault-reset UI shortcut | Danger Zone now hosts a Reset Fault button wired to `/runtime/fault/reset` so operators get an in-app acknowledgement path without manual HTTP calls. | | 2026-04-06 | Runtime transition visibility cue | Control UI now posts toast/log entries for runtime state shifts so ops instantly sees escalations and manual reset acknowledgements. | +| 2026-04-06 | Transition history panel | Added a compact Transition History panel next to the Fault History so the last few runtime state shifts stay visible even when the activity log is full. | ## WS-02 Verifikation | Datum | Fokus | Ergebnis | diff --git a/internal/control/ui.html b/internal/control/ui.html index ba94f0c..f3aeb7b 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -809,6 +809,45 @@ input.input-error { color: var(--text-muted); font-size: 11px; } +.transition-history { + margin-top: 12px; + padding: 10px; + border: 1px solid var(--border); + border-radius: 6px; + background: var(--surface); + font-size: 11px; + max-height: 180px; + overflow-y: auto; + line-height: 1.3; +} +.transition-history-entry { + display: flex; + justify-content: space-between; + gap: 10px; + padding: 4px 0; + border-bottom: 1px solid rgba(255, 255, 255, 0.08); +} +.transition-history-entry:last-child { + border-bottom: none; +} +.transition-history-entry .transition-history-time { + color: var(--text-dim); +} +.transition-history-entry.good { color: var(--green); } +.transition-history-entry.warn { color: var(--amber); } +.transition-history-entry.err { color: var(--accent); } +.transition-history-entry.info { color: var(--text); } +.transition-history-desc { + font-size: 10px; + flex: 1; + 
text-transform: uppercase; + letter-spacing: 0.5px; +} +.transition-history-empty { + padding: 6px 0; + color: var(--text-muted); + font-size: 11px; +} .section-note.reset-hint { font-size: 11px; color: var(--text-dim); @@ -1174,6 +1213,20 @@ input.input-error { +
+
+

Transition History

+
recent state shifts
+ +
+
+
Keeps runtime escalations visible without scrolling the activity log.
+
+
No transitions yet.
+
+
+
+

Fault History

@@ -1215,6 +1268,7 @@ const configPollMs = 8000; const mobileMq = window.matchMedia('(max-width: 640px)'); const freqPresetValues = [87.6, 94.5, 99.5, 100.0, 107.9]; const sparkHistoryLimit = 40; +const transitionHistoryLimit = 6; const state = { server: { @@ -1248,6 +1302,7 @@ const state = { underruns: [], tx: [], }, + runtimeTransitions: [], freqPresetIndex: 0, }; @@ -1425,6 +1480,52 @@ function pushHistory(runtime) { pushChart(state.charts.tx, txState === 'running' ? 1 : state.txBusy ? 0.55 : 0.05); } +function pushTransitionHistory(from, to, severity) { + if (!from || !to) return; + const entry = { + from: normalizeRuntimeState(from), + to: normalizeRuntimeState(to), + severity: severity || 'info', + time: nowTs(), + }; + state.runtimeTransitions.unshift(entry); + if (state.runtimeTransitions.length > transitionHistoryLimit) { + state.runtimeTransitions.splice(transitionHistoryLimit); + } + updateTransitionHistory(); +} + +function transitionSeverityClass(severity) { + switch (String(severity || '').toLowerCase()) { + case 'err': + return 'err'; + case 'warn': + return 'warn'; + case 'ok': + case 'good': + return 'good'; + default: + return 'info'; + } +} + +function updateTransitionHistory() { + const container = $('transition-history'); + if (!container) return; + if (!state.runtimeTransitions.length) { + container.innerHTML = '
No transitions yet.
'; + return; + } + const rows = state.runtimeTransitions.map((entry) => { + const when = entry?.time ? new Date(entry.time) : null; + const timeLabel = when && !Number.isNaN(when.getTime()) ? when.toLocaleTimeString() : '--:--'; + const desc = `${entry.from.toUpperCase()} → ${entry.to.toUpperCase()}`; + const severityClass = transitionSeverityClass(entry.severity); + return `
${timeLabel}${desc}
`; + }); + container.innerHTML = rows.join(''); +} + function pushChart(arr, value) { arr.push(Number.isFinite(value) ? value : 0); if (arr.length > sparkHistoryLimit) arr.splice(0, arr.length - sparkHistoryLimit); @@ -1813,6 +1914,7 @@ function render() { updateHealth(engine, audioStream); updateFaultHistory(engine); + updateTransitionHistory(); updateResetHint(engine); updateMeters(engine, driver, audioStream); drawSparkline('spark-audio', state.charts.audio, 'good', 1); @@ -1879,8 +1981,9 @@ function notifyRuntimeTransition(engine) { const prev = state.lastRuntimeState; state.lastRuntimeState = next; if (!prev || prev === next) return; - const message = `Runtime ${prev.toUpperCase()} → ${next.toUpperCase()}`; const severity = runtimeStateSeverity(next); + pushTransitionHistory(prev, next, severity); + const message = `Runtime ${prev.toUpperCase()} → ${next.toUpperCase()}`; const logLevel = severity === 'err' ? 'err' : (severity === 'warn' ? 'warn' : 'info'); toast(message, severity); log(message, logLevel); From a7549f4187cd3ffbdb5ec1d12d7b90341128aea9 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 00:59:56 +0200 Subject: [PATCH 34/55] Expose runtime transition history --- cmd/fmrtx/main.go | 1 + docs/API.md | 11 ++- internal/app/engine.go | 124 ++++++++++++++++++++--------- internal/app/runtime_state_test.go | 23 ++++++ internal/control/control_test.go | 30 +++++++ internal/control/ui.html | 34 +++++++- 6 files changed, 183 insertions(+), 40 deletions(-) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index ddc7e49..ea9358e 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -271,6 +271,7 @@ func (b *txBridge) TXStats() map[string]any { "faultedTransitions": s.FaultedTransitions, "faultCount": s.FaultCount, "faultHistory": s.FaultHistory, + "transitionHistory": s.TransitionHistory, "lastFault": s.LastFault, } } diff --git a/docs/API.md b/docs/API.md index 5ad9fa6..4d1ea4c 100644 --- a/docs/API.md +++ b/docs/API.md @@ -78,6 +78,14 @@ 
Live engine and driver telemetry. Only populated when TX is active. "severity": "faulted", "message": "queue health critical for 5 checks" } + ], + "transitionHistory": [ + { + "time": "2026-04-06T00:00:00Z", + "from": "running", + "to": "degraded", + "severity": "warn" + } ] }, "driver": { @@ -90,9 +98,10 @@ Live engine and driver telemetry. Only populated when TX is active. } } ``` - `engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. +`transitionHistory` liefert die jüngsten Übergänge (from/to, severity, timestamp) damit API und UI die Runtime History synchronisieren können. + --- ### `POST /runtime/fault/reset` diff --git a/internal/app/engine.go b/internal/app/engine.go index d492094..041d0ed 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -69,26 +69,27 @@ func durationMs(ns uint64) float64 { } type EngineStats struct { - State string `json:"state"` - ChunksProduced uint64 `json:"chunksProduced"` - TotalSamples uint64 `json:"totalSamples"` - Underruns uint64 `json:"underruns"` - LateBuffers uint64 `json:"lateBuffers,omitempty"` - LastError string `json:"lastError,omitempty"` - UptimeSeconds float64 `json:"uptimeSeconds"` - MaxCycleMs float64 `json:"maxCycleMs,omitempty"` - MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` - MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` - MaxWriteMs float64 `json:"maxWriteMs,omitempty"` - Queue output.QueueStats `json:"queue"` - RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` - RuntimeAlert string `json:"runtimeAlert,omitempty"` - LastFault *FaultEvent `json:"lastFault,omitempty"` - DegradedTransitions uint64 `json:"degradedTransitions"` - MutedTransitions uint64 `json:"mutedTransitions"` - FaultedTransitions uint64 `json:"faultedTransitions"` - FaultCount uint64 `json:"faultCount"` - FaultHistory []FaultEvent 
`json:"faultHistory,omitempty"` + State string `json:"state"` + ChunksProduced uint64 `json:"chunksProduced"` + TotalSamples uint64 `json:"totalSamples"` + Underruns uint64 `json:"underruns"` + LateBuffers uint64 `json:"lateBuffers,omitempty"` + LastError string `json:"lastError,omitempty"` + UptimeSeconds float64 `json:"uptimeSeconds"` + MaxCycleMs float64 `json:"maxCycleMs,omitempty"` + MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` + MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` + MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + Queue output.QueueStats `json:"queue"` + RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` + RuntimeAlert string `json:"runtimeAlert,omitempty"` + LastFault *FaultEvent `json:"lastFault,omitempty"` + DegradedTransitions uint64 `json:"degradedTransitions"` + MutedTransitions uint64 `json:"mutedTransitions"` + FaultedTransitions uint64 `json:"faultedTransitions"` + FaultCount uint64 `json:"faultCount"` + FaultHistory []FaultEvent `json:"faultHistory,omitempty"` + TransitionHistory []RuntimeTransition `json:"transitionHistory,omitempty"` } type RuntimeIndicator string @@ -99,14 +100,22 @@ const ( RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical" ) +type RuntimeTransition struct { + Time time.Time `json:"time"` + From RuntimeState `json:"from"` + To RuntimeState `json:"to"` + Severity string `json:"severity"` +} + const ( - lateBufferIndicatorWindow = 5 * time.Second - queueCriticalStreakThreshold = 3 - queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 - queueMutedRecoveryThreshold = queueCriticalStreakThreshold - queueFaultedStreakThreshold = queueCriticalStreakThreshold - faultRepeatWindow = 1 * time.Second - faultHistoryCapacity = 8 + lateBufferIndicatorWindow = 5 * time.Second + queueCriticalStreakThreshold = 3 + queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 + queueMutedRecoveryThreshold = queueCriticalStreakThreshold + queueFaultedStreakThreshold = 
queueCriticalStreakThreshold + faultRepeatWindow = 1 * time.Second + faultHistoryCapacity = 8 + runtimeTransitionHistoryCapacity = 8 ) // Engine is the continuous TX loop. It generates composite IQ in chunks, @@ -146,6 +155,8 @@ type Engine struct { lastFault atomic.Value // *FaultEvent faultHistoryMu sync.Mutex faultHistory []FaultEvent + transitionHistoryMu sync.Mutex + transitionHistory []RuntimeTransition degradedTransitions atomic.Uint64 mutedTransitions atomic.Uint64 @@ -217,15 +228,16 @@ func NewEngine(cfg cfgpkg.Config, driver platform.SoapyDriver) *Engine { } engine := &Engine{ - cfg: cfg, - driver: driver, - generator: offpkg.NewGenerator(cfg), - upsampler: upsampler, - chunkDuration: 50 * time.Millisecond, - deviceRate: deviceRate, - state: EngineIdle, - frameQueue: output.NewFrameQueue(cfg.Runtime.FrameQueueCapacity), - faultHistory: make([]FaultEvent, 0, faultHistoryCapacity), + cfg: cfg, + driver: driver, + generator: offpkg.NewGenerator(cfg), + upsampler: upsampler, + chunkDuration: 50 * time.Millisecond, + deviceRate: deviceRate, + state: EngineIdle, + frameQueue: output.NewFrameQueue(cfg.Runtime.FrameQueueCapacity), + faultHistory: make([]FaultEvent, 0, faultHistoryCapacity), + transitionHistory: make([]RuntimeTransition, 0, runtimeTransitionHistoryCapacity), } engine.setRuntimeState(RuntimeStateIdle) return engine @@ -423,6 +435,7 @@ func (e *Engine) Stats() EngineStats { FaultedTransitions: e.faultedTransitions.Load(), FaultCount: e.faultEvents.Load(), FaultHistory: e.FaultHistory(), + TransitionHistory: e.TransitionHistory(), } } @@ -450,6 +463,19 @@ func runtimeAlert(queueHealth output.QueueHealth, recentLateBuffers bool) string } } +func runtimeStateSeverity(state RuntimeState) string { + switch state { + case RuntimeStateRunning: + return "ok" + case RuntimeStateDegraded, RuntimeStateMuted: + return "warn" + case RuntimeStateFaulted: + return "err" + default: + return "info" + } +} + func (e *Engine) run(ctx context.Context) { 
e.setRuntimeState(RuntimeStatePrebuffering) e.wg.Add(1) @@ -589,6 +615,7 @@ func cloneFrame(src *output.CompositeFrame) *output.CompositeFrame { func (e *Engine) setRuntimeState(state RuntimeState) { prev := e.currentRuntimeState() if prev != state { + e.recordRuntimeTransition(prev, state) switch state { case RuntimeStateDegraded: e.degradedTransitions.Add(1) @@ -635,6 +662,31 @@ func (e *Engine) FaultHistory() []FaultEvent { return history } +func (e *Engine) TransitionHistory() []RuntimeTransition { + e.transitionHistoryMu.Lock() + defer e.transitionHistoryMu.Unlock() + history := make([]RuntimeTransition, len(e.transitionHistory)) + copy(history, e.transitionHistory) + return history +} + +func (e *Engine) recordRuntimeTransition(from, to RuntimeState) { + ev := RuntimeTransition{ + Time: time.Now(), + From: from, + To: to, + Severity: runtimeStateSeverity(to), + } + e.transitionHistoryMu.Lock() + defer e.transitionHistoryMu.Unlock() + if len(e.transitionHistory) >= runtimeTransitionHistoryCapacity { + copy(e.transitionHistory, e.transitionHistory[1:]) + e.transitionHistory[len(e.transitionHistory)-1] = ev + return + } + e.transitionHistory = append(e.transitionHistory, ev) +} + func (e *Engine) recordFault(reason FaultReason, severity FaultSeverity, message string) { if reason == "" { reason = FaultReasonUnknown diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index 018913f..6a0696b 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -180,6 +180,29 @@ func TestRuntimeTransitionCounters(t *testing.T) { } } +func TestEngineTransitionHistory(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStateRunning) + e.setRuntimeState(RuntimeStateDegraded) + e.setRuntimeState(RuntimeStateMuted) + + history := e.Stats().TransitionHistory + if len(history) != 3 { + t.Fatalf("expected 3 transitions recorded, got %d", len(history)) + } + if 
history[0].From != RuntimeStateIdle || history[0].To != RuntimeStateRunning { + t.Fatalf("unexpected first transition: %+v", history[0]) + } + if history[0].Severity != "ok" { + t.Fatalf("expected ok severity for running transition, got %s", history[0].Severity) + } + if history[1].To != RuntimeStateDegraded || history[1].Severity != "warn" { + t.Fatalf("expected degraded transition with warn severity, got %+v", history[1]) + } + if history[2].To != RuntimeStateMuted || history[2].Severity != "warn" { + t.Fatalf("expected muted transition with warn severity, got %+v", history[2]) + } +} func TestEngineResetFaultRequiresFaultedState(t *testing.T) { e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 8656643..e67ae4c 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -174,6 +174,36 @@ func TestRuntimeReportsFaultHistory(t *testing.T) { t.Fatalf("faultHistory length mismatch: want %d got %d", len(history), len(histRaw)) } } +func TestRuntimeReportsTransitionHistory(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + history := []map[string]any{{ + "time": "2026-04-06T00:00:00Z", + "from": "running", + "to": "degraded", + "severity": "warn", + }} + srv.SetTXController(&fakeTXController{stats: map[string]any{"transitionHistory": history}}) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/runtime", nil)) + if rec.Code != 200 { + t.Fatalf("status: %d", rec.Code) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal runtime: %v", err) + } + engineRaw, ok := body["engine"].(map[string]any) + if !ok { + t.Fatalf("runtime engine missing") + } + histRaw, ok := engineRaw["transitionHistory"].([]any) + if !ok { + t.Fatalf("transitionHistory missing or wrong type: %T", engineRaw["transitionHistory"]) + } + if len(histRaw) != 
len(history) { + t.Fatalf("transitionHistory length mismatch: want %d got %d", len(history), len(histRaw)) + } +} func TestRuntimeFaultResetRejectsGet(t *testing.T) { srv := NewServer(cfgpkg.Default()) diff --git a/internal/control/ui.html b/internal/control/ui.html index f3aeb7b..49709c6 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1456,7 +1456,8 @@ async function loadRuntime({ silent = true } = {}) { state.server.runtime = runtime; state.server.runtimeOk = true; state.server.lastRuntimeAt = nowTs(); - notifyRuntimeTransition(runtime.engine); + const syncedTransitions = syncTransitionHistoryFromEngine(runtime.engine); + notifyRuntimeTransition(runtime.engine, !syncedTransitions); pushHistory(runtime); setConnection(true, state.pendingRequests > 0 ? 'busy' : 'connected'); render(); @@ -1495,6 +1496,30 @@ function pushTransitionHistory(from, to, severity) { updateTransitionHistory(); } +function transitionEntryTime(value) { + if (value == null) return nowTs(); + if (typeof value === 'number') return value; + const parsed = Date.parse(String(value)); + return Number.isNaN(parsed) ? nowTs() : parsed; +} + +function syncTransitionHistoryFromEngine(engine) { + const entries = Array.isArray(engine?.transitionHistory) ? 
engine.transitionHistory : null; + if (!entries) return false; + const sliceStart = Math.max(0, entries.length - transitionHistoryLimit); + const trimmed = entries.slice(sliceStart); + const normalized = trimmed.map((entry) => ({ + from: normalizeRuntimeState(entry?.from), + to: normalizeRuntimeState(entry?.to), + severity: String(entry?.severity || 'info').toLowerCase(), + time: transitionEntryTime(entry?.time), + })); + normalized.reverse(); + state.runtimeTransitions = normalized; + updateTransitionHistory(); + return true; +} + function transitionSeverityClass(severity) { switch (String(severity || '').toLowerCase()) { case 'err': @@ -1975,20 +2000,23 @@ function runtimeStateSeverity(stateName) { } } -function notifyRuntimeTransition(engine) { +function notifyRuntimeTransition(engine, pushHistory = true) { if (!engine) return; const next = normalizeRuntimeState(engine.state); const prev = state.lastRuntimeState; state.lastRuntimeState = next; if (!prev || prev === next) return; const severity = runtimeStateSeverity(next); - pushTransitionHistory(prev, next, severity); + if (pushHistory) { + pushTransitionHistory(prev, next, severity); + } const message = `Runtime ${prev.toUpperCase()} → ${next.toUpperCase()}`; const logLevel = severity === 'err' ? 'err' : (severity === 'warn' ? 'warn' : 'info'); toast(message, severity); log(message, logLevel); } + function updateHealth(engine, audioStream) { engine = engine || {}; updateText('health-http', state.server.configOk ? 
'OK' : 'OFFLINE'); From 21a38d8ab2fa78857c9fb1389d78081b4fb8e46b Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 01:04:58 +0200 Subject: [PATCH 35/55] ws02: expose runtime state age --- cmd/fmrtx/main.go | 43 ++++++++------- docs/API.md | 3 + internal/app/engine.go | 115 ++++++++++++++++++++++----------------- internal/control/ui.html | 14 ++++- 4 files changed, 104 insertions(+), 71 deletions(-) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index ea9358e..a45a5ed 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -252,27 +252,28 @@ func (b *txBridge) StopTX() error { return b.engine.Stop(context.Background()) func (b *txBridge) TXStats() map[string]any { s := b.engine.Stats() return map[string]any{ - "state": s.State, - "chunksProduced": s.ChunksProduced, - "totalSamples": s.TotalSamples, - "underruns": s.Underruns, - "lateBuffers": s.LateBuffers, - "lastError": s.LastError, - "uptimeSeconds": s.UptimeSeconds, - "maxCycleMs": s.MaxCycleMs, - "maxGenerateMs": s.MaxGenerateMs, - "maxUpsampleMs": s.MaxUpsampleMs, - "maxWriteMs": s.MaxWriteMs, - "queue": s.Queue, - "runtimeIndicator": s.RuntimeIndicator, - "runtimeAlert": s.RuntimeAlert, - "degradedTransitions": s.DegradedTransitions, - "mutedTransitions": s.MutedTransitions, - "faultedTransitions": s.FaultedTransitions, - "faultCount": s.FaultCount, - "faultHistory": s.FaultHistory, - "transitionHistory": s.TransitionHistory, - "lastFault": s.LastFault, + "runtimeStateDurationSeconds": s.RuntimeStateDurationSeconds, + "state": s.State, + "chunksProduced": s.ChunksProduced, + "totalSamples": s.TotalSamples, + "underruns": s.Underruns, + "lateBuffers": s.LateBuffers, + "lastError": s.LastError, + "uptimeSeconds": s.UptimeSeconds, + "maxCycleMs": s.MaxCycleMs, + "maxGenerateMs": s.MaxGenerateMs, + "maxUpsampleMs": s.MaxUpsampleMs, + "maxWriteMs": s.MaxWriteMs, + "queue": s.Queue, + "runtimeIndicator": s.RuntimeIndicator, + "runtimeAlert": s.RuntimeAlert, + "degradedTransitions": 
s.DegradedTransitions, + "mutedTransitions": s.MutedTransitions, + "faultedTransitions": s.FaultedTransitions, + "faultCount": s.FaultCount, + "faultHistory": s.FaultHistory, + "transitionHistory": s.TransitionHistory, + "lastFault": s.LastFault, } } func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error { diff --git a/docs/API.md b/docs/API.md index 4d1ea4c..ed4d8b1 100644 --- a/docs/API.md +++ b/docs/API.md @@ -59,6 +59,7 @@ Live engine and driver telemetry. Only populated when TX is active. { "engine": { "state": "running", + "runtimeStateDurationSeconds": 12.4, "chunksProduced": 12345, "totalSamples": 1408950000, "underruns": 0, @@ -100,6 +101,8 @@ Live engine and driver telemetry. Only populated when TX is active. ``` `engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. +`runtimeStateDurationSeconds` sagt, wie viele Sekunden die Engine bereits im aktuellen Runtime-Zustand verweilt. So erkennt man schnell, ob `muted`/`degraded` zu lange dauern oder ob ein Übergang gerade frisch begonnen hat. + `transitionHistory` liefert die jüngsten Übergänge (from/to, severity, timestamp) damit API und UI die Runtime History synchronisieren können. 
--- diff --git a/internal/app/engine.go b/internal/app/engine.go index 041d0ed..a269cd8 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -69,27 +69,28 @@ func durationMs(ns uint64) float64 { } type EngineStats struct { - State string `json:"state"` - ChunksProduced uint64 `json:"chunksProduced"` - TotalSamples uint64 `json:"totalSamples"` - Underruns uint64 `json:"underruns"` - LateBuffers uint64 `json:"lateBuffers,omitempty"` - LastError string `json:"lastError,omitempty"` - UptimeSeconds float64 `json:"uptimeSeconds"` - MaxCycleMs float64 `json:"maxCycleMs,omitempty"` - MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` - MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` - MaxWriteMs float64 `json:"maxWriteMs,omitempty"` - Queue output.QueueStats `json:"queue"` - RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` - RuntimeAlert string `json:"runtimeAlert,omitempty"` - LastFault *FaultEvent `json:"lastFault,omitempty"` - DegradedTransitions uint64 `json:"degradedTransitions"` - MutedTransitions uint64 `json:"mutedTransitions"` - FaultedTransitions uint64 `json:"faultedTransitions"` - FaultCount uint64 `json:"faultCount"` - FaultHistory []FaultEvent `json:"faultHistory,omitempty"` - TransitionHistory []RuntimeTransition `json:"transitionHistory,omitempty"` + State string `json:"state"` + RuntimeStateDurationSeconds float64 `json:"runtimeStateDurationSeconds"` + ChunksProduced uint64 `json:"chunksProduced"` + TotalSamples uint64 `json:"totalSamples"` + Underruns uint64 `json:"underruns"` + LateBuffers uint64 `json:"lateBuffers,omitempty"` + LastError string `json:"lastError,omitempty"` + UptimeSeconds float64 `json:"uptimeSeconds"` + MaxCycleMs float64 `json:"maxCycleMs,omitempty"` + MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` + MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` + MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + Queue output.QueueStats `json:"queue"` + RuntimeIndicator RuntimeIndicator 
`json:"runtimeIndicator"` + RuntimeAlert string `json:"runtimeAlert,omitempty"` + LastFault *FaultEvent `json:"lastFault,omitempty"` + DegradedTransitions uint64 `json:"degradedTransitions"` + MutedTransitions uint64 `json:"mutedTransitions"` + FaultedTransitions uint64 `json:"faultedTransitions"` + FaultCount uint64 `json:"faultCount"` + FaultHistory []FaultEvent `json:"faultHistory,omitempty"` + TransitionHistory []RuntimeTransition `json:"transitionHistory,omitempty"` } type RuntimeIndicator string @@ -158,10 +159,11 @@ type Engine struct { transitionHistoryMu sync.Mutex transitionHistory []RuntimeTransition - degradedTransitions atomic.Uint64 - mutedTransitions atomic.Uint64 - faultedTransitions atomic.Uint64 - faultEvents atomic.Uint64 + degradedTransitions atomic.Uint64 + mutedTransitions atomic.Uint64 + faultedTransitions atomic.Uint64 + faultEvents atomic.Uint64 + runtimeStateEnteredAt atomic.Uint64 // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] @@ -415,27 +417,28 @@ func (e *Engine) Stats() EngineStats { ri := runtimeIndicator(queue.Health, hasRecentLateBuffers) lastFault := e.lastFaultEvent() return EngineStats{ - State: string(e.currentRuntimeState()), - ChunksProduced: e.chunksProduced.Load(), - TotalSamples: e.totalSamples.Load(), - Underruns: e.underruns.Load(), - LateBuffers: lateBuffers, - LastError: errVal, - UptimeSeconds: uptime, - MaxCycleMs: durationMs(e.maxCycleNs.Load()), - MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), - MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), - MaxWriteMs: durationMs(e.maxWriteNs.Load()), - Queue: queue, - RuntimeIndicator: ri, - RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), - LastFault: lastFault, - DegradedTransitions: e.degradedTransitions.Load(), - MutedTransitions: e.mutedTransitions.Load(), - FaultedTransitions: e.faultedTransitions.Load(), - FaultCount: e.faultEvents.Load(), - FaultHistory: e.FaultHistory(), - TransitionHistory: 
e.TransitionHistory(), + State: string(e.currentRuntimeState()), + RuntimeStateDurationSeconds: e.runtimeStateDurationSeconds(), + ChunksProduced: e.chunksProduced.Load(), + TotalSamples: e.totalSamples.Load(), + Underruns: e.underruns.Load(), + LateBuffers: lateBuffers, + LastError: errVal, + UptimeSeconds: uptime, + MaxCycleMs: durationMs(e.maxCycleNs.Load()), + MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), + MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), + MaxWriteMs: durationMs(e.maxWriteNs.Load()), + Queue: queue, + RuntimeIndicator: ri, + RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), + LastFault: lastFault, + DegradedTransitions: e.degradedTransitions.Load(), + MutedTransitions: e.mutedTransitions.Load(), + FaultedTransitions: e.faultedTransitions.Load(), + FaultCount: e.faultEvents.Load(), + FaultHistory: e.FaultHistory(), + TransitionHistory: e.TransitionHistory(), } } @@ -613,9 +616,10 @@ func cloneFrame(src *output.CompositeFrame) *output.CompositeFrame { } func (e *Engine) setRuntimeState(state RuntimeState) { + now := time.Now() prev := e.currentRuntimeState() if prev != state { - e.recordRuntimeTransition(prev, state) + e.recordRuntimeTransition(prev, state, now) switch state { case RuntimeStateDegraded: e.degradedTransitions.Add(1) @@ -624,6 +628,9 @@ func (e *Engine) setRuntimeState(state RuntimeState) { case RuntimeStateFaulted: e.faultedTransitions.Add(1) } + e.runtimeStateEnteredAt.Store(uint64(now.UnixNano())) + } else if e.runtimeStateEnteredAt.Load() == 0 { + e.runtimeStateEnteredAt.Store(uint64(now.UnixNano())) } e.runtimeState.Store(state) } @@ -637,6 +644,13 @@ func (e *Engine) currentRuntimeState() RuntimeState { return RuntimeStateIdle } +func (e *Engine) runtimeStateDurationSeconds() float64 { + if ts := e.runtimeStateEnteredAt.Load(); ts != 0 { + return time.Since(time.Unix(0, int64(ts))).Seconds() + } + return 0 +} + func (e *Engine) hasRecentLateBuffers() bool { lateAlertAt := e.lateBufferAlertAt.Load() if 
lateAlertAt == 0 { @@ -670,9 +684,12 @@ func (e *Engine) TransitionHistory() []RuntimeTransition { return history } -func (e *Engine) recordRuntimeTransition(from, to RuntimeState) { +func (e *Engine) recordRuntimeTransition(from, to RuntimeState, when time.Time) { + if when.IsZero() { + when = time.Now() + } ev := RuntimeTransition{ - Time: time.Now(), + Time: when, From: from, To: to, Severity: runtimeStateSeverity(to), diff --git a/internal/control/ui.html b/internal/control/ui.html index 49709c6..754a22f 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1160,6 +1160,7 @@ input.input-error {
HTTP
--
Runtime
--
+
State Age
--
Runtime Signal
--
Runtime Alert
--
Transitions (D/M/F)
--
@@ -2032,6 +2033,14 @@ function updateHealth(engine, audioStream) { updateText('health-runtime', runtimeLabel); $('health-runtime').className = 'val ' + runtimeClass; + const durationSeconds = Number(engine.runtimeStateDurationSeconds); + const durationLabel = Number.isFinite(durationSeconds) && durationSeconds > 0 ? fmtTime(durationSeconds) : '--'; + updateText('health-state-age', durationLabel); + const stateAgeEl = $('health-state-age'); + if (stateAgeEl) { + stateAgeEl.className = 'val ' + runtimeClass; + } + const runtimeIndicator = engine.runtimeIndicator; const indicatorLabels = { normal: 'Normal', @@ -2142,7 +2151,10 @@ function updateResetHint(engine) { } else if (stateName === 'muted' || stateName === 'degraded') { text = 'Reset Fault keeps the runtime in DEGRADED so the queue can recover before running again.'; } - hint.textContent = text; + const durationSeconds = Number(engine?.runtimeStateDurationSeconds); + const durationLabel = Number.isFinite(durationSeconds) && durationSeconds > 0 ? fmtTime(durationSeconds) : null; + const ageHint = durationLabel ? 
` State age ${durationLabel}.` : ''; + hint.textContent = text + ageHint; } function updateMeters(engine, driver, audioStream) { From 1d20e798d15e2ef1c1e11bdb483fdc50e7e92f51 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 03:37:20 +0200 Subject: [PATCH 36/55] Add buffered duration metric for audio stream stats --- docs/API.md | 3 +++ internal/audio/stream.go | 38 ++++++++++++++++++++++++----------- internal/audio/stream_test.go | 16 +++++++++++++++ 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/docs/API.md b/docs/API.md index ed4d8b1..a81a0e6 100644 --- a/docs/API.md +++ b/docs/API.md @@ -298,6 +298,7 @@ Requires `--audio-stdin`, `--audio-http`, or another configured stream source to "available": 12000, "capacity": 131072, "buffered": 0.09, + "bufferedDurationSeconds": 0.27, "written": 890000, "underruns": 0, "overflows": 0 @@ -366,6 +367,7 @@ The stream uses a lock-free ring buffer (default: 2 seconds at input rate). Buff "available": 12000, "capacity": 131072, "buffered": 0.09, + "bufferedDurationSeconds": 0.27, "written": 890000, "underruns": 0, "overflows": 0 @@ -376,5 +378,6 @@ The stream uses a lock-free ring buffer (default: 2 seconds at input rate). Buff - **underruns**: DSP consumed faster than audio arrived (silence inserted) - **overflows**: Audio arrived faster than DSP consumed (data dropped) - **buffered**: Fill ratio (0.0 = empty, 1.0 = full) +- **bufferedDurationSeconds**: Approximate seconds of audio queued in the buffer (`available` frames divided by the sample rate) When no audio is streaming, the transmitter falls back to the configured tone generator or silence. diff --git a/internal/audio/stream.go b/internal/audio/stream.go index bf951a8..14ceac1 100644 --- a/internal/audio/stream.go +++ b/internal/audio/stream.go @@ -109,24 +109,38 @@ func (s *StreamSource) Buffered() float64 { // Stats returns diagnostic counters. 
func (s *StreamSource) Stats() StreamStats { + available := s.Available() + buffered := 0.0 + if s.size > 0 { + buffered = float64(available) / float64(s.size) + } return StreamStats{ - Available: s.Available(), - Capacity: s.size, - Buffered: s.Buffered(), - Written: s.Written.Load(), - Underruns: s.Underruns.Load(), - Overflows: s.Overflows.Load(), + Available: available, + Capacity: s.size, + Buffered: buffered, + BufferedDurationSeconds: s.bufferedDurationSeconds(available), + Written: s.Written.Load(), + Underruns: s.Underruns.Load(), + Overflows: s.Overflows.Load(), } } // StreamStats exposes runtime telemetry for the stream buffer. type StreamStats struct { - Available int `json:"available"` - Capacity int `json:"capacity"` - Buffered float64 `json:"buffered"` - Written uint64 `json:"written"` - Underruns uint64 `json:"underruns"` - Overflows uint64 `json:"overflows"` + Available int `json:"available"` + Capacity int `json:"capacity"` + Buffered float64 `json:"buffered"` + BufferedDurationSeconds float64 `json:"bufferedDurationSeconds"` + Written uint64 `json:"written"` + Underruns uint64 `json:"underruns"` + Overflows uint64 `json:"overflows"` +} + +func (s *StreamSource) bufferedDurationSeconds(available int) float64 { + if s.SampleRate <= 0 { + return 0 + } + return float64(available) / float64(s.SampleRate) } // --- StreamResampler --- diff --git a/internal/audio/stream_test.go b/internal/audio/stream_test.go index cc2820a..43fe0ee 100644 --- a/internal/audio/stream_test.go +++ b/internal/audio/stream_test.go @@ -205,6 +205,22 @@ func TestStreamSource_ConcurrentSPSC(t *testing.T) { } } +func TestStreamSource_StatsBufferedDuration(t *testing.T) { + rate := 48000 + s := NewStreamSource(128, rate) + for i := 0; i < 24; i++ { + s.WriteFrame(NewFrame(0, 0)) + } + stats := s.Stats() + if stats.BufferedDurationSeconds <= 0 { + t.Fatalf("expected buffered duration > 0, got %.6f", stats.BufferedDurationSeconds) + } + expected := float64(stats.Available) / 
float64(rate) + if math.Abs(stats.BufferedDurationSeconds-expected) > 1e-9 { + t.Fatalf("buffered duration %.9f != expected %.9f", stats.BufferedDurationSeconds, expected) + } +} + // --- StreamResampler tests --- func TestStreamResampler_1to1(t *testing.T) { From 1becfa5e0c6e730f7ee8d4b32de0d1b32fdbd3ac Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 03:45:27 +0200 Subject: [PATCH 37/55] Add high watermark telemetry to stream stats --- docs/API.md | 6 ++++++ internal/audio/stream.go | 20 ++++++++++++++++++++ internal/audio/stream_test.go | 23 +++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/docs/API.md b/docs/API.md index a81a0e6..7ebc4b7 100644 --- a/docs/API.md +++ b/docs/API.md @@ -299,6 +299,8 @@ Requires `--audio-stdin`, `--audio-http`, or another configured stream source to "capacity": 131072, "buffered": 0.09, "bufferedDurationSeconds": 0.27, + "highWatermark": 15000, + "highWatermarkDurationSeconds": 0.34, "written": 890000, "underruns": 0, "overflows": 0 @@ -368,6 +370,8 @@ The stream uses a lock-free ring buffer (default: 2 seconds at input rate). Buff "capacity": 131072, "buffered": 0.09, "bufferedDurationSeconds": 0.27, + "highWatermark": 15000, + "highWatermarkDurationSeconds": 0.34, "written": 890000, "underruns": 0, "overflows": 0 @@ -379,5 +383,7 @@ The stream uses a lock-free ring buffer (default: 2 seconds at input rate). Buff - **overflows**: Audio arrived faster than DSP consumed (data dropped) - **buffered**: Fill ratio (0.0 = empty, 1.0 = full) - **bufferedDurationSeconds**: Approximate seconds of audio queued in the buffer (`available` frames divided by the sample rate) +- **highWatermark**: Highest observed buffer occupancy (frames) since the buffer was created +- **highWatermarkDurationSeconds**: Equivalent peak time (`highWatermark` frames divided by the sample rate) When no audio is streaming, the transmitter falls back to the configured tone generator or silence. 
diff --git a/internal/audio/stream.go b/internal/audio/stream.go index 14ceac1..09f6de3 100644 --- a/internal/audio/stream.go +++ b/internal/audio/stream.go @@ -24,6 +24,7 @@ type StreamSource struct { Underruns atomic.Uint64 Overflows atomic.Uint64 Written atomic.Uint64 + highWatermark atomic.Int64 } // NewStreamSource creates a ring buffer with the given capacity (rounded up @@ -54,6 +55,7 @@ func (s *StreamSource) WriteFrame(f Frame) bool { s.ring[int(wp)&s.mask] = f s.writePos.Add(1) s.Written.Add(1) + s.updateHighWatermark() return true } @@ -114,11 +116,14 @@ func (s *StreamSource) Stats() StreamStats { if s.size > 0 { buffered = float64(available) / float64(s.size) } + highWatermark := int(s.highWatermark.Load()) return StreamStats{ Available: available, Capacity: s.size, Buffered: buffered, BufferedDurationSeconds: s.bufferedDurationSeconds(available), + HighWatermark: highWatermark, + HighWatermarkDurationSeconds: s.bufferedDurationSeconds(highWatermark), Written: s.Written.Load(), Underruns: s.Underruns.Load(), Overflows: s.Overflows.Load(), @@ -131,6 +136,8 @@ type StreamStats struct { Capacity int `json:"capacity"` Buffered float64 `json:"buffered"` BufferedDurationSeconds float64 `json:"bufferedDurationSeconds"` + HighWatermark int `json:"highWatermark"` + HighWatermarkDurationSeconds float64 `json:"highWatermarkDurationSeconds"` Written uint64 `json:"written"` Underruns uint64 `json:"underruns"` Overflows uint64 `json:"overflows"` @@ -143,6 +150,19 @@ func (s *StreamSource) bufferedDurationSeconds(available int) float64 { return float64(available) / float64(s.SampleRate) } +func (s *StreamSource) updateHighWatermark() { + available := s.Available() + for { + prev := s.highWatermark.Load() + if int64(available) <= prev { + return + } + if s.highWatermark.CompareAndSwap(prev, int64(available)) { + return + } + } +} + // --- StreamResampler --- // StreamResampler wraps a StreamSource and rate-converts from the stream's diff --git 
a/internal/audio/stream_test.go b/internal/audio/stream_test.go index 43fe0ee..2169e09 100644 --- a/internal/audio/stream_test.go +++ b/internal/audio/stream_test.go @@ -221,6 +221,29 @@ func TestStreamSource_StatsBufferedDuration(t *testing.T) { } } +func TestStreamSource_StatsHighWatermark(t *testing.T) { + rate := 44100 + s := NewStreamSource(64, rate) + for i := 0; i < 12; i++ { + s.WriteFrame(NewFrame(0, 0)) + } + for i := 0; i < 5; i++ { + s.ReadFrame() + } + stats := s.Stats() + if stats.HighWatermark != 12 { + t.Fatalf("expected high watermark 12, got %d", stats.HighWatermark) + } + expected := float64(stats.HighWatermark) / float64(rate) + if math.Abs(stats.HighWatermarkDurationSeconds-expected) > 1e-9 { + t.Fatalf("high watermark duration %.9f != %.9f", stats.HighWatermarkDurationSeconds, expected) + } + if stats.HighWatermark < stats.Available { + t.Fatalf("high watermark %d < available %d", stats.HighWatermark, stats.Available) + } +} + + // --- StreamResampler tests --- func TestStreamResampler_1to1(t *testing.T) { From 82ed2c348555da4109b002e3d20a488243b14de1 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 03:50:43 +0200 Subject: [PATCH 38/55] ui: show audio buffer duration metrics --- internal/control/ui.html | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/internal/control/ui.html b/internal/control/ui.html index 754a22f..e1a1eaa 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1167,6 +1167,8 @@ input.input-error {
Fault Count
--
Last Fault
--
Audio Buffer
--
+
Buffer Duration
--
+
High Watermark
--
Last Update
--
@@ -1720,6 +1722,12 @@ function fmtTime(seconds) { return `${s}s`; } +function fmtDurationSeconds(value) { + if (!Number.isFinite(value) || value < 0) return '--'; + if (value >= 1) return `${value.toFixed(2)} s`; + return `${(value * 1000).toFixed(0)} ms`; +} + function fmtBool(v) { return v == null ? '--' : (v ? 'ON' : 'OFF'); } @@ -2079,6 +2087,24 @@ function updateHealth(engine, audioStream) { updateText('health-audio', audioLabel); $('health-audio').className = audioClass; + const bufferedDurationSeconds = Number(audioStream?.bufferedDurationSeconds); + updateText('health-buffer-duration', fmtDurationSeconds(bufferedDurationSeconds)); + + const highWatermarkRaw = audioStream?.highWatermark; + const highWatermarkFrames = Number.isFinite(Number(highWatermarkRaw)) ? Number(highWatermarkRaw) : null; + const highWatermarkDurationRaw = audioStream?.highWatermarkDurationSeconds; + const highWatermarkDuration = Number.isFinite(Number(highWatermarkDurationRaw)) ? Number(highWatermarkDurationRaw) : null; + let highWatermarkLabel = '--'; + if (highWatermarkDuration !== null) { + highWatermarkLabel = fmtDurationSeconds(highWatermarkDuration); + if (highWatermarkFrames !== null) { + highWatermarkLabel += ` (${highWatermarkFrames} frames)`; + } + } else if (highWatermarkFrames !== null) { + highWatermarkLabel = `${highWatermarkFrames} frames`; + } + updateText('health-buffer-highwater', highWatermarkLabel); + const last = Math.max(state.server.lastConfigAt || 0, state.server.lastRuntimeAt || 0); updateText('health-last', ageString(last)); From 9baea0ea057aec1133914461f857958de2e6eb6f Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 03:56:42 +0200 Subject: [PATCH 39/55] feat: add high watermark trend sparkline --- docs/pro-runtime-hardening-workboard.md | 8 ++++-- internal/control/ui.html | 38 +++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/docs/pro-runtime-hardening-workboard.md 
b/docs/pro-runtime-hardening-workboard.md index 8fb6a52..e0f5f3b 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -382,10 +382,14 @@ Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests - `rf_selftest_rds_57k_db` ## WS-04 Entscheidungslog -- Noch leer +| Datum | Entscheidung | Notiz | +| --- | --- | --- | +| 2026-04-06 | High-watermark trend sparkline | Captured audio high-watermark duration history and surface it as a new Health-panel sparkline for queue pressure visibility. | ## WS-04 Verifikation -- Noch leer +| Datum | Fokus | Ergebnis | +| --- | --- | --- | +| 2026-04-06 | High-watermark trend sparkline | `go test ./...` plus manual UI check confirm the new sparkline updates with runtime audio stats. | --- diff --git a/internal/control/ui.html b/internal/control/ui.html index e1a1eaa..8764bff 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -770,6 +770,16 @@ input.input-error { .health-line .val.good { color: var(--green); } .health-line .val.warn { color: var(--amber); } .health-line .val.err { color: var(--accent); } +.health-trend { + margin-top: 10px; +} +.health-trend-label { + font-size: 10px; + text-transform: uppercase; + letter-spacing: 1px; + color: var(--text-muted); + margin-bottom: 6px; +} .fault-history { margin-top: 12px; @@ -1170,6 +1180,10 @@ input.input-error {
Buffer Duration
--
High Watermark
--
Last Update
--
+
+
High Watermark Trend
+ +
@@ -1304,6 +1318,7 @@ const state = { audio: [], underruns: [], tx: [], + highWatermark: [], }, runtimeTransitions: [], freqPresetIndex: 0, @@ -1479,6 +1494,9 @@ function pushHistory(runtime) { const driver = runtime.driver || {}; const audio = runtime.audioStream || {}; pushChart(state.charts.audio, typeof audio.buffered === 'number' ? audio.buffered : 0); + const highWatermarkDurationSeconds = Number(audio.highWatermarkDurationSeconds); + const normalizedHighWatermark = Number.isFinite(highWatermarkDurationSeconds) ? highWatermarkDurationSeconds : 0; + pushChart(state.charts.highWatermark, normalizedHighWatermark); pushChart(state.charts.underruns, Number(engine.underruns ?? driver.underruns ?? 0)); const txState = String(engine.state || 'idle').toLowerCase(); pushChart(state.charts.tx, txState === 'running' ? 1 : state.txBusy ? 0.55 : 0.05); @@ -1951,7 +1969,27 @@ function render() { updateTransitionHistory(); updateResetHint(engine); updateMeters(engine, driver, audioStream); + const highWatermarkDurationSecondsRaw = audioStream?.highWatermarkDurationSeconds; + const highWatermarkDurationSeconds = Number(highWatermarkDurationSecondsRaw); + const highWatermarkFramesRaw = audioStream?.highWatermark; + const highWatermarkFrames = Number.isFinite(Number(highWatermarkFramesRaw)) ? Number(highWatermarkFramesRaw) : 0; + const capacityRaw = audioStream?.capacity; + const capacity = Number.isFinite(Number(capacityRaw)) ? Number(capacityRaw) : 0; + const bufferedDurationSecondsRaw = audioStream?.bufferedDurationSeconds; + const bufferedDurationSeconds = Number(bufferedDurationSecondsRaw); + const hasBufferedDuration = Number.isFinite(bufferedDurationSeconds); + const hasHighWatermarkDuration = Number.isFinite(highWatermarkDurationSeconds); + const highWatermarkRatio = capacity > 0 ? 
Math.min(1, highWatermarkFrames / capacity) : 0; + let highWatermarkMode = 'good'; + if (highWatermarkRatio >= 0.95) highWatermarkMode = 'err'; + else if (highWatermarkRatio >= 0.65) highWatermarkMode = 'warn'; + const sparkHighWatermarkMax = Math.max( + 1, + hasHighWatermarkDuration ? highWatermarkDurationSeconds : 0, + hasBufferedDuration ? bufferedDurationSeconds : 0 + ); drawSparkline('spark-audio', state.charts.audio, 'good', 1); + drawSparkline('spark-high-watermark', state.charts.highWatermark, highWatermarkMode, sparkHighWatermarkMax); drawSparkline('spark-underruns', state.charts.underruns, underruns > 0 ? 'err' : 'warn'); drawSparkline('spark-tx', state.charts.tx, txStateValue === 'running' ? 'good' : 'warn', 1); applyMobilePanelDefaults(); From 4d9895918a105421d832adde411727f223950d8e Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 04:01:57 +0200 Subject: [PATCH 40/55] feat: show queue fill telemetry --- docs/pro-runtime-hardening-workboard.md | 2 ++ internal/control/ui.html | 32 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index e0f5f3b..ce14fdf 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -385,11 +385,13 @@ Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests | Datum | Entscheidung | Notiz | | --- | --- | --- | | 2026-04-06 | High-watermark trend sparkline | Captured audio high-watermark duration history and surface it as a new Health-panel sparkline for queue pressure visibility. | +| 2026-04-06 | Queue fill visibility | Added queue fill ratio health line and sparklines to highlight real-time queue pressure alongside high-watermark trends. 
| ## WS-04 Verifikation | Datum | Fokus | Ergebnis | | --- | --- | --- | | 2026-04-06 | High-watermark trend sparkline | `go test ./...` plus manual UI check confirm the new sparkline updates with runtime audio stats. | +| 2026-04-06 | Queue fill visibility | `go test ./...` plus UI smoke check confirm queue fill stats stay available and the new sparkline/health line react to queue health changes. | --- diff --git a/internal/control/ui.html b/internal/control/ui.html index 8764bff..7ff20f8 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1179,11 +1179,16 @@ input.input-error {
Audio Buffer
--
Buffer Duration
--
High Watermark
--
+
Queue Fill
--
Last Update
--
High Watermark Trend
+
+
Queue Fill Trend
+ +
@@ -1319,6 +1324,7 @@ const state = { underruns: [], tx: [], highWatermark: [], + queueFill: [], }, runtimeTransitions: [], freqPresetIndex: 0, @@ -1497,6 +1503,8 @@ function pushHistory(runtime) { const highWatermarkDurationSeconds = Number(audio.highWatermarkDurationSeconds); const normalizedHighWatermark = Number.isFinite(highWatermarkDurationSeconds) ? highWatermarkDurationSeconds : 0; pushChart(state.charts.highWatermark, normalizedHighWatermark); + const queueFill = Number(engine.queue?.fillLevel ?? 0); + pushChart(state.charts.queueFill, Number.isFinite(queueFill) ? queueFill : 0); pushChart(state.charts.underruns, Number(engine.underruns ?? driver.underruns ?? 0)); const txState = String(engine.state || 'idle').toLowerCase(); pushChart(state.charts.tx, txState === 'running' ? 1 : state.txBusy ? 0.55 : 0.05); @@ -1988,8 +1996,13 @@ function render() { hasHighWatermarkDuration ? highWatermarkDurationSeconds : 0, hasBufferedDuration ? bufferedDurationSeconds : 0 ); + const queueHealthRaw = String(engine.queue?.health || '').toLowerCase(); + let queueSparkMode = 'good'; + if (queueHealthRaw === 'critical') queueSparkMode = 'err'; + else if (queueHealthRaw === 'low') queueSparkMode = 'warn'; drawSparkline('spark-audio', state.charts.audio, 'good', 1); drawSparkline('spark-high-watermark', state.charts.highWatermark, highWatermarkMode, sparkHighWatermarkMax); + drawSparkline('spark-queue-fill', state.charts.queueFill, queueSparkMode, 1); drawSparkline('spark-underruns', state.charts.underruns, underruns > 0 ? 'err' : 'warn'); drawSparkline('spark-tx', state.charts.tx, txStateValue === 'running' ? 'good' : 'warn', 1); applyMobilePanelDefaults(); @@ -2143,6 +2156,25 @@ function updateHealth(engine, audioStream) { } updateText('health-buffer-highwater', highWatermarkLabel); + const queueFill = Number(engine.queue?.fillLevel); + const queueHealthRaw = String(engine.queue?.health || '').toLowerCase(); + const queueHealthLabel = queueHealthRaw ? 
queueHealthRaw[0].toUpperCase() + queueHealthRaw.slice(1) : ''; + let queueFillLabel = '--'; + if (Number.isFinite(queueFill)) { + queueFillLabel = fmtPercent(queueFill); + if (queueHealthLabel) queueFillLabel += ` · ${queueHealthLabel}`; + } else if (queueHealthLabel) { + queueFillLabel = queueHealthLabel; + } + updateText('health-queue-fill', queueFillLabel); + const queueFillEl = $('health-queue-fill'); + if (queueFillEl) { + let queueFillClass = 'good'; + if (queueHealthRaw === 'critical') queueFillClass = 'err'; + else if (queueHealthRaw === 'low') queueFillClass = 'warn'; + queueFillEl.className = 'val ' + queueFillClass; + } + const last = Math.max(state.server.lastConfigAt || 0, state.server.lastRuntimeAt || 0); updateText('health-last', ageString(last)); From 8d43cf6badbe146ebfd7f708da214896be8181e5 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 04:13:09 +0200 Subject: [PATCH 41/55] Add underrun streak telemetry --- docs/API.md | 5 +++++ docs/pro-runtime-hardening-workboard.md | 2 ++ internal/audio/stream.go | 27 ++++++++++++++++++++++ internal/audio/stream_test.go | 30 +++++++++++++++++++++++++ 4 files changed, 64 insertions(+) diff --git a/docs/API.md b/docs/API.md index 7ebc4b7..bde17a6 100644 --- a/docs/API.md +++ b/docs/API.md @@ -95,6 +95,8 @@ Live engine and driver telemetry. Only populated when TX is active. "framesWritten": 12345, "samplesWritten": 1408950000, "underruns": 0, + "underrunStreak": 0, + "maxUnderrunStreak": 0, "effectiveSampleRateHz": 2280000 } } @@ -105,6 +107,9 @@ Live engine and driver telemetry. Only populated when TX is active. `transitionHistory` liefert die jüngsten Übergänge (from/to, severity, timestamp) damit API und UI die Runtime History synchronisieren können. +`driver.underrunStreak` reports how many consecutive reads returned silence, and `driver.maxUnderrunStreak` captures the longest such run since the engine started. 
Together they help differentiate short glitches from persistent underrun storms and can be plotted alongside queue health sparkline telemetry. + + --- ### `POST /runtime/fault/reset` diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index ce14fdf..de45aa1 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -386,12 +386,14 @@ Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests | --- | --- | --- | | 2026-04-06 | High-watermark trend sparkline | Captured audio high-watermark duration history and surface it as a new Health-panel sparkline for queue pressure visibility. | | 2026-04-06 | Queue fill visibility | Added queue fill ratio health line and sparklines to highlight real-time queue pressure alongside high-watermark trends. | +| 2026-04-07 | Underrun streak telemetry | StreamStats now expose current and max underrun streak counters so queue diagnostics can see repeated underruns without touching the metrics stack. | ## WS-04 Verifikation | Datum | Fokus | Ergebnis | | --- | --- | --- | | 2026-04-06 | High-watermark trend sparkline | `go test ./...` plus manual UI check confirm the new sparkline updates with runtime audio stats. | | 2026-04-06 | Queue fill visibility | `go test ./...` plus UI smoke check confirm queue fill stats stay available and the new sparkline/health line react to queue health changes. | +| 2026-04-07 | Underrun streak telemetry | `go test ./internal/audio` confirms the new streak counters plus Stats coverage so the API surfaces the same names. 
| --- diff --git a/internal/audio/stream.go b/internal/audio/stream.go index 09f6de3..6366f93 100644 --- a/internal/audio/stream.go +++ b/internal/audio/stream.go @@ -25,6 +25,8 @@ type StreamSource struct { Overflows atomic.Uint64 Written atomic.Uint64 highWatermark atomic.Int64 + underrunStreak atomic.Uint64 + maxUnderrunStreak atomic.Uint64 } // NewStreamSource creates a ring buffer with the given capacity (rounded up @@ -87,10 +89,12 @@ func (s *StreamSource) ReadFrame() Frame { wp := s.writePos.Load() if rp >= wp { s.Underruns.Add(1) + s.recordUnderrunStreak() return NewFrame(0, 0) } f := s.ring[int(rp)&s.mask] s.readPos.Add(1) + s.resetUnderrunStreak() return f } @@ -117,6 +121,8 @@ func (s *StreamSource) Stats() StreamStats { buffered = float64(available) / float64(s.size) } highWatermark := int(s.highWatermark.Load()) + currentStreak := int(s.underrunStreak.Load()) + maxStreak := int(s.maxUnderrunStreak.Load()) return StreamStats{ Available: available, Capacity: s.size, @@ -127,6 +133,8 @@ func (s *StreamSource) Stats() StreamStats { Written: s.Written.Load(), Underruns: s.Underruns.Load(), Overflows: s.Overflows.Load(), + UnderrunStreak: currentStreak, + MaxUnderrunStreak: maxStreak, } } @@ -141,6 +149,8 @@ type StreamStats struct { Written uint64 `json:"written"` Underruns uint64 `json:"underruns"` Overflows uint64 `json:"overflows"` + UnderrunStreak int `json:"underrunStreak"` + MaxUnderrunStreak int `json:"maxUnderrunStreak"` } func (s *StreamSource) bufferedDurationSeconds(available int) float64 { @@ -163,6 +173,23 @@ func (s *StreamSource) updateHighWatermark() { } } +func (s *StreamSource) recordUnderrunStreak() { + current := s.underrunStreak.Add(1) + for { + prevMax := s.maxUnderrunStreak.Load() + if current <= prevMax { + return + } + if s.maxUnderrunStreak.CompareAndSwap(prevMax, current) { + return + } + } +} + +func (s *StreamSource) resetUnderrunStreak() { + s.underrunStreak.Store(0) +} + // --- StreamResampler --- // StreamResampler wraps a 
StreamSource and rate-converts from the stream's diff --git a/internal/audio/stream_test.go b/internal/audio/stream_test.go index 2169e09..6cfac5e 100644 --- a/internal/audio/stream_test.go +++ b/internal/audio/stream_test.go @@ -45,6 +45,36 @@ func TestStreamSource_Underrun(t *testing.T) { if s.Underruns.Load() != 1 { t.Fatalf("expected 1 underrun, got %d", s.Underruns.Load()) } + stats := s.Stats() + if stats.UnderrunStreak != 1 || stats.MaxUnderrunStreak != 1 { + t.Fatalf("unexpected streak: %d/%d", stats.UnderrunStreak, stats.MaxUnderrunStreak) + } +} + +func TestStreamSource_UnderrunStreakTracking(t *testing.T) { + s := NewStreamSource(16, 44100) + for i := 0; i < 3; i++ { + s.ReadFrame() + } + stats := s.Stats() + if stats.UnderrunStreak != 3 { + t.Fatalf("expected streak 3, got %d", stats.UnderrunStreak) + } + if stats.MaxUnderrunStreak != 3 { + t.Fatalf("expected max streak 3, got %d", stats.MaxUnderrunStreak) + } + + if !s.WriteFrame(NewFrame(0, 0)) { + t.Fatal("expected write to succeed") + } + s.ReadFrame() + stats = s.Stats() + if stats.UnderrunStreak != 0 { + t.Fatalf("expected streak reset to 0, got %d", stats.UnderrunStreak) + } + if stats.MaxUnderrunStreak != 3 { + t.Fatalf("expected max streak to stay 3, got %d", stats.MaxUnderrunStreak) + } } func TestStreamSource_Overflow(t *testing.T) { From 4d6edf9f5722563f2e94ecd494a15e701bcc6bff Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 06:56:52 +0200 Subject: [PATCH 42/55] Add underrun streak visibility in health panel --- internal/control/ui.html | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/internal/control/ui.html b/internal/control/ui.html index 7ff20f8..eb211cd 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1180,6 +1180,7 @@ input.input-error {
Buffer Duration
--
High Watermark
--
Queue Fill
--
+
Underrun Streak
--
Last Update
--
High Watermark Trend
@@ -1972,7 +1973,7 @@ function render() { updateText('info-fmmod', fmtBool(cfg.fm?.fmModulationEnabled)); updateText('info-live', engine.state ? `${String(engine.state).toUpperCase()} / ${state.server.runtimeOk ? 'runtime ok' : 'runtime pending'}` : (state.server.configOk ? 'config only' : '--')); - updateHealth(engine, audioStream); + updateHealth(engine, driver, audioStream); updateFaultHistory(engine); updateTransitionHistory(); updateResetHint(engine); @@ -2077,8 +2078,9 @@ function notifyRuntimeTransition(engine, pushHistory = true) { } -function updateHealth(engine, audioStream) { +function updateHealth(engine, driver, audioStream) { engine = engine || {}; + driver = driver || {}; updateText('health-http', state.server.configOk ? 'OK' : 'OFFLINE'); $('health-http').className = 'val ' + (state.server.configOk ? 'good' : 'err'); @@ -2175,6 +2177,35 @@ function updateHealth(engine, audioStream) { queueFillEl.className = 'val ' + queueFillClass; } + const streakEl = $('health-underrun-streak'); + if (streakEl) { + const streakRaw = driver?.underrunStreak; + const streakMaxRaw = driver?.maxUnderrunStreak; + const streakCurrent = Number.isFinite(Number(streakRaw)) ? Number(streakRaw) : null; + const streakMax = Number.isFinite(Number(streakMaxRaw)) ? Number(streakMaxRaw) : null; + let streakLabel = '--'; + if (streakCurrent != null) { + streakLabel = String(streakCurrent); + if (streakMax != null) { + streakLabel += ` (max ${streakMax})`; + } + } else if (streakMax != null) { + streakLabel = `Max ${streakMax}`; + } + let streakSeverity = ''; + if (streakCurrent != null || streakMax != null) { + const highestStreak = Math.max( + streakCurrent != null ? streakCurrent : 0, + streakMax != null ? 
streakMax : 0 + ); + if (highestStreak >= 6) streakSeverity = ' err'; + else if (highestStreak > 0) streakSeverity = ' warn'; + else streakSeverity = ' good'; + } + streakEl.textContent = streakLabel; + streakEl.className = 'val' + streakSeverity; + } + const last = Math.max(state.server.lastConfigAt || 0, state.server.lastRuntimeAt || 0); updateText('health-last', ageString(last)); From 6df385bd3712f9940a7f4631c4b40a9fb994ea33 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 07:21:01 +0200 Subject: [PATCH 43/55] feat: limit config request body size --- internal/control/control.go | 10 +++++++++- internal/control/control_test.go | 13 +++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/internal/control/control.go b/internal/control/control.go index 5ec9a97..7f98c03 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -5,6 +5,7 @@ import ( "encoding/json" "io" "net/http" + "strings" "sync" "github.com/jan/fm-rds-tx/internal/audio" @@ -49,6 +50,8 @@ type Server struct { streamSrc *audio.StreamSource // optional, for live audio ingest } +const maxConfigBodyBytes = 64 << 10 // 64 KiB + type ConfigPatch struct { FrequencyMHz *float64 `json:"frequencyMHz,omitempty"` OutputDrive *float64 `json:"outputDrive,omitempty"` @@ -296,9 +299,14 @@ func (s *Server) handleConfig(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(cfg) case http.MethodPost: + r.Body = http.MaxBytesReader(w, r.Body, maxConfigBodyBytes) var patch ConfigPatch if err := json.NewDecoder(r.Body).Decode(&patch); err != nil { - http.Error(w, err.Error(), http.StatusBadRequest) + statusCode := http.StatusBadRequest + if strings.Contains(err.Error(), "http: request body too large") { + statusCode = http.StatusRequestEntityTooLarge + } + http.Error(w, err.Error(), statusCode) return } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 
e67ae4c..6d883c5 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -133,6 +133,19 @@ func TestConfigPatch(t *testing.T) { } } +func TestConfigPatchRejectsOversizeBody(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + payload := bytes.Repeat([]byte("x"), maxConfigBodyBytes+32) + body := append([]byte(`{"ps":"`), payload...) + body = append(body, []byte(`"}`)...) + req := httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body)) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusRequestEntityTooLarge { + t.Fatalf("expected 413, got %d response=%q", rec.Code, rec.Body.String()) + } +} + func TestRuntimeWithoutDriver(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() From b51a7da522847ab20f7241d906f0b314217717c3 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 07:26:20 +0200 Subject: [PATCH 44/55] control: enforce JSON content type for config API --- docs/API.md | 2 ++ internal/control/control.go | 22 +++++++++++++++++++- internal/control/control_test.go | 35 ++++++++++++++++++++++++++++---- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/docs/API.md b/docs/API.md index bde17a6..c5fdebc 100644 --- a/docs/API.md +++ b/docs/API.md @@ -144,6 +144,8 @@ The control snapshot (GET /config) only reflects new values once they pass valid **Request body:** JSON with any subset of patchable fields. +**Content-Type:** `application/json` (charset parameters allowed). Requests without it are rejected with 415 Unsupported Media Type. 
+ **Response:** ```json {"ok": true, "live": true} diff --git a/internal/control/control.go b/internal/control/control.go index 7f98c03..dd1ac59 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -4,6 +4,7 @@ import ( _ "embed" "encoding/json" "io" + "mime" "net/http" "strings" "sync" @@ -50,7 +51,22 @@ type Server struct { streamSrc *audio.StreamSource // optional, for live audio ingest } -const maxConfigBodyBytes = 64 << 10 // 64 KiB +const ( + maxConfigBodyBytes = 64 << 10 // 64 KiB + configContentTypeHeader = "application/json" +) + +func isJSONContentType(r *http.Request) bool { + ct := strings.TrimSpace(r.Header.Get("Content-Type")) + if ct == "" { + return false + } + mediaType, _, err := mime.ParseMediaType(ct) + if err != nil { + return false + } + return strings.EqualFold(mediaType, configContentTypeHeader) +} type ConfigPatch struct { FrequencyMHz *float64 `json:"frequencyMHz,omitempty"` @@ -299,6 +315,10 @@ func (s *Server) handleConfig(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(cfg) case http.MethodPost: + if !isJSONContentType(r) { + http.Error(w, "Content-Type must be application/json", http.StatusUnsupportedMediaType) + return + } r.Body = http.MaxBytesReader(w, r.Body, maxConfigBodyBytes) var patch ConfigPatch if err := json.NewDecoder(r.Body).Decode(&patch); err != nil { diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 6d883c5..8a86cd2 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -127,7 +127,7 @@ func TestConfigPatch(t *testing.T) { srv := NewServer(cfgpkg.Default()) body := []byte(`{"toneLeftHz":900,"radioText":"hello world","preEmphasisTauUS":75}`) rec := httptest.NewRecorder() - srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body))) + srv.Handler().ServeHTTP(rec, newConfigPostRequest(body)) if rec.Code != 200 { 
t.Fatalf("status: %d body=%s", rec.Code, rec.Body.String()) } @@ -139,13 +139,34 @@ func TestConfigPatchRejectsOversizeBody(t *testing.T) { payload := bytes.Repeat([]byte("x"), maxConfigBodyBytes+32) body := append([]byte(`{"ps":"`), payload...) body = append(body, []byte(`"}`)...) - req := httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body)) + req := newConfigPostRequest(body) srv.Handler().ServeHTTP(rec, req) if rec.Code != http.StatusRequestEntityTooLarge { t.Fatalf("expected 413, got %d response=%q", rec.Code, rec.Body.String()) } } +func TestConfigPatchRejectsMissingContentType(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader([]byte(`{}`))) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusUnsupportedMediaType { + t.Fatalf("expected 415 when Content-Type missing, got %d", rec.Code) + } +} + +func TestConfigPatchRejectsNonJSONContentType(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader([]byte(`{}`))) + req.Header.Set("Content-Type", "text/plain") + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusUnsupportedMediaType { + t.Fatalf("expected 415 for non-JSON Content-Type, got %d", rec.Code) + } +} + func TestRuntimeWithoutDriver(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() @@ -334,7 +355,7 @@ func TestConfigPatchUpdatesSnapshot(t *testing.T) { rec := httptest.NewRecorder() body := []byte(`{"outputDrive":1.2}`) - srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body))) + srv.Handler().ServeHTTP(rec, newConfigPostRequest(body)) if rec.Code != 200 { t.Fatalf("status: %d", rec.Code) } @@ -363,7 +384,7 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) { body := []byte(`{"outputDrive":2.2}`) rec := 
httptest.NewRecorder() - srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body))) + srv.Handler().ServeHTTP(rec, newConfigPostRequest(body)) if rec.Code != http.StatusBadRequest { t.Fatalf("expected 400, got %d", rec.Code) } @@ -379,6 +400,12 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) { } } +func newConfigPostRequest(body []byte) *http.Request { + req := httptest.NewRequest(http.MethodPost, "/config", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + return req +} + type fakeTXController struct { updateErr error resetErr error From 002bb0a96e8acef07c89c39f0c04b1a29abe2fc9 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 07:31:12 +0200 Subject: [PATCH 45/55] control: harden HTTP server timeouts --- cmd/fmrtx/main.go | 11 ++++++----- docs/README.md | 2 ++ internal/control/server.go | 27 +++++++++++++++++++++++++++ internal/control/server_test.go | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 internal/control/server.go create mode 100644 internal/control/server_test.go diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go index a45a5ed..9bc15ed 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -5,7 +5,6 @@ import ( "flag" "fmt" "log" - "net/http" "os" "os/signal" "syscall" @@ -109,8 +108,9 @@ func main() { // --- default: HTTP only --- srv := ctrlpkg.NewServer(cfg) - log.Printf("fm-rds-tx listening on %s (TX default: off, use --tx for hardware)", cfg.Control.ListenAddress) - log.Fatal(http.ListenAndServe(cfg.Control.ListenAddress, srv.Handler())) + server := ctrlpkg.NewHTTPServer(cfg, srv.Handler()) + log.Printf("fm-rds-tx listening on %s (TX default: off, use --tx for hardware)", server.Addr) + log.Fatal(server.ListenAndServe()) } // selectDriver picks the best available driver based on config and build tags. 
@@ -228,9 +228,10 @@ func runTXMode(cfg cfgpkg.Config, driver platform.SoapyDriver, autoStart bool, a log.Println("TX ready (idle) — POST /tx/start to begin") } + ctrlServer := ctrlpkg.NewHTTPServer(cfg, srv.Handler()) go func() { - log.Printf("control plane on %s", cfg.Control.ListenAddress) - if err := http.ListenAndServe(cfg.Control.ListenAddress, srv.Handler()); err != nil { + log.Printf("control plane on %s (read=%s write=%s idle=%s)", ctrlServer.Addr, ctrlServer.ReadTimeout, ctrlServer.WriteTimeout, ctrlServer.IdleTimeout) + if err := ctrlServer.ListenAndServe(); err != nil { log.Printf("http: %v", err) } }() diff --git a/docs/README.md b/docs/README.md index 9549c41..34424e9 100644 --- a/docs/README.md +++ b/docs/README.md @@ -87,6 +87,8 @@ All major TX parameters are hot-reloadable via `POST /config` during live transm Available endpoints: `/healthz`, `/status`, `/runtime`, `/config` (GET/POST), `/dry-run`, `/tx/start`, `/tx/stop` +Control-plane HTTP server is configured with 5s read, 10s write, and 60s idle timeouts plus a 1 MiB header limit to reduce slow-client abuse. + ### Internal DSP module - `cd internal` - `go test ./...` diff --git a/internal/control/server.go b/internal/control/server.go new file mode 100644 index 0000000..9fcd5cd --- /dev/null +++ b/internal/control/server.go @@ -0,0 +1,27 @@ +package control + +import ( + "net/http" + "time" + + "github.com/jan/fm-rds-tx/internal/config" +) + +const ( + defaultReadTimeout = 5 * time.Second + defaultWriteTimeout = 10 * time.Second + defaultIdleTimeout = 60 * time.Second + defaultMaxHeaderBytes = 1 << 20 // 1 MiB +) + +// NewHTTPServer returns a configured HTTP server for the control plane. 
+func NewHTTPServer(cfg config.Config, handler http.Handler) *http.Server { + return &http.Server{ + Addr: cfg.Control.ListenAddress, + Handler: handler, + ReadTimeout: defaultReadTimeout, + WriteTimeout: defaultWriteTimeout, + IdleTimeout: defaultIdleTimeout, + MaxHeaderBytes: defaultMaxHeaderBytes, + } +} diff --git a/internal/control/server_test.go b/internal/control/server_test.go new file mode 100644 index 0000000..9f8cb95 --- /dev/null +++ b/internal/control/server_test.go @@ -0,0 +1,33 @@ +package control + +import ( + "net/http" + "testing" + + cfgpkg "github.com/jan/fm-rds-tx/internal/config" +) + +func TestNewHTTPServerConfig(t *testing.T) { + cfg := cfgpkg.Default() + handler := http.NewServeMux() + srv := NewHTTPServer(cfg, handler) + + if srv.Addr != cfg.Control.ListenAddress { + t.Fatalf("expected server address %q, got %q", cfg.Control.ListenAddress, srv.Addr) + } + if srv.Handler != handler { + t.Fatalf("expected handler to be preserved") + } + if srv.ReadTimeout != defaultReadTimeout { + t.Fatalf("expected read timeout %s, got %s", defaultReadTimeout, srv.ReadTimeout) + } + if srv.WriteTimeout != defaultWriteTimeout { + t.Fatalf("expected write timeout %s, got %s", defaultWriteTimeout, srv.WriteTimeout) + } + if srv.IdleTimeout != defaultIdleTimeout { + t.Fatalf("expected idle timeout %s, got %s", defaultIdleTimeout, srv.IdleTimeout) + } + if srv.MaxHeaderBytes != defaultMaxHeaderBytes { + t.Fatalf("expected max header bytes %d, got %d", defaultMaxHeaderBytes, srv.MaxHeaderBytes) + } +} From dd7ae483c40412b81276894c70da849a9b22d15d Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Mon, 6 Apr 2026 07:39:25 +0200 Subject: [PATCH 46/55] control: reject unexpected bodies on control POSTs --- internal/control/control.go | 30 ++++++++++++++++++++++ internal/control/control_test.go | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/internal/control/control.go b/internal/control/control.go index dd1ac59..07cb355 100644 --- 
a/internal/control/control.go +++ b/internal/control/control.go @@ -54,6 +54,7 @@ type Server struct { const ( maxConfigBodyBytes = 64 << 10 // 64 KiB configContentTypeHeader = "application/json" + noBodyErrMsg = "request must not include a body" ) func isJSONContentType(r *http.Request) bool { @@ -89,6 +90,26 @@ func NewServer(cfg config.Config) *Server { return &Server{cfg: cfg} } +func hasRequestBody(r *http.Request) bool { + if r.ContentLength > 0 { + return true + } + for _, te := range r.TransferEncoding { + if strings.EqualFold(te, "chunked") { + return true + } + } + return false +} + +func rejectBody(w http.ResponseWriter, r *http.Request) bool { + if !hasRequestBody(r) { + return true + } + http.Error(w, noBodyErrMsg, http.StatusBadRequest) + return false +} + func (s *Server) SetTXController(tx TXController) { s.mu.Lock() s.tx = tx @@ -200,6 +221,9 @@ func (s *Server) handleRuntimeFaultReset(w http.ResponseWriter, r *http.Request) http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } + if !rejectBody(w, r) { + return + } s.mu.RLock() tx := s.tx s.mu.RUnlock() @@ -263,6 +287,9 @@ func (s *Server) handleTXStart(w http.ResponseWriter, r *http.Request) { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } + if !rejectBody(w, r) { + return + } s.mu.RLock() tx := s.tx s.mu.RUnlock() @@ -283,6 +310,9 @@ func (s *Server) handleTXStop(w http.ResponseWriter, r *http.Request) { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } + if !rejectBody(w, r) { + return + } s.mu.RLock() tx := s.tx s.mu.RUnlock() diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 8a86cd2..846b24d 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -6,6 +6,7 @@ import ( "errors" "net/http" "net/http/httptest" + "strings" "testing" "github.com/jan/fm-rds-tx/internal/audio" @@ -288,6 +289,20 @@ func TestRuntimeFaultResetSuccess(t *testing.T) { } } +func 
TestRuntimeFaultResetRejectsBody(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{}) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", bytes.NewReader([]byte("nope"))) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400 when body present, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "request must not include a body") { + t.Fatalf("unexpected response body: %q", rec.Body.String()) + } +} + func TestAudioStreamRequiresSource(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() @@ -349,6 +364,34 @@ func TestTXStartWithoutController(t *testing.T) { } } +func TestTXStartRejectsBody(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{}) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/tx/start", bytes.NewReader([]byte("body"))) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400 when body present, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "request must not include a body") { + t.Fatalf("unexpected response body: %q", rec.Body.String()) + } +} + +func TestTXStopRejectsBody(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{}) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/tx/stop", bytes.NewReader([]byte("body"))) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400 when body present, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "request must not include a body") { + t.Fatalf("unexpected response body: %q", rec.Body.String()) + } +} + func TestConfigPatchUpdatesSnapshot(t *testing.T) { srv := NewServer(cfgpkg.Default()) srv.SetTXController(&fakeTXController{}) From 
25dfb6c24c9ea3e73869457899f033a8c3cc16c9 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 07:53:06 +0200 Subject: [PATCH 47/55] Hardening audio stream Content-Type guard --- README.md | 678 ++++++------- docs/API.md | 793 ++++++++-------- docs/pro-runtime-hardening-workboard.md | 1153 ++++++++++++----------- internal/control/control.go | 33 +- internal/control/control_test.go | 33 + 5 files changed, 1377 insertions(+), 1313 deletions(-) diff --git a/README.md b/README.md index ad73b7c..ff7d2fb 100644 --- a/README.md +++ b/README.md @@ -1,338 +1,340 @@ -# fm-rds-tx - -Go-based FM stereo transmitter with RDS. Supports ADALM-Pluto (PlutoSDR) and SoapySDR-compatible TX devices. - -## Status - -**Current status:** `v0.7.0-pre` — hardware bring-up milestone - -What is already in place: -- complete DSP chain: audio -> pre-emphasis -> stereo encoding -> RDS -> MPX -> limiter -> FM modulation -- real hardware TX paths for PlutoSDR / SoapySDR backends -- continuous TX engine with runtime telemetry -- dry-run, offline generation, and simulated TX modes -- HTTP control plane with live config patching and runtime/status endpoints -- browser UI on `/` -- live audio ingestion via stdin or HTTP stream input - -Current engineering focus: -- deterministic runtime behavior -- fault handling / recovery -- observability and runtime telemetry -- hardware-validated signal quality - -For the active runtime-hardening track, see: -- `docs/pro-runtime-hardening-workboard.md` - -## Signal path - -```text -Audio Source -> PreEmphasis(50us/75us/off) -> StereoEncoder(19k + 38k DSB-SC) --> RDS(57k BPSK) -> MPX Combiner -> Limiter -> FM Modulator(+/-75kHz) --> optional split-rate FM upsampling -> SDR backend -> RF output -``` - -For deeper DSP details, see: -- `docs/DSP-CHAIN.md` - -## Prerequisites - -### Go -- Go version from `go.mod` (currently Go 1.22) - -### Native SDR dependencies -Depending on backend, native libraries are required: - -- **SoapySDR backend** - - build with `-tags 
soapy` - - requires SoapySDR native library (`SoapySDR.dll` / `libSoapySDR.so` / `libSoapySDR.dylib`) - - on Windows, PothosSDR is the expected setup - -- **Pluto backend** - - uses native `libiio` - - Windows expects `libiio.dll` - - Linux build/runtime expects `pkg-config` + `libiio` - -### Hardware / legal -- validate RF output, deviation, filtering, and power with proper measurement equipment -- use only within applicable legal and regulatory constraints - -## Quick start - -## Build - -```powershell -# Build CLI tools without hardware-specific build tags: -go build ./cmd/fmrtx -go build ./cmd/offline - -# Build fmrtx with SoapySDR support: -go build -tags soapy ./cmd/fmrtx -``` - -## Quick verification - -```powershell -# Print effective config -go run ./cmd/fmrtx -print-config - -# Run tests -go test ./... - -# Basic dry-run summary -go run ./cmd/fmrtx --dry-run --dry-output build/dryrun/frame.json -``` - -For additional build/test commands, see: -- `docs/README.md` - -## Common usage flows - -### 1) List available SDR devices - -```powershell -.\fmrtx.exe --list-devices -``` - -### 2) Dry-run / config verification - -```powershell -.\fmrtx.exe --dry-run --dry-output build/dryrun/frame.json - -# Write dry-run JSON to stdout -.\fmrtx.exe --dry-run --dry-output - -``` - -### 3) Offline IQ/composite generation - -```powershell -go run ./cmd/offline -duration 2s -output build/offline/composite.iqf32 - -# Optional output rate override -go run ./cmd/offline -duration 500ms -output build/offline/composite.iqf32 -output-rate 228000 -``` - -### 4) Simulated transmit path - -```powershell -go run ./cmd/fmrtx --simulate-tx --simulate-output build/sim/simulated-soapy.iqf32 --simulate-duration 250ms -``` - -### 5) Real TX with config file - -```powershell -# Start TX service with manual start over HTTP -.\fmrtx.exe --tx --config docs/config.plutosdr.json - -# Start and begin transmitting immediately -.\fmrtx.exe --tx --tx-auto-start --config docs/config.plutosdr.json -``` 
- -### 6) Live audio via stdin - -```powershell -ffmpeg -i "http://svabi.ch:8443/stream" -f s16le -ar 44100 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --config docs/config.plutosdr.json -``` - -### 7) Custom audio input rate - -```powershell -ffmpeg -i source.wav -f s16le -ar 48000 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --audio-rate 48000 --config docs/config.plutosdr.json -``` - -### 8) HTTP audio ingest - -Start the control plane with `--audio-http` to accept raw PCM pushes on `/audio/stream` and feed them into the live encoder: - -```powershell -ffmpeg -i music.mp3 -f s16le -ar 44100 -ac 2 - | curl -X POST --data-binary @- http://localhost:8088/audio/stream -``` - -## CLI overview - -## `fmrtx` -Important runtime modes and flags include: -- `--tx` -- `--tx-auto-start` -- `--dry-run` -- `--dry-output ` -- `--simulate-tx` -- `--simulate-output ` -- `--simulate-duration ` -- `--config ` -- `--print-config` -- `--list-devices` -- `--audio-stdin` -- `--audio-rate ` -- `--audio-http` - -## `offline` -Useful flags include: -- `-duration ` -- `-output ` -- `-output-rate ` - -If the README is too high-level for the exact CLI surface, check: -- `cmd/fmrtx/main.go` -- `cmd/offline/main.go` - -## HTTP control plane - -Base URL: `http://{listenAddress}` (default typically `127.0.0.1:8088`) - -Security note: -- keep the control plane bound locally unless you intentionally place it behind a trusted and hardened access layer - -### Main endpoints - -```text -GET / browser UI -GET /healthz health check -GET /status current config/status snapshot -GET /runtime live engine / driver / audio telemetry -GET /config full config -POST /config patch config / live updates -GET /dry-run synthetic frame summary -POST /tx/start start transmission -POST /tx/stop stop transmission -POST /audio/stream push raw S16LE stereo PCM into live stream buffer -``` - -### What the control plane covers -- TX start / stop -- runtime status and driver telemetry -- config 
inspection -- live patching of selected parameters -- dry-run inspection -- browser-accessible control UI -- optional HTTP audio ingest (enable with `--audio-http`) - -### Live config notes -`POST /config` supports live updates for selected fields such as: -- frequency -- stereo enable/disable -- pilot / RDS injection levels -- RDS enable/disable -- limiter settings -- PS / RadioText - -Some parameters are saved but not live-applied and require restart. - -For the full API contract, examples, live-patch semantics, and `/audio/stream` details, see: -- `docs/API.md` - -## Configuration - -Sample configs: -- `docs/config.sample.json` -- `docs/config.plutosdr.json` -- `docs/config.orangepi-pluto-soapy.json` - -Important config areas include: -- `fm.*` -- `rds.*` -- `audio.*` -- `backend.*` -- `control.*` - -Examples of relevant fields you may want to inspect: -- `fm.outputDrive` -- `fm.mpxGain` -- `fm.bs412Enabled` -- `fm.bs412ThresholdDBr` -- `fm.fmModulationEnabled` -- `backend.kind` -- `backend.driver` -- `backend.deviceArgs` -- `backend.uri` -- `backend.deviceSampleRateHz` -- `backend.outputPath` -- `control.listenAddress` - -For deeper config/API behavior, refer to: -- `internal/config/config.go` -- `docs/API.md` -- `docs/config.sample.json` - -## Development and testing - -Useful commands: - -```powershell -go test ./... 
-go run ./cmd/fmrtx -print-config -go run ./cmd/fmrtx -config docs/config.sample.json -go run ./cmd/fmrtx --dry-run --dry-output build/dryrun/frame.json -go run ./cmd/fmrtx --simulate-tx --simulate-output build/sim/simulated-soapy.iqf32 --simulate-duration 250ms -go run ./cmd/offline -duration 500ms -output build/offline/composite.iqf32 -``` - -See also: -- `docs/README.md` - -## PlutoSDR / backend notes - -- PlutoSDR commonly runs with a device-side sample rate above composite rate, so split-rate mode may be used automatically -- SoapySDR backend is suitable for Soapy-compatible TX hardware -- backend/device settings are selected through config rather than hardcoded paths -- runtime telemetry should be used to inspect effective TX state during operation - -## Repository layout - -```text -cmd/ - fmrtx/ main CLI - offline/ offline generator -internal/ - app/ TX engine + runtime state - audio/ audio input, resampling, tone generation, stream buffering - config/ config schema and validation - control/ HTTP control plane + browser UI - dryrun/ dry-run JSON summaries - dsp/ DSP primitives - mpx/ MPX combiner - offline/ full offline composite generation - output/ output/backend abstractions - platform/ backend abstractions and device/runtime stats - platform/soapysdr/ CGO SoapySDR binding - platform/plutosdr/ Pluto/libiio backend code - rds/ RDS encoder - stereo/ stereo encoder -docs/ - API.md - DSP-CHAIN.md - README.md - config.sample.json - config.plutosdr.json - config.orangepi-pluto-soapy.json - pro-runtime-hardening-workboard.md -scripts/ -examples/ -``` - -## Planning / workboard - -For the current runtime-hardening / professionalization track, see: -- `docs/pro-runtime-hardening-workboard.md` - -This is the living workboard for: -- status tracking -- confirmed findings -- open technical decisions -- verification notes -- implementation progress - -## Release / project docs - -Additional project docs: -- `CHANGELOG.md` -- `RELEASE.md` -- `docs/README.md` -- 
`docs/API.md` -- `docs/DSP-CHAIN.md` -- `docs/NOTES.md` - -## Legal note - -This project is intended only for lawful use within relevant license and regulatory constraints. -RF output, deviation, filtering, and transmitted power must be validated with proper measurement equipment. +# fm-rds-tx + +Go-based FM stereo transmitter with RDS. Supports ADALM-Pluto (PlutoSDR) and SoapySDR-compatible TX devices. + +## Status + +**Current status:** `v0.7.0-pre` — hardware bring-up milestone + +What is already in place: +- complete DSP chain: audio -> pre-emphasis -> stereo encoding -> RDS -> MPX -> limiter -> FM modulation +- real hardware TX paths for PlutoSDR / SoapySDR backends +- continuous TX engine with runtime telemetry +- dry-run, offline generation, and simulated TX modes +- HTTP control plane with live config patching and runtime/status endpoints +- browser UI on `/` +- live audio ingestion via stdin or HTTP stream input + +Current engineering focus: +- deterministic runtime behavior +- fault handling / recovery +- observability and runtime telemetry +- hardware-validated signal quality + +For the active runtime-hardening track, see: +- `docs/pro-runtime-hardening-workboard.md` + +## Signal path + +```text +Audio Source -> PreEmphasis(50us/75us/off) -> StereoEncoder(19k + 38k DSB-SC) +-> RDS(57k BPSK) -> MPX Combiner -> Limiter -> FM Modulator(+/-75kHz) +-> optional split-rate FM upsampling -> SDR backend -> RF output +``` + +For deeper DSP details, see: +- `docs/DSP-CHAIN.md` + +## Prerequisites + +### Go +- Go version from `go.mod` (currently Go 1.22) + +### Native SDR dependencies +Depending on backend, native libraries are required: + +- **SoapySDR backend** + - build with `-tags soapy` + - requires SoapySDR native library (`SoapySDR.dll` / `libSoapySDR.so` / `libSoapySDR.dylib`) + - on Windows, PothosSDR is the expected setup + +- **Pluto backend** + - uses native `libiio` + - Windows expects `libiio.dll` + - Linux build/runtime expects `pkg-config` + `libiio` 
+ +### Hardware / legal +- validate RF output, deviation, filtering, and power with proper measurement equipment +- use only within applicable legal and regulatory constraints + +## Quick start + +## Build + +```powershell +# Build CLI tools without hardware-specific build tags: +go build ./cmd/fmrtx +go build ./cmd/offline + +# Build fmrtx with SoapySDR support: +go build -tags soapy ./cmd/fmrtx +``` + +## Quick verification + +```powershell +# Print effective config +go run ./cmd/fmrtx -print-config + +# Run tests +go test ./... + +# Basic dry-run summary +go run ./cmd/fmrtx --dry-run --dry-output build/dryrun/frame.json +``` + +For additional build/test commands, see: +- `docs/README.md` + +## Common usage flows + +### 1) List available SDR devices + +```powershell +.\fmrtx.exe --list-devices +``` + +### 2) Dry-run / config verification + +```powershell +.\fmrtx.exe --dry-run --dry-output build/dryrun/frame.json + +# Write dry-run JSON to stdout +.\fmrtx.exe --dry-run --dry-output - +``` + +### 3) Offline IQ/composite generation + +```powershell +go run ./cmd/offline -duration 2s -output build/offline/composite.iqf32 + +# Optional output rate override +go run ./cmd/offline -duration 500ms -output build/offline/composite.iqf32 -output-rate 228000 +``` + +### 4) Simulated transmit path + +```powershell +go run ./cmd/fmrtx --simulate-tx --simulate-output build/sim/simulated-soapy.iqf32 --simulate-duration 250ms +``` + +### 5) Real TX with config file + +```powershell +# Start TX service with manual start over HTTP +.\fmrtx.exe --tx --config docs/config.plutosdr.json + +# Start and begin transmitting immediately +.\fmrtx.exe --tx --tx-auto-start --config docs/config.plutosdr.json +``` + +### 6) Live audio via stdin + +```powershell +ffmpeg -i "http://svabi.ch:8443/stream" -f s16le -ar 44100 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --config docs/config.plutosdr.json +``` + +### 7) Custom audio input rate + +```powershell +ffmpeg -i source.wav -f s16le 
-ar 48000 -ac 2 - | .\fmrtx.exe --tx --tx-auto-start --audio-stdin --audio-rate 48000 --config docs/config.plutosdr.json +``` + +### 8) HTTP audio ingest + +Start the control plane with `--audio-http` to accept raw PCM pushes on `/audio/stream` and feed them into the live encoder: + +Set `Content-Type` to `application/octet-stream` (or `audio/L16`) when posting audio data: + +```powershell +ffmpeg -i music.mp3 -f s16le -ar 44100 -ac 2 - | curl -X POST -H "Content-Type: application/octet-stream" --data-binary @- http://localhost:8088/audio/stream +``` + +## CLI overview + +## `fmrtx` +Important runtime modes and flags include: +- `--tx` +- `--tx-auto-start` +- `--dry-run` +- `--dry-output ` +- `--simulate-tx` +- `--simulate-output ` +- `--simulate-duration ` +- `--config ` +- `--print-config` +- `--list-devices` +- `--audio-stdin` +- `--audio-rate ` +- `--audio-http` + +## `offline` +Useful flags include: +- `-duration ` +- `-output ` +- `-output-rate ` + +If the README is too high-level for the exact CLI surface, check: +- `cmd/fmrtx/main.go` +- `cmd/offline/main.go` + +## HTTP control plane + +Base URL: `http://{listenAddress}` (default typically `127.0.0.1:8088`) + +Security note: +- keep the control plane bound locally unless you intentionally place it behind a trusted and hardened access layer + +### Main endpoints + +```text +GET / browser UI +GET /healthz health check +GET /status current config/status snapshot +GET /runtime live engine / driver / audio telemetry +GET /config full config +POST /config patch config / live updates +GET /dry-run synthetic frame summary +POST /tx/start start transmission +POST /tx/stop stop transmission +POST /audio/stream push raw S16LE stereo PCM into live stream buffer (Content-Type: application/octet-stream or audio/L16 required) +``` + +### What the control plane covers +- TX start / stop +- runtime status and driver telemetry +- config inspection +- live patching of selected parameters +- dry-run inspection +- 
browser-accessible control UI +- optional HTTP audio ingest (enable with `--audio-http`) + +### Live config notes +`POST /config` supports live updates for selected fields such as: +- frequency +- stereo enable/disable +- pilot / RDS injection levels +- RDS enable/disable +- limiter settings +- PS / RadioText + +Some parameters are saved but not live-applied and require restart. + +For the full API contract, examples, live-patch semantics, and `/audio/stream` details, see: +- `docs/API.md` + +## Configuration + +Sample configs: +- `docs/config.sample.json` +- `docs/config.plutosdr.json` +- `docs/config.orangepi-pluto-soapy.json` + +Important config areas include: +- `fm.*` +- `rds.*` +- `audio.*` +- `backend.*` +- `control.*` + +Examples of relevant fields you may want to inspect: +- `fm.outputDrive` +- `fm.mpxGain` +- `fm.bs412Enabled` +- `fm.bs412ThresholdDBr` +- `fm.fmModulationEnabled` +- `backend.kind` +- `backend.driver` +- `backend.deviceArgs` +- `backend.uri` +- `backend.deviceSampleRateHz` +- `backend.outputPath` +- `control.listenAddress` + +For deeper config/API behavior, refer to: +- `internal/config/config.go` +- `docs/API.md` +- `docs/config.sample.json` + +## Development and testing + +Useful commands: + +```powershell +go test ./... 
+go run ./cmd/fmrtx -print-config +go run ./cmd/fmrtx -config docs/config.sample.json +go run ./cmd/fmrtx --dry-run --dry-output build/dryrun/frame.json +go run ./cmd/fmrtx --simulate-tx --simulate-output build/sim/simulated-soapy.iqf32 --simulate-duration 250ms +go run ./cmd/offline -duration 500ms -output build/offline/composite.iqf32 +``` + +See also: +- `docs/README.md` + +## PlutoSDR / backend notes + +- PlutoSDR commonly runs with a device-side sample rate above composite rate, so split-rate mode may be used automatically +- SoapySDR backend is suitable for Soapy-compatible TX hardware +- backend/device settings are selected through config rather than hardcoded paths +- runtime telemetry should be used to inspect effective TX state during operation + +## Repository layout + +```text +cmd/ + fmrtx/ main CLI + offline/ offline generator +internal/ + app/ TX engine + runtime state + audio/ audio input, resampling, tone generation, stream buffering + config/ config schema and validation + control/ HTTP control plane + browser UI + dryrun/ dry-run JSON summaries + dsp/ DSP primitives + mpx/ MPX combiner + offline/ full offline composite generation + output/ output/backend abstractions + platform/ backend abstractions and device/runtime stats + platform/soapysdr/ CGO SoapySDR binding + platform/plutosdr/ Pluto/libiio backend code + rds/ RDS encoder + stereo/ stereo encoder +docs/ + API.md + DSP-CHAIN.md + README.md + config.sample.json + config.plutosdr.json + config.orangepi-pluto-soapy.json + pro-runtime-hardening-workboard.md +scripts/ +examples/ +``` + +## Planning / workboard + +For the current runtime-hardening / professionalization track, see: +- `docs/pro-runtime-hardening-workboard.md` + +This is the living workboard for: +- status tracking +- confirmed findings +- open technical decisions +- verification notes +- implementation progress + +## Release / project docs + +Additional project docs: +- `CHANGELOG.md` +- `RELEASE.md` +- `docs/README.md` +- 
`docs/API.md` +- `docs/DSP-CHAIN.md` +- `docs/NOTES.md` + +## Legal note + +This project is intended only for lawful use within relevant license and regulatory constraints. +RF output, deviation, filtering, and transmitted power must be validated with proper measurement equipment. diff --git a/docs/API.md b/docs/API.md index c5fdebc..e7f89b0 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1,396 +1,397 @@ -# fm-rds-tx HTTP Control API - -Base URL: `http://{listenAddress}` (default `127.0.0.1:8088`) - ---- - -## Endpoints - -### `GET /healthz` - -Health check. - -**Response:** -```json -{"ok": true} -``` - -`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. - - ---- - -### `GET /status` - -Current transmitter status (read-only snapshot). Runtime indicator, alert, and queue stats from the running TX controller are mirrored here for quick health checks. - -**Response:** -```json -{ - "service": "fm-rds-tx", - "backend": "pluto", - "frequencyMHz": 100.0, - "stereoEnabled": true, - "rdsEnabled": true, - "preEmphasisTauUS": 50, - "limiterEnabled": true, - "fmModulationEnabled": true, - "runtimeIndicator": "normal", - "runtimeAlert": "", - "queue": { - "capacity": 3, - "depth": 1, - "fillLevel": 0.33, - "health": "low" - } -} -``` - -`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", but late-buffer alerts expire after a few seconds once cycle times settle so the signal doesn't stay stuck on degraded. The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start for post-mortem diagnosis. 
- ---- - -### `GET /runtime` - -Live engine and driver telemetry. Only populated when TX is active. - -**Response:** -```json -{ - "engine": { - "state": "running", - "runtimeStateDurationSeconds": 12.4, - "chunksProduced": 12345, - "totalSamples": 1408950000, - "underruns": 0, - "lastError": "", - "uptimeSeconds": 3614.2, - "faultCount": 2, - "lastFault": { - "time": "2026-04-06T00:00:00Z", - "reason": "queueCritical", - "severity": "faulted", - "message": "queue health critical for 5 checks" - }, - "faultHistory": [ - { - "time": "2026-04-06T00:00:00Z", - "reason": "queueCritical", - "severity": "faulted", - "message": "queue health critical for 5 checks" - } - ], - "transitionHistory": [ - { - "time": "2026-04-06T00:00:00Z", - "from": "running", - "to": "degraded", - "severity": "warn" - } - ] - }, - "driver": { - "txEnabled": true, - "streamActive": true, - "framesWritten": 12345, - "samplesWritten": 1408950000, - "underruns": 0, - "underrunStreak": 0, - "maxUnderrunStreak": 0, - "effectiveSampleRateHz": 2280000 - } -} -``` -`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. - -`runtimeStateDurationSeconds` sagt, wie viele Sekunden die Engine bereits im aktuellen Runtime-Zustand verweilt. So erkennt man schnell, ob `muted`/`degraded` zu lange dauern oder ob ein Übergang gerade frisch begonnen hat. - -`transitionHistory` liefert die jüngsten Übergänge (from/to, severity, timestamp) damit API und UI die Runtime History synchronisieren können. - -`driver.underrunStreak` reports how many consecutive reads returned silence, and `driver.maxUnderrunStreak` captures the longest such run since the engine started. Together they help differentiate short glitches from persistent underrun storms and can be plotted alongside queue health sparkline telemetry. 
- - ---- - -### `POST /runtime/fault/reset` - -Manually acknowledge a `faulted` runtime state so the supervisor can re-enter the recovery path (the engine moves back to `degraded` once the reset succeeds). - -**Response:** -```json -{"ok": true} -``` - -**Errors:** -- `405 Method Not Allowed` if the request is not a POST -- `503 Service Unavailable` when no TX controller is attached (`--tx` mode not active) -- `409 Conflict` when the engine is not currently faulted or the reset was rejected (e.g. still throttled) - ---- - -### `GET /config` - -Full current configuration (all fields, including non-patchable). - -**Response:** Complete `Config` JSON object. - ---- - -### `POST /config` - -**Live parameter update.** Changes are applied to the running TX engine immediately — no restart required. Only include fields you want to change (PATCH semantics). - -The control snapshot (GET /config) only reflects new values once they pass validation and, if the TX engine is running, after the live update succeeded. That keeps the API from reporting desired values that were rejected or still pending. - -**Request body:** JSON with any subset of patchable fields. - -**Content-Type:** `application/json` (charset parameters allowed). Requests without it are rejected with 415 Unsupported Media Type. - -**Response:** -```json -{"ok": true, "live": true} -``` - -`"live": true` = changes were forwarded to the running engine. -`"live": false` = engine not active, changes saved for next start. - -#### Patchable fields — DSP (applied within ~50ms) - -| Field | Type | Range | Description | -|---|---|---|---| -| `frequencyMHz` | float | 65–110 | TX center frequency. Tunes hardware LO live. | -| `outputDrive` | float | 0–10 | Composite output level multiplier (empfohlen 1..4). | -| `stereoEnabled` | bool | | Enable/disable stereo (pilot + 38kHz subcarrier). | -| `pilotLevel` | float | 0–0.2 | 19 kHz pilot injection level. 
| -| `rdsInjection` | float | 0–0.15 | 57 kHz RDS subcarrier injection level. | -| `rdsEnabled` | bool | | Enable/disable RDS subcarrier. | -| `limiterEnabled` | bool | | Enable/disable MPX peak limiter. | -| `limiterCeiling` | float | 0–2 | Limiter ceiling (max composite amplitude). | - -#### Patchable fields — RDS text (applied within ~88ms) - -| Field | Type | Max length | Description | -|---|---|---|---| -| `ps` | string | 8 chars | Program Service name (station name on receiver display). | -| `radioText` | string | 64 chars | RadioText message (scrolling text on receiver). | - -When `radioText` is updated, the RDS A/B flag toggles automatically per spec, signaling receivers to refresh their display. - -#### Patchable fields — other (saved, not live-applied) - -| Field | Type | Description | -|---|---|---| -| `toneLeftHz` | float | Left tone frequency (test generator). | -| `toneRightHz` | float | Right tone frequency (test generator). | -| `toneAmplitude` | float | Test tone amplitude (0–1). | -| `preEmphasisTauUS` | float | Pre-emphasis time constant. 
**Requires restart.** | - -#### Examples - -```bash -# Tune to 99.5 MHz -curl -X POST localhost:8088/config -d '{"frequencyMHz": 99.5}' - -# Switch to mono -curl -X POST localhost:8088/config -d '{"stereoEnabled": false}' - -# Update now-playing text -curl -X POST localhost:8088/config \ - -d '{"ps": "MYRADIO", "radioText": "Artist - Song Title"}' - -# Reduce power + disable limiter -curl -X POST localhost:8088/config \ - -d '{"outputDrive": 0.8, "limiterEnabled": false}' - -# Full update -curl -X POST localhost:8088/config -d '{ - "frequencyMHz": 101.3, - "outputDrive": 2.2, - "stereoEnabled": true, - "pilotLevel": 0.041, - "rdsInjection": 0.021, - "rdsEnabled": true, - "limiterEnabled": true, - "limiterCeiling": 1.0, - "ps": "PIRATE", - "radioText": "Broadcasting from the attic" -}' -``` - -#### Error handling - -Invalid values return `400 Bad Request` with a descriptive message: -```bash -curl -X POST localhost:8088/config -d '{"frequencyMHz": 200}' -# → 400: frequencyMHz out of range (65-110) -``` - ---- - -### `POST /tx/start` - -Start transmission. Requires `--tx` mode with hardware. - -**Response:** -```json -{"ok": true, "action": "started"} -``` - -**Errors:** -- `405` if not POST -- `503` if no TX controller (not in `--tx` mode) -- `409` if already running - ---- - -### `POST /tx/stop` - -Stop transmission. - -**Response:** -```json -{"ok": true, "action": "stopped"} -``` - ---- - -### `GET /dry-run` - -Generate a synthetic frame summary without hardware. Useful for config verification. - -**Response:** `FrameSummary` JSON with mode, rates, source info, preview samples. 
- ---- - -## Live update architecture - -All live updates are **lock-free** in the DSP path: - -| What | Mechanism | Latency | -|---|---|---| -| DSP params | `atomic.Pointer[LiveParams]` loaded once per chunk | ≤ 50ms | -| RDS text | `atomic.Value` in encoder, read at group boundary | ≤ 88ms | -| TX frequency | `atomic.Pointer` in engine, `driver.Tune()` between chunks | ≤ 50ms | - -No mutex, no channel, no allocation in the real-time path. The HTTP goroutine writes atomics, the DSP goroutine reads them. - -## Parameters that require restart - -These cannot be hot-reloaded (they affect DSP pipeline structure): - -- `compositeRateHz` — changes sample rate of entire DSP chain -- `deviceSampleRateHz` — changes hardware rate / upsampler ratio -- `maxDeviationHz` — changes FM modulator scaling -- `preEmphasisTauUS` — changes filter coefficients -- `rds.pi` / `rds.pty` — rarely change, baked into encoder init -- `audio.inputPath` — audio source selection -- `backend.kind` / `backend.device` — hardware selection - ---- - -### `POST /audio/stream` - -Push raw audio data into the live stream buffer. Format: **S16LE stereo PCM** at the configured `--audio-rate` (default 44100 Hz). - -Requires `--audio-stdin`, `--audio-http`, or another configured stream source to feed the buffer. - -**Request:** Binary body, `application/octet-stream`, raw S16LE stereo PCM bytes. 
- -**Response:** -```json -{ - "ok": true, - "frames": 4096, - "stats": { - "available": 12000, - "capacity": 131072, - "buffered": 0.09, - "bufferedDurationSeconds": 0.27, - "highWatermark": 15000, - "highWatermarkDurationSeconds": 0.34, - "written": 890000, - "underruns": 0, - "overflows": 0 - } -} -``` - -**Example:** -```bash -# Push a file -ffmpeg -i song.mp3 -f s16le -ar 44100 -ac 2 - | \ - curl -X POST --data-binary @- http://pluto:8088/audio/stream -``` - -**Errors:** -- `405` if not POST -- `503` if no audio stream configured - ---- - -## Audio Streaming - -### Stdin pipe (primary method) - -Pipe any audio source through ffmpeg into the transmitter: - -```bash -# Internet radio stream -ffmpeg -i "http://stream.example.com/radio.mp3" -f s16le -ar 44100 -ac 2 - | \ - fmrtx --tx --tx-auto-start --audio-stdin --config config.json - -# Local music file -ffmpeg -i music.flac -f s16le -ar 44100 -ac 2 - | \ - fmrtx --tx --tx-auto-start --audio-stdin - -# Playlist (ffmpeg concat) -ffmpeg -f concat -i playlist.txt -f s16le -ar 44100 -ac 2 - | \ - fmrtx --tx --tx-auto-start --audio-stdin - -# PulseAudio / ALSA capture (Linux) -parecord --format=s16le --rate=44100 --channels=2 - | \ - fmrtx --tx --tx-auto-start --audio-stdin - -# Custom sample rate (e.g. 48kHz source) -ffmpeg -i source.wav -f s16le -ar 48000 -ac 2 - | \ - fmrtx --tx --tx-auto-start --audio-stdin --audio-rate 48000 -``` - -### HTTP audio push - -Push audio from a remote machine via the HTTP API. Run the server with `--audio-http` (and typically `--tx`/`--tx-auto-start`) so the `/audio/stream` endpoint is available. - -```bash -# From another machine on the network -ffmpeg -i music.mp3 -f s16le -ar 44100 -ac 2 - | \ - curl -X POST --data-binary @- http://pluto-host:8088/audio/stream -``` - -### Audio buffer - -The stream uses a lock-free ring buffer (default: 2 seconds at input rate). 
Buffer stats are available in `GET /runtime` under `audioStream`:
-
-```json
-{
-  "audioStream": {
-    "available": 12000,
-    "capacity": 131072,
-    "buffered": 0.09,
-    "bufferedDurationSeconds": 0.27,
-    "highWatermark": 15000,
-    "highWatermarkDurationSeconds": 0.34,
-    "written": 890000,
-    "underruns": 0,
-    "overflows": 0
-  }
-}
-```
-
-- **underruns**: DSP consumed faster than audio arrived (silence inserted)
-- **overflows**: Audio arrived faster than DSP consumed (data dropped)
-- **buffered**: Fill ratio (0.0 = empty, 1.0 = full)
-- **bufferedDurationSeconds**: Approximate seconds of audio queued in the buffer (`available` frames divided by the sample rate)
-- **highWatermark**: Highest observed buffer occupancy (frames) since the buffer was created
-- **highWatermarkDurationSeconds**: Equivalent peak time (`highWatermark` frames divided by the sample rate)
-
-When no audio is streaming, the transmitter falls back to the configured tone generator or silence.
+# fm-rds-tx HTTP Control API
+
+Base URL: `http://{listenAddress}` (default `127.0.0.1:8088`)
+
+---
+
+## Endpoints
+
+### `GET /healthz`
+
+Health check.
+
+**Response:**
+```json
+{"ok": true}
+```
+
+Returns `200 OK` with this body whenever the HTTP control server is reachable; it does not reflect the TX engine state (use `GET /runtime` for that).
+
+
+---
+
+### `GET /status`
+
+Current transmitter status (read-only snapshot). Runtime indicator, alert, and queue stats from the running TX controller are mirrored here for quick health checks.
+ +**Response:** +```json +{ + "service": "fm-rds-tx", + "backend": "pluto", + "frequencyMHz": 100.0, + "stereoEnabled": true, + "rdsEnabled": true, + "preEmphasisTauUS": 50, + "limiterEnabled": true, + "fmModulationEnabled": true, + "runtimeIndicator": "normal", + "runtimeAlert": "", + "queue": { + "capacity": 3, + "depth": 1, + "fillLevel": 0.33, + "health": "low" + } +} +``` + +`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", but late-buffer alerts expire after a few seconds once cycle times settle so the signal doesn't stay stuck on degraded. The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start for post-mortem diagnosis. + +--- + +### `GET /runtime` + +Live engine and driver telemetry. Only populated when TX is active. 
+ +**Response:** +```json +{ + "engine": { + "state": "running", + "runtimeStateDurationSeconds": 12.4, + "chunksProduced": 12345, + "totalSamples": 1408950000, + "underruns": 0, + "lastError": "", + "uptimeSeconds": 3614.2, + "faultCount": 2, + "lastFault": { + "time": "2026-04-06T00:00:00Z", + "reason": "queueCritical", + "severity": "faulted", + "message": "queue health critical for 5 checks" + }, + "faultHistory": [ + { + "time": "2026-04-06T00:00:00Z", + "reason": "queueCritical", + "severity": "faulted", + "message": "queue health critical for 5 checks" + } + ], + "transitionHistory": [ + { + "time": "2026-04-06T00:00:00Z", + "from": "running", + "to": "degraded", + "severity": "warn" + } + ] + }, + "driver": { + "txEnabled": true, + "streamActive": true, + "framesWritten": 12345, + "samplesWritten": 1408950000, + "underruns": 0, + "underrunStreak": 0, + "maxUnderrunStreak": 0, + "effectiveSampleRateHz": 2280000 + } +} +``` +`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. + +`runtimeStateDurationSeconds` sagt, wie viele Sekunden die Engine bereits im aktuellen Runtime-Zustand verweilt. So erkennt man schnell, ob `muted`/`degraded` zu lange dauern oder ob ein Übergang gerade frisch begonnen hat. + +`transitionHistory` liefert die jüngsten Übergänge (from/to, severity, timestamp) damit API und UI die Runtime History synchronisieren können. + +`driver.underrunStreak` reports how many consecutive reads returned silence, and `driver.maxUnderrunStreak` captures the longest such run since the engine started. Together they help differentiate short glitches from persistent underrun storms and can be plotted alongside queue health sparkline telemetry. 
+ + +--- + +### `POST /runtime/fault/reset` + +Manually acknowledge a `faulted` runtime state so the supervisor can re-enter the recovery path (the engine moves back to `degraded` once the reset succeeds). + +**Response:** +```json +{"ok": true} +``` + +**Errors:** +- `405 Method Not Allowed` if the request is not a POST +- `503 Service Unavailable` when no TX controller is attached (`--tx` mode not active) +- `409 Conflict` when the engine is not currently faulted or the reset was rejected (e.g. still throttled) + +--- + +### `GET /config` + +Full current configuration (all fields, including non-patchable). + +**Response:** Complete `Config` JSON object. + +--- + +### `POST /config` + +**Live parameter update.** Changes are applied to the running TX engine immediately — no restart required. Only include fields you want to change (PATCH semantics). + +The control snapshot (GET /config) only reflects new values once they pass validation and, if the TX engine is running, after the live update succeeded. That keeps the API from reporting desired values that were rejected or still pending. + +**Request body:** JSON with any subset of patchable fields. + +**Content-Type:** `application/json` (charset parameters allowed). Requests without it are rejected with 415 Unsupported Media Type. + +**Response:** +```json +{"ok": true, "live": true} +``` + +`"live": true` = changes were forwarded to the running engine. +`"live": false` = engine not active, changes saved for next start. + +#### Patchable fields — DSP (applied within ~50ms) + +| Field | Type | Range | Description | +|---|---|---|---| +| `frequencyMHz` | float | 65–110 | TX center frequency. Tunes hardware LO live. | +| `outputDrive` | float | 0–10 | Composite output level multiplier (empfohlen 1..4). | +| `stereoEnabled` | bool | | Enable/disable stereo (pilot + 38kHz subcarrier). | +| `pilotLevel` | float | 0–0.2 | 19 kHz pilot injection level. 
| +| `rdsInjection` | float | 0–0.15 | 57 kHz RDS subcarrier injection level. | +| `rdsEnabled` | bool | | Enable/disable RDS subcarrier. | +| `limiterEnabled` | bool | | Enable/disable MPX peak limiter. | +| `limiterCeiling` | float | 0–2 | Limiter ceiling (max composite amplitude). | + +#### Patchable fields — RDS text (applied within ~88ms) + +| Field | Type | Max length | Description | +|---|---|---|---| +| `ps` | string | 8 chars | Program Service name (station name on receiver display). | +| `radioText` | string | 64 chars | RadioText message (scrolling text on receiver). | + +When `radioText` is updated, the RDS A/B flag toggles automatically per spec, signaling receivers to refresh their display. + +#### Patchable fields — other (saved, not live-applied) + +| Field | Type | Description | +|---|---|---| +| `toneLeftHz` | float | Left tone frequency (test generator). | +| `toneRightHz` | float | Right tone frequency (test generator). | +| `toneAmplitude` | float | Test tone amplitude (0–1). | +| `preEmphasisTauUS` | float | Pre-emphasis time constant. 
**Requires restart.** | + +#### Examples + +```bash +# Tune to 99.5 MHz +curl -X POST localhost:8088/config -d '{"frequencyMHz": 99.5}' + +# Switch to mono +curl -X POST localhost:8088/config -d '{"stereoEnabled": false}' + +# Update now-playing text +curl -X POST localhost:8088/config \ + -d '{"ps": "MYRADIO", "radioText": "Artist - Song Title"}' + +# Reduce power + disable limiter +curl -X POST localhost:8088/config \ + -d '{"outputDrive": 0.8, "limiterEnabled": false}' + +# Full update +curl -X POST localhost:8088/config -d '{ + "frequencyMHz": 101.3, + "outputDrive": 2.2, + "stereoEnabled": true, + "pilotLevel": 0.041, + "rdsInjection": 0.021, + "rdsEnabled": true, + "limiterEnabled": true, + "limiterCeiling": 1.0, + "ps": "PIRATE", + "radioText": "Broadcasting from the attic" +}' +``` + +#### Error handling + +Invalid values return `400 Bad Request` with a descriptive message: +```bash +curl -X POST localhost:8088/config -d '{"frequencyMHz": 200}' +# → 400: frequencyMHz out of range (65-110) +``` + +--- + +### `POST /tx/start` + +Start transmission. Requires `--tx` mode with hardware. + +**Response:** +```json +{"ok": true, "action": "started"} +``` + +**Errors:** +- `405` if not POST +- `503` if no TX controller (not in `--tx` mode) +- `409` if already running + +--- + +### `POST /tx/stop` + +Stop transmission. + +**Response:** +```json +{"ok": true, "action": "stopped"} +``` + +--- + +### `GET /dry-run` + +Generate a synthetic frame summary without hardware. Useful for config verification. + +**Response:** `FrameSummary` JSON with mode, rates, source info, preview samples. 
+ +--- + +## Live update architecture + +All live updates are **lock-free** in the DSP path: + +| What | Mechanism | Latency | +|---|---|---| +| DSP params | `atomic.Pointer[LiveParams]` loaded once per chunk | ≤ 50ms | +| RDS text | `atomic.Value` in encoder, read at group boundary | ≤ 88ms | +| TX frequency | `atomic.Pointer` in engine, `driver.Tune()` between chunks | ≤ 50ms | + +No mutex, no channel, no allocation in the real-time path. The HTTP goroutine writes atomics, the DSP goroutine reads them. + +## Parameters that require restart + +These cannot be hot-reloaded (they affect DSP pipeline structure): + +- `compositeRateHz` — changes sample rate of entire DSP chain +- `deviceSampleRateHz` — changes hardware rate / upsampler ratio +- `maxDeviationHz` — changes FM modulator scaling +- `preEmphasisTauUS` — changes filter coefficients +- `rds.pi` / `rds.pty` — rarely change, baked into encoder init +- `audio.inputPath` — audio source selection +- `backend.kind` / `backend.device` — hardware selection + +--- + +### `POST /audio/stream` + +Push raw audio data into the live stream buffer. Format: **S16LE stereo PCM** at the configured `--audio-rate` (default 44100 Hz). + +Requires `--audio-stdin`, `--audio-http`, or another configured stream source to feed the buffer. + +**Request:** Binary body, `application/octet-stream`, raw S16LE stereo PCM bytes. Set `Content-Type` to `application/octet-stream` or `audio/L16`; other media types are rejected. 
+ +**Response:** +```json +{ + "ok": true, + "frames": 4096, + "stats": { + "available": 12000, + "capacity": 131072, + "buffered": 0.09, + "bufferedDurationSeconds": 0.27, + "highWatermark": 15000, + "highWatermarkDurationSeconds": 0.34, + "written": 890000, + "underruns": 0, + "overflows": 0 + } +} +``` + +**Example:** +```bash +# Push a file +ffmpeg -i song.mp3 -f s16le -ar 44100 -ac 2 - | \ + curl -X POST -H "Content-Type: application/octet-stream" --data-binary @- http://pluto:8088/audio/stream +``` + +**Errors:** +- `405` if not POST +- `415` if Content-Type is missing or unsupported (must be `application/octet-stream` or `audio/L16`) +- `503` if no audio stream configured + +--- + +## Audio Streaming + +### Stdin pipe (primary method) + +Pipe any audio source through ffmpeg into the transmitter: + +```bash +# Internet radio stream +ffmpeg -i "http://stream.example.com/radio.mp3" -f s16le -ar 44100 -ac 2 - | \ + fmrtx --tx --tx-auto-start --audio-stdin --config config.json + +# Local music file +ffmpeg -i music.flac -f s16le -ar 44100 -ac 2 - | \ + fmrtx --tx --tx-auto-start --audio-stdin + +# Playlist (ffmpeg concat) +ffmpeg -f concat -i playlist.txt -f s16le -ar 44100 -ac 2 - | \ + fmrtx --tx --tx-auto-start --audio-stdin + +# PulseAudio / ALSA capture (Linux) +parecord --format=s16le --rate=44100 --channels=2 - | \ + fmrtx --tx --tx-auto-start --audio-stdin + +# Custom sample rate (e.g. 48kHz source) +ffmpeg -i source.wav -f s16le -ar 48000 -ac 2 - | \ + fmrtx --tx --tx-auto-start --audio-stdin --audio-rate 48000 +``` + +### HTTP audio push + +Push audio from a remote machine via the HTTP API. Run the server with `--audio-http` (and typically `--tx`/`--tx-auto-start`) so the `/audio/stream` endpoint is available. 
+ +```bash +# From another machine on the network +ffmpeg -i music.mp3 -f s16le -ar 44100 -ac 2 - | \ + curl -X POST -H "Content-Type: application/octet-stream" --data-binary @- http://pluto-host:8088/audio/stream +``` + +### Audio buffer + +The stream uses a lock-free ring buffer (default: 2 seconds at input rate). Buffer stats are available in `GET /runtime` under `audioStream`: + +```json +{ + "audioStream": { + "available": 12000, + "capacity": 131072, + "buffered": 0.09, + "bufferedDurationSeconds": 0.27, + "highWatermark": 15000, + "highWatermarkDurationSeconds": 0.34, + "written": 890000, + "underruns": 0, + "overflows": 0 + } +} +``` + +- **underruns**: DSP consumed faster than audio arrived (silence inserted) +- **overflows**: Audio arrived faster than DSP consumed (data dropped) +- **buffered**: Fill ratio (0.0 = empty, 1.0 = full) +- **bufferedDurationSeconds**: Approximate seconds of audio queued in the buffer (`available` frames divided by the sample rate) +- **highWatermark**: Highest observed buffer occupancy (frames) since the buffer was created +- **highWatermarkDurationSeconds**: Equivalent peak time (`highWatermark` frames divided by the sample rate) + +When no audio is streaming, the transmitter falls back to the configured tone generator or silence. diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index de45aa1..304d61f 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -1,576 +1,577 @@ -# Pro Runtime Hardening Workboard - -Status: living document -Branch: `feature/pro-runtime-hardening` - -Dieses Dokument ist das **Arbeitsdokument** zur schrittweisen Umsetzung des Konzepts aus `fm-rds-tx_pro_runtime_hardening_concept.json`. 
- -Ziel ist **nicht** nur eine hübsche Roadmap, sondern ein Ort, an dem wir konkret markieren können: -- **wo** wir im Code stehen, -- **welche Lücken** bestätigt sind, -- **welche Entscheidungen** gefallen sind, -- **welche Arbeiten** offen / in Arbeit / erledigt sind, -- **welche Risiken** noch bestehen, -- **welche Akzeptanzkriterien** wirklich nachgewiesen wurden. - ---- - -## 1. Arbeitsregeln für dieses Dokument - -### Statuswerte -- `TODO` → noch nicht begonnen -- `IN PROGRESS` → aktiv in Arbeit -- `BLOCKED` → sinnvoll erkannt, aber blockiert -- `DONE` → umgesetzt -- `VERIFIED` → umgesetzt **und** sinnvoll geprüft -- `DEFERRED` → bewusst nach hinten verschoben -- `REJECTED` → bewusst verworfen - -### Nachweispflicht -Ein Punkt gilt erst als wirklich fertig, wenn eingetragen ist: -1. **Code-Ort(e)** -2. **Was geändert wurde** -3. **Wie verifiziert wurde** -4. **Welche Restrisiken bleiben** - -### Update-Regel -Wenn wir an einem Workstream arbeiten, soll dieses Dokument mitgezogen werden. -Kein „ist im Kopf klar“. Der Stand kommt hier rein. - ---- - -## 2. 
Gesamtüberblick - -## Gesamtstatus -- Projektphase: `Umsetzung (WS-01)` -- Technischer Fokus aktuell: `Entkoppelter TX-Pfad (FrameQueue + Writer)` -- Nächster sinnvoller Startpunkt laut Konzept: `WS-01 Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer` -- Vorangegangene Workstreams: `WS-03 Semantische Korrektheit und konsistent angewandte Config` (abgeschlossen) - -## Repo-bezogene bestätigte Ausgangslage - -| Thema | Status | Notiz | -|---|---|---| -| TX-Engine aktuell als synchroner Single-Loop | CONFIRMED | `internal/app/engine.go` | -| Persistenter DSP-Zustand im Generator vorhanden | CONFIRMED | `internal/offline/generator.go` | -| HTTP-Control vorhanden | CONFIRMED | `internal/control/control.go` | -| Config-Validation vorhanden, aber nicht überall semantisch konsistent | CONFIRMED | `internal/config/config.go` + Runtime-Pfade | -| Device/Capability-Modell vorhanden, aber noch nicht streng genug | CONFIRMED | `internal/platform/soapy.go` | -| Lock-freier SPSC-Audio-Ringbuffer vorhanden | CONFIRMED | `internal/audio/stream.go` | - -## Bereits bekannte bestätigte Inkonsistenzen - -| ID | Status | Beschreibung | Ort | -|---|---|---|---| -| CFG-SEM-001 | CONFIRMED | `fm.outputDrive` wird in Validation und Runtime nicht konsistent behandelt | `internal/config/config.go`, `internal/app/engine.go` | -| CTL-UX-001 | RESOLVED | `handleAudioStream()` beschreibt `--audio-http`; der CLI-Schalter ist nun vorhanden und setzt den Stream-Puffer für `/audio/stream` direkt. | `internal/control/control.go`, `cmd/fmrtx/main.go` | - ---- - -## 3. Prioritätenmodell - -| Priorität | Bedeutung | -|---|---| -| P0 | Technische Perfektion und Determinismus | -| P1 | Betriebssicherheit und Fehlerbeherrschung | -| P2 | Hardware-Wahrheit und RF-Qualität | -| P3 | Sichere und saubere Runtime-Steuerung | -| P4 | Deployment-, Release- und Service-Reife | - ---- - -## 4. 
Umsetzungstracker nach Workstream - -# WS-03 — Semantische Korrektheit und harte Config-/Runtime-Konsistenz -**Priorität:** P0 -**Gesamtstatus:** IN PROGRESS - -## Ziel -Ein einziger, eindeutig definierter Parameterraum. Jeder Wert hat exakt eine Bedeutung und identische Constraints in Config, HTTP-API, Runtime und Telemetrie. - -## Warum dieser Workstream zuerst -Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere Runtime- und Fault-Mechanismen auf unstabilem Fundament. - -## Aufgaben - -### WS-03-T1 — Parameterinventar erstellen -- **Status:** VERIFIED -- **Owner:** Builder A -- **Code-Orte:** - - `internal/config/config.go` - - `internal/app/engine.go` - - `internal/control/control.go` - - `internal/offline/generator.go` -- **Ziel:** - Alle öffentlich und intern verwendeten Parameter inventarisieren mit: - - Name - - Typ - - Einheit - - Bereich - - Default - - hot-reload-fähig ja/nein - - safety class - - Telemetrie-Name -- **Offene Fragen:** - - Wo leben heute implizite Parameter, die nicht sauber dokumentiert sind? - - Welche Runtime-Werte sind abgeleitet statt direkt konfigurierbar? -- **Nachweis:** - - `docs/ws-03-parameter-inventory.md` enthält das inventarisierte Parameter-Tableau und referenziert Config/Control/Engine. - - Live-Nutzung über `internal/control/control.go` → `LivePatch` dokumentiert. -- **Restrisiken:** - - versteckte Semantik in Helper-Funktionen übersehen - -### WS-03-T2 — Validation vereinheitlichen -- **Status:** VERIFIED -- **Owner:** Builder A -- **Code-Orte:** - - `internal/config/config.go` - - `internal/app/engine.go` - - `internal/app/engine_test.go` - - `internal/control/control.go` -- **Ziel:** - `Config.Validate()`, Runtime-Update-Pfade und API-Patch-Validierung dürfen nicht divergieren. -- **Bereits bekannter Startpunkt:** - - `fm.outputDrive` -- **Nachweis:** - - CFG-SEM-001: `outputDrive`-Validation in `Engine.UpdateConfig` jetzt 0..10 (wie `Config.Validate`). 
- - Tests (`go test ./...`) fangen neue Range ab und besitzen aktualisierten `engine_test`-Check. - - Live-Patch fließt durch `txBridge` und `LivePatch` (control) → `LiveConfigUpdate`. -- **Restrisiken:** - - weitere Inkonsistenzen erst beim Inventar sichtbar - -### WS-03-T3 — DesiredConfig / AppliedConfig einführen -- **Status:** IN PROGRESS -- **Owner:** Lead Coderaffe -- **Code-Orte:** - - `internal/app/engine.go` - - `internal/control/control.go` - - ggf. Config-/Statusmodelle -- **Ziel:** - API und Runtime sollen trennen zwischen: - - gewünschter Konfiguration - - tatsächlich angewandter Konfiguration - - aktuellem Runtime-Zustand -- **Nachweis:** - - `internal/control/control.go` wartet mit Snapshot-Updates, bis LivePatch erfolgreich war. - - `internal/control/control_test.go` deckt ab, dass abgelehnte Live-Updates keine neue `GET /config`-Ansicht schreiben. -- **Restrisiken:** - - Die API liefert noch nicht beide Sichten gleichzeitig; weitere Workstreams müssen Desired/Applied explizit zurückgeben. - -## WS-03 Entscheidungslog -| Datum | Entscheidung | Notiz | -|---|---|---| -| 2026-04-05 | CFG-SEM-001: `fm.outputDrive` | Live-Validierung auf 0..10 angeglichen, Tests angepasst, Parameterinventar dokumentiert. | -| 2026-04-05 | WS-03-T3: Desired/Applied-Gate | Control-API zeigt Snapshots nur noch, wenn LivePatch erfolgreich angewendet wurde; Tests verhindern irreführende Wunschwerte. | - -## WS-03 Verifikation -| Datum | Fokus | Ergebnis | -|---|---|---| -| 2026-04-05 | `go test ./...` | ✅ Bestätigt `Engine.UpdateConfig`, `LivePatch` und Parameter-Range sowie Inventar-Dokumentation. Neue Control-Tests sichern Desired/Applied-Gate. | - ---- - -# WS-01 — Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer -**Priorität:** P0 -**Gesamtstatus:** IN PROGRESS - -## Ziel -Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, kontrolliertem Frame-Puffer betrieben. 
- -## Aktueller Stand -- Der TX-Pfad ist laut Konzept aktuell noch synchron gekoppelt: - `GenerateFrame -> optional FMUpsampler.Process -> driver.Write` -- Das ist elegant, aber nicht pro-level-hart gegenüber Write-Spikes und Blockaden. - -## Aufgaben - -### WS-01-T1 — FrameQueue einführen -- **Status:** VERIFIED -- **Owner:** Lead Coderaffe -- **Code-Orte:** - - `internal/output/frame_queue.go` - - `internal/output/frame_queue_test.go` - - `internal/app/engine.go` -- **Ziel:** - Bounded Queue mit fester Kapazität, sichtbarem Füllstand, Counter- / Statistikzugriff und klarer Trennung zwischen Generator und Writer. -- **Zu entscheiden:** - - Puffern vor oder nach Upsampling → Device-Frame-Ebene (Queue lebt nach dem Upsampler) für Writer-Simplifizierung. - - Referenzkapazität: `runtime.frameQueueCapacity` (default 3) bleibt konfigurierbar. -- **Akzeptanzpunkte:** - - Keine unbounded Queue. - - Fill-Level (High/Low) ist aus `QueueStats` sichtbar. - - Queue-Health-Indikator (`queue.health`) liefert `critical`, `low` oder `normal` aus dem Fill-Level. EngineStats.`queue` zeigt den Status ebenfalls. - - Drop/Repeat/Mute-Counter sind vorhanden und testbar. -- **Nachweis:** - - `FrameQueue`-Implementierung (`internal/output/frame_queue.go`) liefert kapazitätsgesteuerte Push/Pop-Logik und Counters. - - Engine-Run nutzt Queue vor dem Writer und zeigt `QueueStats` in `EngineStats`. - - Tests (`internal/output/frame_queue_test.go` + `go test ./...`) decken Push/Pop, Timeout-Counters, Stats und den neuen Queue-Health-Indikator ab. -- **Restrisiken:** - - Die Queue wird aktuell synchron getrieben; ein dedizierter Writer-Worker fehlt noch. - - Queue-Close erwartet, dass Generator/Writer vor dem Schließen stoppen, sonst droht Panik beim Schreiben. 
- -### WS-01-T2 — Writer-Worker einführen -- **Status:** VERIFIED -- **Owner:** Lead Coderaffe -- **Code-Orte:** - - `internal/app/engine.go` (run loop, `writerLoop`, `cloneFrame`, Stats) - - `internal/dsp/*` (FMUpsampler / Resampler copy `GeneratedAt` für Cycle-Metriken) -- **Ziel:** - Generator/Upsampler liefern Frames in die FrameQueue, `driver.Write()` läuft nur noch im dedizierten Writer. -- **Akzeptanzpunkte:** - - `writerLoop()` ist die einzige Stelle mit `driver.Write()` und zieht aus der Queue. - - FrameQueue ist ein echter Puffer (Generator klont Frames, Writer poppt) und `EngineStats.Queue` zeigt den Füllstand. - - Write- und Cycle-Latenzen plus `LateBuffers` bleiben in `EngineStats` sichtbar (`MaxWriteMs`, `LateBuffers`, `MaxCycleMs`). -- **Nachweis:** - - `go test ./...` (Engine + Queue + DSP) läuft erfolgreich. - - `EngineStats` berichtet weiterhin über Queue-/Writer-Metriken. -- **Restrisiken:** - - Frame-Klonierung pro Chunk erhöht Heap-Pressure; spätere Workstreams sollten Pooling / Zero-Copy prüfen. - -### WS-01-T3 — Supervisor-Schicht einführen -- **Status:** TODO -- **Owner:** offen -- **Code-Orte:** - - `internal/app/engine.go` -- **Ziel:** - Queue-Füllstand, Late-Rate und Fehlerhäufigkeit überwachen und in Runtime-Zustände überführen. -- **Akzeptanzpunkte:** - - State-Entscheidungen sind explizit - - kein implizites Weiterwursteln bei Schieflage - -## Offene Architekturfragen -- Ist `capacity_frames = 3` ein guter Startwert oder nur Konzept-Default? -- Sollte im Fault-Fall `repeat last safe frame` erlaubt sein oder von Anfang an nur `mute`? -- Wie eng koppeln wir WS-01 mit WS-02, ohne Overengineering zu erzeugen? - -## WS-01 Entscheidungslog -| Datum | Entscheidung | Notiz | -|---|---|---| -| 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. 
| -| 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurück und `txBridge` leitet `EngineStats.Queue` ins `/runtime`-JSON. | -| 2026-04-05 | Runtime-Indikator | `EngineStats.RuntimeIndicator` kombiniert `queue.health` + `lateBuffers`, `/runtime` zeigt `engine.runtimeIndicator`. | -| 2026-04-05 | /status runtime indicator | `/status` reuses `txBridge.TXStats()` and now reports `runtimeIndicator` alongside the config snapshot for quick ops. | -| 2026-04-05 | /status queue stats | `/status` spiegelt das `queue`-Objekt aus `txBridge.TXStats()` für schnelle Queue-Checks, API-Doku und `TestStatusReportsQueueStats` fangen den neuen Key ab. | - -## WS-01 Verifikation -| Datum | Fokus | Ergebnis | -|---|---|---| -| 2026-04-05 | FrameQueue + Engine integration | ✅ `go test ./...` (im `internal`-Modul incl. `frame_queue_test.go`) | -| 2026-04-05 | Queue-Health-Indikator | go test ./... deckt `TestFrameQueueHealthIndicator` und `queue.health` ab. | -| 2026-04-05 | Runtime-Indikator | OK `go test ./...` deckt `runtimeIndicator` sowie `/runtime`-Exposition von `engine.runtimeIndicator`. | -| 2026-04-05 | Runtime API queue health | ✅ `/runtime` liefert jetzt `engine.queue.health` dank `txBridge.TXStats`. | -| 2026-04-05 | /status runtime indicator | ✅ `/status` gibt jetzt `runtimeIndicator` aus (`control_test` deckt den neuen Key). | -| 2026-04-05 | /status queue stats | ✅ `TestStatusReportsQueueStats` plus `docs/API.md` zeigen, dass `queue` korrekt durchgereicht wird. | - ---- - -# WS-02 — Explizite Runtime-State-Maschine und Fault-Handling -**Priorität:** P0 -**Gesamtstatus:** IN PROGRESS - -## Ziel -Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständen. - -## Fortschritt -- EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. 
späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. -- `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. -- `evaluateRuntimeState` now waits for a short healthy streak before leaving `muted`, logging a degraded-severity recovery event once the queue settles. -- Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable. -- `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`. -- Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs. -- Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path. -- Control-plane UI now also offers a Danger Zone `Reset Fault` button that calls the same endpoint so operators can acknowledge faults from the dashboard. - -- Control-plane UI now posts an ops toast/log entry whenever the runtime state shifts so escalations and manual acknowledgements are immediately visible. -- Control-plane UI now keeps a compact Transition History panel beside the Fault History so operators can see recent runtime shifts without scrolling the activity log. 
- - -## Zielzustände laut Konzept -- `idle` -- `arming` -- `prebuffering` -- `running` -- `degraded` -- `muted` -- `faulted` -- `stopping` - -## Aufgaben - -### WS-02-T1 — Fault-Klassifikation definieren -- **Status:** TODO -- **Owner:** offen -- **Beispiele:** - - Treiberfehler - - Write-Time-Budget überschritten - - Queue leer - - Queue dauerhaft kritisch - - Selbsttest fehlgeschlagen - - unerlaubtes Live-Update - -### WS-02-T2 — Reaktionsstrategie definieren -- **Status:** TODO -- **Owner:** offen -- **Ziel:** - Pro Fehlerklasse klar definieren: - - warn only - - degraded - - muted - - faulted - -### WS-02-T3 — Fault-Historie und Event-Log einführen -- **Status:** TODO -- **Owner:** offen -- **Ziel:** - Zustandswechsel und Faults auditierbar machen. - -## Offene Designfragen -- Wie fein granular darf die State-Maschine werden, ohne unwartbar zu werden? -- Welche Transitionen sind wirklich produktiv relevant und welche nur „theoretisch schön“? - -## WS-02 Entscheidungslog -| Datum | Entscheidung | Notiz | -|---|---|---| -| 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. | -| 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. | -| 2026-04-05 | Fault-reset UI shortcut | Danger Zone now hosts a Reset Fault button wired to `/runtime/fault/reset` so operators get an in-app acknowledgement path without manual HTTP calls. | -| 2026-04-06 | Runtime transition visibility cue | Control UI now posts toast/log entries for runtime state shifts so ops instantly sees escalations and manual reset acknowledgements. | -| 2026-04-06 | Transition history panel | Added a compact Transition History panel next to the Fault History so the last few runtime state shifts stay visible even when the activity log is full. 
| - -## WS-02 Verifikation -| Datum | Fokus | Ergebnis | -|---|---|---| -| 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. | -| 2026-04-05 | Runtime fault reset API | `go test ./...` now runs `TestRuntimeFaultReset*`, verifying the new HTTP path and controller error scenarios. | -| 2026-04-06 | Runtime transition visibility | ✅ `go test ./...`; manual UI smoke verification still pending to ensure the toast/log flow shows every runtime shift. | - ---- - -# WS-04 — Observability, Telemetrie und Diagnosefähigkeit -**Priorität:** P1 -**Gesamtstatus:** TODO - -## Ziel -Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests und API-Aktivität schaffen. - -## Aufgaben - -### WS-04-T1 — Strukturiertes Logging -- **Status:** TODO -- **Owner:** offen - -### WS-04-T2 — Prometheus-/Metrics-Schicht -- **Status:** TODO -- **Owner:** offen - -### WS-04-T3 — Debug-/Profiling-Endpunkte -- **Status:** TODO -- **Owner:** offen - -## Gewünschte Beispielmetriken -- `engine_chunks_generated_total` -- `engine_late_buffers_total` -- `engine_fault_transitions_total` -- `writer_write_duration_seconds` -- `queue_fill_ratio` -- `queue_dropped_frames_total` -- `queue_muted_frames_total` -- `driver_write_errors_total` -- `audio_stream_underruns_total` -- `audio_stream_overflows_total` -- `rf_selftest_pilot_db` -- `rf_selftest_rds_57k_db` - -## WS-04 Entscheidungslog -| Datum | Entscheidung | Notiz | -| --- | --- | --- | -| 2026-04-06 | High-watermark trend sparkline | Captured audio high-watermark duration history and surface it as a new Health-panel sparkline for queue pressure visibility. 
| -| 2026-04-06 | Queue fill visibility | Added queue fill ratio health line and sparklines to highlight real-time queue pressure alongside high-watermark trends. | -| 2026-04-07 | Underrun streak telemetry | StreamStats now expose current and max underrun streak counters so queue diagnostics can see repeated underruns without touching the metrics stack. | - -## WS-04 Verifikation -| Datum | Fokus | Ergebnis | -| --- | --- | --- | -| 2026-04-06 | High-watermark trend sparkline | `go test ./...` plus manual UI check confirm the new sparkline updates with runtime audio stats. | -| 2026-04-06 | Queue fill visibility | `go test ./...` plus UI smoke check confirm queue fill stats stay available and the new sparkline/health line react to queue health changes. | -| 2026-04-07 | Underrun streak telemetry | `go test ./internal/audio` confirms the new streak counters plus Stats coverage so the API surfaces the same names. | - ---- - -# WS-05 — Sichere und erwachsene Control-Plane -**Priorität:** P1 / P3-nah -**Gesamtstatus:** TODO - -## Ziel -API transport- und anwendungsseitig härten, state-aware machen und auditierbar gestalten. - -## Aufgaben - -### WS-05-T1 — Auth und Deploy-Modi definieren -- **Status:** TODO -- **Owner:** offen -- **Zielmodi:** - - localhost-only - - trusted-lan - - secured-remote - -### WS-05-T2 — HTTP-Server härten -- **Status:** TODO -- **Owner:** offen -- **Mindestpunkte:** - - ReadTimeout - - WriteTimeout - - IdleTimeout - - ReadHeaderTimeout - - Body-Size-Limits - - Content-Type-Validierung - - Method Enforcement - -### WS-05-T3 — API semantisch aufräumen -- **Status:** TODO -- **Owner:** offen -- **Ziel:** - - DesiredConfig vs AppliedConfig vs RuntimeState - - idempotente Start/Stop-Endpunkte - - transaktionsartige Apply-/Reject-Antworten - - Audit-Log pro Eingriff - -## Frühe Quick-Wins -Diese Punkte könnten ggf. 
vorgezogen werden, auch wenn WS-05 formal nach WS-01/02 kommt: -- HTTP-Timeouts -- Body-Limits -- sicherer Standard-Bind-Modus - -## WS-05 Entscheidungslog -- Noch leer - -## WS-05 Verifikation -| Datum | Fokus | Ergebnis | -|---|---|---| -| 2026-04-05 | `/audio/stream` rejects non-POST requests | `TestAudioStreamRejectsNonPost` enforces POST-only access to `/audio/stream` before a stream source is configured | - ---- - -# WS-06 — Hardware-in-the-loop und externe RF-Wahrheitsprüfung -**Priorität:** P2 -**Gesamtstatus:** TODO - -## Ziel -Nicht nur intern richtig rechnen, sondern extern nachweisen, dass tatsächlich korrekt gesendet wird. - -## Status -- Konzept vorhanden -- noch kein eingetragener HIL-Arbeitsstand in diesem Dokument - -## Offene Kernfragen -- Welches Referenz-Setup wird verbindlich? -- Welche Testfrequenz / Standarddauer / Schutzmaßnahmen gelten? -- Welcher externe Decoder / Empfänger gilt als Referenz? - ---- - -# WS-07 — Device-aware Capability- und Kalibrierungsmodell -**Priorität:** P2 -**Gesamtstatus:** TODO - -## Ziel -Fähigkeiten und Kalibrierungen nicht implizit, sondern explizit pro Device modellieren. - -## Noch offen -- Capability-Schema konkretisieren -- Kalibrierungsprofil definieren -- Device-aware Validation einbauen - ---- - -# WS-08 — Signal-Selbstüberwachung im Betrieb -**Priorität:** P2 -**Gesamtstatus:** TODO - -## Ziel -Pilot, Stereo, RDS und Composite-Anomalien im Betrieb erkennen. - -## Noch offen -- Goertzel/FFT-Strategie festlegen -- Schwellwerte definieren -- in Fault-Logik einspeisen - ---- - -# WS-09 — Teststrategie erweitern -**Priorität:** P3/P4-nah -**Gesamtstatus:** TODO - -## Ziel -Von Unit-Tests zu echter Qualitätsabsicherung: Golden Vectors, Long-Run, Race, Fuzzing, API-Mutation, HIL. 
- -## Noch offen -- Testpyramide konkretisieren -- Nightly-/CI-Fähigkeit bestimmen - ---- - -# WS-10 — Service-Reife, Packaging und Reproduzierbarkeit -**Priorität:** P4 -**Gesamtstatus:** TODO - -## Ziel -Build-, Release- und Betriebsartefakte reproduzierbar und teamtauglich machen. - -## Noch offen -- Build-Metadaten -- Service-Units -- Config-Versionierung / Migration - ---- - -## 5. Übergreifende Regeln - -### Musts -- Jeder neue Runtime-Zustand muss per API und Telemetrie sichtbar sein. -- Jede Recovery-, Drop- oder Mute-Strategie braucht Counter, Logs und Tests. -- Keine neue Config-Option ohne klaren Typ, Bereich, Einheit, Default und Hot-Reload-Klassifikation. -- Hardware-nahe Änderungen brauchen mindestens Simulations- und HIL-Validierung. -- Alle Faults müssen eine maschinenlesbare Ursache und eine menschenlesbare Zusammenfassung haben. - -### Must Not -- Keine unbounded Queues. -- Keine stillen Fallbacks ohne Telemetrie. -- Keine teilweise angewandten Live-Config-Änderungen ohne explizite Rückmeldung. -- Keine unterschiedlichen Grenzwerte zwischen Config, API und Runtime. -- Keine sicherheitsrelevanten HTTP-Endpunkte ohne Härtung im Remote-Betrieb. - ---- - -## 6. Aktuelle offene Entscheidungen - -| ID | Status | Frage | Notiz | -|---|---|---|---| -| DEC-001 | RESOLVED | Puffern wir auf CompositeFrame- oder DeviceFrame-Ebene? | Queue lebt nach dem Upsampler (DeviceFrame-Ebene) gemäß `internal/app/engine.go`-Integrationsschleife. | -| DEC-002 | OPEN | Fault-Recovery zuerst mit `mute`, `repeat last safe frame` oder beidem? | Muss technisch und RF-seitig sauber bewertet werden | -| DEC-003 | OPEN | Ziehen wir minimale WS-05-Basis-Härtungen vor? | Timeouts/Body-Limits evtl. früher sinnvoll | -| DEC-004 | OPEN | Wie gross/simpel halten wir die erste State-Maschine? | Gefahr von Overengineering | - ---- - -## 7. Nächste sinnvolle Schritte - -### Empfohlener Start -1. **WS-03-T1 Parameterinventar erstellen** *(abgeschlossen)* -2. 
**bekannte Inkonsistenzen (CFG-SEM-001, CTL-UX-001) konkret verifizieren** -3. **DesiredConfig / AppliedConfig / RuntimeState Zielmodell grob skizzieren** -4. Danach Architekturarbeit an **WS-01 + WS-02** starten -5. **Aktuell:** WS-01-T2 Writer-Worker einführen (Queue → Driver), danach WS-01-T3 Supervisor + WS-02 Runtime-State. - -### Vor dem ersten grossen Umbau klären -- Was ist „minimal sinnvoll“ für Milestone 1? -- Welche Dinge sind harte Must-haves und welche nur spätere Veredelung? -- Wo wollen wir bewusst nicht sofort maximal abstrahieren? - ---- - -## 8. Änderungsprotokoll - -| Datum | Änderung | Person / Agent | -|---|---|---| -| 2026-04-05 | Initiales Arbeitsdokument aus `fm-rds-tx_pro_runtime_hardening_concept.json` erstellt | Alfred | +# Pro Runtime Hardening Workboard + +Status: living document +Branch: `feature/pro-runtime-hardening` + +Dieses Dokument ist das **Arbeitsdokument** zur schrittweisen Umsetzung des Konzepts aus `fm-rds-tx_pro_runtime_hardening_concept.json`. + +Ziel ist **nicht** nur eine hübsche Roadmap, sondern ein Ort, an dem wir konkret markieren können: +- **wo** wir im Code stehen, +- **welche Lücken** bestätigt sind, +- **welche Entscheidungen** gefallen sind, +- **welche Arbeiten** offen / in Arbeit / erledigt sind, +- **welche Risiken** noch bestehen, +- **welche Akzeptanzkriterien** wirklich nachgewiesen wurden. + +--- + +## 1. Arbeitsregeln für dieses Dokument + +### Statuswerte +- `TODO` → noch nicht begonnen +- `IN PROGRESS` → aktiv in Arbeit +- `BLOCKED` → sinnvoll erkannt, aber blockiert +- `DONE` → umgesetzt +- `VERIFIED` → umgesetzt **und** sinnvoll geprüft +- `DEFERRED` → bewusst nach hinten verschoben +- `REJECTED` → bewusst verworfen + +### Nachweispflicht +Ein Punkt gilt erst als wirklich fertig, wenn eingetragen ist: +1. **Code-Ort(e)** +2. **Was geändert wurde** +3. **Wie verifiziert wurde** +4. 
**Welche Restrisiken bleiben** + +### Update-Regel +Wenn wir an einem Workstream arbeiten, soll dieses Dokument mitgezogen werden. +Kein „ist im Kopf klar“. Der Stand kommt hier rein. + +--- + +## 2. Gesamtüberblick + +## Gesamtstatus +- Projektphase: `Umsetzung (WS-01)` +- Technischer Fokus aktuell: `Entkoppelter TX-Pfad (FrameQueue + Writer)` +- Nächster sinnvoller Startpunkt laut Konzept: `WS-01 Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer` +- Vorangegangene Workstreams: `WS-03 Semantische Korrektheit und konsistent angewandte Config` (abgeschlossen) + +## Repo-bezogene bestätigte Ausgangslage + +| Thema | Status | Notiz | +|---|---|---| +| TX-Engine aktuell als synchroner Single-Loop | CONFIRMED | `internal/app/engine.go` | +| Persistenter DSP-Zustand im Generator vorhanden | CONFIRMED | `internal/offline/generator.go` | +| HTTP-Control vorhanden | CONFIRMED | `internal/control/control.go` | +| Config-Validation vorhanden, aber nicht überall semantisch konsistent | CONFIRMED | `internal/config/config.go` + Runtime-Pfade | +| Device/Capability-Modell vorhanden, aber noch nicht streng genug | CONFIRMED | `internal/platform/soapy.go` | +| Lock-freier SPSC-Audio-Ringbuffer vorhanden | CONFIRMED | `internal/audio/stream.go` | + +## Bereits bekannte bestätigte Inkonsistenzen + +| ID | Status | Beschreibung | Ort | +|---|---|---|---| +| CFG-SEM-001 | CONFIRMED | `fm.outputDrive` wird in Validation und Runtime nicht konsistent behandelt | `internal/config/config.go`, `internal/app/engine.go` | +| CTL-UX-001 | RESOLVED | `handleAudioStream()` beschreibt `--audio-http`; der CLI-Schalter ist nun vorhanden und setzt den Stream-Puffer für `/audio/stream` direkt. | `internal/control/control.go`, `cmd/fmrtx/main.go` | + +--- + +## 3. 
Prioritätenmodell + +| Priorität | Bedeutung | +|---|---| +| P0 | Technische Perfektion und Determinismus | +| P1 | Betriebssicherheit und Fehlerbeherrschung | +| P2 | Hardware-Wahrheit und RF-Qualität | +| P3 | Sichere und saubere Runtime-Steuerung | +| P4 | Deployment-, Release- und Service-Reife | + +--- + +## 4. Umsetzungstracker nach Workstream + +# WS-03 — Semantische Korrektheit und harte Config-/Runtime-Konsistenz +**Priorität:** P0 +**Gesamtstatus:** IN PROGRESS + +## Ziel +Ein einziger, eindeutig definierter Parameterraum. Jeder Wert hat exakt eine Bedeutung und identische Constraints in Config, HTTP-API, Runtime und Telemetrie. + +## Warum dieser Workstream zuerst +Wenn Semantik und Grenzwerte nicht sauber vereinheitlicht sind, bauen spätere Runtime- und Fault-Mechanismen auf unstabilem Fundament. + +## Aufgaben + +### WS-03-T1 — Parameterinventar erstellen +- **Status:** VERIFIED +- **Owner:** Builder A +- **Code-Orte:** + - `internal/config/config.go` + - `internal/app/engine.go` + - `internal/control/control.go` + - `internal/offline/generator.go` +- **Ziel:** + Alle öffentlich und intern verwendeten Parameter inventarisieren mit: + - Name + - Typ + - Einheit + - Bereich + - Default + - hot-reload-fähig ja/nein + - safety class + - Telemetrie-Name +- **Offene Fragen:** + - Wo leben heute implizite Parameter, die nicht sauber dokumentiert sind? + - Welche Runtime-Werte sind abgeleitet statt direkt konfigurierbar? +- **Nachweis:** + - `docs/ws-03-parameter-inventory.md` enthält das inventarisierte Parameter-Tableau und referenziert Config/Control/Engine. + - Live-Nutzung über `internal/control/control.go` → `LivePatch` dokumentiert. 
+- **Restrisiken:** + - versteckte Semantik in Helper-Funktionen übersehen + +### WS-03-T2 — Validation vereinheitlichen +- **Status:** VERIFIED +- **Owner:** Builder A +- **Code-Orte:** + - `internal/config/config.go` + - `internal/app/engine.go` + - `internal/app/engine_test.go` + - `internal/control/control.go` +- **Ziel:** + `Config.Validate()`, Runtime-Update-Pfade und API-Patch-Validierung dürfen nicht divergieren. +- **Bereits bekannter Startpunkt:** + - `fm.outputDrive` +- **Nachweis:** + - CFG-SEM-001: `outputDrive`-Validation in `Engine.UpdateConfig` jetzt 0..10 (wie `Config.Validate`). + - Tests (`go test ./...`) fangen neue Range ab und besitzen aktualisierten `engine_test`-Check. + - Live-Patch fließt durch `txBridge` und `LivePatch` (control) → `LiveConfigUpdate`. +- **Restrisiken:** + - weitere Inkonsistenzen erst beim Inventar sichtbar + +### WS-03-T3 — DesiredConfig / AppliedConfig einführen +- **Status:** IN PROGRESS +- **Owner:** Lead Coderaffe +- **Code-Orte:** + - `internal/app/engine.go` + - `internal/control/control.go` + - ggf. Config-/Statusmodelle +- **Ziel:** + API und Runtime sollen trennen zwischen: + - gewünschter Konfiguration + - tatsächlich angewandter Konfiguration + - aktuellem Runtime-Zustand +- **Nachweis:** + - `internal/control/control.go` wartet mit Snapshot-Updates, bis LivePatch erfolgreich war. + - `internal/control/control_test.go` deckt ab, dass abgelehnte Live-Updates keine neue `GET /config`-Ansicht schreiben. +- **Restrisiken:** + - Die API liefert noch nicht beide Sichten gleichzeitig; weitere Workstreams müssen Desired/Applied explizit zurückgeben. + +## WS-03 Entscheidungslog +| Datum | Entscheidung | Notiz | +|---|---|---| +| 2026-04-05 | CFG-SEM-001: `fm.outputDrive` | Live-Validierung auf 0..10 angeglichen, Tests angepasst, Parameterinventar dokumentiert. 
| +| 2026-04-05 | WS-03-T3: Desired/Applied-Gate | Control-API zeigt Snapshots nur noch, wenn LivePatch erfolgreich angewendet wurde; Tests verhindern irreführende Wunschwerte. | + +## WS-03 Verifikation +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | `go test ./...` | ✅ Bestätigt `Engine.UpdateConfig`, `LivePatch` und Parameter-Range sowie Inventar-Dokumentation. Neue Control-Tests sichern Desired/Applied-Gate. | + +--- + +# WS-01 — Deterministische Echtzeit-TX-Pipeline mit entkoppeltem Writer +**Priorität:** P0 +**Gesamtstatus:** IN PROGRESS + +## Ziel +Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, kontrolliertem Frame-Puffer betrieben. + +## Aktueller Stand +- Der TX-Pfad ist laut Konzept aktuell noch synchron gekoppelt: + `GenerateFrame -> optional FMUpsampler.Process -> driver.Write` +- Das ist elegant, aber nicht pro-level-hart gegenüber Write-Spikes und Blockaden. + +## Aufgaben + +### WS-01-T1 — FrameQueue einführen +- **Status:** VERIFIED +- **Owner:** Lead Coderaffe +- **Code-Orte:** + - `internal/output/frame_queue.go` + - `internal/output/frame_queue_test.go` + - `internal/app/engine.go` +- **Ziel:** + Bounded Queue mit fester Kapazität, sichtbarem Füllstand, Counter- / Statistikzugriff und klarer Trennung zwischen Generator und Writer. +- **Zu entscheiden:** + - Puffern vor oder nach Upsampling → Device-Frame-Ebene (Queue lebt nach dem Upsampler) für Writer-Simplifizierung. + - Referenzkapazität: `runtime.frameQueueCapacity` (default 3) bleibt konfigurierbar. +- **Akzeptanzpunkte:** + - Keine unbounded Queue. + - Fill-Level (High/Low) ist aus `QueueStats` sichtbar. + - Queue-Health-Indikator (`queue.health`) liefert `critical`, `low` oder `normal` aus dem Fill-Level. EngineStats.`queue` zeigt den Status ebenfalls. + - Drop/Repeat/Mute-Counter sind vorhanden und testbar. 
+- **Nachweis:** + - `FrameQueue`-Implementierung (`internal/output/frame_queue.go`) liefert kapazitätsgesteuerte Push/Pop-Logik und Counters. + - Engine-Run nutzt Queue vor dem Writer und zeigt `QueueStats` in `EngineStats`. + - Tests (`internal/output/frame_queue_test.go` + `go test ./...`) decken Push/Pop, Timeout-Counters, Stats und den neuen Queue-Health-Indikator ab. +- **Restrisiken:** + - Die Queue wird aktuell synchron getrieben; ein dedizierter Writer-Worker fehlt noch. + - Queue-Close erwartet, dass Generator/Writer vor dem Schließen stoppen, sonst droht Panik beim Schreiben. + +### WS-01-T2 — Writer-Worker einführen +- **Status:** VERIFIED +- **Owner:** Lead Coderaffe +- **Code-Orte:** + - `internal/app/engine.go` (run loop, `writerLoop`, `cloneFrame`, Stats) + - `internal/dsp/*` (FMUpsampler / Resampler copy `GeneratedAt` für Cycle-Metriken) +- **Ziel:** + Generator/Upsampler liefern Frames in die FrameQueue, `driver.Write()` läuft nur noch im dedizierten Writer. +- **Akzeptanzpunkte:** + - `writerLoop()` ist die einzige Stelle mit `driver.Write()` und zieht aus der Queue. + - FrameQueue ist ein echter Puffer (Generator klont Frames, Writer poppt) und `EngineStats.Queue` zeigt den Füllstand. + - Write- und Cycle-Latenzen plus `LateBuffers` bleiben in `EngineStats` sichtbar (`MaxWriteMs`, `LateBuffers`, `MaxCycleMs`). +- **Nachweis:** + - `go test ./...` (Engine + Queue + DSP) läuft erfolgreich. + - `EngineStats` berichtet weiterhin über Queue-/Writer-Metriken. +- **Restrisiken:** + - Frame-Klonierung pro Chunk erhöht Heap-Pressure; spätere Workstreams sollten Pooling / Zero-Copy prüfen. + +### WS-01-T3 — Supervisor-Schicht einführen +- **Status:** TODO +- **Owner:** offen +- **Code-Orte:** + - `internal/app/engine.go` +- **Ziel:** + Queue-Füllstand, Late-Rate und Fehlerhäufigkeit überwachen und in Runtime-Zustände überführen. 
+- **Akzeptanzpunkte:** + - State-Entscheidungen sind explizit + - kein implizites Weiterwursteln bei Schieflage + +## Offene Architekturfragen +- Ist `capacity_frames = 3` ein guter Startwert oder nur Konzept-Default? +- Sollte im Fault-Fall `repeat last safe frame` erlaubt sein oder von Anfang an nur `mute`? +- Wie eng koppeln wir WS-01 mit WS-02, ohne Overengineering zu erzeugen? + +## WS-01 Entscheidungslog +| Datum | Entscheidung | Notiz | +|---|---|---| +| 2026-04-05 | FrameQueue mit Engine-Integration | Queue lebt nach dem Upsampler auf DeviceFrame-Ebene, Kapazität via `runtime.frameQueueCapacity`, `EngineStats` zeigt `QueueStats`, Tests decken Timeouts und Counters ab. | +| 2026-04-05 | Queue-Health-Indikator | `QueueStats.Health` gibt `critical`/`low`/`normal` zurück und `txBridge` leitet `EngineStats.Queue` ins `/runtime`-JSON. | +| 2026-04-05 | Runtime-Indikator | `EngineStats.RuntimeIndicator` kombiniert `queue.health` + `lateBuffers`, `/runtime` zeigt `engine.runtimeIndicator`. | +| 2026-04-05 | /status runtime indicator | `/status` reuses `txBridge.TXStats()` and now reports `runtimeIndicator` alongside the config snapshot for quick ops. | +| 2026-04-05 | /status queue stats | `/status` spiegelt das `queue`-Objekt aus `txBridge.TXStats()` für schnelle Queue-Checks, API-Doku und `TestStatusReportsQueueStats` fangen den neuen Key ab. | + +## WS-01 Verifikation +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | FrameQueue + Engine integration | ✅ `go test ./...` (im `internal`-Modul incl. `frame_queue_test.go`) | +| 2026-04-05 | Queue-Health-Indikator | go test ./... deckt `TestFrameQueueHealthIndicator` und `queue.health` ab. | +| 2026-04-05 | Runtime-Indikator | OK `go test ./...` deckt `runtimeIndicator` sowie `/runtime`-Exposition von `engine.runtimeIndicator`. | +| 2026-04-05 | Runtime API queue health | ✅ `/runtime` liefert jetzt `engine.queue.health` dank `txBridge.TXStats`. 
| +| 2026-04-05 | /status runtime indicator | ✅ `/status` gibt jetzt `runtimeIndicator` aus (`control_test` deckt den neuen Key). | +| 2026-04-05 | /status queue stats | ✅ `TestStatusReportsQueueStats` plus `docs/API.md` zeigen, dass `queue` korrekt durchgereicht wird. | + +--- + +# WS-02 — Explizite Runtime-State-Maschine und Fault-Handling +**Priorität:** P0 +**Gesamtstatus:** IN PROGRESS + +## Ziel +Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständen. + +## Fortschritt +- EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. +- `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. +- `evaluateRuntimeState` now waits for a short healthy streak before leaving `muted`, logging a degraded-severity recovery event once the queue settles. +- Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable. +- `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`. +- Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs. +- Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path. +- Control-plane UI now also offers a Danger Zone `Reset Fault` button that calls the same endpoint so operators can acknowledge faults from the dashboard. 
+ +- Control-plane UI now posts an ops toast/log entry whenever the runtime state shifts so escalations and manual acknowledgements are immediately visible. +- Control-plane UI now keeps a compact Transition History panel beside the Fault History so operators can see recent runtime shifts without scrolling the activity log. + + +## Zielzustände laut Konzept +- `idle` +- `arming` +- `prebuffering` +- `running` +- `degraded` +- `muted` +- `faulted` +- `stopping` + +## Aufgaben + +### WS-02-T1 — Fault-Klassifikation definieren +- **Status:** TODO +- **Owner:** offen +- **Beispiele:** + - Treiberfehler + - Write-Time-Budget überschritten + - Queue leer + - Queue dauerhaft kritisch + - Selbsttest fehlgeschlagen + - unerlaubtes Live-Update + +### WS-02-T2 — Reaktionsstrategie definieren +- **Status:** TODO +- **Owner:** offen +- **Ziel:** + Pro Fehlerklasse klar definieren: + - warn only + - degraded + - muted + - faulted + +### WS-02-T3 — Fault-Historie und Event-Log einführen +- **Status:** TODO +- **Owner:** offen +- **Ziel:** + Zustandswechsel und Faults auditierbar machen. + +## Offene Designfragen +- Wie fein granular darf die State-Maschine werden, ohne unwartbar zu werden? +- Welche Transitionen sind wirklich produktiv relevant und welche nur „theoretisch schön“? + +## WS-02 Entscheidungslog +| Datum | Entscheidung | Notiz | +|---|---|---| +| 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. | +| 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. | +| 2026-04-05 | Fault-reset UI shortcut | Danger Zone now hosts a Reset Fault button wired to `/runtime/fault/reset` so operators get an in-app acknowledgement path without manual HTTP calls. 
| +| 2026-04-06 | Runtime transition visibility cue | Control UI now posts toast/log entries for runtime state shifts so ops instantly sees escalations and manual reset acknowledgements. | +| 2026-04-06 | Transition history panel | Added a compact Transition History panel next to the Fault History so the last few runtime state shifts stay visible even when the activity log is full. | + +## WS-02 Verifikation +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. | +| 2026-04-05 | Runtime fault reset API | `go test ./...` now runs `TestRuntimeFaultReset*`, verifying the new HTTP path and controller error scenarios. | +| 2026-04-06 | Runtime transition visibility | ✅ `go test ./...`; manual UI smoke verification still pending to ensure the toast/log flow shows every runtime shift. | + +--- + +# WS-04 — Observability, Telemetrie und Diagnosefähigkeit +**Priorität:** P1 +**Gesamtstatus:** TODO + +## Ziel +Vollständige Sichtbarkeit auf Runtime, Queue, Writer, Generator, RF-Selbsttests und API-Aktivität schaffen. 
+ +## Aufgaben + +### WS-04-T1 — Strukturiertes Logging +- **Status:** TODO +- **Owner:** offen + +### WS-04-T2 — Prometheus-/Metrics-Schicht +- **Status:** TODO +- **Owner:** offen + +### WS-04-T3 — Debug-/Profiling-Endpunkte +- **Status:** TODO +- **Owner:** offen + +## Gewünschte Beispielmetriken +- `engine_chunks_generated_total` +- `engine_late_buffers_total` +- `engine_fault_transitions_total` +- `writer_write_duration_seconds` +- `queue_fill_ratio` +- `queue_dropped_frames_total` +- `queue_muted_frames_total` +- `driver_write_errors_total` +- `audio_stream_underruns_total` +- `audio_stream_overflows_total` +- `rf_selftest_pilot_db` +- `rf_selftest_rds_57k_db` + +## WS-04 Entscheidungslog +| Datum | Entscheidung | Notiz | +| --- | --- | --- | +| 2026-04-06 | High-watermark trend sparkline | Captured audio high-watermark duration history and surface it as a new Health-panel sparkline for queue pressure visibility. | +| 2026-04-06 | Queue fill visibility | Added queue fill ratio health line and sparklines to highlight real-time queue pressure alongside high-watermark trends. | +| 2026-04-07 | Underrun streak telemetry | StreamStats now expose current and max underrun streak counters so queue diagnostics can see repeated underruns without touching the metrics stack. | + +## WS-04 Verifikation +| Datum | Fokus | Ergebnis | +| --- | --- | --- | +| 2026-04-06 | High-watermark trend sparkline | `go test ./...` plus manual UI check confirm the new sparkline updates with runtime audio stats. | +| 2026-04-06 | Queue fill visibility | `go test ./...` plus UI smoke check confirm queue fill stats stay available and the new sparkline/health line react to queue health changes. | +| 2026-04-07 | Underrun streak telemetry | `go test ./internal/audio` confirms the new streak counters plus Stats coverage so the API surfaces the same names. 
| + +--- + +# WS-05 — Sichere und erwachsene Control-Plane +**Priorität:** P1 / P3-nah +**Gesamtstatus:** TODO + +## Ziel +API transport- und anwendungsseitig härten, state-aware machen und auditierbar gestalten. + +## Aufgaben + +### WS-05-T1 — Auth und Deploy-Modi definieren +- **Status:** TODO +- **Owner:** offen +- **Zielmodi:** + - localhost-only + - trusted-lan + - secured-remote + +### WS-05-T2 — HTTP-Server härten +- **Status:** TODO +- **Owner:** offen +- **Mindestpunkte:** + - ReadTimeout + - WriteTimeout + - IdleTimeout + - ReadHeaderTimeout + - Body-Size-Limits + - Content-Type-Validierung + - Method Enforcement + +### WS-05-T3 — API semantisch aufräumen +- **Status:** TODO +- **Owner:** offen +- **Ziel:** + - DesiredConfig vs AppliedConfig vs RuntimeState + - idempotente Start/Stop-Endpunkte + - transaktionsartige Apply-/Reject-Antworten + - Audit-Log pro Eingriff + +## Frühe Quick-Wins +Diese Punkte könnten ggf. vorgezogen werden, auch wenn WS-05 formal nach WS-01/02 kommt: +- HTTP-Timeouts +- Body-Limits +- sicherer Standard-Bind-Modus + +## WS-05 Entscheidungslog +- 2026-04-06: `/audio/stream` now enforces a binary `Content-Type` (`application/octet-stream` or `audio/L16`) before queuing any samples. + +## WS-05 Verifikation +| Datum | Fokus | Ergebnis | +|---|---|---| +| 2026-04-05 | `/audio/stream` rejects non-POST requests | `TestAudioStreamRejectsNonPost` enforces POST-only access to `/audio/stream` before a stream source is configured | +| 2026-04-06 | `/audio/stream` enforces binary Content-Type headers | `TestAudioStreamRejectsMissingContentType` and `TestAudioStreamRejectsUnsupportedContentType` confirm 415 when the media type is missing or wrong | + +--- + +# WS-06 — Hardware-in-the-loop und externe RF-Wahrheitsprüfung +**Priorität:** P2 +**Gesamtstatus:** TODO + +## Ziel +Nicht nur intern richtig rechnen, sondern extern nachweisen, dass tatsächlich korrekt gesendet wird. 
+ +## Status +- Konzept vorhanden +- noch kein eingetragener HIL-Arbeitsstand in diesem Dokument + +## Offene Kernfragen +- Welches Referenz-Setup wird verbindlich? +- Welche Testfrequenz / Standarddauer / Schutzmaßnahmen gelten? +- Welcher externe Decoder / Empfänger gilt als Referenz? + +--- + +# WS-07 — Device-aware Capability- und Kalibrierungsmodell +**Priorität:** P2 +**Gesamtstatus:** TODO + +## Ziel +Fähigkeiten und Kalibrierungen nicht implizit, sondern explizit pro Device modellieren. + +## Noch offen +- Capability-Schema konkretisieren +- Kalibrierungsprofil definieren +- Device-aware Validation einbauen + +--- + +# WS-08 — Signal-Selbstüberwachung im Betrieb +**Priorität:** P2 +**Gesamtstatus:** TODO + +## Ziel +Pilot, Stereo, RDS und Composite-Anomalien im Betrieb erkennen. + +## Noch offen +- Goertzel/FFT-Strategie festlegen +- Schwellwerte definieren +- in Fault-Logik einspeisen + +--- + +# WS-09 — Teststrategie erweitern +**Priorität:** P3/P4-nah +**Gesamtstatus:** TODO + +## Ziel +Von Unit-Tests zu echter Qualitätsabsicherung: Golden Vectors, Long-Run, Race, Fuzzing, API-Mutation, HIL. + +## Noch offen +- Testpyramide konkretisieren +- Nightly-/CI-Fähigkeit bestimmen + +--- + +# WS-10 — Service-Reife, Packaging und Reproduzierbarkeit +**Priorität:** P4 +**Gesamtstatus:** TODO + +## Ziel +Build-, Release- und Betriebsartefakte reproduzierbar und teamtauglich machen. + +## Noch offen +- Build-Metadaten +- Service-Units +- Config-Versionierung / Migration + +--- + +## 5. Übergreifende Regeln + +### Musts +- Jeder neue Runtime-Zustand muss per API und Telemetrie sichtbar sein. +- Jede Recovery-, Drop- oder Mute-Strategie braucht Counter, Logs und Tests. +- Keine neue Config-Option ohne klaren Typ, Bereich, Einheit, Default und Hot-Reload-Klassifikation. +- Hardware-nahe Änderungen brauchen mindestens Simulations- und HIL-Validierung. +- Alle Faults müssen eine maschinenlesbare Ursache und eine menschenlesbare Zusammenfassung haben. 
+ +### Must Not +- Keine unbounded Queues. +- Keine stillen Fallbacks ohne Telemetrie. +- Keine teilweise angewandten Live-Config-Änderungen ohne explizite Rückmeldung. +- Keine unterschiedlichen Grenzwerte zwischen Config, API und Runtime. +- Keine sicherheitsrelevanten HTTP-Endpunkte ohne Härtung im Remote-Betrieb. + +--- + +## 6. Aktuelle offene Entscheidungen + +| ID | Status | Frage | Notiz | +|---|---|---|---| +| DEC-001 | RESOLVED | Puffern wir auf CompositeFrame- oder DeviceFrame-Ebene? | Queue lebt nach dem Upsampler (DeviceFrame-Ebene) gemäß `internal/app/engine.go`-Integrationsschleife. | +| DEC-002 | OPEN | Fault-Recovery zuerst mit `mute`, `repeat last safe frame` oder beidem? | Muss technisch und RF-seitig sauber bewertet werden | +| DEC-003 | OPEN | Ziehen wir minimale WS-05-Basis-Härtungen vor? | Timeouts/Body-Limits evtl. früher sinnvoll | +| DEC-004 | OPEN | Wie gross/simpel halten wir die erste State-Maschine? | Gefahr von Overengineering | + +--- + +## 7. Nächste sinnvolle Schritte + +### Empfohlener Start +1. **WS-03-T1 Parameterinventar erstellen** *(abgeschlossen)* +2. **bekannte Inkonsistenzen (CFG-SEM-001, CTL-UX-001) konkret verifizieren** +3. **DesiredConfig / AppliedConfig / RuntimeState Zielmodell grob skizzieren** +4. Danach Architekturarbeit an **WS-01 + WS-02** starten +5. **Aktuell:** WS-01-T2 Writer-Worker einführen (Queue → Driver), danach WS-01-T3 Supervisor + WS-02 Runtime-State. + +### Vor dem ersten grossen Umbau klären +- Was ist „minimal sinnvoll“ für Milestone 1? +- Welche Dinge sind harte Must-haves und welche nur spätere Veredelung? +- Wo wollen wir bewusst nicht sofort maximal abstrahieren? + +--- + +## 8. 
Änderungsprotokoll + +| Datum | Änderung | Person / Agent | +|---|---|---| +| 2026-04-05 | Initiales Arbeitsdokument aus `fm-rds-tx_pro_runtime_hardening_concept.json` erstellt | Alfred | diff --git a/internal/control/control.go b/internal/control/control.go index 07cb355..283ac96 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -52,11 +52,17 @@ type Server struct { } const ( - maxConfigBodyBytes = 64 << 10 // 64 KiB - configContentTypeHeader = "application/json" - noBodyErrMsg = "request must not include a body" + maxConfigBodyBytes = 64 << 10 // 64 KiB + configContentTypeHeader = "application/json" + noBodyErrMsg = "request must not include a body" + audioStreamContentTypeError = "Content-Type must be application/octet-stream or audio/L16" ) +var audioStreamAllowedMediaTypes = []string{ + "application/octet-stream", + "audio/l16", +} + func isJSONContentType(r *http.Request) bool { ct := strings.TrimSpace(r.Header.Get("Content-Type")) if ct == "" { @@ -110,6 +116,23 @@ func rejectBody(w http.ResponseWriter, r *http.Request) bool { return false } +func isAudioStreamContentType(r *http.Request) bool { + ct := strings.TrimSpace(r.Header.Get("Content-Type")) + if ct == "" { + return false + } + mediaType, _, err := mime.ParseMediaType(ct) + if err != nil { + return false + } + for _, allowed := range audioStreamAllowedMediaTypes { + if strings.EqualFold(mediaType, allowed) { + return true + } + } + return false +} + func (s *Server) SetTXController(tx TXController) { s.mu.Lock() s.tx = tx @@ -248,6 +271,10 @@ func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) { http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } + if !isAudioStreamContentType(r) { + http.Error(w, audioStreamContentTypeError, http.StatusUnsupportedMediaType) + return + } s.mu.RLock() stream := s.streamSrc s.mu.RUnlock() diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 846b24d..e20a0b3 
100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -307,6 +307,7 @@ func TestAudioStreamRequiresSource(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader(nil)) + req.Header.Set("Content-Type", "application/octet-stream") srv.Handler().ServeHTTP(rec, req) if rec.Code != http.StatusServiceUnavailable { t.Fatalf("expected 503 when audio stream missing, got %d", rec.Code) @@ -321,6 +322,7 @@ func TestAudioStreamPushesPCM(t *testing.T) { pcm := []byte{0, 0, 0, 0} rec := httptest.NewRecorder() req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader(pcm)) + req.Header.Set("Content-Type", "application/octet-stream") srv.Handler().ServeHTTP(rec, req) if rec.Code != 200 { t.Fatalf("expected 200, got %d", rec.Code) @@ -355,6 +357,37 @@ func TestAudioStreamRejectsNonPost(t *testing.T) { } } +func TestAudioStreamRejectsMissingContentType(t *testing.T) { + cfg := cfgpkg.Default() + srv := NewServer(cfg) + srv.SetStreamSource(audio.NewStreamSource(256, 44100)) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader([]byte{0, 0})) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusUnsupportedMediaType { + t.Fatalf("expected 415 when Content-Type missing, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "Content-Type must be") { + t.Fatalf("unexpected response body: %q", rec.Body.String()) + } +} + +func TestAudioStreamRejectsUnsupportedContentType(t *testing.T) { + cfg := cfgpkg.Default() + srv := NewServer(cfg) + srv.SetStreamSource(audio.NewStreamSource(256, 44100)) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader([]byte{0, 0})) + req.Header.Set("Content-Type", "text/plain") + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusUnsupportedMediaType { + 
t.Fatalf("expected 415 for unsupported Content-Type, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "Content-Type must be") { + t.Fatalf("unexpected response body: %q", rec.Body.String()) + } +} + func TestTXStartWithoutController(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() From da668863a1124548151468f81ed63404ea58d096 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 08:12:50 +0200 Subject: [PATCH 48/55] Harden /audio/stream uploads --- docs/API.md | 3 ++- docs/pro-runtime-hardening-workboard.md | 2 ++ internal/control/control.go | 11 +++++++++++ internal/control/control_test.go | 22 ++++++++++++++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/docs/API.md b/docs/API.md index e7f89b0..ce1538b 100644 --- a/docs/API.md +++ b/docs/API.md @@ -294,7 +294,7 @@ Push raw audio data into the live stream buffer. Format: **S16LE stereo PCM** at Requires `--audio-stdin`, `--audio-http`, or another configured stream source to feed the buffer. -**Request:** Binary body, `application/octet-stream`, raw S16LE stereo PCM bytes. Set `Content-Type` to `application/octet-stream` or `audio/L16`; other media types are rejected. +**Request:** Binary body, `application/octet-stream`, raw S16LE stereo PCM bytes. Set `Content-Type` to `application/octet-stream` or `audio/L16`; other media types are rejected. Requests larger than 512 MiB are rejected with `413 Request Entity Too Large`. 
**Response:** ```json @@ -325,6 +325,7 @@ ffmpeg -i song.mp3 -f s16le -ar 44100 -ac 2 - | \ **Errors:** - `405` if not POST - `415` if Content-Type is missing or unsupported (must be `application/octet-stream` or `audio/L16`) +- `413` if the upload body exceeds the 512 MiB limit - `503` if no audio stream configured --- diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 304d61f..46473dc 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -443,12 +443,14 @@ Diese Punkte könnten ggf. vorgezogen werden, auch wenn WS-05 formal nach WS-01/ ## WS-05 Entscheidungslog - 2026-04-06: `/audio/stream` now enforces a binary `Content-Type` (`application/octet-stream` or `audio/L16`) before queuing any samples. +- 2026-04-06: `/audio/stream` caps uploads at 512 MiB and rejects larger bodies with `413 Request Entity Too Large` before touching the ring buffer. ## WS-05 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | `/audio/stream` rejects non-POST requests | `TestAudioStreamRejectsNonPost` enforces POST-only access to `/audio/stream` before a stream source is configured | | 2026-04-06 | `/audio/stream` enforces binary Content-Type headers | `TestAudioStreamRejectsMissingContentType` and `TestAudioStreamRejectsUnsupportedContentType` confirm 415 when the media type is missing or wrong | +| 2026-04-06 | `/audio/stream` rejects oversized uploads | `TestAudioStreamRejectsBodyTooLarge` confirms a 413 Request Entity Too Large before buffering when the HTTP body exceeds the 512 MiB guard | --- diff --git a/internal/control/control.go b/internal/control/control.go index 283ac96..25f5386 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -3,6 +3,7 @@ package control import ( _ "embed" "encoding/json" + "errors" "io" "mime" "net/http" @@ -56,6 +57,7 @@ const ( configContentTypeHeader = "application/json" noBodyErrMsg = "request must not 
include a body" audioStreamContentTypeError = "Content-Type must be application/octet-stream or audio/L16" + audioStreamBodyLimitDefault = 512 << 20 // 512 MiB ) var audioStreamAllowedMediaTypes = []string{ @@ -63,6 +65,8 @@ var audioStreamAllowedMediaTypes = []string{ "audio/l16", } +var audioStreamBodyLimit = int64(audioStreamBodyLimitDefault) // bytes allowed per /audio/stream request; tests may override. + func isJSONContentType(r *http.Request) bool { ct := strings.TrimSpace(r.Header.Get("Content-Type")) if ct == "" { @@ -284,6 +288,8 @@ func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) { return } + r.Body = http.MaxBytesReader(w, r.Body, audioStreamBodyLimit) + // Read body in chunks and push to ring buffer buf := make([]byte, 32768) totalFrames := 0 @@ -296,6 +302,11 @@ func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) { if err == io.EOF { break } + var maxErr *http.MaxBytesError + if errors.As(err, &maxErr) { + http.Error(w, maxErr.Error(), http.StatusRequestEntityTooLarge) + return + } http.Error(w, err.Error(), http.StatusInternalServerError) return } diff --git a/internal/control/control_test.go b/internal/control/control_test.go index e20a0b3..8195b67 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -388,6 +388,28 @@ func TestAudioStreamRejectsUnsupportedContentType(t *testing.T) { } } +func TestAudioStreamRejectsBodyTooLarge(t *testing.T) { + orig := audioStreamBodyLimit + t.Cleanup(func() { + audioStreamBodyLimit = orig + }) + audioStreamBodyLimit = 1024 + limit := int(audioStreamBodyLimit) + body := make([]byte, limit+1) + srv := NewServer(cfgpkg.Default()) + srv.SetStreamSource(audio.NewStreamSource(256, 44100)) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/octet-stream") + srv.Handler().ServeHTTP(rec, req) + if rec.Code != 
http.StatusRequestEntityTooLarge { + t.Fatalf("expected 413 for oversized body, got %d", rec.Code) + } + if !strings.Contains(rec.Body.String(), "request body too large") { + t.Fatalf("unexpected response body: %q", rec.Body.String()) + } +} + func TestTXStartWithoutController(t *testing.T) { srv := NewServer(cfgpkg.Default()) rec := httptest.NewRecorder() From 06bf511391b5a96fa5387c9d77444501297c7b77 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 08:27:56 +0200 Subject: [PATCH 49/55] control: expose request rejection audit counters --- internal/control/control.go | 58 +++++++++++++++++-- internal/control/control_test.go | 95 ++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 4 deletions(-) diff --git a/internal/control/control.go b/internal/control/control.go index 25f5386..381a637 100644 --- a/internal/control/control.go +++ b/internal/control/control.go @@ -9,6 +9,7 @@ import ( "net/http" "strings" "sync" + "sync/atomic" "github.com/jan/fm-rds-tx/internal/audio" "github.com/jan/fm-rds-tx/internal/config" @@ -50,6 +51,23 @@ type Server struct { tx TXController drv platform.SoapyDriver // optional, for runtime stats streamSrc *audio.StreamSource // optional, for live audio ingest + audit auditCounters +} + +type auditEvent string + +const ( + auditMethodNotAllowed auditEvent = "methodNotAllowed" + auditUnsupportedMediaType auditEvent = "unsupportedMediaType" + auditBodyTooLarge auditEvent = "bodyTooLarge" + auditUnexpectedBody auditEvent = "unexpectedBody" +) + +type auditCounters struct { + methodNotAllowed uint64 + unsupportedMediaType uint64 + bodyTooLarge uint64 + unexpectedBody uint64 } const ( @@ -112,14 +130,37 @@ func hasRequestBody(r *http.Request) bool { return false } -func rejectBody(w http.ResponseWriter, r *http.Request) bool { +func (s *Server) rejectBody(w http.ResponseWriter, r *http.Request) bool { if !hasRequestBody(r) { return true } + s.recordAudit(auditUnexpectedBody) http.Error(w, noBodyErrMsg, 
http.StatusBadRequest) return false } +func (s *Server) recordAudit(evt auditEvent) { + switch evt { + case auditMethodNotAllowed: + atomic.AddUint64(&s.audit.methodNotAllowed, 1) + case auditUnsupportedMediaType: + atomic.AddUint64(&s.audit.unsupportedMediaType, 1) + case auditBodyTooLarge: + atomic.AddUint64(&s.audit.bodyTooLarge, 1) + case auditUnexpectedBody: + atomic.AddUint64(&s.audit.unexpectedBody, 1) + } +} + +func (s *Server) auditSnapshot() map[string]uint64 { + return map[string]uint64{ + "methodNotAllowed": atomic.LoadUint64(&s.audit.methodNotAllowed), + "unsupportedMediaType": atomic.LoadUint64(&s.audit.unsupportedMediaType), + "bodyTooLarge": atomic.LoadUint64(&s.audit.bodyTooLarge), + "unexpectedBody": atomic.LoadUint64(&s.audit.unexpectedBody), + } +} + func isAudioStreamContentType(r *http.Request) bool { ct := strings.TrimSpace(r.Header.Get("Content-Type")) if ct == "" { @@ -239,16 +280,18 @@ func (s *Server) handleRuntime(w http.ResponseWriter, _ *http.Request) { if stream != nil { result["audioStream"] = stream.Stats() } + result["controlAudit"] = s.auditSnapshot() w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(result) } func (s *Server) handleRuntimeFaultReset(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { + s.recordAudit(auditMethodNotAllowed) http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } - if !rejectBody(w, r) { + if !s.rejectBody(w, r) { return } s.mu.RLock() @@ -272,10 +315,12 @@ func (s *Server) handleRuntimeFaultReset(w http.ResponseWriter, r *http.Request) // ffmpeg ... 
-f s16le -ar 44100 -ac 2 - | curl -X POST --data-binary @- http://host:8088/audio/stream func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { + s.recordAudit(auditMethodNotAllowed) http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } if !isAudioStreamContentType(r) { + s.recordAudit(auditUnsupportedMediaType) http.Error(w, audioStreamContentTypeError, http.StatusUnsupportedMediaType) return } @@ -304,6 +349,7 @@ func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) { } var maxErr *http.MaxBytesError if errors.As(err, &maxErr) { + s.recordAudit(auditBodyTooLarge) http.Error(w, maxErr.Error(), http.StatusRequestEntityTooLarge) return } @@ -322,10 +368,11 @@ func (s *Server) handleAudioStream(w http.ResponseWriter, r *http.Request) { func (s *Server) handleTXStart(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { + s.recordAudit(auditMethodNotAllowed) http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } - if !rejectBody(w, r) { + if !s.rejectBody(w, r) { return } s.mu.RLock() @@ -345,10 +392,11 @@ func (s *Server) handleTXStart(w http.ResponseWriter, r *http.Request) { func (s *Server) handleTXStop(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { + s.recordAudit(auditMethodNotAllowed) http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } - if !rejectBody(w, r) { + if !s.rejectBody(w, r) { return } s.mu.RLock() @@ -384,6 +432,7 @@ func (s *Server) handleConfig(w http.ResponseWriter, r *http.Request) { _ = json.NewEncoder(w).Encode(cfg) case http.MethodPost: if !isJSONContentType(r) { + s.recordAudit(auditUnsupportedMediaType) http.Error(w, "Content-Type must be application/json", http.StatusUnsupportedMediaType) return } @@ -393,6 +442,7 @@ func (s *Server) handleConfig(w http.ResponseWriter, r *http.Request) { statusCode := http.StatusBadRequest if strings.Contains(err.Error(), 
"http: request body too large") { statusCode = http.StatusRequestEntityTooLarge + s.recordAudit(auditBodyTooLarge) } http.Error(w, err.Error(), statusCode) return diff --git a/internal/control/control_test.go b/internal/control/control_test.go index 8195b67..e25ea07 100644 --- a/internal/control/control_test.go +++ b/internal/control/control_test.go @@ -498,6 +498,101 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) { } } +func TestRuntimeIncludesControlAudit(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + counts := controlAuditCounts(t, srv) + keys := []string{"methodNotAllowed", "unsupportedMediaType", "bodyTooLarge", "unexpectedBody"} + for _, key := range keys { + if counts[key] != 0 { + t.Fatalf("expected %s to start at 0, got %d", key, counts[key]) + } + } +} + +func TestControlAuditTracksMethodNotAllowed(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audio/stream", nil)) + if rec.Code != http.StatusMethodNotAllowed { + t.Fatalf("expected 405 from audio stream GET, got %d", rec.Code) + } + counts := controlAuditCounts(t, srv) + if counts["methodNotAllowed"] != 1 { + t.Fatalf("expected methodNotAllowed=1, got %d", counts["methodNotAllowed"]) + } +} + +func TestControlAuditTracksUnsupportedMediaType(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetStreamSource(audio.NewStreamSource(256, 44100)) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/audio/stream", bytes.NewReader([]byte{0, 0})) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusUnsupportedMediaType { + t.Fatalf("expected 415 for audio stream content type, got %d", rec.Code) + } + counts := controlAuditCounts(t, srv) + if counts["unsupportedMediaType"] != 1 { + t.Fatalf("expected unsupportedMediaType=1, got %d", counts["unsupportedMediaType"]) + } +} + +func TestControlAuditTracksBodyTooLarge(t *testing.T) { + 
srv := NewServer(cfgpkg.Default()) + limit := int(maxConfigBodyBytes) + body := []byte("{\"ps\":\"" + strings.Repeat("x", limit+1) + "\"}") + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, newConfigPostRequest(body)) + if rec.Code != http.StatusRequestEntityTooLarge { + t.Fatalf("expected 413 for oversized config body, got %d", rec.Code) + } + counts := controlAuditCounts(t, srv) + if counts["bodyTooLarge"] != 1 { + t.Fatalf("expected bodyTooLarge=1, got %d", counts["bodyTooLarge"]) + } +} + +func TestControlAuditTracksUnexpectedBody(t *testing.T) { + srv := NewServer(cfgpkg.Default()) + srv.SetTXController(&fakeTXController{}) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodPost, "/tx/start", bytes.NewReader([]byte("body"))) + srv.Handler().ServeHTTP(rec, req) + if rec.Code != http.StatusBadRequest { + t.Fatalf("expected 400 for unexpected body, got %d", rec.Code) + } + counts := controlAuditCounts(t, srv) + if counts["unexpectedBody"] != 1 { + t.Fatalf("expected unexpectedBody=1, got %d", counts["unexpectedBody"]) + } +} + +func controlAuditCounts(t *testing.T, srv *Server) map[string]uint64 { + t.Helper() + rec := httptest.NewRecorder() + srv.Handler().ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/runtime", nil)) + if rec.Code != http.StatusOK { + t.Fatalf("runtime request failed: %d", rec.Code) + } + var payload map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil { + t.Fatalf("unmarshal runtime: %v", err) + } + raw, ok := payload["controlAudit"].(map[string]any) + if !ok { + t.Fatalf("controlAudit missing or wrong type: %T", payload["controlAudit"]) + } + counts := map[string]uint64{} + for key, value := range raw { + num, ok := value.(float64) + if !ok { + t.Fatalf("controlAudit %s not numeric: %T", key, value) + } + counts[key] = uint64(num) + } + return counts +} + func newConfigPostRequest(body []byte) *http.Request { req := httptest.NewRequest(http.MethodPost, "/config", 
bytes.NewReader(body)) req.Header.Set("Content-Type", "application/json") From 06056c08c8c49b0007202223e849de81b2e44c7d Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 08:40:38 +0200 Subject: [PATCH 50/55] Surface control audit telemetry --- docs/API.md | 8 +++++++ internal/control/ui.html | 48 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/docs/API.md b/docs/API.md index ce1538b..c000124 100644 --- a/docs/API.md +++ b/docs/API.md @@ -15,6 +15,8 @@ Health check. {"ok": true} ``` +`controlAudit` mirrors the control plane's HTTP reject counters (405/415/413/400) so runtime telemetry can spot abusive clients and the UI can keep ops aware of guardrail hits. + `engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. @@ -98,6 +100,12 @@ Live engine and driver telemetry. Only populated when TX is active. "underrunStreak": 0, "maxUnderrunStreak": 0, "effectiveSampleRateHz": 2280000 + }, + "controlAudit": { + "methodNotAllowed": 0, + "unsupportedMediaType": 0, + "bodyTooLarge": 0, + "unexpectedBody": 0 } } ``` diff --git a/internal/control/ui.html b/internal/control/ui.html index eb211cd..38f1170 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -1193,6 +1193,19 @@ input.input-error {
+ + +

Shortcuts

@@ -1974,6 +1987,7 @@ function render() { updateText('info-live', engine.state ? `${String(engine.state).toUpperCase()} / ${state.server.runtimeOk ? 'runtime ok' : 'runtime pending'}` : (state.server.configOk ? 'config only' : '--')); updateHealth(engine, driver, audioStream); + updateControlAudit(runtime.controlAudit); updateFaultHistory(engine); updateTransitionHistory(); updateResetHint(engine); @@ -2248,6 +2262,40 @@ function updateHealth(engine, driver, audioStream) { } + +function updateControlAudit(audit) { + const entries = [ + { key: 'methodNotAllowed', id: 'audit-methodNotAllowed' }, + { key: 'unsupportedMediaType', id: 'audit-unsupportedMediaType' }, + { key: 'bodyTooLarge', id: 'audit-bodyTooLarge' }, + { key: 'unexpectedBody', id: 'audit-unexpectedBody' }, + ]; + let total = 0; + let hasData = false; + entries.forEach(({ key, id }) => { + const raw = audit && typeof audit[key] !== 'undefined' ? Number(audit[key]) : NaN; + const value = Number.isFinite(raw) ? raw : null; + if (value != null) { + hasData = true; + total += value; + } + setAuditValue(id, value); + }); + setAuditValue('audit-total', hasData ? total : null); +} + +function setAuditValue(id, count) { + const el = $(id); + if (!el) return; + if (count == null) { + el.textContent = '--'; + el.className = 'val'; + return; + } + el.textContent = String(count); + el.className = 'val ' + (count > 0 ? 
'warn' : 'good'); +} + function updateFaultHistory(engine) { const container = $('fault-history'); if (!container) return; From 4bf56a6e6cbfdb7e08f155785626e386e927bbb7 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 09:08:47 +0200 Subject: [PATCH 51/55] fix: separate write latency from pipeline latency --- internal/app/engine.go | 41 ++++++++++++++++++++++++++++---------- internal/output/backend.go | 13 +++++++----- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/internal/app/engine.go b/internal/app/engine.go index a269cd8..34fbaaf 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -81,6 +81,8 @@ type EngineStats struct { MaxGenerateMs float64 `json:"maxGenerateMs,omitempty"` MaxUpsampleMs float64 `json:"maxUpsampleMs,omitempty"` MaxWriteMs float64 `json:"maxWriteMs,omitempty"` + MaxQueueResidenceMs float64 `json:"maxQueueResidenceMs,omitempty"` + MaxPipelineLatencyMs float64 `json:"maxPipelineLatencyMs,omitempty"` Queue output.QueueStats `json:"queue"` RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` RuntimeAlert string `json:"runtimeAlert,omitempty"` @@ -152,6 +154,8 @@ type Engine struct { maxGenerateNs atomic.Uint64 maxUpsampleNs atomic.Uint64 maxWriteNs atomic.Uint64 + maxQueueResidenceNs atomic.Uint64 + maxPipelineNs atomic.Uint64 lastError atomic.Value // string lastFault atomic.Value // *FaultEvent faultHistoryMu sync.Mutex @@ -429,6 +433,8 @@ func (e *Engine) Stats() EngineStats { MaxGenerateMs: durationMs(e.maxGenerateNs.Load()), MaxUpsampleMs: durationMs(e.maxUpsampleNs.Load()), MaxWriteMs: durationMs(e.maxWriteNs.Load()), + MaxQueueResidenceMs: durationMs(e.maxQueueResidenceNs.Load()), + MaxPipelineLatencyMs: durationMs(e.maxPipelineNs.Load()), Queue: queue, RuntimeIndicator: ri, RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), @@ -515,6 +521,7 @@ func (e *Engine) run(ctx context.Context) { updateMaxDuration(&e.maxUpsampleNs, upDur) enqueued := cloneFrame(frame) + enqueued.EnqueuedAt 
= time.Now() if enqueued == nil { e.lastError.Store("engine: frame clone failed") e.underruns.Add(1) @@ -558,26 +565,35 @@ func (e *Engine) writerLoop(ctx context.Context) { continue } + frame.DequeuedAt = time.Now() + queueResidence := time.Duration(0) + if !frame.EnqueuedAt.IsZero() { + queueResidence = frame.DequeuedAt.Sub(frame.EnqueuedAt) + } + writeStart := time.Now() + frame.WriteStartedAt = writeStart n, err := e.driver.Write(ctx, frame) writeDur := time.Since(writeStart) - cycleDur := writeDur + pipelineLatency := writeDur if !frame.GeneratedAt.IsZero() { - cycleDur = time.Since(frame.GeneratedAt) + pipelineLatency = time.Since(frame.GeneratedAt) } updateMaxDuration(&e.maxWriteNs, writeDur) - updateMaxDuration(&e.maxCycleNs, cycleDur) + updateMaxDuration(&e.maxQueueResidenceNs, queueResidence) + updateMaxDuration(&e.maxPipelineNs, pipelineLatency) + updateMaxDuration(&e.maxCycleNs, writeDur) queueStats := e.frameQueue.Stats() e.evaluateRuntimeState(queueStats, e.hasRecentLateBuffers()) - if cycleDur > e.chunkDuration { + if writeDur > e.chunkDuration { late := e.lateBuffers.Add(1) e.lateBufferAlertAt.Store(uint64(time.Now().UnixNano())) if late <= 5 || late%20 == 0 { - log.Printf("TX LATE: cycle=%s budget=%s write=%s over=%s", - cycleDur, e.chunkDuration, writeDur, cycleDur-e.chunkDuration) + log.Printf("TX LATE: write=%s budget=%s over=%s queueResidence=%s pipeline=%s", + writeDur, e.chunkDuration, writeDur-e.chunkDuration, queueResidence, pipelineLatency) } } @@ -607,11 +623,14 @@ func cloneFrame(src *output.CompositeFrame) *output.CompositeFrame { samples := make([]output.IQSample, len(src.Samples)) copy(samples, src.Samples) return &output.CompositeFrame{ - Samples: samples, - SampleRateHz: src.SampleRateHz, - Timestamp: src.Timestamp, - GeneratedAt: src.GeneratedAt, - Sequence: src.Sequence, + Samples: samples, + SampleRateHz: src.SampleRateHz, + Timestamp: src.Timestamp, + GeneratedAt: src.GeneratedAt, + EnqueuedAt: src.EnqueuedAt, + DequeuedAt: 
src.DequeuedAt, + WriteStartedAt: src.WriteStartedAt, + Sequence: src.Sequence, } } diff --git a/internal/output/backend.go b/internal/output/backend.go index bbc0171..94fb3c8 100644 --- a/internal/output/backend.go +++ b/internal/output/backend.go @@ -16,11 +16,14 @@ type IQSample struct { // CompositeFrame carries a block of MPX/IQ samples along with timing metadata. type CompositeFrame struct { - Samples []IQSample - SampleRateHz float64 - Timestamp time.Time - GeneratedAt time.Time - Sequence uint64 + Samples []IQSample + SampleRateHz float64 + Timestamp time.Time + GeneratedAt time.Time + EnqueuedAt time.Time + DequeuedAt time.Time + WriteStartedAt time.Time + Sequence uint64 } // BackendConfig describes the properties for a backend instance. From 7d0efe3a38e9c4296f1080f82a50169111d341de Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 09:18:04 +0200 Subject: [PATCH 52/55] tune: ignore sub-millisecond write jitter in TX late detection --- internal/app/engine.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/internal/app/engine.go b/internal/app/engine.go index 34fbaaf..ba824ef 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -112,6 +112,7 @@ type RuntimeTransition struct { const ( lateBufferIndicatorWindow = 5 * time.Second + writeLateTolerance = 1 * time.Millisecond queueCriticalStreakThreshold = 3 queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 queueMutedRecoveryThreshold = queueCriticalStreakThreshold @@ -588,12 +589,13 @@ func (e *Engine) writerLoop(ctx context.Context) { queueStats := e.frameQueue.Stats() e.evaluateRuntimeState(queueStats, e.hasRecentLateBuffers()) - if writeDur > e.chunkDuration { + lateOver := writeDur - e.chunkDuration + if lateOver > writeLateTolerance { late := e.lateBuffers.Add(1) e.lateBufferAlertAt.Store(uint64(time.Now().UnixNano())) if late <= 5 || late%20 == 0 { - log.Printf("TX LATE: write=%s budget=%s over=%s queueResidence=%s pipeline=%s", - writeDur, 
e.chunkDuration, writeDur-e.chunkDuration, queueResidence, pipelineLatency) + log.Printf("TX LATE: write=%s budget=%s over=%s tolerance=%s queueResidence=%s pipeline=%s", + writeDur, e.chunkDuration, lateOver, writeLateTolerance, queueResidence, pipelineLatency) } } From 41aff2767112342473c12c5ba82228ee2e53a488 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 09:44:00 +0200 Subject: [PATCH 53/55] docs: reconcile workboard and runtime API status --- docs/API.md | 13 +++-- docs/pro-runtime-hardening-workboard.md | 66 ++++++++++++++++++------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/docs/API.md b/docs/API.md index c000124..c97206e 100644 --- a/docs/API.md +++ b/docs/API.md @@ -15,9 +15,7 @@ Health check. {"ok": true} ``` -`controlAudit` mirrors the control plane's HTTP reject counters (405/415/413/400) so runtime telemetry can spot abusive clients and the UI can keep ops aware of guardrail hits. - -`engine.state` spiegelt jetzt die Runtime-State-Maschine wider (idle, arming, prebuffering, running, degraded, muted, faulted, stopping) und bietet eine erste beobachtbare Basis für Fault-Transitions. +This endpoint is a simple liveness signal — it does not include runtime-state data or audit counters. Use it for readiness/liveness probes. --- @@ -48,7 +46,12 @@ Current transmitter status (read-only snapshot). Runtime indicator, alert, and q } ``` -`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", but late-buffer alerts expire after a few seconds once cycle times settle so the signal doesn't stay stuck on degraded. The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start for post-mortem diagnosis. 
+`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". + +`runtimeState` mirrors the same runtime-state machine string that `/runtime` exposes as `engine.state` when a TX controller is active, so quick health checks reuse the same terminology. + +`runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", but late-buffer alerts expire after a few seconds once cycle times settle so the signal doesn't stay stuck on degraded. The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start for post-mortem diagnosis. + --- @@ -117,6 +120,8 @@ Live engine and driver telemetry. Only populated when TX is active. `driver.underrunStreak` reports how many consecutive reads returned silence, and `driver.maxUnderrunStreak` captures the longest such run since the engine started. Together they help differentiate short glitches from persistent underrun storms and can be plotted alongside queue health sparkline telemetry. +`controlAudit` mirrors the control plane's HTTP reject counters (405/415/413/400). Whenever the HTTP server rejects a request (method not allowed, unsupported media type, body too large, or unexpected body), the respective counter increments — this lets runtime telemetry spot abusive clients without polluting the runtime state payload. + --- diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 46473dc..b7ba0db 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -228,15 +228,24 @@ Generator/Upsampler und Hardwarewriter werden als getrennte Stufen mit kleinem, - Frame-Klonierung pro Chunk erhöht Heap-Pressure; spätere Workstreams sollten Pooling / Zero-Copy prüfen. 
### WS-01-T3 — Supervisor-Schicht einführen -- **Status:** TODO -- **Owner:** offen +- **Status:** IN PROGRESS +- **Owner:** Lead Coderaffe - **Code-Orte:** - `internal/app/engine.go` - **Ziel:** - Queue-Füllstand, Late-Rate und Fehlerhäufigkeit überwachen und in Runtime-Zustände überführen. + Queue-Füllstand, Late-Rate und Fehlerhäufigkeit überwachen und in explizite Runtime-Zustände überführen, + sodass ein degradierter Queue-Health-Pfad automatisch auf `degraded`, `muted` oder `faulted` zeigt. - **Akzeptanzpunkte:** - - State-Entscheidungen sind explizit - - kein implizites Weiterwursteln bei Schieflage + - Alle Runtime-Entscheidungen laufen über `evaluateRuntimeState`, nicht stillschweigend weiter auf `running`. + - Queue-Health, Late-Buffers und Fault-Events treiben gezielt `degraded` → `muted` → `faulted`, damit Operatoren wissen, wann Blockaden vorliegen. + - `EngineStats` und `/runtime` bringen `runtimeIndicator`, `queue`, `faultHistory`, `transitionHistory` und das `runtimeState`-Label, so Telemetrie und UI dieselben Signale sehen. +- **Nachweis:** + - `internal/app/engine.go` (Generator-/Writer-Loops) ruft `evaluateRuntimeState` auf und protokolliert Fault-Events, Transition-Historien und Counters. + - `txBridge.TXStats` (`cmd/fmrtx/main.go`) leitet die Runtime-Infos an `/status` und `/runtime`, damit die API-Layer aktuelle Fault-Zustände spiegeln. + - `internal/app/runtime_state_test.go` plus `go test ./...` sichern die erwarteten Transition-Reihenfolgen und Fault-Counter. +- **Restrisiken:** + - Queue-Schwellen für `critical`/`lateBuffers` brauchen noch Feldvalidierung und ggf. Konfiguration. + - Fault-Reset/Operator-Interaktion ist im Control-Plane-UI noch zu finalisieren. ## Offene Architekturfragen - Ist `capacity_frames = 3` ein guter Startwert oder nur Konzept-Default? 
@@ -298,31 +307,50 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ ## Aufgaben ### WS-02-T1 — Fault-Klassifikation definieren -- **Status:** TODO -- **Owner:** offen +- **Status:** IN PROGRESS +- **Owner:** Lead Coderaffe - **Beispiele:** - - Treiberfehler - - Write-Time-Budget überschritten - - Queue leer - - Queue dauerhaft kritisch - - Selbsttest fehlgeschlagen - - unerlaubtes Live-Update + - `queueCritical` + - `lateBuffers` + - `writeTimeout` (z. B. Driver-Timeouts) + - `queueEmpty` + - `unknown` (Catch-all für unvorhergesehene Runtime-Zustände) +- **Ziel:** + Alle relevanten Fehlertypen als `FaultReason`/`FaultSeverity` codieren, damit sie später eindeutig auf Telemetrie und Logs abgebildet werden können. +- **Nachweis:** + - `internal/app/fault.go` definiert Reasons (`queueCritical`, `lateBuffers`, `writeTimeout`, `queueEmpty`, `unknown`) und Severity-Stufen (`warn`, `degraded`, `muted`, `faulted`). + - `internal/app/engine.go` ruft `recordFault` im Queue- und Late-Buffer-Prozess auf, so dass jede Reason vom Fault-Historien-Log erfasst wird. + - `internal/app/runtime_state_test.go` und `internal/app/fault_test.go` prüfen, dass die Reason/Severity-Kombinationen korrekt geloggt und ausgewertet werden. +- **Restrisiken:** + Weitere Driver-/Hardware-Faults (z. B. Soapy-Timeouts oder Audio-Stream-Abbrüche) müssen noch explizit getriggert und klassifiziert werden. ### WS-02-T2 — Reaktionsstrategie definieren -- **Status:** TODO -- **Owner:** offen +- **Status:** IN PROGRESS +- **Owner:** Lead Coderaffe - **Ziel:** - Pro Fehlerklasse klar definieren: + Reaktionen für jede FaultSeverity klar definieren (warn → loggen, degraded → degrade state, muted → stilllegen, faulted → Reset-Hürde). - warn only - degraded - muted - faulted +- **Nachweis:** + - `evaluateRuntimeState` eskaliert queueCritical-Läufe zuerst zu `degraded`, dann `muted`, schließlich `faulted` und protokolliert die entsprechenden Severity-Labels. 
+ - `Engine.ResetFault()` bringt `faulted` deterministisch zurück auf `degraded`, damit die Supervisor-Logik das Manual-Reset respektiert. + - Tests in `internal/app/runtime_state_test.go` prüfen, dass die Transition-Counter (`degradedTransitions`, `mutedTransitions`, `faultedTransitions`) und `faultCount` bei den richtigen Ereignissen springen. +- **Restrisiken:** + Die aktuellen Schwellen basieren auf queueCritical-Streaks; zusätzliche FaultSources (Driver, Audio-Stream, Live-Update-Rejection) brauchen eigene Severity-Strategien. ### WS-02-T3 — Fault-Historie und Event-Log einführen -- **Status:** TODO -- **Owner:** offen +- **Status:** IN PROGRESS +- **Owner:** Lead Coderaffe - **Ziel:** - Zustandswechsel und Faults auditierbar machen. + Zustandswechsel, Fault-Count und Trace-Historien auditierbar machen, damit `/runtime` und die UI eine nachvollziehbare Story liefern können. +- **Nachweis:** + - `EngineStats` enthält `faultHistory`, `transitionHistory`, `lastFault`, `faultCount` sowie `runtimeStateDurationSeconds` und Runtime-Indikatoren. + - `txBridge.TXStats` leitet diese Infos in `/runtime` und `/status` weiter, `internal/control/control_test.go` sichert, dass `faultHistory` und `transitionHistory` korrekt serialisiert werden. + - `internal/app/runtime_state_test.go` validiert die Historienkapazität, `go test ./...` deckt die API-Exposition ab. +- **Restrisiken:** + Die History-Kapazität ist auf 8 Einträge begrenzt; ein Audit-Log-Backend könnte später die Lücke auffangen. ## Offene Designfragen - Wie fein granular darf die State-Maschine werden, ohne unwartbar zu werden? 
From 5b0d76048a3916ce6e7ccafb8d76ae6dfc34ff39 Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 09:54:37 +0200 Subject: [PATCH 54/55] config: enforce rds text lengths --- internal/config/config.go | 6 +++ internal/config/config_test.go | 95 +++++++++++++++++++++++++++------- 2 files changed, 81 insertions(+), 20 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 6c73382..2eaa227 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -183,6 +183,12 @@ func (c Config) Validate() error { if c.RDS.PTY < 0 || c.RDS.PTY > 31 { return fmt.Errorf("rds.pty out of range (0-31)") } + if len(c.RDS.PS) > 8 { + return fmt.Errorf("rds.ps must be <= 8 characters") + } + if len(c.RDS.RadioText) > 64 { + return fmt.Errorf("rds.radioText must be <= 64 characters") + } return nil } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index aee3ada..079d9c0 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -3,11 +3,14 @@ package config import ( "os" "path/filepath" + "strings" "testing" ) func TestDefaultValidate(t *testing.T) { - if err := Default().Validate(); err != nil { t.Fatalf("default invalid: %v", err) } + if err := Default().Validate(); err != nil { + t.Fatalf("default invalid: %v", err) + } } func TestLoadAndValidate(t *testing.T) { @@ -15,56 +18,108 @@ func TestLoadAndValidate(t *testing.T) { path := filepath.Join(dir, "config.json") os.WriteFile(path, []byte(`{"audio":{"toneLeftHz":900,"toneRightHz":1700,"toneAmplitude":0.3},"fm":{"frequencyMHz":99.9},"backend":{"kind":"file","outputPath":"out.f32"},"control":{"listenAddress":"127.0.0.1:8088"}}`), 0o644) cfg, err := Load(path) - if err != nil { t.Fatalf("load: %v", err) } - if cfg.Audio.ToneLeftHz != 900 { t.Fatalf("unexpected left tone: %v", cfg.Audio.ToneLeftHz) } + if err != nil { + t.Fatalf("load: %v", err) + } + if cfg.Audio.ToneLeftHz != 900 { + t.Fatalf("unexpected left tone: %v", 
cfg.Audio.ToneLeftHz) + } } func TestValidateRejectsBadFrequency(t *testing.T) { - cfg := Default(); cfg.FM.FrequencyMHz = 200 - if err := cfg.Validate(); err == nil { t.Fatal("expected error") } + cfg := Default() + cfg.FM.FrequencyMHz = 200 + if err := cfg.Validate(); err == nil { + t.Fatal("expected error") + } } func TestValidateRejectsBadPreEmphasis(t *testing.T) { - cfg := Default(); cfg.FM.PreEmphasisTauUS = 150 - if err := cfg.Validate(); err == nil { t.Fatal("expected error") } + cfg := Default() + cfg.FM.PreEmphasisTauUS = 150 + if err := cfg.Validate(); err == nil { + t.Fatal("expected error") + } } func TestDefaultPreEmphasis(t *testing.T) { - if Default().FM.PreEmphasisTauUS != 50 { t.Fatal("expected 50") } + if Default().FM.PreEmphasisTauUS != 50 { + t.Fatal("expected 50") + } } func TestDefaultFMModulation(t *testing.T) { cfg := Default() - if !cfg.FM.FMModulationEnabled { t.Fatal("expected true") } - if cfg.FM.MaxDeviationHz != 75000 { t.Fatal("expected 75000") } + if !cfg.FM.FMModulationEnabled { + t.Fatal("expected true") + } + if cfg.FM.MaxDeviationHz != 75000 { + t.Fatal("expected 75000") + } } func TestParsePI(t *testing.T) { - tests := []struct{ in string; want uint16; ok bool }{ + tests := []struct { + in string + want uint16 + ok bool + }{ {"1234", 0x1234, true}, {"0xBEEF", 0xBEEF, true}, {"0XCAFE", 0xCAFE, true}, {" 0x2345 ", 0x2345, true}, {"", 0, false}, {"nope", 0, false}, } for _, tt := range tests { got, err := ParsePI(tt.in) - if tt.ok && err != nil { t.Fatalf("ParsePI(%q): %v", tt.in, err) } - if !tt.ok && err == nil { t.Fatalf("ParsePI(%q): expected error", tt.in) } - if tt.ok && got != tt.want { t.Fatalf("ParsePI(%q): got %x want %x", tt.in, got, tt.want) } + if tt.ok && err != nil { + t.Fatalf("ParsePI(%q): %v", tt.in, err) + } + if !tt.ok && err == nil { + t.Fatalf("ParsePI(%q): expected error", tt.in) + } + if tt.ok && got != tt.want { + t.Fatalf("ParsePI(%q): got %x want %x", tt.in, got, tt.want) + } } } func 
TestValidateRejectsInvalidPI(t *testing.T) { - cfg := Default(); cfg.RDS.PI = "nope" - if err := cfg.Validate(); err == nil { t.Fatal("expected error") } + cfg := Default() + cfg.RDS.PI = "nope" + if err := cfg.Validate(); err == nil { + t.Fatal("expected error") + } } func TestValidateRejectsEmptyPI(t *testing.T) { - cfg := Default(); cfg.RDS.PI = "" - if err := cfg.Validate(); err == nil { t.Fatal("expected error") } + cfg := Default() + cfg.RDS.PI = "" + if err := cfg.Validate(); err == nil { + t.Fatal("expected error") + } +} + +func TestValidateRejectsLongPS(t *testing.T) { + cfg := Default() + cfg.RDS.PS = "TOO_LONG_PS" + if err := cfg.Validate(); err == nil { + t.Fatal("expected error for PS longer than 8 characters") + } +} + +func TestValidateRejectsLongRadioText(t *testing.T) { + cfg := Default() + cfg.RDS.RadioText = strings.Repeat("x", 65) + if err := cfg.Validate(); err == nil { + t.Fatal("expected error for RadioText longer than 64 characters") + } } func TestEffectiveDeviceRate(t *testing.T) { cfg := Default() - if cfg.EffectiveDeviceRate() != float64(cfg.FM.CompositeRateHz) { t.Fatal("expected composite rate") } + if cfg.EffectiveDeviceRate() != float64(cfg.FM.CompositeRateHz) { + t.Fatal("expected composite rate") + } cfg.Backend.DeviceSampleRateHz = 912000 - if cfg.EffectiveDeviceRate() != 912000 { t.Fatal("expected 912000") } + if cfg.EffectiveDeviceRate() != 912000 { + t.Fatal("expected 912000") + } } From 868bd55c7a0528c0817eda485dcd533116b80ecf Mon Sep 17 00:00:00 2001 From: Jan Date: Mon, 6 Apr 2026 10:15:43 +0200 Subject: [PATCH 55/55] feat: surface applied frequency and write faults --- cmd/fmrtx/main.go | 1 + cmd/fmrtx/main_test.go | 11 +++++++++ docs/API.md | 5 ++++ internal/app/engine.go | 14 +++++++++++ internal/app/fault_test.go | 48 ++++++++++++++++++++++++++++++++++++++ internal/control/ui.html | 39 +++++++++++++++++++++++++++++-- 6 files changed, 116 insertions(+), 2 deletions(-) diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go 
index 9bc15ed..05472da 100644 --- a/cmd/fmrtx/main.go +++ b/cmd/fmrtx/main.go @@ -268,6 +268,7 @@ func (b *txBridge) TXStats() map[string]any { "queue": s.Queue, "runtimeIndicator": s.RuntimeIndicator, "runtimeAlert": s.RuntimeAlert, + "appliedFrequencyMHz": s.AppliedFrequencyMHz, "degradedTransitions": s.DegradedTransitions, "mutedTransitions": s.MutedTransitions, "faultedTransitions": s.FaultedTransitions, diff --git a/cmd/fmrtx/main_test.go b/cmd/fmrtx/main_test.go index cb68607..f8d7cbc 100644 --- a/cmd/fmrtx/main_test.go +++ b/cmd/fmrtx/main_test.go @@ -45,6 +45,17 @@ func TestTxBridgeExportsQueueStats(t *testing.T) { if indicator != apppkg.RuntimeIndicatorQueueCritical { t.Fatalf("runtime indicator should be queueCritical, got %s", indicator) } + freqRaw, ok := stats["appliedFrequencyMHz"] + if !ok { + t.Fatalf("missing appliedFrequencyMHz") + } + freq, ok := freqRaw.(float64) + if !ok { + t.Fatalf("appliedFrequencyMHz type mismatch: %T", freqRaw) + } + if freq != cfg.FM.FrequencyMHz { + t.Fatalf("applied frequency mismatch: want %v got %v", cfg.FM.FrequencyMHz, freq) + } if historyRaw, ok := stats["faultHistory"]; !ok { t.Fatalf("expected faultHistory in tx stats") } else if history, ok := historyRaw.([]apppkg.FaultEvent); !ok { diff --git a/docs/API.md b/docs/API.md index c97206e..58d3ac1 100644 --- a/docs/API.md +++ b/docs/API.md @@ -65,6 +65,7 @@ Live engine and driver telemetry. Only populated when TX is active. "engine": { "state": "running", "runtimeStateDurationSeconds": 12.4, + "appliedFrequencyMHz": 100.0, "chunksProduced": 12345, "totalSamples": 1408950000, "underruns": 0, @@ -118,8 +119,12 @@ Live engine and driver telemetry. Only populated when TX is active. `transitionHistory` liefert die jüngsten Übergänge (from/to, severity, timestamp) damit API und UI die Runtime History synchronisieren können. 
+`engine.appliedFrequencyMHz` meldet die zuletzt tatsächlich getunte Frequenz auf der Hardware, sodass man sie mit dem gewünschten `/config`-Wert vergleichen und ausstehende Live-Updates sofort entdecken kann. + `driver.underrunStreak` reports how many consecutive reads returned silence, and `driver.maxUnderrunStreak` captures the longest such run since the engine started. Together they help differentiate short glitches from persistent underrun storms and can be plotted alongside queue health sparkline telemetry. +`lastFault.reason` kann jetzt auch `writeTimeout` lauten, wenn der Treiber Schreibaufrufe wiederholt verweigert oder blockiert. Die Control-Plane hebt solche Driver-Faults hervor, damit man Blockaden im Writer-Pfad ohne Log-Search sieht. + `controlAudit` mirrors the control plane's HTTP reject counters (405/415/413/400). Whenever the HTTP server rejects a request (method not allowed, unsupported media type, body too large, or unexpected body), the respective counter increments — this lets runtime telemetry spot abusive clients without polluting the runtime state payload. 
diff --git a/internal/app/engine.go b/internal/app/engine.go index ba824ef..8348b52 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "log" + "math" "sync" "sync/atomic" "time" @@ -86,6 +87,7 @@ type EngineStats struct { Queue output.QueueStats `json:"queue"` RuntimeIndicator RuntimeIndicator `json:"runtimeIndicator"` RuntimeAlert string `json:"runtimeAlert,omitempty"` + AppliedFrequencyMHz float64 `json:"appliedFrequencyMHz"` LastFault *FaultEvent `json:"lastFault,omitempty"` DegradedTransitions uint64 `json:"degradedTransitions"` MutedTransitions uint64 `json:"mutedTransitions"` @@ -172,6 +174,8 @@ type Engine struct { // Live config: pending frequency change, applied between chunks pendingFreq atomic.Pointer[float64] + // Most recently tuned frequency (Hz) + appliedFreqHz atomic.Uint64 // Live audio stream (optional) streamSrc *audio.StreamSource @@ -246,6 +250,8 @@ func NewEngine(cfg cfgpkg.Config, driver platform.SoapyDriver) *Engine { faultHistory: make([]FaultEvent, 0, faultHistoryCapacity), transitionHistory: make([]RuntimeTransition, 0, runtimeTransitionHistoryCapacity), } + initFreqHz := cfg.FM.FrequencyMHz * 1e6 + engine.appliedFreqHz.Store(math.Float64bits(initFreqHz)) engine.setRuntimeState(RuntimeStateIdle) return engine } @@ -439,6 +445,7 @@ func (e *Engine) Stats() EngineStats { Queue: queue, RuntimeIndicator: ri, RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers), + AppliedFrequencyMHz: e.appliedFrequencyMHz(), LastFault: lastFault, DegradedTransitions: e.degradedTransitions.Load(), MutedTransitions: e.mutedTransitions.Load(), @@ -449,6 +456,11 @@ func (e *Engine) Stats() EngineStats { } } +func (e *Engine) appliedFrequencyMHz() float64 { + bits := e.appliedFreqHz.Load() + return math.Float64frombits(bits) / 1e6 +} + func runtimeIndicator(queueHealth output.QueueHealth, recentLateBuffers bool) RuntimeIndicator { switch { case queueHealth == output.QueueHealthCritical: @@ -502,6 
+514,7 @@ func (e *Engine) run(ctx context.Context) { if err := e.driver.Tune(ctx, *pf); err != nil { e.lastError.Store(fmt.Sprintf("tune: %v", err)) } else { + e.appliedFreqHz.Store(math.Float64bits(*pf)) log.Printf("engine: tuned to %.3f MHz", *pf/1e6) } } @@ -603,6 +616,7 @@ func (e *Engine) writerLoop(ctx context.Context) { if ctx.Err() != nil { return } + e.recordFault(FaultReasonWriteTimeout, FaultSeverityWarn, fmt.Sprintf("driver write error: %v", err)) e.lastError.Store(err.Error()) e.underruns.Add(1) select { diff --git a/internal/app/fault_test.go b/internal/app/fault_test.go index 4637e25..fa0bb61 100644 --- a/internal/app/fault_test.go +++ b/internal/app/fault_test.go @@ -1,7 +1,10 @@ package app import ( + "context" + "errors" "testing" + "time" cfgpkg "github.com/jan/fm-rds-tx/internal/config" "github.com/jan/fm-rds-tx/internal/output" @@ -70,3 +73,48 @@ func TestEngineRecordsLateBufferFault(t *testing.T) { t.Fatalf("expected warn severity, got %s", last.Severity) } } + +func TestEngineRecordsWriteTimeoutFault(t *testing.T) { + cfg := cfgpkg.Default() + driver := platform.NewSimulatedDriver(&writeErrorBackend{}) + eng := NewEngine(cfg, driver) + eng.SetChunkDuration(10 * time.Millisecond) + + ctx := context.Background() + if err := eng.Start(ctx); err != nil { + t.Fatalf("start: %v", err) + } + time.Sleep(120 * time.Millisecond) + if err := eng.Stop(ctx); err != nil { + t.Fatalf("stop: %v", err) + } + + last := eng.LastFault() + if last == nil { + t.Fatal("expected write timeout fault") + } + if last.Reason != FaultReasonWriteTimeout { + t.Fatalf("expected writeTimeout reason, got %s", last.Reason) + } + if last.Severity != FaultSeverityWarn { + t.Fatalf("expected warn severity, got %s", last.Severity) + } +} + +type writeErrorBackend struct{} + +func (writeErrorBackend) Configure(context.Context, output.BackendConfig) error { return nil } +func (writeErrorBackend) Write(context.Context, *output.CompositeFrame) (int, error) { + return 0, 
errors.New("write timeout") +} +func (writeErrorBackend) Flush(context.Context) error { return nil } +func (writeErrorBackend) Close(context.Context) error { return nil } +func (writeErrorBackend) Info() output.BackendInfo { + return output.BackendInfo{ + Name: "write-error", + Description: "backend that rejects writes", + Capabilities: output.BackendCapabilities{ + SupportsComposite: true, + }, + } +} diff --git a/internal/control/ui.html b/internal/control/ui.html index 38f1170..5c09e35 100644 --- a/internal/control/ui.html +++ b/internal/control/ui.html @@ -233,6 +233,24 @@ button { user-select: none; } margin-left: 5px; } +.freq-note { + display: flex; + gap: 12px; + margin-top: 6px; + font-size: 11px; + color: var(--text-muted); + text-transform: uppercase; + letter-spacing: 1px; +} +.freq-note-item { + display: inline-flex; + align-items: center; + gap: 4px; +} +.freq-note.mismatch .freq-note-item { + color: var(--amber); +} + .tx-actions { display: flex; flex-wrap: wrap; @@ -967,6 +985,10 @@ input.input-error {
Carrier
---.-MHz
+
+ Applied: -- + Desired: -- +
@@ -1916,8 +1938,21 @@ function render() { const driver = runtime.driver || {}; const audioStream = runtime.audioStream || null; - const freq = effectiveValue('frequencyMHz') ?? cfg.fm?.frequencyMHz; - updateHTML('freq-display', `${typeof freq === 'number' ? freq.toFixed(1) : '---.-'}MHz`); + const appliedRaw = engine.appliedFrequencyMHz; + const appliedFreq = Number.isFinite(Number(appliedRaw)) ? Number(appliedRaw) : null; + const desiredRaw = cfg.fm?.frequencyMHz; + const desiredFreq = Number.isFinite(Number(desiredRaw)) ? Number(desiredRaw) : null; + const displayFreq = appliedFreq ?? effectiveValue('frequencyMHz') ?? desiredFreq; + updateHTML('freq-display', `${typeof displayFreq === 'number' ? displayFreq.toFixed(1) : '---.-'}MHz`); + const appliedLabel = appliedFreq != null ? `Applied ${appliedFreq.toFixed(1)} MHz` : 'Applied --'; + const desiredLabel = desiredFreq != null ? `Desired ${desiredFreq.toFixed(1)} MHz` : 'Desired --'; + updateText('freq-applied', appliedLabel); + updateText('freq-desired', desiredLabel); + const noteEl = $('freq-note'); + if (noteEl) { + const mismatch = appliedFreq != null && desiredFreq != null && !nearlyEqual(appliedFreq, desiredFreq, 0.001); + noteEl.classList.toggle('mismatch', mismatch); + } updateText('badge-backend', cfg.backend?.kind || cfg.backend || '--'); updateText('badge-mode', engine.state && engine.state !== 'idle' ? 'TX Active' : 'Control Plane');