diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go
index 46d9113..9a466fb 100644
--- a/cmd/fmrtx/main.go
+++ b/cmd/fmrtx/main.go
@@ -287,3 +287,10 @@ func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error {
 		RadioText: lp.RadioText,
 	})
 }
+
+// ResetFault delegates the manual fault reset to the bridged engine and
+// returns the engine's error unchanged.
+func (b *txBridge) ResetFault() error {
+	eng := b.engine
+	return eng.ResetFault()
+}
diff --git a/docs/API.md b/docs/API.md
index c8807d4..dd9da0c 100644
--- a/docs/API.md
+++ b/docs/API.md
@@ -79,6 +79,22 @@ Live engine and driver telemetry. Only populated when TX is active.
 
 ---
 
+### `POST /runtime/fault/reset`
+
+Manually acknowledge a `faulted` runtime state so the supervisor can re-enter the recovery path (the engine moves back to `degraded` once the reset succeeds).
+
+**Response:**
+```json
+{"ok": true}
+```
+
+**Errors:**
+- `405 Method Not Allowed` if the request is not a POST
+- `503 Service Unavailable` when no TX controller is attached (`--tx` mode not active)
+- `409 Conflict` when the engine is not currently faulted or the reset was rejected (e.g. still throttled)
+
+---
+
 ### `GET /config`
 
 Full current configuration (all fields, including non-patchable).
diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md
index 362031b..f84e4db 100644
--- a/docs/pro-runtime-hardening-workboard.md
+++ b/docs/pro-runtime-hardening-workboard.md
@@ -278,6 +278,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ
 - Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable.
 - `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`.
 - Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs.
+- Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path. ## Zielzustände laut Konzept @@ -327,11 +328,13 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ | Datum | Entscheidung | Notiz | |---|---|---| | 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. | +| 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. | ## WS-02 Verifikation | Datum | Fokus | Ergebnis | |---|---|---| | 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. | +| 2026-04-05 | Runtime fault reset API | `go test ./...` now runs `TestRuntimeFaultReset*`, verifying the new HTTP path and controller error scenarios. | --- diff --git a/internal/app/engine.go b/internal/app/engine.go index cc8d8b8..9a40ae7 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -744,3 +744,17 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo } e.setRuntimeState(RuntimeStateRunning) } + +// ResetFault attempts to move the engine out of the faulted state. 
+func (e *Engine) ResetFault() error {
+	state := e.currentRuntimeState()
+	if state != RuntimeStateFaulted {
+		return fmt.Errorf("engine not in faulted state (current=%s)", state)
+	}
+	// Zero the streak counters before handing the engine back as degraded.
+	e.criticalStreak.Store(0)
+	e.mutedRecoveryStreak.Store(0)
+	e.mutedFaultStreak.Store(0)
+	e.setRuntimeState(RuntimeStateDegraded)
+	return nil
+}
diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go
index edc333f..018913f 100644
--- a/internal/app/runtime_state_test.go
+++ b/internal/app/runtime_state_test.go
@@ -179,3 +179,41 @@ func TestRuntimeTransitionCounters(t *testing.T) {
 		t.Fatalf("expected one recorded fault, got %d", got)
 	}
 }
+
+// TestEngineResetFaultRequiresFaultedState verifies that ResetFault returns an
+// error on a newly constructed engine that was never put into the faulted state.
+func TestEngineResetFaultRequiresFaultedState(t *testing.T) {
+	e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil))
+	if err := e.ResetFault(); err == nil {
+		t.Fatal("expected error when resetting non-faulted state")
+	}
+}
+
+// TestEngineResetFaultTransitionsToDegraded forces the faulted state with
+// non-zero streak counters and verifies that ResetFault moves the engine to
+// degraded, zeroes all three counters, and rejects a second reset.
+func TestEngineResetFaultTransitionsToDegraded(t *testing.T) {
+	e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil))
+	e.criticalStreak.Store(7)
+	e.mutedRecoveryStreak.Store(3)
+	e.mutedFaultStreak.Store(1)
+	e.setRuntimeState(RuntimeStateFaulted)
+	if err := e.ResetFault(); err != nil {
+		t.Fatalf("reset fault failed: %v", err)
+	}
+	if got := e.currentRuntimeState(); got != RuntimeStateDegraded {
+		t.Fatalf("expected degraded after reset, got %s", got)
+	}
+	if e.criticalStreak.Load() != 0 {
+		t.Fatalf("expected critical streak reset, got %d", e.criticalStreak.Load())
+	}
+	if e.mutedRecoveryStreak.Load() != 0 {
+		t.Fatalf("expected mute recovery streak reset, got %d", e.mutedRecoveryStreak.Load())
+	}
+	if e.mutedFaultStreak.Load() != 0 {
+		t.Fatalf("expected mute fault streak reset, got %d", e.mutedFaultStreak.Load())
+	}
+	if err := e.ResetFault(); err == nil {
+		t.Fatal("expected error when resetting after recovery")
+	}
+}
diff --git a/internal/control/control.go b/internal/control/control.go
index 823a8af..5509199 100644
--- 
a/internal/control/control.go
+++ b/internal/control/control.go
@@ -23,6 +23,7 @@ type TXController interface {
 	StopTX() error
 	TXStats() map[string]any
 	UpdateConfig(patch LivePatch) error
+	ResetFault() error
 }
 
 // LivePatch mirrors the patchable fields from ConfigPatch for the engine.
@@ -95,6 +96,7 @@ func (s *Server) Handler() http.Handler {
 	mux.HandleFunc("/dry-run", s.handleDryRun)
 	mux.HandleFunc("/config", s.handleConfig)
 	mux.HandleFunc("/runtime", s.handleRuntime)
+	mux.HandleFunc("/runtime/fault/reset", s.handleRuntimeFaultReset)
 	mux.HandleFunc("/tx/start", s.handleTXStart)
 	mux.HandleFunc("/tx/stop", s.handleTXStop)
 	mux.HandleFunc("/audio/stream", s.handleAudioStream)
@@ -171,6 +173,33 @@ func (s *Server) handleRuntime(w http.ResponseWriter, _ *http.Request) {
 	_ = json.NewEncoder(w).Encode(result)
 }
 
+// handleRuntimeFaultReset serves POST /runtime/fault/reset. It forwards the
+// request to the attached TX controller's ResetFault and answers {"ok": true}
+// on success, 405 for non-POST methods, 503 when no controller is attached,
+// and 409 carrying the controller's error text when the reset is refused.
+func (s *Server) handleRuntimeFaultReset(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		// RFC 9110 requires a 405 response to advertise the permitted methods.
+		w.Header().Set("Allow", http.MethodPost)
+		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+	// Snapshot the controller under the read lock so the call below runs unlocked.
+	s.mu.RLock()
+	tx := s.tx
+	s.mu.RUnlock()
+	if tx == nil {
+		http.Error(w, "tx controller not available", http.StatusServiceUnavailable)
+		return
+	}
+	if err := tx.ResetFault(); err != nil {
+		http.Error(w, err.Error(), http.StatusConflict)
+		return
+	}
+	w.Header().Set("Content-Type", "application/json")
+	_ = json.NewEncoder(w).Encode(map[string]any{"ok": true})
+}
+
 // handleAudioStream accepts raw S16LE stereo PCM via HTTP POST and pushes
 // it into the live audio ring buffer. 
Use with:
 // curl -X POST --data-binary @- http://host:8088/audio/stream < audio.raw
diff --git a/internal/control/control_test.go b/internal/control/control_test.go
index d810666..a42ca51 100644
--- a/internal/control/control_test.go
+++ b/internal/control/control_test.go
@@ -125,6 +125,59 @@ func TestRuntimeWithoutDriver(t *testing.T) {
 	}
 }
 
+// TestRuntimeFaultResetRejectsGet expects 405 when the endpoint is hit with GET.
+func TestRuntimeFaultResetRejectsGet(t *testing.T) {
+	srv := NewServer(cfgpkg.Default())
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/runtime/fault/reset", nil)
+	srv.Handler().ServeHTTP(rec, req)
+	if rec.Code != http.StatusMethodNotAllowed {
+		t.Fatalf("expected 405 for fault reset GET, got %d", rec.Code)
+	}
+}
+
+// TestRuntimeFaultResetRequiresController expects 503 when no TX controller is set.
+func TestRuntimeFaultResetRequiresController(t *testing.T) {
+	srv := NewServer(cfgpkg.Default())
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil)
+	srv.Handler().ServeHTTP(rec, req)
+	if rec.Code != http.StatusServiceUnavailable {
+		t.Fatalf("expected 503 without controller, got %d", rec.Code)
+	}
+}
+
+// TestRuntimeFaultResetControllerError expects 409 when the controller's ResetFault fails.
+func TestRuntimeFaultResetControllerError(t *testing.T) {
+	srv := NewServer(cfgpkg.Default())
+	srv.SetTXController(&fakeTXController{resetErr: errors.New("boom")})
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil)
+	srv.Handler().ServeHTTP(rec, req)
+	if rec.Code != http.StatusConflict {
+		t.Fatalf("expected 409 when controller rejects, got %d", rec.Code)
+	}
+}
+
+// TestRuntimeFaultResetSuccess expects 200 and an {"ok": true} body when the reset succeeds.
+func TestRuntimeFaultResetSuccess(t *testing.T) {
+	srv := NewServer(cfgpkg.Default())
+	srv.SetTXController(&fakeTXController{})
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil)
+	srv.Handler().ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("expected 200 on success, got %d", rec.Code)
+	}
+	var body map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
+		t.Fatalf("unmarshal response: %v", err)
+	}
+	if ok, _ := body["ok"].(bool); !ok {
+		t.Fatalf("expected ok true, got %v", body["ok"])
+	}
+}
+
 func TestAudioStreamRequiresSource(t *testing.T) {
 	srv := NewServer(cfgpkg.Default())
 	rec := httptest.NewRecorder()
@@ -239,6 +292,7 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) {
 
 type fakeTXController struct {
 	updateErr error
+	resetErr  error
 	stats     map[string]any
 }
 
@@ -251,3 +305,4 @@ func (f *fakeTXController) TXStats() map[string]any {
 	return map[string]any{}
 }
 func (f *fakeTXController) UpdateConfig(_ LivePatch) error { return f.updateErr }
+func (f *fakeTXController) ResetFault() error              { return f.resetErr }