Преглед изворни кода

feat: add runtime fault reset path

tags/v0.9.0
Jan Svabenik пре 1 месец
родитељ
комит
f275e125a7
7 измењених фајлова са 144 додато и 0 уклоњено
  1. +4
    -0
      cmd/fmrtx/main.go
  2. +16
    -0
      docs/API.md
  3. +3
    -0
      docs/pro-runtime-hardening-workboard.md
  4. +14
    -0
      internal/app/engine.go
  5. +34
    -0
      internal/app/runtime_state_test.go
  6. +22
    -0
      internal/control/control.go
  7. +51
    -0
      internal/control/control_test.go

+ 4
- 0
cmd/fmrtx/main.go Прегледај датотеку

@@ -287,3 +287,7 @@ func (b *txBridge) UpdateConfig(lp ctrlpkg.LivePatch) error {
RadioText: lp.RadioText,
})
}

// ResetFault forwards a manual fault-reset request to the bridged engine and
// returns whatever error the engine's ResetFault reports.
func (b *txBridge) ResetFault() error {
return b.engine.ResetFault()
}

+ 16
- 0
docs/API.md Прегледај датотеку

@@ -79,6 +79,22 @@ Live engine and driver telemetry. Only populated when TX is active.

---

### `POST /runtime/fault/reset`

Manually acknowledge a `faulted` runtime state so the supervisor can re-enter the recovery path (the engine moves back to `degraded` once the reset succeeds).

**Response:**
```json
{"ok": true}
```

**Errors:**
- `405 Method Not Allowed` if the request is not a POST
- `503 Service Unavailable` when no TX controller is attached (`--tx` mode not active)
- `409 Conflict` when the engine is not currently faulted or the controller otherwise rejects the reset; the plain-text response body carries the reason

---

### `GET /config`

Full current configuration (all fields, including non-patchable).


+ 3
- 0
docs/pro-runtime-hardening-workboard.md Прегледај датотеку

@@ -278,6 +278,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ
- Persistent queue-critical streaks while `muted` now escalate to `faulted` with `FaultSeverityFaulted`, keeping `RuntimeStateFaulted` observable.
- `EngineStats` and `txBridge` now expose transition/fault counters plus `lastFault`, surfacing the new telemetry through `/runtime`.
- Control-plane UI now renders those WS-02 transition counters, fault count, and last-fault summary so operators can watch runtime escalations without digging through logs.
- Control-plane now exposes `POST /runtime/fault/reset` so operators can acknowledge `faulted` state; `TestRuntimeFaultReset*` covers the new HTTP path.


## Zielzustände laut Konzept
@@ -327,11 +328,13 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ
| Datum | Entscheidung | Notiz |
|---|---|---|
| 2026-04-05 | Faulted escalation on persistent critical queue | `muted` now surfaces `RuntimeStateFaulted` when queue health stays critical and metrics capture every transition. |
| 2026-04-05 | Manual fault reset endpoint | Added `POST /runtime/fault/reset` so operators can acknowledge `faulted` before the supervisor re-enters recovery. |

## WS-02 Verifikation
| Datum | Fokus | Ergebnis |
|---|---|---|
| 2026-04-05 | Faulted path + transition counters | `go test ./...` exercises `TestEngineFaultsAfterMutedCriticalStreak` and `TestRuntimeTransitionCounters`, while `/runtime` now surfaces `engine.degradedTransitions`, `engine.mutedTransitions`, `engine.faultedTransitions`, `engine.faultCount`, and the last fault via `txBridge`. |
| 2026-04-05 | Runtime fault reset API | `go test ./...` now runs `TestRuntimeFaultReset*`, verifying the new HTTP path and controller error scenarios. |

---



+ 14
- 0
internal/app/engine.go Прегледај датотеку

@@ -744,3 +744,17 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo
}
e.setRuntimeState(RuntimeStateRunning)
}

// ResetFault acknowledges a faulted engine and hands it back to the normal
// runtime evaluation by moving it to RuntimeStateDegraded.
//
// It returns an error naming the current state when the engine is not in
// RuntimeStateFaulted; in that case no counters or state are modified.
//
// NOTE(review): the state check and the later state write are separate calls;
// whether a concurrent transition can slip in between depends on locking that
// is not visible here — confirm.
func (e *Engine) ResetFault() error {
	if current := e.currentRuntimeState(); current != RuntimeStateFaulted {
		return fmt.Errorf("engine not in faulted state (current=%s)", current)
	}

	// Zero every streak counter before leaving the faulted state so the
	// degraded engine starts from a clean history.
	e.criticalStreak.Store(0)
	e.mutedRecoveryStreak.Store(0)
	e.mutedFaultStreak.Store(0)

	e.setRuntimeState(RuntimeStateDegraded)
	return nil
}

+ 34
- 0
internal/app/runtime_state_test.go Прегледај датотеку

@@ -179,3 +179,37 @@ func TestRuntimeTransitionCounters(t *testing.T) {
t.Fatalf("expected one recorded fault, got %d", got)
}
}


// TestEngineResetFaultRequiresFaultedState expects ResetFault to fail on a
// freshly constructed engine that was never put into the faulted state.
func TestEngineResetFaultRequiresFaultedState(t *testing.T) {
	engine := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil))

	err := engine.ResetFault()
	if err == nil {
		t.Fatal("expected error when resetting non-faulted state")
	}
}

// TestEngineResetFaultTransitionsToDegraded forces an engine into the faulted
// state with non-zero streak counters, then checks that ResetFault moves it to
// degraded, zeroes all three counters, and refuses a second reset.
func TestEngineResetFaultTransitionsToDegraded(t *testing.T) {
	engine := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil))

	// Seed distinct non-zero values so a missed reset is visible.
	engine.criticalStreak.Store(7)
	engine.mutedRecoveryStreak.Store(3)
	engine.mutedFaultStreak.Store(1)
	engine.setRuntimeState(RuntimeStateFaulted)

	err := engine.ResetFault()
	if err != nil {
		t.Fatalf("reset fault failed: %v", err)
	}

	if state := engine.currentRuntimeState(); state != RuntimeStateDegraded {
		t.Fatalf("expected degraded after reset, got %s", state)
	}
	if streak := engine.criticalStreak.Load(); streak != 0 {
		t.Fatalf("expected critical streak reset, got %d", streak)
	}
	if streak := engine.mutedRecoveryStreak.Load(); streak != 0 {
		t.Fatalf("expected mute recovery streak reset, got %d", streak)
	}
	if streak := engine.mutedFaultStreak.Load(); streak != 0 {
		t.Fatalf("expected mute fault streak reset, got %d", streak)
	}

	// The engine is degraded now, so a second reset must be rejected.
	if again := engine.ResetFault(); again == nil {
		t.Fatal("expected error when resetting after recovery")
	}
}

+ 22
- 0
internal/control/control.go Прегледај датотеку

@@ -23,6 +23,7 @@ type TXController interface {
StopTX() error
TXStats() map[string]any
UpdateConfig(patch LivePatch) error
ResetFault() error
}

// LivePatch mirrors the patchable fields from ConfigPatch for the engine.
@@ -95,6 +96,7 @@ func (s *Server) Handler() http.Handler {
mux.HandleFunc("/dry-run", s.handleDryRun)
mux.HandleFunc("/config", s.handleConfig)
mux.HandleFunc("/runtime", s.handleRuntime)
mux.HandleFunc("/runtime/fault/reset", s.handleRuntimeFaultReset)
mux.HandleFunc("/tx/start", s.handleTXStart)
mux.HandleFunc("/tx/stop", s.handleTXStop)
mux.HandleFunc("/audio/stream", s.handleAudioStream)
@@ -171,6 +173,26 @@ func (s *Server) handleRuntime(w http.ResponseWriter, _ *http.Request) {
_ = json.NewEncoder(w).Encode(result)
}

// handleRuntimeFaultReset serves POST /runtime/fault/reset.
//
// Responses:
//   - 405 with an "Allow: POST" header for any other method
//   - 503 when no TX controller is attached to the server
//   - 409 with the controller's error text when ResetFault fails
//   - 200 with the JSON body {"ok": true} on success
func (s *Server) handleRuntimeFaultReset(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// A 405 response must tell the client which methods are accepted
		// (RFC 9110 section 15.5.6).
		w.Header().Set("Allow", http.MethodPost)
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	// Snapshot the controller under the read lock so the reset call itself
	// runs without holding s.mu.
	s.mu.RLock()
	tx := s.tx
	s.mu.RUnlock()
	if tx == nil {
		http.Error(w, "tx controller not available", http.StatusServiceUnavailable)
		return
	}

	if err := tx.ResetFault(); err != nil {
		http.Error(w, err.Error(), http.StatusConflict)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	// The encode error is ignored: the status line is already committed, so
	// nothing useful can be reported to the client at this point.
	_ = json.NewEncoder(w).Encode(map[string]any{"ok": true})
}

// handleAudioStream accepts raw S16LE stereo PCM via HTTP POST and pushes
// it into the live audio ring buffer. Use with:
// curl -X POST --data-binary @- http://host:8088/audio/stream < audio.raw


+ 51
- 0
internal/control/control_test.go Прегледај датотеку

@@ -125,6 +125,55 @@ func TestRuntimeWithoutDriver(t *testing.T) {
}
}

// TestRuntimeFaultResetRejectsGet checks that a GET on the fault-reset route
// is answered with 405 Method Not Allowed.
func TestRuntimeFaultResetRejectsGet(t *testing.T) {
	server := NewServer(cfgpkg.Default())
	request := httptest.NewRequest(http.MethodGet, "/runtime/fault/reset", nil)
	recorder := httptest.NewRecorder()

	server.Handler().ServeHTTP(recorder, request)

	if status := recorder.Code; status != http.StatusMethodNotAllowed {
		t.Fatalf("expected 405 for fault reset GET, got %d", status)
	}
}

// TestRuntimeFaultResetRequiresController checks that a POST on the
// fault-reset route returns 503 when no TX controller has been set.
func TestRuntimeFaultResetRequiresController(t *testing.T) {
	server := NewServer(cfgpkg.Default())
	request := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil)
	recorder := httptest.NewRecorder()

	server.Handler().ServeHTTP(recorder, request)

	if status := recorder.Code; status != http.StatusServiceUnavailable {
		t.Fatalf("expected 503 without controller, got %d", status)
	}
}

func TestRuntimeFaultResetControllerError(t *testing.T) {
srv := NewServer(cfgpkg.Default())
srv.SetTXController(&fakeTXController{resetErr: errors.New("boom")})
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil)
srv.Handler().ServeHTTP(rec, req)
if rec.Code != http.StatusConflict {
t.Fatalf("expected 409 when controller rejects, got %d", rec.Code)
}
}

// TestRuntimeFaultResetSuccess checks the happy path of the fault-reset route:
// with a controller whose ResetFault succeeds, the handler must answer 200
// with a JSON content type and the body {"ok": true}.
func TestRuntimeFaultResetSuccess(t *testing.T) {
	srv := NewServer(cfgpkg.Default())
	srv.SetTXController(&fakeTXController{})
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodPost, "/runtime/fault/reset", nil)
	srv.Handler().ServeHTTP(rec, req)

	// Use the named constant, matching the sibling tests in this file.
	if rec.Code != http.StatusOK {
		t.Fatalf("expected 200 on success, got %d", rec.Code)
	}
	// The handler sets this header explicitly before encoding the body.
	if ct := rec.Header().Get("Content-Type"); ct != "application/json" {
		t.Fatalf("expected application/json content type, got %q", ct)
	}

	var body map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
		t.Fatalf("unmarshal response: %v", err)
	}
	// A missing or non-bool "ok" leaves ok false, so both cases fail here.
	if ok, _ := body["ok"].(bool); !ok {
		t.Fatalf("expected ok true, got %v", body["ok"])
	}
}

func TestAudioStreamRequiresSource(t *testing.T) {
srv := NewServer(cfgpkg.Default())
rec := httptest.NewRecorder()
@@ -239,6 +288,7 @@ func TestConfigPatchEngineRejectsDoesNotUpdateSnapshot(t *testing.T) {

// fakeTXController is a test double handed to Server.SetTXController; its
// methods return the canned values stored in these fields.
type fakeTXController struct {
updateErr error // returned by UpdateConfig
resetErr error // returned by ResetFault
stats map[string]any // NOTE(review): presumably served by TXStats; its full body is not visible here, confirm
}

@@ -251,3 +301,4 @@ func (f *fakeTXController) TXStats() map[string]any {
return map[string]any{}
}
// UpdateConfig ignores the patch and returns the preconfigured updateErr.
func (f *fakeTXController) UpdateConfig(_ LivePatch) error { return f.updateErr }
// ResetFault returns the preconfigured resetErr (nil means the reset succeeds).
func (f *fakeTXController) ResetFault() error { return f.resetErr }

Loading…
Откажи
Сачувај