diff --git a/docs/pro-runtime-hardening-workboard.md b/docs/pro-runtime-hardening-workboard.md index 053239f..0a46e29 100644 --- a/docs/pro-runtime-hardening-workboard.md +++ b/docs/pro-runtime-hardening-workboard.md @@ -273,6 +273,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ ## Fortschritt - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffers, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. +- `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. ## Zielzustände laut Konzept - `idle` diff --git a/internal/app/engine.go b/internal/app/engine.go index a62c11c..a33edc7 100644 --- a/internal/app/engine.go +++ b/internal/app/engine.go @@ -97,6 +97,7 @@ const ( const ( lateBufferIndicatorWindow = 5 * time.Second queueCriticalStreakThreshold = 3 + queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 faultRepeatWindow = 1 * time.Second faultHistoryCapacity = 8 ) @@ -666,7 +667,14 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo } critical := queue.Health == output.QueueHealthCritical if critical { - if e.criticalStreak.Add(1) >= queueCriticalStreakThreshold { + count := e.criticalStreak.Add(1) + if count >= queueMutedStreakThreshold { + e.recordFault(FaultReasonQueueCritical, FaultSeverityMuted, + fmt.Sprintf("queue health critical for %d consecutive checks (depth=%d)", count, queue.Depth)) + e.setRuntimeState(RuntimeStateMuted) + return + } + if count >= queueCriticalStreakThreshold { e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, fmt.Sprintf("queue health critical (depth=%d)", queue.Depth)) e.setRuntimeState(RuntimeStateDegraded) diff --git a/internal/app/runtime_state_test.go b/internal/app/runtime_state_test.go index b253183..744a36b 100644 --- a/internal/app/runtime_state_test.go +++ b/internal/app/runtime_state_test.go @@ -55,3 +55,28 @@ func TestEngineRuntimeStateTransitions(t *testing.T) { t.Fatalf("expected degraded when late buffers seen, got %s", got) } } + +func TestEngineRuntimeStateMuteOnPersistentQueueCritical(t *testing.T) { + e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) + e.setRuntimeState(RuntimeStateRunning) + + queue := output.QueueStats{Depth: 1, Health: output.QueueHealthCritical} + for i := 0; i < queueMutedStreakThreshold; i++ { + e.evaluateRuntimeState(queue, false) + } + + if got := e.currentRuntimeState(); got != RuntimeStateMuted { + t.Fatalf("expected muted after prolonged queue critical, got %s", got) + } + + last := e.LastFault() + if last == nil { + t.Fatal("expected fault recorded for the mute transition") + } + if last.Reason != FaultReasonQueueCritical { + t.Fatalf("expected queue critical reason, got %s", last.Reason) + } + if last.Severity != FaultSeverityMuted { + t.Fatalf("expected muted severity, got %s", last.Severity) + } +}