| @@ -273,6 +273,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ | |||||
| ## Fortschritt | ## Fortschritt | ||||
| - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffer, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. | - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffer, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. | ||||
| - `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. | |||||
| ## Zielzustände laut Konzept | ## Zielzustände laut Konzept | ||||
| - `idle` | - `idle` | ||||
| @@ -97,6 +97,7 @@ const ( | |||||
const (
	// lateBufferIndicatorWindow is presumably how long a late-buffer event
	// keeps the late indicator active — TODO confirm against the indicator
	// bookkeeping, which is outside this hunk.
	lateBufferIndicatorWindow = 5 * time.Second
	// queueCriticalStreakThreshold is the number of consecutive critical
	// queue-health checks before the engine records a degraded fault and
	// transitions to the degraded runtime state.
	queueCriticalStreakThreshold = 3
	// queueMutedStreakThreshold is the consecutive-critical count at which
	// the engine escalates from degraded to muted; kept as a multiple of
	// the degraded threshold so the two stay in proportion.
	queueMutedStreakThreshold = queueCriticalStreakThreshold * 2
	// faultRepeatWindow presumably suppresses duplicate fault records
	// within this span — verify against recordFault, not visible here.
	faultRepeatWindow = 1 * time.Second
	// faultHistoryCapacity bounds the retained fault history.
	faultHistoryCapacity = 8
)
| @@ -666,7 +667,14 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo | |||||
| } | } | ||||
| critical := queue.Health == output.QueueHealthCritical | critical := queue.Health == output.QueueHealthCritical | ||||
| if critical { | if critical { | ||||
| if e.criticalStreak.Add(1) >= queueCriticalStreakThreshold { | |||||
| count := e.criticalStreak.Add(1) | |||||
| if count >= queueMutedStreakThreshold { | |||||
| e.recordFault(FaultReasonQueueCritical, FaultSeverityMuted, | |||||
| fmt.Sprintf("queue health critical for %d consecutive checks (depth=%d)", count, queue.Depth)) | |||||
| e.setRuntimeState(RuntimeStateMuted) | |||||
| return | |||||
| } | |||||
| if count >= queueCriticalStreakThreshold { | |||||
| e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, | e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, | ||||
| fmt.Sprintf("queue health critical (depth=%d)", queue.Depth)) | fmt.Sprintf("queue health critical (depth=%d)", queue.Depth)) | ||||
| e.setRuntimeState(RuntimeStateDegraded) | e.setRuntimeState(RuntimeStateDegraded) | ||||
| @@ -55,3 +55,28 @@ func TestEngineRuntimeStateTransitions(t *testing.T) { | |||||
| t.Fatalf("expected degraded when late buffers seen, got %s", got) | t.Fatalf("expected degraded when late buffers seen, got %s", got) | ||||
| } | } | ||||
| } | } | ||||
| func TestEngineRuntimeStateMuteOnPersistentQueueCritical(t *testing.T) { | |||||
| e := NewEngine(cfgpkg.Default(), platform.NewSimulatedDriver(nil)) | |||||
| e.setRuntimeState(RuntimeStateRunning) | |||||
| queue := output.QueueStats{Depth: 1, Health: output.QueueHealthCritical} | |||||
| for i := 0; i < queueMutedStreakThreshold; i++ { | |||||
| e.evaluateRuntimeState(queue, false) | |||||
| } | |||||
| if got := e.currentRuntimeState(); got != RuntimeStateMuted { | |||||
| t.Fatalf("expected muted after prolonged queue critical, got %s", got) | |||||
| } | |||||
| last := e.LastFault() | |||||
| if last == nil { | |||||
| t.Fatal("expected fault recorded for the mute transition") | |||||
| } | |||||
| if last.Reason != FaultReasonQueueCritical { | |||||
| t.Fatalf("expected queue critical reason, got %s", last.Reason) | |||||
| } | |||||
| if last.Severity != FaultSeverityMuted { | |||||
| t.Fatalf("expected muted severity, got %s", last.Severity) | |||||
| } | |||||
| } | |||||