| @@ -274,6 +274,7 @@ Einführen eines klaren Betriebsmodells mit Fault-, Recovery- und Muted-Zuständ | |||||
| ## Fortschritt | ## Fortschritt | ||||
| - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffer, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. | - EngineStats liefert das Runtime-State-Feld (`idle`, `arming`, `prebuffering`, `running`) und reagiert nun auf Queue-Gesundheit bzw. späte Buffer, indem es bei `low`/`critical` oder späten Buffern in `degraded` wechselt und sonst auf `running` zurückkehrt. | ||||
| - `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. | - `evaluateRuntimeState` escalates persistent `critical` queues from `degraded` to `muted`, while `FaultReasonQueueCritical` surfaces `muted` severity so the mute transition stays observable. | ||||
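| - `evaluateRuntimeState` now leaves `muted` only after `queueMutedRecoveryThreshold` consecutive healthy checks (queue health normal and no late buffers); any unhealthy check in between resets the streak. Once the threshold is reached it records a `FaultReasonQueueCritical` event with degraded severity and moves to `degraded`. | |||||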
| ## Zielzustände laut Konzept | ## Zielzustände laut Konzept | ||||
| - `idle` | - `idle` | ||||
| @@ -98,6 +98,7 @@ const ( | |||||
| lateBufferIndicatorWindow = 5 * time.Second | lateBufferIndicatorWindow = 5 * time.Second | ||||
| queueCriticalStreakThreshold = 3 | queueCriticalStreakThreshold = 3 | ||||
| queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 | queueMutedStreakThreshold = queueCriticalStreakThreshold * 2 | ||||
| queueMutedRecoveryThreshold = queueCriticalStreakThreshold | |||||
| faultRepeatWindow = 1 * time.Second | faultRepeatWindow = 1 * time.Second | ||||
| faultHistoryCapacity = 8 | faultHistoryCapacity = 8 | ||||
| ) | ) | ||||
| @@ -123,20 +124,21 @@ type Engine struct { | |||||
| wg sync.WaitGroup | wg sync.WaitGroup | ||||
| runtimeState atomic.Value | runtimeState atomic.Value | ||||
| chunksProduced atomic.Uint64 | |||||
| totalSamples atomic.Uint64 | |||||
| underruns atomic.Uint64 | |||||
| lateBuffers atomic.Uint64 | |||||
| lateBufferAlertAt atomic.Uint64 | |||||
| criticalStreak atomic.Uint64 | |||||
| maxCycleNs atomic.Uint64 | |||||
| maxGenerateNs atomic.Uint64 | |||||
| maxUpsampleNs atomic.Uint64 | |||||
| maxWriteNs atomic.Uint64 | |||||
| lastError atomic.Value // string | |||||
| lastFault atomic.Value // *FaultEvent | |||||
| faultHistoryMu sync.Mutex | |||||
| faultHistory []FaultEvent | |||||
| chunksProduced atomic.Uint64 | |||||
| totalSamples atomic.Uint64 | |||||
| underruns atomic.Uint64 | |||||
| lateBuffers atomic.Uint64 | |||||
| lateBufferAlertAt atomic.Uint64 | |||||
| criticalStreak atomic.Uint64 | |||||
| mutedRecoveryStreak atomic.Uint64 | |||||
| maxCycleNs atomic.Uint64 | |||||
| maxGenerateNs atomic.Uint64 | |||||
| maxUpsampleNs atomic.Uint64 | |||||
| maxWriteNs atomic.Uint64 | |||||
| lastError atomic.Value // string | |||||
| lastFault atomic.Value // *FaultEvent | |||||
| faultHistoryMu sync.Mutex | |||||
| faultHistory []FaultEvent | |||||
| // Live config: pending frequency change, applied between chunks | // Live config: pending frequency change, applied between chunks | ||||
| pendingFreq atomic.Pointer[float64] | pendingFreq atomic.Pointer[float64] | ||||
| @@ -658,6 +660,18 @@ func (e *Engine) evaluateRuntimeState(queue output.QueueStats, hasLateBuffers bo | |||||
| switch state { | switch state { | ||||
| case RuntimeStateStopping, RuntimeStateFaulted: | case RuntimeStateStopping, RuntimeStateFaulted: | ||||
| return | return | ||||
| case RuntimeStateMuted: | |||||
| if queue.Health == output.QueueHealthNormal && !hasLateBuffers { | |||||
| if count := e.mutedRecoveryStreak.Add(1); count >= queueMutedRecoveryThreshold { | |||||
| e.mutedRecoveryStreak.Store(0) | |||||
| e.recordFault(FaultReasonQueueCritical, FaultSeverityDegraded, | |||||
| fmt.Sprintf("queue healthy for %d checks after mute", count)) | |||||
| e.setRuntimeState(RuntimeStateDegraded) | |||||
| } | |||||
| } else { | |||||
| e.mutedRecoveryStreak.Store(0) | |||||
| } | |||||
| return | |||||
| } | } | ||||
| if state == RuntimeStatePrebuffering { | if state == RuntimeStatePrebuffering { | ||||
| if queue.Depth >= 1 { | if queue.Depth >= 1 { | ||||
| @@ -69,14 +69,43 @@ func TestEngineRuntimeStateMuteOnPersistentQueueCritical(t *testing.T) { | |||||
| t.Fatalf("expected muted after prolonged queue critical, got %s", got) | t.Fatalf("expected muted after prolonged queue critical, got %s", got) | ||||
| } | } | ||||
| last := e.LastFault() | |||||
| if last == nil { | |||||
| muteFault := e.LastFault() | |||||
| if muteFault == nil { | |||||
| t.Fatal("expected fault recorded for the mute transition") | t.Fatal("expected fault recorded for the mute transition") | ||||
| } | } | ||||
| if last.Reason != FaultReasonQueueCritical { | |||||
| t.Fatalf("expected queue critical reason, got %s", last.Reason) | |||||
| if muteFault.Reason != FaultReasonQueueCritical { | |||||
| t.Fatalf("expected queue critical reason, got %s", muteFault.Reason) | |||||
| } | } | ||||
| if last.Severity != FaultSeverityMuted { | |||||
| t.Fatalf("expected muted severity, got %s", last.Severity) | |||||
| if muteFault.Severity != FaultSeverityMuted { | |||||
| t.Fatalf("expected muted severity, got %s", muteFault.Severity) | |||||
| } | |||||
| queue.Health = output.QueueHealthNormal | |||||
| for i := 0; i < queueMutedRecoveryThreshold-1; i++ { | |||||
| e.evaluateRuntimeState(queue, false) | |||||
| if got := e.currentRuntimeState(); got != RuntimeStateMuted { | |||||
| t.Fatalf("expected still muted while recovery window builds, got %s", got) | |||||
| } | |||||
| } | |||||
| e.evaluateRuntimeState(queue, false) | |||||
| if got := e.currentRuntimeState(); got != RuntimeStateDegraded { | |||||
| t.Fatalf("expected degrade once mute recovery threshold reached, got %s", got) | |||||
| } | |||||
| recoveryFault := e.LastFault() | |||||
| if recoveryFault == nil { | |||||
| t.Fatal("expected recovery fault entry after leaving mute") | |||||
| } | |||||
| if recoveryFault.Severity != FaultSeverityDegraded { | |||||
| t.Fatalf("expected degraded severity for recovery event, got %s", recoveryFault.Severity) | |||||
| } | |||||
| if recoveryFault.Reason != FaultReasonQueueCritical { | |||||
| t.Fatalf("expected queue critical reason for recovery event, got %s", recoveryFault.Reason) | |||||
| } | |||||
| e.evaluateRuntimeState(queue, false) | |||||
| if got := e.currentRuntimeState(); got != RuntimeStateRunning { | |||||
| t.Fatalf("expected running after recovery, got %s", got) | |||||
| } | } | ||||
| } | } | ||||