Pārlūkot izejas kodu

Make runtime indicator drop stale late alerts

tags/v0.9.0
Jan Svabenik pirms 1 mēnesi
vecāks
revīzija
38a6cf3d70
3 mainītis faili ar 34 papildinājumiem un 23 dzēšanām
  1. +1
    -1
      docs/API.md
  2. +22
    -15
      internal/app/engine.go
  3. +11
    -7
      internal/app/runtime_indicator_test.go

+ 1
- 1
docs/API.md Parādīt failu

@@ -43,7 +43,7 @@ Current transmitter status (read-only snapshot). Runtime indicator, alert, and q
}
```

`runtimeIndicator` is derived from the engine queue health plus any late buffers and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", otherwise it stays empty.
`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal"; a late-buffer alert expires 5 seconds after the last late buffer, so once cycle times settle the signal does not stay stuck on "degraded". The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start, for post-mortem diagnosis.

---



+ 22
- 15
internal/app/engine.go Parādīt failu

@@ -80,6 +80,8 @@ const (
RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical"
)

const lateBufferIndicatorWindow = 5 * time.Second

// Engine is the continuous TX loop. It generates composite IQ in chunks,
// resamples to device rate, and pushes to hardware in a tight loop.
// The hardware buffer_push call is blocking — it returns when the hardware
@@ -100,15 +102,16 @@ type Engine struct {
startedAt time.Time
wg sync.WaitGroup

chunksProduced atomic.Uint64
totalSamples atomic.Uint64
underruns atomic.Uint64
lateBuffers atomic.Uint64
maxCycleNs atomic.Uint64
maxGenerateNs atomic.Uint64
maxUpsampleNs atomic.Uint64
maxWriteNs atomic.Uint64
lastError atomic.Value // string
chunksProduced atomic.Uint64
totalSamples atomic.Uint64
underruns atomic.Uint64
lateBuffers atomic.Uint64
lateBufferAlertAt atomic.Uint64
maxCycleNs atomic.Uint64
maxGenerateNs atomic.Uint64
maxUpsampleNs atomic.Uint64
maxWriteNs atomic.Uint64
lastError atomic.Value // string

// Live config: pending frequency change, applied between chunks
pendingFreq atomic.Pointer[float64]
@@ -351,7 +354,10 @@ func (e *Engine) Stats() EngineStats {

queue := e.frameQueue.Stats()
lateBuffers := e.lateBuffers.Load()
ri := runtimeIndicator(queue.Health, lateBuffers)
now := time.Now()
lateAlertAt := e.lateBufferAlertAt.Load()
hasRecentLateBuffers := lateAlertAt > 0 && now.Sub(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow
ri := runtimeIndicator(queue.Health, hasRecentLateBuffers)
return EngineStats{
State: state.String(),
ChunksProduced: e.chunksProduced.Load(),
@@ -366,26 +372,26 @@ func (e *Engine) Stats() EngineStats {
MaxWriteMs: durationMs(e.maxWriteNs.Load()),
Queue: queue,
RuntimeIndicator: ri,
RuntimeAlert: runtimeAlert(queue.Health, lateBuffers),
RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers),
}
}

func runtimeIndicator(queueHealth output.QueueHealth, lateBuffers uint64) RuntimeIndicator {
func runtimeIndicator(queueHealth output.QueueHealth, recentLateBuffers bool) RuntimeIndicator {
switch {
case queueHealth == output.QueueHealthCritical:
return RuntimeIndicatorQueueCritical
case queueHealth == output.QueueHealthLow || lateBuffers > 0:
case queueHealth == output.QueueHealthLow || recentLateBuffers:
return RuntimeIndicatorDegraded
default:
return RuntimeIndicatorNormal
}
}

func runtimeAlert(queueHealth output.QueueHealth, lateBuffers uint64) string {
func runtimeAlert(queueHealth output.QueueHealth, recentLateBuffers bool) string {
switch {
case queueHealth == output.QueueHealthCritical:
return "queue health critical"
case lateBuffers > 0:
case recentLateBuffers:
return "late buffers"
case queueHealth == output.QueueHealthLow:
return "queue health low"
@@ -484,6 +490,7 @@ func (e *Engine) writerLoop(ctx context.Context) {

if cycleDur > e.chunkDuration {
late := e.lateBuffers.Add(1)
e.lateBufferAlertAt.Store(uint64(time.Now().UnixNano()))
if late <= 5 || late%20 == 0 {
log.Printf("TX LATE: cycle=%s budget=%s write=%s over=%s",
cycleDur, e.chunkDuration, writeDur, cycleDur-e.chunkDuration)


+ 11
- 7
internal/app/runtime_indicator_test.go Parādīt failu

@@ -10,35 +10,39 @@ func TestRuntimeIndicatorAndAlert(t *testing.T) {
cases := []struct {
name string
health output.QueueHealth
lateBuffers uint64
recentLate bool
wantIndicator RuntimeIndicator
wantAlert string
}{
{
name: "queue critical",
health: output.QueueHealthCritical,
lateBuffers: 0,
wantIndicator: RuntimeIndicatorQueueCritical,
wantAlert: "queue health critical",
},
{
name: "queue low",
health: output.QueueHealthLow,
lateBuffers: 0,
wantIndicator: RuntimeIndicatorDegraded,
wantAlert: "queue health low",
},
{
name: "late buffers",
health: output.QueueHealthNormal,
lateBuffers: 2,
recentLate: true,
wantIndicator: RuntimeIndicatorDegraded,
wantAlert: "late buffers",
},
{
name: "late buffers override queue low",
health: output.QueueHealthLow,
recentLate: true,
wantIndicator: RuntimeIndicatorDegraded,
wantAlert: "late buffers",
},
{
name: "normal",
health: output.QueueHealthNormal,
lateBuffers: 0,
wantIndicator: RuntimeIndicatorNormal,
wantAlert: "",
},
@@ -48,11 +52,11 @@ func TestRuntimeIndicatorAndAlert(t *testing.T) {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
got := runtimeIndicator(tc.health, tc.lateBuffers)
got := runtimeIndicator(tc.health, tc.recentLate)
if got != tc.wantIndicator {
t.Fatalf("indicator: expected %s, got %s", tc.wantIndicator, got)
}
alert := runtimeAlert(tc.health, tc.lateBuffers)
alert := runtimeAlert(tc.health, tc.recentLate)
if alert != tc.wantAlert {
t.Fatalf("alert: expected %q, got %q", tc.wantAlert, alert)
}


Notiek ielāde…
Atcelt
Saglabāt