Przeglądaj źródła

Make runtime indicator drop stale late alerts

tags/v0.9.0
Jan Svabenik 1 miesiąc temu
rodzic
commit
38a6cf3d70
3 zmienionych plików z 34 dodań i 23 usunięć
  1. +1
    -1
      docs/API.md
  2. +22
    -15
      internal/app/engine.go
  3. +11
    -7
      internal/app/runtime_indicator_test.go

+ 1
- 1
docs/API.md Wyświetl plik

@@ -43,7 +43,7 @@ Current transmitter status (read-only snapshot). Runtime indicator, alert, and q
} }
``` ```


`runtimeIndicator` is derived from the engine queue health plus any late buffers and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", otherwise it stays empty.
`runtimeIndicator` is derived from the engine queue health plus any late buffers observed in the last 5 seconds and can be "normal", "degraded", or "queueCritical". `runtimeAlert` surfaces a short reason (e.g. "queue health low" or "late buffers") when the indicator is not "normal", but late-buffer alerts expire after 5 seconds once cycle times settle, so the signal doesn't stay stuck on degraded. The cumulative `lateBuffers` counter returned by `/runtime` still shows how many late cycles have occurred since start for post-mortem diagnosis.


--- ---




+ 22
- 15
internal/app/engine.go Wyświetl plik

@@ -80,6 +80,8 @@ const (
RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical" RuntimeIndicatorQueueCritical RuntimeIndicator = "queueCritical"
) )


const lateBufferIndicatorWindow = 5 * time.Second

// Engine is the continuous TX loop. It generates composite IQ in chunks, // Engine is the continuous TX loop. It generates composite IQ in chunks,
// resamples to device rate, and pushes to hardware in a tight loop. // resamples to device rate, and pushes to hardware in a tight loop.
// The hardware buffer_push call is blocking — it returns when the hardware // The hardware buffer_push call is blocking — it returns when the hardware
@@ -100,15 +102,16 @@ type Engine struct {
startedAt time.Time startedAt time.Time
wg sync.WaitGroup wg sync.WaitGroup


chunksProduced atomic.Uint64
totalSamples atomic.Uint64
underruns atomic.Uint64
lateBuffers atomic.Uint64
maxCycleNs atomic.Uint64
maxGenerateNs atomic.Uint64
maxUpsampleNs atomic.Uint64
maxWriteNs atomic.Uint64
lastError atomic.Value // string
chunksProduced atomic.Uint64
totalSamples atomic.Uint64
underruns atomic.Uint64
lateBuffers atomic.Uint64
lateBufferAlertAt atomic.Uint64
maxCycleNs atomic.Uint64
maxGenerateNs atomic.Uint64
maxUpsampleNs atomic.Uint64
maxWriteNs atomic.Uint64
lastError atomic.Value // string


// Live config: pending frequency change, applied between chunks // Live config: pending frequency change, applied between chunks
pendingFreq atomic.Pointer[float64] pendingFreq atomic.Pointer[float64]
@@ -351,7 +354,10 @@ func (e *Engine) Stats() EngineStats {


queue := e.frameQueue.Stats() queue := e.frameQueue.Stats()
lateBuffers := e.lateBuffers.Load() lateBuffers := e.lateBuffers.Load()
ri := runtimeIndicator(queue.Health, lateBuffers)
now := time.Now()
lateAlertAt := e.lateBufferAlertAt.Load()
hasRecentLateBuffers := lateAlertAt > 0 && now.Sub(time.Unix(0, int64(lateAlertAt))) <= lateBufferIndicatorWindow
ri := runtimeIndicator(queue.Health, hasRecentLateBuffers)
return EngineStats{ return EngineStats{
State: state.String(), State: state.String(),
ChunksProduced: e.chunksProduced.Load(), ChunksProduced: e.chunksProduced.Load(),
@@ -366,26 +372,26 @@ func (e *Engine) Stats() EngineStats {
MaxWriteMs: durationMs(e.maxWriteNs.Load()), MaxWriteMs: durationMs(e.maxWriteNs.Load()),
Queue: queue, Queue: queue,
RuntimeIndicator: ri, RuntimeIndicator: ri,
RuntimeAlert: runtimeAlert(queue.Health, lateBuffers),
RuntimeAlert: runtimeAlert(queue.Health, hasRecentLateBuffers),
} }
} }


func runtimeIndicator(queueHealth output.QueueHealth, lateBuffers uint64) RuntimeIndicator {
func runtimeIndicator(queueHealth output.QueueHealth, recentLateBuffers bool) RuntimeIndicator {
switch { switch {
case queueHealth == output.QueueHealthCritical: case queueHealth == output.QueueHealthCritical:
return RuntimeIndicatorQueueCritical return RuntimeIndicatorQueueCritical
case queueHealth == output.QueueHealthLow || lateBuffers > 0:
case queueHealth == output.QueueHealthLow || recentLateBuffers:
return RuntimeIndicatorDegraded return RuntimeIndicatorDegraded
default: default:
return RuntimeIndicatorNormal return RuntimeIndicatorNormal
} }
} }


func runtimeAlert(queueHealth output.QueueHealth, lateBuffers uint64) string {
func runtimeAlert(queueHealth output.QueueHealth, recentLateBuffers bool) string {
switch { switch {
case queueHealth == output.QueueHealthCritical: case queueHealth == output.QueueHealthCritical:
return "queue health critical" return "queue health critical"
case lateBuffers > 0:
case recentLateBuffers:
return "late buffers" return "late buffers"
case queueHealth == output.QueueHealthLow: case queueHealth == output.QueueHealthLow:
return "queue health low" return "queue health low"
@@ -484,6 +490,7 @@ func (e *Engine) writerLoop(ctx context.Context) {


if cycleDur > e.chunkDuration { if cycleDur > e.chunkDuration {
late := e.lateBuffers.Add(1) late := e.lateBuffers.Add(1)
e.lateBufferAlertAt.Store(uint64(time.Now().UnixNano()))
if late <= 5 || late%20 == 0 { if late <= 5 || late%20 == 0 {
log.Printf("TX LATE: cycle=%s budget=%s write=%s over=%s", log.Printf("TX LATE: cycle=%s budget=%s write=%s over=%s",
cycleDur, e.chunkDuration, writeDur, cycleDur-e.chunkDuration) cycleDur, e.chunkDuration, writeDur, cycleDur-e.chunkDuration)


+ 11
- 7
internal/app/runtime_indicator_test.go Wyświetl plik

@@ -10,35 +10,39 @@ func TestRuntimeIndicatorAndAlert(t *testing.T) {
cases := []struct { cases := []struct {
name string name string
health output.QueueHealth health output.QueueHealth
lateBuffers uint64
recentLate bool
wantIndicator RuntimeIndicator wantIndicator RuntimeIndicator
wantAlert string wantAlert string
}{ }{
{ {
name: "queue critical", name: "queue critical",
health: output.QueueHealthCritical, health: output.QueueHealthCritical,
lateBuffers: 0,
wantIndicator: RuntimeIndicatorQueueCritical, wantIndicator: RuntimeIndicatorQueueCritical,
wantAlert: "queue health critical", wantAlert: "queue health critical",
}, },
{ {
name: "queue low", name: "queue low",
health: output.QueueHealthLow, health: output.QueueHealthLow,
lateBuffers: 0,
wantIndicator: RuntimeIndicatorDegraded, wantIndicator: RuntimeIndicatorDegraded,
wantAlert: "queue health low", wantAlert: "queue health low",
}, },
{ {
name: "late buffers", name: "late buffers",
health: output.QueueHealthNormal, health: output.QueueHealthNormal,
lateBuffers: 2,
recentLate: true,
wantIndicator: RuntimeIndicatorDegraded,
wantAlert: "late buffers",
},
{
name: "late buffers override queue low",
health: output.QueueHealthLow,
recentLate: true,
wantIndicator: RuntimeIndicatorDegraded, wantIndicator: RuntimeIndicatorDegraded,
wantAlert: "late buffers", wantAlert: "late buffers",
}, },
{ {
name: "normal", name: "normal",
health: output.QueueHealthNormal, health: output.QueueHealthNormal,
lateBuffers: 0,
wantIndicator: RuntimeIndicatorNormal, wantIndicator: RuntimeIndicatorNormal,
wantAlert: "", wantAlert: "",
}, },
@@ -48,11 +52,11 @@ func TestRuntimeIndicatorAndAlert(t *testing.T) {
tc := tc tc := tc
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
t.Parallel() t.Parallel()
got := runtimeIndicator(tc.health, tc.lateBuffers)
got := runtimeIndicator(tc.health, tc.recentLate)
if got != tc.wantIndicator { if got != tc.wantIndicator {
t.Fatalf("indicator: expected %s, got %s", tc.wantIndicator, got) t.Fatalf("indicator: expected %s, got %s", tc.wantIndicator, got)
} }
alert := runtimeAlert(tc.health, tc.lateBuffers)
alert := runtimeAlert(tc.health, tc.recentLate)
if alert != tc.wantAlert { if alert != tc.wantAlert {
t.Fatalf("alert: expected %q, got %q", tc.wantAlert, alert) t.Fatalf("alert: expected %q, got %q", tc.wantAlert, alert)
} }


Ładowanie…
Anuluj
Zapisz