diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..648887f --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,299 @@ +# AGENTS.md + +This file is the repo-level working guide for humans, coding agents, and LLMs. +Read it before making changes. + +--- + +## 1. Purpose of this file + +Use this file as the canonical "how to work in this repo" guide. +It is intentionally practical and operational. + +Use it to answer questions like: +- Where should changes go? +- What must not be committed? +- How should builds/tests be run? +- Which docs are canonical? +- How should debugging work be documented? +- How should agents behave when touching this repo? + +--- + +## 2. Repo intent + +`sd r-wideband-suite` is a Go-based SDR analysis and streaming system with: +- live spectrum/waterfall UI +- signal detection/classification +- extraction / demodulation / recording +- GPU-assisted paths +- streaming audio paths +- extensive telemetry/debugging support + +This repo has gone through active streaming-path and audio-click debugging. +Do not assume older comments, notes, or experimental code paths are still authoritative. +Prefer current code, current docs in `docs/`, and current branch state over historical assumptions. + +--- + +## 3. 
Canonical documentation + +### Keep as primary references +- `README.md` + - high-level project overview + - build/run basics + - feature summary +- `ROADMAP.md` + - longer-lived architectural direction +- `docs/known-issues.md` + - curated open engineering issues +- `docs/telemetry-api.md` + - telemetry endpoint documentation +- `docs/telemetry-debug-runbook.md` + - telemetry/debug operating guide +- `docs/audio-click-debug-notes-2026-03-24.md` + - historical incident record and final resolution notes for the audio-click investigation + +### Treat as historical / contextual docs +Anything in `docs/` that reads like an incident log, deep debug note, or one-off investigation should be treated as supporting context, not automatic source of truth. + +### Do not create multiple competing issue lists +If new open problems are found: +- update `docs/known-issues.md` +- keep raw reviewer/ad-hoc reports out of the main repo flow unless they are converted into curated docs + +--- + +## 4. Branching and workflow rules + +### Current working model +- Use focused branches for real feature/fix work. +- Do not keep long-lived junk/debug branches alive once the useful work has been transferred. +- Prefer short-lived cleanup branches for docs/config cleanup. + +### Branch hygiene +- Do not pile unrelated work onto one branch if it can be split cleanly. +- Keep bugfixes, config cleanup, and large refactors logically separable when possible. +- Before deleting an old branch, ensure all useful work is already present in the active branch or merged into the main line. + +### Mainline policy +- Do not merge to `master` blindly. +- Before merge, prefer at least a short sanity pass on: + - live playback + - recording + - WFM / WFM_STEREO / at least one non-WFM mode if relevant + - restart behavior if the change affects runtime state + +--- + +## 5. 
Commit policy + +### Commit what matters +Good commits are: +- real code fixes +- clear docs improvements +- deliberate config-default changes +- cleanup that reduces confusion + +### Do not commit accidental noise +Do **not** commit unless explicitly intended: +- local debug dumps +- ad-hoc telemetry exports +- generated WAV debug windows +- temporary patch files +- throwaway reviewer JSON snapshots +- local-only runtime artifacts + +### Prefer small, readable commit scopes +Examples of good separate commit scopes: +- code fix +- config default cleanup +- doc cleanup +- known-issues update + +--- + +## 6. Files and paths that need extra care + +### Config files +- `config.yaml` +- `config.autosave.yaml` + +Rules: +- These can drift during debugging. +- Do not commit config changes accidentally. +- Only commit them when the intent is to change repo defaults. +- Keep in mind that `config.autosave.yaml` can override expected runtime behavior after restart. + +### Debug / dump artifacts +Examples: +- `debug/` +- `tele-*.json` +- ad-hoc patch/report scratch files +- generated WAV capture windows + +Rules: +- Treat these as local investigation material unless intentionally promoted into docs. +- Do not leave them hanging around as tracked repo clutter. + +### Root docs +The repo root should stay relatively clean. +Keep only genuinely canonical top-level docs there. +One-off investigation output belongs in `docs/` or should be deleted. + +--- + +## 7. Build and test rules + +### General rule +Prefer the repo's own scripts and established workflow over ad-hoc raw build commands. + +### Important operational rule +Before coding/build/test sessions on this repo: +- stop the browser UI +- stop `sdrd.exe` + +This avoids file locks, stale runtime state, and misleading live-test behavior. + +### Build preference +Use the project scripts where applicable, especially for the real app flows. 
+Examples already used during this project include: +- `build-sdrplay.ps1` +- `start-sdr.ps1` + +Do **not** default to random raw `go build` commands for full workflow validation unless the goal is a narrow compile-only sanity check. + +### GPU / native-path caution +If working on GPU/native streaming code: +- do not assume the CPU oracle path is currently trustworthy unless you have just validated it +- do not assume old README notes inside subdirectories are current +- check the current code and current docs first + +--- + +## 8. Debugging rules + +### Telemetry-first, but disciplined +Telemetry is available and useful. +However: +- heavy telemetry can distort runtime behavior +- debug config can accidentally persist via autosave +- not every one-off probe belongs in permanent code + +### When debugging +Prefer this order: +1. existing telemetry and current docs +2. focused additional instrumentation +3. short-lived dumps / captures +4. cleanup afterward + +### If you add debugging support +Ask: +- Is this reusable for future incidents? +- Should it live in `docs/known-issues.md` or a runbook? +- Is it temporary and should be removed after use? + +### If a reviewer provides a raw report +Do not blindly keep raw snapshots as canonical repo docs. +Instead: +- extract the durable findings +- update `docs/known-issues.md` +- keep only the cleaned/curated version in the main repo flow + +--- + +## 9. Documentation rules + +### Prefer curated docs over raw dumps +Good: +- `docs/known-issues.md` +- runbooks +- architectural notes +- incident summaries with clear final status + +Bad: +- random JSON reviewer dumps as primary docs +- duplicate issue lists +- stale TODO/STATE files that nobody maintains + +### If a doc becomes stale +Choose one: +- update it +- move it into `docs/` as historical context +- delete it + +Do not keep stale docs in prominent locations if they compete with current truth. + +--- + +## 10. 
Known lessons from recent work + +These are important enough to keep visible: + +### Audio-click investigation lessons +- The final click bug was not a single simple DSP bug. +- Real causes included: + - shared-buffer mutation / aliasing + - extractor reset churn from unstable config hashing + - streaming-path batch rejection / fallback behavior +- Secondary contributing issues existed in discriminator bridging and WFM mono/plain-path filtering. + +### Practical repo lessons +- Silent fallback paths are dangerous; keep important fallthrough/fallback visibility. +- Shared IQ buffers should be treated very carefully. +- Debug artifacts should not become permanent repo clutter. +- Curated issue tracking in Git is better than keeping raw review snapshots around. + +--- + +## 11. Agent behavior expectations + +If you are an AI coding agent / LLM working in this repo: + +### Do +- read this file first +- prefer current code and current docs over old assumptions +- keep changes scoped and explainable +- separate config cleanup from code fixes when possible +- leave the repo cleaner than you found it +- promote durable findings into curated docs + +### Do not +- commit local debug noise by default +- create duplicate status/todo/issue files without a strong reason +- assume experimental comments or old subdirectory READMEs are still correct +- leave raw reviewer output as the only source of truth +- hide fallback behavior or silently ignore critical path failures + +--- + +## 12. Recommended doc update pattern after meaningful work + +When a meaningful fix or investigation lands: +1. update code +2. update any relevant canonical docs +3. update `docs/known-issues.md` if open issues changed +4. remove or archive temporary debug artifacts +5. keep the repo root and branch state clean + +--- + +## 13. Minimal pre-commit checklist + +Before committing, quickly check: +- Am I committing only intended files? +- Are config changes intentional? 
+- Am I accidentally committing dumps/logs/debug exports? +- Should any reviewer findings be moved into `docs/known-issues.md`? +- Did I leave stale temporary files behind? + +--- + +## 14. If unsure + +If a file looks ambiguous: +- canonical + actively maintained -> keep/update +- historical but useful -> move or keep in `docs/` +- stale and confusing -> delete + +Clarity beats nostalgia. diff --git a/README.md b/README.md index a2625ec..d872a1f 100644 --- a/README.md +++ b/README.md @@ -192,6 +192,32 @@ go build -tags sdrplay ./cmd/sdrd - `GET /api/signals` -> current live signals - `GET /api/events?limit=&since=` -> recent events +### Debug Telemetry +- `GET /api/debug/telemetry/live` -> current telemetry snapshot (counters, gauges, distributions, recent events, collector status/config) +- `GET /api/debug/telemetry/history` -> historical metric samples with filtering by time/name/prefix/tags +- `GET /api/debug/telemetry/events` -> telemetry event/anomaly history with filtering by time/name/prefix/level/tags +- `GET /api/debug/telemetry/config` -> current collector config plus `debug.telemetry` runtime config +- `POST /api/debug/telemetry/config` -> update telemetry settings at runtime and persist them to autosave config + +Telemetry query params (`history` / `events`) include: +- `since`, `until` -> unix seconds, unix milliseconds, or RFC3339 timestamps +- `limit` +- `name`, `prefix` +- `signal_id`, `session_id`, `stage`, `trace_id`, `component` +- `tag_=` for arbitrary tag filters +- `include_persisted=true|false` (default `true`) +- `level` on the events endpoint + +Telemetry config lives under `debug.telemetry`: +- `enabled`, `heavy_enabled`, `heavy_sample_every` +- `metric_sample_every`, `metric_history_max`, `event_history_max` +- `retention_seconds` +- `persist_enabled`, `persist_dir`, `rotate_mb`, `keep_files` + +See also: +- `docs/telemetry-api.md` for the full telemetry API reference +- `docs/telemetry-debug-runbook.md` for the short operational debug 
flow + ### Recordings - `GET /api/recordings` - `GET /api/recordings/:id` (meta.json) diff --git a/STATE.md b/STATE.md deleted file mode 100644 index f3487ef..0000000 --- a/STATE.md +++ /dev/null @@ -1,184 +0,0 @@ -# SDR Wideband Suite - Current State - -This file is the practical handoff / resume state for future work. -Use it together with `ROADMAP.md`. - -- `ROADMAP.md` = long-term architecture and phase roadmap -- `STATE.md` = current repo state, working conventions, and next recommended entry point - -## Current Milestone State - -- **Phase 1 complete** -- **Phase 2 complete** -- **Phase 3 complete** -- **Phase 4 complete** - -Current project state should be treated as: -- Phase 1 = architecture foundation landed -- Phase 2 = multi-resolution surveillance semantics landed -- Phase 3 = conservative runtime prioritization/admission/rebalance landed -- Phase 4 = monitor-window operating model landed - -Do not reopen these phases unless there is a concrete bug, mismatch, or regression. 
- ---- - -## Most Recent Relevant Commits - -These are the most important recent milestone commits that define the current state: - -### Phase 4 monitor-window operating model -- `efe137b` Add monitor window goals for multi-span gating -- `ac64d6b` Add monitor window matches and stats -- `d7e457d` Expose monitor window summaries in runtime debug -- `c520423` Add monitor window priority bias -- `838c941` Add window-based record/decode actions -- `962cf06` Add window zone biases for record/decode actions -- `402a772` Consolidate monitor window summary in debug outputs -- `8545b62` Add per-window outcome summaries for admission pressure -- `65b9845` test: cover overlapping monitor windows -- `efe3215` docs: capture Phase-4 monitor-window status - -### Phase 3 runtime intelligence milestone -- `4ebd51d` Add priority tiers and admission classes to pipeline -- `18b179b` Expose admission metadata in debug output and tests -- `ba9adca` Add budget preference and pressure modeling -- `7a75367` Expose arbitration pressure summary -- `592fa03` pipeline: deepen hold/displacement semantics -- `30a5d11` pipeline: apply intent holds and family tier floors -- `1f5d4ab` pipeline: add intent and family priority tests -- `822829c` Add conservative budget rebalance layer -- `da5fa22` Update Phase-3 Wave 3E status - -### Documentation / stable defaults -- `fd718d5` docs: finalize phase milestones and ukf test config - -If resuming after a long pause, inspect the current `git log` around these commits first. 
- ---- - -## Current Important Files / Subsystems - -### Long-term guidance -- `ROADMAP.md` - durable roadmap across phases -- `STATE.md` - practical resume/handoff state -- `PLAN.md` - project plan / narrative (may be less pristine than ROADMAP.md) -- `README.md` - user-facing/current feature status - -### Config / runtime surface -- `config.yaml` - current committed default config -- `config.autosave.yaml` - local autosave; intentionally not tracked in git -- `internal/config/config.go` -- `internal/runtime/runtime.go` - -### Phase 3 core runtime intelligence -- `internal/pipeline/arbiter.go` -- `internal/pipeline/arbitration.go` -- `internal/pipeline/arbitration_state.go` -- `internal/pipeline/priority.go` -- `internal/pipeline/budget.go` -- `internal/pipeline/pressure.go` -- `internal/pipeline/rebalance.go` -- `internal/pipeline/decision_queue.go` - -### Phase 2 surveillance/evidence model -- `internal/pipeline/types.go` -- `internal/pipeline/evidence.go` -- `internal/pipeline/candidate_fusion.go` -- `internal/pipeline/scheduler.go` -- `cmd/sdrd/pipeline_runtime.go` - -### Phase 4 monitor-window model -- `internal/pipeline/monitor_rules.go` -- `cmd/sdrd/window_summary.go` -- `cmd/sdrd/level_summary.go` -- `cmd/sdrd/http_handlers.go` -- `cmd/sdrd/decision_compact.go` -- `cmd/sdrd/dsp_loop.go` - ---- - -## Current Default Operator / Test Posture - -The repo was intentionally switched to an FM/UKW-friendly default test posture. 
- -### Current committed config defaults -- band: `87.5-108.0 MHz` -- center: `99.5 MHz` -- sample rate: `2.048 MHz` -- FFT: `4096` -- profile: `wideband-balanced` -- intent: `broadcast-monitoring` -- priorities include `wfm`, `rds`, `broadcast`, `digital` - -### Important config note -- `config.yaml` is committed and intended as the stable default reference -- `config.autosave.yaml` is **not** git-tracked and may diverge locally -- if behavior seems odd, compare the active runtime config against `config.yaml` - ---- - -## Working Conventions That Matter - -### Codex invocation on Windows -Preferred stable flow: -1. write prompt to `codex_prompt.txt` -2. create/use `run_codex.ps1` containing: - - read prompt file - - pipe to `codex exec --yolo` -3. run with PTY/background from the repo root -4. remove `codex_prompt.txt` and `run_codex.ps1` after the run - -This was adopted specifically to avoid PowerShell quoting failures. - -### Expectations for coding runs -- before every commit: `go test ./...` and `go build ./cmd/sdrd` -- commit in coherent blocks with clear messages -- push after successful validation -- avoid reopening already-closed phase work without a concrete reason - ---- - -## Known Practical Caveats - -- `PLAN.md` has had encoding/character issues in some reads; treat `ROADMAP.md` + `STATE.md` as the cleaner authoritative continuity docs. -- README is generally useful, but `ROADMAP.md`/`STATE.md` are better for architectural continuity. -- `config.autosave.yaml` can become misleading because it is local/autosaved and not tracked. - ---- - -## Recommended Next Entry Point - -If resuming technical work after this checkpoint: - -### Start with **Phase 5** -Do **not** reopen Phase 1-4 unless there is a concrete bug or regression. 
- -### Recommended Phase 5 direction -Move from monitor windows inside a single capture span toward richer span / operating orchestration: -- span / zone groups -- span-aware resource allocation -- stronger profile-driven operating modes -- retune / scan / dwell semantics where needed - -### Avoid jumping ahead prematurely to -- full adaptive QoS engine (Phase 6) -- major GPU/performance re-architecture (Phase 7) -- heavy UX/product polish (Phase 8) - -Those should build on Phase 5, not bypass it. - ---- - -## Resume Checklist For A Future Agent - -1. Read `ROADMAP.md` -2. Read `STATE.md` -3. Check current `git log` near the commits listed above -4. Inspect `config.yaml` -5. Confirm current repo state with: - - `go test ./...` - - `go build ./cmd/sdrd` -6. Then start Phase 5 planning from the actual repo state - -If these steps still match the repo, continuation should be seamless enough even after a hard context reset. diff --git a/TODO.md b/TODO.md deleted file mode 100644 index eb3a9ca..0000000 --- a/TODO.md +++ /dev/null @@ -1,23 +0,0 @@ -# TODO — SDR Visual Suite - -## UI -- [ ] RDS RadioText (RT) Anzeige hinzufügen: - - Overlay: 1 Zeile, sanfter Fade bei Updates, Ellipsis bei Überlänge, optional kleines „RT“-Badge. - - Detail-Panel: 2 Zeilen Auto-Wrap; bei Überlänge Ellipsis + Expand (Modal/Zone) für Volltext. - - Update-Logik: RT nur bei stabilem Text (z. B. 2–3 identische Blöcke), optional „RT · HH:MM“ Timestamp. - -## Band Settings Profiles (v1.2) -- [ ] Backend: built-in Profile-Struktur + embedded JSON (6 Profile) -- [ ] Backend: Apply-Helper (shared mit /api/config) inkl. 
source/dsp/save -- [ ] Backend: Merge-Patch mit Feld-Präsenz (nur explizite Felder anwenden) -- [ ] Backend: DisallowUnknownFields + Config-Validierung → 400 -- [ ] Backend: Endpoints GET /api/profiles, POST /api/profiles/apply, POST /api/profiles/undo, GET /api/profiles/suggest -- [ ] Backend: Undo-Snapshot (1 Level) + Active Profile ID (Runtime-State) -- [ ] Optional: Active Profile ID über Neustart persistieren (falls gewünscht) -- [ ] UI: Dropdown + Split-Apply (full/dsp_only) + Undo + Active-Badge -- [ ] UI: Suggest-Toast bei center_hz Wechsel, Dismiss-Schutz (>5 MHz) -- [ ] UX: Loading-Indicator während Profilwechsel (1–3s Reset) -- [ ] Tests: Patch-Semantik, dsp_only (center_hz/gain_db bleiben), Unknown Fields, Suggest-Match - -## Notes -- Ab jetzt hier die Todo-Liste führen. diff --git a/build-gpudemod-dll.ps1 b/build-gpudemod-dll.ps1 index da19b08..4e095c2 100644 --- a/build-gpudemod-dll.ps1 +++ b/build-gpudemod-dll.ps1 @@ -16,12 +16,25 @@ if (!(Test-Path $outDir)) { New-Item -ItemType Directory -Path $outDir | Out-Nul Remove-Item $dll,$lib,$exp -Force -ErrorAction SilentlyContinue -$cmd = @" -call "$vcvars" && "$nvcc" -shared "$src" -o "$dll" -cudart=hybrid -Xcompiler "/MD" -arch=sm_75 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 +$bat = Join-Path $env:TEMP 'build-gpudemod-dll.bat' +$batContent = @" +@echo off +call "$vcvars" +if errorlevel 1 exit /b %errorlevel% +"$nvcc" -shared "$src" -o "$dll" -cudart=hybrid -Xcompiler "/MD" -arch=sm_75 ^ + -gencode arch=compute_75,code=sm_75 ^ + -gencode arch=compute_80,code=sm_80 ^ + -gencode arch=compute_86,code=sm_86 ^ + -gencode arch=compute_89,code=sm_89 ^ + -gencode arch=compute_90,code=sm_90 +exit /b %errorlevel% "@ +Set-Content -Path $bat -Value $batContent -Encoding ASCII Write-Host 'Building gpudemod CUDA DLL...' 
-ForegroundColor Cyan -cmd.exe /c $cmd -if ($LASTEXITCODE -ne 0) { throw 'gpudemod DLL build failed' } +cmd.exe /c ""$bat"" +$exitCode = $LASTEXITCODE +Remove-Item $bat -Force -ErrorAction SilentlyContinue +if ($exitCode -ne 0) { throw 'gpudemod DLL build failed' } Write-Host "Built: $dll" -ForegroundColor Green diff --git a/build-sdrplay.ps1 b/build-sdrplay.ps1 index 5f5e2bb..89c5507 100644 --- a/build-sdrplay.ps1 +++ b/build-sdrplay.ps1 @@ -21,10 +21,13 @@ if (Test-Path $sdrplayBin) { $env:PATH = "$sdrplayBin;" + $env:PATH } # CUDA runtime / cuFFT $cudaInc = 'C:\CUDA\include' $cudaBin = 'C:\CUDA\bin' +$cudaBinX64 = 'C:\CUDA\bin\x64' if (-not (Test-Path $cudaInc)) { $cudaInc = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\include' } if (-not (Test-Path $cudaBin)) { $cudaBin = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\bin' } +if (-not (Test-Path $cudaBinX64)) { $cudaBinX64 = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\bin\x64' } $cudaMingw = Join-Path $PSScriptRoot 'cuda-mingw' if (Test-Path $cudaInc) { $env:CGO_CFLAGS = "$env:CGO_CFLAGS -I$cudaInc" } +if (Test-Path $cudaBinX64) { $env:PATH = "$cudaBinX64;" + $env:PATH } if (Test-Path $cudaBin) { $env:PATH = "$cudaBin;" + $env:PATH } if (Test-Path $cudaMingw) { $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw -lcudart64_13 -lcufft64_12 -lkernel32" } @@ -68,8 +71,11 @@ if ($dllSrc) { } $cudartCandidates = @( + (Join-Path $cudaBinX64 'cudart64_13.dll'), (Join-Path $cudaBin 'cudart64_13.dll'), + 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin\x64\cudart64_13.dll', 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin\cudart64_13.dll', + 'C:\CUDA\bin\x64\cudart64_13.dll', 'C:\CUDA\bin\cudart64_13.dll' ) $cudartSrc = $cudartCandidates | Where-Object { $_ -and (Test-Path $_) } | Select-Object -First 1 diff --git a/cmd/sdrd/dsp_loop.go b/cmd/sdrd/dsp_loop.go index f8149a8..be6395e 100644 --- a/cmd/sdrd/dsp_loop.go +++ b/cmd/sdrd/dsp_loop.go @@ -3,6 +3,7 @@ package main import ( "context" "encoding/json" + "fmt" "log" "os" 
"runtime/debug" @@ -16,15 +17,16 @@ import ( "sdr-wideband-suite/internal/logging" "sdr-wideband-suite/internal/pipeline" "sdr-wideband-suite/internal/recorder" + "sdr-wideband-suite/internal/telemetry" ) -func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det *detector.Detector, window []float64, h *hub, eventFile *os.File, eventMu *sync.RWMutex, updates <-chan dspUpdate, gpuState *gpuStatus, rec *recorder.Manager, sigSnap *signalSnapshot, extractMgr *extractionManager, phaseSnap *phaseSnapshot) { +func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det *detector.Detector, window []float64, h *hub, eventFile *os.File, eventMu *sync.RWMutex, updates <-chan dspUpdate, gpuState *gpuStatus, rec *recorder.Manager, sigSnap *signalSnapshot, extractMgr *extractionManager, phaseSnap *phaseSnapshot, coll *telemetry.Collector) { defer func() { if r := recover(); r != nil { log.Printf("FATAL: runDSP goroutine panic: %v\n%s", r, debug.Stack()) } }() - rt := newDSPRuntime(cfg, det, window, gpuState) + rt := newDSPRuntime(cfg, det, window, gpuState, coll) ticker := time.NewTicker(cfg.FrameInterval()) defer ticker.Stop() logTicker := time.NewTicker(5 * time.Second) @@ -33,6 +35,9 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * dcBlocker := dsp.NewDCBlocker(0.995) state := &phaseState{} var frameID uint64 + prevDisplayed := map[int64]detector.Signal{} + lastSourceDrops := uint64(0) + lastSourceResets := uint64(0) for { select { case <-ctx.Done(): @@ -40,11 +45,28 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * case <-logTicker.C: st := srcMgr.Stats() log.Printf("stats: buf=%d drop=%d reset=%d last=%dms", st.BufferSamples, st.Dropped, st.Resets, st.LastSampleAgoMs) + if coll != nil { + coll.SetGauge("source.buffer_samples", float64(st.BufferSamples), nil) + coll.SetGauge("source.last_sample_ago_ms", float64(st.LastSampleAgoMs), nil) + if st.Dropped > 
lastSourceDrops { + coll.IncCounter("source.drop.count", float64(st.Dropped-lastSourceDrops), nil) + } + if st.Resets > lastSourceResets { + coll.IncCounter("source.reset.count", float64(st.Resets-lastSourceResets), nil) + coll.Event("source_reset", "warn", "source reset observed", nil, map[string]any{"resets": st.Resets}) + } + lastSourceDrops = st.Dropped + lastSourceResets = st.Resets + } case upd := <-updates: rt.applyUpdate(upd, srcMgr, rec, gpuState) dcBlocker.Reset() ticker.Reset(rt.cfg.FrameInterval()) + if coll != nil { + coll.IncCounter("dsp.update.apply", 1, nil) + } case <-ticker.C: + frameStart := time.Now() frameID++ art, err := rt.captureSpectrum(srcMgr, rec, dcBlocker, gpuState) if err != nil { @@ -61,8 +83,19 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * rt.gotSamples = true } logging.Debug("trace", "capture_done", "trace", frameID, "allIQ", len(art.allIQ), "detailIQ", len(art.detailIQ)) + if coll != nil { + coll.Observe("stage.capture.duration_ms", float64(time.Since(frameStart).Microseconds())/1000.0, telemetry.TagsFromPairs("frame_id", fmt.Sprintf("%d", frameID))) + } + survStart := time.Now() state.surveillance = rt.buildSurveillanceResult(art) + if coll != nil { + coll.Observe("stage.surveillance.duration_ms", float64(time.Since(survStart).Microseconds())/1000.0, telemetry.TagsFromPairs("frame_id", fmt.Sprintf("%d", frameID))) + } + refineStart := time.Now() state.refinement = rt.runRefinement(art, state.surveillance, extractMgr, rec) + if coll != nil { + coll.Observe("stage.refinement.duration_ms", float64(time.Since(refineStart).Microseconds())/1000.0, telemetry.TagsFromPairs("frame_id", fmt.Sprintf("%d", frameID))) + } finished := state.surveillance.Finished thresholds := state.surveillance.Thresholds noiseFloor := state.surveillance.NoiseFloor @@ -75,11 +108,44 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * streamSignals = stableSignals } if rec != nil && 
len(art.allIQ) > 0 { + if art.streamDropped { + rt.streamOverlap = &streamIQOverlap{} + for k := range rt.streamPhaseState { + rt.streamPhaseState[k].phase = 0 + } + resetStreamingOracleRunner() + rec.ResetStreams() + logging.Warn("gap", "iq_dropped", "msg", "buffer bloat caused extraction drop; overlap reset") + if coll != nil { + coll.IncCounter("capture.stream_reset", 1, nil) + coll.Event("iq_dropped", "warn", "stream overlap reset after dropped IQ", nil, map[string]any{"frame_id": frameID}) + } + } if rt.cfg.Recorder.DebugLiveAudio { log.Printf("LIVEAUDIO DSP: detailIQ=%d displaySignals=%d streamSignals=%d stableSignals=%d allIQ=%d", len(art.detailIQ), len(displaySignals), len(streamSignals), len(stableSignals), len(art.allIQ)) } aqCfg := extractionConfig{firTaps: rt.cfg.Recorder.ExtractionTaps, bwMult: rt.cfg.Recorder.ExtractionBwMult} - streamSnips, streamRates := extractForStreaming(extractMgr, art.allIQ, rt.cfg.SampleRate, rt.cfg.CenterHz, streamSignals, rt.streamPhaseState, rt.streamOverlap, aqCfg) + extractStart := time.Now() + streamSnips, streamRates := extractForStreaming(extractMgr, art.allIQ, rt.cfg.SampleRate, rt.cfg.CenterHz, streamSignals, rt.streamPhaseState, rt.streamOverlap, aqCfg, rt.telemetry) + if coll != nil { + coll.Observe("stage.extract_stream.duration_ms", float64(time.Since(extractStart).Microseconds())/1000.0, telemetry.TagsFromPairs("frame_id", fmt.Sprintf("%d", frameID))) + coll.SetGauge("stage.extract_stream.signals", float64(len(streamSignals)), nil) + if coll.ShouldSampleHeavy() { + for i := range streamSnips { + if i >= len(streamSignals) { + break + } + tags := telemetry.TagsFromPairs( + "signal_id", fmt.Sprintf("%d", streamSignals[i].ID), + "stage", "extract_stream", + ) + coll.SetGauge("iq.stage.extract.length", float64(len(streamSnips[i])), tags) + if len(streamSnips[i]) > 0 { + observeIQStats(coll, "extract_stream", streamSnips[i], tags) + } + } + } + } nonEmpty := 0 minLen := 0 maxLen := 0 @@ -127,10 +193,18 @@ func 
runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * log.Printf("LIVEAUDIO DSP: feedItems=%d", len(items)) } if len(items) > 0 { + feedStart := time.Now() rec.FeedSnippets(items, frameID) + if coll != nil { + coll.Observe("stage.feed_enqueue.duration_ms", float64(time.Since(feedStart).Microseconds())/1000.0, telemetry.TagsFromPairs("frame_id", fmt.Sprintf("%d", frameID))) + coll.SetGauge("stage.feed.items", float64(len(items)), nil) + } logging.Debug("trace", "feed", "trace", frameID, "items", len(items), "signals", len(streamSignals), "allIQ", len(art.allIQ)) } else { logging.Warn("gap", "feed_empty", "signals", len(streamSignals), "trace", frameID) + if coll != nil { + coll.IncCounter("stage.feed.empty", 1, nil) + } } } rt.maintenance(displaySignals, rec) @@ -156,6 +230,27 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * if sigSnap != nil { sigSnap.set(displaySignals) } + if coll != nil { + coll.SetGauge("signals.display.count", float64(len(displaySignals)), nil) + current := make(map[int64]detector.Signal, len(displaySignals)) + for _, s := range displaySignals { + current[s.ID] = s + if _, ok := prevDisplayed[s.ID]; !ok { + coll.Event("signal_create", "info", "signal entered display set", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", s.ID)), map[string]any{ + "center_hz": s.CenterHz, + "bw_hz": s.BWHz, + }) + } + } + for id, prev := range prevDisplayed { + if _, ok := current[id]; !ok { + coll.Event("signal_remove", "info", "signal left display set", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", id)), map[string]any{ + "center_hz": prev.CenterHz, + }) + } + } + prevDisplayed = current + } eventMu.Lock() for _, ev := range finished { _ = enc.Encode(ev) @@ -244,6 +339,9 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * debugInfo.Refinement = refinementDebug } h.broadcast(SpectrumFrame{Timestamp: art.now.UnixMilli(), CenterHz: rt.cfg.CenterHz, SampleHz: 
rt.cfg.SampleRate, FFTSize: rt.cfg.FFTSize, Spectrum: art.surveillanceSpectrum, Signals: displaySignals, Debug: debugInfo}) + if coll != nil { + coll.Observe("dsp.frame.duration_ms", float64(time.Since(frameStart).Microseconds())/1000.0, nil) + } } } } diff --git a/cmd/sdrd/helpers.go b/cmd/sdrd/helpers.go index 76524e9..905af0a 100644 --- a/cmd/sdrd/helpers.go +++ b/cmd/sdrd/helpers.go @@ -1,10 +1,13 @@ package main import ( + "fmt" "log" "math" + "os" "sort" "strconv" + "strings" "time" "sdr-wideband-suite/internal/config" @@ -12,6 +15,7 @@ import ( "sdr-wideband-suite/internal/detector" "sdr-wideband-suite/internal/dsp" "sdr-wideband-suite/internal/logging" + "sdr-wideband-suite/internal/telemetry" ) func mustParseDuration(raw string, fallback time.Duration) time.Duration { @@ -227,15 +231,30 @@ type extractionConfig struct { const streamOverlapLen = 512 // must be >= FIR tap count with margin const ( - wfmStreamOutRate = 500000 + wfmStreamOutRate = 512000 wfmStreamMinBW = 250000 ) +var forceCPUStreamExtract = func() bool { + raw := strings.TrimSpace(os.Getenv("SDR_FORCE_CPU_STREAM_EXTRACT")) + if raw == "" { + return false + } + v, err := strconv.ParseBool(raw) + if err != nil { + return false + } + return v +}() + // extractForStreaming performs GPU-accelerated extraction with: // - Per-signal phase-continuous FreqShift (via PhaseStart in ExtractJob) // - IQ overlap prepended to allIQ so FIR kernel has real data in halo // // Returns extracted snippets with overlap trimmed, and updates phase state. +// extractForStreaming is the current legacy production path. +// It still relies on overlap-prepend + trim semantics and is intentionally +// kept separate from the new streaming refactor/oracle path under development. 
func extractForStreaming( extractMgr *extractionManager, allIQ []complex64, @@ -245,7 +264,57 @@ func extractForStreaming( phaseState map[int64]*streamExtractState, overlap *streamIQOverlap, aqCfg extractionConfig, + coll *telemetry.Collector, ) ([][]complex64, []int) { + if useStreamingProductionPath { + out, rates, err := extractForStreamingProduction(extractMgr, allIQ, sampleRate, centerHz, signals, aqCfg, coll) + if err == nil { + logging.Debug("extract", "path_active", "path", "streaming_production", "signals", len(signals), "allIQ", len(allIQ)) + if coll != nil { + coll.IncCounter("extract.path.streaming_production", 1, nil) + } + return out, rates + } + // CRITICAL: the streaming production path failed — log WHY before falling through + log.Printf("EXTRACT PATH FALLTHROUGH: streaming production failed: %v — using legacy overlap+trim", err) + logging.Warn("extract", "streaming_production_fallthrough", + "err", err.Error(), + "signals", len(signals), + "allIQ", len(allIQ), + "sampleRate", sampleRate, + ) + if coll != nil { + coll.IncCounter("extract.path.streaming_production_failed", 1, nil) + coll.Event("extraction_path_fallthrough", "warn", + "streaming production path failed, using legacy overlap+trim", nil, + map[string]any{ + "error": err.Error(), + "signals": len(signals), + "allIQ_len": len(allIQ), + "sampleRate": sampleRate, + }) + } + } + if useStreamingOraclePath { + out, rates, err := extractForStreamingOracle(allIQ, sampleRate, centerHz, signals, aqCfg, coll) + if err == nil { + logging.Debug("extract", "path_active", "path", "streaming_oracle", "signals", len(signals)) + if coll != nil { + coll.IncCounter("extract.path.streaming_oracle", 1, nil) + } + return out, rates + } + log.Printf("EXTRACT PATH FALLTHROUGH: streaming oracle failed: %v", err) + logging.Warn("extract", "streaming_oracle_fallthrough", "err", err.Error()) + if coll != nil { + coll.IncCounter("extract.path.streaming_oracle_failed", 1, nil) + } + } + // If we reach here, the legacy 
overlap+trim path is running + logging.Warn("extract", "path_active", "path", "legacy_overlap_trim", "signals", len(signals), "allIQ", len(allIQ)) + if coll != nil { + coll.IncCounter("extract.path.legacy_overlap_trim", 1, nil) + } out := make([][]complex64, len(signals)) rates := make([]int, len(signals)) if len(allIQ) == 0 || sampleRate <= 0 || len(signals) == 0 { @@ -286,6 +355,18 @@ func extractForStreaming( bwMult = 1.0 } + if coll != nil { + coll.SetGauge("iq.extract.input.length", float64(len(allIQ)), nil) + coll.SetGauge("iq.extract.input.overlap_length", float64(overlapLen), nil) + headMean, tailMean, boundaryScore, _ := boundaryMetrics(overlap.tail, allIQ, 32) + coll.SetGauge("iq.extract.input.head_mean_mag", headMean, nil) + coll.SetGauge("iq.extract.input.prev_tail_mean_mag", tailMean, nil) + coll.Observe("iq.extract.input.discontinuity_score", boundaryScore, nil) + } + + rawBoundary := make(map[int64]boundaryProbeState, len(signals)) + trimmedBoundary := make(map[int64]boundaryProbeState, len(signals)) + // Build jobs with per-signal phase jobs := make([]gpudemod.ExtractJob, len(signals)) for i, sig := range signals { @@ -323,11 +404,45 @@ func extractForStreaming( OutRate: jobOutRate, PhaseStart: gpuPhaseStart, } + if coll != nil { + tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sig.ID), "path", "gpu") + inputHead := probeHead(gpuIQ, 16, 1e-6) + coll.SetGauge("iq.extract.input_head.zero_count", float64(inputHead.zeroCount), tags) + coll.SetGauge("iq.extract.input_head.first_nonzero_index", float64(inputHead.firstNonZeroIndex), tags) + coll.SetGauge("iq.extract.input_head.max_step", inputHead.maxStep, tags) + coll.Event("extract_input_head_probe", "info", "extractor input head probe", tags, map[string]any{ + "mags": inputHead.mags, + "zero_count": inputHead.zeroCount, + "first_nonzero_index": inputHead.firstNonZeroIndex, + "head_max_step": inputHead.maxStep, + "center_offset_hz": jobs[i].OffsetHz, + "bandwidth_hz": bw, + "out_rate": 
jobOutRate, + "trim_samples": (overlapLen + int(math.Max(1, math.Round(float64(sampleRate)/float64(jobOutRate)))) - 1) / int(math.Max(1, math.Round(float64(sampleRate)/float64(jobOutRate)))), + }) + } } - // Try GPU BatchRunner with phase - runner := extractMgr.get(len(gpuIQ), sampleRate) + // Try GPU BatchRunner with phase unless CPU-only debug is forced. + var runner *gpudemod.BatchRunner + if forceCPUStreamExtract { + logging.Warn("boundary", "force_cpu_stream_extract", "allIQ_len", len(allIQ), "gpuIQ_len", len(gpuIQ), "signals", len(signals)) + } else { + runner = extractMgr.get(len(gpuIQ), sampleRate) + } if runner != nil { + if coll != nil && len(gpuIQ) > 0 { + inputProbe := probeHead(gpuIQ, 16, 1e-6) + coll.Event("gpu_kernel_input_head_probe", "info", "gpu kernel input head probe", nil, map[string]any{ + "mags": inputProbe.mags, + "zero_count": inputProbe.zeroCount, + "first_nonzero_index": inputProbe.firstNonZeroIndex, + "head_max_step": inputProbe.maxStep, + "gpuIQ_len": len(gpuIQ), + "sample_rate": sampleRate, + "signals": len(signals), + }) + } results, err := runner.ShiftFilterDecimateBatchWithPhase(gpuIQ, jobs) if err == nil && len(results) == len(signals) { for i, res := range results { @@ -356,9 +471,95 @@ func extractForStreaming( // Trim overlap from output iq := res.IQ + rawLen := len(iq) if trimSamples > 0 && trimSamples < len(iq) { iq = iq[trimSamples:] } + if i == 0 { + logging.Debug("boundary", "extract_trim", "path", "gpu", "raw_len", rawLen, "trim", trimSamples, "out_len", len(iq), "overlap_len", overlapLen, "allIQ_len", len(allIQ), "gpuIQ_len", len(gpuIQ), "outRate", outRate, "signal", signals[i].ID) + logExtractorHeadComparison(signals[i].ID, "gpu", overlapLen, res.IQ, trimSamples, iq) + } + if coll != nil { + tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", signals[i].ID), "path", "gpu") + kernelProbe := probeHead(res.IQ, 16, 1e-6) + coll.Event("gpu_kernel_output_head_probe", "info", "gpu kernel output head probe", tags, 
map[string]any{ + "mags": kernelProbe.mags, + "zero_count": kernelProbe.zeroCount, + "first_nonzero_index": kernelProbe.firstNonZeroIndex, + "head_max_step": kernelProbe.maxStep, + "raw_len": rawLen, + "out_rate": outRate, + "trim_samples": trimSamples, + }) + stats := computeIQHeadStats(iq, 64) + coll.SetGauge("iq.extract.output.length", float64(len(iq)), tags) + coll.Observe("iq.extract.output.head_mean_mag", stats.meanMag, tags) + coll.Observe("iq.extract.output.head_min_mag", stats.minMag, tags) + coll.Observe("iq.extract.output.head_max_step", stats.maxStep, tags) + coll.Observe("iq.extract.output.head_p95_step", stats.p95Step, tags) + coll.Observe("iq.extract.output.head_tail_ratio", stats.headTail, tags) + coll.SetGauge("iq.extract.output.head_low_magnitude_count", float64(stats.lowMag), tags) + coll.SetGauge("iq.extract.raw.length", float64(rawLen), tags) + coll.SetGauge("iq.extract.trim.trim_samples", float64(trimSamples), tags) + if rawLen > 0 { + coll.SetGauge("iq.extract.raw.head_mag", math.Hypot(float64(real(res.IQ[0])), float64(imag(res.IQ[0]))), tags) + coll.SetGauge("iq.extract.raw.tail_mag", math.Hypot(float64(real(res.IQ[rawLen-1])), float64(imag(res.IQ[rawLen-1]))), tags) + rawHead := probeHead(res.IQ, 16, 1e-6) + coll.SetGauge("iq.extract.raw.head_zero_count", float64(rawHead.zeroCount), tags) + coll.SetGauge("iq.extract.raw.first_nonzero_index", float64(rawHead.firstNonZeroIndex), tags) + coll.SetGauge("iq.extract.raw.head_max_step", rawHead.maxStep, tags) + coll.Event("extract_raw_head_probe", "info", "raw extractor head probe", tags, map[string]any{ + "mags": rawHead.mags, + "zero_count": rawHead.zeroCount, + "first_nonzero_index": rawHead.firstNonZeroIndex, + "head_max_step": rawHead.maxStep, + "trim_samples": trimSamples, + }) + } + if len(iq) > 0 { + coll.SetGauge("iq.extract.trimmed.head_mag", math.Hypot(float64(real(iq[0])), float64(imag(iq[0]))), tags) + coll.SetGauge("iq.extract.trimmed.tail_mag", 
math.Hypot(float64(real(iq[len(iq)-1])), float64(imag(iq[len(iq)-1]))), tags) + trimmedHead := probeHead(iq, 16, 1e-6) + coll.SetGauge("iq.extract.trimmed.head_zero_count", float64(trimmedHead.zeroCount), tags) + coll.SetGauge("iq.extract.trimmed.first_nonzero_index", float64(trimmedHead.firstNonZeroIndex), tags) + coll.SetGauge("iq.extract.trimmed.head_max_step", trimmedHead.maxStep, tags) + coll.Event("extract_trimmed_head_probe", "info", "trimmed extractor head probe", tags, map[string]any{ + "mags": trimmedHead.mags, + "zero_count": trimmedHead.zeroCount, + "first_nonzero_index": trimmedHead.firstNonZeroIndex, + "head_max_step": trimmedHead.maxStep, + "trim_samples": trimSamples, + }) + } + if rb := rawBoundary[signals[i].ID]; rb.set && rawLen > 0 { + prevMag := math.Hypot(float64(real(rb.last)), float64(imag(rb.last))) + currMag := math.Hypot(float64(real(res.IQ[0])), float64(imag(res.IQ[0]))) + coll.SetGauge("iq.extract.raw.boundary.prev_tail_mag", prevMag, tags) + coll.SetGauge("iq.extract.raw.boundary.curr_head_mag", currMag, tags) + coll.Event("extract_raw_boundary", "info", "raw extractor boundary", tags, map[string]any{ + "delta_mag": math.Abs(currMag - prevMag), + "trim_samples": trimSamples, + "raw_len": rawLen, + }) + } + if tb := trimmedBoundary[signals[i].ID]; tb.set && len(iq) > 0 { + prevMag := math.Hypot(float64(real(tb.last)), float64(imag(tb.last))) + currMag := math.Hypot(float64(real(iq[0])), float64(imag(iq[0]))) + coll.SetGauge("iq.extract.trimmed.boundary.prev_tail_mag", prevMag, tags) + coll.SetGauge("iq.extract.trimmed.boundary.curr_head_mag", currMag, tags) + coll.Event("extract_trimmed_boundary", "info", "trimmed extractor boundary", tags, map[string]any{ + "delta_mag": math.Abs(currMag - prevMag), + "trim_samples": trimSamples, + "out_len": len(iq), + }) + } + } + if rawLen > 0 { + rawBoundary[signals[i].ID] = boundaryProbeState{last: res.IQ[rawLen-1], set: true} + } + if len(iq) > 0 { + trimmedBoundary[signals[i].ID] = 
boundaryProbeState{last: iq[len(iq)-1], set: true} + } out[i] = iq rates[i] = res.Rate } @@ -424,10 +625,240 @@ func extractForStreaming( if i == 0 { logging.Debug("extract", "cpu_result", "outRate", outRate, "decim", decim, "trim", trimSamples) } + rawIQ := decimated + rawLen := len(rawIQ) if trimSamples > 0 && trimSamples < len(decimated) { decimated = decimated[trimSamples:] } + if i == 0 { + logging.Debug("boundary", "extract_trim", "path", "cpu", "raw_len", rawLen, "trim", trimSamples, "out_len", len(decimated), "overlap_len", overlapLen, "allIQ_len", len(allIQ), "gpuIQ_len", len(gpuIQ), "outRate", outRate, "signal", signals[i].ID) + logExtractorHeadComparison(signals[i].ID, "cpu", overlapLen, decimated, trimSamples, decimated) + } + if coll != nil { + tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", signals[i].ID), "path", "cpu") + stats := computeIQHeadStats(decimated, 64) + coll.SetGauge("iq.extract.output.length", float64(len(decimated)), tags) + coll.Observe("iq.extract.output.head_mean_mag", stats.meanMag, tags) + coll.Observe("iq.extract.output.head_min_mag", stats.minMag, tags) + coll.Observe("iq.extract.output.head_max_step", stats.maxStep, tags) + coll.Observe("iq.extract.output.head_p95_step", stats.p95Step, tags) + coll.Observe("iq.extract.output.head_tail_ratio", stats.headTail, tags) + coll.SetGauge("iq.extract.output.head_low_magnitude_count", float64(stats.lowMag), tags) + coll.SetGauge("iq.extract.raw.length", float64(rawLen), tags) + coll.SetGauge("iq.extract.trim.trim_samples", float64(trimSamples), tags) + if rb := rawBoundary[signals[i].ID]; rb.set && rawLen > 0 { + observeBoundarySample(coll, "iq.extract.raw.boundary", tags, rb.last, rawIQ[0]) + } + if tb := trimmedBoundary[signals[i].ID]; tb.set && len(decimated) > 0 { + observeBoundarySample(coll, "iq.extract.trimmed.boundary", tags, tb.last, decimated[0]) + } + } + if rawLen > 0 { + rawBoundary[signals[i].ID] = boundaryProbeState{last: rawIQ[rawLen-1], set: true} + } + if 
len(decimated) > 0 { + trimmedBoundary[signals[i].ID] = boundaryProbeState{last: decimated[len(decimated)-1], set: true} + } out[i] = decimated } return out, rates } + +type iqHeadStats struct { + length int + minMag float64 + maxMag float64 + meanMag float64 + lowMag int + maxStep float64 + maxStepIdx int + p95Step float64 + headTail float64 + headMinIdx int + stepSamples []float64 +} + +type boundaryProbeState struct { + last complex64 + set bool +} + +type headProbe struct { + zeroCount int + firstNonZeroIndex int + maxStep float64 + mags []float64 +} + +func probeHead(samples []complex64, n int, zeroThreshold float64) headProbe { + if n <= 0 || len(samples) == 0 { + return headProbe{firstNonZeroIndex: -1} + } + if len(samples) < n { + n = len(samples) + } + if zeroThreshold <= 0 { + zeroThreshold = 1e-6 + } + out := headProbe{firstNonZeroIndex: -1, mags: make([]float64, 0, n)} + for i := 0; i < n; i++ { + v := samples[i] + mag := math.Hypot(float64(real(v)), float64(imag(v))) + out.mags = append(out.mags, mag) + if mag <= zeroThreshold { + out.zeroCount++ + } else if out.firstNonZeroIndex < 0 { + out.firstNonZeroIndex = i + } + if i > 0 { + p := samples[i-1] + num := float64(real(p))*float64(imag(v)) - float64(imag(p))*float64(real(v)) + den := float64(real(p))*float64(real(v)) + float64(imag(p))*float64(imag(v)) + step := math.Abs(math.Atan2(num, den)) + if step > out.maxStep { + out.maxStep = step + } + } + } + return out +} + +func observeBoundarySample(coll *telemetry.Collector, metricPrefix string, tags map[string]string, prev complex64, curr complex64) { + prevMag := math.Hypot(float64(real(prev)), float64(imag(prev))) + currMag := math.Hypot(float64(real(curr)), float64(imag(curr))) + deltaMag := math.Abs(currMag - prevMag) + num := float64(real(prev))*float64(imag(curr)) - float64(imag(prev))*float64(real(curr)) + den := float64(real(prev))*float64(real(curr)) + float64(imag(prev))*float64(imag(curr)) + deltaPhase := math.Abs(math.Atan2(num, den)) + d2 
:= float64(real(curr-prev))*float64(real(curr-prev)) + float64(imag(curr-prev))*float64(imag(curr-prev)) + coll.Observe(metricPrefix+".delta_mag", deltaMag, tags) + coll.Observe(metricPrefix+".delta_phase", deltaPhase, tags) + coll.Observe(metricPrefix+".d2", d2, tags) + coll.Observe(metricPrefix+".discontinuity_score", deltaMag+deltaPhase, tags) +} + +func computeIQHeadStats(iq []complex64, headLen int) iqHeadStats { + stats := iqHeadStats{minMag: math.MaxFloat64, headMinIdx: -1, maxStepIdx: -1} + if len(iq) == 0 { + stats.minMag = 0 + return stats + } + n := len(iq) + if headLen > 0 && headLen < n { + n = headLen + } + stats.length = n + stats.stepSamples = make([]float64, 0, max(0, n-1)) + sumMag := 0.0 + headSum := 0.0 + tailSum := 0.0 + tailCount := 0 + for i := 0; i < n; i++ { + v := iq[i] + mag := math.Hypot(float64(real(v)), float64(imag(v))) + if mag < stats.minMag { + stats.minMag = mag + stats.headMinIdx = i + } + if mag > stats.maxMag { + stats.maxMag = mag + } + sumMag += mag + if mag < 0.05 { + stats.lowMag++ + } + if i < min(16, n) { + headSum += mag + } + if i >= max(0, n-16) { + tailSum += mag + tailCount++ + } + if i > 0 { + p := iq[i-1] + num := float64(real(p))*float64(imag(v)) - float64(imag(p))*float64(real(v)) + den := float64(real(p))*float64(real(v)) + float64(imag(p))*float64(imag(v)) + step := math.Abs(math.Atan2(num, den)) + if step > stats.maxStep { + stats.maxStep = step + stats.maxStepIdx = i - 1 + } + stats.stepSamples = append(stats.stepSamples, step) + } + } + stats.meanMag = sumMag / float64(n) + if len(stats.stepSamples) > 0 { + sorted := append([]float64(nil), stats.stepSamples...) 
+ sort.Float64s(sorted) + idx := int(float64(len(sorted)-1) * 0.95) + stats.p95Step = sorted[idx] + } else { + stats.p95Step = stats.maxStep + } + if headSum > 0 && tailCount > 0 { + headMean := headSum / float64(min(16, n)) + tailMean := tailSum / float64(tailCount) + if tailMean > 0 { + stats.headTail = headMean / tailMean + } + } + return stats +} + +func observeIQStats(coll *telemetry.Collector, stage string, iq []complex64, tags telemetry.Tags) { + if coll == nil || len(iq) == 0 { + return + } + stats := computeIQHeadStats(iq, len(iq)) + stageTags := telemetry.TagsWith(tags, "stage", stage) + coll.Observe("iq.magnitude.min", stats.minMag, stageTags) + coll.Observe("iq.magnitude.max", stats.maxMag, stageTags) + coll.Observe("iq.magnitude.mean", stats.meanMag, stageTags) + coll.Observe("iq.phase_step.max", stats.maxStep, stageTags) + coll.Observe("iq.phase_step.p95", stats.p95Step, stageTags) + coll.Observe("iq.low_magnitude.count", float64(stats.lowMag), stageTags) + coll.SetGauge("iq.length", float64(stats.length), stageTags) +} + +func logExtractorHeadComparison(signalID int64, path string, overlapLen int, raw []complex64, trimSamples int, out []complex64) { + rawStats := computeIQHeadStats(raw, 96) + trimmedStats := computeIQHeadStats(out, 96) + logging.Debug("boundary", "extract_head_compare", + "signal", signalID, + "path", path, + "raw_len", len(raw), + "trim", trimSamples, + "out_len", len(out), + "overlap_len", overlapLen, + "raw_min_mag", rawStats.minMag, + "raw_min_idx", rawStats.headMinIdx, + "raw_max_step", rawStats.maxStep, + "raw_max_step_idx", rawStats.maxStepIdx, + "raw_head_tail", rawStats.headTail, + "trimmed_min_mag", trimmedStats.minMag, + "trimmed_min_idx", trimmedStats.headMinIdx, + "trimmed_max_step", trimmedStats.maxStep, + "trimmed_max_step_idx", trimmedStats.maxStepIdx, + "trimmed_head_tail", trimmedStats.headTail, + ) + for _, off := range []int{2, 4, 8, 16} { + if len(out) <= off+8 { + continue + } + offStats := 
computeIQHeadStats(out[off:], 96) + logging.Debug("boundary", "extract_head_offset_compare", + "signal", signalID, + "path", path, + "offset", off, + "base_min_mag", trimmedStats.minMag, + "base_min_idx", trimmedStats.headMinIdx, + "base_max_step", trimmedStats.maxStep, + "base_max_step_idx", trimmedStats.maxStepIdx, + "offset_min_mag", offStats.minMag, + "offset_min_idx", offStats.headMinIdx, + "offset_max_step", offStats.maxStep, + "offset_max_step_idx", offStats.maxStepIdx, + "offset_head_tail", offStats.headTail, + ) + } +} diff --git a/cmd/sdrd/http_handlers.go b/cmd/sdrd/http_handlers.go index 14c0846..a633fde 100644 --- a/cmd/sdrd/http_handlers.go +++ b/cmd/sdrd/http_handlers.go @@ -3,6 +3,7 @@ package main import ( "context" "encoding/json" + "errors" "log" "net/http" "os" @@ -19,9 +20,10 @@ import ( "sdr-wideband-suite/internal/pipeline" "sdr-wideband-suite/internal/recorder" "sdr-wideband-suite/internal/runtime" + "sdr-wideband-suite/internal/telemetry" ) -func registerAPIHandlers(mux *http.ServeMux, cfgPath string, cfgManager *runtime.Manager, srcMgr *sourceManager, dspUpdates chan dspUpdate, gpuState *gpuStatus, recMgr *recorder.Manager, sigSnap *signalSnapshot, eventMu *sync.RWMutex, phaseSnap *phaseSnapshot) { +func registerAPIHandlers(mux *http.ServeMux, cfgPath string, cfgManager *runtime.Manager, srcMgr *sourceManager, dspUpdates chan dspUpdate, gpuState *gpuStatus, recMgr *recorder.Manager, sigSnap *signalSnapshot, eventMu *sync.RWMutex, phaseSnap *phaseSnapshot, telem *telemetry.Collector) { mux.HandleFunc("/api/config", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") switch r.Method { @@ -378,16 +380,196 @@ func registerAPIHandlers(mux *http.ServeMux, cfgPath string, cfgManager *runtime w.Header().Set("Content-Type", "audio/wav") _, _ = w.Write(data) }) + mux.HandleFunc("/api/debug/telemetry/live", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") 
+ if telem == nil { + _ = json.NewEncoder(w).Encode(map[string]any{"enabled": false, "error": "telemetry unavailable"}) + return + } + _ = json.NewEncoder(w).Encode(telem.LiveSnapshot()) + }) + mux.HandleFunc("/api/debug/telemetry/history", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if telem == nil { + http.Error(w, "telemetry unavailable", http.StatusServiceUnavailable) + return + } + query, err := telemetryQueryFromRequest(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + items, err := telem.QueryMetrics(query) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{"items": items, "count": len(items)}) + }) + mux.HandleFunc("/api/debug/telemetry/events", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if telem == nil { + http.Error(w, "telemetry unavailable", http.StatusServiceUnavailable) + return + } + query, err := telemetryQueryFromRequest(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + items, err := telem.QueryEvents(query) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{"items": items, "count": len(items)}) + }) + mux.HandleFunc("/api/debug/telemetry/config", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if telem == nil { + http.Error(w, "telemetry unavailable", http.StatusServiceUnavailable) + return + } + switch r.Method { + case http.MethodGet: + _ = json.NewEncoder(w).Encode(map[string]any{ + "collector": telem.Config(), + "config": cfgManager.Snapshot().Debug.Telemetry, + }) + case http.MethodPost: + var update struct { + Enabled *bool `json:"enabled"` + HeavyEnabled *bool `json:"heavy_enabled"` + HeavySampleEvery *int 
`json:"heavy_sample_every"` + MetricSampleEvery *int `json:"metric_sample_every"` + MetricHistoryMax *int `json:"metric_history_max"` + EventHistoryMax *int `json:"event_history_max"` + RetentionSeconds *int `json:"retention_seconds"` + PersistEnabled *bool `json:"persist_enabled"` + PersistDir *string `json:"persist_dir"` + RotateMB *int `json:"rotate_mb"` + KeepFiles *int `json:"keep_files"` + } + if err := json.NewDecoder(r.Body).Decode(&update); err != nil { + http.Error(w, "invalid json", http.StatusBadRequest) + return + } + next := cfgManager.Snapshot() + cur := next.Debug.Telemetry + if update.Enabled != nil { + cur.Enabled = *update.Enabled + } + if update.HeavyEnabled != nil { + cur.HeavyEnabled = *update.HeavyEnabled + } + if update.HeavySampleEvery != nil { + cur.HeavySampleEvery = *update.HeavySampleEvery + } + if update.MetricSampleEvery != nil { + cur.MetricSampleEvery = *update.MetricSampleEvery + } + if update.MetricHistoryMax != nil { + cur.MetricHistoryMax = *update.MetricHistoryMax + } + if update.EventHistoryMax != nil { + cur.EventHistoryMax = *update.EventHistoryMax + } + if update.RetentionSeconds != nil { + cur.RetentionSeconds = *update.RetentionSeconds + } + if update.PersistEnabled != nil { + cur.PersistEnabled = *update.PersistEnabled + } + if update.PersistDir != nil && *update.PersistDir != "" { + cur.PersistDir = *update.PersistDir + } + if update.RotateMB != nil { + cur.RotateMB = *update.RotateMB + } + if update.KeepFiles != nil { + cur.KeepFiles = *update.KeepFiles + } + next.Debug.Telemetry = cur + cfgManager.Replace(next) + if err := config.Save(cfgPath, next); err != nil { + log.Printf("telemetry config save failed: %v", err) + } + err := telem.Configure(telemetry.Config{ + Enabled: cur.Enabled, + HeavyEnabled: cur.HeavyEnabled, + HeavySampleEvery: cur.HeavySampleEvery, + MetricSampleEvery: cur.MetricSampleEvery, + MetricHistoryMax: cur.MetricHistoryMax, + EventHistoryMax: cur.EventHistoryMax, + Retention: 
time.Duration(cur.RetentionSeconds) * time.Second, + PersistEnabled: cur.PersistEnabled, + PersistDir: cur.PersistDir, + RotateMB: cur.RotateMB, + KeepFiles: cur.KeepFiles, + }) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + _ = json.NewEncoder(w).Encode(map[string]any{"ok": true, "collector": telem.Config(), "config": cur}) + default: + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + } + }) } -func newHTTPServer(addr string, webRoot string, h *hub, cfgPath string, cfgManager *runtime.Manager, srcMgr *sourceManager, dspUpdates chan dspUpdate, gpuState *gpuStatus, recMgr *recorder.Manager, sigSnap *signalSnapshot, eventMu *sync.RWMutex, phaseSnap *phaseSnapshot) *http.Server { +func newHTTPServer(addr string, webRoot string, h *hub, cfgPath string, cfgManager *runtime.Manager, srcMgr *sourceManager, dspUpdates chan dspUpdate, gpuState *gpuStatus, recMgr *recorder.Manager, sigSnap *signalSnapshot, eventMu *sync.RWMutex, phaseSnap *phaseSnapshot, telem *telemetry.Collector) *http.Server { mux := http.NewServeMux() registerWSHandlers(mux, h, recMgr) - registerAPIHandlers(mux, cfgPath, cfgManager, srcMgr, dspUpdates, gpuState, recMgr, sigSnap, eventMu, phaseSnap) + registerAPIHandlers(mux, cfgPath, cfgManager, srcMgr, dspUpdates, gpuState, recMgr, sigSnap, eventMu, phaseSnap, telem) mux.Handle("/", http.FileServer(http.Dir(webRoot))) return &http.Server{Addr: addr, Handler: mux} } +func telemetryQueryFromRequest(r *http.Request) (telemetry.Query, error) { + q := r.URL.Query() + var out telemetry.Query + var err error + if out.From, err = telemetry.ParseTimeQuery(q.Get("since")); err != nil { + return out, errors.New("invalid since") + } + if out.To, err = telemetry.ParseTimeQuery(q.Get("until")); err != nil { + return out, errors.New("invalid until") + } + if v := q.Get("limit"); v != "" { + if parsed, parseErr := strconv.Atoi(v); parseErr == nil { + out.Limit = parsed + } + } + out.Name = q.Get("name") + 
out.NamePrefix = q.Get("prefix") + out.Level = q.Get("level") + out.IncludePersisted = true + if v := q.Get("include_persisted"); v != "" { + if b, parseErr := strconv.ParseBool(v); parseErr == nil { + out.IncludePersisted = b + } + } + tags := telemetry.Tags{} + for key, vals := range q { + if len(vals) == 0 { + continue + } + if strings.HasPrefix(key, "tag_") { + tags[strings.TrimPrefix(key, "tag_")] = vals[0] + } + } + for _, key := range []string{"signal_id", "session_id", "stage", "trace_id", "component"} { + if v := q.Get(key); v != "" { + tags[key] = v + } + } + if len(tags) > 0 { + out.Tags = tags + } + return out, nil +} + func shutdownServer(server *http.Server) { ctxTimeout, cancelTimeout := context.WithTimeout(context.Background(), 5*time.Second) defer cancelTimeout() diff --git a/cmd/sdrd/legacy_extract.go b/cmd/sdrd/legacy_extract.go new file mode 100644 index 0000000..52590cc --- /dev/null +++ b/cmd/sdrd/legacy_extract.go @@ -0,0 +1,6 @@ +package main + +// NOTE: Legacy extractor logic still lives in helpers.go for now. +// This file is intentionally reserved for the later explicit move once the +// production-path rewrite is far enough along that the split can be done in one +// safe pass instead of a risky mechanical half-step. 
diff --git a/cmd/sdrd/main.go b/cmd/sdrd/main.go index 77a9814..361775d 100644 --- a/cmd/sdrd/main.go +++ b/cmd/sdrd/main.go @@ -23,6 +23,7 @@ import ( "sdr-wideband-suite/internal/runtime" "sdr-wideband-suite/internal/sdr" "sdr-wideband-suite/internal/sdrplay" + "sdr-wideband-suite/internal/telemetry" ) func main() { @@ -51,6 +52,25 @@ func main() { cfgManager := runtime.New(cfg) gpuState := &gpuStatus{Available: gpufft.Available()} + telemetryCfg := telemetry.Config{ + Enabled: cfg.Debug.Telemetry.Enabled, + HeavyEnabled: cfg.Debug.Telemetry.HeavyEnabled, + HeavySampleEvery: cfg.Debug.Telemetry.HeavySampleEvery, + MetricSampleEvery: cfg.Debug.Telemetry.MetricSampleEvery, + MetricHistoryMax: cfg.Debug.Telemetry.MetricHistoryMax, + EventHistoryMax: cfg.Debug.Telemetry.EventHistoryMax, + Retention: time.Duration(cfg.Debug.Telemetry.RetentionSeconds) * time.Second, + PersistEnabled: cfg.Debug.Telemetry.PersistEnabled, + PersistDir: cfg.Debug.Telemetry.PersistDir, + RotateMB: cfg.Debug.Telemetry.RotateMB, + KeepFiles: cfg.Debug.Telemetry.KeepFiles, + } + telemetryCollector, err := telemetry.New(telemetryCfg) + if err != nil { + log.Fatalf("telemetry init failed: %v", err) + } + defer telemetryCollector.Close() + telemetryCollector.SetStatus("build", "sdrd") newSource := func(cfg config.Config) (sdr.Source, error) { if mockFlag { @@ -74,7 +94,7 @@ func main() { if err != nil { log.Fatalf("sdrplay init failed: %v (try --mock or build with -tags sdrplay)", err) } - srcMgr := newSourceManager(src, newSource) + srcMgr := newSourceManagerWithTelemetry(src, newSource, telemetryCollector) if err := srcMgr.Start(); err != nil { log.Fatalf("source start: %v", err) } @@ -118,7 +138,7 @@ func main() { DeemphasisUs: cfg.Recorder.DeemphasisUs, ExtractionTaps: cfg.Recorder.ExtractionTaps, ExtractionBwMult: cfg.Recorder.ExtractionBwMult, - }, cfg.CenterHz, decodeMap) + }, cfg.CenterHz, decodeMap, telemetryCollector) defer recMgr.Close() sigSnap := &signalSnapshot{} @@ -126,9 +146,9 
@@ func main() { defer extractMgr.reset() phaseSnap := &phaseSnapshot{} - go runDSP(ctx, srcMgr, cfg, det, window, h, eventFile, eventMu, dspUpdates, gpuState, recMgr, sigSnap, extractMgr, phaseSnap) + go runDSP(ctx, srcMgr, cfg, det, window, h, eventFile, eventMu, dspUpdates, gpuState, recMgr, sigSnap, extractMgr, phaseSnap, telemetryCollector) - server := newHTTPServer(cfg.WebAddr, cfg.WebRoot, h, cfgPath, cfgManager, srcMgr, dspUpdates, gpuState, recMgr, sigSnap, eventMu, phaseSnap) + server := newHTTPServer(cfg.WebAddr, cfg.WebRoot, h, cfgPath, cfgManager, srcMgr, dspUpdates, gpuState, recMgr, sigSnap, eventMu, phaseSnap, telemetryCollector) go func() { log.Printf("web listening on %s", cfg.WebAddr) if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { diff --git a/cmd/sdrd/pipeline_runtime.go b/cmd/sdrd/pipeline_runtime.go index d2cec7f..5b37088 100644 --- a/cmd/sdrd/pipeline_runtime.go +++ b/cmd/sdrd/pipeline_runtime.go @@ -3,6 +3,8 @@ package main import ( "fmt" "math" + "os" + "strconv" "strings" "sync" "sync/atomic" @@ -19,6 +21,7 @@ import ( "sdr-wideband-suite/internal/pipeline" "sdr-wideband-suite/internal/rds" "sdr-wideband-suite/internal/recorder" + "sdr-wideband-suite/internal/telemetry" ) type rdsState struct { @@ -29,6 +32,18 @@ type rdsState struct { mu sync.Mutex } +var forceFixedStreamReadSamples = func() int { + raw := strings.TrimSpace(os.Getenv("SDR_FORCE_FIXED_STREAM_READ_SAMPLES")) + if raw == "" { + return 0 + } + v, err := strconv.Atoi(raw) + if err != nil || v <= 0 { + return 0 + } + return v +}() + type dspRuntime struct { cfg config.Config det *detector.Detector @@ -52,10 +67,13 @@ type dspRuntime struct { arbiter *pipeline.Arbiter arbitration pipeline.ArbitrationState gotSamples bool + telemetry *telemetry.Collector + lastAllIQTail []complex64 } type spectrumArtifacts struct { allIQ []complex64 + streamDropped bool surveillanceIQ []complex64 detailIQ []complex64 surveillanceSpectrum []float64 @@ -94,7 +112,7 
@@ type surveillancePlan struct { const derivedIDBlock = int64(1_000_000_000) -func newDSPRuntime(cfg config.Config, det *detector.Detector, window []float64, gpuState *gpuStatus) *dspRuntime { +func newDSPRuntime(cfg config.Config, det *detector.Detector, window []float64, gpuState *gpuStatus, coll *telemetry.Collector) *dspRuntime { detailFFT := cfg.Refinement.DetailFFTSize if detailFFT <= 0 { detailFFT = cfg.FFTSize @@ -119,6 +137,7 @@ func newDSPRuntime(cfg config.Config, det *detector.Detector, window []float64, streamPhaseState: map[int64]*streamExtractState{}, streamOverlap: &streamIQOverlap{}, arbiter: pipeline.NewArbiter(), + telemetry: coll, } if rt.useGPU && gpuState != nil { snap := gpuState.snapshot() @@ -216,6 +235,15 @@ func (rt *dspRuntime) applyUpdate(upd dspUpdate, srcMgr *sourceManager, rec *rec gpuState.set(false, nil) } } + if rt.telemetry != nil { + rt.telemetry.Event("dsp_config_update", "info", "dsp runtime configuration updated", nil, map[string]any{ + "fft_size": rt.cfg.FFTSize, + "sample_rate": rt.cfg.SampleRate, + "use_gpu_fft": rt.cfg.UseGPUFFT, + "detail_fft": rt.detailFFT, + "surv_strategy": rt.cfg.Surveillance.Strategy, + }) + } } func (rt *dspRuntime) spectrumFromIQ(iq []complex64, gpuState *gpuStatus) []float64 { @@ -334,26 +362,112 @@ func (rt *dspRuntime) decimateSurveillanceIQ(iq []complex64, factor int) []compl return dsp.Decimate(filtered, factor) } +func meanMagComplex(samples []complex64) float64 { + if len(samples) == 0 { + return 0 + } + var sum float64 + for _, v := range samples { + sum += math.Hypot(float64(real(v)), float64(imag(v))) + } + return sum / float64(len(samples)) +} + +func phaseStepAbs(a, b complex64) float64 { + num := float64(real(a))*float64(imag(b)) - float64(imag(a))*float64(real(b)) + den := float64(real(a))*float64(real(b)) + float64(imag(a))*float64(imag(b)) + return math.Abs(math.Atan2(num, den)) +} + +func boundaryMetrics(prevTail []complex64, curr []complex64, window int) (float64, float64, 
float64, int) { + if len(curr) == 0 { + return 0, 0, 0, 0 + } + if window <= 0 { + window = 16 + } + headN := window + if len(curr) < headN { + headN = len(curr) + } + headMean := meanMagComplex(curr[:headN]) + if len(prevTail) == 0 { + return headMean, 0, 0, headN + } + tailN := window + if len(prevTail) < tailN { + tailN = len(prevTail) + } + tailMean := meanMagComplex(prevTail[len(prevTail)-tailN:]) + deltaMag := math.Abs(headMean - tailMean) + phaseJump := phaseStepAbs(prevTail[len(prevTail)-1], curr[0]) + score := deltaMag + phaseJump + return headMean, tailMean, score, headN +} + +func tailWindowComplex(src []complex64, n int) []complex64 { + if n <= 0 || len(src) == 0 { + return nil + } + if len(src) <= n { + out := make([]complex64, len(src)) + copy(out, src) + return out + } + out := make([]complex64, n) + copy(out, src[len(src)-n:]) + return out +} + func (rt *dspRuntime) captureSpectrum(srcMgr *sourceManager, rec *recorder.Manager, dcBlocker *dsp.DCBlocker, gpuState *gpuStatus) (*spectrumArtifacts, error) { + start := time.Now() required := rt.cfg.FFTSize if rt.detailFFT > required { required = rt.detailFFT } available := required st := srcMgr.Stats() - if st.BufferSamples > required { + if rt.telemetry != nil { + rt.telemetry.SetGauge("source.buffer_samples", float64(st.BufferSamples), nil) + rt.telemetry.SetGauge("source.last_sample_ago_ms", float64(st.LastSampleAgoMs), nil) + rt.telemetry.SetGauge("source.dropped", float64(st.Dropped), nil) + rt.telemetry.SetGauge("source.resets", float64(st.Resets), nil) + } + if forceFixedStreamReadSamples > 0 { + available = forceFixedStreamReadSamples + if available < required { + available = required + } + available = (available / required) * required + if available < required { + available = required + } + logging.Warn("boundary", "fixed_stream_read_samples", "configured", forceFixedStreamReadSamples, "effective", available, "required", required) + } else if st.BufferSamples > required { available = 
(st.BufferSamples / required) * required if available < required { available = required } } logging.Debug("capture", "read_iq", "required", required, "available", available, "buf", st.BufferSamples, "reset", st.Resets, "drop", st.Dropped) + readStart := time.Now() allIQ, err := srcMgr.ReadIQ(available) if err != nil { + if rt.telemetry != nil { + rt.telemetry.IncCounter("capture.read.error", 1, nil) + } return nil, err } + if rt.telemetry != nil { + rt.telemetry.Observe("capture.read.duration_ms", float64(time.Since(readStart).Microseconds())/1000.0, nil) + rt.telemetry.Observe("capture.read.samples", float64(len(allIQ)), nil) + } if rec != nil { + ingestStart := time.Now() rec.Ingest(time.Now(), allIQ) + if rt.telemetry != nil { + rt.telemetry.Observe("capture.ingest.duration_ms", float64(time.Since(ingestStart).Microseconds())/1000.0, nil) + } } // Cap allIQ for downstream extraction to prevent buffer bloat. // Without this cap, buffer accumulation during processing stalls causes @@ -366,8 +480,17 @@ func (rt *dspRuntime) captureSpectrum(srcMgr *sourceManager, rec *recorder.Manag maxStreamSamples = required } maxStreamSamples = (maxStreamSamples / required) * required + streamDropped := false if len(allIQ) > maxStreamSamples { allIQ = allIQ[len(allIQ)-maxStreamSamples:] + streamDropped = true + if rt.telemetry != nil { + rt.telemetry.IncCounter("capture.stream_drop.count", 1, nil) + rt.telemetry.Event("iq_dropped", "warn", "capture IQ dropped before extraction", nil, map[string]any{ + "max_stream_samples": maxStreamSamples, + "required": required, + }) + } } logging.Debug("capture", "iq_len", "len", len(allIQ), "surv_fft", rt.cfg.FFTSize, "detail_fft", rt.detailFFT) survIQ := allIQ @@ -380,14 +503,60 @@ func (rt *dspRuntime) captureSpectrum(srcMgr *sourceManager, rec *recorder.Manag } if rt.dcEnabled { dcBlocker.Apply(allIQ) + if rt.telemetry != nil { + rt.telemetry.IncCounter("dsp.dc_block.apply", 1, nil) + } } if rt.iqEnabled { + // IQBalance must NOT modify 
allIQ in-place: allIQ goes to the extraction + // pipeline and any in-place modification creates a phase/amplitude + // discontinuity at the survIQ boundary (len-FFTSize) that the polyphase + // extractor then sees as paired click artifacts in the FM discriminator. + detailIsSurv := sameIQBuffer(detailIQ, survIQ) + survIQ = append([]complex64(nil), survIQ...) dsp.IQBalance(survIQ) - if !sameIQBuffer(detailIQ, survIQ) { + if detailIsSurv { + detailIQ = survIQ + } else { detailIQ = append([]complex64(nil), detailIQ...) dsp.IQBalance(detailIQ) } } + if rt.telemetry != nil { + rt.telemetry.SetGauge("iq.stage.all.length", float64(len(allIQ)), nil) + rt.telemetry.SetGauge("iq.stage.surveillance.length", float64(len(survIQ)), nil) + rt.telemetry.SetGauge("iq.stage.detail.length", float64(len(detailIQ)), nil) + rt.telemetry.Observe("capture.total.duration_ms", float64(time.Since(start).Microseconds())/1000.0, nil) + + headMean, tailMean, boundaryScore, boundaryWindow := boundaryMetrics(rt.lastAllIQTail, allIQ, 32) + rt.telemetry.SetGauge("iq.boundary.all.head_mean_mag", headMean, nil) + rt.telemetry.SetGauge("iq.boundary.all.prev_tail_mean_mag", tailMean, nil) + rt.telemetry.Observe("iq.boundary.all.discontinuity_score", boundaryScore, nil) + if len(rt.lastAllIQTail) > 0 && len(allIQ) > 0 { + deltaMag := math.Abs(math.Hypot(float64(real(allIQ[0])), float64(imag(allIQ[0]))) - math.Hypot(float64(real(rt.lastAllIQTail[len(rt.lastAllIQTail)-1])), float64(imag(rt.lastAllIQTail[len(rt.lastAllIQTail)-1])))) + phaseJump := phaseStepAbs(rt.lastAllIQTail[len(rt.lastAllIQTail)-1], allIQ[0]) + rt.telemetry.Observe("iq.boundary.all.delta_mag", deltaMag, nil) + rt.telemetry.Observe("iq.boundary.all.delta_phase", phaseJump, nil) + if rt.telemetry.ShouldSampleHeavy() { + rt.telemetry.Event("alliq_boundary", "info", "allIQ boundary snapshot", nil, map[string]any{ + "window": boundaryWindow, + "head_mean_mag": headMean, + "prev_tail_mean_mag": tailMean, + "delta_mag": deltaMag, + 
"delta_phase": phaseJump, + "discontinuity_score": boundaryScore, + "alliq_len": len(allIQ), + "stream_dropped": streamDropped, + }) + } + } + if rt.telemetry.ShouldSampleHeavy() { + observeIQStats(rt.telemetry, "capture_all", allIQ, nil) + observeIQStats(rt.telemetry, "capture_surveillance", survIQ, nil) + observeIQStats(rt.telemetry, "capture_detail", detailIQ, nil) + } + } + rt.lastAllIQTail = tailWindowComplex(allIQ, 32) survSpectrum := rt.spectrumFromIQ(survIQ, gpuState) sanitizeSpectrum(survSpectrum) detailSpectrum := survSpectrum @@ -430,8 +599,13 @@ func (rt *dspRuntime) captureSpectrum(srcMgr *sourceManager, rec *recorder.Manag } now := time.Now() finished, detected := rt.det.Process(now, survSpectrum, rt.cfg.CenterHz) + if rt.telemetry != nil { + rt.telemetry.SetGauge("signals.detected.count", float64(len(detected)), nil) + rt.telemetry.SetGauge("signals.finished.count", float64(len(finished)), nil) + } return &spectrumArtifacts{ allIQ: allIQ, + streamDropped: streamDropped, surveillanceIQ: survIQ, detailIQ: detailIQ, surveillanceSpectrum: survSpectrum, diff --git a/cmd/sdrd/pipeline_runtime_test.go b/cmd/sdrd/pipeline_runtime_test.go index 99e2654..54d4ac9 100644 --- a/cmd/sdrd/pipeline_runtime_test.go +++ b/cmd/sdrd/pipeline_runtime_test.go @@ -13,7 +13,7 @@ func TestNewDSPRuntime(t *testing.T) { cfg := config.Default() det := detector.New(cfg.Detector, cfg.SampleRate, cfg.FFTSize) window := fftutil.Hann(cfg.FFTSize) - rt := newDSPRuntime(cfg, det, window, &gpuStatus{}) + rt := newDSPRuntime(cfg, det, window, &gpuStatus{}, nil) if rt == nil { t.Fatalf("runtime is nil") } @@ -47,7 +47,7 @@ func TestSurveillanceLevelsRespectStrategy(t *testing.T) { cfg := config.Default() det := detector.New(cfg.Detector, cfg.SampleRate, cfg.FFTSize) window := fftutil.Hann(cfg.FFTSize) - rt := newDSPRuntime(cfg, det, window, &gpuStatus{}) + rt := newDSPRuntime(cfg, det, window, &gpuStatus{}, nil) policy := pipeline.Policy{SurveillanceStrategy: "single-resolution"} plan := 
rt.buildSurveillancePlan(policy) if len(plan.Levels) != 1 { diff --git a/cmd/sdrd/source_manager.go b/cmd/sdrd/source_manager.go index 606e6e8..4f58a54 100644 --- a/cmd/sdrd/source_manager.go +++ b/cmd/sdrd/source_manager.go @@ -1,11 +1,16 @@ package main import ( + "fmt" + "time" + "sdr-wideband-suite/internal/config" "sdr-wideband-suite/internal/sdr" + "sdr-wideband-suite/internal/telemetry" ) func (m *sourceManager) Restart(cfg config.Config) error { + start := time.Now() m.mu.Lock() defer m.mu.Unlock() old := m.src @@ -14,15 +19,27 @@ func (m *sourceManager) Restart(cfg config.Config) error { if err != nil { _ = old.Start() m.src = old + if m.telemetry != nil { + m.telemetry.IncCounter("source.restart.error", 1, nil) + m.telemetry.Event("source_restart_failed", "warn", "source restart failed", nil, map[string]any{"error": err.Error()}) + } return err } if err := next.Start(); err != nil { _ = next.Stop() _ = old.Start() m.src = old + if m.telemetry != nil { + m.telemetry.IncCounter("source.restart.error", 1, nil) + m.telemetry.Event("source_restart_failed", "warn", "source restart failed", nil, map[string]any{"error": err.Error()}) + } return err } m.src = next + if m.telemetry != nil { + m.telemetry.IncCounter("source.restart.count", 1, nil) + m.telemetry.Observe("source.restart.duration_ms", float64(time.Since(start).Milliseconds()), nil) + } return nil } @@ -44,7 +61,11 @@ func (m *sourceManager) Flush() { } func newSourceManager(src sdr.Source, newSource func(cfg config.Config) (sdr.Source, error)) *sourceManager { - return &sourceManager{src: src, newSource: newSource} + return newSourceManagerWithTelemetry(src, newSource, nil) +} + +func newSourceManagerWithTelemetry(src sdr.Source, newSource func(cfg config.Config) (sdr.Source, error), coll *telemetry.Collector) *sourceManager { + return &sourceManager{src: src, newSource: newSource, telemetry: coll} } func (m *sourceManager) Start() error { @@ -60,9 +81,27 @@ func (m *sourceManager) Stop() error { } 
func (m *sourceManager) ReadIQ(n int) ([]complex64, error) { + waitStart := time.Now() m.mu.RLock() + wait := time.Since(waitStart) defer m.mu.RUnlock() - return m.src.ReadIQ(n) + if m.telemetry != nil { + m.telemetry.Observe("source.lock_wait_ms", float64(wait.Microseconds())/1000.0, telemetry.TagsFromPairs("lock", "read")) + if wait > 2*time.Millisecond { + m.telemetry.IncCounter("source.lock_contention.count", 1, telemetry.TagsFromPairs("lock", "read")) + } + } + readStart := time.Now() + out, err := m.src.ReadIQ(n) + if m.telemetry != nil { + tags := telemetry.TagsFromPairs("requested", fmt.Sprintf("%d", n)) + m.telemetry.Observe("source.read.duration_ms", float64(time.Since(readStart).Microseconds())/1000.0, tags) + m.telemetry.SetGauge("source.read.samples", float64(len(out)), nil) + if err != nil { + m.telemetry.IncCounter("source.read.error", 1, nil) + } + } + return out, err } func (m *sourceManager) ApplyConfig(cfg config.Config) error { diff --git a/cmd/sdrd/streaming_compare.go b/cmd/sdrd/streaming_compare.go new file mode 100644 index 0000000..dda334b --- /dev/null +++ b/cmd/sdrd/streaming_compare.go @@ -0,0 +1,45 @@ +package main + +import ( + "fmt" + + "sdr-wideband-suite/internal/demod/gpudemod" + "sdr-wideband-suite/internal/telemetry" +) + +func observeStreamingComparison(coll *telemetry.Collector, oracle gpudemod.StreamingExtractResult, prod gpudemod.StreamingExtractResult) { + if coll == nil { + return + } + metrics, stats := gpudemod.CompareOracleAndGPUHostOracle(oracle, prod) + tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", oracle.SignalID), "path", "streaming_compare") + coll.SetGauge("streaming.compare.n_out", float64(metrics.NOut), tags) + coll.SetGauge("streaming.compare.phase_count", float64(metrics.PhaseCount), tags) + coll.SetGauge("streaming.compare.history_len", float64(metrics.HistoryLen), tags) + coll.Observe("streaming.compare.ref_max_abs_err", metrics.RefMaxAbsErr, tags) + 
coll.Observe("streaming.compare.ref_rms_err", metrics.RefRMSErr, tags) + coll.SetGauge("streaming.compare.compare_count", float64(stats.Count), tags) + coll.SetGauge("streaming.compare.oracle_rate", float64(oracle.Rate), tags) + coll.SetGauge("streaming.compare.production_rate", float64(prod.Rate), tags) + coll.SetGauge("streaming.compare.oracle_output_len", float64(len(oracle.IQ)), tags) + coll.SetGauge("streaming.compare.production_output_len", float64(len(prod.IQ)), tags) + if len(oracle.IQ) > 0 { + oracleStats := computeIQHeadStats(oracle.IQ, 64) + coll.Observe("streaming.compare.oracle_head_mean_mag", oracleStats.meanMag, tags) + coll.Observe("streaming.compare.oracle_head_max_step", oracleStats.maxStep, tags) + } + if len(prod.IQ) > 0 { + prodStats := computeIQHeadStats(prod.IQ, 64) + coll.Observe("streaming.compare.production_head_mean_mag", prodStats.meanMag, tags) + coll.Observe("streaming.compare.production_head_max_step", prodStats.maxStep, tags) + } + coll.Event("streaming_compare_snapshot", "info", "streaming comparison snapshot", tags, map[string]any{ + "oracle_rate": oracle.Rate, + "production_rate": prod.Rate, + "oracle_output_len": len(oracle.IQ), + "production_output_len": len(prod.IQ), + "ref_max_abs_err": metrics.RefMaxAbsErr, + "ref_rms_err": metrics.RefRMSErr, + "compare_count": stats.Count, + }) +} diff --git a/cmd/sdrd/streaming_monitoring.go b/cmd/sdrd/streaming_monitoring.go new file mode 100644 index 0000000..f334a15 --- /dev/null +++ b/cmd/sdrd/streaming_monitoring.go @@ -0,0 +1,27 @@ +package main + +import ( + "fmt" + + "sdr-wideband-suite/internal/demod/gpudemod" + "sdr-wideband-suite/internal/telemetry" +) + +func observeStreamingResult(coll *telemetry.Collector, prefix string, res gpudemod.StreamingExtractResult) { + if coll == nil { + return + } + tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", res.SignalID), "path", prefix) + coll.SetGauge(prefix+".n_out", float64(res.NOut), tags) + 
coll.SetGauge(prefix+".phase_count", float64(res.PhaseCount), tags) + coll.SetGauge(prefix+".history_len", float64(res.HistoryLen), tags) + coll.SetGauge(prefix+".rate", float64(res.Rate), tags) + coll.SetGauge(prefix+".output_len", float64(len(res.IQ)), tags) + if len(res.IQ) > 0 { + stats := computeIQHeadStats(res.IQ, 64) + coll.Observe(prefix+".head_mean_mag", stats.meanMag, tags) + coll.Observe(prefix+".head_max_step", stats.maxStep, tags) + coll.Observe(prefix+".head_p95_step", stats.p95Step, tags) + coll.SetGauge(prefix+".head_low_magnitude_count", float64(stats.lowMag), tags) + } +} diff --git a/cmd/sdrd/streaming_production.go b/cmd/sdrd/streaming_production.go new file mode 100644 index 0000000..6198993 --- /dev/null +++ b/cmd/sdrd/streaming_production.go @@ -0,0 +1,50 @@ +package main + +import ( + "fmt" + + "sdr-wideband-suite/internal/demod/gpudemod" + "sdr-wideband-suite/internal/detector" + "sdr-wideband-suite/internal/telemetry" +) + +func extractForStreamingProduction( + extractMgr *extractionManager, + allIQ []complex64, + sampleRate int, + centerHz float64, + signals []detector.Signal, + aqCfg extractionConfig, + coll *telemetry.Collector, +) ([][]complex64, []int, error) { + out := make([][]complex64, len(signals)) + rates := make([]int, len(signals)) + jobs, err := buildStreamingJobs(sampleRate, centerHz, signals, aqCfg) + if err != nil { + return nil, nil, err + } + runner := extractMgr.get(len(allIQ), sampleRate) + if runner == nil { + return nil, nil, fmt.Errorf("streaming production path unavailable: no batch runner") + } + results, err := runner.StreamingExtractGPU(allIQ, jobs) + if err != nil { + return nil, nil, err + } + var oracleResults []gpudemod.StreamingExtractResult + if useStreamingOraclePath { + if streamingOracleRunner == nil || streamingOracleRunner.SampleRate != sampleRate { + streamingOracleRunner = gpudemod.NewCPUOracleRunner(sampleRate) + } + oracleResults, _ = streamingOracleRunner.StreamingExtract(allIQ, jobs) + } + for 
i, res := range results { + out[i] = res.IQ + rates[i] = res.Rate + observeStreamingResult(coll, "streaming.production", res) + if i < len(oracleResults) { + observeStreamingComparison(coll, oracleResults[i], res) + } + } + return out, rates, nil +} diff --git a/cmd/sdrd/streaming_refactor.go b/cmd/sdrd/streaming_refactor.go new file mode 100644 index 0000000..9ad2260 --- /dev/null +++ b/cmd/sdrd/streaming_refactor.go @@ -0,0 +1,137 @@ +package main + +import ( + "math" + + "sdr-wideband-suite/internal/demod/gpudemod" + "sdr-wideband-suite/internal/detector" + "sdr-wideband-suite/internal/telemetry" +) + +const useStreamingOraclePath = false // temporarily disable oracle during bring-up to isolate production-path runtime behavior +const useStreamingProductionPath = true // route top-level extraction through the new production path during bring-up/validation + +var streamingOracleRunner *gpudemod.CPUOracleRunner + +func buildStreamingJobs(sampleRate int, centerHz float64, signals []detector.Signal, aqCfg extractionConfig) ([]gpudemod.StreamingExtractJob, error) { + jobs := make([]gpudemod.StreamingExtractJob, len(signals)) + bwMult := aqCfg.bwMult + if bwMult <= 0 { + bwMult = 1.0 + } + firTaps := aqCfg.firTaps + if firTaps <= 0 { + firTaps = 101 + } + for i, sig := range signals { + bw := sig.BWHz * bwMult + sigMHz := sig.CenterHz / 1e6 + isWFM := (sigMHz >= 87.5 && sigMHz <= 108.0) || + (sig.Class != nil && (sig.Class.ModType == "WFM" || sig.Class.ModType == "WFM_STEREO")) + var outRate int + if isWFM { + outRate = wfmStreamOutRate + if bw < wfmStreamMinBW { + bw = wfmStreamMinBW + } + } else { + // Non-WFM target: must be an exact integer divisor of sampleRate. + // The old hardcoded 200000 fails for common SDR rates (e.g. 4096000/200000=20.48). + // Find the nearest valid rate >= 128000 (enough for NFM/AM/SSB). 
+ outRate = nearestExactDecimationRate(sampleRate, 200000, 128000) + if bw < 20000 { + bw = 20000 + } + } + if _, err := gpudemod.ExactIntegerDecimation(sampleRate, outRate); err != nil { + return nil, err + } + offset := sig.CenterHz - centerHz + jobs[i] = gpudemod.StreamingExtractJob{ + SignalID: sig.ID, + OffsetHz: offset, + Bandwidth: bw, + OutRate: outRate, + NumTaps: firTaps, + ConfigHash: gpudemod.StreamingConfigHash(sig.ID, offset, bw, outRate, firTaps, sampleRate), + } + } + return jobs, nil +} + +func resetStreamingOracleRunner() { + if streamingOracleRunner != nil { + streamingOracleRunner.ResetAllStates() + } +} + +func extractForStreamingOracle( + allIQ []complex64, + sampleRate int, + centerHz float64, + signals []detector.Signal, + aqCfg extractionConfig, + coll *telemetry.Collector, +) ([][]complex64, []int, error) { + out := make([][]complex64, len(signals)) + rates := make([]int, len(signals)) + jobs, err := buildStreamingJobs(sampleRate, centerHz, signals, aqCfg) + if err != nil { + return nil, nil, err + } + if streamingOracleRunner == nil || streamingOracleRunner.SampleRate != sampleRate { + streamingOracleRunner = gpudemod.NewCPUOracleRunner(sampleRate) + } + results, err := streamingOracleRunner.StreamingExtract(allIQ, jobs) + if err != nil { + return nil, nil, err + } + for i, res := range results { + out[i] = res.IQ + rates[i] = res.Rate + observeStreamingResult(coll, "streaming.oracle", res) + } + return out, rates, nil +} + +func phaseIncForOffset(sampleRate int, offsetHz float64) float64 { + return -2.0 * math.Pi * offsetHz / float64(sampleRate) +} + +// nearestExactDecimationRate finds the output rate closest to targetRate +// (but not below minRate) that is an exact integer divisor of sampleRate. +// This avoids the ExactIntegerDecimation check failing for rates like +// 4096000/200000=20.48 which silently killed the entire streaming batch. 
+func nearestExactDecimationRate(sampleRate int, targetRate int, minRate int) int { + if sampleRate <= 0 || targetRate <= 0 { + return targetRate + } + if sampleRate%targetRate == 0 { + return targetRate // already exact + } + // Try decimation factors near the target + targetDecim := sampleRate / targetRate // floor + bestRate := 0 + bestDist := sampleRate // impossibly large + for d := max(1, targetDecim-2); d <= targetDecim+2; d++ { + rate := sampleRate / d + if rate < minRate { + continue + } + if sampleRate%rate != 0 { + continue // not exact (shouldn't happen since rate = sampleRate/d, but guard) + } + dist := targetRate - rate + if dist < 0 { + dist = -dist + } + if dist < bestDist { + bestDist = dist + bestRate = rate + } + } + if bestRate > 0 { + return bestRate + } + return targetRate // fallback — will fail ExactIntegerDecimation and surface the error +} diff --git a/cmd/sdrd/types.go b/cmd/sdrd/types.go index c96e5c6..0e36748 100644 --- a/cmd/sdrd/types.go +++ b/cmd/sdrd/types.go @@ -11,6 +11,7 @@ import ( "sdr-wideband-suite/internal/detector" "sdr-wideband-suite/internal/pipeline" "sdr-wideband-suite/internal/sdr" + "sdr-wideband-suite/internal/telemetry" ) type SpectrumDebug struct { @@ -110,6 +111,7 @@ type sourceManager struct { mu sync.RWMutex src sdr.Source newSource func(cfg config.Config) (sdr.Source, error) + telemetry *telemetry.Collector } type extractionManager struct { diff --git a/config.autosave.yaml b/config.autosave.yaml new file mode 100644 index 0000000..03dbd1c --- /dev/null +++ b/config.autosave.yaml @@ -0,0 +1,343 @@ +bands: + - name: uk-fm-broadcast + start_hz: 8.75e+07 + end_hz: 1.08e+08 +center_hz: 1.02e+08 +sample_rate: 4096000 +fft_size: 512 +gain_db: 32 +tuner_bw_khz: 5000 +use_gpu_fft: true +classifier_mode: combined +agc: true +dc_block: true +iq_balance: true +pipeline: + mode: wideband-balanced + profile: wideband-balanced + goals: + intent: broadcast-monitoring + monitor_start_hz: 8.8e+07 + monitor_end_hz: 1.08e+08 + 
monitor_span_hz: 2e+07 + monitor_windows: + - label: "" + zone: focus + start_hz: 8.75e+07 + end_hz: 1.08e+08 + center_hz: 0 + span_hz: 0 + priority: 1.25 + auto_record: false + auto_decode: false + - label: "" + zone: decode + start_hz: 8.75e+07 + end_hz: 1.08e+08 + center_hz: 0 + span_hz: 0 + priority: 1.35 + auto_record: false + auto_decode: false + signal_priorities: + - wfm + - rds + - broadcast + auto_record_classes: + - WFM + - WFM_STEREO + auto_decode_classes: + - WFM + - WFM_STEREO + - RDS +surveillance: + analysis_fft_size: 512 + frame_rate: 12 + strategy: multi-resolution + display_bins: 2048 + display_fps: 12 + derived_detection: auto +refinement: + enabled: true + max_concurrent: 24 + detail_fft_size: 4096 + min_candidate_snr_db: -3 + min_span_hz: 60000 + max_span_hz: 250000 + auto_span: true +resources: + prefer_gpu: true + max_refinement_jobs: 24 + max_recording_streams: 32 + max_decode_jobs: 16 + decision_hold_ms: 2500 +profiles: + - name: legacy + description: Current single-band pipeline behavior + pipeline: + mode: legacy + profile: legacy + goals: + intent: general-monitoring + monitor_start_hz: 0 + monitor_end_hz: 0 + monitor_span_hz: 0 + monitor_windows: [] + signal_priorities: [] + auto_record_classes: [] + auto_decode_classes: [] + surveillance: + analysis_fft_size: 2048 + frame_rate: 15 + strategy: single-resolution + display_bins: 2048 + display_fps: 15 + derived_detection: auto + refinement: + enabled: true + max_concurrent: 8 + detail_fft_size: 2048 + min_candidate_snr_db: 0 + min_span_hz: 0 + max_span_hz: 0 + auto_span: true + resources: + prefer_gpu: false + max_refinement_jobs: 8 + max_recording_streams: 16 + max_decode_jobs: 16 + decision_hold_ms: 2000 + - name: wideband-balanced + description: Baseline multi-resolution wideband surveillance + pipeline: + mode: wideband-balanced + profile: wideband-balanced + goals: + intent: broadcast-monitoring + monitor_start_hz: 0 + monitor_end_hz: 0 + monitor_span_hz: 0 + monitor_windows: [] + 
signal_priorities: + - wfm + - rds + - broadcast + auto_record_classes: + - WFM + - WFM_STEREO + auto_decode_classes: + - WFM + - WFM_STEREO + - RDS + surveillance: + analysis_fft_size: 4096 + frame_rate: 12 + strategy: multi-resolution + display_bins: 2048 + display_fps: 12 + derived_detection: auto + refinement: + enabled: true + max_concurrent: 24 + detail_fft_size: 4096 + min_candidate_snr_db: -3 + min_span_hz: 60000 + max_span_hz: 250000 + auto_span: true + resources: + prefer_gpu: true + max_refinement_jobs: 24 + max_recording_streams: 32 + max_decode_jobs: 16 + decision_hold_ms: 2500 + - name: wideband-aggressive + description: Higher surveillance/refinement budgets for dense wideband monitoring + pipeline: + mode: wideband-aggressive + profile: wideband-aggressive + goals: + intent: high-density-wideband-surveillance + monitor_start_hz: 0 + monitor_end_hz: 0 + monitor_span_hz: 0 + monitor_windows: [] + signal_priorities: + - wfm + - rds + - broadcast + - digital + auto_record_classes: [] + auto_decode_classes: [] + surveillance: + analysis_fft_size: 8192 + frame_rate: 10 + strategy: multi-resolution + display_bins: 4096 + display_fps: 10 + derived_detection: auto + refinement: + enabled: true + max_concurrent: 32 + detail_fft_size: 8192 + min_candidate_snr_db: -3 + min_span_hz: 50000 + max_span_hz: 280000 + auto_span: true + resources: + prefer_gpu: true + max_refinement_jobs: 32 + max_recording_streams: 40 + max_decode_jobs: 24 + decision_hold_ms: 2500 + - name: archive + description: Record-first monitoring profile + pipeline: + mode: archive + profile: archive + goals: + intent: archive-and-triage + monitor_start_hz: 0 + monitor_end_hz: 0 + monitor_span_hz: 0 + monitor_windows: [] + signal_priorities: + - wfm + - broadcast + - digital + auto_record_classes: [] + auto_decode_classes: [] + surveillance: + analysis_fft_size: 4096 + frame_rate: 12 + strategy: single-resolution + display_bins: 2048 + display_fps: 12 + derived_detection: auto + refinement: + 
enabled: true + max_concurrent: 16 + detail_fft_size: 4096 + min_candidate_snr_db: -2 + min_span_hz: 50000 + max_span_hz: 250000 + auto_span: true + resources: + prefer_gpu: true + max_refinement_jobs: 16 + max_recording_streams: 40 + max_decode_jobs: 16 + decision_hold_ms: 3000 + - name: digital-hunting + description: Digital-first refinement and decode focus + pipeline: + mode: digital-hunting + profile: digital-hunting + goals: + intent: digital-surveillance + monitor_start_hz: 0 + monitor_end_hz: 0 + monitor_span_hz: 0 + monitor_windows: [] + signal_priorities: + - rds + - digital + - wfm + auto_record_classes: [] + auto_decode_classes: [] + surveillance: + analysis_fft_size: 4096 + frame_rate: 12 + strategy: multi-resolution + display_bins: 2048 + display_fps: 12 + derived_detection: auto + refinement: + enabled: true + max_concurrent: 20 + detail_fft_size: 4096 + min_candidate_snr_db: -2 + min_span_hz: 50000 + max_span_hz: 200000 + auto_span: true + resources: + prefer_gpu: true + max_refinement_jobs: 20 + max_recording_streams: 20 + max_decode_jobs: 24 + decision_hold_ms: 2500 +detector: + threshold_db: -60 + min_duration_ms: 500 + hold_ms: 1500 + ema_alpha: 0.025 + hysteresis_db: 10 + min_stable_frames: 4 + gap_tolerance_ms: 2000 + cfar_mode: GOSCA + cfar_guard_hz: 200000 + cfar_train_hz: 100000 + cfar_guard_cells: 3 + cfar_train_cells: 24 + cfar_rank: 36 + cfar_scale_db: 23 + cfar_wrap_around: true + edge_margin_db: 6 + max_signal_bw_hz: 260000 + merge_gap_hz: 20000 + class_history_size: 10 + class_switch_ratio: 0.6 +recorder: + enabled: false + min_snr_db: 0 + min_duration: 500ms + max_duration: 300s + preroll_ms: 500 + record_iq: false + record_audio: true + auto_demod: true + auto_decode: false + max_disk_mb: 0 + output_dir: data/recordings + class_filter: [] + ring_seconds: 12 + deemphasis_us: 50 + extraction_fir_taps: 101 + extraction_bw_mult: 1.35 + debug_live_audio: false +decoder: + ft8_cmd: C:/WSJT/wsjtx-2.7.0-rc6/bin/jt9.exe -8 {audio} + 
wspr_cmd: C:/WSJT/wsjtx-2.7.0-rc6/bin/wsprd.exe {audio} + dmr_cmd: tools/dsd-neo/bin/dsd-neo.exe -fs -i {audio} -s {sr} -o null + dstar_cmd: tools/dsd-neo/bin/dsd-neo.exe -fd -i {audio} -s {sr} -o null + fsk_cmd: tools/fsk/fsk_decoder --iq {iq} --sample-rate {sr} + psk_cmd: tools/psk/psk_decoder --iq {iq} --sample-rate {sr} +debug: + audio_dump_enabled: false + cpu_monitoring: false + telemetry: + enabled: true + heavy_enabled: false + heavy_sample_every: 12 + metric_sample_every: 8 + metric_history_max: 6000 + event_history_max: 1500 + retention_seconds: 900 + persist_enabled: false + persist_dir: debug/telemetry + rotate_mb: 16 + keep_files: 8 +logging: + level: error + categories: [] + rate_limit_ms: 1000 + stdout: true + stdout_color: true + file: logs/trace.log + file_level: error + time_format: "15:04:05" + disable_time: false +web_addr: :8080 +event_path: data/events.jsonl +frame_rate: 12 +waterfall_lines: 200 +web_root: web diff --git a/config.yaml b/config.yaml index 6d8f0de..53cdb0b 100644 --- a/config.yaml +++ b/config.yaml @@ -248,14 +248,29 @@ decoder: dstar_cmd: tools/dsd-neo/bin/dsd-neo.exe -fd -i {audio} -s {sr} -o null fsk_cmd: tools/fsk/fsk_decoder --iq {iq} --sample-rate {sr} psk_cmd: tools/psk/psk_decoder --iq {iq} --sample-rate {sr} +debug: + audio_dump_enabled: false + cpu_monitoring: false + telemetry: + enabled: true + heavy_enabled: false + heavy_sample_every: 12 + metric_sample_every: 8 + metric_history_max: 6000 + event_history_max: 1500 + retention_seconds: 900 + persist_enabled: true + persist_dir: debug/telemetry + rotate_mb: 16 + keep_files: 8 logging: - level: debug - categories: [capture, extract, demod, resample, drop, ws, boundary] - rate_limit_ms: 500 + level: error + categories: [] + rate_limit_ms: 1000 stdout: true stdout_color: true - file: logs/trace.log - file_level: debug + file: "" + file_level: error time_format: "15:04:05" disable_time: false web_addr: :8080 diff --git a/docs/audio-click-debug-notes-2026-03-24.md 
b/docs/audio-click-debug-notes-2026-03-24.md new file mode 100644 index 0000000..44a12ba --- /dev/null +++ b/docs/audio-click-debug-notes-2026-03-24.md @@ -0,0 +1,1077 @@ +# Audio Click Debug Notes — 2026-03-24 + +## Context + +This note captures the intermediate findings from the live/recording audio click investigation on `sdr-wideband-suite`. + +Goal: preserve the reasoning, experiments, false leads, and current best understanding so future work does not restart from scratch. + +--- + +## High-level outcome so far + +**SOLVED** — the persistent audio clicking issue is now resolved. + +Final result: +- live listening test confirmed the clicks are gone +- the final fix set consists of three independent root-cause fixes plus two secondary fixes +- the CUDA DLL did **not** need a rebuild for the final fix + +This document now serves as the investigation log plus final resolution record. + +--- + +## What was tested + +### 1. Session/context recovery +- Reconstructed prior debugging context from reset-session backup files. +- Confirmed the relevant investigation was the persistent audio clicking bug in live audio / recordings. + +### 2. Codebase deep-read +Reviewed in detail: +- `cmd/sdrd/dsp_loop.go` +- `cmd/sdrd/pipeline_runtime.go` +- `cmd/sdrd/helpers.go` +- `internal/recorder/streamer.go` +- `internal/recorder/demod_live.go` +- `internal/dsp/fir.go` +- `internal/dsp/fir_stateful.go` +- `internal/dsp/resample.go` +- `internal/demod/fm.go` +- `internal/demod/gpudemod/*` +- `web/app.js` + +Main conclusion from static reading: the pipeline contains several stateful continuity mechanisms, so clicks are likely to emerge at boundaries or from phase/timing inconsistencies rather than from one obvious isolated bug. + +### 3. AM vs FM tests +Observed by ear: +- AM clicks too. +- Therefore this is **not** an FM-only issue. +- That shifted focus away from purely FM-specific explanations and toward shared-path / continuity / transport / demod-adjacent causes. + +### 4. 
Recording vs live path comparison +Observed by ear: +- Recordings click too. +- Therefore browser/WebSocket/live playback is **not** the sole cause. +- The root problem exists in the server-side audio pipeline before browser playback. + +### 5. Boundary instrumentation added +Temporary diagnostics were added to inspect: +- extract trimming +- snippet lengths +- demod path lengths +- boundary click / intra-click detector +- IQ continuity at various stages + +### 6. Discriminator-overlap hypothesis +A test switch temporarily disabled the extra 1-sample discriminator overlap prepend in `streamer.go`. + +Result: +- This extra overlap **was** a real problem. +- It caused the downstream decimation phase to flip between blocks. +- Removing it cleaned up the boundary model and was the correct change. + +However: +- Removing it did **not** eliminate the audible clicks. +- Therefore it was a real bug, but **not the main remaining root cause**. + +### 7. GPU vs CPU extraction test +Forced CPU-only stream extraction. + +Result: +- CPU-only made things dramatically worse in real time. +- Large `feed_gap` values appeared. +- Huge backlogs built up. +- Therefore CPU-only is not a solution, and the GPU path is not the sole main problem. + +### 8. Fixed read-size test +Forced a constant extraction read size (`389120`) instead of variable read sizing based on backlog. + +Result: +- `allIQ`, `gpuIQ_len`, `raw_len`, and `out_len` became very stable. +- This reduced pipeline variability and made logs much cleaner. +- Subjectively, audio may have become slightly better, but clicks remained. +- Therefore variable block sizing is likely a contributing factor, but not the full explanation. + +### 9. Multi-stage audio dump test +Added optional debug dumping for: +- demod audio (`*-demod.wav`) +- final audio after resampler (`*-final.wav`) + +Observed by ear: +- Clicks are present in **both** dump types. +- Therefore the click is already present by the time demodulated audio exists. 
+- Resampler/final audio path is not the primary origin. + +### 10. CPU monitoring +A process-level CSV monitor was added and used. + +Result: +- Overall process CPU usage was modest (not near full machine saturation). +- This does **not** support “overall CPU is pegged” as the main explanation. +- Caveat: this does not fully exclude a hot thread or scheduler issue, but gross total CPU overload is not the main story. + +--- + +## What we now know with reasonable confidence + +### A. The issue is not primarily caused by: +- Browser playback +- WebSocket transport +- Final PCM fanout only +- Resampler alone +- CPU-only vs GPU-only as the core dichotomy +- The old extra discriminator overlap prepend (that was a bug, but not the remaining dominant one) +- Purely variable block sizes alone +- Gross whole-process CPU saturation + +### B. The issue is server-side and exists before final playback +Because: +- recordings click +- demod dump clicks +- final dump clicks + +### C. The issue is present by the demodulated audio stage +This is one of the strongest current findings. + +### D. The WFM/FM-demod-adjacent path remains highly suspicious +Current best area of suspicion: +- decimated IQ may still contain subtle corruption/instability not fully captured by current metrics +- OR the FM discriminator (`fmDiscrim`) is producing pathological output from otherwise “boundary-clean-looking” IQ + +--- + +## Important runtime/pathology observations + +### 1. Backlog amplification is real +Several debug runs showed severe buffer growth and drops: +- large `buf=` values +- growing `drop=` counts +- repeated `audio_gap` + +This means some debug configurations can easily become self-distorting and produce additional artifacts that are not representative of the original bug. + +### 2. 
Too much debug output causes self-inflicted load +At one point: +- rate limiter was disabled (`rate_limit_ms: 0`) +- aggressive boundary logging was enabled +- many short WAV files were generated + +This clearly increased overhead and likely polluted some runs. + +### 3. Many short WAVs were a bad debug design +That was replaced with a design intended to write one continuous window file instead of many micro-files. + +### 4. Total process CPU saturation does not appear to be the main cause +A process-level CSV monitor was collected and showed only modest total CPU utilisation during the relevant tests. +This does **not** support a simple “the machine is pegged” explanation. +A hot thread / scheduling issue is still theoretically possible, but gross overall CPU overload is not the main signal. + +--- + +## Current debug state in repo + +### Branch +All current work is on: +- `debug/audio-clicks` + +### Commits so far +- `94c132d` — `debug: instrument audio click investigation` +- `ffbc45d` — `debug: add advanced boundary metering` + +### Current config/logging state +The active debug logging was trimmed down to: +- `demod` +- `discrim` +- `gap` +- `boundary` + +Rate limit is currently back to a nonzero value to avoid self-induced spam. + +### Dump/CPU debug state +A `debug:` config section was added with: +- `audio_dump_enabled: false` +- `cpu_monitoring: false` + +Meaning: +- heavy WAV dumping is now OFF by default +- CPU monitoring is conceptually OFF by default (script still exists, but must be explicitly used) + +--- + +## Most important code changes/findings to remember + +### 1. Removed the extra discriminator overlap prepend in `streamer.go` +This was a correct fix. + +Reason: +- it introduced a blockwise extra IQ sample +- this shifted decimation phase between blocks +- it created real boundary artifacts + +This should **not** be reintroduced casually. + +### 2. 
Fixed read-size test exists and is useful for investigation +A temporary mechanism exists to force stable extraction block sizes. +This is useful diagnostically because it removes one source of pipeline variability. + +**IMPORTANT DECISION / DO NOT LOSE:** +- The fixed read-size path currently lives behind the environment variable `SDR_FORCE_FIXED_STREAM_READ_SAMPLES`. +- The tested value `389120` clearly helps by making `allIQ`, `gpuIQ_len`, `raw_len`, and `out_len` much more stable and by reducing one major source of pipeline variability. +- Current plan: **once the remaining click root cause is solved, promote this behavior into the normal code path instead of leaving it as an env-var-only debug switch.** +- In other words: treat fixed read sizing as a likely permanent stabilization improvement, but do not bake it in blindly until the click investigation is complete. + +### 3. FM discriminator metering exists +`internal/demod/fm.go` now emits targeted discriminator stats under `discrim` logging, including: +- min/max IQ magnitude +- maximum absolute phase step +- count of large phase steps + +This was useful to establish that large discriminator steps correlate with low IQ magnitude, but discriminator logging was later disabled from the active category list to reduce log spam. + +### 4. Strong `dec`-IQ findings before demod +Additional metering in `streamer.go` showed: +- repeated `dec_iq_head_dip` +- repeated low magnitude near `min_idx ~= 25` +- repeated early large local phase step near `max_step_idx ~= 24` +- repeated `demod_boundary` and audible clicks shortly afterward + +This is the strongest currently known mechanism in the chain. + +### 5. Group delay observation +For the current pre-demod FIR: +- taps = `101` +- FIR group delay = `(101 - 1) / 2 = 50` input samples +- with `decim1 = 2`, this projects to about `25` output samples + +This matches the repeatedly observed problematic `dec` indices (~24-25) remarkably well. 
+That strongly suggests the audible issue is connected to the FIR/decimation settling region at the beginning of the `dec` block. + +### 6. Pre-FIR vs post-FIR comparison +A dedicated pre-FIR probe was added on `fullSnip` (the input to the pre-demod FIR) and compared against the existing `dec`-side probes. + +Observed pattern: +- pre-FIR head probe usually looked relatively normal +- no equally strong or equally reproducible hot spot appeared there +- after FIR + decimation, the problematic dip/step repeatedly appeared near `dec` indices ~24-25 + +Interpretation: +- the strongest currently observed defect is **not already present in the same form before the FIR** +- it is much more likely to emerge in the FIR/decimation section (or its settling behavior) than in the raw pre-FIR input + +### 7. Head-trim test results +A debug head-trim on `dec` was tested. +Subjective result: +- `trim=32` sounded best among the tested values (`16/32/48/64`) +- but it did **not** remove the clicks entirely + +Interpretation: +- the early `dec` settling region is a real contributor +- but it is probably not the only contributor, or trimming alone is not the final correct fix + +### 8. Current architectural conclusion +The likely clean fix is **not** to keep trimming samples away. +The FIR/decimation section is still suspicious, but later tests showed it is likely not the sole origin. 
+ +Important nuance: +- the currently suspicious FIR + decimation section is already running in **Go/CPU** (`processSnippet`), not in CUDA +- therefore the next correctness fix should be developed and validated in Go first + +Later update: +- a stateful decimating FIR / polyphase-style replacement was implemented in Go and tested +- it was architecturally cleaner than the old separated FIR->decimate handoff +- but it did **not** remove the recurring hot spot / clicks +- therefore the old handoff was not the whole root cause, even if the newer path is still cleaner + +--- + +## Best current hypothesis + +The remaining audible clicks are most likely generated **at or immediately before FM demodulation**. + +Most plausible interpretations: +1. The decimated IQ stream still contains subtle corruption/instability not fully captured by the earliest boundary metrics. +2. The FM discriminator is reacting violently to short abnormal IQ behavior inside blocks, not just at block boundaries. +3. The problematic region is likely a **very specific early decimated-IQ settling zone**, not broad corruption across the whole block. + +At this point, the most valuable next data is low-overhead IQ telemetry right before demod, plus carefully controlled demod-vs-final audio comparison. + +### Stronger updated working theory (later findings, same day) + +After discriminator-focused metering and targeted `dec`-IQ probes, the strongest current theory is: + +> A reproducible early defect in the `dec` IQ block appears around sample index **24-25**, where IQ magnitude dips sharply and the effective FM phase step becomes abnormally large. This then shows up as `demod_boundary` and audible clicks. 
+ +Crucially: +- this issue appears in `demod.wav`, so it exists before the final resampler/playback path +- it is **not** spread uniformly across the whole `dec` block +- it repeatedly appears near the same index +- trimming the first ~32 samples subjectively reduces the click, but does not eliminate it entirely + +This strongly suggests a **settling/transient zone at the beginning of the decimated IQ block**. + +Later refinements to this theory: +- pre-FIR probing originally looked cleaner than post-FIR probing, which made FIR/decimation look like the main culprit +- however, a temporary FIR bypass showed the clicks were still present, only somewhat quieter / less aggressive +- this indicates the pre-demod FIR likely amplifies or sharpens an upstream issue, but is not the sole origin +- a cleaner stateful decimating FIR implementation also failed to eliminate the recurring hot spot, further weakening the idea that the old FIR->decimate handoff alone caused the bug + +--- + +## Recommended next steps + +1. Run with reduced logging only and keep heavy dump features OFF unless explicitly needed. +2. Continue investigating the extractor path and its immediate surroundings (`extractForStreaming`, signal parameter source, offset/BW stability, overlap/trim behavior). +3. Treat FIR/decimation as a possible amplifier/focuser of the issue, but not the only suspect. +4. When testing fixes, prefer low-overhead, theory-driven experiments over broad logging/dump spam. +5. Only re-enable audio dump windows selectively and briefly. + +### Debug TODO / operational reminders + +- The current telemetry collector is **not** using a true ring buffer for metric/event history. +- Internally it keeps append-only history slices (`metricsHistory`, `events`) and periodically trims them by copying tail slices. +- Under heavy per-block telemetry this can add enough mutex/copy overhead to make the live stream start stuttering after a short run. 
+- Therefore: keep telemetry sampling conservative during live reproduction runs; do **not** leave full heavy telemetry enabled longer than needed. +- Follow-up engineering task: replace or redesign telemetry history storage to use a proper low-overhead ring-buffer style structure (or equivalent bounded lock-light design) if live telemetry is to remain a standard debugging tool. + +--- + +## 2026-03-25 update — extractor-focused live telemetry findings + +### Where the investigation moved + +The investigation was deliberately refocused away from browser/feed/demod-only suspicions and toward: +- shared upstream IQ cadence / block boundaries +- extractor input/output continuity +- raw vs trimmed extractor-head behaviour + +This was driven by two observations: +1. all signals still click +2. the newly added live telemetry made it possible to inspect the shared path while the system was running + +### Telemetry infrastructure / config notes + +Two config files matter for debug telemetry defaults: +- `config.yaml` +- `config.autosave.yaml` + +The autosave file can overwrite intended telemetry defaults after restart, so both must be updated together. + +Current conservative live-debug defaults that worked better: +- `heavy_enabled: false` +- `heavy_sample_every: 12` +- `metric_sample_every: 8` +- `metric_history_max: 6000` +- `event_history_max: 1500` + +Important operational lesson: +- runtime `POST /api/debug/telemetry/config` changes only affect the current `sdrd` process +- after restart, the process reloads config defaults again +- if autosave still contains older values (for example `heavy_enabled: true` or very large history limits), the debug run can accidentally become self-distorting again + +### Telemetry endpoints + +The live debug work used these HTTP endpoints on the `sdrd` web server (typically `http://127.0.0.1:8080`): + +#### `GET /api/debug/telemetry/config` +Returns the current effective telemetry configuration. 
+Useful for verifying:
- whether heavy telemetry is enabled
- history sizes
- persistence settings
- sample rates actually active in the running process

Typical fields:
- `enabled`
- `heavy_enabled`
- `heavy_sample_every`
- `metric_sample_every`
- `metric_history_max`
- `event_history_max`
- `retention_seconds`
- `persist_enabled`
- `persist_dir`

#### `POST /api/debug/telemetry/config`
Applies runtime telemetry config changes to the current process.
Used during investigation to temporarily adjust telemetry load (for example, re-enabling heavy probes only at conservative sampling rates) without editing config files.

Example body used during investigation:
```json
{
  "heavy_enabled": true,
  "heavy_sample_every": 12,
  "metric_sample_every": 8
}
```

#### `GET /api/debug/telemetry/live`
Returns the current live metric snapshot (gauges/counters/distributions).
Useful for:
- quick sanity checks
- verifying that a metric family exists
- confirming whether a new metric name is actually being emitted

#### `GET /api/debug/telemetry/history?prefix=&limit=`
Returns stored metric history entries filtered by metric-name prefix.
This is the main endpoint for time-series debugging during live runs.

Useful examples:
- `prefix=stage.`
- `prefix=source.`
- `prefix=iq.boundary.all`
- `prefix=iq.extract.input`
- `prefix=iq.extract.output`
- `prefix=iq.extract.raw.`
- `prefix=iq.extract.trimmed.`
- `prefix=iq.pre_demod`
- `prefix=audio.demod`

#### `GET /api/debug/telemetry/events?limit=`
Returns recent structured telemetry events.
Used heavily once compact per-block event probes were added, because events were often easier to inspect reliably than sparsely sampled distribution histories.
+ +This ended up being especially useful for: +- raw extractor head probes +- trimmed extractor head probes +- extractor input head probes +- GPU kernel input/output head probes +- boundary snapshots + +### Important telemetry families added/used + +#### Shared-path / global boundary metrics +- `iq.boundary.all.head_mean_mag` +- `iq.boundary.all.prev_tail_mean_mag` +- `iq.boundary.all.delta_mag` +- `iq.boundary.all.delta_phase` +- `iq.boundary.all.discontinuity_score` + +Purpose: +- detect whether the shared `allIQ` block boundary was already obviously broken before signal-specific extraction + +#### Extractor input/output metrics +- `iq.extract.input.length` +- `iq.extract.input.overlap_length` +- `iq.extract.input.head_mean_mag` +- `iq.extract.input.prev_tail_mean_mag` +- `iq.extract.input.discontinuity_score` +- `iq.extract.output.length` +- `iq.extract.output.head_mean_mag` +- `iq.extract.output.head_min_mag` +- `iq.extract.output.head_max_step` +- `iq.extract.output.head_p95_step` +- `iq.extract.output.head_tail_ratio` +- `iq.extract.output.head_low_magnitude_count` +- `iq.extract.output.boundary.delta_mag` +- `iq.extract.output.boundary.delta_phase` +- `iq.extract.output.boundary.d2` +- `iq.extract.output.boundary.discontinuity_score` + +Purpose: +- isolate whether the final per-signal extractor output itself was discontinuous across blocks + +#### Raw vs trimmed extractor-head telemetry +- `iq.extract.raw.length` +- `iq.extract.raw.head_mag` +- `iq.extract.raw.tail_mag` +- `iq.extract.raw.head_zero_count` +- `iq.extract.raw.first_nonzero_index` +- `iq.extract.raw.head_max_step` +- `iq.extract.trim.trim_samples` +- `iq.extract.trimmed.head_mag` +- `iq.extract.trimmed.tail_mag` +- `iq.extract.trimmed.head_zero_count` +- `iq.extract.trimmed.first_nonzero_index` +- `iq.extract.trimmed.head_max_step` +- event `extract_raw_head_probe` +- event `extract_trimmed_head_probe` + +Purpose: +- answer the key question: is the corruption already present in the raw 
extractor output head, or created by trimming/overlap logic afterward? + +#### Additional extractor input / GPU-kernel probe telemetry +- `iq.extract.input_head.zero_count` +- `iq.extract.input_head.first_nonzero_index` +- `iq.extract.input_head.max_step` +- event `extract_input_head_probe` +- event `gpu_kernel_input_head_probe` +- event `gpu_kernel_output_head_probe` + +Purpose: +- split the remaining uncertainty between: + - signal-specific input already being bad + - GPU extractor kernel/start semantics producing the bad raw head + - later output assembly after the kernel + +#### Pre-demod / audio-stage metrics +- `iq.pre_demod.head_mean_mag` +- `iq.pre_demod.head_min_mag` +- `iq.pre_demod.head_max_step` +- `iq.pre_demod.head_p95_step` +- `iq.pre_demod.head_low_magnitude_count` +- `audio.demod.head_mean_abs` +- `audio.demod.tail_mean_abs` +- `audio.demod.edge_delta_abs` +- existing `audio.demod_boundary.*` + +Purpose: +- verify where artifacts become visible/audible downstream + +### What the 2026-03-25 telemetry actually showed + +#### 1. Feed / enqueue remained relatively uninteresting +`stage.feed_enqueue.duration_ms` was usually effectively zero. + +Representative values during live runs: +- mostly `0` +- occasional small spikes such as `0.5 ms` and `5.8 ms` + +Interpretation: +- feed enqueue is not the main source of clicks + +#### 2. Extract-stream time was usually modest +`stage.extract_stream.duration_ms` was usually small and stable compared with the main loop. + +Representative values: +- often `1–5 ms` +- occasional spikes such as `10.7 ms` and `18.9 ms` + +Interpretation: +- extraction is not free, but runtime cost alone does not explain the clicks + +#### 3. 
Shared capture / source cadence still fluctuated heavily +Representative live values: +- `dsp.frame.duration_ms`: often around `90–100 ms`, but also `110–150 ms`, with one observed spike around `212.6 ms` +- `source.read.duration_ms`: roughly `80–90 ms` often, but also about `60 ms`, `47 ms`, `19 ms`, and even `0.677 ms` +- `source.buffer_samples`: ranged from very small to very large bursts, including examples like `512`, `4608`, `94720`, `179200`, `304544` +- a `source_reset` event was seen and `source.resets=1` + +Interpretation: +- shared upstream cadence is clearly unstable enough to remain suspicious +- but this alone did not localize the final click mechanism + +#### 4. Pre-demod stage showed repeated hard phase anomalies even when energy looked healthy +Representative live values for normal non-vanishing signals: +- `iq.pre_demod.head_mean_mag` around `0.25–0.31` +- `iq.pre_demod.head_low_magnitude_count = 0` +- `iq.pre_demod.head_max_step` repeatedly high, including roughly: + - `1.5` + - `2.0` + - `2.4` + - `2.8` + - `3.08` + +Interpretation: +- not primarily an amplitude collapse +- rather a strong phase/continuity defect reaching the pre-demod stage + +#### 5. Audio stage still showed real block-edge artifacts +Representative values: +- `audio.demod.edge_delta_abs` repeatedly around `0.4–0.8` +- outliers up to roughly `1.21` and `1.26` +- `audio.demod_boundary.count` continued to fire repeatedly + +Interpretation: +- demod is where the problem becomes audible, but the root cause still appeared to be earlier/shared + +### Key extractor findings from the new telemetry + +#### A. 
Per-signal extractor output boundary is genuinely broken +For a representative strong signal (`signal_id=2`), `iq.extract.output.boundary.delta_phase` repeatedly showed very large jumps such as: +- `2.60` +- `3.06` +- `2.14` +- `2.71` +- `3.09` +- `2.92` +- `2.63` +- `2.78` + +Also observed for `iq.extract.output.boundary.discontinuity_score`: +- `2.86` +- `3.08` +- `2.92` +- `2.52` +- `2.40` +- `2.85` + +Later runs using `d2` made the discontinuity even easier to see. Representative `iq.extract.output.boundary.d2` values for the same strong signal included: +- `0.347` +- `0.303` +- `0.362` +- `0.359` +- `0.382` +- `0.344` +- `0.337` +- `0.206` + +At the same time, `iq.extract.output.boundary.delta_mag` was often comparatively small (examples around `0.0003–0.0038`). + +Interpretation: +- the main boundary defect is not primarily amplitude mismatch +- it is much more consistent with complex/phase discontinuity across output blocks + +#### B. The raw extractor head is systematically bad on all signals +The new `extract_raw_head_probe` events were the strongest finding of the day. + +Representative repeated pattern for strong signals (`signal_id=1` and `signal_id=2`): +- `first_nonzero_index = 1` +- `zero_count = 1` +- first magnitude sample exactly `0` +- then a short ramp: e.g. 
for `signal_id=2` + - `0` + - `0.000388` + - `0.002316` + - `0.004152` + - `0.019126` + - `0.011418` + - `0.124034` + - `0.257569` + - `0.317579` +- `head_max_step` often near π, e.g.: + - `3.141592653589793` + - `3.088773696463606` + - `3.0106854446936318` + - `2.9794833659932527` + +The same qualitative pattern appeared for weaker signals too: +- raw head starts at `0` +- a brief near-zero ramp follows +- only after several samples does the magnitude look like a normal extracted band + +Interpretation: +- the raw extractor output head is already damaged / settling / invalid before trimming +- this strongly supports an upstream/shared-start-condition problem rather than a trim-created artifact + +#### C. The trimmed extractor head usually looks sane +Representative repeated pattern for the same signals after `trim_samples = 64`: +- `first_nonzero_index = 0` +- `zero_count = 0` +- magnitudes look immediately plausible and stable +- `head_max_step` is dramatically lower than raw, often around `0.15–0.9` for strong channels + +Example trimmed head magnitudes for `signal_id=2`: +- `0.299350` +- `0.300954` +- `0.298032` +- `0.298738` +- `0.312258` +- `0.296932` +- `0.239010` +- `0.266881` +- `0.313193` + +Example trimmed head magnitudes for `signal_id=1`: +- `0.277400` +- `0.275994` +- `0.273718` +- `0.272846` +- `0.277842` +- `0.278398` +- `0.268829` +- `0.273790` +- `0.279031` + +Interpretation: +- trimming is removing a genuinely bad raw head region +- trimming is therefore **not** the main origin of the problem +- it acts more like cleanup of an already bad upstream/raw start region + +### Input-vs-raw-vs-trimmed extractor result (important refinement) + +A later, more targeted telemetry pass added a direct probe on the signal-specific extractor input head (`extract_input_head_probe`) and compared it against the raw and trimmed extractor output heads. + +This materially refined the earlier conclusion. 
+

#### Input-head result
Representative values from `iq.extract.input_head.*`:
- `iq.extract.input_head.zero_count = 0`
- `iq.extract.input_head.first_nonzero_index = 0`

Interpretation:
- the signal-specific input head going into the GPU extractor is **not** starting with a zero sample
- the head is not arriving already dead/null at the immediate input probe point

#### Raw-head result
Representative values from `iq.extract.raw.*`:
- `iq.extract.raw.head_mag = 0`
- `iq.extract.raw.head_zero_count = 1`
- `iq.extract.raw.head_max_step` frequently around `2.4–3.14`

These values repeated for strong channels such as `signal_id=2`, and similarly across other signals.

Interpretation:
- the first raw output sample is repeatedly exactly zero
- therefore the visibly bad raw head is being created **after** the probed input head and **before/during raw extractor output generation**

#### Trimmed-head result
Representative values from `iq.extract.trimmed.*`:
- `iq.extract.trimmed.head_zero_count = 0`
- `iq.extract.trimmed.head_mag` often looked healthy immediately after trimming, for example:
  - signal 1: about `0.275–0.300`
  - signal 2: about `0.311`
- `iq.extract.trimmed.head_max_step` was much lower than raw for strong channels, often around:
  - `0.11`
  - `0.14`
  - `0.19`
  - `0.30`
  - `0.75`

Interpretation:
- trimming cleans up the visibly bad raw head region
- trimming still does **not** explain the deeper output-boundary continuity issue

### Further refinement after direct extractor-input and GPU-kernel probes

A final telemetry round added:
- `extract_input_head_probe`
- `gpu_kernel_input_head_probe`
- `gpu_kernel_output_head_probe`

These probes further sharpened the likely fault location.
+ +#### Signal-specific extractor input head looked sane +Representative values: +- `iq.extract.input_head.zero_count = 0` +- `iq.extract.input_head.first_nonzero_index = 0` + +Interpretation: +- at the observed signal-specific input probe point, the GPU extractor is **not** receiving a dead/null head + +#### Raw GPU output head remained systematically broken +Representative repeated values: +- `iq.extract.raw.head_mag = 0` +- `iq.extract.raw.head_zero_count = 1` +- `iq.extract.raw.head_max_step` repeatedly around: + - `3.141592653589793` + - `3.122847934305907` + - `3.101915352902961` + - `3.080672178550904` + - `3.062425574273907` + - `2.9785041567778427` + - `2.7508533785793476` + +Representative repeated examples from strong channels: +- signal 2: `head_mag = 0`, `head_zero_count = 1` +- signal 3: `head_mag = 0`, `head_zero_count = 1` +- signal 1/4 showed the same qualitative head-zero pattern as well + +Interpretation: +- the raw extractor output head is still repeatedly born broken +- the problem is therefore after the currently probed input head and before/during raw output creation + +#### Trimmed head still looked healthier +Representative values: +- `iq.extract.trimmed.head_zero_count = 0` +- signal 1 `iq.extract.trimmed.head_mag` repeatedly around: + - `0.2868` + - `0.2907` + - `0.3036` + - `0.3116` + - `0.2838` + - `0.2760` +- signal 2 examples: + - `0.3461` + - `0.3182` + +Representative `iq.extract.trimmed.head_max_step` values for strong channels were much lower than raw, often around: +- `0.11` +- `0.13` +- `0.21` +- `0.30` +- `0.44` +- `0.69` +- `0.86` + +Interpretation: +- trimming still removes the most visibly broken head region +- but trimming does not explain the deeper output-boundary continuity issue + +### Refined strongest current conclusion after the full 2026-03-25 telemetry pass + +The strongest current reading is now: + +> The click root cause is very likely **not** that the signal-specific extractor input already starts dead/null. 
Instead, the bad raw head appears to be introduced **inside the GPU extractor path itself** (or at its immediate start/output semantics) before final trimming. + +More specifically: +- signal-specific extractor input head looks non-zero and sane at the probe point +- raw GPU output head still repeatedly starts with an exact zero sample and a short bad settling region +- the trimmed head usually looks healthier +- yet the final extractor output still exhibits significant complex boundary discontinuity from block to block + +This now points away from a simple "shared global input head is already zero" theory and toward one of these narrower causes: +1. GPU extractor kernel start semantics / warmup / first-output handling +2. phase-start or alignment handling at extractor block start +3. raw GPU output assembly semantics within the extractor path + +### What should not be forgotten from this stage + +- The overlap-prepend bug was real and worth fixing, but was not sufficient. +- The fixed read-size path (`SDR_FORCE_FIXED_STREAM_READ_SAMPLES=389120`) remains useful and likely worth promoting later, but it is not the root-cause fix. +- The telemetry system itself can perturb runs if overused; conservative sampling matters. +- `config.autosave.yaml` must be kept in sync with `config.yaml` or telemetry defaults can silently revert after restart. +- The most promising root-cause area is now the shared upstream/extractor-start boundary path, not downstream playback. 
+ +### 2026-03-25 refactor work status (post-reviewer instruction) + +After the reviewer guidance, work pivoted away from symptomatic patching and onto the required two-track architecture change: + +#### Track 1 — CPU/oracle path repair (in progress) +The following was added to start building a trustworthy streaming oracle: +- `internal/demod/gpudemod/streaming_types.go` +- `internal/demod/gpudemod/cpu_oracle.go` +- `internal/demod/gpudemod/cpu_oracle_test.go` +- `internal/demod/gpudemod/streaming_oracle_extract.go` +- `internal/demod/gpudemod/polyphase.go` +- `internal/demod/gpudemod/polyphase_test.go` + +What exists now: +- explicit `StreamingExtractJob` / `StreamingExtractResult` +- explicit `CPUOracleState` +- exact integer decimation enforcement (`ExactIntegerDecimation`) +- monolithic-vs-chunked CPU oracle test +- explicit polyphase tap layout (`phase-major`) +- CPU oracle direct-vs-polyphase equivalence test +- persistent CPU oracle runner state keyed by signal ID +- config-hash reset behavior +- cleanup of disappeared signals from oracle state + +Important limitation: +- this is **not finished production validation yet** +- the CPU oracle path is being built toward the reviewer’s required semantics, but it is not yet the final signed-off oracle for GPU validation + +#### Track 2 — GPU path architecture refactor (in progress) +The following was added to begin the new stateful GPU architecture: +- `internal/demod/gpudemod/stream_state.go` +- `internal/demod/gpudemod/streaming_gpu_stub.go` +- `docs/gpu-streaming-refactor-plan-2026-03-25.md` +- `cmd/sdrd/streaming_refactor.go` + +What exists now: +- explicit `ExtractStreamState` +- batch-runner-owned per-signal state map +- config-hash reset behavior for GPU-side stream state +- exact integer decimation enforcement in relevant batch path +- base taps and polyphase taps initialized into GPU-side stream state +- explicit future production entry point: `StreamingExtractGPU(...)` +- explicit separation between 
current legacy extractor path and the new streaming/oracle path +- persistent oracle-runner lifecycle hooks, including reset on stream-drop events + +Important limitation: +- the new GPU production path is **not implemented yet** +- the legacy overlap+trim production path still exists and is still the current active path +- the new GPU entry point currently exists as an explicit architectural boundary and state owner, not as the finished stateful polyphase kernel path + +#### Tests currently passing during refactor +Repeatedly verified during the refactor work: +- `go test ./internal/demod/gpudemod/...` +- `go test ./cmd/sdrd/...` + +#### Incremental progress reached so far inside the refactor + +Additional progress after the initial refactor scaffolding: +- the CPU oracle runner now uses the explicit polyphase oracle path (`CPUOracleExtractPolyphase`) instead of only carrying polyphase tap data passively +- the CPU oracle now has a direct-vs-polyphase equivalence test +- the GPU-side stream state now initializes both `BaseTaps` and `PolyphaseTaps` +- the GPU side now has an explicit future production entry point `StreamingExtractGPU(...)` +- the GPU streaming stub now advances `NCOPhase` over NEW samples only +- the GPU streaming stub now advances `PhaseCount` modulo exact integer decimation +- the GPU streaming stub now builds and persists `ShiftedHistory` from already frequency-shifted NEW samples +- the new streaming/oracle path is explicitly separated from the current legacy overlap+trim production path + +Important current limitation: +- `StreamingExtractGPU(...)` still intentionally returns a not-implemented error rather than pretending to be the finished production path +- this is deliberate, to avoid hidden quick-fix semantics or silent goalpost shifts + +Additional note on the latest step: +- the GPU streaming stub now also reports an estimated output-count schedule (`NOut`) derived from NEW sample consumption plus carried `PhaseCount` +- this still does 
**not** make it a production path; it only means the stub now models output cadence semantics more honestly +- the new CPU/oracle path is also now exposing additional runtime telemetry such as `streaming.oracle.rate` and `streaming.oracle.output_len`, so the reference path becomes easier to inspect as it matures +- a reusable complex-slice comparison helper now exists (`CompareComplexSlices`) to support later oracle-vs-GPU equivalence work without improvising comparison logic at the last minute +- a dedicated `TestCPUOracleMonolithicVsChunkedPolyphase` now verifies chunked-vs-monolithic self-consistency for the polyphase oracle path specifically +- explicit reset tests now exist for both CPU oracle state and GPU streaming state, so config-change reset semantics are no longer only implicit in code review +- a dedicated `ExtractDebugMetrics` structure now exists as a future comparison/telemetry contract for reviewer-required state/error/boundary metrics +- the first mapper from oracle results into that debug-metric structure now exists, so the comparison contract is beginning to attach to real refactor code rather than staying purely conceptual +- the same minimal debug-metric mapping now also exists for GPU-stub results, so both sides of the future GPU-vs-oracle comparison now have an initial common reporting shape +- a first comparison-pipeline helper now exists to turn oracle-vs-GPU-stub results into shared `CompareStats` / `ExtractDebugMetrics` output, even though the GPU path is still intentionally incomplete +- that comparison helper is now also covered by a dedicated unit test, so even the scaffolding around future GPU-vs-oracle validation is being locked down incrementally +- GPU-side stream-state initialization is now also unit-tested (`Decim`, `BaseTaps`, `PolyphaseTaps`, `ShiftedHistory` capacity), so the new state ownership layer is no longer just trusted by inspection +- the GPU streaming stub now also has a dedicated test proving that it advances 
persistent state while still explicitly failing as a not-yet-implemented production path +- at this point, enough scaffolding exists that the next sensible step is to build the broader validation/test harness in one larger pass before continuing the actual production-path rewrite +- that harness pass has now happened: deterministic IQ/tone fixtures, harness config/state builders, chunked polyphase oracle runners, and additional validation tests now exist, so the next step is back to the actual production-path rewrite +- the first non-stub NEW-samples-only production-like path now exists as `StreamingExtractGPUHostOracle(...)`: it is still host-side, but it executes the new streaming/stateful semantics and therefore serves as a concrete bridge between pure test infrastructure and the eventual real GPU production path +- that host-side production-like path is now directly compared against the CPU oracle in tests and currently matches within tight tolerance, which is an important confidence step before any real CUDA-path replacement +- the canonical new production entry point `StreamingExtractGPU(...)` is now structurally wired so that the host-side production-like implementation can sit behind the same API later, without forcing a premature switch today +- a top-level `cmd/sdrd` production path hook now exists as well (`extractForStreamingProduction` plus `useStreamingProductionPath=false`), so the new architecture is no longer isolated to internal packages only +- the new production path now also emits first-class output/heading telemetry (`rate`, `output_len`, `head_mean_mag`, `head_max_step`) in addition to pure state counters, which will make activation/debugging easier later +- a top-level comparison observation hook now also exists in `cmd/sdrd`, so oracle-vs-production metrics no longer have to remain buried inside internal package helpers +- after the broader monitoring/comparison consolidation pass, the next agreed work mode is to continue in larger clusters 
rather than micro-steps: (1) wire the new production semantics more deeply, (2) isolate the legacy path more sharply, (3) keep preparing the eventual real GPU production path behind the same architecture +- after the first larger cluster, the next explicit target is to complete Cluster B: make the host-oracle bridge sit more naturally behind the new production execution architecture, rather than leaving production-path semantics spread across loosely connected files +- after Cluster B, the remaining GPU rewrite work is now best split into two explicit parts: `C1 = prepare` and `C2 = definitive implementation`, so the project can keep momentum without pretending that the final CUDA/stateful production path is already done +- Cluster B is now effectively complete: CPU oracle runner, host-oracle production-like path, and top-level production comparison all share the same host streaming core, and that common core is directly tested against the polyphase oracle +- Cluster C1 is now also complete: the new GPU production layer has an explicit invocation contract, execution-result contract, state handoff/build/apply stages, and a host-side execution strategy already running behind the same model + +### Current refactor status before C2 + +At this point the project has: +- a corrected streaming/oracle architecture direction +- a shared host-side streaming core used by both the CPU oracle runner and the host-side production-like bridge +- explicit production-path hooks in `cmd/sdrd` +- comparison and monitoring scaffolding above and below the execution layer +- a prepared GPU execution contract (`StreamingGPUInvocation` / `StreamingGPUExecutionResult`) + +What it does **not** have yet: +- a real native CUDA streaming/polyphase execution entry point with history-in/history-out and phase-count in/out semantics +- a real CUDA-backed implementation behind `StreamingExtractGPUExec(...)` +- completed GPU-vs-oracle validation on the final native execution path + +### C2 plan + 
+#### C2-A — native CUDA / bridge entry preparation +Goal: +- introduce the real native entry shape for stateful streaming/polyphase execution + +Status note before starting C2-A: +- C2 is **not** honestly complete yet because the native CUDA side still only exposes the old separate freq-shift/FIR/decimate pieces. +- Therefore C2-A must begin by creating the real native entry shape rather than continuing to stack more Go-only abstractions on top of the old kernels. + +Required outcomes: +- explicit native/CUDA function signature for streaming execution +- bridge bindings for history in/out, phase count in/out, new samples in, outputs out +- Go-side wrapper ready to call the new native path through the prepared invocation/result model + +#### C2-B — definitive execution implementation hookup +Goal: +- put a real native CUDA-backed execution strategy behind `StreamingExtractGPUExec(...)` + +Status note after C2-A: +- the native entry shape now exists in CUDA, the Windows bridge can resolve it, and the Go execution layer can route into a native-prepared strategy. +- what is still missing for C2-B is the actual stateful execution body behind that new native entrypoint. +- therefore C2-B now means exactly one serious thing: replace the current placeholder body of the new native entrypoint with real stateful streaming/polyphase execution semantics, rather than adding more scaffolding around it. +- C2-B is now materially done: the new native entrypoint no longer returns only placeholder state, and the Go native execution path now uploads inputs/history/taps, runs the new native function, and reads back outputs plus updated state. +- when the new exact-integer streaming decimation rules were turned on, an immediate runtime integration issue appeared: previous WFM extraction defaults expected `outRate=500000`, but the live sample rate was `4096000`, which is not exactly divisible. 
The correct fix is to align streaming defaults with the new integer-decimation model instead of trying to preserve the old rounded ratio behavior.
+- the concrete immediate adjustment made for this was: `wfmStreamOutRate = 512000` (instead of `500000`), because `4096000` is exactly divisible by `512000` (`4096000 / 512000 = 8`), which is consistent with the new streaming architecture’s no-rounding rule.
+
+Required outcomes:
+- `StreamingExtractGPUExec(...)` can execute a real native stateful path
+- host-oracle bridge remains available only as a comparison/support path, not as the disguised production implementation
+- state apply/backflow goes through the already prepared invocation/result contract
+
+#### C2-C — final validation and serious completion gate
+Goal:
+- validate the real CUDA-backed path against the corrected oracle and make the completion criterion explicit
+
+Required outcomes:
+- GPU-vs-oracle comparison active on the real native path
+- test coverage and runtime comparison hooks in place
+- after C2-C, the CUDA story must be treated as complete, correct, and serious — not half-switched or pseudo-finished
+
+#### Why the refactor is intentionally incremental
+The reviewer explicitly required:
+- no start-index-only production patch
+- no continued reliance on overlap+trim as final continuity model
+- no silent decimation rounding
+- no GPU sign-off without a corrected CPU oracle
+
+Because of that, the work is being done in ordered layers:
+1. define streaming types and state
+2. build the CPU oracle with exact streaming semantics
+3. establish shared polyphase/tap semantics
+4. prepare GPU-side persistent state ownership
+5. only then replace the actual production GPU execution path
+
+This means the repo now contains partially completed new architecture pieces that are deliberate stepping stones, not abandoned half-fixes. 
+ +### Reviewer package artifacts created for second-opinion review + +To support external/secondary review of the GPU extractor path, a focused reviewer package was created in the project root: +- `reviewer-gpu-extractor-package/` +- `reviewer-gpu-extractor-package.zip` +- `reviewer-gpu-extractor-package.json` + +The package intentionally contains: +- relevant GPU extractor / kernel code +- surrounding host-path code needed for context +- current debug notes +- a reviewer brief +- a short reviewer prompt +- relevant config files used during live telemetry work + +The JSON variant is uncompressed and stores all included package files as a single JSON document with repeated entries of: +- `path` +- `content` + +This was created specifically so the same reviewer payload can be consumed by tools or APIs that prefer a single structured text file instead of a ZIP archive. + +--- + +## Final resolution — 2026-03-25 + +Status: **SOLVED** + +The final fix set that resolved the audible clicks consisted of **three root-cause fixes** and **two secondary fixes**: + +### Root causes fixed + +1. **IQBalance in-place corruption of shared `allIQ` tail** + - File: `cmd/sdrd/pipeline_runtime.go` + - The surveillance slice (`survIQ`) was an alias of the tail of `allIQ`. + - `dsp.IQBalance(survIQ)` therefore modified the shared `allIQ` buffer in-place. + - The same `allIQ` buffer was then passed into the streaming extractor, creating a discontinuity where the IQ-balanced tail met unbalanced samples. + - Fix: copy `survIQ` before applying IQBalance so extraction sees an unmodified `allIQ` buffer. + +2. **`StreamingConfigHash` forced full extractor state reset every frame** + - File: `internal/demod/gpudemod/streaming_types.go` + - Floating-point jitter in smoothed center frequency caused `offsetHz` / `bandwidth` hash churn. + - That reset extractor history, NCO phase, and decimation phase every frame. 
+ - Fix: hash only structural parameters (`signalID`, `outRate`, `numTaps`, `sampleRate`). + +3. **Non-WFM exact-decimation failure killed the entire streaming batch** + - File: `cmd/sdrd/streaming_refactor.go` + - Hardcoded `200000` output rate was not an exact divisor of `4096000`, so one non-WFM signal could reject the whole batch and silently force fallback to legacy extraction. + - Fix: use nearest exact integer-divisor output rate and keep fallthrough logging visible. + +### Secondary issues fixed + +1. **FM discriminator block-boundary gap** + - File: `internal/recorder/streamer.go` + - The cross-boundary phase step between consecutive IQ blocks was missing. + - Fix: carry the last IQ sample into the next discriminator block. + +2. **Missing 15 kHz lowpass on WFM mono/plain paths** + - File: `internal/recorder/streamer.go` + - Mono fallback / plain WFM paths sent raw discriminator output (pilot/subcarrier/RDS energy) directly into the resampler. + - Fix: add a stateful 15 kHz LPF before resampling on those paths. + +### Final verification summary + +- Before major fixes: + - persistent loud clicking on all signals/modes + - `intra_click_rate` about `110/sec` + - extractor/audio boundary telemetry showed large discontinuities +- After config-hash fix: + - hard clicks disappeared + - large discontinuities dropped sharply + - fine click noise still remained +- After the final `IQBalance` aliasing fix: + - operator listening test confirmed clicks were eliminated + +### Files involved in the final fix set + +- `cmd/sdrd/helpers.go` +- `cmd/sdrd/streaming_refactor.go` +- `cmd/sdrd/pipeline_runtime.go` +- `internal/demod/gpudemod/streaming_types.go` +- `internal/demod/gpudemod/stream_state.go` +- `internal/recorder/streamer.go` + +### Important architectural note + +The CUDA streaming polyphase kernel itself was **not** the root cause. 
+The actual bugs were in the Go-side orchestration around path selection, extractor reset semantics, and mutation of the shared IQ buffer before extraction. + +## Meta note + +This investigation disproved several plausible explanations before landing the final answer. +That mattered, because the eventual root cause was not a single simple DSP bug but a combination of path fallthrough, state-reset churn, and shared-buffer mutation. diff --git a/docs/gpu-streaming-refactor-plan-2026-03-25.md b/docs/gpu-streaming-refactor-plan-2026-03-25.md new file mode 100644 index 0000000..a381078 --- /dev/null +++ b/docs/gpu-streaming-refactor-plan-2026-03-25.md @@ -0,0 +1,48 @@ +# GPU Streaming Refactor Plan (2026-03-25) + +## Goal +Replace the current overlap+trim GPU extractor model with a true stateful per-signal streaming architecture, and build a corrected CPU oracle/reference path for validation. + +## Non-negotiables +- No production start-index-only patch. +- No production overlap-prepend + trim continuity model. +- Exact integer decimation only in the new streaming production path. +- Persistent per-signal state must include NCO phase, FIR history, and decimator phase/residue. +- GPU validation must compare against a corrected CPU oracle, not the legacy CPU fallback. + +## Work order +1. Introduce explicit stateful streaming types in `gpudemod`. +2. Add a clean CPU oracle implementation and monolithic-vs-chunked tests. +3. Add per-signal state ownership in batch runner. +4. Implement new streaming extractor semantics in Go using NEW IQ samples only. +5. Replace legacy GPU-path assumptions (rounding decimation, overlap-prepend, trim-defined validity) in the new path. +6. Add production telemetry that proves state continuity (`phase_count`, `history_len`, `n_out`, reference error). +7. Keep legacy path isolated only for temporary comparison if needed. 
+ +## Initial files in scope +- `internal/demod/gpudemod/batch.go` +- `internal/demod/gpudemod/batch_runner.go` +- `internal/demod/gpudemod/batch_runner_windows.go` +- `internal/demod/gpudemod/kernels.cu` +- `internal/demod/gpudemod/native/exports.cu` +- `cmd/sdrd/helpers.go` + +## Immediate implementation strategy +### Phase 1 +- Create explicit streaming state structs in Go. +- Add CPU oracle/reference path with exact semantics and tests. +- Introduce exact integer-decimation checks. + +### Phase 2 +- Rework batch runner to own persistent per-signal state. +- Add config-hash-based resets. +- Stop modeling continuity via overlap tail in the new path. + +### Phase 3 +- Introduce a real streaming GPU entry path that consumes NEW shifted samples plus carried state. +- Move to a stateful polyphase decimator model. + +## Validation expectations +- CPU oracle monolithic == CPU oracle chunked within tolerance. +- GPU streaming output == CPU oracle chunked within tolerance. +- Former periodic block-boundary clicks gone in real-world testing. diff --git a/docs/known-issues.md b/docs/known-issues.md new file mode 100644 index 0000000..02860ac --- /dev/null +++ b/docs/known-issues.md @@ -0,0 +1,196 @@ +# Known Issues + +This file tracks durable open engineering issues that remain after the 2026-03-25 audio-click fix. + +Primary source: +- `docs/open-issues-report-2026-03-25.json` + +Status values used here: +- `open` +- `deferred` +- `info` + +--- + +## High Priority + +### OI-02 — `lastDiscrimIQ` missing from `dspStateSnapshot` +- Status: `open` +- Severity: High +- Category: state-continuity +- File: `internal/recorder/streamer.go` +- Summary: FM discriminator bridging state is not preserved across `captureDSPState()` / `restoreDSPState()`, so recording segment splits can lose the final IQ sample and create a micro-click at the segment boundary. +- Recommended fix: add `lastDiscrimIQ` and `lastDiscrimIQSet` to `dspStateSnapshot`. 
+- Source: `docs/open-issues-report-2026-03-25.json` (OI-02) + +### OI-03 — CPU oracle path not yet usable as validation baseline +- Status: `open` +- Severity: High +- Category: architecture +- File: `cmd/sdrd/streaming_refactor.go`, `internal/demod/gpudemod/cpu_oracle.go` +- Summary: the CPU oracle exists, but the production comparison/integration path is not trusted yet. That means GPU-path regressions still cannot be checked automatically with confidence. +- Recommended fix: repair oracle integration and restore GPU-vs-CPU validation flow. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-03) + +### OI-18 — planned C2-C validation gate never completed +- Status: `open` +- Severity: Info +- Category: architecture +- File: `docs/audio-click-debug-notes-2026-03-24.md` +- Summary: the final native streaming path works in practice, but the planned formal GPU-vs-oracle validation gate was never completed. +- Recommended fix: complete this together with OI-03. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-18) + +--- + +## Medium Priority + +### OI-14 — no regression test for `allIQ` immutability through spectrum/detection pipeline +- Status: `open` +- Severity: Low +- Category: test-coverage +- File: `cmd/sdrd/pipeline_runtime.go` +- Summary: the `IQBalance` aliasing bug showed that shared-buffer mutation can slip in undetected. There is still no test asserting that `allIQ` remains unchanged after capture/detection-side processing. +- Recommended fix: add an integration test that compares `allIQ` before and after the relevant pipeline stage. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-14) + +### OI-15 — very low test coverage for `processSnippet` audio pipeline +- Status: `open` +- Severity: Low +- Category: test-coverage +- File: `internal/recorder/streamer.go` +- Summary: the main live audio pipeline still lacks focused tests for boundary continuity, WFM mono/stereo behavior, resampling, and demod-path regressions. 
+- Recommended fix: add synthetic fixtures and continuity-oriented tests around repeated `processSnippet` calls. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-15) + +### OI-07 — taps are recalculated every frame +- Status: `open` +- Severity: Medium +- Category: correctness +- File: `internal/demod/gpudemod/stream_state.go` +- Summary: FIR/polyphase taps are recomputed every frame even when parameters do not change, which is unnecessary work and makes it easier for host/GPU tap state to drift apart. +- Recommended fix: only rebuild taps when tap-relevant inputs actually change. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-07) + +### OI-17 — bandwidth changes can change Go-side taps without GPU tap re-upload +- Status: `open` +- Severity: Low-Medium +- Category: correctness +- File: `internal/demod/gpudemod/streaming_gpu_native_prepare.go`, `internal/demod/gpudemod/stream_state.go` +- Summary: after the config-hash fix, a bandwidth change may rebuild taps on the Go side while the GPU still keeps older uploaded taps unless a reset happens. +- Recommended fix: add a separate tap-change detection/re-upload path without forcing full extractor reset. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-17) + +### OI-09 — streaming feature flags are compile-time constants +- Status: `open` +- Severity: Medium +- Category: architecture +- File: `cmd/sdrd/streaming_refactor.go`, `internal/demod/gpudemod/streaming_gpu_modes.go` +- Summary: switching between production/oracle/native-host modes still requires code changes and rebuilds, which makes field debugging and A/B validation harder than necessary. +- Recommended fix: expose these as config or environment-driven switches. 
+- Source: `docs/open-issues-report-2026-03-25.json` (OI-09) + +### OI-05 — feed channel is shallow and can drop frames under pressure +- Status: `open` +- Severity: Medium +- Category: reliability +- File: `internal/recorder/streamer.go` +- Summary: `feedCh` has a buffer of only 2. Under heavier processing or debug load, dropped feed messages can create audible gaps. +- Recommended fix: increase channel depth or redesign backpressure behavior. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-05) + +### OI-06 — legacy overlap/trim extractor path is now mostly legacy baggage +- Status: `deferred` +- Severity: Medium +- Category: dead-code +- File: `cmd/sdrd/helpers.go` +- Summary: the old overlap/trim path is now mainly fallback/legacy code and adds complexity plus old instrumentation noise. +- Recommended fix: isolate, simplify, or remove it once the production path and fallback strategy are formally settled. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-06) + +### OI-04 — telemetry history storage still uses append+copy trim +- Status: `deferred` +- Severity: Medium +- Category: telemetry +- File: `internal/telemetry/telemetry.go` +- Summary: heavy telemetry can still create avoidable allocation/copy pressure because history trimming is O(n) and happens under lock. +- Recommended fix: replace with a ring-buffer design. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-04) + +--- + +## Lower Priority / Nice-to-Have + +### OI-01 — `DCBlocker.Apply(allIQ)` still mutates extraction input in-place +- Status: `deferred` +- Severity: High +- Category: data-integrity +- File: `cmd/sdrd/pipeline_runtime.go` +- Summary: unlike the old `IQBalance` bug this does not create a boundary artifact, but it does mean live extraction and recorded/replayed data are not semantically identical. +- Recommended fix: clarify the contract or move to immutable/copy-based handling. 
+- Source: `docs/open-issues-report-2026-03-25.json` (OI-01) + +### OI-08 — WFM audio LPF could reject pilot more strongly +- Status: `deferred` +- Severity: Medium +- Category: audio-quality +- File: `internal/recorder/streamer.go` +- Summary: the current 15 kHz LPF is good enough functionally, but a steeper filter could further improve pilot suppression. +- Recommended fix: more taps or a dedicated pilot notch. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-08) + +### OI-10 — `demod.wav` debug dumps can clip and mislead analysis +- Status: `deferred` +- Severity: Medium +- Category: correctness +- File: `internal/recorder/streamer.go`, `internal/recorder/wavwriter.go` +- Summary: raw discriminator output can exceed the WAV writer's `[-1,+1]` clip range, so debug dumps can show artifacts that are not part of the real downstream audio path. +- Recommended fix: scale by `1/pi` before dumping or use float WAV output. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-10) + +### OI-11 — browser AudioContext resync still causes audible micro-gaps +- Status: `deferred` +- Severity: Low +- Category: reliability +- File: `web/app.js` +- Summary: underrun recovery is softened with a fade-in, but repeated resyncs still create audible stutter on the browser side. +- Recommended fix: prefer the AudioWorklet/ring-player path wherever possible. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-11) + +### OI-12 — tiny per-frame tail copy for boundary telemetry +- Status: `info` +- Severity: Low +- Category: performance +- File: `cmd/sdrd/pipeline_runtime.go` +- Summary: the last-32-sample copy is trivial and not urgent, but it is one more small allocation in a path that already has several. +- Recommended fix: none needed unless a broader allocation cleanup happens. 
+- Source: `docs/open-issues-report-2026-03-25.json` (OI-12) + +### OI-13 — temporary patch artifacts should not live in the repo long-term +- Status: `deferred` +- Severity: Low +- Category: dead-code +- File: `patches/*` +- Summary: reviewer/debug patch artifacts were useful during the investigation, but they should either be removed or archived under docs rather than kept as loose patch files. +- Recommended fix: delete or archive them once no longer needed. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-13) + +### OI-16 — `config.autosave.yaml` can re-enable unwanted debug telemetry after restart +- Status: `deferred` +- Severity: Low +- Category: config +- File: `config.autosave.yaml` +- Summary: autosave can silently restore debug-heavy telemetry settings after restart and distort future runs. +- Recommended fix: stop persisting debug telemetry knobs to autosave or explicitly ignore them. +- Source: `docs/open-issues-report-2026-03-25.json` (OI-16) + +--- + +## Suggested next execution order + +1. Fix OI-02 (`lastDiscrimIQ` snapshot/restore) +2. Repair OI-03 and close OI-18 (oracle + formal validation path) +3. Add OI-14 and OI-15 regression tests +4. Consolidate OI-07 and OI-17 (tap rebuild / tap upload logic) +5. Expose OI-09 feature flags via config or env +6. Revisit OI-05 / OI-06 / OI-04 when doing reliability/cleanup work diff --git a/docs/telemetry-api.md b/docs/telemetry-api.md new file mode 100644 index 0000000..9ac8672 --- /dev/null +++ b/docs/telemetry-api.md @@ -0,0 +1,711 @@ +# Telemetry API Reference + +This document describes the server-side telemetry collector, its runtime configuration, and the HTTP API exposed by `sdrd`. + +The telemetry system is intended for debugging and performance analysis of the SDR pipeline, especially around source cadence, extraction, DSP timing, boundary artifacts, queue pressure, and other runtime anomalies. + +## Goals + +The telemetry layer gives you three different views of runtime state: + +1. 
**Live snapshot**
+   - Current counters, gauges, distributions, recent events, and collector status.
+2. **Historical metrics**
+   - Timestamped metric samples that can be filtered by name, prefix, or tags.
+3. **Historical events**
+   - Structured anomalies / warnings / debug events with optional fields.
+
+It is designed to be lightweight in normal operation and more detailed when `heavy_enabled` is turned on.
+
+---
+
+## Base URLs
+
+All telemetry endpoints live under:
+
+- `/api/debug/telemetry/live`
+- `/api/debug/telemetry/history`
+- `/api/debug/telemetry/events`
+- `/api/debug/telemetry/config`
+
+Responses are JSON.
+
+---
+
+## Data model
+
+### Metric types
+
+Telemetry metrics are stored in three logical groups:
+
+- **counter**
+  - Accumulating values, usually incremented over time.
+- **gauge**
+  - Latest current value.
+- **distribution**
+  - Observed numeric samples with summary stats.
+
+A historical metric sample is returned as:
+
+```json
+{
+  "ts": "2026-03-25T12:00:00Z",
+  "name": "stage.extract_stream.duration_ms",
+  "type": "distribution",
+  "value": 4.83,
+  "tags": {
+    "stage": "extract_stream",
+    "signal_id": "1"
+  }
+}
+```
+
+### Events
+
+Telemetry events are structured anomaly/debug records:
+
+```json
+{
+  "id": 123,
+  "ts": "2026-03-25T12:00:02Z",
+  "name": "demod_boundary",
+  "level": "warn",
+  "message": "boundary discontinuity detected",
+  "tags": {
+    "signal_id": "1",
+    "stage": "demod"
+  },
+  "fields": {
+    "d2": 0.3358,
+    "index": 25
+  }
+}
+```
+
+### Tags
+
+Tags are string key/value metadata used for filtering and correlation.
+
+Common tag keys already supported by the HTTP layer:
+
+- `signal_id`
+- `session_id`
+- `stage`
+- `trace_id`
+- `component`
+
+You can also filter on arbitrary tags via `tag_<key>=<value>` query parameters.
+
+---
+
+## Endpoint: `GET /api/debug/telemetry/live`
+
+Returns a live snapshot of the in-memory collector state. 
+ +### Response shape + +```json +{ + "now": "2026-03-25T12:00:05Z", + "started_at": "2026-03-25T11:52:10Z", + "uptime_ms": 472500, + "config": { + "enabled": true, + "heavy_enabled": false, + "heavy_sample_every": 12, + "metric_sample_every": 2, + "metric_history_max": 12000, + "event_history_max": 4000, + "retention": 900000000000, + "persist_enabled": false, + "persist_dir": "debug/telemetry", + "rotate_mb": 16, + "keep_files": 8 + }, + "counters": [ + { + "name": "source.resets", + "value": 1, + "tags": { + "component": "source" + } + } + ], + "gauges": [ + { + "name": "source.buffer_samples", + "value": 304128, + "tags": { + "component": "source" + } + } + ], + "distributions": [ + { + "name": "dsp.frame.duration_ms", + "count": 96, + "min": 82.5, + "max": 212.4, + "mean": 104.8, + "last": 98.3, + "p95": 149.2, + "tags": { + "stage": "dsp" + } + } + ], + "recent_events": [], + "status": { + "source_state": "running" + } +} +``` + +### Notes + +- `counters`, `gauges`, and `distributions` are sorted by metric name. +- `recent_events` contains the most recent in-memory event slice. +- `status` is optional and contains arbitrary runtime status published by code using `SetStatus(...)`. +- If telemetry is unavailable, the server returns a small JSON object instead of a full snapshot. + +### Typical uses + +- Check whether telemetry is enabled. +- Look for timing hotspots in `*.duration_ms` distributions. +- Inspect current queue or source gauges. +- See recent anomaly events without querying history. + +--- + +## Endpoint: `GET /api/debug/telemetry/history` + +Returns historical metric samples from in-memory history and, optionally, persisted JSONL files. 
+
+### Response shape
+
+```json
+{
+  "items": [
+    {
+      "ts": "2026-03-25T12:00:01Z",
+      "name": "stage.extract_stream.duration_ms",
+      "type": "distribution",
+      "value": 5.2,
+      "tags": {
+        "stage": "extract_stream",
+        "signal_id": "2"
+      }
+    }
+  ],
+  "count": 1
+}
+```
+
+### Supported query parameters
+
+#### Time filters
+
+- `since`
+- `until`
+
+Accepted formats:
+
+- Unix seconds
+- Unix milliseconds
+- RFC3339
+- RFC3339Nano
+
+Examples:
+
+- `?since=1711368000`
+- `?since=1711368000123`
+- `?since=2026-03-25T12:00:00Z`
+
+#### Result shaping
+
+- `limit`
+  - Default normalization is 500.
+  - Values above 5000 are clamped down by the collector query layer.
+
+#### Name filters
+
+- `name=<exact-metric-name>`
+- `prefix=<metric-name-prefix>`
+
+Examples:
+
+- `?name=source.read.duration_ms`
+- `?prefix=stage.`
+- `?prefix=iq.extract.`
+
+#### Tag filters
+
+Special convenience query params map directly to tag filters:
+
+- `signal_id`
+- `session_id`
+- `stage`
+- `trace_id`
+- `component`
+
+Arbitrary tag filters:
+
+- `tag_<key>=<value>`
+
+Examples:
+
+- `?signal_id=1`
+- `?stage=extract_stream`
+- `?tag_path=gpu`
+- `?tag_zone=broadcast`
+
+#### Persistence control
+
+- `include_persisted=true|false`
+  - Default: `true`
+
+When enabled and persistence is active, the server reads matching data from rotated JSONL telemetry files in addition to in-memory history.
+
+### Notes
+
+- Results are sorted by timestamp ascending.
+- If `limit` is hit, the most recent matching items are retained.
+- Exact retention depends on both in-memory retention and persisted file availability.
+- A small set of boundary-related IQ metrics is force-stored regardless of the normal metric sample cadence.
+
+### Typical queries
+
+Get all stage timing since a specific start:
+
+```text
+/api/debug/telemetry/history?since=2026-03-25T12:00:00Z&prefix=stage. 
+```
+
+Get extraction metrics for a single signal:
+
+```text
+/api/debug/telemetry/history?since=2026-03-25T12:00:00Z&prefix=extract.&signal_id=2
+```
+
+Get source cadence metrics only from in-memory history:
+
+```text
+/api/debug/telemetry/history?prefix=source.&include_persisted=false
+```
+
+---
+
+## Endpoint: `GET /api/debug/telemetry/events`
+
+Returns historical telemetry events from memory and, optionally, persisted storage.
+
+### Response shape
+
+```json
+{
+  "items": [
+    {
+      "id": 991,
+      "ts": "2026-03-25T12:00:03Z",
+      "name": "source_reset",
+      "level": "warn",
+      "message": "source reader reset observed",
+      "tags": {
+        "component": "source"
+      },
+      "fields": {
+        "reason": "short_read"
+      }
+    }
+  ],
+  "count": 1
+}
+```
+
+### Supported query parameters
+
+All `history` filters are also supported here, plus:
+
+- `level=<level>`
+
+Examples:
+
+- `?since=2026-03-25T12:00:00Z&level=warn`
+- `?prefix=audio.&signal_id=1`
+- `?name=demod_boundary&signal_id=1`
+
+### Notes
+
+- Event matching supports `name`, `prefix`, `level`, time range, and tags.
+- Event `level` matching is case-insensitive.
+- Results are timestamp-sorted ascending.
+
+### Typical queries
+
+Get warnings during a reproduction run:
+
+```text
+/api/debug/telemetry/events?since=2026-03-25T12:00:00Z&level=warn
+```
+
+Get boundary-related events for one signal:
+
+```text
+/api/debug/telemetry/events?since=2026-03-25T12:00:00Z&signal_id=1&prefix=demod_
+```
+
+---
+
+## Endpoint: `GET /api/debug/telemetry/config`
+
+Returns both:
+
+1. the active collector configuration, and
+2. 
the current runtime config under `debug.telemetry` + +### Response shape + +```json +{ + "collector": { + "enabled": true, + "heavy_enabled": false, + "heavy_sample_every": 12, + "metric_sample_every": 2, + "metric_history_max": 12000, + "event_history_max": 4000, + "retention": 900000000000, + "persist_enabled": false, + "persist_dir": "debug/telemetry", + "rotate_mb": 16, + "keep_files": 8 + }, + "config": { + "enabled": true, + "heavy_enabled": false, + "heavy_sample_every": 12, + "metric_sample_every": 2, + "metric_history_max": 12000, + "event_history_max": 4000, + "retention_seconds": 900, + "persist_enabled": false, + "persist_dir": "debug/telemetry", + "rotate_mb": 16, + "keep_files": 8 + } +} +``` + +### Important distinction + +- `collector.retention` is a Go duration serialized in nanoseconds. +- `config.retention_seconds` is the config-facing field used by YAML and the POST update API. + +If you are writing tooling, prefer `config.retention_seconds` for human-facing config edits. + +--- + +## Endpoint: `POST /api/debug/telemetry/config` + +Updates telemetry settings at runtime and writes them back via the autosave config path. + +### Request body + +All fields are optional. Only provided fields are changed. 
+ +```json +{ + "enabled": true, + "heavy_enabled": true, + "heavy_sample_every": 8, + "metric_sample_every": 1, + "metric_history_max": 20000, + "event_history_max": 6000, + "retention_seconds": 1800, + "persist_enabled": true, + "persist_dir": "debug/telemetry", + "rotate_mb": 32, + "keep_files": 12 +} +``` + +### Response shape + +```json +{ + "ok": true, + "collector": { + "enabled": true, + "heavy_enabled": true, + "heavy_sample_every": 8, + "metric_sample_every": 1, + "metric_history_max": 20000, + "event_history_max": 6000, + "retention": 1800000000000, + "persist_enabled": true, + "persist_dir": "debug/telemetry", + "rotate_mb": 32, + "keep_files": 12 + }, + "config": { + "enabled": true, + "heavy_enabled": true, + "heavy_sample_every": 8, + "metric_sample_every": 1, + "metric_history_max": 20000, + "event_history_max": 6000, + "retention_seconds": 1800, + "persist_enabled": true, + "persist_dir": "debug/telemetry", + "rotate_mb": 32, + "keep_files": 12 + } +} +``` + +### Persistence behavior + +A POST updates: + +- the runtime manager snapshot/config +- the in-process collector config +- the autosave config file via `config.Save(...)` + +That means these updates are runtime-effective immediately and also survive restarts through autosave, unless manually reverted. + +### Error cases + +- Invalid JSON -> `400 Bad Request` +- Invalid collector reconfiguration -> `400 Bad Request` +- Telemetry unavailable -> `503 Service Unavailable` + +--- + +## Configuration fields (`debug.telemetry`) + +Telemetry config lives under: + +```yaml +debug: + telemetry: + enabled: true + heavy_enabled: false + heavy_sample_every: 12 + metric_sample_every: 2 + metric_history_max: 12000 + event_history_max: 4000 + retention_seconds: 900 + persist_enabled: false + persist_dir: debug/telemetry + rotate_mb: 16 + keep_files: 8 +``` + +### Field reference + +#### `enabled` +Master on/off switch for telemetry collection. 
+ +If false: +- metrics are not recorded +- events are not recorded +- live snapshot remains effectively empty/minimal + +#### `heavy_enabled` +Enables more expensive / more detailed telemetry paths that should not be left on permanently unless needed. + +Use this for deep extractor/IQ/boundary debugging. + +#### `heavy_sample_every` +Sampling cadence for heavy telemetry. + +- `1` means every eligible heavy sample +- higher numbers reduce cost by sampling less often + +#### `metric_sample_every` +Sampling cadence for normal historical metric point storage. + +Collector summaries still update live, but historical storage becomes less dense when this value is greater than 1. + +#### `metric_history_max` +Maximum number of in-memory historical metric samples retained. + +#### `event_history_max` +Maximum number of in-memory telemetry events retained. + +#### `retention_seconds` +Time-based in-memory retention window. + +Older in-memory metrics/events are trimmed once they fall outside this retention period. + +#### `persist_enabled` +When enabled, telemetry metrics/events are also appended to rotated JSONL files. + +#### `persist_dir` +Directory where rotated telemetry JSONL files are written. + +Default: + +- `debug/telemetry` + +#### `rotate_mb` +Approximate JSONL file rotation threshold in megabytes. + +#### `keep_files` +How many rotated telemetry files to retain in `persist_dir`. + +Older files beyond this count are pruned. + +--- + +## Collector behavior and caveats + +### In-memory vs persisted data + +The query endpoints can read from both: + +- current in-memory collector state/history +- persisted JSONL files + +This means a request may return data older than current in-memory retention if: + +- `persist_enabled=true`, and +- `include_persisted=true` + +### Sampling behavior + +Not every observation necessarily becomes a historical metric point. 
+ +The collector: + +- always updates live counters/gauges/distributions while enabled +- stores historical points according to `metric_sample_every` +- force-stores selected boundary IQ metrics even when sampling would normally skip them + +So the live snapshot and historical series density are intentionally different. + +### Distribution summaries + +Distribution values in the live snapshot include: + +- `count` +- `min` +- `max` +- `mean` +- `last` +- `p95` + +The p95 estimate is based on the collector's bounded rolling sample buffer, not an unbounded full-history quantile computation. + +### Config serialization detail + +The collector's `retention` field is a Go duration. In JSON this appears as an integer nanosecond count. + +This is expected. + +--- + +## Recommended workflows + +### Fast low-overhead runtime watch + +Use: + +- `enabled=true` +- `heavy_enabled=false` +- `persist_enabled=false` or `true` if you want an archive + +Then query: + +- `/api/debug/telemetry/live` +- `/api/debug/telemetry/history?prefix=stage.` +- `/api/debug/telemetry/events?level=warn` + +### 5-10 minute anomaly capture + +Suggested settings: + +- `enabled=true` +- `heavy_enabled=false` +- `persist_enabled=true` +- moderate `metric_sample_every` + +Then: + +1. note start time +2. reproduce workload +3. fetch live snapshot +4. inspect warning events +5. inspect `stage.*`, `streamer.*`, and `source.*` history + +### Deep extractor / boundary investigation + +Temporarily enable: + +- `heavy_enabled=true` +- `heavy_sample_every` > 1 unless you really need every sample +- `persist_enabled=true` + +Then inspect: + +- `iq.*` +- `extract.*` +- `audio.*` +- boundary/anomaly events for specific `signal_id` or `session_id` + +Turn heavy telemetry back off once done. 
+ +--- + +## Example requests + +### Fetch live snapshot + +```bash +curl http://localhost:8080/api/debug/telemetry/live +``` + +### Fetch stage timings from the last 10 minutes + +```bash +curl "http://localhost:8080/api/debug/telemetry/history?since=2026-03-25T12:00:00Z&prefix=stage." +``` + +### Fetch source metrics for one signal + +```bash +curl "http://localhost:8080/api/debug/telemetry/history?prefix=source.&signal_id=1" +``` + +### Fetch warning events only + +```bash +curl "http://localhost:8080/api/debug/telemetry/events?since=2026-03-25T12:00:00Z&level=warn" +``` + +### Fetch events with a custom tag filter + +```bash +curl "http://localhost:8080/api/debug/telemetry/events?tag_zone=broadcast" +``` + +### Enable persistence and heavy telemetry temporarily + +```bash +curl -X POST http://localhost:8080/api/debug/telemetry/config \ + -H "Content-Type: application/json" \ + -d '{ + "heavy_enabled": true, + "heavy_sample_every": 8, + "persist_enabled": true + }' +``` + +--- + +## Related docs + +- `README.md` - high-level project overview and endpoint summary +- `docs/telemetry-debug-runbook.md` - quick operational runbook for short debug sessions +- `internal/telemetry/telemetry.go` - collector implementation details +- `cmd/sdrd/http_handlers.go` - HTTP wiring for telemetry endpoints diff --git a/docs/telemetry-debug-runbook.md b/docs/telemetry-debug-runbook.md new file mode 100644 index 0000000..4b14c87 --- /dev/null +++ b/docs/telemetry-debug-runbook.md @@ -0,0 +1,100 @@ +# Debug Telemetry Runbook + +This project now includes structured server-side telemetry for the audio/DSP pipeline. + +## Endpoints + +- `GET /api/debug/telemetry/live` + - Current counters/gauges/distributions and recent events. +- `GET /api/debug/telemetry/history` + - Historical metric samples. 
+  - Query params:
+    - `since`, `until`: unix seconds/ms or RFC3339
+    - `limit`
+    - `name`, `prefix`
+    - `signal_id`, `session_id`, `stage`, `trace_id`, `component`
+    - `tag_<key>=<value>` for arbitrary tag filters
+    - `include_persisted=true|false`
+- `GET /api/debug/telemetry/events`
+  - Historical events/anomalies.
+  - Same filters as history plus `level`.
+- `GET /api/debug/telemetry/config`
+  - Active telemetry config from runtime + collector.
+- `POST /api/debug/telemetry/config`
+  - Runtime config update (also saved to autosave config).
+
+## Config knobs
+
+`debug.telemetry` in config:
+
+- `enabled`
+- `heavy_enabled`
+- `heavy_sample_every`
+- `metric_sample_every`
+- `metric_history_max`
+- `event_history_max`
+- `retention_seconds`
+- `persist_enabled`
+- `persist_dir`
+- `rotate_mb`
+- `keep_files`
+
+Persisted JSONL files rotate in `persist_dir` (default: `debug/telemetry`).
+
+## 5-10 minute debug flow
+
+1. Keep `enabled=true`, `heavy_enabled=false`, `persist_enabled=true`.
+2. Run workload for 5-10 minutes.
+3. Pull live state:
+   - `GET /api/debug/telemetry/live`
+4. Pull anomalies:
+   - `GET /api/debug/telemetry/events?since=<start>&level=warn`
+5. Pull pipeline timing and queue/backpressure:
+   - `GET /api/debug/telemetry/history?since=<start>&prefix=stage.`
+   - `GET /api/debug/telemetry/history?since=<start>&prefix=streamer.`
+6. If IQ boundary issues persist, temporarily set `heavy_enabled=true` (keep sampling coarse with `heavy_sample_every` > 1), rerun, then inspect `iq.*` metrics and `audio.*` anomalies by `signal_id`/`session_id`.
+
+## 2026-03-25 audio click incident — final resolved summary
+
+Status: **SOLVED**
+
+The March 2026 live-audio click investigation ultimately converged on a combination of three real root causes plus two secondary fixes:
+
+### Root causes
+
+1. 
**Shared `allIQ` corruption by `IQBalance` aliasing** + - `cmd/sdrd/pipeline_runtime.go` + - `survIQ` aliased the tail of `allIQ` + - `dsp.IQBalance(survIQ)` modified `allIQ` in-place + - extractor then saw a corrupted boundary inside the shared buffer + - final fix: copy `survIQ` before `IQBalance` + +2. **Per-frame extractor reset due to `StreamingConfigHash` jitter** + - `internal/demod/gpudemod/streaming_types.go` + - smoothed tuning values changed slightly every frame + - offset/bandwidth in the hash caused repeated state resets + - final fix: hash only structural parameters + +3. **Streaming path batch rejection for non-WFM exact-decimation mismatch** + - `cmd/sdrd/streaming_refactor.go` + - one non-WFM signal could reject the whole batch and silently force fallback to the legacy path + - final fix: choose nearest exact integer-divisor output rate and keep fallback logging visible + +### Secondary fixes + +- FM discriminator cross-block carry in `internal/recorder/streamer.go` +- WFM mono/plain-path 15 kHz audio lowpass in `internal/recorder/streamer.go` + +### Verification notes + +- major discontinuities dropped sharply after the config-hash fix +- remaining fine clicks were eliminated only after the `IQBalance` aliasing fix in `pipeline_runtime.go` +- final confirmation was by operator listening test, backed by prior telemetry and WAV analysis + +### Practical lesson + +When the same captured `allIQ` buffer feeds both: +- surveillance/detail analysis +- and extraction/streaming + +then surveillance-side DSP helpers must not mutate a shared sub-slice in-place unless that mutation is intentionally part of the extraction contract. 
diff --git a/internal/config/config.go b/internal/config/config.go index 1cd1fb7..66f0c9a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -96,6 +96,26 @@ type DecoderConfig struct { PSKCmd string `yaml:"psk_cmd" json:"psk_cmd"` } +type DebugConfig struct { + AudioDumpEnabled bool `yaml:"audio_dump_enabled" json:"audio_dump_enabled"` + CPUMonitoring bool `yaml:"cpu_monitoring" json:"cpu_monitoring"` + Telemetry TelemetryConfig `yaml:"telemetry" json:"telemetry"` +} + +type TelemetryConfig struct { + Enabled bool `yaml:"enabled" json:"enabled"` + HeavyEnabled bool `yaml:"heavy_enabled" json:"heavy_enabled"` + HeavySampleEvery int `yaml:"heavy_sample_every" json:"heavy_sample_every"` + MetricSampleEvery int `yaml:"metric_sample_every" json:"metric_sample_every"` + MetricHistoryMax int `yaml:"metric_history_max" json:"metric_history_max"` + EventHistoryMax int `yaml:"event_history_max" json:"event_history_max"` + RetentionSeconds int `yaml:"retention_seconds" json:"retention_seconds"` + PersistEnabled bool `yaml:"persist_enabled" json:"persist_enabled"` + PersistDir string `yaml:"persist_dir" json:"persist_dir"` + RotateMB int `yaml:"rotate_mb" json:"rotate_mb"` + KeepFiles int `yaml:"keep_files" json:"keep_files"` +} + type PipelineGoalConfig struct { Intent string `yaml:"intent" json:"intent"` MonitorStartHz float64 `yaml:"monitor_start_hz" json:"monitor_start_hz"` @@ -169,6 +189,7 @@ type Config struct { Detector DetectorConfig `yaml:"detector" json:"detector"` Recorder RecorderConfig `yaml:"recorder" json:"recorder"` Decoder DecoderConfig `yaml:"decoder" json:"decoder"` + Debug DebugConfig `yaml:"debug" json:"debug"` Logging LogConfig `yaml:"logging" json:"logging"` WebAddr string `yaml:"web_addr" json:"web_addr"` EventPath string `yaml:"event_path" json:"event_path"` @@ -421,6 +442,23 @@ func Default() Config { ExtractionBwMult: 1.2, }, Decoder: DecoderConfig{}, + Debug: DebugConfig{ + AudioDumpEnabled: false, + CPUMonitoring: false, + 
Telemetry: TelemetryConfig{ + Enabled: true, + HeavyEnabled: false, + HeavySampleEvery: 12, + MetricSampleEvery: 2, + MetricHistoryMax: 12000, + EventHistoryMax: 4000, + RetentionSeconds: 900, + PersistEnabled: false, + PersistDir: "debug/telemetry", + RotateMB: 16, + KeepFiles: 8, + }, + }, Logging: LogConfig{ Level: "informal", Categories: []string{}, @@ -664,6 +702,30 @@ func applyDefaults(cfg Config) Config { if cfg.Recorder.ExtractionBwMult <= 0 { cfg.Recorder.ExtractionBwMult = 1.2 } + if cfg.Debug.Telemetry.HeavySampleEvery <= 0 { + cfg.Debug.Telemetry.HeavySampleEvery = 12 + } + if cfg.Debug.Telemetry.MetricSampleEvery <= 0 { + cfg.Debug.Telemetry.MetricSampleEvery = 2 + } + if cfg.Debug.Telemetry.MetricHistoryMax <= 0 { + cfg.Debug.Telemetry.MetricHistoryMax = 12000 + } + if cfg.Debug.Telemetry.EventHistoryMax <= 0 { + cfg.Debug.Telemetry.EventHistoryMax = 4000 + } + if cfg.Debug.Telemetry.RetentionSeconds <= 0 { + cfg.Debug.Telemetry.RetentionSeconds = 900 + } + if cfg.Debug.Telemetry.PersistDir == "" { + cfg.Debug.Telemetry.PersistDir = "debug/telemetry" + } + if cfg.Debug.Telemetry.RotateMB <= 0 { + cfg.Debug.Telemetry.RotateMB = 16 + } + if cfg.Debug.Telemetry.KeepFiles <= 0 { + cfg.Debug.Telemetry.KeepFiles = 8 + } return cfg } diff --git a/internal/demod/fm.go b/internal/demod/fm.go index 6c1d153..7fe05dc 100644 --- a/internal/demod/fm.go +++ b/internal/demod/fm.go @@ -4,6 +4,7 @@ import ( "math" "sdr-wideband-suite/internal/dsp" + "sdr-wideband-suite/internal/logging" ) type NFM struct{} @@ -45,12 +46,45 @@ func fmDiscrim(iq []complex64) []float32 { return nil } out := make([]float32, len(iq)-1) + maxAbs := 0.0 + maxIdx := 0 + largeSteps := 0 + minMag := math.MaxFloat64 + maxMag := 0.0 for i := 1; i < len(iq); i++ { p := iq[i-1] c := iq[i] + pmag := math.Hypot(float64(real(p)), float64(imag(p))) + cmag := math.Hypot(float64(real(c)), float64(imag(c))) + if pmag < minMag { + minMag = pmag + } + if cmag < minMag { + minMag = cmag + } + if pmag > 
maxMag { + maxMag = pmag + } + if cmag > maxMag { + maxMag = cmag + } num := float64(real(p))*float64(imag(c)) - float64(imag(p))*float64(real(c)) den := float64(real(p))*float64(real(c)) + float64(imag(p))*float64(imag(c)) - out[i-1] = float32(math.Atan2(num, den)) + step := math.Atan2(num, den) + if a := math.Abs(step); a > maxAbs { + maxAbs = a + maxIdx = i - 1 + } + if math.Abs(step) > 1.5 { + largeSteps++ + } + out[i-1] = float32(step) + } + if logging.EnabledCategory("discrim") { + logging.Debug("discrim", "fm_meter", "iq_len", len(iq), "audio_len", len(out), "min_mag", minMag, "max_mag", maxMag, "max_abs_step", maxAbs, "max_idx", maxIdx, "large_steps", largeSteps) + if largeSteps > 0 { + logging.Warn("discrim", "fm_large_steps", "iq_len", len(iq), "large_steps", largeSteps, "max_abs_step", maxAbs, "max_idx", maxIdx, "min_mag", minMag, "max_mag", maxMag) + } } return out } diff --git a/internal/demod/gpudemod/README.md b/internal/demod/gpudemod/README.md deleted file mode 100644 index adcd2a6..0000000 --- a/internal/demod/gpudemod/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# gpudemod - -Phase 1 CUDA demod scaffolding. - -## Current state - -- Standard Go builds use `gpudemod_stub.go` (`!cufft`). -- `cufft` builds allocate GPU buffers and cross the CGO/CUDA launch boundary. -- If CUDA launch wrappers are not backed by compiled kernels yet, the code falls back to CPU DSP. -- The shifted IQ path is already wired so a successful GPU freq-shift result can be copied back and reused immediately. -- Build orchestration should now be considered OS-specific; see `docs/build-cuda.md`. - -## First real kernel - -`kernels.cu` contains the first candidate implementation: -- `gpud_freq_shift_kernel` - -This is **not compiled automatically yet** in the current environment because the machine currently lacks a CUDA compiler toolchain in PATH (`nvcc` not found). - -## Next machine-side step - -On a CUDA-capable dev machine with toolchain installed: - -1. 
Compile `kernels.cu` into an object file and archive it into a linkable library - - helper script: `tools/build-gpudemod-kernel.ps1` -2. On Jan's Windows machine, the working kernel-build path currently relies on `nvcc` + MSVC `cl.exe` in PATH -3. Link `gpudemod_kernels.lib` into the `cufft` build -3. Replace `gpud_launch_freq_shift(...)` stub body with the real kernel launch -4. Validate copied-back shifted IQ against `dsp.FreqShift` -5. Only then move the next stage (FM discriminator) onto the GPU - -## Why this is still useful - -The runtime/buffer/recorder/fallback structure is already in place, so once kernel compilation is available, real acceleration can be inserted without another architecture rewrite. diff --git a/internal/demod/gpudemod/batch.go b/internal/demod/gpudemod/batch.go index 6bbf9df..df6af46 100644 --- a/internal/demod/gpudemod/batch.go +++ b/internal/demod/gpudemod/batch.go @@ -6,7 +6,7 @@ type ExtractJob struct { OffsetHz float64 BW float64 OutRate int - PhaseStart float64 // FreqShift starting phase (0 for stateless, carry over for streaming) + PhaseStart float64 // legacy batch phase field; retained only while migrating to streaming extractor semantics } // ExtractResult holds the output of a batch extraction including the ending diff --git a/internal/demod/gpudemod/batch_runner.go b/internal/demod/gpudemod/batch_runner.go index 7441263..3933c1b 100644 --- a/internal/demod/gpudemod/batch_runner.go +++ b/internal/demod/gpudemod/batch_runner.go @@ -10,10 +10,12 @@ type batchSlot struct { } type BatchRunner struct { - eng *Engine - slots []batchSlot - slotBufs []slotBuffers + eng *Engine + slots []batchSlot + slotBufs []slotBuffers slotBufSize int // number of IQ samples the slot buffers were allocated for + streamState map[int64]*ExtractStreamState + nativeState map[int64]*nativeStreamingSignalState } func NewBatchRunner(maxSamples int, sampleRate int) (*BatchRunner, error) { @@ -21,7 +23,11 @@ func NewBatchRunner(maxSamples int, sampleRate 
int) (*BatchRunner, error) { if err != nil { return nil, err } - return &BatchRunner{eng: eng}, nil + return &BatchRunner{ + eng: eng, + streamState: make(map[int64]*ExtractStreamState), + nativeState: make(map[int64]*nativeStreamingSignalState), + }, nil } func (r *BatchRunner) Close() { @@ -29,9 +35,12 @@ func (r *BatchRunner) Close() { return } r.freeSlotBuffers() + r.freeAllNativeStreamingStates() r.eng.Close() r.eng = nil r.slots = nil + r.streamState = nil + r.nativeState = nil } func (r *BatchRunner) prepare(jobs []ExtractJob) { diff --git a/internal/demod/gpudemod/batch_runner_windows.go b/internal/demod/gpudemod/batch_runner_windows.go index c81467c..58836fd 100644 --- a/internal/demod/gpudemod/batch_runner_windows.go +++ b/internal/demod/gpudemod/batch_runner_windows.go @@ -160,9 +160,9 @@ func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job Extrac if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 { return 0, 0, errors.New("taps H2D failed") } - decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate))) - if decim < 1 { - decim = 1 + decim, err := ExactIntegerDecimation(e.sampleRate, job.OutRate) + if err != nil { + return 0, 0, err } nOut := n / decim if nOut <= 0 { diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib deleted file mode 100644 index dccfca0..0000000 Binary files a/internal/demod/gpudemod/build/gpudemod_kernels.lib and /dev/null differ diff --git a/internal/demod/gpudemod/compare.go b/internal/demod/gpudemod/compare.go new file mode 100644 index 0000000..24ba29b --- /dev/null +++ b/internal/demod/gpudemod/compare.go @@ -0,0 +1,47 @@ +package gpudemod + +import "math/cmplx" + +type CompareStats struct { + MaxAbsErr float64 + RMSErr float64 + Count int +} + +func CompareComplexSlices(a []complex64, b []complex64) CompareStats { + n := len(a) + if len(b) < n { + n = len(b) + } + if n == 0 { + return CompareStats{} + } + var 
sumSq float64 + var maxAbs float64 + for i := 0; i < n; i++ { + err := cmplx.Abs(complex128(a[i] - b[i])) + if err > maxAbs { + maxAbs = err + } + sumSq += err * err + } + return CompareStats{ + MaxAbsErr: maxAbs, + RMSErr: mathSqrt(sumSq / float64(n)), + Count: n, + } +} + +func mathSqrt(v float64) float64 { + // tiny shim to keep the compare helper self-contained and easy to move + // without importing additional logic elsewhere + z := v + if z <= 0 { + return 0 + } + x := z + for i := 0; i < 12; i++ { + x = 0.5 * (x + z/x) + } + return x +} diff --git a/internal/demod/gpudemod/compare_gpu.go b/internal/demod/gpudemod/compare_gpu.go new file mode 100644 index 0000000..9232c3c --- /dev/null +++ b/internal/demod/gpudemod/compare_gpu.go @@ -0,0 +1,19 @@ +package gpudemod + +func BuildGPUStubDebugMetrics(res StreamingExtractResult) ExtractDebugMetrics { + return ExtractDebugMetrics{ + SignalID: res.SignalID, + PhaseCount: res.PhaseCount, + HistoryLen: res.HistoryLen, + NOut: res.NOut, + } +} + +func BuildGPUHostOracleDebugMetrics(res StreamingExtractResult) ExtractDebugMetrics { + return ExtractDebugMetrics{ + SignalID: res.SignalID, + PhaseCount: res.PhaseCount, + HistoryLen: res.HistoryLen, + NOut: res.NOut, + } +} diff --git a/internal/demod/gpudemod/compare_oracle.go b/internal/demod/gpudemod/compare_oracle.go new file mode 100644 index 0000000..ccf48e5 --- /dev/null +++ b/internal/demod/gpudemod/compare_oracle.go @@ -0,0 +1,10 @@ +package gpudemod + +func BuildOracleDebugMetrics(res StreamingExtractResult) ExtractDebugMetrics { + return ExtractDebugMetrics{ + SignalID: res.SignalID, + PhaseCount: res.PhaseCount, + HistoryLen: res.HistoryLen, + NOut: res.NOut, + } +} diff --git a/internal/demod/gpudemod/compare_pipeline.go b/internal/demod/gpudemod/compare_pipeline.go new file mode 100644 index 0000000..5578fd9 --- /dev/null +++ b/internal/demod/gpudemod/compare_pipeline.go @@ -0,0 +1,27 @@ +package gpudemod + +func CompareOracleAndGPUStub(oracle 
StreamingExtractResult, gpu StreamingExtractResult) (ExtractDebugMetrics, CompareStats) { + stats := CompareComplexSlices(oracle.IQ, gpu.IQ) + metrics := ExtractDebugMetrics{ + SignalID: oracle.SignalID, + PhaseCount: gpu.PhaseCount, + HistoryLen: gpu.HistoryLen, + NOut: gpu.NOut, + RefMaxAbsErr: stats.MaxAbsErr, + RefRMSErr: stats.RMSErr, + } + return metrics, stats +} + +func CompareOracleAndGPUHostOracle(oracle StreamingExtractResult, gpu StreamingExtractResult) (ExtractDebugMetrics, CompareStats) { + stats := CompareComplexSlices(oracle.IQ, gpu.IQ) + metrics := ExtractDebugMetrics{ + SignalID: oracle.SignalID, + PhaseCount: gpu.PhaseCount, + HistoryLen: gpu.HistoryLen, + NOut: gpu.NOut, + RefMaxAbsErr: stats.MaxAbsErr, + RefRMSErr: stats.RMSErr, + } + return metrics, stats +} diff --git a/internal/demod/gpudemod/compare_pipeline_test.go b/internal/demod/gpudemod/compare_pipeline_test.go new file mode 100644 index 0000000..9337674 --- /dev/null +++ b/internal/demod/gpudemod/compare_pipeline_test.go @@ -0,0 +1,32 @@ +package gpudemod + +import "testing" + +func TestCompareOracleAndGPUStub(t *testing.T) { + oracle := StreamingExtractResult{ + SignalID: 1, + IQ: []complex64{1 + 1i, 2 + 2i}, + Rate: 200000, + NOut: 2, + PhaseCount: 0, + HistoryLen: 64, + } + gpu := StreamingExtractResult{ + SignalID: 1, + IQ: []complex64{1 + 1i, 2.1 + 2i}, + Rate: 200000, + NOut: 2, + PhaseCount: 3, + HistoryLen: 64, + } + metrics, stats := CompareOracleAndGPUStub(oracle, gpu) + if metrics.SignalID != 1 { + t.Fatalf("unexpected signal id: %d", metrics.SignalID) + } + if stats.Count != 2 { + t.Fatalf("unexpected compare count: %d", stats.Count) + } + if metrics.RefMaxAbsErr <= 0 { + t.Fatalf("expected positive max abs error") + } +} diff --git a/internal/demod/gpudemod/compare_state.go b/internal/demod/gpudemod/compare_state.go new file mode 100644 index 0000000..34e35d0 --- /dev/null +++ b/internal/demod/gpudemod/compare_state.go @@ -0,0 +1,12 @@ +package gpudemod + +type 
ExtractDebugMetrics struct { + SignalID int64 + PhaseCount int + HistoryLen int + NOut int + RefMaxAbsErr float64 + RefRMSErr float64 + BoundaryDelta float64 + BoundaryD2 float64 +} diff --git a/internal/demod/gpudemod/compare_test.go b/internal/demod/gpudemod/compare_test.go new file mode 100644 index 0000000..643c61e --- /dev/null +++ b/internal/demod/gpudemod/compare_test.go @@ -0,0 +1,18 @@ +package gpudemod + +import "testing" + +func TestCompareComplexSlices(t *testing.T) { + a := []complex64{1 + 1i, 2 + 2i, 3 + 3i} + b := []complex64{1 + 1i, 2.1 + 2i, 2.9 + 3.2i} + stats := CompareComplexSlices(a, b) + if stats.Count != 3 { + t.Fatalf("unexpected count: %d", stats.Count) + } + if stats.MaxAbsErr <= 0 { + t.Fatalf("expected positive max abs error") + } + if stats.RMSErr <= 0 { + t.Fatalf("expected positive rms error") + } +} diff --git a/internal/demod/gpudemod/cpu_oracle.go b/internal/demod/gpudemod/cpu_oracle.go new file mode 100644 index 0000000..d045072 --- /dev/null +++ b/internal/demod/gpudemod/cpu_oracle.go @@ -0,0 +1,170 @@ +package gpudemod + +import ( + "fmt" + "math" +) + +type CPUOracleState struct { + SignalID int64 + ConfigHash uint64 + NCOPhase float64 + Decim int + PhaseCount int + NumTaps int + ShiftedHistory []complex64 + BaseTaps []float32 + PolyphaseTaps []float32 +} + +func ResetCPUOracleStateIfConfigChanged(state *CPUOracleState, newHash uint64) { + if state == nil { + return + } + if state.ConfigHash != newHash { + state.ConfigHash = newHash + state.NCOPhase = 0 + state.PhaseCount = 0 + state.ShiftedHistory = state.ShiftedHistory[:0] + } +} + +func CPUOracleExtract(iqNew []complex64, state *CPUOracleState, phaseInc float64) []complex64 { + if state == nil || state.NumTaps <= 0 || state.Decim <= 0 || len(state.BaseTaps) < state.NumTaps { + return nil + } + out := make([]complex64, 0, len(iqNew)/maxInt(1, state.Decim)+2) + phase := state.NCOPhase + hist := append([]complex64(nil), state.ShiftedHistory...) 
+ + for _, x := range iqNew { + rot := complex64(complex(math.Cos(phase), math.Sin(phase))) + s := x * rot + hist = append(hist, s) + state.PhaseCount++ + + if state.PhaseCount == state.Decim { + var y complex64 + for k := 0; k < state.NumTaps; k++ { + idx := len(hist) - 1 - k + var sample complex64 + if idx >= 0 { + sample = hist[idx] + } + y += complex(state.BaseTaps[k], 0) * sample + } + out = append(out, y) + state.PhaseCount = 0 + } + + if len(hist) > state.NumTaps-1 { + hist = hist[len(hist)-(state.NumTaps-1):] + } + + phase += phaseInc + if phase >= math.Pi { + phase -= 2 * math.Pi + } else if phase < -math.Pi { + phase += 2 * math.Pi + } + } + + state.NCOPhase = phase + state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) + return out +} + +// CPUOracleExtractPolyphase keeps the same streaming state semantics as CPUOracleExtract, +// but computes outputs using the explicit phase-major polyphase tap layout. +func CPUOracleExtractPolyphase(iqNew []complex64, state *CPUOracleState, phaseInc float64) []complex64 { + if state == nil || state.NumTaps <= 0 || state.Decim <= 0 || len(state.BaseTaps) < state.NumTaps { + return nil + } + if len(state.PolyphaseTaps) == 0 { + state.PolyphaseTaps = BuildPolyphaseTapsPhaseMajor(state.BaseTaps, state.Decim) + } + phaseLen := PolyphasePhaseLen(len(state.BaseTaps), state.Decim) + out := make([]complex64, 0, len(iqNew)/maxInt(1, state.Decim)+2) + phase := state.NCOPhase + hist := append([]complex64(nil), state.ShiftedHistory...) 
+ + for _, x := range iqNew { + rot := complex64(complex(math.Cos(phase), math.Sin(phase))) + s := x * rot + hist = append(hist, s) + state.PhaseCount++ + + if state.PhaseCount == state.Decim { + var y complex64 + for p := 0; p < state.Decim; p++ { + for k := 0; k < phaseLen; k++ { + tap := state.PolyphaseTaps[p*phaseLen+k] + if tap == 0 { + continue + } + srcBack := p + k*state.Decim + idx := len(hist) - 1 - srcBack + if idx < 0 { + continue + } + y += complex(tap, 0) * hist[idx] + } + } + out = append(out, y) + state.PhaseCount = 0 + } + + if len(hist) > state.NumTaps-1 { + hist = hist[len(hist)-(state.NumTaps-1):] + } + + phase += phaseInc + if phase >= math.Pi { + phase -= 2 * math.Pi + } else if phase < -math.Pi { + phase += 2 * math.Pi + } + } + + state.NCOPhase = phase + state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) + return out +} + +func RunChunkedCPUOracle(all []complex64, chunkSizes []int, mkState func() *CPUOracleState, phaseInc float64) []complex64 { + state := mkState() + out := make([]complex64, 0) + pos := 0 + for _, n := range chunkSizes { + if pos >= len(all) { + break + } + end := pos + n + if end > len(all) { + end = len(all) + } + out = append(out, CPUOracleExtract(all[pos:end], state, phaseInc)...) + pos = end + } + if pos < len(all) { + out = append(out, CPUOracleExtract(all[pos:], state, phaseInc)...) 
+ } + return out +} + +func ExactIntegerDecimation(sampleRate int, outRate int) (int, error) { + if sampleRate <= 0 || outRate <= 0 { + return 0, fmt.Errorf("invalid sampleRate/outRate: %d/%d", sampleRate, outRate) + } + if sampleRate%outRate != 0 { + return 0, fmt.Errorf("streaming polyphase extractor requires integer decimation: sampleRate=%d outRate=%d", sampleRate, outRate) + } + return sampleRate / outRate, nil +} + +func maxInt(a int, b int) int { + if a > b { + return a + } + return b +} diff --git a/internal/demod/gpudemod/cpu_oracle_test.go b/internal/demod/gpudemod/cpu_oracle_test.go new file mode 100644 index 0000000..762caeb --- /dev/null +++ b/internal/demod/gpudemod/cpu_oracle_test.go @@ -0,0 +1,89 @@ +package gpudemod + +import ( + "math" + "math/cmplx" + "testing" +) + +func makeDeterministicIQ(n int) []complex64 { + out := make([]complex64, n) + for i := 0; i < n; i++ { + a := 0.017 * float64(i) + b := 0.031 * float64(i) + out[i] = complex64(complex(math.Cos(a)+0.2*math.Cos(b), math.Sin(a)+0.15*math.Sin(b))) + } + return out +} + +func makeLowpassTaps(n int) []float32 { + out := make([]float32, n) + for i := range out { + out[i] = 1.0 / float32(n) + } + return out +} + +func requireComplexSlicesClose(t *testing.T, a []complex64, b []complex64, tol float64) { + t.Helper() + if len(a) != len(b) { + t.Fatalf("length mismatch: %d vs %d", len(a), len(b)) + } + for i := range a { + if cmplx.Abs(complex128(a[i]-b[i])) > tol { + t.Fatalf("slice mismatch at %d: %v vs %v (tol=%f)", i, a[i], b[i], tol) + } + } +} + +func TestCPUOracleMonolithicVsChunked(t *testing.T) { + iq := makeDeterministicIQ(200000) + mk := func() *CPUOracleState { + return &CPUOracleState{ + SignalID: 1, + ConfigHash: 123, + NCOPhase: 0, + Decim: 20, + PhaseCount: 0, + NumTaps: 65, + ShiftedHistory: make([]complex64, 0, 64), + BaseTaps: makeLowpassTaps(65), + } + } + phaseInc := 0.017 + monoState := mk() + mono := CPUOracleExtract(iq, monoState, phaseInc) + chunked := 
RunChunkedCPUOracle(iq, []int{4096, 5000, 8192, 27307}, mk, phaseInc) + requireComplexSlicesClose(t, mono, chunked, 1e-5) +} + +func TestExactIntegerDecimation(t *testing.T) { + if d, err := ExactIntegerDecimation(4000000, 200000); err != nil || d != 20 { + t.Fatalf("unexpected exact decim result: d=%d err=%v", d, err) + } + if _, err := ExactIntegerDecimation(4000000, 192000); err == nil { + t.Fatalf("expected non-integer decimation error") + } +} + +func TestCPUOracleDirectVsPolyphase(t *testing.T) { + iq := makeDeterministicIQ(50000) + mk := func() *CPUOracleState { + taps := makeLowpassTaps(65) + return &CPUOracleState{ + SignalID: 1, + ConfigHash: 123, + NCOPhase: 0, + Decim: 20, + PhaseCount: 0, + NumTaps: 65, + ShiftedHistory: make([]complex64, 0, 64), + BaseTaps: taps, + PolyphaseTaps: BuildPolyphaseTapsPhaseMajor(taps, 20), + } + } + phaseInc := 0.017 + direct := CPUOracleExtract(iq, mk(), phaseInc) + poly := CPUOracleExtractPolyphase(iq, mk(), phaseInc) + requireComplexSlicesClose(t, direct, poly, 1e-5) +} diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu index 6081b57..d2bceae 100644 --- a/internal/demod/gpudemod/native/exports.cu +++ b/internal/demod/gpudemod/native/exports.cu @@ -11,6 +11,10 @@ typedef void* gpud_stream_handle; +static __forceinline__ int gpud_max_i(int a, int b) { + return a > b ? 
a : b; +} + GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) { if (!out) return -1; cudaStream_t stream; @@ -320,3 +324,308 @@ GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda( gpud_ssb_product_kernel<<>>(in, out, n, phase_inc, phase_start); return (int)cudaGetLastError(); } + +__global__ void gpud_streaming_polyphase_accum_kernel( + const float2* __restrict__ history_state, + int history_len, + const float2* __restrict__ shifted_new, + int n_new, + const float* __restrict__ polyphase_taps, + int polyphase_len, + int decim, + int phase_len, + int start_idx, + int n_out, + float2* __restrict__ out +); + +__global__ void gpud_streaming_history_tail_kernel( + const float2* __restrict__ history_state, + int history_len, + const float2* __restrict__ shifted_new, + int n_new, + int keep, + float2* __restrict__ history_out +); + +static __forceinline__ double gpud_reduce_phase(double phase); + +// Transitional legacy entrypoint retained for bring-up and comparison. +// The production-native streaming path is gpud_launch_streaming_polyphase_stateful_cuda, +// which preserves per-signal carry state across NEW-samples-only chunks. +GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_prepare_cuda( + const float2* in_new, + int n_new, + const float2* history_in, + int history_len, + const float* polyphase_taps, + int polyphase_len, + int decim, + int num_taps, + int phase_count_in, + double phase_start, + double phase_inc, + float2* out, + int* n_out, + int* phase_count_out, + double* phase_end_out, + float2* history_out +) { + if (n_new < 0 || !polyphase_taps || polyphase_len <= 0 || decim <= 0 || num_taps <= 0) return -1; + const int phase_len = (num_taps + decim - 1) / decim; + if (polyphase_len < decim * phase_len) return -2; + + const int keep = num_taps > 1 ? 
num_taps - 1 : 0; + int clamped_history_len = history_len; + if (clamped_history_len < 0) clamped_history_len = 0; + if (clamped_history_len > keep) clamped_history_len = keep; + if (clamped_history_len > 0 && !history_in) return -5; + + float2* shifted = NULL; + cudaError_t err = cudaSuccess; + if (n_new > 0) { + if (!in_new) return -3; + err = cudaMalloc((void**)&shifted, (size_t)gpud_max_i(1, n_new) * sizeof(float2)); + if (err != cudaSuccess) return (int)err; + const int block = 256; + const int grid_shift = (n_new + block - 1) / block; + gpud_freq_shift_kernel<<<grid_shift, block>>>(in_new, shifted, n_new, phase_inc, phase_start); + err = cudaGetLastError(); + if (err != cudaSuccess) { + cudaFree(shifted); + return (int)err; + } + } + + int phase_count = phase_count_in; + if (phase_count < 0) phase_count = 0; + if (phase_count >= decim) phase_count %= decim; + const int total_phase = phase_count + n_new; + const int out_count = total_phase / decim; + if (out_count > 0) { + if (!out) { + cudaFree(shifted); + return -4; + } + const int block = 256; + const int grid = (out_count + block - 1) / block; + const int start_idx = decim - phase_count - 1; + gpud_streaming_polyphase_accum_kernel<<<grid, block>>>( + history_in, + clamped_history_len, + shifted, + n_new, + polyphase_taps, + polyphase_len, + decim, + phase_len, + start_idx, + out_count, + out + ); + err = cudaGetLastError(); + if (err != cudaSuccess) { + cudaFree(shifted); + return (int)err; + } + } + + if (history_out && keep > 0) { + const int new_history_len = clamped_history_len + n_new < keep ? 
clamped_history_len + n_new : keep; + if (new_history_len > 0) { + const int block = 256; + const int grid = (new_history_len + block - 1) / block; + gpud_streaming_history_tail_kernel<<<grid, block>>>( + history_in, + clamped_history_len, + shifted, + n_new, + new_history_len, + history_out + ); + err = cudaGetLastError(); + if (err != cudaSuccess) { + cudaFree(shifted); + return (int)err; + } + } + } + + if (n_out) *n_out = out_count; + if (phase_count_out) *phase_count_out = total_phase % decim; + if (phase_end_out) *phase_end_out = gpud_reduce_phase(phase_start + phase_inc * (double)n_new); + + if (shifted) cudaFree(shifted); + return 0; +} + +static __device__ __forceinline__ float2 gpud_stream_sample_at( + const float2* __restrict__ history_state, + int history_len, + const float2* __restrict__ shifted_new, + int n_new, + int idx +) { + if (idx < 0) return make_float2(0.0f, 0.0f); + if (idx < history_len) return history_state[idx]; + int shifted_idx = idx - history_len; + if (shifted_idx < 0 || shifted_idx >= n_new) return make_float2(0.0f, 0.0f); + return shifted_new[shifted_idx]; +} + +__global__ void gpud_streaming_polyphase_accum_kernel( + const float2* __restrict__ history_state, + int history_len, + const float2* __restrict__ shifted_new, + int n_new, + const float* __restrict__ polyphase_taps, + int polyphase_len, + int decim, + int phase_len, + int start_idx, + int n_out, + float2* __restrict__ out +) { + int out_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (out_idx >= n_out) return; + + int newest = history_len + start_idx + out_idx * decim; + float acc_r = 0.0f; + float acc_i = 0.0f; + for (int p = 0; p < decim; ++p) { + for (int k = 0; k < phase_len; ++k) { + int tap_idx = p * phase_len + k; + if (tap_idx >= polyphase_len) continue; + float tap = polyphase_taps[tap_idx]; + if (tap == 0.0f) continue; + int src_back = p + k * decim; + int src_idx = newest - src_back; + float2 sample = gpud_stream_sample_at(history_state, history_len, shifted_new, n_new, 
src_idx); + acc_r += sample.x * tap; + acc_i += sample.y * tap; + } + } + out[out_idx] = make_float2(acc_r, acc_i); +} + +__global__ void gpud_streaming_history_tail_kernel( + const float2* __restrict__ history_state, + int history_len, + const float2* __restrict__ shifted_new, + int n_new, + int keep, + float2* __restrict__ history_out +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= keep) return; + int combined_len = history_len + n_new; + int src_idx = combined_len - keep + idx; + history_out[idx] = gpud_stream_sample_at(history_state, history_len, shifted_new, n_new, src_idx); +} + +static __forceinline__ double gpud_reduce_phase(double phase) { + const double TWO_PI = 6.283185307179586; + return phase - rint(phase / TWO_PI) * TWO_PI; +} + +// Production-native candidate entrypoint for the stateful streaming extractor. +// Callers provide only NEW samples; overlap+trim is intentionally not part of this path. +GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_stateful_cuda( + const float2* in_new, + int n_new, + float2* shifted_new_tmp, + const float* polyphase_taps, + int polyphase_len, + int decim, + int num_taps, + float2* history_state, + float2* history_scratch, + int history_cap, + int* history_len_io, + int* phase_count_state, + double* phase_state, + double phase_inc, + float2* out, + int out_cap, + int* n_out +) { + if (!polyphase_taps || decim <= 0 || num_taps <= 0 || !history_len_io || !phase_count_state || !phase_state || !n_out) return -10; + if (n_new < 0 || out_cap < 0 || history_cap < 0) return -11; + const int phase_len = (num_taps + decim - 1) / decim; + if (polyphase_len < decim * phase_len) return -12; + + int history_len = *history_len_io; + if (history_len < 0) history_len = 0; + if (history_len > history_cap) history_len = history_cap; + + int phase_count = *phase_count_state; + if (phase_count < 0) phase_count = 0; + if (phase_count >= decim) phase_count %= decim; + + double phase_start = *phase_state; + if (n_new 
> 0) { + if (!in_new || !shifted_new_tmp) return -13; + const int block = 256; + const int grid = (n_new + block - 1) / block; + gpud_freq_shift_kernel<<<grid, block>>>(in_new, shifted_new_tmp, n_new, phase_inc, phase_start); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return (int)err; + } + + const int total_phase = phase_count + n_new; + const int out_count = total_phase / decim; + if (out_count > out_cap) return -14; + + if (out_count > 0) { + if (!out) return -15; + const int block = 256; + const int grid = (out_count + block - 1) / block; + const int start_idx = decim - phase_count - 1; + gpud_streaming_polyphase_accum_kernel<<<grid, block>>>( + history_state, + history_len, + shifted_new_tmp, + n_new, + polyphase_taps, + polyphase_len, + decim, + phase_len, + start_idx, + out_count, + out + ); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return (int)err; + } + + int new_history_len = history_len; + if (history_cap > 0) { + new_history_len = history_len + n_new; + if (new_history_len > history_cap) new_history_len = history_cap; + if (new_history_len > 0) { + if (!history_state || !history_scratch) return -16; + const int block = 256; + const int grid = (new_history_len + block - 1) / block; + gpud_streaming_history_tail_kernel<<<grid, block>>>( + history_state, + history_len, + shifted_new_tmp, + n_new, + new_history_len, + history_scratch + ); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return (int)err; + err = cudaMemcpy(history_state, history_scratch, (size_t)new_history_len * sizeof(float2), cudaMemcpyDeviceToDevice); + if (err != cudaSuccess) return (int)err; + } + } else { + new_history_len = 0; + } + + *history_len_io = new_history_len; + *phase_count_state = total_phase % decim; + *phase_state = gpud_reduce_phase(phase_start + phase_inc * (double)n_new); + *n_out = out_count; + return 0; +} diff --git a/internal/demod/gpudemod/oracle_runner_test.go b/internal/demod/gpudemod/oracle_runner_test.go new file mode 100644 index 
0000000..e7d27bd --- /dev/null +++ b/internal/demod/gpudemod/oracle_runner_test.go @@ -0,0 +1,31 @@ +package gpudemod + +import "testing" + +func TestCPUOracleRunnerCleansUpDisappearedSignals(t *testing.T) { + r := NewCPUOracleRunner(4000000) + jobs1 := []StreamingExtractJob{ + {SignalID: 1, OffsetHz: 1000, Bandwidth: 20000, OutRate: 200000, NumTaps: 65, ConfigHash: 101}, + {SignalID: 2, OffsetHz: 2000, Bandwidth: 20000, OutRate: 200000, NumTaps: 65, ConfigHash: 102}, + } + _, err := r.StreamingExtract(makeDeterministicIQ(4096), jobs1) + if err != nil { + t.Fatalf("unexpected error on first extract: %v", err) + } + if len(r.States) != 2 { + t.Fatalf("expected 2 states, got %d", len(r.States)) + } + jobs2 := []StreamingExtractJob{ + {SignalID: 2, OffsetHz: 2000, Bandwidth: 20000, OutRate: 200000, NumTaps: 65, ConfigHash: 102}, + } + _, err = r.StreamingExtract(makeDeterministicIQ(2048), jobs2) + if err != nil { + t.Fatalf("unexpected error on second extract: %v", err) + } + if len(r.States) != 1 { + t.Fatalf("expected 1 state after cleanup, got %d", len(r.States)) + } + if _, ok := r.States[1]; ok { + t.Fatalf("expected signal 1 state to be cleaned up") + } +} diff --git a/internal/demod/gpudemod/oracle_validation_test.go b/internal/demod/gpudemod/oracle_validation_test.go new file mode 100644 index 0000000..7026dcb --- /dev/null +++ b/internal/demod/gpudemod/oracle_validation_test.go @@ -0,0 +1,45 @@ +package gpudemod + +import "testing" + +func TestCPUOracleMonolithicVsChunkedPolyphase(t *testing.T) { + iq := makeDeterministicIQ(120000) + mk := func() *CPUOracleState { + taps := makeLowpassTaps(65) + return &CPUOracleState{ + SignalID: 1, + ConfigHash: 999, + NCOPhase: 0, + Decim: 20, + PhaseCount: 0, + NumTaps: 65, + ShiftedHistory: make([]complex64, 0, 64), + BaseTaps: taps, + PolyphaseTaps: BuildPolyphaseTapsPhaseMajor(taps, 20), + } + } + phaseInc := 0.013 + mono := CPUOracleExtractPolyphase(iq, mk(), phaseInc) + chunked := func() []complex64 { + state := mk() 
+ out := make([]complex64, 0) + chunks := []int{4096, 3000, 8192, 7777, 12000} + pos := 0 + for _, n := range chunks { + if pos >= len(iq) { + break + } + end := pos + n + if end > len(iq) { + end = len(iq) + } + out = append(out, CPUOracleExtractPolyphase(iq[pos:end], state, phaseInc)...) + pos = end + } + if pos < len(iq) { + out = append(out, CPUOracleExtractPolyphase(iq[pos:], state, phaseInc)...) + } + return out + }() + requireComplexSlicesClose(t, mono, chunked, 1e-5) +} diff --git a/internal/demod/gpudemod/polyphase.go b/internal/demod/gpudemod/polyphase.go new file mode 100644 index 0000000..f92acd7 --- /dev/null +++ b/internal/demod/gpudemod/polyphase.go @@ -0,0 +1,28 @@ +package gpudemod + +// BuildPolyphaseTapsPhaseMajor builds a phase-major polyphase tap layout: +// tapsByPhase[p][k] = h[p + k*D] +// Flattened as: [phase0 taps..., phase1 taps..., ...] +func BuildPolyphaseTapsPhaseMajor(base []float32, decim int) []float32 { + if decim <= 0 || len(base) == 0 { + return nil + } + maxPhaseLen := (len(base) + decim - 1) / decim + out := make([]float32, decim*maxPhaseLen) + for p := 0; p < decim; p++ { + for k := 0; k < maxPhaseLen; k++ { + src := p + k*decim + if src < len(base) { + out[p*maxPhaseLen+k] = base[src] + } + } + } + return out +} + +func PolyphasePhaseLen(baseLen int, decim int) int { + if decim <= 0 || baseLen <= 0 { + return 0 + } + return (baseLen + decim - 1) / decim +} diff --git a/internal/demod/gpudemod/polyphase_test.go b/internal/demod/gpudemod/polyphase_test.go new file mode 100644 index 0000000..bd8ecb9 --- /dev/null +++ b/internal/demod/gpudemod/polyphase_test.go @@ -0,0 +1,22 @@ +package gpudemod + +import "testing" + +func TestBuildPolyphaseTapsPhaseMajor(t *testing.T) { + base := []float32{1, 2, 3, 4, 5, 6, 7} + got := BuildPolyphaseTapsPhaseMajor(base, 3) + // phase-major with phase len ceil(7/3)=3 + want := []float32{ + 1, 4, 7, + 2, 5, 0, + 3, 6, 0, + } + if len(got) != len(want) { + t.Fatalf("len mismatch: got %d want %d", 
len(got), len(want)) + } + for i := range want { + if got[i] != want[i] { + t.Fatalf("mismatch at %d: got %v want %v", i, got[i], want[i]) + } + } +} diff --git a/internal/demod/gpudemod/state_reset_test.go b/internal/demod/gpudemod/state_reset_test.go new file mode 100644 index 0000000..9345caa --- /dev/null +++ b/internal/demod/gpudemod/state_reset_test.go @@ -0,0 +1,57 @@ +package gpudemod + +import "testing" + +func TestResetCPUOracleStateIfConfigChanged(t *testing.T) { + state := &CPUOracleState{ + SignalID: 1, + ConfigHash: 111, + NCOPhase: 1.23, + Decim: 20, + PhaseCount: 7, + NumTaps: 65, + ShiftedHistory: []complex64{1 + 1i, 2 + 2i}, + } + ResetCPUOracleStateIfConfigChanged(state, 222) + if state.ConfigHash != 222 { + t.Fatalf("config hash not updated") + } + if state.NCOPhase != 0 { + t.Fatalf("expected phase reset") + } + if state.PhaseCount != 0 { + t.Fatalf("expected phase count reset") + } + if len(state.ShiftedHistory) != 0 { + t.Fatalf("expected shifted history reset") + } +} + +func TestResetExtractStreamState(t *testing.T) { + state := &ExtractStreamState{ + SignalID: 1, + ConfigHash: 111, + NCOPhase: 2.34, + Decim: 20, + PhaseCount: 9, + NumTaps: 65, + ShiftedHistory: []complex64{3 + 3i, 4 + 4i}, + Initialized: true, + } + ResetExtractStreamState(state, 333) + if state.ConfigHash != 333 { + t.Fatalf("config hash not updated") + } + if state.NCOPhase != 0 { + t.Fatalf("expected phase reset") + } + if state.PhaseCount != 0 { + t.Fatalf("expected phase count reset") + } + if len(state.ShiftedHistory) != 0 { + t.Fatalf("expected shifted history reset") + } + if state.Initialized { + t.Fatalf("expected initialized=false after reset") + } +} diff --git a/internal/demod/gpudemod/stream_state.go b/internal/demod/gpudemod/stream_state.go new file mode 100644 index 0000000..26bc5fd --- /dev/null +++ b/internal/demod/gpudemod/stream_state.go @@ -0,0 +1,70 @@ +package gpudemod + +import ( + "log" + + "sdr-wideband-suite/internal/dsp" +) + +func (r 
*BatchRunner) ResetSignalState(signalID int64) { + if r == nil || r.streamState == nil { + return + } + delete(r.streamState, signalID) + r.resetNativeStreamingState(signalID) +} + +func (r *BatchRunner) ResetAllSignalStates() { + if r == nil { + return + } + r.streamState = make(map[int64]*ExtractStreamState) + r.resetAllNativeStreamingStates() +} + +func (r *BatchRunner) getOrInitExtractState(job StreamingExtractJob, sampleRate int) (*ExtractStreamState, error) { + if r == nil { + return nil, ErrUnavailable + } + if r.streamState == nil { + r.streamState = make(map[int64]*ExtractStreamState) + } + decim, err := ExactIntegerDecimation(sampleRate, job.OutRate) + if err != nil { + return nil, err + } + state := r.streamState[job.SignalID] + if state == nil { + state = &ExtractStreamState{SignalID: job.SignalID} + r.streamState[job.SignalID] = state + } + if state.ConfigHash != job.ConfigHash { + if state.Initialized { + log.Printf("STREAMING STATE RESET: signal=%d oldHash=%d newHash=%d historyLen=%d", + job.SignalID, state.ConfigHash, job.ConfigHash, len(state.ShiftedHistory)) + } + ResetExtractStreamState(state, job.ConfigHash) + } + state.Decim = decim + state.NumTaps = job.NumTaps + if state.NumTaps <= 0 { + state.NumTaps = 101 + } + cutoff := job.Bandwidth / 2 + if cutoff < 200 { + cutoff = 200 + } + base := dsp.LowpassFIR(cutoff, sampleRate, state.NumTaps) + state.BaseTaps = make([]float32, len(base)) + for i, v := range base { + state.BaseTaps[i] = float32(v) + } + state.PolyphaseTaps = BuildPolyphaseTapsPhaseMajor(state.BaseTaps, state.Decim) + if cap(state.ShiftedHistory) < maxInt(0, state.NumTaps-1) { + state.ShiftedHistory = make([]complex64, 0, maxInt(0, state.NumTaps-1)) + } else if state.ShiftedHistory == nil { + state.ShiftedHistory = make([]complex64, 0, maxInt(0, state.NumTaps-1)) + } + state.Initialized = true + return state, nil +} diff --git a/internal/demod/gpudemod/stream_state_test.go b/internal/demod/gpudemod/stream_state_test.go new file mode 
100644 index 0000000..b86c5f5 --- /dev/null +++ b/internal/demod/gpudemod/stream_state_test.go @@ -0,0 +1,31 @@ +package gpudemod + +import "testing" + +func TestGetOrInitExtractStateInitializesPolyphaseAndHistory(t *testing.T) { + r := &BatchRunner{streamState: make(map[int64]*ExtractStreamState)} + job := StreamingExtractJob{ + SignalID: 7, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 555, + } + state, err := r.getOrInitExtractState(job, 4000000) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if state.Decim != 20 { + t.Fatalf("unexpected decim: %d", state.Decim) + } + if len(state.BaseTaps) != 65 { + t.Fatalf("unexpected base taps len: %d", len(state.BaseTaps)) + } + if len(state.PolyphaseTaps) == 0 { + t.Fatalf("expected polyphase taps") + } + if cap(state.ShiftedHistory) < 64 { + t.Fatalf("expected shifted history capacity >= 64, got %d", cap(state.ShiftedHistory)) + } +} diff --git a/internal/demod/gpudemod/streaming_gpu_contract.go b/internal/demod/gpudemod/streaming_gpu_contract.go new file mode 100644 index 0000000..c978f22 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_contract.go @@ -0,0 +1,39 @@ +package gpudemod + +type StreamingGPUExecutionMode string + +const ( + StreamingGPUExecUnavailable StreamingGPUExecutionMode = "unavailable" + StreamingGPUExecHostOracle StreamingGPUExecutionMode = "host_oracle" + StreamingGPUExecCUDA StreamingGPUExecutionMode = "cuda" +) + +type StreamingGPUInvocation struct { + SignalID int64 + ConfigHash uint64 + OffsetHz float64 + OutRate int + Bandwidth float64 + SampleRate int + NumTaps int + Decim int + PhaseCountIn int + NCOPhaseIn float64 + HistoryLen int + BaseTaps []float32 + PolyphaseTaps []float32 + ShiftedHistory []complex64 + IQNew []complex64 +} + +type StreamingGPUExecutionResult struct { + SignalID int64 + Mode StreamingGPUExecutionMode + IQ []complex64 + Rate int + NOut int + PhaseCountOut int + NCOPhaseOut float64 + HistoryOut []complex64 + 
HistoryLenOut int +} diff --git a/internal/demod/gpudemod/streaming_gpu_exec.go b/internal/demod/gpudemod/streaming_gpu_exec.go new file mode 100644 index 0000000..23ec814 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_exec.go @@ -0,0 +1,29 @@ +package gpudemod + +// StreamingExtractGPUExec is the internal execution selector for the new +// production-path semantics. It intentionally keeps the public API stable while +// allowing the implementation to evolve from host-side oracle execution toward +// a real GPU polyphase path. +func (r *BatchRunner) StreamingExtractGPUExec(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { + invocations, err := r.buildStreamingGPUInvocations(iqNew, jobs) + if err != nil { + return nil, err + } + if useGPUNativePreparedExecution { + execResults, err := r.executeStreamingGPUNativePrepared(invocations) + if err == nil { + return r.applyStreamingGPUExecutionResults(execResults), nil + } + if !useGPUHostOracleExecution { + return nil, err + } + } + if useGPUHostOracleExecution { + execResults, err := r.executeStreamingGPUHostOraclePrepared(invocations) + if err != nil { + return nil, err + } + return r.applyStreamingGPUExecutionResults(execResults), nil + } + return nil, ErrUnavailable +} diff --git a/internal/demod/gpudemod/streaming_gpu_exec_test.go b/internal/demod/gpudemod/streaming_gpu_exec_test.go new file mode 100644 index 0000000..9933cdf --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_exec_test.go @@ -0,0 +1,112 @@ +package gpudemod + +import "testing" + +func TestStreamingExtractGPUExecUsesSafeDefaultMode(t *testing.T) { + r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} + job := StreamingExtractJob{ + SignalID: 1, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 777, + } + res, err := r.StreamingExtractGPUExec(makeDeterministicIQ(2048), []StreamingExtractJob{job}) + if err != nil { + 
t.Fatalf("expected safe default execution path, got error: %v", err) + } + if len(res) != 1 { + t.Fatalf("expected 1 result, got %d", len(res)) + } + if res[0].Rate != job.OutRate { + t.Fatalf("expected output rate %d, got %d", job.OutRate, res[0].Rate) + } + if res[0].NOut <= 0 { + t.Fatalf("expected streaming output samples") + } +} + +func TestStreamingGPUExecMatchesCPUOracleAcrossChunkPatterns(t *testing.T) { + job := StreamingExtractJob{ + SignalID: 1, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 777, + } + t.Run("DeterministicIQ", func(t *testing.T) { + r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} + steps := makeStreamingValidationSteps( + makeDeterministicIQ(1500), + []int{0, 1, 2, 17, 63, 64, 65, 129, 511}, + []StreamingExtractJob{job}, + ) + runStreamingExecSequenceAgainstOracle(t, r, steps, 1e-5, 1e-9) + }) + t.Run("ToneNoiseIQ", func(t *testing.T) { + r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} + steps := makeStreamingValidationSteps( + makeToneNoiseIQ(4096, 0.023), + []int{7, 20, 3, 63, 64, 65, 777}, + []StreamingExtractJob{job}, + ) + runStreamingExecSequenceAgainstOracle(t, r, steps, 1e-5, 1e-9) + }) +} + +func TestStreamingGPUExecLifecycleMatchesCPUOracle(t *testing.T) { + r := &BatchRunner{ + eng: &Engine{sampleRate: 4000000}, + streamState: make(map[int64]*ExtractStreamState), + nativeState: make(map[int64]*nativeStreamingSignalState), + } + baseA := StreamingExtractJob{ + SignalID: 11, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 1001, + } + baseB := StreamingExtractJob{ + SignalID: 22, + OffsetHz: -18750, + Bandwidth: 16000, + OutRate: 100000, + NumTaps: 33, + ConfigHash: 2002, + } + steps := []streamingValidationStep{ + { + name: "prime_both_signals", + iq: makeDeterministicIQ(512), + jobs: []StreamingExtractJob{baseA, baseB}, + }, + { + name: 
"config_reset_with_zero_new", + iq: nil, + jobs: []StreamingExtractJob{{SignalID: baseA.SignalID, OffsetHz: baseA.OffsetHz, Bandwidth: baseA.Bandwidth, OutRate: baseA.OutRate, NumTaps: baseA.NumTaps, ConfigHash: baseA.ConfigHash + 1}, baseB}, + }, + { + name: "signal_b_disappears", + iq: makeToneNoiseIQ(96, 0.041), + jobs: []StreamingExtractJob{baseA}, + }, + { + name: "signal_b_reappears_fresh", + iq: makeDeterministicIQ(160), + jobs: []StreamingExtractJob{baseA, baseB}, + }, + { + name: "small_history_boundary_chunk", + iq: makeToneNoiseIQ(65, 0.017), + jobs: []StreamingExtractJob{baseA, baseB}, + }, + } + runStreamingExecSequenceAgainstOracle(t, r, steps, 1e-5, 1e-9) + if _, ok := r.nativeState[baseB.SignalID]; ok { + t.Fatalf("expected safe host-oracle path to keep native state inactive while gate is off") + } +} diff --git a/internal/demod/gpudemod/streaming_gpu_host_exec.go b/internal/demod/gpudemod/streaming_gpu_host_exec.go new file mode 100644 index 0000000..02d5953 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_host_exec.go @@ -0,0 +1,30 @@ +package gpudemod + +func (r *BatchRunner) executeStreamingGPUHostOraclePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { + results := make([]StreamingGPUExecutionResult, len(invocations)) + for i, inv := range invocations { + out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( + inv.IQNew, + inv.SampleRate, + inv.OffsetHz, + inv.NCOPhaseIn, + inv.PhaseCountIn, + inv.NumTaps, + inv.Decim, + inv.ShiftedHistory, + inv.PolyphaseTaps, + ) + results[i] = StreamingGPUExecutionResult{ + SignalID: inv.SignalID, + Mode: StreamingGPUExecHostOracle, + IQ: out, + Rate: inv.OutRate, + NOut: len(out), + PhaseCountOut: phaseCount, + NCOPhaseOut: phase, + HistoryOut: hist, + HistoryLenOut: len(hist), + } + } + return results, nil +} diff --git a/internal/demod/gpudemod/streaming_gpu_host_oracle.go b/internal/demod/gpudemod/streaming_gpu_host_oracle.go new file mode 100644 
index 0000000..aa2825e --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_host_oracle.go @@ -0,0 +1,49 @@ +package gpudemod + +// StreamingExtractGPUHostOracle is a temporary host-side execution of the intended +// streaming semantics using GPU-owned stream state. It is not the final GPU +// production implementation, but it allows the new production entrypoint to move +// from pure stub semantics toward real NEW-samples-only streaming behavior +// without reintroducing overlap+trim. +func (r *BatchRunner) StreamingExtractGPUHostOracle(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { + if r == nil || r.eng == nil { + return nil, ErrUnavailable + } + results := make([]StreamingExtractResult, len(jobs)) + active := make(map[int64]struct{}, len(jobs)) + for i, job := range jobs { + active[job.SignalID] = struct{}{} + state, err := r.getOrInitExtractState(job, r.eng.sampleRate) + if err != nil { + return nil, err + } + out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( + iqNew, + r.eng.sampleRate, + job.OffsetHz, + state.NCOPhase, + state.PhaseCount, + state.NumTaps, + state.Decim, + state.ShiftedHistory, + state.PolyphaseTaps, + ) + state.NCOPhase = phase + state.PhaseCount = phaseCount + state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) 
+ results[i] = StreamingExtractResult{ + SignalID: job.SignalID, + IQ: out, + Rate: job.OutRate, + NOut: len(out), + PhaseCount: state.PhaseCount, + HistoryLen: len(state.ShiftedHistory), + } + } + for signalID := range r.streamState { + if _, ok := active[signalID]; !ok { + delete(r.streamState, signalID) + } + } + return results, nil +} diff --git a/internal/demod/gpudemod/streaming_gpu_host_oracle_test.go b/internal/demod/gpudemod/streaming_gpu_host_oracle_test.go new file mode 100644 index 0000000..b889ba5 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_host_oracle_test.go @@ -0,0 +1,35 @@ +package gpudemod + +import "testing" + +func TestStreamingGPUHostOracleComparableToCPUOracle(t *testing.T) { + r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} + job := StreamingExtractJob{ + SignalID: 1, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 777, + } + iq := makeDeterministicIQ(16000) + gpuLike, err := r.StreamingExtractGPUHostOracle(iq, []StreamingExtractJob{job}) + if err != nil { + t.Fatalf("unexpected host-oracle error: %v", err) + } + oracleRunner := NewCPUOracleRunner(4000000) + oracle, err := oracleRunner.StreamingExtract(iq, []StreamingExtractJob{job}) + if err != nil { + t.Fatalf("unexpected oracle error: %v", err) + } + if len(gpuLike) != 1 || len(oracle) != 1 { + t.Fatalf("unexpected result lengths: gpuLike=%d oracle=%d", len(gpuLike), len(oracle)) + } + metrics, stats := CompareOracleAndGPUHostOracle(oracle[0], gpuLike[0]) + if stats.Count == 0 { + t.Fatalf("expected compare count > 0") + } + if metrics.RefMaxAbsErr > 1e-5 { + t.Fatalf("expected host-oracle path to match cpu oracle closely, got max abs err %f", metrics.RefMaxAbsErr) + } +} diff --git a/internal/demod/gpudemod/streaming_gpu_modes.go b/internal/demod/gpudemod/streaming_gpu_modes.go new file mode 100644 index 0000000..c5e858d --- /dev/null +++ 
b/internal/demod/gpudemod/streaming_gpu_modes.go @@ -0,0 +1,4 @@ +package gpudemod + +const useGPUHostOracleExecution = false +const useGPUNativePreparedExecution = true diff --git a/internal/demod/gpudemod/streaming_gpu_native_prepare.go b/internal/demod/gpudemod/streaming_gpu_native_prepare.go new file mode 100644 index 0000000..247998d --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_native_prepare.go @@ -0,0 +1,284 @@ +//go:build cufft && windows + +package gpudemod + +/* +#cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" +#include +typedef struct { float x; float y; } gpud_float2; +*/ +import "C" + +import ( + "math" + "unsafe" +) + +func (r *BatchRunner) executeStreamingGPUNativePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { + if r == nil || r.eng == nil { + return nil, ErrUnavailable + } + if r.nativeState == nil { + r.nativeState = make(map[int64]*nativeStreamingSignalState) + } + results := make([]StreamingGPUExecutionResult, len(invocations)) + for i, inv := range invocations { + state, err := r.getOrInitNativeStreamingState(inv) + if err != nil { + return nil, err + } + if len(inv.IQNew) > 0 { + if err := ensureNativeBuffer(&state.dInNew, &state.inNewCap, len(inv.IQNew), unsafe.Sizeof(C.gpud_float2{})); err != nil { + return nil, err + } + if bridgeMemcpyH2D(state.dInNew, unsafe.Pointer(&inv.IQNew[0]), uintptr(len(inv.IQNew))*unsafe.Sizeof(complex64(0))) != 0 { + return nil, ErrUnavailable + } + } + outCap := len(inv.IQNew)/maxInt(1, inv.Decim) + 2 + if outCap > 0 { + if err := ensureNativeBuffer(&state.dOut, &state.outCap, outCap, unsafe.Sizeof(C.gpud_float2{})); err != nil { + return nil, err + } + } + + phaseInc := -2.0 * math.Pi * inv.OffsetHz / float64(inv.SampleRate) + // The native export consumes phase carry as host scalars while sample/history + // buffers remain device-resident, so keep these counters in nativeState. 
+ var nOut C.int + historyLen := C.int(state.historyLen) + phaseCount := C.int(state.phaseCount) + phaseNCO := C.double(state.phaseNCO) + res := bridgeLaunchStreamingPolyphaseStateful( + (*C.gpud_float2)(state.dInNew), + len(inv.IQNew), + (*C.gpud_float2)(state.dShifted), + (*C.float)(state.dTaps), + state.tapsLen, + state.decim, + state.numTaps, + (*C.gpud_float2)(state.dHistory), + (*C.gpud_float2)(state.dHistoryScratch), + state.historyCap, + &historyLen, + &phaseCount, + &phaseNCO, + phaseInc, + (*C.gpud_float2)(state.dOut), + outCap, + &nOut, + ) + if res != 0 { + return nil, ErrUnavailable + } + state.historyLen = int(historyLen) + state.phaseCount = int(phaseCount) + state.phaseNCO = float64(phaseNCO) + + outHost := make([]complex64, int(nOut)) + if len(outHost) > 0 { + if bridgeMemcpyD2H(unsafe.Pointer(&outHost[0]), state.dOut, uintptr(len(outHost))*unsafe.Sizeof(complex64(0))) != 0 { + return nil, ErrUnavailable + } + } + histHost := make([]complex64, state.historyLen) + if state.historyLen > 0 { + if bridgeMemcpyD2H(unsafe.Pointer(&histHost[0]), state.dHistory, uintptr(state.historyLen)*unsafe.Sizeof(complex64(0))) != 0 { + return nil, ErrUnavailable + } + } + + results[i] = StreamingGPUExecutionResult{ + SignalID: inv.SignalID, + Mode: StreamingGPUExecCUDA, + IQ: outHost, + Rate: inv.OutRate, + NOut: len(outHost), + PhaseCountOut: state.phaseCount, + NCOPhaseOut: state.phaseNCO, + HistoryOut: histHost, + HistoryLenOut: len(histHost), + } + } + return results, nil +} + +func (r *BatchRunner) getOrInitNativeStreamingState(inv StreamingGPUInvocation) (*nativeStreamingSignalState, error) { + state := r.nativeState[inv.SignalID] + needReset := false + historyCap := maxInt(0, inv.NumTaps-1) + if state == nil { + state = &nativeStreamingSignalState{signalID: inv.SignalID} + r.nativeState[inv.SignalID] = state + needReset = true + } + if state.configHash != inv.ConfigHash { + needReset = true + } + if state.decim != inv.Decim || state.numTaps != inv.NumTaps || 
state.tapsLen != len(inv.PolyphaseTaps) { + needReset = true + } + if state.historyCap != historyCap { + needReset = true + } + if needReset { + releaseNativeStreamingSignalState(state) + } + if len(inv.PolyphaseTaps) == 0 { + return nil, ErrUnavailable + } + if state.dTaps == nil && len(inv.PolyphaseTaps) > 0 { + if bridgeCudaMalloc(&state.dTaps, uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(C.float(0))) != 0 { + return nil, ErrUnavailable + } + if bridgeMemcpyH2D(state.dTaps, unsafe.Pointer(&inv.PolyphaseTaps[0]), uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(float32(0))) != 0 { + return nil, ErrUnavailable + } + state.tapsLen = len(inv.PolyphaseTaps) + } + if state.dShifted == nil { + minCap := maxInt(1, len(inv.IQNew)) + if bridgeCudaMalloc(&state.dShifted, uintptr(minCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { + return nil, ErrUnavailable + } + state.shiftedCap = minCap + } + if state.shiftedCap < len(inv.IQNew) { + if bridgeCudaFree(state.dShifted) != 0 { + return nil, ErrUnavailable + } + state.dShifted = nil + state.shiftedCap = 0 + if bridgeCudaMalloc(&state.dShifted, uintptr(len(inv.IQNew))*unsafe.Sizeof(C.gpud_float2{})) != 0 { + return nil, ErrUnavailable + } + state.shiftedCap = len(inv.IQNew) + } + if state.dHistory == nil && historyCap > 0 { + if bridgeCudaMalloc(&state.dHistory, uintptr(historyCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { + return nil, ErrUnavailable + } + } + if state.dHistoryScratch == nil && historyCap > 0 { + if bridgeCudaMalloc(&state.dHistoryScratch, uintptr(historyCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { + return nil, ErrUnavailable + } + state.historyScratchCap = historyCap + } + if needReset { + state.phaseCount = inv.PhaseCountIn + state.phaseNCO = inv.NCOPhaseIn + state.historyLen = minInt(len(inv.ShiftedHistory), historyCap) + if state.historyLen > 0 { + if bridgeMemcpyH2D(state.dHistory, unsafe.Pointer(&inv.ShiftedHistory[len(inv.ShiftedHistory)-state.historyLen]), uintptr(state.historyLen)*unsafe.Sizeof(complex64(0))) 
!= 0 { + return nil, ErrUnavailable + } + } + } + state.decim = inv.Decim + state.numTaps = inv.NumTaps + state.historyCap = historyCap + state.historyScratchCap = historyCap + state.configHash = inv.ConfigHash + return state, nil +} + +func ensureNativeBuffer(ptr *unsafe.Pointer, capRef *int, need int, elemSize uintptr) error { + if need <= 0 { + return nil + } + if *ptr != nil && *capRef >= need { + return nil + } + if *ptr != nil { + if bridgeCudaFree(*ptr) != 0 { + return ErrUnavailable + } + *ptr = nil + *capRef = 0 + } + if bridgeCudaMalloc(ptr, uintptr(need)*elemSize) != 0 { + return ErrUnavailable + } + *capRef = need + return nil +} + +func (r *BatchRunner) syncNativeStreamingStates(active map[int64]struct{}) { + if r == nil || r.nativeState == nil { + return + } + for id, state := range r.nativeState { + if _, ok := active[id]; ok { + continue + } + releaseNativeStreamingSignalState(state) + delete(r.nativeState, id) + } +} + +func (r *BatchRunner) resetNativeStreamingState(signalID int64) { + if r == nil || r.nativeState == nil { + return + } + if state := r.nativeState[signalID]; state != nil { + releaseNativeStreamingSignalState(state) + } + delete(r.nativeState, signalID) +} + +func (r *BatchRunner) resetAllNativeStreamingStates() { + if r == nil { + return + } + r.freeAllNativeStreamingStates() + r.nativeState = make(map[int64]*nativeStreamingSignalState) +} + +func (r *BatchRunner) freeAllNativeStreamingStates() { + if r == nil || r.nativeState == nil { + return + } + for id, state := range r.nativeState { + releaseNativeStreamingSignalState(state) + delete(r.nativeState, id) + } +} + +func releaseNativeStreamingSignalState(state *nativeStreamingSignalState) { + if state == nil { + return + } + for _, ptr := range []*unsafe.Pointer{ + &state.dInNew, + &state.dShifted, + &state.dOut, + &state.dTaps, + &state.dHistory, + &state.dHistoryScratch, + } { + if *ptr != nil { + _ = bridgeCudaFree(*ptr) + *ptr = nil + } + } + state.inNewCap = 0 + 
state.shiftedCap = 0 + state.outCap = 0 + state.tapsLen = 0 + state.historyCap = 0 + state.historyLen = 0 + state.historyScratchCap = 0 + state.phaseCount = 0 + state.phaseNCO = 0 + state.decim = 0 + state.numTaps = 0 + state.configHash = 0 +} + +func minInt(a int, b int) int { + if a < b { + return a + } + return b +} diff --git a/internal/demod/gpudemod/streaming_gpu_native_prepare_stub.go b/internal/demod/gpudemod/streaming_gpu_native_prepare_stub.go new file mode 100644 index 0000000..7f1e4c0 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_native_prepare_stub.go @@ -0,0 +1,44 @@ +//go:build !cufft || !windows + +package gpudemod + +func (r *BatchRunner) executeStreamingGPUNativePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { + _ = invocations + return nil, ErrUnavailable +} + +func (r *BatchRunner) syncNativeStreamingStates(active map[int64]struct{}) { + _ = active + if r == nil { + return + } + if r.nativeState == nil { + r.nativeState = make(map[int64]*nativeStreamingSignalState) + } + for id := range r.nativeState { + if _, ok := active[id]; !ok { + delete(r.nativeState, id) + } + } +} + +func (r *BatchRunner) resetNativeStreamingState(signalID int64) { + if r == nil || r.nativeState == nil { + return + } + delete(r.nativeState, signalID) +} + +func (r *BatchRunner) resetAllNativeStreamingStates() { + if r == nil { + return + } + r.nativeState = make(map[int64]*nativeStreamingSignalState) +} + +func (r *BatchRunner) freeAllNativeStreamingStates() { + if r == nil { + return + } + r.nativeState = nil +} diff --git a/internal/demod/gpudemod/streaming_gpu_native_prepare_test.go b/internal/demod/gpudemod/streaming_gpu_native_prepare_test.go new file mode 100644 index 0000000..9312d65 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_native_prepare_test.go @@ -0,0 +1,206 @@ +//go:build cufft && windows + +package gpudemod + +import ( + "os" + "path/filepath" + "testing" +) + +func 
configureNativePreparedDLLPath(t *testing.T) { + t.Helper() + candidates := []string{ + filepath.Join("build", "gpudemod_kernels.dll"), + filepath.Join("internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"), + "gpudemod_kernels.dll", + } + for _, candidate := range candidates { + if _, err := os.Stat(candidate); err == nil { + abs, err := filepath.Abs(candidate) + if err != nil { + t.Fatalf("resolve native prepared DLL path: %v", err) + } + t.Setenv("GPUMOD_DLL", abs) + return + } + } +} + +func requireNativePreparedTestRunner(t *testing.T) *BatchRunner { + t.Helper() + configureNativePreparedDLLPath(t) + if err := ensureDLLLoaded(); err != nil { + t.Skipf("native prepared path unavailable: %v", err) + } + if !Available() { + t.Skip("native prepared path unavailable: cuda device not available") + } + r, err := NewBatchRunner(32768, 4000000) + if err != nil { + t.Skipf("native prepared path unavailable: %v", err) + } + t.Cleanup(r.Close) + return r +} + +func TestStreamingGPUNativePreparedMatchesCPUOracleAcrossChunkPatterns(t *testing.T) { + job := StreamingExtractJob{ + SignalID: 1, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 777, + } + exec := func(r *BatchRunner, invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { + return r.executeStreamingGPUNativePrepared(invocations) + } + t.Run("DeterministicIQ", func(t *testing.T) { + r := requireNativePreparedTestRunner(t) + steps := makeStreamingValidationSteps( + makeDeterministicIQ(8192), + []int{0, 1, 2, 17, 63, 64, 65, 129, 511, 2048}, + []StreamingExtractJob{job}, + ) + runPreparedSequenceAgainstOracle(t, r, exec, steps, 1e-4, 1e-8) + }) + t.Run("ToneNoiseIQ", func(t *testing.T) { + r := requireNativePreparedTestRunner(t) + steps := makeStreamingValidationSteps( + makeToneNoiseIQ(12288, 0.023), + []int{7, 20, 3, 63, 64, 65, 777, 2048, 4096}, + []StreamingExtractJob{job}, + ) + runPreparedSequenceAgainstOracle(t, r, exec, steps, 1e-4, 
1e-8) + }) +} + +func TestStreamingGPUNativePreparedLifecycleResetAndCapacity(t *testing.T) { + r := requireNativePreparedTestRunner(t) + exec := func(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { + return r.executeStreamingGPUNativePrepared(invocations) + } + jobA := StreamingExtractJob{ + SignalID: 11, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 3001, + } + jobB := StreamingExtractJob{ + SignalID: 22, + OffsetHz: -18750, + Bandwidth: 16000, + OutRate: 100000, + NumTaps: 33, + ConfigHash: 4002, + } + + steps := []streamingValidationStep{ + { + name: "prime_both_signals", + iq: makeDeterministicIQ(256), + jobs: []StreamingExtractJob{jobA, jobB}, + }, + { + name: "grow_capacity", + iq: makeToneNoiseIQ(4096, 0.037), + jobs: []StreamingExtractJob{jobA, jobB}, + }, + { + name: "config_reset_zero_new", + iq: nil, + jobs: []StreamingExtractJob{{SignalID: jobA.SignalID, OffsetHz: jobA.OffsetHz, Bandwidth: jobA.Bandwidth, OutRate: jobA.OutRate, NumTaps: jobA.NumTaps, ConfigHash: jobA.ConfigHash + 1}, jobB}, + }, + { + name: "signal_b_disappears", + iq: makeDeterministicIQ(64), + jobs: []StreamingExtractJob{jobA}, + }, + { + name: "signal_b_reappears", + iq: makeToneNoiseIQ(96, 0.017), + jobs: []StreamingExtractJob{jobA, jobB}, + }, + { + name: "history_boundary", + iq: makeDeterministicIQ(65), + jobs: []StreamingExtractJob{jobA, jobB}, + }, + } + + oracle := NewCPUOracleRunner(r.eng.sampleRate) + var grownCap int + for idx, step := range steps { + invocations, err := r.buildStreamingGPUInvocations(step.iq, step.jobs) + if err != nil { + t.Fatalf("step %d (%s): build invocations failed: %v", idx, step.name, err) + } + got, err := exec(invocations) + if err != nil { + t.Fatalf("step %d (%s): native prepared exec failed: %v", idx, step.name, err) + } + want, err := oracle.StreamingExtract(step.iq, step.jobs) + if err != nil { + t.Fatalf("step %d (%s): oracle failed: %v", idx, step.name, err) + } + if 
len(got) != len(want) { + t.Fatalf("step %d (%s): result count mismatch: got=%d want=%d", idx, step.name, len(got), len(want)) + } + applied := r.applyStreamingGPUExecutionResults(got) + for i, job := range step.jobs { + oracleState := oracle.States[job.SignalID] + requirePreparedExecutionResultMatchesOracle(t, got[i], want[i], oracleState, 1e-4, 1e-8) + requireStreamingExtractResultMatchesOracle(t, applied[i], want[i]) + requireExtractStateMatchesOracle(t, r.streamState[job.SignalID], oracleState, 1e-8, 1e-4) + + state := r.nativeState[job.SignalID] + if state == nil { + t.Fatalf("step %d (%s): missing native state for signal %d", idx, step.name, job.SignalID) + } + if state.configHash != job.ConfigHash { + t.Fatalf("step %d (%s): native config hash mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.configHash, job.ConfigHash) + } + if state.decim != oracleState.Decim { + t.Fatalf("step %d (%s): native decim mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.decim, oracleState.Decim) + } + if state.numTaps != oracleState.NumTaps { + t.Fatalf("step %d (%s): native num taps mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.numTaps, oracleState.NumTaps) + } + if state.historyCap != maxInt(0, oracleState.NumTaps-1) { + t.Fatalf("step %d (%s): native history cap mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.historyCap, maxInt(0, oracleState.NumTaps-1)) + } + if state.historyLen != len(oracleState.ShiftedHistory) { + t.Fatalf("step %d (%s): native history len mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.historyLen, len(oracleState.ShiftedHistory)) + } + if len(step.iq) > 0 && state.shiftedCap < len(step.iq) { + t.Fatalf("step %d (%s): native shifted capacity too small for signal %d: got=%d need>=%d", idx, step.name, job.SignalID, state.shiftedCap, len(step.iq)) + } + if state.outCap < got[i].NOut { + t.Fatalf("step %d (%s): native 
out capacity too small for signal %d: got=%d need>=%d", idx, step.name, job.SignalID, state.outCap, got[i].NOut) + } + if job.SignalID == jobA.SignalID && state.shiftedCap > grownCap { + grownCap = state.shiftedCap + } + } + if step.name == "grow_capacity" && grownCap < len(step.iq) { + t.Fatalf("expected capacity growth for signal %d, got=%d want>=%d", jobA.SignalID, grownCap, len(step.iq)) + } + if step.name == "config_reset_zero_new" { + state := r.nativeState[jobA.SignalID] + if state == nil { + t.Fatalf("missing native state for signal %d after config reset", jobA.SignalID) + } + if state.historyLen != 0 { + t.Fatalf("expected cleared native history after config reset, got=%d", state.historyLen) + } + } + if step.name == "signal_b_disappears" { + if _, ok := r.nativeState[jobB.SignalID]; ok { + t.Fatalf("expected native state for signal %d to be removed on disappearance", jobB.SignalID) + } + } + } +} diff --git a/internal/demod/gpudemod/streaming_gpu_native_state.go b/internal/demod/gpudemod/streaming_gpu_native_state.go new file mode 100644 index 0000000..e1b6460 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_native_state.go @@ -0,0 +1,28 @@ +package gpudemod + +import "unsafe" + +type nativeStreamingSignalState struct { + signalID int64 + + configHash uint64 + decim int + numTaps int + + dInNew unsafe.Pointer + dShifted unsafe.Pointer + dOut unsafe.Pointer + dTaps unsafe.Pointer + dHistory unsafe.Pointer + dHistoryScratch unsafe.Pointer + + inNewCap int + shiftedCap int + outCap int + tapsLen int + historyCap int + historyLen int + historyScratchCap int + phaseCount int + phaseNCO float64 +} diff --git a/internal/demod/gpudemod/streaming_gpu_prepare.go b/internal/demod/gpudemod/streaming_gpu_prepare.go new file mode 100644 index 0000000..8e8a957 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_prepare.go @@ -0,0 +1,61 @@ +package gpudemod + +func (r *BatchRunner) buildStreamingGPUInvocations(iqNew []complex64, jobs []StreamingExtractJob) 
([]StreamingGPUInvocation, error) { + if r == nil || r.eng == nil { + return nil, ErrUnavailable + } + invocations := make([]StreamingGPUInvocation, len(jobs)) + active := make(map[int64]struct{}, len(jobs)) + for i, job := range jobs { + active[job.SignalID] = struct{}{} + state, err := r.getOrInitExtractState(job, r.eng.sampleRate) + if err != nil { + return nil, err + } + invocations[i] = StreamingGPUInvocation{ + SignalID: job.SignalID, + ConfigHash: state.ConfigHash, + OffsetHz: job.OffsetHz, + OutRate: job.OutRate, + Bandwidth: job.Bandwidth, + SampleRate: r.eng.sampleRate, + NumTaps: state.NumTaps, + Decim: state.Decim, + PhaseCountIn: state.PhaseCount, + NCOPhaseIn: state.NCOPhase, + HistoryLen: len(state.ShiftedHistory), + BaseTaps: append([]float32(nil), state.BaseTaps...), + PolyphaseTaps: append([]float32(nil), state.PolyphaseTaps...), + ShiftedHistory: append([]complex64(nil), state.ShiftedHistory...), + IQNew: iqNew, + } + } + for signalID := range r.streamState { + if _, ok := active[signalID]; !ok { + delete(r.streamState, signalID) + } + } + r.syncNativeStreamingStates(active) + return invocations, nil +} + +func (r *BatchRunner) applyStreamingGPUExecutionResults(results []StreamingGPUExecutionResult) []StreamingExtractResult { + out := make([]StreamingExtractResult, len(results)) + for i, res := range results { + state := r.streamState[res.SignalID] + if state != nil { + state.NCOPhase = res.NCOPhaseOut + state.PhaseCount = res.PhaseCountOut + state.ShiftedHistory = append(state.ShiftedHistory[:0], res.HistoryOut...) 
+ } + out[i] = StreamingExtractResult{ + SignalID: res.SignalID, + IQ: res.IQ, + Rate: res.Rate, + NOut: res.NOut, + PhaseCount: res.PhaseCountOut, + HistoryLen: res.HistoryLenOut, + } + } + return out +} diff --git a/internal/demod/gpudemod/streaming_gpu_stub.go b/internal/demod/gpudemod/streaming_gpu_stub.go new file mode 100644 index 0000000..500e235 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_stub.go @@ -0,0 +1,26 @@ +package gpudemod + +func updateShiftedHistory(prev []complex64, shiftedNew []complex64, numTaps int) []complex64 { + need := numTaps - 1 + if need <= 0 { + return nil + } + combined := append(append(make([]complex64, 0, len(prev)+len(shiftedNew)), prev...), shiftedNew...) + if len(combined) <= need { + out := make([]complex64, len(combined)) + copy(out, combined) + return out + } + out := make([]complex64, need) + copy(out, combined[len(combined)-need:]) + return out +} + +// StreamingExtractGPU is the production entry point for the stateful streaming +// extractor path. Execution strategy is selected by StreamingExtractGPUExec. 
+func (r *BatchRunner) StreamingExtractGPU(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { + if r == nil || r.eng == nil { + return nil, ErrUnavailable + } + return r.StreamingExtractGPUExec(iqNew, jobs) +} diff --git a/internal/demod/gpudemod/streaming_gpu_stub_test.go b/internal/demod/gpudemod/streaming_gpu_stub_test.go new file mode 100644 index 0000000..2c947d3 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_stub_test.go @@ -0,0 +1,59 @@ +package gpudemod + +import "testing" + +func TestStreamingGPUUsesSafeProductionDefault(t *testing.T) { + r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} + job := StreamingExtractJob{ + SignalID: 1, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 777, + } + iq := makeDeterministicIQ(1000) + results, err := r.StreamingExtractGPU(iq, []StreamingExtractJob{job}) + if err != nil { + t.Fatalf("expected safe production default path, got error: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if results[0].NOut == 0 { + t.Fatalf("expected non-zero output count from safe production path") + } +} + +func TestStreamingGPUHostOracleAdvancesState(t *testing.T) { + r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} + job := StreamingExtractJob{ + SignalID: 1, + OffsetHz: 12500, + Bandwidth: 20000, + OutRate: 200000, + NumTaps: 65, + ConfigHash: 777, + } + iq := makeDeterministicIQ(1000) + results, err := r.StreamingExtractGPUHostOracle(iq, []StreamingExtractJob{job}) + if err != nil { + t.Fatalf("unexpected host-oracle error: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + state := r.streamState[1] + if state == nil { + t.Fatalf("expected state to be initialized") + } + if state.NCOPhase == 0 { + t.Fatalf("expected phase to advance") + } + if 
len(state.ShiftedHistory) == 0 { + t.Fatalf("expected shifted history to be updated") + } + if results[0].NOut == 0 { + t.Fatalf("expected non-zero output count from host oracle path") + } +} diff --git a/internal/demod/gpudemod/streaming_gpu_validation_helpers_test.go b/internal/demod/gpudemod/streaming_gpu_validation_helpers_test.go new file mode 100644 index 0000000..b88b102 --- /dev/null +++ b/internal/demod/gpudemod/streaming_gpu_validation_helpers_test.go @@ -0,0 +1,213 @@ +package gpudemod + +import ( + "math" + "testing" +) + +type streamingValidationStep struct { + name string + iq []complex64 + jobs []StreamingExtractJob +} + +type streamingPreparedExecutor func(*BatchRunner, []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) + +func makeToneNoiseIQ(n int, phaseInc float64) []complex64 { + out := make([]complex64, n) + phase := 0.0 + for i := 0; i < n; i++ { + tone := complex(math.Cos(phase), math.Sin(phase)) + noiseI := 0.17*math.Cos(0.113*float64(i)+0.31) + 0.07*math.Sin(0.071*float64(i)) + noiseQ := 0.13*math.Sin(0.097*float64(i)+0.11) - 0.05*math.Cos(0.043*float64(i)) + out[i] = complex64(0.85*tone + 0.15*complex(noiseI, noiseQ)) + phase += phaseInc + } + return out +} + +func makeStreamingValidationSteps(iq []complex64, chunkSizes []int, jobs []StreamingExtractJob) []streamingValidationStep { + steps := make([]streamingValidationStep, 0, len(chunkSizes)+1) + pos := 0 + for idx, n := range chunkSizes { + if n < 0 { + n = 0 + } + end := pos + n + if end > len(iq) { + end = len(iq) + } + steps = append(steps, streamingValidationStep{ + name: "chunk", + iq: append([]complex64(nil), iq[pos:end]...), + jobs: append([]StreamingExtractJob(nil), jobs...), + }) + _ = idx + pos = end + } + if pos < len(iq) { + steps = append(steps, streamingValidationStep{ + name: "remainder", + iq: append([]complex64(nil), iq[pos:]...), + jobs: append([]StreamingExtractJob(nil), jobs...), + }) + } + return steps +} + +func requirePhaseClose(t *testing.T, got 
float64, want float64, tol float64) { + t.Helper() + diff := got - want + for diff > math.Pi { + diff -= 2 * math.Pi + } + for diff < -math.Pi { + diff += 2 * math.Pi + } + if math.Abs(diff) > tol { + t.Fatalf("phase mismatch: got=%0.12f want=%0.12f diff=%0.12f tol=%0.12f", got, want, diff, tol) + } +} + +func requireStreamingExtractResultMatchesOracle(t *testing.T, got StreamingExtractResult, want StreamingExtractResult) { + t.Helper() + if got.SignalID != want.SignalID { + t.Fatalf("signal id mismatch: got=%d want=%d", got.SignalID, want.SignalID) + } + if got.Rate != want.Rate { + t.Fatalf("rate mismatch for signal %d: got=%d want=%d", got.SignalID, got.Rate, want.Rate) + } + if got.NOut != want.NOut { + t.Fatalf("n_out mismatch for signal %d: got=%d want=%d", got.SignalID, got.NOut, want.NOut) + } + if got.PhaseCount != want.PhaseCount { + t.Fatalf("phase count mismatch for signal %d: got=%d want=%d", got.SignalID, got.PhaseCount, want.PhaseCount) + } + if got.HistoryLen != want.HistoryLen { + t.Fatalf("history len mismatch for signal %d: got=%d want=%d", got.SignalID, got.HistoryLen, want.HistoryLen) + } +} + +func requirePreparedExecutionResultMatchesOracle(t *testing.T, got StreamingGPUExecutionResult, want StreamingExtractResult, oracleState *CPUOracleState, sampleTol float64, phaseTol float64) { + t.Helper() + if oracleState == nil { + t.Fatalf("missing oracle state for signal %d", got.SignalID) + } + if got.SignalID != want.SignalID { + t.Fatalf("signal id mismatch: got=%d want=%d", got.SignalID, want.SignalID) + } + if got.Rate != want.Rate { + t.Fatalf("rate mismatch for signal %d: got=%d want=%d", got.SignalID, got.Rate, want.Rate) + } + if got.NOut != want.NOut { + t.Fatalf("n_out mismatch for signal %d: got=%d want=%d", got.SignalID, got.NOut, want.NOut) + } + if got.PhaseCountOut != oracleState.PhaseCount { + t.Fatalf("phase count mismatch for signal %d: got=%d want=%d", got.SignalID, got.PhaseCountOut, oracleState.PhaseCount) + } + 
requirePhaseClose(t, got.NCOPhaseOut, oracleState.NCOPhase, phaseTol) + if got.HistoryLenOut != len(oracleState.ShiftedHistory) { + t.Fatalf("history len mismatch for signal %d: got=%d want=%d", got.SignalID, got.HistoryLenOut, len(oracleState.ShiftedHistory)) + } + requireComplexSlicesClose(t, got.IQ, want.IQ, sampleTol) + requireComplexSlicesClose(t, got.HistoryOut, oracleState.ShiftedHistory, sampleTol) +} + +func requireExtractStateMatchesOracle(t *testing.T, got *ExtractStreamState, want *CPUOracleState, phaseTol float64, sampleTol float64) { + t.Helper() + if got == nil || want == nil { + t.Fatalf("state mismatch: got nil=%t want nil=%t", got == nil, want == nil) + } + if got.SignalID != want.SignalID { + t.Fatalf("signal id mismatch: got=%d want=%d", got.SignalID, want.SignalID) + } + if got.ConfigHash != want.ConfigHash { + t.Fatalf("config hash mismatch for signal %d: got=%d want=%d", got.SignalID, got.ConfigHash, want.ConfigHash) + } + if got.Decim != want.Decim { + t.Fatalf("decim mismatch for signal %d: got=%d want=%d", got.SignalID, got.Decim, want.Decim) + } + if got.NumTaps != want.NumTaps { + t.Fatalf("num taps mismatch for signal %d: got=%d want=%d", got.SignalID, got.NumTaps, want.NumTaps) + } + if got.PhaseCount != want.PhaseCount { + t.Fatalf("phase count mismatch for signal %d: got=%d want=%d", got.SignalID, got.PhaseCount, want.PhaseCount) + } + requirePhaseClose(t, got.NCOPhase, want.NCOPhase, phaseTol) + requireComplexSlicesClose(t, got.ShiftedHistory, want.ShiftedHistory, sampleTol) +} + +func requireStateKeysMatchOracle(t *testing.T, got map[int64]*ExtractStreamState, want map[int64]*CPUOracleState) { + t.Helper() + if len(got) != len(want) { + t.Fatalf("active state count mismatch: got=%d want=%d", len(got), len(want)) + } + for signalID := range want { + if got[signalID] == nil { + t.Fatalf("missing active state for signal %d", signalID) + } + } + for signalID := range got { + if want[signalID] == nil { + t.Fatalf("unexpected active 
state for signal %d", signalID) + } + } +} + +func runStreamingExecSequenceAgainstOracle(t *testing.T, runner *BatchRunner, steps []streamingValidationStep, sampleTol float64, phaseTol float64) { + t.Helper() + oracle := NewCPUOracleRunner(runner.eng.sampleRate) + for idx, step := range steps { + got, err := runner.StreamingExtractGPUExec(step.iq, step.jobs) + if err != nil { + t.Fatalf("step %d (%s): exec failed: %v", idx, step.name, err) + } + want, err := oracle.StreamingExtract(step.iq, step.jobs) + if err != nil { + t.Fatalf("step %d (%s): oracle failed: %v", idx, step.name, err) + } + if len(got) != len(want) { + t.Fatalf("step %d (%s): result count mismatch: got=%d want=%d", idx, step.name, len(got), len(want)) + } + for i, job := range step.jobs { + requireStreamingExtractResultMatchesOracle(t, got[i], want[i]) + requireComplexSlicesClose(t, got[i].IQ, want[i].IQ, sampleTol) + requireExtractStateMatchesOracle(t, runner.streamState[job.SignalID], oracle.States[job.SignalID], phaseTol, sampleTol) + } + requireStateKeysMatchOracle(t, runner.streamState, oracle.States) + } +} + +func runPreparedSequenceAgainstOracle(t *testing.T, runner *BatchRunner, exec streamingPreparedExecutor, steps []streamingValidationStep, sampleTol float64, phaseTol float64) { + t.Helper() + oracle := NewCPUOracleRunner(runner.eng.sampleRate) + for idx, step := range steps { + invocations, err := runner.buildStreamingGPUInvocations(step.iq, step.jobs) + if err != nil { + t.Fatalf("step %d (%s): build invocations failed: %v", idx, step.name, err) + } + got, err := exec(runner, invocations) + if err != nil { + t.Fatalf("step %d (%s): prepared exec failed: %v", idx, step.name, err) + } + want, err := oracle.StreamingExtract(step.iq, step.jobs) + if err != nil { + t.Fatalf("step %d (%s): oracle failed: %v", idx, step.name, err) + } + if len(got) != len(want) { + t.Fatalf("step %d (%s): result count mismatch: got=%d want=%d", idx, step.name, len(got), len(want)) + } + applied := 
runner.applyStreamingGPUExecutionResults(got) + if len(applied) != len(want) { + t.Fatalf("step %d (%s): applied result count mismatch: got=%d want=%d", idx, step.name, len(applied), len(want)) + } + for i, job := range step.jobs { + oracleState := oracle.States[job.SignalID] + requirePreparedExecutionResultMatchesOracle(t, got[i], want[i], oracleState, sampleTol, phaseTol) + requireStreamingExtractResultMatchesOracle(t, applied[i], want[i]) + requireComplexSlicesClose(t, applied[i].IQ, want[i].IQ, sampleTol) + requireExtractStateMatchesOracle(t, runner.streamState[job.SignalID], oracleState, phaseTol, sampleTol) + } + requireStateKeysMatchOracle(t, runner.streamState, oracle.States) + } +} diff --git a/internal/demod/gpudemod/streaming_host_core.go b/internal/demod/gpudemod/streaming_host_core.go new file mode 100644 index 0000000..f9b75aa --- /dev/null +++ b/internal/demod/gpudemod/streaming_host_core.go @@ -0,0 +1,64 @@ +package gpudemod + +import "math" + +func runStreamingPolyphaseHostCore( + iqNew []complex64, + sampleRate int, + offsetHz float64, + stateNCOPhase float64, + statePhaseCount int, + stateNumTaps int, + stateDecim int, + stateHistory []complex64, + polyphaseTaps []float32, +) ([]complex64, float64, int, []complex64) { + out := make([]complex64, 0, len(iqNew)/maxInt(1, stateDecim)+2) + phase := stateNCOPhase + phaseCount := statePhaseCount + hist := append([]complex64(nil), stateHistory...) 
+ phaseLen := PolyphasePhaseLen(len(polyphaseTaps)/maxInt(1, stateDecim)*maxInt(1, stateDecim), stateDecim) + if phaseLen == 0 { + phaseLen = PolyphasePhaseLen(len(polyphaseTaps), stateDecim) + } + phaseInc := -2.0 * math.Pi * offsetHz / float64(sampleRate) + for _, x := range iqNew { + rot := complex64(complex(math.Cos(phase), math.Sin(phase))) + s := x * rot + hist = append(hist, s) + phaseCount++ + if phaseCount == stateDecim { + var y complex64 + for p := 0; p < stateDecim; p++ { + for k := 0; k < phaseLen; k++ { + idxTap := p*phaseLen + k + if idxTap >= len(polyphaseTaps) { + continue + } + tap := polyphaseTaps[idxTap] + if tap == 0 { + continue + } + srcBack := p + k*stateDecim + idx := len(hist) - 1 - srcBack + if idx < 0 { + continue + } + y += complex(tap, 0) * hist[idx] + } + } + out = append(out, y) + phaseCount = 0 + } + if len(hist) > stateNumTaps-1 { + hist = hist[len(hist)-(stateNumTaps-1):] + } + phase += phaseInc + if phase >= math.Pi { + phase -= 2 * math.Pi + } else if phase < -math.Pi { + phase += 2 * math.Pi + } + } + return out, phase, phaseCount, append([]complex64(nil), hist...) 
+} diff --git a/internal/demod/gpudemod/streaming_host_core_test.go b/internal/demod/gpudemod/streaming_host_core_test.go new file mode 100644 index 0000000..099c755 --- /dev/null +++ b/internal/demod/gpudemod/streaming_host_core_test.go @@ -0,0 +1,40 @@ +package gpudemod + +import "testing" + +func TestRunStreamingPolyphaseHostCoreMatchesCPUOraclePolyphase(t *testing.T) { + cfg := OracleHarnessConfig{ + SignalID: 1, + ConfigHash: 123, + NCOPhase: 0, + Decim: 20, + NumTaps: 65, + PhaseInc: 0.017, + } + state := MakeCPUOracleState(cfg) + iq := MakeDeterministicIQ(12000) + oracle := CPUOracleExtractPolyphase(iq, state, cfg.PhaseInc) + + state2 := MakeCPUOracleState(cfg) + out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( + iq, + 4000000, + -cfg.PhaseInc*4000000/(2*3.141592653589793), + state2.NCOPhase, + state2.PhaseCount, + state2.NumTaps, + state2.Decim, + state2.ShiftedHistory, + state2.PolyphaseTaps, + ) + requireComplexSlicesClose(t, oracle, out, 1e-5) + if phase == 0 && len(iq) > 0 { + t.Fatalf("expected phase to advance") + } + if phaseCount < 0 || phaseCount >= state2.Decim { + t.Fatalf("unexpected phaseCount: %d", phaseCount) + } + if len(hist) == 0 { + t.Fatalf("expected history to be retained") + } +} diff --git a/internal/demod/gpudemod/streaming_oracle_extract.go b/internal/demod/gpudemod/streaming_oracle_extract.go new file mode 100644 index 0000000..eb89b7e --- /dev/null +++ b/internal/demod/gpudemod/streaming_oracle_extract.go @@ -0,0 +1,111 @@ +package gpudemod + +import ( + "fmt" + + "sdr-wideband-suite/internal/dsp" +) + +type CPUOracleRunner struct { + SampleRate int + States map[int64]*CPUOracleState +} + +func (r *CPUOracleRunner) ResetAllStates() { + if r == nil { + return + } + r.States = make(map[int64]*CPUOracleState) +} + +func NewCPUOracleRunner(sampleRate int) *CPUOracleRunner { + return &CPUOracleRunner{ + SampleRate: sampleRate, + States: make(map[int64]*CPUOracleState), + } +} + +func (r *CPUOracleRunner) 
ResetSignalState(signalID int64) { + if r == nil || r.States == nil { + return + } + delete(r.States, signalID) +} + +func (r *CPUOracleRunner) getOrInitState(job StreamingExtractJob) (*CPUOracleState, error) { + if r == nil { + return nil, fmt.Errorf("nil CPUOracleRunner") + } + if r.States == nil { + r.States = make(map[int64]*CPUOracleState) + } + decim, err := ExactIntegerDecimation(r.SampleRate, job.OutRate) + if err != nil { + return nil, err + } + state := r.States[job.SignalID] + if state == nil { + state = &CPUOracleState{SignalID: job.SignalID} + r.States[job.SignalID] = state + } + ResetCPUOracleStateIfConfigChanged(state, job.ConfigHash) + state.Decim = decim + state.NumTaps = job.NumTaps + if state.NumTaps <= 0 { + state.NumTaps = 101 + } + cutoff := job.Bandwidth / 2 + if cutoff < 200 { + cutoff = 200 + } + base := dsp.LowpassFIR(cutoff, r.SampleRate, state.NumTaps) + state.BaseTaps = make([]float32, len(base)) + for i, v := range base { + state.BaseTaps[i] = float32(v) + } + state.PolyphaseTaps = BuildPolyphaseTapsPhaseMajor(state.BaseTaps, state.Decim) + if state.ShiftedHistory == nil { + state.ShiftedHistory = make([]complex64, 0, maxInt(0, state.NumTaps-1)) + } + return state, nil +} + +func (r *CPUOracleRunner) StreamingExtract(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { + results := make([]StreamingExtractResult, len(jobs)) + active := make(map[int64]struct{}, len(jobs)) + for i, job := range jobs { + active[job.SignalID] = struct{}{} + state, err := r.getOrInitState(job) + if err != nil { + return nil, err + } + out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( + iqNew, + r.SampleRate, + job.OffsetHz, + state.NCOPhase, + state.PhaseCount, + state.NumTaps, + state.Decim, + state.ShiftedHistory, + state.PolyphaseTaps, + ) + state.NCOPhase = phase + state.PhaseCount = phaseCount + state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) 
+ results[i] = StreamingExtractResult{ + SignalID: job.SignalID, + IQ: out, + Rate: job.OutRate, + NOut: len(out), + PhaseCount: state.PhaseCount, + HistoryLen: len(state.ShiftedHistory), + } + } + for signalID := range r.States { + if _, ok := active[signalID]; !ok { + delete(r.States, signalID) + } + } + return results, nil +} diff --git a/internal/demod/gpudemod/streaming_types.go b/internal/demod/gpudemod/streaming_types.go new file mode 100644 index 0000000..fb15cb3 --- /dev/null +++ b/internal/demod/gpudemod/streaming_types.go @@ -0,0 +1,64 @@ +package gpudemod + +import ( + "fmt" + "hash/fnv" +) + +type StreamingExtractJob struct { + SignalID int64 + OffsetHz float64 + Bandwidth float64 + OutRate int + NumTaps int + ConfigHash uint64 +} + +type StreamingExtractResult struct { + SignalID int64 + IQ []complex64 + Rate int + NOut int + PhaseCount int + HistoryLen int +} + +type ExtractStreamState struct { + SignalID int64 + ConfigHash uint64 + NCOPhase float64 + Decim int + PhaseCount int + NumTaps int + ShiftedHistory []complex64 + BaseTaps []float32 + PolyphaseTaps []float32 + Initialized bool +} + +func ResetExtractStreamState(state *ExtractStreamState, cfgHash uint64) { + if state == nil { + return + } + state.ConfigHash = cfgHash + state.NCOPhase = 0 + state.PhaseCount = 0 + state.ShiftedHistory = state.ShiftedHistory[:0] + state.Initialized = false +} + +func StreamingConfigHash(signalID int64, offsetHz float64, bandwidth float64, outRate int, numTaps int, sampleRate int) uint64 { + // Hash only structural parameters that change the FIR/decimation geometry. + // Offset is NOT included because the NCO phase_inc tracks it smoothly each frame. + // Bandwidth is NOT included because taps are rebuilt every frame in getOrInitExtractState. + // A state reset (zeroing NCO phase, history, phase count) is only needed when + // decimation factor, tap count, or sample rate changes — all of which affect + // buffer sizes and polyphase structure. 
+ // + // Previous bug: offset and bandwidth were formatted at %.9f precision, causing + // a new hash (and full state reset) every single frame because the detector's + // exponential smoothing changes CenterHz by sub-Hz fractions each frame. + h := fnv.New64a() + _, _ = h.Write([]byte(fmt.Sprintf("sig=%d|out=%d|taps=%d|sr=%d", signalID, outRate, numTaps, sampleRate))) + return h.Sum64() +} diff --git a/internal/demod/gpudemod/test_harness.go b/internal/demod/gpudemod/test_harness.go new file mode 100644 index 0000000..2a74d0b --- /dev/null +++ b/internal/demod/gpudemod/test_harness.go @@ -0,0 +1,78 @@ +package gpudemod + +import ( + "math" +) + +type OracleHarnessConfig struct { + SignalID int64 + ConfigHash uint64 + NCOPhase float64 + Decim int + NumTaps int + PhaseInc float64 +} + +func MakeDeterministicIQ(n int) []complex64 { + out := make([]complex64, n) + for i := 0; i < n; i++ { + a := 0.017 * float64(i) + b := 0.031 * float64(i) + out[i] = complex64(complex(math.Cos(a)+0.2*math.Cos(b), math.Sin(a)+0.15*math.Sin(b))) + } + return out +} + +func MakeToneIQ(n int, phaseInc float64) []complex64 { + out := make([]complex64, n) + phase := 0.0 + for i := 0; i < n; i++ { + out[i] = complex64(complex(math.Cos(phase), math.Sin(phase))) + phase += phaseInc + } + return out +} + +func MakeLowpassTaps(n int) []float32 { + out := make([]float32, n) + for i := range out { + out[i] = 1.0 / float32(n) + } + return out +} + +func MakeCPUOracleState(cfg OracleHarnessConfig) *CPUOracleState { + taps := MakeLowpassTaps(cfg.NumTaps) + return &CPUOracleState{ + SignalID: cfg.SignalID, + ConfigHash: cfg.ConfigHash, + NCOPhase: cfg.NCOPhase, + Decim: cfg.Decim, + PhaseCount: 0, + NumTaps: cfg.NumTaps, + ShiftedHistory: make([]complex64, 0, maxInt(0, cfg.NumTaps-1)), + BaseTaps: taps, + PolyphaseTaps: BuildPolyphaseTapsPhaseMajor(taps, cfg.Decim), + } +} + +func RunChunkedCPUOraclePolyphase(all []complex64, chunkSizes []int, mkState func() *CPUOracleState, phaseInc float64) 
[]complex64 { + state := mkState() + out := make([]complex64, 0) + pos := 0 + for _, n := range chunkSizes { + if pos >= len(all) { + break + } + end := pos + n + if end > len(all) { + end = len(all) + } + out = append(out, CPUOracleExtractPolyphase(all[pos:end], state, phaseInc)...) + pos = end + } + if pos < len(all) { + out = append(out, CPUOracleExtractPolyphase(all[pos:], state, phaseInc)...) + } + return out +} diff --git a/internal/demod/gpudemod/test_harness_test.go b/internal/demod/gpudemod/test_harness_test.go new file mode 100644 index 0000000..c4621b1 --- /dev/null +++ b/internal/demod/gpudemod/test_harness_test.go @@ -0,0 +1,39 @@ +package gpudemod + +import "testing" + +func requireComplexSlicesCloseHarness(t *testing.T, a []complex64, b []complex64, tol float64) { + t.Helper() + if len(a) != len(b) { + t.Fatalf("length mismatch: %d vs %d", len(a), len(b)) + } + for i := range a { + d := CompareComplexSlices([]complex64{a[i]}, []complex64{b[i]}) + if d.MaxAbsErr > tol { + t.Fatalf("slice mismatch at %d: %v vs %v (tol=%f)", i, a[i], b[i], tol) + } + } +} + +func TestHarnessChunkedCPUOraclePolyphase(t *testing.T) { + cfg := OracleHarnessConfig{ + SignalID: 1, + ConfigHash: 123, + NCOPhase: 0, + Decim: 20, + NumTaps: 65, + PhaseInc: 0.017, + } + iq := MakeDeterministicIQ(150000) + mk := func() *CPUOracleState { return MakeCPUOracleState(cfg) } + mono := CPUOracleExtractPolyphase(iq, mk(), cfg.PhaseInc) + chunked := RunChunkedCPUOraclePolyphase(iq, []int{4096, 5000, 8192, 27307}, mk, cfg.PhaseInc) + requireComplexSlicesCloseHarness(t, mono, chunked, 1e-5) +} + +func TestHarnessToneIQ(t *testing.T) { + iq := MakeToneIQ(1024, 0.05) + if len(iq) != 1024 { + t.Fatalf("unexpected tone iq length: %d", len(iq)) + } +} diff --git a/internal/demod/gpudemod/windows_bridge.go b/internal/demod/gpudemod/windows_bridge.go index 3371be7..fbfcc9a 100644 --- a/internal/demod/gpudemod/windows_bridge.go +++ b/internal/demod/gpudemod/windows_bridge.go @@ -4,7 +4,7 @@ package 
gpudemod /* #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" -#cgo windows LDFLAGS: -lcudart64_13 -lkernel32 +#cgo windows LDFLAGS: -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/bin/x64" -l:cudart64_13.dll -lkernel32 #include #include #include @@ -26,6 +26,8 @@ typedef int (__stdcall *gpud_launch_decimate_stream_fn)(const gpud_float2* in, g typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n); typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start); +typedef int (__stdcall *gpud_launch_streaming_polyphase_prepare_fn)(const gpud_float2* in_new, int n_new, const gpud_float2* history_in, int history_len, const float* polyphase_taps, int polyphase_len, int decim, int num_taps, int phase_count_in, double phase_start, double phase_inc, gpud_float2* out, int* n_out, int* phase_count_out, double* phase_end_out, gpud_float2* history_out); +typedef int (__stdcall *gpud_launch_streaming_polyphase_stateful_fn)(const gpud_float2* in_new, int n_new, gpud_float2* shifted_new_tmp, const float* polyphase_taps, int polyphase_len, int decim, int num_taps, gpud_float2* history_state, gpud_float2* history_scratch, int history_cap, int* history_len_io, int* phase_count_state, double* phase_state, double phase_inc, gpud_float2* out, int out_cap, int* n_out); static HMODULE gpud_mod = NULL; static gpud_stream_create_fn gpud_p_stream_create = NULL; @@ -42,6 +44,8 @@ static gpud_launch_decimate_stream_fn gpud_p_launch_decimate_stream = NULL; static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL; static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL; +static gpud_launch_streaming_polyphase_prepare_fn 
gpud_p_launch_streaming_polyphase_prepare = NULL; +static gpud_launch_streaming_polyphase_stateful_fn gpud_p_launch_streaming_polyphase_stateful = NULL; static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); } static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); } @@ -67,6 +71,8 @@ static int gpud_load_library(const char* path) { gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda"); gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda"); + gpud_p_launch_streaming_polyphase_prepare = (gpud_launch_streaming_polyphase_prepare_fn)GetProcAddress(gpud_mod, "gpud_launch_streaming_polyphase_prepare_cuda"); + gpud_p_launch_streaming_polyphase_stateful = (gpud_launch_streaming_polyphase_stateful_fn)GetProcAddress(gpud_mod, "gpud_launch_streaming_polyphase_stateful_cuda"); if (!gpud_p_stream_create || !gpud_p_stream_destroy || !gpud_p_stream_sync || !gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift_stream || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir_stream || !gpud_p_launch_fir || !gpud_p_launch_decimate_stream || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) { FreeLibrary(gpud_mod); gpud_mod = NULL; @@ -89,6 +95,8 @@ static int gpud_launch_decimate_stream(gpud_float2 *in, gpud_float2 *out, int n_ static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { if (!gpud_p_launch_decimate) return -1; return gpud_p_launch_decimate(in, out, n_out, factor); } static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_am_envelope) return -1; return gpud_p_launch_am_envelope(in, out, n); } static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, 
double phase_start) { if (!gpud_p_launch_ssb_product) return -1; return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start); } +static int gpud_launch_streaming_polyphase_prepare(gpud_float2 *in_new, int n_new, gpud_float2 *history_in, int history_len, float *polyphase_taps, int polyphase_len, int decim, int num_taps, int phase_count_in, double phase_start, double phase_inc, gpud_float2 *out, int *n_out, int *phase_count_out, double *phase_end_out, gpud_float2 *history_out) { if (!gpud_p_launch_streaming_polyphase_prepare) return -1; return gpud_p_launch_streaming_polyphase_prepare(in_new, n_new, history_in, history_len, polyphase_taps, polyphase_len, decim, num_taps, phase_count_in, phase_start, phase_inc, out, n_out, phase_count_out, phase_end_out, history_out); } +static int gpud_launch_streaming_polyphase_stateful(gpud_float2 *in_new, int n_new, gpud_float2 *shifted_new_tmp, float *polyphase_taps, int polyphase_len, int decim, int num_taps, gpud_float2 *history_state, gpud_float2 *history_scratch, int history_cap, int *history_len_io, int *phase_count_state, double *phase_state, double phase_inc, gpud_float2 *out, int out_cap, int *n_out) { if (!gpud_p_launch_streaming_polyphase_stateful) return -1; return gpud_p_launch_streaming_polyphase_stateful(in_new, n_new, shifted_new_tmp, polyphase_taps, polyphase_len, decim, num_taps, history_state, history_scratch, history_cap, history_len_io, phase_count_state, phase_state, phase_inc, out, out_cap, n_out); } */ import "C" @@ -103,38 +111,68 @@ func bridgeLoadLibrary(path string) int { defer C.free(unsafe.Pointer(cp)) return int(C.gpud_load_library(cp)) } -func bridgeCudaMalloc(ptr *unsafe.Pointer, bytes uintptr) int { return int(C.gpud_cuda_malloc(ptr, C.size_t(bytes))) } +func bridgeCudaMalloc(ptr *unsafe.Pointer, bytes uintptr) int { + return int(C.gpud_cuda_malloc(ptr, C.size_t(bytes))) +} func bridgeCudaFree(ptr unsafe.Pointer) int { return int(C.gpud_cuda_free(ptr)) } -func bridgeMemcpyH2D(dst 
unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { return int(C.gpud_memcpy_h2d(dst, src, C.size_t(bytes))) } -func bridgeMemcpyD2H(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { return int(C.gpud_memcpy_d2h(dst, src, C.size_t(bytes))) } +func bridgeMemcpyH2D(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { + return int(C.gpud_memcpy_h2d(dst, src, C.size_t(bytes))) +} +func bridgeMemcpyD2H(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { + return int(C.gpud_memcpy_d2h(dst, src, C.size_t(bytes))) +} func bridgeDeviceSync() int { return int(C.gpud_device_sync()) } -func bridgeUploadFIRTaps(taps *C.float, n int) int { return int(C.gpud_upload_fir_taps(taps, C.int(n))) } +func bridgeUploadFIRTaps(taps *C.float, n int) int { + return int(C.gpud_upload_fir_taps(taps, C.int(n))) +} func bridgeLaunchFreqShift(in *C.gpud_float2, out *C.gpud_float2, n int, phaseInc float64, phaseStart float64) int { return int(C.gpud_launch_freq_shift(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart))) } func bridgeLaunchFreqShiftStream(in *C.gpud_float2, out *C.gpud_float2, n int, phaseInc float64, phaseStart float64, stream streamHandle) int { return int(C.gpud_launch_freq_shift_stream(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart), C.gpud_stream_handle(stream))) } -func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int) int { return int(C.gpud_launch_fir(in, out, C.int(n), C.int(numTaps))) } +func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int) int { + return int(C.gpud_launch_fir(in, out, C.int(n), C.int(numTaps))) +} func bridgeLaunchFIRStream(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int, stream streamHandle) int { return int(C.gpud_launch_fir_stream(in, out, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) } func bridgeLaunchFIRv2Stream(in *C.gpud_float2, out *C.gpud_float2, taps *C.float, n int, numTaps int, stream streamHandle) int { return 
int(C.gpud_launch_fir_v2_stream(in, out, taps, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) } -func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) } +func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { + return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) +} func bridgeLaunchDecimateStream(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int, stream streamHandle) int { return int(C.gpud_launch_decimate_stream(in, out, C.int(nOut), C.int(factor), C.gpud_stream_handle(stream))) } -func bridgeLaunchFMDiscrim(in *C.gpud_float2, out *C.float, n int) int { return int(C.gpud_launch_fm_discrim(in, out, C.int(n))) } -func bridgeLaunchAMEnvelope(in *C.gpud_float2, out *C.float, n int) int { return int(C.gpud_launch_am_envelope(in, out, C.int(n))) } +func bridgeLaunchFMDiscrim(in *C.gpud_float2, out *C.float, n int) int { + return int(C.gpud_launch_fm_discrim(in, out, C.int(n))) +} +func bridgeLaunchAMEnvelope(in *C.gpud_float2, out *C.float, n int) int { + return int(C.gpud_launch_am_envelope(in, out, C.int(n))) +} func bridgeLaunchSSBProduct(in *C.gpud_float2, out *C.float, n int, phaseInc float64, phaseStart float64) int { return int(C.gpud_launch_ssb_product(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart))) } + +// bridgeLaunchStreamingPolyphasePrepare is a transitional bridge for the +// legacy single-call prepare path. The stateful native path uses +// bridgeLaunchStreamingPolyphaseStateful. 
+func bridgeLaunchStreamingPolyphasePrepare(inNew *C.gpud_float2, nNew int, historyIn *C.gpud_float2, historyLen int, polyphaseTaps *C.float, polyphaseLen int, decim int, numTaps int, phaseCountIn int, phaseStart float64, phaseInc float64, out *C.gpud_float2, nOut *C.int, phaseCountOut *C.int, phaseEndOut *C.double, historyOut *C.gpud_float2) int { + return int(C.gpud_launch_streaming_polyphase_prepare(inNew, C.int(nNew), historyIn, C.int(historyLen), polyphaseTaps, C.int(polyphaseLen), C.int(decim), C.int(numTaps), C.int(phaseCountIn), C.double(phaseStart), C.double(phaseInc), out, nOut, phaseCountOut, phaseEndOut, historyOut)) +} +func bridgeLaunchStreamingPolyphaseStateful(inNew *C.gpud_float2, nNew int, shiftedNewTmp *C.gpud_float2, polyphaseTaps *C.float, polyphaseLen int, decim int, numTaps int, historyState *C.gpud_float2, historyScratch *C.gpud_float2, historyCap int, historyLenIO *C.int, phaseCountState *C.int, phaseState *C.double, phaseInc float64, out *C.gpud_float2, outCap int, nOut *C.int) int { + return int(C.gpud_launch_streaming_polyphase_stateful(inNew, C.int(nNew), shiftedNewTmp, polyphaseTaps, C.int(polyphaseLen), C.int(decim), C.int(numTaps), historyState, historyScratch, C.int(historyCap), historyLenIO, phaseCountState, phaseState, C.double(phaseInc), out, C.int(outCap), nOut)) +} func bridgeStreamCreate() (streamHandle, int) { var s C.gpud_stream_handle res := int(C.gpud_stream_create(&s)) return streamHandle(s), res } -func bridgeStreamDestroy(stream streamHandle) int { return int(C.gpud_stream_destroy(C.gpud_stream_handle(stream))) } -func bridgeStreamSync(stream streamHandle) int { return int(C.gpud_stream_sync(C.gpud_stream_handle(stream))) } +func bridgeStreamDestroy(stream streamHandle) int { + return int(C.gpud_stream_destroy(C.gpud_stream_handle(stream))) +} +func bridgeStreamSync(stream streamHandle) int { + return int(C.gpud_stream_sync(C.gpud_stream_handle(stream))) +} diff --git a/internal/dsp/decimating_fir.go 
b/internal/dsp/decimating_fir.go new file mode 100644 index 0000000..3cd958f --- /dev/null +++ b/internal/dsp/decimating_fir.go @@ -0,0 +1,95 @@ +package dsp + +// StatefulDecimatingFIRComplex combines FIR filtering and decimation into a +// single stateful stage. This avoids exposing FIR settling/transient output as +// ordinary block-leading samples before decimation. +type StatefulDecimatingFIRComplex struct { + taps []float64 + delayR []float64 + delayI []float64 + factor int + phase int // number of input samples until next output sample (0 => emit now) +} + +func (f *StatefulDecimatingFIRComplex) Phase() int { + if f == nil { + return 0 + } + return f.phase +} + +func (f *StatefulDecimatingFIRComplex) TapsLen() int { + if f == nil { + return 0 + } + return len(f.taps) +} + +func NewStatefulDecimatingFIRComplex(taps []float64, factor int) *StatefulDecimatingFIRComplex { + if factor < 1 { + factor = 1 + } + t := make([]float64, len(taps)) + copy(t, taps) + return &StatefulDecimatingFIRComplex{ + taps: t, + delayR: make([]float64, len(taps)), + delayI: make([]float64, len(taps)), + factor: factor, + phase: 0, + } +} + +func (f *StatefulDecimatingFIRComplex) Reset() { + for i := range f.delayR { + f.delayR[i] = 0 + f.delayI[i] = 0 + } + f.phase = 0 +} + +func (f *StatefulDecimatingFIRComplex) Process(iq []complex64) []complex64 { + if len(iq) == 0 || len(f.taps) == 0 { + return nil + } + if f.factor <= 1 { + out := make([]complex64, len(iq)) + for i := 0; i < len(iq); i++ { + copy(f.delayR[1:], f.delayR[:len(f.taps)-1]) + copy(f.delayI[1:], f.delayI[:len(f.taps)-1]) + f.delayR[0] = float64(real(iq[i])) + f.delayI[0] = float64(imag(iq[i])) + var accR, accI float64 + for k := 0; k < len(f.taps); k++ { + w := f.taps[k] + accR += f.delayR[k] * w + accI += f.delayI[k] * w + } + out[i] = complex(float32(accR), float32(accI)) + } + return out + } + + out := make([]complex64, 0, len(iq)/f.factor+1) + n := len(f.taps) + for i := 0; i < len(iq); i++ { + copy(f.delayR[1:], 
f.delayR[:n-1]) + copy(f.delayI[1:], f.delayI[:n-1]) + f.delayR[0] = float64(real(iq[i])) + f.delayI[0] = float64(imag(iq[i])) + + if f.phase == 0 { + var accR, accI float64 + for k := 0; k < n; k++ { + w := f.taps[k] + accR += f.delayR[k] * w + accI += f.delayI[k] * w + } + out = append(out, complex(float32(accR), float32(accI))) + f.phase = f.factor - 1 + } else { + f.phase-- + } + } + return out +} diff --git a/internal/dsp/decimating_fir_test.go b/internal/dsp/decimating_fir_test.go new file mode 100644 index 0000000..821cb09 --- /dev/null +++ b/internal/dsp/decimating_fir_test.go @@ -0,0 +1,57 @@ +package dsp + +import ( + "math/cmplx" + "testing" +) + +func TestStatefulDecimatingFIRComplexStreamContinuity(t *testing.T) { + taps := LowpassFIR(90000, 512000, 101) + factor := 2 + + input := make([]complex64, 8192) + for i := range input { + input[i] = complex(float32((i%17)-8)/8.0, float32((i%11)-5)/8.0) + } + + one := NewStatefulDecimatingFIRComplex(taps, factor) + whole := one.Process(input) + + chunkedProc := NewStatefulDecimatingFIRComplex(taps, factor) + var chunked []complex64 + for i := 0; i < len(input); i += 733 { + end := i + 733 + if end > len(input) { + end = len(input) + } + chunked = append(chunked, chunkedProc.Process(input[i:end])...) 
+ } + + if len(whole) != len(chunked) { + t.Fatalf("length mismatch whole=%d chunked=%d", len(whole), len(chunked)) + } + for i := range whole { + if cmplx.Abs(complex128(whole[i]-chunked[i])) > 1e-5 { + t.Fatalf("sample %d mismatch whole=%v chunked=%v", i, whole[i], chunked[i]) + } + } +} + +func TestStatefulDecimatingFIRComplexMatchesBlockPipelineLength(t *testing.T) { + taps := LowpassFIR(90000, 512000, 101) + factor := 2 + input := make([]complex64, 48640) + for i := range input { + input[i] = complex(float32((i%13)-6)/8.0, float32((i%7)-3)/8.0) + } + + stateful := NewStatefulDecimatingFIRComplex(taps, factor) + out := stateful.Process(input) + + filtered := ApplyFIR(input, taps) + dec := Decimate(filtered, factor) + + if len(out) != len(dec) { + t.Fatalf("unexpected output len got=%d want=%d", len(out), len(dec)) + } +} diff --git a/internal/recorder/recorder.go b/internal/recorder/recorder.go index 7e473a9..a03b378 100644 --- a/internal/recorder/recorder.go +++ b/internal/recorder/recorder.go @@ -12,6 +12,7 @@ import ( "sdr-wideband-suite/internal/demod/gpudemod" "sdr-wideband-suite/internal/detector" + "sdr-wideband-suite/internal/telemetry" ) type Policy struct { @@ -54,9 +55,10 @@ type Manager struct { streamer *Streamer streamedIDs map[int64]bool // signal IDs that were streamed (skip retroactive recording) streamedMu sync.Mutex + telemetry *telemetry.Collector } -func New(sampleRate int, blockSize int, policy Policy, centerHz float64, decodeCommands map[string]string) *Manager { +func New(sampleRate int, blockSize int, policy Policy, centerHz float64, decodeCommands map[string]string, coll *telemetry.Collector) *Manager { if policy.OutputDir == "" { policy.OutputDir = "data/recordings" } @@ -71,8 +73,9 @@ func New(sampleRate int, blockSize int, policy Policy, centerHz float64, decodeC centerHz: centerHz, decodeCommands: decodeCommands, queue: make(chan detector.Event, 64), - streamer: newStreamer(policy, centerHz), + streamer: newStreamer(policy, 
centerHz, coll), streamedIDs: make(map[int64]bool), + telemetry: coll, } m.initGPUDemod(sampleRate, blockSize) m.workerWG.Add(1) @@ -103,6 +106,13 @@ func (m *Manager) Update(sampleRate int, blockSize int, policy Policy, centerHz if m.streamer != nil { m.streamer.updatePolicy(policy, centerHz) } + if m.telemetry != nil { + m.telemetry.Event("recorder_update", "info", "recorder policy updated", nil, map[string]any{ + "sample_rate": sampleRate, + "block_size": blockSize, + "enabled": policy.Enabled, + }) + } } func (m *Manager) Ingest(t0 time.Time, samples []complex64) { @@ -116,6 +126,9 @@ func (m *Manager) Ingest(t0 time.Time, samples []complex64) { return } ring.Push(t0, samples) + if m.telemetry != nil { + m.telemetry.SetGauge("recorder.ring.push_samples", float64(len(samples)), nil) + } } func (m *Manager) OnEvents(events []detector.Event) { @@ -134,8 +147,14 @@ func (m *Manager) OnEvents(events []detector.Event) { case m.queue <- ev: default: // drop if queue full + if m.telemetry != nil { + m.telemetry.IncCounter("recorder.event_queue.drop", 1, nil) + } } } + if m.telemetry != nil { + m.telemetry.SetGauge("recorder.event_queue.len", float64(len(m.queue)), nil) + } } func (m *Manager) worker() { @@ -357,6 +376,13 @@ func (m *Manager) StreamerRef() *Streamer { return m.streamer } +func (m *Manager) ResetStreams() { + if m == nil || m.streamer == nil { + return + } + m.streamer.ResetStreams() +} + func (m *Manager) RuntimeInfoBySignalID() map[int64]RuntimeSignalInfo { if m == nil || m.streamer == nil { return nil diff --git a/internal/recorder/streamer.go b/internal/recorder/streamer.go index 30d5248..d7fee44 100644 --- a/internal/recorder/streamer.go +++ b/internal/recorder/streamer.go @@ -10,7 +10,9 @@ import ( "math" "os" "path/filepath" + "strconv" "strings" + "sort" "sync" "time" @@ -19,6 +21,7 @@ import ( "sdr-wideband-suite/internal/detector" "sdr-wideband-suite/internal/dsp" "sdr-wideband-suite/internal/logging" + "sdr-wideband-suite/internal/telemetry" ) 
// --------------------------------------------------------------------------- @@ -26,6 +29,7 @@ import ( // --------------------------------------------------------------------------- type streamSession struct { + sessionID string signalID int64 centerHz float64 bwHz float64 @@ -37,11 +41,36 @@ type streamSession struct { playbackMode string stereoState string lastAudioTs time.Time + + debugDumpStart time.Time + debugDumpUntil time.Time + debugDumpBase string + + demodDump []float32 + finalDump []float32 lastAudioL float32 lastAudioR float32 prevAudioL float64 // second-to-last L sample for boundary transient detection lastAudioSet bool + lastDecIQ complex64 + prevDecIQ complex64 + lastDecIQSet bool + + lastExtractIQ complex64 + prevExtractIQ complex64 + lastExtractIQSet bool + + // FM discriminator cross-block bridging: carry the last IQ sample so the + // discriminator can compute the phase step across block boundaries. + lastDiscrimIQ complex64 + lastDiscrimIQSet bool + + lastDemodL float32 + prevDemodL float64 + lastDemodSet bool + snippetSeq uint64 + // listenOnly sessions have no WAV file and no disk I/O. // They exist solely to feed audio to live-listen subscribers. listenOnly bool @@ -60,6 +89,8 @@ type streamSession struct { // --- Persistent DSP state for click-free streaming --- // Overlap-save: tail of previous extracted IQ snippet. + // Currently unused for live demod after removing the extra discriminator + // overlap prepend, but kept in DSP snapshot state for compatibility. overlapIQ []complex64 // De-emphasis IIR state (persists across frames) @@ -96,13 +127,21 @@ type streamSession struct { pilotLPFHi *dsp.StatefulFIRReal // ~21kHz LP for pilot bandpass high pilotLPFLo *dsp.StatefulFIRReal // ~17kHz LP for pilot bandpass low + // WFM 15kHz audio LPF — removes pilot (19kHz), L-R subcarrier (23-53kHz), + // and RDS (57kHz) from the FM discriminator output before resampling. 
+ // Without this, the pilot leaks into the audio as a 19kHz tone (+55dB above + // noise floor) and L-R subcarrier energy causes audible click-like artifacts. + wfmAudioLPF *dsp.StatefulFIRReal + wfmAudioLPFRate int + // Stateful pre-demod anti-alias FIR (eliminates cold-start transients // and avoids per-frame FIR recomputation) - preDemodFIR *dsp.StatefulFIRComplex - preDemodDecim int // cached decimation factor - preDemodRate int // cached snipRate this FIR was built for - preDemodCutoff float64 // cached cutoff - preDemodDecimPhase int // stateful decimation phase (index offset into next frame) + preDemodFIR *dsp.StatefulFIRComplex + preDemodDecimator *dsp.StatefulDecimatingFIRComplex + preDemodDecim int // cached decimation factor + preDemodRate int // cached snipRate this FIR was built for + preDemodCutoff float64 // cached cutoff + preDemodDecimPhase int // retained for backward compatibility in snapshots/debug // AQ-2: De-emphasis config (µs, 0 = disabled) deemphasisUs float64 @@ -146,6 +185,54 @@ const ( resamplerTaps = 32 // taps per polyphase arm — good quality ) +var debugDumpDelay = func() time.Duration { + raw := strings.TrimSpace(os.Getenv("SDR_DEBUG_DUMP_DELAY_SECONDS")) + if raw == "" { + return 5 * time.Second + } + v, err := strconv.Atoi(raw) + if err != nil || v < 0 { + return 5 * time.Second + } + return time.Duration(v) * time.Second +}() + +var debugDumpDuration = func() time.Duration { + raw := strings.TrimSpace(os.Getenv("SDR_DEBUG_DUMP_DURATION_SECONDS")) + if raw == "" { + return 15 * time.Second + } + v, err := strconv.Atoi(raw) + if err != nil || v <= 0 { + return 15 * time.Second + } + return time.Duration(v) * time.Second +}() + +var audioDumpEnabled = func() bool { + raw := strings.TrimSpace(os.Getenv("SDR_DEBUG_AUDIO_DUMP_ENABLED")) + if raw == "" { + return false + } + v, err := strconv.ParseBool(raw) + if err != nil { + return false + } + return v +}() + +var decHeadTrimSamples = func() int { + raw := 
strings.TrimSpace(os.Getenv("SDR_DEC_HEAD_TRIM")) + if raw == "" { + return 0 + } + v, err := strconv.Atoi(raw) + if err != nil || v < 0 { + return 0 + } + return v +}() + // --------------------------------------------------------------------------- // Streamer — manages all active streaming sessions // --------------------------------------------------------------------------- @@ -159,6 +246,7 @@ type streamFeedItem struct { type streamFeedMsg struct { traceID uint64 items []streamFeedItem + enqueuedAt time.Time } type Streamer struct { @@ -178,6 +266,7 @@ type Streamer struct { // pendingListens are subscribers waiting for a matching session. pendingListens map[int64]*pendingListen + telemetry *telemetry.Collector } type pendingListen struct { @@ -187,7 +276,7 @@ type pendingListen struct { ch chan []byte } -func newStreamer(policy Policy, centerHz float64) *Streamer { +func newStreamer(policy Policy, centerHz float64, coll *telemetry.Collector) *Streamer { st := &Streamer{ sessions: make(map[int64]*streamSession), policy: policy, @@ -195,6 +284,7 @@ func newStreamer(policy Policy, centerHz float64) *Streamer { feedCh: make(chan streamFeedMsg, 2), done: make(chan struct{}), pendingListens: make(map[int64]*pendingListen), + telemetry: coll, } go st.worker() return st @@ -282,18 +372,33 @@ func (st *Streamer) FeedSnippets(items []streamFeedItem, traceID uint64) { if (!recEnabled && !hasListeners) || len(items) == 0 { return } + if st.telemetry != nil { + st.telemetry.SetGauge("streamer.feed.queue_len", float64(len(st.feedCh)), nil) + st.telemetry.SetGauge("streamer.pending_listeners", float64(pending), nil) + st.telemetry.Observe("streamer.feed.batch_size", float64(len(items)), nil) + } select { - case st.feedCh <- streamFeedMsg{traceID: traceID, items: items}: + case st.feedCh <- streamFeedMsg{traceID: traceID, items: items, enqueuedAt: time.Now()}: default: st.droppedFeed++ logging.Warn("drop", "feed_drop", "count", st.droppedFeed) + if st.telemetry != nil { + 
st.telemetry.IncCounter("streamer.feed.drop", 1, nil) + st.telemetry.Event("stream_feed_drop", "warn", "feed queue full", nil, map[string]any{ + "trace_id": traceID, + "queue_len": len(st.feedCh), + }) + } } } // processFeed runs in the worker goroutine. func (st *Streamer) processFeed(msg streamFeedMsg) { + procStart := time.Now() + lockStart := time.Now() st.mu.Lock() + lockWait := time.Since(lockStart) recEnabled := st.policy.Enabled && (st.policy.RecordAudio || st.policy.RecordIQ) hasListeners := st.hasListenersLocked() now := time.Now() @@ -301,10 +406,24 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { gap := now.Sub(st.lastProcTS) if gap > 150*time.Millisecond { logging.Warn("gap", "process_gap", "gap_ms", gap.Milliseconds(), "trace", msg.traceID) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.process.gap.count", 1, nil) + st.telemetry.Observe("streamer.process.gap_ms", float64(gap.Milliseconds()), nil) + } } } st.lastProcTS = now defer st.mu.Unlock() + defer func() { + if st.telemetry != nil { + st.telemetry.Observe("streamer.process.total_ms", float64(time.Since(procStart).Microseconds())/1000.0, nil) + st.telemetry.Observe("streamer.lock_wait_ms", float64(lockWait.Microseconds())/1000.0, telemetry.TagsFromPairs("lock", "process")) + } + }() + if st.telemetry != nil { + st.telemetry.Observe("streamer.feed.enqueue_delay_ms", float64(now.Sub(msg.enqueuedAt).Microseconds())/1000.0, nil) + st.telemetry.SetGauge("streamer.sessions.active", float64(len(st.sessions)), nil) + } logging.Debug("trace", "process_feed", "trace", msg.traceID, "items", len(msg.items)) @@ -367,6 +486,9 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { if err != nil { log.Printf("STREAM: open failed signal=%d %.1fMHz: %v", sig.ID, sig.CenterHz/1e6, err) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.session.open_error", 1, telemetry.TagsFromPairs("kind", "recording")) + } continue } st.sessions[sig.ID] = s @@ -378,6 +500,13 @@ func (st 
*Streamer) processFeed(msg streamFeedMsg) { } // Attach any pending listeners st.attachPendingListeners(sess) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.session.open", 1, telemetry.TagsFromPairs("session_id", sess.sessionID, "signal_id", fmt.Sprintf("%d", sig.ID))) + st.telemetry.Event("session_open", "info", "stream session opened", telemetry.TagsFromPairs("session_id", sess.sessionID, "signal_id", fmt.Sprintf("%d", sig.ID)), map[string]any{ + "listen_only": sess.listenOnly, + "demod": sess.demodName, + }) + } } // Update metadata @@ -396,10 +525,17 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { // Demod with persistent state logging.Debug("trace", "demod_start", "trace", msg.traceID, "signal", sess.signalID, "snip_len", len(item.snippet), "snip_rate", item.snipRate) - audio, audioRate := sess.processSnippet(item.snippet, item.snipRate) + audioStart := time.Now() + audio, audioRate := sess.processSnippet(item.snippet, item.snipRate, st.telemetry) + if st.telemetry != nil { + st.telemetry.Observe("streamer.process_snippet_ms", float64(time.Since(audioStart).Microseconds())/1000.0, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + } logging.Debug("trace", "demod_done", "trace", msg.traceID, "signal", sess.signalID, "audio_len", len(audio), "audio_rate", audioRate) if len(audio) == 0 { logging.Warn("gap", "audio_empty", "signal", sess.signalID, "snip_len", len(item.snippet), "snip_rate", item.snipRate) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.audio.empty", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } } if len(audio) > 0 { if sess.wavSamples == 0 && audioRate > 0 { @@ -426,6 +562,10 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { gap := time.Since(sess.lastAudioTs) if gap > 150*time.Millisecond { logging.Warn("gap", "audio_gap", "signal", sess.signalID, "gap_ms", gap.Milliseconds()) + if st.telemetry != nil { + 
st.telemetry.IncCounter("streamer.audio.gap.count", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + st.telemetry.Observe("streamer.audio.gap_ms", float64(gap.Milliseconds()), telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } } } // Transient click detector: finds short impulses (1-3 samples) @@ -452,6 +592,10 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { d2 := math.Abs(2*float64(sess.lastAudioL) - sess.prevAudioL - first) if d2 > 0.15 { logging.Warn("boundary", "boundary_click", "signal", sess.signalID, "d2", d2) + if st.telemetry != nil { + st.telemetry.IncCounter("audio.boundary_click.count", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + st.telemetry.Observe("audio.boundary_click.d2", d2, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } } } @@ -474,6 +618,10 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { } if nClicks > 0 { logging.Warn("boundary", "intra_click", "signal", sess.signalID, "clicks", nClicks, "maxD2", maxD2, "pos", maxD2Pos, "len", nFrames) + if st.telemetry != nil { + st.telemetry.IncCounter("audio.intra_click.count", float64(nClicks), telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + st.telemetry.Observe("audio.intra_click.max_d2", maxD2, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } } // Store last two samples for next frame's boundary check @@ -513,6 +661,13 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { s.audioSubs = oldSubs s.restoreDSPState(oldState) st.sessions[sig.ID] = s + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.session.reopen", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sig.ID))) + st.telemetry.Event("session_reopen", "info", "stream session rotated by max duration", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sig.ID)), map[string]any{ + 
"old_session": sess.sessionID, + "new_session": s.sessionID, + }) + } } } @@ -533,6 +688,13 @@ func (st *Streamer) processFeed(msg streamFeedMsg) { if !sess.listenOnly { closeSession(sess, &st.policy) } + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.session.close", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", id), "session_id", sess.sessionID)) + st.telemetry.Event("session_close", "info", "stream session closed", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", id), "session_id", sess.sessionID), map[string]any{ + "reason": "signal_missing", + "listen_only": sess.listenOnly, + }) + } delete(st.sessions, id) } } @@ -578,8 +740,19 @@ func (st *Streamer) attachPendingListeners(sess *streamSession) { default: } + if audioDumpEnabled { + now := time.Now() + sess.debugDumpStart = now.Add(debugDumpDelay) + sess.debugDumpUntil = sess.debugDumpStart.Add(debugDumpDuration) + sess.debugDumpBase = filepath.Join("debug", fmt.Sprintf("signal-%d-window-%s", sess.signalID, now.Format("20060102-150405"))) + sess.demodDump = nil + sess.finalDump = nil + } log.Printf("STREAM: attached pending listener %d to signal %d (%.1fMHz %s ch=%d)", subID, sess.signalID, sess.centerHz/1e6, sess.demodName, sess.channels) + if audioDumpEnabled { + log.Printf("STREAM: debug dump armed signal=%d start=%s until=%s", sess.signalID, sess.debugDumpStart.Format(time.RFC3339), sess.debugDumpUntil.Format(time.RFC3339)) + } } } } @@ -615,12 +788,18 @@ func (st *Streamer) CloseAll() { if !sess.listenOnly { closeSession(sess, &st.policy) } + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.session.close", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", id), "session_id", sess.sessionID)) + } delete(st.sessions, id) } for _, pl := range st.pendingListens { close(pl.ch) } st.pendingListens = nil + if st.telemetry != nil { + st.telemetry.Event("streamer_close_all", "info", "all stream sessions closed", nil, nil) + } } // ActiveSessions returns the number 
of open streaming sessions. @@ -663,9 +842,23 @@ func (st *Streamer) SubscribeAudio(freq float64, bw float64, mode string) (int64 if bestSess != nil && bestDist < 200000 { bestSess.audioSubs = append(bestSess.audioSubs, audioSub{id: subID, ch: ch}) + if audioDumpEnabled { + now := time.Now() + bestSess.debugDumpStart = now.Add(debugDumpDelay) + bestSess.debugDumpUntil = bestSess.debugDumpStart.Add(debugDumpDuration) + bestSess.debugDumpBase = filepath.Join("debug", fmt.Sprintf("signal-%d-window-%s", bestSess.signalID, now.Format("20060102-150405"))) + bestSess.demodDump = nil + bestSess.finalDump = nil + } info := bestSess.audioInfo() log.Printf("STREAM: subscriber %d attached to signal %d (%.1fMHz %s)", subID, bestSess.signalID, bestSess.centerHz/1e6, bestSess.demodName) + if audioDumpEnabled { + log.Printf("STREAM: debug dump armed signal=%d start=%s until=%s", bestSess.signalID, bestSess.debugDumpStart.Format(time.RFC3339), bestSess.debugDumpUntil.Format(time.RFC3339)) + } + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.listener.attach", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", bestSess.signalID), "session_id", bestSess.sessionID)) + } return subID, ch, info, nil } @@ -679,6 +872,10 @@ func (st *Streamer) SubscribeAudio(freq float64, bw float64, mode string) (int64 info := defaultAudioInfoForMode(mode) log.Printf("STREAM: subscriber %d pending (freq=%.1fMHz)", subID, freq/1e6) log.Printf("LIVEAUDIO MATCH: subscriber=%d pending req=%.3fMHz bw=%.0f mode=%s", subID, freq/1e6, bw, mode) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.listener.pending", 1, nil) + st.telemetry.SetGauge("streamer.pending_listeners", float64(len(st.pendingListens)), nil) + } return subID, ch, info, nil } @@ -690,6 +887,10 @@ func (st *Streamer) UnsubscribeAudio(subID int64) { if pl, ok := st.pendingListens[subID]; ok { close(pl.ch) delete(st.pendingListens, subID) + if st.telemetry != nil { + 
st.telemetry.IncCounter("streamer.listener.unsubscribe", 1, telemetry.TagsFromPairs("kind", "pending")) + st.telemetry.SetGauge("streamer.pending_listeners", float64(len(st.pendingListens)), nil) + } return } @@ -698,6 +899,9 @@ func (st *Streamer) UnsubscribeAudio(subID int64) { if sub.id == subID { close(sub.ch) sess.audioSubs = append(sess.audioSubs[:i], sess.audioSubs[i+1:]...) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.listener.unsubscribe", 1, telemetry.TagsFromPairs("kind", "active", "session_id", sess.sessionID)) + } return } } @@ -711,10 +915,96 @@ func (st *Streamer) UnsubscribeAudio(subID int64) { // processSnippet takes a pre-extracted IQ snippet and demodulates it with // persistent state. Uses stateful FIR + polyphase resampler for exact 48kHz // output with zero transient artifacts. -func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([]float32, int) { +type iqHeadProbeStats struct { + meanMag float64 + minMag float64 + maxStep float64 + p95Step float64 + lowMag int +} + +func probeIQHeadStats(iq []complex64, probeLen int) iqHeadProbeStats { + if probeLen <= 0 || len(iq) == 0 { + return iqHeadProbeStats{} + } + if len(iq) < probeLen { + probeLen = len(iq) + } + stats := iqHeadProbeStats{minMag: math.MaxFloat64} + steps := make([]float64, 0, probeLen) + var sum float64 + for i := 0; i < probeLen; i++ { + v := iq[i] + mag := math.Hypot(float64(real(v)), float64(imag(v))) + sum += mag + if mag < stats.minMag { + stats.minMag = mag + } + if mag < 0.02 { + stats.lowMag++ + } + if i > 0 { + p := iq[i-1] + num := float64(real(p))*float64(imag(v)) - float64(imag(p))*float64(real(v)) + den := float64(real(p))*float64(real(v)) + float64(imag(p))*float64(imag(v)) + step := math.Abs(math.Atan2(num, den)) + steps = append(steps, step) + if step > stats.maxStep { + stats.maxStep = step + } + } + } + stats.meanMag = sum / float64(probeLen) + if len(steps) > 0 { + sorted := append([]float64(nil), steps...) 
+ sort.Float64s(sorted) + idx := int(math.Round(0.95 * float64(len(sorted)-1))) + if idx < 0 { + idx = 0 + } + if idx >= len(sorted) { + idx = len(sorted) - 1 + } + stats.p95Step = sorted[idx] + } + if stats.minMag == math.MaxFloat64 { + stats.minMag = 0 + } + return stats +} + +func (sess *streamSession) processSnippet(snippet []complex64, snipRate int, coll *telemetry.Collector) ([]float32, int) { if len(snippet) == 0 || snipRate <= 0 { return nil, 0 } + baseTags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID) + if coll != nil { + coll.SetGauge("iq.stage.snippet.length", float64(len(snippet)), baseTags) + stats := probeIQHeadStats(snippet, 64) + coll.Observe("iq.snippet.head_mean_mag", stats.meanMag, baseTags) + coll.Observe("iq.snippet.head_min_mag", stats.minMag, baseTags) + coll.Observe("iq.snippet.head_max_step", stats.maxStep, baseTags) + coll.Observe("iq.snippet.head_p95_step", stats.p95Step, baseTags) + coll.SetGauge("iq.snippet.head_low_magnitude_count", float64(stats.lowMag), baseTags) + if sess.lastExtractIQSet { + prevMag := math.Hypot(float64(real(sess.lastExtractIQ)), float64(imag(sess.lastExtractIQ))) + currMag := math.Hypot(float64(real(snippet[0])), float64(imag(snippet[0]))) + deltaMag := math.Abs(currMag - prevMag) + num := float64(real(sess.lastExtractIQ))*float64(imag(snippet[0])) - float64(imag(sess.lastExtractIQ))*float64(real(snippet[0])) + den := float64(real(sess.lastExtractIQ))*float64(real(snippet[0])) + float64(imag(sess.lastExtractIQ))*float64(imag(snippet[0])) + deltaPhase := math.Abs(math.Atan2(num, den)) + d2 := float64(real(snippet[0]-sess.lastExtractIQ))*float64(real(snippet[0]-sess.lastExtractIQ)) + float64(imag(snippet[0]-sess.lastExtractIQ))*float64(imag(snippet[0]-sess.lastExtractIQ)) + coll.Observe("iq.extract.output.boundary.delta_mag", deltaMag, baseTags) + coll.Observe("iq.extract.output.boundary.delta_phase", deltaPhase, baseTags) + 
coll.Observe("iq.extract.output.boundary.d2", d2, baseTags) + coll.Observe("iq.extract.output.boundary.discontinuity_score", deltaMag+deltaPhase, baseTags) + } + } + if len(snippet) > 0 { + sess.prevExtractIQ = sess.lastExtractIQ + sess.lastExtractIQ = snippet[len(snippet)-1] + sess.lastExtractIQSet = true + } isWFMStereo := sess.demodName == "WFM_STEREO" isWFM := sess.demodName == "WFM" || isWFMStereo @@ -731,25 +1021,48 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] return nil, 0 } - // --- FM discriminator overlap: prepend 1 sample from previous frame --- - // The FM discriminator needs iq[i-1] to compute the first output. - // All FIR filtering is now stateful, so no additional overlap is needed. - var fullSnip []complex64 - trimSamples := 0 - _ = trimSamples - if len(sess.overlapIQ) == 1 { - fullSnip = make([]complex64, 1+len(snippet)) - fullSnip[0] = sess.overlapIQ[0] - copy(fullSnip[1:], snippet) - trimSamples = 1 - logging.Debug("discrim", "overlap_applied", "signal", sess.signalID, "snip", len(snippet)) - } else { - fullSnip = snippet - } + // The extra 1-sample discriminator overlap prepend was removed after it was + // shown to shift the downstream decimation phase and create heavy click + // artifacts in steady-state streaming/recording. The upstream extraction path + // and the stateful FIR/decimation stages already provide continuity. 
+ fullSnip := snippet + overlapApplied := false + prevTailValid := false - // Save last sample for next frame's FM discriminator - if len(snippet) > 0 { - sess.overlapIQ = []complex64{snippet[len(snippet)-1]} + if logging.EnabledCategory("prefir") && len(fullSnip) > 0 { + probeN := 64 + if len(fullSnip) < probeN { + probeN = len(fullSnip) + } + minPreMag := math.MaxFloat64 + minPreIdx := 0 + maxPreStep := 0.0 + maxPreStepIdx := 0 + for i := 0; i < probeN; i++ { + v := fullSnip[i] + mag := math.Hypot(float64(real(v)), float64(imag(v))) + if mag < minPreMag { + minPreMag = mag + minPreIdx = i + } + if i > 0 { + p := fullSnip[i-1] + num := float64(real(p))*float64(imag(v)) - float64(imag(p))*float64(real(v)) + den := float64(real(p))*float64(real(v)) + float64(imag(p))*float64(imag(v)) + step := math.Abs(math.Atan2(num, den)) + if step > maxPreStep { + maxPreStep = step + maxPreStepIdx = i - 1 + } + } + } + logging.Debug("prefir", "pre_fir_head_probe", "signal", sess.signalID, "probe_len", probeN, "min_mag", minPreMag, "min_idx", minPreIdx, "max_step", maxPreStep, "max_step_idx", maxPreStepIdx, "snip_len", len(fullSnip)) + if minPreMag < 0.18 { + logging.Warn("prefir", "pre_fir_head_dip", "signal", sess.signalID, "probe_len", probeN, "min_mag", minPreMag, "min_idx", minPreIdx, "max_step", maxPreStep, "max_step_idx", maxPreStepIdx) + } + if maxPreStep > 1.5 { + logging.Warn("prefir", "pre_fir_head_step", "signal", sess.signalID, "probe_len", probeN, "max_step", maxPreStep, "max_step_idx", maxPreStepIdx, "min_mag", minPreMag, "min_idx", minPreIdx) + } } // --- Stateful anti-alias FIR + decimation to demod rate --- @@ -779,29 +1092,242 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] } // Lazy-init or reinit stateful FIR if parameters changed - if sess.preDemodFIR == nil || sess.preDemodRate != snipRate || sess.preDemodCutoff != cutoff { + if sess.preDemodDecimator == nil || sess.preDemodRate != snipRate || sess.preDemodCutoff != cutoff 
|| sess.preDemodDecim != decim1 { taps := dsp.LowpassFIR(cutoff, snipRate, 101) sess.preDemodFIR = dsp.NewStatefulFIRComplex(taps) + sess.preDemodDecimator = dsp.NewStatefulDecimatingFIRComplex(taps, decim1) sess.preDemodRate = snipRate sess.preDemodCutoff = cutoff sess.preDemodDecim = decim1 sess.preDemodDecimPhase = 0 + if coll != nil { + coll.IncCounter("dsp.pre_demod.init", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + coll.Event("prefir_reinit", "info", "pre-demod decimator reinitialized", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID), map[string]any{ + "snip_rate": snipRate, + "cutoff_hz": cutoff, + "decim": decim1, + }) + } } + decimPhaseBefore := sess.preDemodDecimPhase filtered := sess.preDemodFIR.ProcessInto(fullSnip, sess.growIQ(len(fullSnip))) - dec = dsp.DecimateStateful(filtered, decim1, &sess.preDemodDecimPhase) + dec = sess.preDemodDecimator.Process(fullSnip) + sess.preDemodDecimPhase = sess.preDemodDecimator.Phase() + if coll != nil { + coll.Observe("dsp.pre_demod.decimation_factor", float64(decim1), baseTags) + coll.SetGauge("iq.stage.pre_demod.length", float64(len(dec)), baseTags) + decStats := probeIQHeadStats(dec, 64) + coll.Observe("iq.pre_demod.head_mean_mag", decStats.meanMag, baseTags) + coll.Observe("iq.pre_demod.head_min_mag", decStats.minMag, baseTags) + coll.Observe("iq.pre_demod.head_max_step", decStats.maxStep, baseTags) + coll.Observe("iq.pre_demod.head_p95_step", decStats.p95Step, baseTags) + coll.SetGauge("iq.pre_demod.head_low_magnitude_count", float64(decStats.lowMag), baseTags) + } + logging.Debug("boundary", "snippet_path", "signal", sess.signalID, "overlap_applied", overlapApplied, "snip_len", len(snippet), "full_len", len(fullSnip), "filtered_len", len(filtered), "dec_len", len(dec), "decim1", decim1, "phase_before", decimPhaseBefore, "phase_after", sess.preDemodDecimPhase) } else { + logging.Debug("boundary", 
"snippet_path", "signal", sess.signalID, "overlap_applied", overlapApplied, "snip_len", len(snippet), "full_len", len(fullSnip), "filtered_len", len(fullSnip), "dec_len", len(fullSnip), "decim1", decim1, "phase_before", 0, "phase_after", 0) dec = fullSnip } - // --- FM Demod --- - audio := d.Demod(dec, actualDemodRate) + if decHeadTrimSamples > 0 && decHeadTrimSamples < len(dec) { + logging.Warn("boundary", "dec_head_trim_applied", "signal", sess.signalID, "trim", decHeadTrimSamples, "before_len", len(dec)) + dec = dec[decHeadTrimSamples:] + if coll != nil { + coll.IncCounter("dsp.pre_demod.head_trim", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } + } + + if logging.EnabledCategory("boundary") && len(dec) > 0 { + first := dec[0] + if sess.lastDecIQSet { + d2Re := math.Abs(2*float64(real(sess.lastDecIQ)) - float64(real(sess.prevDecIQ)) - float64(real(first))) + d2Im := math.Abs(2*float64(imag(sess.lastDecIQ)) - float64(imag(sess.prevDecIQ)) - float64(imag(first))) + d2Mag := math.Hypot(d2Re, d2Im) + if d2Mag > 0.15 { + logging.Warn("boundary", "dec_iq_boundary", "signal", sess.signalID, "d2", d2Mag) + if coll != nil { + coll.IncCounter("iq.dec.boundary.count", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + coll.Observe("iq.dec.boundary.d2", d2Mag, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } + } + } + + headN := 16 + if len(dec) < headN { + headN = len(dec) + } + tailN := 16 + if len(dec) < tailN { + tailN = len(dec) + } + var headSum, tailSum, minMag, maxMag float64 + minMag = math.MaxFloat64 + for i, v := range dec { + mag := math.Hypot(float64(real(v)), float64(imag(v))) + if mag < minMag { + minMag = mag + } + if mag > maxMag { + maxMag = mag + } + if i < headN { + headSum += mag + } + } + for i := len(dec) - tailN; i < len(dec); i++ { + if i >= 0 { + v := dec[i] + tailSum += math.Hypot(float64(real(v)), float64(imag(v))) + } + } + headAvg 
:= 0.0 + if headN > 0 { + headAvg = headSum / float64(headN) + } + tailAvg := 0.0 + if tailN > 0 { + tailAvg = tailSum / float64(tailN) + } + logging.Debug("boundary", "dec_iq_meter", "signal", sess.signalID, "len", len(dec), "head_avg", headAvg, "tail_avg", tailAvg, "min_mag", minMag, "max_mag", maxMag) + if tailAvg > 0 { + ratio := headAvg / tailAvg + if ratio < 0.75 || ratio > 1.25 { + logging.Warn("boundary", "dec_iq_head_tail_skew", "signal", sess.signalID, "head_avg", headAvg, "tail_avg", tailAvg, "ratio", ratio) + } + if coll != nil { + coll.Observe("iq.dec.head_tail_ratio", ratio, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + } + } + + probeN := 64 + if len(dec) < probeN { + probeN = len(dec) + } + minHeadMag := math.MaxFloat64 + minHeadIdx := 0 + maxHeadStep := 0.0 + maxHeadStepIdx := 0 + for i := 0; i < probeN; i++ { + v := dec[i] + mag := math.Hypot(float64(real(v)), float64(imag(v))) + if mag < minHeadMag { + minHeadMag = mag + minHeadIdx = i + } + if i > 0 { + p := dec[i-1] + num := float64(real(p))*float64(imag(v)) - float64(imag(p))*float64(real(v)) + den := float64(real(p))*float64(real(v)) + float64(imag(p))*float64(imag(v)) + step := math.Abs(math.Atan2(num, den)) + if step > maxHeadStep { + maxHeadStep = step + maxHeadStepIdx = i - 1 + } + } + } + logging.Debug("boundary", "dec_iq_head_probe", "signal", sess.signalID, "probe_len", probeN, "min_mag", minHeadMag, "min_idx", minHeadIdx, "max_step", maxHeadStep, "max_step_idx", maxHeadStepIdx) + if minHeadMag < 0.18 { + logging.Warn("boundary", "dec_iq_head_dip", "signal", sess.signalID, "probe_len", probeN, "min_mag", minHeadMag, "min_idx", minHeadIdx, "max_step", maxHeadStep, "max_step_idx", maxHeadStepIdx) + } + if maxHeadStep > 1.5 { + logging.Warn("boundary", "dec_iq_head_step", "signal", sess.signalID, "probe_len", probeN, "max_step", maxHeadStep, "max_step_idx", maxHeadStepIdx, "min_mag", minHeadMag, "min_idx", minHeadIdx) + } + if coll 
!= nil { + coll.Observe("iq.dec.magnitude.min", minMag, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + coll.Observe("iq.dec.magnitude.max", maxMag, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + coll.Observe("iq.dec.phase_step.max", maxHeadStep, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + } + + if len(dec) >= 2 { + sess.prevDecIQ = dec[len(dec)-2] + sess.lastDecIQ = dec[len(dec)-1] + } else { + sess.prevDecIQ = sess.lastDecIQ + sess.lastDecIQ = dec[0] + } + sess.lastDecIQSet = true + } + + // --- FM/AM/etc Demod --- + // For FM demod (NFM/WFM): bridge the block boundary by prepending the + // previous block's last IQ sample. Without this, the discriminator loses + // the cross-boundary phase step (1 audio sample missing per block) and + // any phase discontinuity at the seam becomes an unsmoothed audio transient. + var audio []float32 + isFMDemod := demodName == "NFM" || demodName == "WFM" + if isFMDemod && sess.lastDiscrimIQSet && len(dec) > 0 { + bridged := make([]complex64, len(dec)+1) + bridged[0] = sess.lastDiscrimIQ + copy(bridged[1:], dec) + audio = d.Demod(bridged, actualDemodRate) + // bridged produced len(dec) audio samples (= len(bridged)-1) + // which is exactly the correct count for the new data + } else { + audio = d.Demod(dec, actualDemodRate) + } + if len(dec) > 0 { + sess.lastDiscrimIQ = dec[len(dec)-1] + sess.lastDiscrimIQSet = true + } if len(audio) == 0 { return nil, 0 } + if coll != nil { + coll.SetGauge("audio.stage.demod.length", float64(len(audio)), baseTags) + probe := 64 + if len(audio) < probe { + probe = len(audio) + } + if probe > 0 { + var headAbs, tailAbs float64 + for i := 0; i < probe; i++ { + headAbs += math.Abs(float64(audio[i])) + tailAbs += math.Abs(float64(audio[len(audio)-probe+i])) + } + coll.Observe("audio.demod.head_mean_abs", headAbs/float64(probe), 
baseTags) + coll.Observe("audio.demod.tail_mean_abs", tailAbs/float64(probe), baseTags) + coll.Observe("audio.demod.edge_delta_abs", math.Abs(float64(audio[0])-float64(audio[len(audio)-1])), baseTags) + } + } + if logging.EnabledCategory("boundary") { + stride := d.Channels() + if stride < 1 { + stride = 1 + } + nFrames := len(audio) / stride + if nFrames > 0 { + first := float64(audio[0]) + if sess.lastDemodSet { + d2 := math.Abs(2*float64(sess.lastDemodL) - sess.prevDemodL - first) + if d2 > 0.15 { + logging.Warn("boundary", "demod_boundary", "signal", sess.signalID, "d2", d2) + if coll != nil { + coll.IncCounter("audio.demod_boundary.count", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + coll.Observe("audio.demod_boundary.d2", d2, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID))) + } + } + } + if nFrames >= 2 { + sess.prevDemodL = float64(audio[(nFrames-2)*stride]) + sess.lastDemodL = audio[(nFrames-1)*stride] + } else { + sess.prevDemodL = float64(sess.lastDemodL) + sess.lastDemodL = audio[0] + } + sess.lastDemodSet = true + } + } + logging.Debug("boundary", "audio_path", "signal", sess.signalID, "demod", demodName, "actual_rate", actualDemodRate, "audio_len", len(audio), "channels", d.Channels(), "overlap_applied", overlapApplied, "prev_tail_valid", prevTailValid) - // --- Trim the 1-sample FM discriminator overlap --- - // TEMP: skip audio trim to test if per-block trimming causes ticks + shouldDump := !sess.debugDumpStart.IsZero() && !sess.debugDumpUntil.IsZero() + if shouldDump { + now := time.Now() + shouldDump = !now.Before(sess.debugDumpStart) && now.Before(sess.debugDumpUntil) + } + if shouldDump { + sess.demodDump = append(sess.demodDump, audio...) 
+ } // --- Stateful stereo decode with conservative lock/hysteresis --- channels := 1 @@ -829,6 +1355,11 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] audio = stereoAudio } else { sess.stereoState = "mono-fallback" + // Apply 15kHz LPF before output: the raw discriminator contains + // the 19kHz pilot (+55dB), L-R subcarrier (23-53kHz), and RDS (57kHz). + // Without filtering, the pilot leaks into audio and subcarrier + // energy produces audible click-like artifacts. + audio = sess.wfmAudioFilter(audio, actualDemodRate) dual := make([]float32, len(audio)*2) for i, s := range audio { dual[i*2] = s @@ -839,6 +1370,9 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] if (prevPlayback != sess.playbackMode || prevStereo != sess.stereoState) && len(sess.audioSubs) > 0 { sendAudioInfo(sess.audioSubs, sess.audioInfo()) } + } else if isWFM { + // Plain WFM (not stereo): also needs 15kHz LPF on discriminator output + audio = sess.wfmAudioFilter(audio, actualDemodRate) } // --- Polyphase resample to exact 48kHz --- @@ -848,6 +1382,12 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] logging.Info("resample", "reset", "mode", "stereo", "rate", actualDemodRate) sess.stereoResampler = dsp.NewStereoResampler(actualDemodRate, streamAudioRate, resamplerTaps) sess.stereoResamplerRate = actualDemodRate + if coll != nil { + coll.Event("resampler_reset", "info", "stereo resampler reset", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID), map[string]any{ + "mode": "stereo", + "rate": actualDemodRate, + }) + } } audio = sess.stereoResampler.Process(audio) } else { @@ -855,10 +1395,19 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] logging.Info("resample", "reset", "mode", "mono", "rate", actualDemodRate) sess.monoResampler = dsp.NewResampler(actualDemodRate, streamAudioRate, resamplerTaps) 
sess.monoResamplerRate = actualDemodRate + if coll != nil { + coll.Event("resampler_reset", "info", "mono resampler reset", telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID), map[string]any{ + "mode": "mono", + "rate": actualDemodRate, + }) + } } audio = sess.monoResampler.Process(audio) } } + if coll != nil { + coll.SetGauge("audio.stage.output.length", float64(len(audio)), telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + } // --- De-emphasis (configurable: 50µs Europe, 75µs US/Japan, 0=disabled) --- if isWFM && sess.deemphasisUs > 0 && streamAudioRate > 0 { @@ -890,6 +1439,24 @@ func (sess *streamSession) processSnippet(snippet []complex64, snipRate int) ([] } } + if shouldDump { + sess.finalDump = append(sess.finalDump, audio...) + } else if !sess.debugDumpUntil.IsZero() && time.Now().After(sess.debugDumpUntil) && sess.debugDumpBase != "" { + _ = os.MkdirAll(filepath.Dir(sess.debugDumpBase), 0o755) + if len(sess.demodDump) > 0 { + _ = writeWAVFile(sess.debugDumpBase+"-demod.wav", sess.demodDump, actualDemodRate, d.Channels()) + } + if len(sess.finalDump) > 0 { + _ = writeWAVFile(sess.debugDumpBase+"-final.wav", sess.finalDump, streamAudioRate, channels) + } + logging.Warn("boundary", "debug_audio_dump_window", "signal", sess.signalID, "base", sess.debugDumpBase) + sess.debugDumpBase = "" + sess.demodDump = nil + sess.finalDump = nil + sess.debugDumpStart = time.Time{} + sess.debugDumpUntil = time.Time{} + } + return audio, streamAudioRate } @@ -908,6 +1475,20 @@ func pllCoefficients(loopBW, damping float64, sampleRate int) (float64, float64) return alpha, beta } +// wfmAudioFilter applies a stateful 15kHz lowpass to WFM discriminator output. +// Removes the 19kHz stereo pilot, L-R DSB-SC subcarrier (23-53kHz), and RDS (57kHz) +// that would otherwise leak into the audio output as clicks and tonal artifacts. 
+func (sess *streamSession) wfmAudioFilter(audio []float32, sampleRate int) []float32 { + if len(audio) == 0 || sampleRate <= 0 { + return audio + } + if sess.wfmAudioLPF == nil || sess.wfmAudioLPFRate != sampleRate { + sess.wfmAudioLPF = dsp.NewStatefulFIRReal(dsp.LowpassFIR(15000, sampleRate, 101)) + sess.wfmAudioLPFRate = sampleRate + } + return sess.wfmAudioLPF.Process(audio) +} + // stereoDecodeStateful: pilot-locked 38kHz oscillator for L-R extraction. // Uses persistent FIR filter state across frames for click-free stereo. // Reuses session scratch buffers to minimize allocations. @@ -1055,10 +1636,13 @@ type dspStateSnapshot struct { pilotLPFHi *dsp.StatefulFIRReal pilotLPFLo *dsp.StatefulFIRReal preDemodFIR *dsp.StatefulFIRComplex + preDemodDecimator *dsp.StatefulDecimatingFIRComplex preDemodDecim int preDemodRate int preDemodCutoff float64 preDemodDecimPhase int + wfmAudioLPF *dsp.StatefulFIRReal + wfmAudioLPFRate int } func (sess *streamSession) captureDSPState() dspStateSnapshot { @@ -1087,10 +1671,13 @@ func (sess *streamSession) captureDSPState() dspStateSnapshot { pilotLPFHi: sess.pilotLPFHi, pilotLPFLo: sess.pilotLPFLo, preDemodFIR: sess.preDemodFIR, + preDemodDecimator: sess.preDemodDecimator, preDemodDecim: sess.preDemodDecim, preDemodRate: sess.preDemodRate, preDemodCutoff: sess.preDemodCutoff, preDemodDecimPhase: sess.preDemodDecimPhase, + wfmAudioLPF: sess.wfmAudioLPF, + wfmAudioLPFRate: sess.wfmAudioLPFRate, } } @@ -1119,10 +1706,13 @@ func (sess *streamSession) restoreDSPState(s dspStateSnapshot) { sess.pilotLPFHi = s.pilotLPFHi sess.pilotLPFLo = s.pilotLPFLo sess.preDemodFIR = s.preDemodFIR + sess.preDemodDecimator = s.preDemodDecimator sess.preDemodDecim = s.preDemodDecim sess.preDemodRate = s.preDemodRate sess.preDemodCutoff = s.preDemodCutoff sess.preDemodDecimPhase = s.preDemodDecimPhase + sess.wfmAudioLPF = s.wfmAudioLPF + sess.wfmAudioLPFRate = s.wfmAudioLPFRate } // 
--------------------------------------------------------------------------- @@ -1157,6 +1747,7 @@ func (st *Streamer) openRecordingSession(sig *detector.Signal, now time.Time) (* playbackMode, stereoState := initialPlaybackState(demodName) sess := &streamSession{ + sessionID: fmt.Sprintf("%d-%d-r", sig.ID, now.UnixMilli()), signalID: sig.ID, centerHz: sig.CenterHz, bwHz: sig.BWHz, @@ -1201,6 +1792,7 @@ func (st *Streamer) openListenSession(sig *detector.Signal, now time.Time) *stre playbackMode, stereoState := initialPlaybackState(demodName) sess := &streamSession{ + sessionID: fmt.Sprintf("%d-%d-l", sig.ID, now.UnixMilli()), signalID: sig.ID, centerHz: sig.CenterHz, bwHz: sig.BWHz, @@ -1405,10 +1997,16 @@ func (st *Streamer) fanoutPCM(sess *streamSession, pcm []byte, pcmLen int) { default: st.droppedPCM++ logging.Warn("drop", "pcm_drop", "count", st.droppedPCM) + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.pcm.drop", 1, telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + } } alive = append(alive, sub) } sess.audioSubs = alive + if st.telemetry != nil { + st.telemetry.SetGauge("streamer.subscribers.count", float64(len(alive)), telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", sess.signalID), "session_id", sess.sessionID)) + } } func (st *Streamer) classAllowed(cls *classifier.Classification) bool { @@ -1433,6 +2031,15 @@ var ErrNoSession = errors.New("no active or pending session for this frequency") // WAV header helpers // --------------------------------------------------------------------------- +func writeWAVFile(path string, audio []float32, sampleRate int, channels int) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + return writeWAVTo(f, audio, sampleRate, channels) +} + func writeStreamWAVHeader(f *os.File, sampleRate int, channels int) error { if channels <= 0 { channels = 1 @@ -1483,3 +2090,22 @@ func fixStreamWAVHeader(f *os.File, 
totalSamples int64, sampleRate int, channels } _, _ = f.Write(buf[:]) } + +// ResetStreams forces all active streaming sessions to discard their FIR states and decimation phases. +// This is used when the upstream DSP drops samples, creating a hard break in phase continuity. +func (st *Streamer) ResetStreams() { + st.mu.Lock() + defer st.mu.Unlock() + if st.telemetry != nil { + st.telemetry.IncCounter("streamer.reset.count", 1, nil) + st.telemetry.Event("stream_reset", "warn", "stream DSP state reset", nil, map[string]any{"sessions": len(st.sessions)}) + } + for _, sess := range st.sessions { + sess.preDemodFIR = nil + sess.preDemodDecimator = nil + sess.preDemodDecimPhase = 0 + sess.stereoResampler = nil + sess.monoResampler = nil + sess.wfmAudioLPF = nil + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go new file mode 100644 index 0000000..e57a6a1 --- /dev/null +++ b/internal/telemetry/telemetry.go @@ -0,0 +1,966 @@ +package telemetry + +import ( + "bufio" + "encoding/json" + "errors" + "fmt" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" +) + +type Config struct { + Enabled bool `json:"enabled"` + HeavyEnabled bool `json:"heavy_enabled"` + HeavySampleEvery int `json:"heavy_sample_every"` + MetricSampleEvery int `json:"metric_sample_every"` + MetricHistoryMax int `json:"metric_history_max"` + EventHistoryMax int `json:"event_history_max"` + Retention time.Duration `json:"retention"` + PersistEnabled bool `json:"persist_enabled"` + PersistDir string `json:"persist_dir"` + RotateMB int `json:"rotate_mb"` + KeepFiles int `json:"keep_files"` +} + +func DefaultConfig() Config { + return Config{ + Enabled: true, + HeavyEnabled: false, + HeavySampleEvery: 12, + MetricSampleEvery: 2, + MetricHistoryMax: 12_000, + EventHistoryMax: 4_000, + Retention: 15 * time.Minute, + PersistEnabled: false, + PersistDir: "debug/telemetry", + RotateMB: 16, + KeepFiles: 8, + } +} + +type Tags map[string]string 
// MetricPoint is a single sampled metric observation kept in the in-memory
// history ring and optionally persisted as a JSONL line.
type MetricPoint struct {
	Timestamp time.Time `json:"ts"`
	Name      string    `json:"name"`
	Type      string    `json:"type"` // "counter", "gauge", or "distribution"
	Value     float64   `json:"value"`
	Tags      Tags      `json:"tags,omitempty"`
}

// Event is a structured, timestamped occurrence (reset, boundary hit, etc.)
// with a process-local monotonically increasing ID.
type Event struct {
	ID        uint64         `json:"id"`
	Timestamp time.Time      `json:"ts"`
	Name      string         `json:"name"`
	Level     string         `json:"level"` // lowercased; defaults to "info"
	Message   string         `json:"message,omitempty"`
	Tags      Tags           `json:"tags,omitempty"`
	Fields    map[string]any `json:"fields,omitempty"`
}

// SeriesValue is the current value of one counter or gauge series
// (name + tags) as exposed in a LiveSnapshot.
type SeriesValue struct {
	Name  string  `json:"name"`
	Value float64 `json:"value"`
	Tags  Tags    `json:"tags,omitempty"`
}

// DistValue is the aggregate view of one distribution series:
// lifetime count/min/max/mean/last plus a p95 estimated from a
// bounded ring of recent samples (see distMetric).
type DistValue struct {
	Name  string  `json:"name"`
	Count int64   `json:"count"`
	Min   float64 `json:"min"`
	Max   float64 `json:"max"`
	Mean  float64 `json:"mean"`
	Last  float64 `json:"last"`
	P95   float64 `json:"p95"`
	Tags  Tags    `json:"tags,omitempty"`
}

// LiveSnapshot is a point-in-time copy of all collector state, safe to
// serialize after the method returns (all nested maps/slices are cloned).
type LiveSnapshot struct {
	Now           time.Time      `json:"now"`
	StartedAt     time.Time      `json:"started_at"`
	UptimeMs      int64          `json:"uptime_ms"`
	Config        Config         `json:"config"`
	Counters      []SeriesValue  `json:"counters"`
	Gauges        []SeriesValue  `json:"gauges"`
	Distributions []DistValue    `json:"distributions"`
	RecentEvents  []Event        `json:"recent_events"`
	Status        map[string]any `json:"status,omitempty"`
}

// Query filters metric/event history lookups. Zero-valued fields are
// ignored; Tags entries must all match (logical AND).
type Query struct {
	From             time.Time
	To               time.Time
	Limit            int // capped/normalized by normalizeQuery
	Name             string
	NamePrefix       string
	Level            string // events only; case-insensitive match
	Tags             Tags
	IncludePersisted bool // also scan on-disk JSONL files
}

// collectorMetric holds the current value for one counter/gauge series.
type collectorMetric struct {
	name  string
	tags  Tags
	value float64
}

// distMetric holds lifetime aggregates plus a fixed-size (64 sample)
// ring buffer used to estimate p95 over recent observations.
type distMetric struct {
	name    string
	tags    Tags
	count   int64
	sum     float64
	min     float64
	max     float64
	last    float64
	samples []float64 // ring buffer, len 64
	next    int       // next write index into samples
	full    bool      // true once the ring has wrapped at least once
}

// persistedEnvelope is one JSONL line on disk: exactly one of Metric or
// Event is non-nil, discriminated by Kind ("metric" or "event").
type persistedEnvelope struct {
	Kind   string       `json:"kind"`
	Metric *MetricPoint `json:"metric,omitempty"`
	Event  *Event       `json:"event,omitempty"`
}

// Collector aggregates counters, gauges, distributions, and events under a
// single RWMutex, keeps bounded in-memory history, and optionally appends
// every stored point/event to a rotating JSONL writer.
type Collector struct {
	mu         sync.RWMutex
	cfg        Config
	startedAt  time.Time
	counterSeq uint64 // metric sampling sequence (atomic)
	heavySeq   uint64 // heavy-path sampling sequence (atomic)
	eventSeq   uint64 // event ID sequence (atomic)

	counters       map[string]*collectorMetric // keyed by metricKey(name, tags)
	gauges         map[string]*collectorMetric
	dists          map[string]*distMetric
	metricsHistory []MetricPoint
	events         []Event
	status         map[string]any

	writer *jsonlWriter // nil unless persistence is enabled
}

// New builds a Collector from a sanitized copy of cfg, opening the JSONL
// persistence writer when cfg.PersistEnabled is set.
func New(cfg Config) (*Collector, error) {
	cfg = sanitizeConfig(cfg)
	c := &Collector{
		cfg:            cfg,
		startedAt:      time.Now().UTC(),
		counters:       map[string]*collectorMetric{},
		gauges:         map[string]*collectorMetric{},
		dists:          map[string]*distMetric{},
		metricsHistory: make([]MetricPoint, 0, cfg.MetricHistoryMax),
		events:         make([]Event, 0, cfg.EventHistoryMax),
		status:         map[string]any{},
	}
	if cfg.PersistEnabled {
		writer, err := newJSONLWriter(cfg)
		if err != nil {
			return nil, err
		}
		c.writer = writer
	}
	return c, nil
}

// Close detaches and closes the persistence writer, if any.
// Safe to call on a nil receiver.
func (c *Collector) Close() error {
	if c == nil {
		return nil
	}
	c.mu.Lock()
	writer := c.writer
	c.writer = nil
	c.mu.Unlock()
	if writer != nil {
		return writer.Close()
	}
	return nil
}

// Configure swaps in a new (sanitized) config at runtime. A new JSONL
// writer is opened before the lock is taken; the previous writer is
// closed after the swap so in-flight writes are never left dangling.
func (c *Collector) Configure(cfg Config) error {
	if c == nil {
		return nil
	}
	cfg = sanitizeConfig(cfg)
	var writer *jsonlWriter
	var err error
	if cfg.PersistEnabled {
		writer, err = newJSONLWriter(cfg)
		if err != nil {
			return err
		}
	}
	c.mu.Lock()
	old := c.writer
	c.cfg = cfg
	c.writer = writer
	// Re-apply history caps/retention immediately under the new config.
	c.trimLocked(time.Now().UTC())
	c.mu.Unlock()
	if old != nil {
		_ = old.Close()
	}
	return nil
}

// Config returns a copy of the current configuration.
func (c *Collector) Config() Config {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.cfg
}

// Enabled reports whether the collector is accepting data.
// Safe to call on a nil receiver (reports false).
func (c *Collector) Enabled() bool {
	if c == nil {
		return false
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.cfg.Enabled
}

// ShouldSampleHeavy gates expensive instrumentation: it returns true on
// every HeavySampleEvery-th call when heavy sampling is enabled.
func (c *Collector) ShouldSampleHeavy() bool {
	if c == nil {
		return false
	}
	c.mu.RLock()
	cfg := c.cfg
	c.mu.RUnlock()
	if !cfg.Enabled || !cfg.HeavyEnabled {
		return false
	}
	n := cfg.HeavySampleEvery
	if n <= 1 {
		return true
	}
	seq := atomic.AddUint64(&c.heavySeq, 1)
	return seq%uint64(n) == 0
}

// SetStatus stores an arbitrary key/value in the snapshot status map.
func (c *Collector) SetStatus(key string, value any) {
	if c == nil {
		return
	}
	c.mu.Lock()
	c.status[key] = value
	c.mu.Unlock()
}

// IncCounter adds delta to the named counter series.
func (c *Collector) IncCounter(name string, delta float64, tags Tags) {
	c.recordMetric("counter", name, delta, tags, true)
}

// SetGauge sets the named gauge series to value.
func (c *Collector) SetGauge(name string, value float64, tags Tags) {
	c.recordMetric("gauge", name, value, tags, false)
}

// Observe records one sample into the named distribution series.
func (c *Collector) Observe(name string, value float64, tags Tags) {
	c.recordMetric("distribution", name, value, tags, false)
}

// Event appends a structured event to history (level lowercased, empty
// level mapped to "info") and persists it when a writer is attached.
// The JSONL write happens outside the collector lock.
func (c *Collector) Event(name string, level string, message string, tags Tags, fields map[string]any) {
	if c == nil {
		return
	}
	now := time.Now().UTC()
	c.mu.Lock()
	if !c.cfg.Enabled {
		c.mu.Unlock()
		return
	}
	ev := Event{
		ID:        atomic.AddUint64(&c.eventSeq, 1),
		Timestamp: now,
		Name:      name,
		Level:     strings.TrimSpace(strings.ToLower(level)),
		Message:   message,
		Tags:      cloneTags(tags),
		Fields:    cloneFields(fields),
	}
	if ev.Level == "" {
		ev.Level = "info"
	}
	c.events = append(c.events, ev)
	c.trimLocked(now)
	writer := c.writer
	c.mu.Unlock()
	if writer != nil {
		_ = writer.Write(persistedEnvelope{Kind: "event", Event: &ev})
	}
}

// recordMetric is the shared backend for IncCounter/SetGauge/Observe.
// It updates the live series under the lock, then appends a MetricPoint
// to history subject to sampling: counters are always stored, boundary
// diagnostics (iq.extract.*.boundary.*) are force-stored, everything else
// is kept every MetricSampleEvery-th call. Persistence happens outside
// the lock.
func (c *Collector) recordMetric(kind string, name string, value float64, tags Tags, add bool) {
	if c == nil || strings.TrimSpace(name) == "" {
		return
	}
	now := time.Now().UTC()
	c.mu.Lock()
	if !c.cfg.Enabled {
		c.mu.Unlock()
		return
	}
	key := metricKey(name, tags)
	switch kind {
	case "counter":
		m := c.counters[key]
		if m == nil {
			m = &collectorMetric{name: name, tags: cloneTags(tags)}
			c.counters[key] = m
		}
		if add {
			m.value += value
		} else {
			m.value = value
		}
	case "gauge":
		m := c.gauges[key]
		if m == nil {
			m = &collectorMetric{name: name, tags: cloneTags(tags)}
			c.gauges[key] = m
		}
		m.value = value
	case "distribution":
		d := c.dists[key]
		if d == nil {
			d = &distMetric{
				name:    name,
				tags:    cloneTags(tags),
				min:     value,
				max:     value,
				samples: make([]float64, 64),
			}
			c.dists[key] = d
		}
		d.count++
		d.sum += value
		d.last = value
		if d.count == 1 || value < d.min {
			d.min = value
		}
		if d.count == 1 || value > d.max {
			d.max = value
		}
		if len(d.samples) > 0 {
			// Ring-buffer write for the p95 window.
			d.samples[d.next] = value
			d.next++
			if d.next >= len(d.samples) {
				d.next = 0
				d.full = true
			}
		}
	}
	sampleN := c.cfg.MetricSampleEvery
	seq := atomic.AddUint64(&c.counterSeq, 1)
	forceStore := strings.HasPrefix(name, "iq.extract.raw.boundary.") || strings.HasPrefix(name, "iq.extract.trimmed.boundary.")
	shouldStore := forceStore || sampleN <= 1 || seq%uint64(sampleN) == 0 || kind == "counter"
	var mp MetricPoint
	if shouldStore {
		mp = MetricPoint{
			Timestamp: now,
			Name:      name,
			Type:      kind,
			Value:     value,
			Tags:      cloneTags(tags),
		}
		c.metricsHistory = append(c.metricsHistory, mp)
	}
	c.trimLocked(now)
	writer := c.writer
	c.mu.Unlock()

	if writer != nil && shouldStore {
		_ = writer.Write(persistedEnvelope{Kind: "metric", Metric: &mp})
	}
}

// LiveSnapshot returns a deep copy of current collector state, with the
// series slices sorted by name (sorting happens after the read lock is
// released). RecentEvents carries at most the 40 newest events; the
// window size is derived from the slice capacity set in the make() above.
func (c *Collector) LiveSnapshot() LiveSnapshot {
	now := time.Now().UTC()
	c.mu.RLock()
	cfg := c.cfg
	out := LiveSnapshot{
		Now:           now,
		StartedAt:     c.startedAt,
		UptimeMs:      now.Sub(c.startedAt).Milliseconds(),
		Config:        cfg,
		Counters:      make([]SeriesValue, 0, len(c.counters)),
		Gauges:        make([]SeriesValue, 0, len(c.gauges)),
		Distributions: make([]DistValue, 0, len(c.dists)),
		RecentEvents:  make([]Event, 0, min(40, len(c.events))),
		Status:        cloneFields(c.status),
	}
	for _, m := range c.counters {
		out.Counters = append(out.Counters, SeriesValue{Name: m.name, Value: m.value, Tags: cloneTags(m.tags)})
	}
	for _, m := range c.gauges {
		out.Gauges = append(out.Gauges, SeriesValue{Name: m.name, Value: m.value, Tags: cloneTags(m.tags)})
	}
	for _, d := range c.dists {
		mean := 0.0
		if d.count > 0 {
			mean = d.sum / float64(d.count)
		}
		out.Distributions = append(out.Distributions, DistValue{
			Name:  d.name,
			Count: d.count,
			Min:   d.min,
			Max:   d.max,
			Mean:  mean,
			Last:  d.last,
			P95:   p95FromDist(d),
			Tags:  cloneTags(d.tags),
		})
	}
	// NOTE(review): the window size is read back via cap(out.RecentEvents);
	// this works because the make() above sizes capacity to min(40, len).
	start := len(c.events) - cap(out.RecentEvents)
	if start < 0 {
		start = 0
	}
	for _, ev := range c.events[start:] {
		out.RecentEvents = append(out.RecentEvents, copyEvent(ev))
	}
	c.mu.RUnlock()
	sort.Slice(out.Counters, func(i, j int) bool { return out.Counters[i].Name < out.Counters[j].Name })
	sort.Slice(out.Gauges, func(i, j int) bool { return out.Gauges[i].Name < out.Gauges[j].Name })
	sort.Slice(out.Distributions, func(i, j int) bool { return out.Distributions[i].Name < out.Distributions[j].Name })
	return out
}

// QueryMetrics returns matching in-memory metric points, optionally merged
// with persisted JSONL history, sorted by timestamp with the newest
// q.Limit entries retained. Missing persistence files are not an error.
func (c *Collector) QueryMetrics(q Query) ([]MetricPoint, error) {
	if c == nil {
		return nil, nil
	}
	q = normalizeQuery(q)
	c.mu.RLock()
	items := make([]MetricPoint, 0, len(c.metricsHistory))
	for _, m := range c.metricsHistory {
		if metricMatch(m, q) {
			items = append(items, copyMetric(m))
		}
	}
	cfg := c.cfg
	c.mu.RUnlock()
	if q.IncludePersisted && cfg.PersistEnabled {
		persisted, err := readPersistedMetrics(cfg, q)
		if err != nil && !errors.Is(err, os.ErrNotExist) {
			return nil, err
		}
		items = append(items, persisted...)
	}
	sort.Slice(items, func(i, j int) bool {
		return items[i].Timestamp.Before(items[j].Timestamp)
	})
	if q.Limit > 0 && len(items) > q.Limit {
		items = items[len(items)-q.Limit:]
	}
	return items, nil
}

// QueryEvents is the event-side counterpart of QueryMetrics: same merge,
// sort, and tail-limit semantics.
func (c *Collector) QueryEvents(q Query) ([]Event, error) {
	if c == nil {
		return nil, nil
	}
	q = normalizeQuery(q)
	c.mu.RLock()
	items := make([]Event, 0, len(c.events))
	for _, ev := range c.events {
		if eventMatch(ev, q) {
			items = append(items, copyEvent(ev))
		}
	}
	cfg := c.cfg
	c.mu.RUnlock()
	if q.IncludePersisted && cfg.PersistEnabled {
		persisted, err := readPersistedEvents(cfg, q)
		if err != nil && !errors.Is(err, os.ErrNotExist) {
			return nil, err
		}
		items = append(items, persisted...)
	}
	sort.Slice(items, func(i, j int) bool {
		return items[i].Timestamp.Before(items[j].Timestamp)
	})
	if q.Limit > 0 && len(items) > q.Limit {
		items = items[len(items)-q.Limit:]
	}
	return items, nil
}

// trimLocked enforces the history caps and the time-based retention
// window. Caller must hold c.mu. Trims reallocate the slices so the old
// backing arrays (and the entries they pin) can be collected.
func (c *Collector) trimLocked(now time.Time) {
	if c.cfg.MetricHistoryMax > 0 && len(c.metricsHistory) > c.cfg.MetricHistoryMax {
		c.metricsHistory = append([]MetricPoint(nil), c.metricsHistory[len(c.metricsHistory)-c.cfg.MetricHistoryMax:]...)
	}
	if c.cfg.EventHistoryMax > 0 && len(c.events) > c.cfg.EventHistoryMax {
		c.events = append([]Event(nil), c.events[len(c.events)-c.cfg.EventHistoryMax:]...)
	}
	ret := c.cfg.Retention
	if ret <= 0 {
		return
	}
	cut := now.Add(-ret)
	mStart := 0
	for mStart < len(c.metricsHistory) && c.metricsHistory[mStart].Timestamp.Before(cut) {
		mStart++
	}
	if mStart > 0 {
		c.metricsHistory = append([]MetricPoint(nil), c.metricsHistory[mStart:]...)
	}
	eStart := 0
	for eStart < len(c.events) && c.events[eStart].Timestamp.Before(cut) {
		eStart++
	}
	if eStart > 0 {
		c.events = append([]Event(nil), c.events[eStart:]...)
	}
}

// sanitizeConfig backfills non-positive / empty fields from DefaultConfig
// so the collector never runs with degenerate limits.
func sanitizeConfig(cfg Config) Config {
	def := DefaultConfig()
	if cfg.HeavySampleEvery <= 0 {
		cfg.HeavySampleEvery = def.HeavySampleEvery
	}
	if cfg.MetricSampleEvery <= 0 {
		cfg.MetricSampleEvery = def.MetricSampleEvery
	}
	if cfg.MetricHistoryMax <= 0 {
		cfg.MetricHistoryMax = def.MetricHistoryMax
	}
	if cfg.EventHistoryMax <= 0 {
		cfg.EventHistoryMax = def.EventHistoryMax
	}
	if cfg.Retention <= 0 {
		cfg.Retention = def.Retention
	}
	if strings.TrimSpace(cfg.PersistDir) == "" {
		cfg.PersistDir = def.PersistDir
	}
	if cfg.RotateMB <= 0 {
		cfg.RotateMB = def.RotateMB
	}
	if cfg.KeepFiles <= 0 {
		cfg.KeepFiles = def.KeepFiles
	}
	return cfg
}

// normalizeQuery clamps Limit to (0, 5000] with a default of 500 and
// ensures Tags is non-nil so match loops can range over it.
func normalizeQuery(q Query) Query {
	if q.Limit <= 0 || q.Limit > 5000 {
		q.Limit = 500
	}
	if q.Tags == nil {
		q.Tags = Tags{}
	}
	return q
}

// metricMatch reports whether m satisfies every set filter in q.
func metricMatch(m MetricPoint, q Query) bool {
	if !q.From.IsZero() && m.Timestamp.Before(q.From) {
		return false
	}
	if !q.To.IsZero() && m.Timestamp.After(q.To) {
		return false
	}
	if q.Name != "" && m.Name != q.Name {
		return false
	}
	if q.NamePrefix != "" && !strings.HasPrefix(m.Name, q.NamePrefix) {
		return false
	}
	for k, v := range q.Tags {
		if m.Tags[k] != v {
			return false
		}
	}
	return true
}

// eventMatch reports whether ev satisfies every set filter in q,
// including the case-insensitive level match.
func eventMatch(ev Event, q Query) bool {
	if !q.From.IsZero() && ev.Timestamp.Before(q.From) {
		return false
	}
	if !q.To.IsZero() && ev.Timestamp.After(q.To) {
		return false
	}
	if q.Name != "" && ev.Name != q.Name {
		return false
	}
	if q.NamePrefix != "" && !strings.HasPrefix(ev.Name, q.NamePrefix) {
		return false
	}
	if q.Level != "" && !strings.EqualFold(q.Level, ev.Level) {
		return false
	}
	for k, v := range q.Tags {
		if ev.Tags[k] != v {
			return false
		}
	}
	return true
}

// metricKey builds the series map key: the name plus "|k=v" pairs with
// tag keys sorted, so the same tag set always yields the same key.
func metricKey(name string, tags Tags) string {
	if len(tags) == 0 {
		return name
	}
	keys := make([]string, 0, len(tags))
	for k := range tags {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	var b strings.Builder
	b.Grow(len(name) + len(keys)*16)
	b.WriteString(name)
	for _, k := range keys {
		b.WriteString("|")
		b.WriteString(k)
		b.WriteString("=")
		b.WriteString(tags[k])
	}
	return b.String()
}

// cloneTags returns a shallow copy of tags, or nil for an empty map.
func cloneTags(tags Tags) Tags {
	if len(tags) == 0 {
		return nil
	}
	out := make(Tags, len(tags))
	for k, v := range tags {
		out[k] = v
	}
	return out
}

// cloneFields returns a shallow copy of fields, or nil for an empty map.
// Values themselves are not deep-copied.
func cloneFields(fields map[string]any) map[string]any {
	if len(fields) == 0 {
		return nil
	}
	out := make(map[string]any, len(fields))
	for k, v := range fields {
		out[k] = v
	}
	return out
}

// copyMetric returns m with its Tags map cloned.
func copyMetric(m MetricPoint) MetricPoint {
	return MetricPoint{
		Timestamp: m.Timestamp,
		Name:      m.Name,
		Type:      m.Type,
		Value:     m.Value,
		Tags:      cloneTags(m.Tags),
	}
}

// copyEvent returns ev with its Tags and Fields maps cloned.
func copyEvent(ev Event) Event {
	return Event{
		ID:        ev.ID,
		Timestamp: ev.Timestamp,
		Name:      ev.Name,
		Level:     ev.Level,
		Message:   ev.Message,
		Tags:      cloneTags(ev.Tags),
		Fields:    cloneFields(ev.Fields),
	}
}

// p95FromDist estimates the 95th percentile from the ring buffer of
// recent samples (floor index over the sorted window), falling back to
// the last value when the window is empty.
func p95FromDist(d *distMetric) float64 {
	if d == nil || d.count == 0 {
		return 0
	}
	n := d.next
	if d.full {
		n = len(d.samples)
	}
	if n <= 0 {
		return d.last
	}
	buf := make([]float64, n)
	copy(buf, d.samples[:n])
	sort.Float64s(buf)
	idx := int(float64(n-1) * 0.95)
	if idx < 0 {
		idx = 0
	}
	if idx >= n {
		idx = n - 1
	}
	return buf[idx]
}

// jsonlWriter appends persistedEnvelope lines to size-rotated JSONL files
// under one directory, pruning old files down to cfg.KeepFiles.
type jsonlWriter struct {
	cfg         Config
	mu          sync.Mutex
	dir         string
	f           *os.File
	w           *bufio.Writer
	currentPath string
	currentSize int64
	seq         int64 // per-writer rotation counter embedded in filenames
}

// newJSONLWriter creates the persistence directory and opens the first
// output file.
func newJSONLWriter(cfg Config) (*jsonlWriter, error) {
	dir := filepath.Clean(cfg.PersistDir)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return nil, err
	}
	w := &jsonlWriter{cfg: cfg, dir: dir}
	if err := w.rotateLocked(); err != nil {
		return nil, err
	}
	return w, nil
}

// Write serializes v as one JSON line, rotating first if the line would
// push the current file past RotateMB. Writes after Close are silently
// dropped. Each write is flushed so a crash loses at most one line.
func (w *jsonlWriter) Write(v persistedEnvelope) error {
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.f == nil || w.w == nil {
		return nil
	}
	line, err := json.Marshal(v)
	if err != nil {
		return err
	}
	line = append(line, '\n')
	if w.currentSize+int64(len(line)) > int64(w.cfg.RotateMB)*1024*1024 {
		if err := w.rotateLocked(); err != nil {
			return err
		}
	}
	n, err := w.w.Write(line)
	w.currentSize += int64(n)
	if err != nil {
		return err
	}
	return w.w.Flush()
}

// Close flushes and closes the current file; subsequent Writes are no-ops.
func (w *jsonlWriter) Close() error {
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.w != nil {
		_ = w.w.Flush()
	}
	if w.f != nil {
		err := w.f.Close()
		w.f = nil
		w.w = nil
		return err
	}
	return nil
}

// rotateLocked closes the current file (if any), opens a fresh
// timestamp+sequence named file, and prunes old files. Caller must hold
// w.mu (or be the constructor, before the writer is shared).
func (w *jsonlWriter) rotateLocked() error {
	if w.w != nil {
		_ = w.w.Flush()
	}
	if w.f != nil {
		_ = w.f.Close()
	}
	w.seq++
	name := fmt.Sprintf("telemetry-%s-%04d.jsonl", time.Now().UTC().Format("20060102-150405"), w.seq)
	path := filepath.Join(w.dir, name)
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return err
	}
	info, _ := f.Stat()
	size := int64(0)
	if info != nil {
		size = info.Size()
	}
	w.f = f
	w.w = bufio.NewWriterSize(f, 64*1024)
	w.currentPath = path
	w.currentSize = size
	_ = pruneFiles(w.dir, w.cfg.KeepFiles)
	return nil
}

// pruneFiles deletes the oldest telemetry-*.jsonl files in dir so at most
// keep remain. Ordering relies on the lexicographic sort of the
// timestamped filenames.
func pruneFiles(dir string, keep int) error {
	if keep <= 0 {
		return nil
	}
	ents, err := os.ReadDir(dir)
	if err != nil {
		return err
	}
	files := make([]string, 0, len(ents))
	for _, ent := range ents {
		if ent.IsDir() {
			continue
		}
		name := ent.Name()
		if !strings.HasPrefix(name, "telemetry-") || !strings.HasSuffix(name, ".jsonl") {
			continue
		}
		files = append(files, filepath.Join(dir, name))
	}
	if len(files) <= keep {
		return nil
	}
	sort.Strings(files)
	for _, path := range files[:len(files)-keep] {
		_ = os.Remove(path)
	}
	return nil
}

// readPersistedMetrics scans every persisted file and returns the metric
// points matching q. Individual unreadable files are skipped.
func readPersistedMetrics(cfg Config, q Query) ([]MetricPoint, error) {
	files, err := listPersistedFiles(cfg.PersistDir)
	if err != nil {
		return nil, err
	}
	out := make([]MetricPoint, 0, 256)
	for _, path := range files {
		points, err := parsePersistedFile(path, q)
		if err != nil {
			continue
		}
		for _, p := range points.metrics {
			if metricMatch(p, q) {
				out = append(out, p)
			}
		}
	}
	return out, nil
}

// readPersistedEvents scans every persisted file and returns the events
// matching q. Individual unreadable files are skipped.
func readPersistedEvents(cfg Config, q Query) ([]Event, error) {
	files, err := listPersistedFiles(cfg.PersistDir)
	if err != nil {
		return nil, err
	}
	out := make([]Event, 0, 128)
	for _, path := range files {
		points, err := parsePersistedFile(path, q)
		if err != nil {
			continue
		}
		for _, ev := range points.events {
			if eventMatch(ev, q) {
				out = append(out, ev)
			}
		}
	}
	return out, nil
}

// parsedFile is the decoded contents of one JSONL file, split by kind.
type parsedFile struct {
	metrics []MetricPoint
	events  []Event
}

// parsePersistedFile decodes one JSONL file, skipping blank or malformed
// lines, and keeps memory bounded while scanning by trimming each slice
// to q.Limit once the combined total exceeds q.Limit*2.
func parsePersistedFile(path string, q Query) (parsedFile, error) {
	f, err := os.Open(path)
	if err != nil {
		return parsedFile{}, err
	}
	defer f.Close()
	out := parsedFile{
		metrics: make([]MetricPoint, 0, 64),
		events:  make([]Event, 0, 32),
	}
	s := bufio.NewScanner(f)
	// Allow lines up to 1 MiB (default Scanner limit is 64 KiB).
	s.Buffer(make([]byte, 0, 32*1024), 1024*1024)
	for s.Scan() {
		line := s.Bytes()
		if len(line) == 0 {
			continue
		}
		var env persistedEnvelope
		if err := json.Unmarshal(line, &env); err != nil {
			continue
		}
		if env.Metric != nil {
			out.metrics = append(out.metrics, *env.Metric)
		} else if env.Event != nil {
			out.events = append(out.events, *env.Event)
		}
		if q.Limit > 0 && len(out.metrics)+len(out.events) > q.Limit*2 {
			// keep bounded while scanning
			if len(out.metrics) > q.Limit {
				out.metrics = out.metrics[len(out.metrics)-q.Limit:]
			}
			if len(out.events) > q.Limit {
				out.events = out.events[len(out.events)-q.Limit:]
			}
		}
	}
	return out, s.Err()
}

// listPersistedFiles returns the telemetry-*.jsonl files in dir, sorted
// lexicographically (oldest first, given the timestamped names).
func listPersistedFiles(dir string) ([]string, error) {
	ents, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}
	files := make([]string, 0, len(ents))
	for _, ent := range ents {
		if ent.IsDir() {
			continue
		}
		name := ent.Name()
		if strings.HasPrefix(name, "telemetry-") && strings.HasSuffix(name, ".jsonl") {
			files = append(files, filepath.Join(dir, name))
		}
	}
	sort.Strings(files)
	return files, nil
}

// ParseTimeQuery parses a URL time parameter: empty string yields the
// zero time; a bare integer is treated as Unix milliseconds when larger
// than 1e12, otherwise Unix seconds; then RFC3339Nano and RFC3339 are
// tried. All results are normalized to UTC.
func ParseTimeQuery(raw string) (time.Time, error) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return time.Time{}, nil
	}
	if ms, err := strconv.ParseInt(raw, 10, 64); err == nil {
		if ms > 1e12 {
			return time.UnixMilli(ms).UTC(), nil
		}
		return time.Unix(ms, 0).UTC(), nil
	}
	if t, err := time.Parse(time.RFC3339Nano, raw); err == nil {
		return t.UTC(), nil
	}
	if t, err := time.Parse(time.RFC3339, raw); err == nil {
		return t.UTC(), nil
	}
	return time.Time{}, errors.New("invalid time query")
}

// TagsWith returns a copy of base with key set to fmt.Sprint(value).
// The base map is never mutated.
func TagsWith(base Tags, key string, value any) Tags {
	out := cloneTags(base)
	if out == nil {
		out = Tags{}
	}
	out[key] = fmt.Sprint(value)
	return out
}

// TagsFromPairs builds a Tags map from alternating key, value strings.
// Keys are trimmed; empty keys and a trailing unpaired value are
// dropped. Returns nil when no valid pairs remain.
func TagsFromPairs(kv ...string) Tags {
	if len(kv) < 2 {
		return nil
	}
	out := Tags{}
	for i := 0; i+1 < len(kv); i += 2 {
		k := strings.TrimSpace(kv[i])
		if k == "" {
			continue
		}
		out[k] = kv[i+1]
	}
	if len(out) == 0 {
		return nil
	}
	return out
}

// min is a local integer helper.
// NOTE(review): shadows the Go 1.21+ builtin; presumably kept for older
// toolchains — confirm the module's go directive before removing.
func min(a int, b int) int {
	if a < b {
		return a
	}
	return b
}