From 5b0e368ed16a326d4e201dc198886a0d200bcc90 Mon Sep 17 00:00:00 2001
From: Jan Svabenik <jan@svabi.ch>
Date: Wed, 25 Mar 2026 07:55:03 +0100
Subject: [PATCH] debug: add gpu kernel probes and update notes

---
 cmd/sdrd/helpers.go                        | 22 +++++
 docs/audio-click-debug-notes-2026-03-24.md | 94 ++++++++++++++++++++--
 2 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/cmd/sdrd/helpers.go b/cmd/sdrd/helpers.go
index 930630f..73dd250 100644
--- a/cmd/sdrd/helpers.go
+++ b/cmd/sdrd/helpers.go
@@ -379,6 +379,18 @@ func extractForStreaming(
 		runner = extractMgr.get(len(gpuIQ), sampleRate)
 	}
 	if runner != nil {
+		if coll != nil && len(gpuIQ) > 0 {
+			inputProbe := probeHead(gpuIQ, 16, 1e-6)
+			coll.Event("gpu_kernel_input_head_probe", "info", "gpu kernel input head probe", nil, map[string]any{
+				"mags": inputProbe.mags,
+				"zero_count": inputProbe.zeroCount,
+				"first_nonzero_index": inputProbe.firstNonZeroIndex,
+				"head_max_step": inputProbe.maxStep,
+				"gpuIQ_len": len(gpuIQ),
+				"sample_rate": sampleRate,
+				"signals": len(signals),
+			})
+		}
 		results, err := runner.ShiftFilterDecimateBatchWithPhase(gpuIQ, jobs)
 		if err == nil && len(results) == len(signals) {
 			for i, res := range results {
@@ -417,6 +429,16 @@ func extractForStreaming(
 				}
 				if coll != nil {
 					tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", signals[i].ID), "path", "gpu")
+					kernelProbe := probeHead(res.IQ, 16, 1e-6)
+					coll.Event("gpu_kernel_output_head_probe", "info", "gpu kernel output head probe", tags, map[string]any{
+						"mags": kernelProbe.mags,
+						"zero_count": kernelProbe.zeroCount,
+						"first_nonzero_index": kernelProbe.firstNonZeroIndex,
+						"head_max_step": kernelProbe.maxStep,
+						"raw_len": rawLen,
+						"out_rate": outRate,
+						"trim_samples": trimSamples,
+					})
 					stats := computeIQHeadStats(iq, 64)
 					coll.SetGauge("iq.extract.output.length", float64(len(iq)), tags)
 					coll.Observe("iq.extract.output.head_mean_mag", stats.meanMag, tags)
diff --git a/docs/audio-click-debug-notes-2026-03-24.md b/docs/audio-click-debug-notes-2026-03-24.md
index d022f9e..d8f2341 100644
--- a/docs/audio-click-debug-notes-2026-03-24.md
+++ b/docs/audio-click-debug-notes-2026-03-24.md
@@ -429,6 +429,8 @@ Used heavily once compact per-block event probes were added, because events were
 This ended up being especially useful for:
 - raw extractor head probes
 - trimmed extractor head probes
+- extractor input head probes
+- GPU kernel input/output head probes
 - boundary snapshots
 
 ### Important telemetry families added/used
@@ -483,6 +485,20 @@ Purpose:
 Purpose:
 - answer the key question: is the corruption already present in the raw extractor output head, or created by trimming/overlap logic afterward?
 
+#### Additional extractor input / GPU-kernel probe telemetry
+- `iq.extract.input_head.zero_count`
+- `iq.extract.input_head.first_nonzero_index`
+- `iq.extract.input_head.max_step`
+- event `extract_input_head_probe`
+- event `gpu_kernel_input_head_probe`
+- event `gpu_kernel_output_head_probe`
+
+Purpose:
+- split the remaining uncertainty between:
+  - signal-specific input already being bad
+  - GPU extractor kernel/start semantics producing the bad raw head
+  - later output assembly after the kernel
+
 #### Pre-demod / audio-stage metrics
 - `iq.pre_demod.head_mean_mag`
 - `iq.pre_demod.head_min_mag`
@@ -701,22 +717,88 @@ Interpretation:
 - trimming cleans up the visibly bad raw head region
 - trimming still does **not** explain the deeper output-boundary continuity issue
 
-### Refined strongest current conclusion after the 2026-03-25 telemetry pass
+### Further refinement after direct extractor-input and GPU-kernel probes
+
+A final telemetry round added:
+- `extract_input_head_probe`
+- `gpu_kernel_input_head_probe`
+- `gpu_kernel_output_head_probe`
+
+These probes further sharpened the likely fault location.
+
+#### Signal-specific extractor input head looked sane
+Representative values:
+- `iq.extract.input_head.zero_count = 0`
+- `iq.extract.input_head.first_nonzero_index = 0`
+
+Interpretation:
+- at the observed signal-specific input probe point, the GPU extractor is **not** receiving a dead/null head
+
+#### Raw GPU output head remained systematically broken
+Representative repeated values:
+- `iq.extract.raw.head_mag = 0`
+- `iq.extract.raw.head_zero_count = 1`
+- `iq.extract.raw.head_max_step` repeatedly around:
+  - `3.141592653589793`
+  - `3.122847934305907`
+  - `3.101915352902961`
+  - `3.080672178550904`
+  - `3.062425574273907`
+  - `2.9785041567778427`
+  - `2.7508533785793476`
+
+Representative repeated examples from strong channels:
+- signal 2: `head_mag = 0`, `head_zero_count = 1`
+- signal 3: `head_mag = 0`, `head_zero_count = 1`
+- signals 1 and 4 showed the same qualitative head-zero pattern as well
+
+Interpretation:
+- the raw extractor output head is still repeatedly broken from the moment it is produced
+- the problem therefore lies after the currently probed input head and before or during raw output creation
+
+#### Trimmed head still looked healthier
+Representative values:
+- `iq.extract.trimmed.head_zero_count = 0`
+- signal 1 `iq.extract.trimmed.head_mag` repeatedly around:
+  - `0.2868`
+  - `0.2907`
+  - `0.3036`
+  - `0.3116`
+  - `0.2838`
+  - `0.2760`
+- signal 2 examples:
+  - `0.3461`
+  - `0.3182`
+
+Representative `iq.extract.trimmed.head_max_step` values for strong channels were much lower than raw, often around:
+- `0.11`
+- `0.13`
+- `0.21`
+- `0.30`
+- `0.44`
+- `0.69`
+- `0.86`
+
+Interpretation:
+- trimming still removes the most visibly broken head region
+- but trimming does not explain the deeper output-boundary continuity issue
+
+### Refined strongest current conclusion after the full 2026-03-25 telemetry pass
 
 The strongest current reading is now:
 
-> The click root cause is very likely **not** that the signal-specific extractor input already starts dead/null. Instead, the bad raw head appears to be introduced **inside the GPU extractor path or at its immediate start/output semantics**, before final trimming.
+> The click root cause is very likely **not** that the signal-specific extractor input already starts dead/null. Instead, the bad raw head appears to be introduced **inside the GPU extractor path itself** (or at its immediate start/output semantics) before final trimming.
 
 More specifically:
 - signal-specific extractor input head looks non-zero and sane at the probe point
-- all signals still show a systematically bad raw extractor head
+- raw GPU output head still repeatedly starts with an exact zero sample and a short bad settling region
 - the trimmed head usually looks healthier
 - yet the final extractor output still exhibits significant complex boundary discontinuity from block to block
 
-This points away from a simple "shared global input head is already zero" theory and toward one of these narrower causes:
-1. GPU extractor start semantics / kernel warmup / first-output handling
+This now points away from a simple "shared global input head is already zero" theory and toward one of these narrower causes:
+1. GPU extractor kernel start semantics / warmup / first-output handling
 2. phase-start or alignment handling at extractor block start
-3. output assembly semantics inside the raw GPU extractor path
+3. raw GPU output assembly semantics within the extractor path
 
 ### What should not be forgotten from this stage