From 768fb0ee3ef119db280be79542fe8c3a44b31bc4 Mon Sep 17 00:00:00 2001
From: Jan Svabenik <jan@svabi.ch>
Date: Thu, 19 Mar 2026 07:56:35 +0100
Subject: [PATCH] docs: add initial CUDA demod kernel source

---
 internal/demod/gpudemod/README.md  | 31 ++++++++++++++++++++++++++++++
 internal/demod/gpudemod/doc.go     |  2 ++
 internal/demod/gpudemod/kernels.cu | 21 ++++++++++++++++++++
 3 files changed, 54 insertions(+)
 create mode 100644 internal/demod/gpudemod/README.md
 create mode 100644 internal/demod/gpudemod/kernels.cu

diff --git a/internal/demod/gpudemod/README.md b/internal/demod/gpudemod/README.md
new file mode 100644
index 0000000..266c8b8
--- /dev/null
+++ b/internal/demod/gpudemod/README.md
@@ -0,0 +1,31 @@
+# gpudemod
+
+Phase 1 CUDA demod scaffolding.
+
+## Current state
+
+- Standard Go builds use `gpudemod_stub.go` (`!cufft`).
+- `cufft` builds allocate GPU buffers and cross the CGO/CUDA launch boundary.
+- If CUDA launch wrappers are not backed by compiled kernels yet, the code falls back to CPU DSP.
+- The shifted IQ path is already wired so a successful GPU freq-shift result can be copied back and reused immediately.
+
+## First real kernel
+
+`kernels.cu` contains the first candidate implementation:
+- `gpud_freq_shift_kernel`
+
+This is **not compiled automatically yet** in the current environment because the machine currently lacks a CUDA compiler toolchain in PATH (`nvcc` not found).
+
+## Next machine-side step
+
+On a CUDA-capable dev machine with toolchain installed:
+
+1. Compile `kernels.cu` into an object file
+2. Link it into the `cufft` build
+3. Replace `gpud_launch_freq_shift(...)` stub body with the real kernel launch
+4. Validate copied-back shifted IQ against `dsp.FreqShift`
+5. Only then move the next stage (FM discriminator) onto the GPU
+
+## Why this is still useful
+
+The runtime/buffer/recorder/fallback structure is already in place, so once kernel compilation is available, real acceleration can be inserted without another architecture rewrite.
diff --git a/internal/demod/gpudemod/doc.go b/internal/demod/gpudemod/doc.go
index 25b4d4e..6f8d081 100644
--- a/internal/demod/gpudemod/doc.go
+++ b/internal/demod/gpudemod/doc.go
@@ -5,6 +5,8 @@
 //   - cufft builds allocate GPU buffers and cross the CGO/CUDA launch boundary.
 //   - If/when a CUDA freq-shift launch succeeds, the shifted IQ is copied back and
 //     reused by the remaining CPU-side FIR/decimate/NFM pipeline.
+//   - kernels.cu contains the first real candidate kernel source, ready for
+//     toolchain-backed integration on a CUDA build machine.
 //
 // This keeps Phase 1 incremental and verifiable while later phases replace the
 // placeholder launch wrappers with real kernels.
diff --git a/internal/demod/gpudemod/kernels.cu b/internal/demod/gpudemod/kernels.cu
new file mode 100644
index 0000000..eeef3ce
--- /dev/null
+++ b/internal/demod/gpudemod/kernels.cu
@@ -0,0 +1,21 @@
+#include <cuda_runtime.h>
+#include <math.h>
+
+extern "C" __global__ void gpud_freq_shift_kernel(
+    const float2* __restrict__ in,
+    float2* __restrict__ out,
+    int n,
+    double phase_inc,
+    double phase_start
+) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    double phase = phase_start + phase_inc * (double)idx;
+    float si, co;
+    sincosf((float)phase, &si, &co);
+
+    float2 v = in[idx];
+    out[idx].x = v.x * co - v.y * si;
+    out[idx].y = v.x * si + v.y * co;
+}