From 768fb0ee3ef119db280be79542fe8c3a44b31bc4 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Thu, 19 Mar 2026 07:56:35 +0100 Subject: [PATCH] docs: add initial CUDA demod kernel source --- internal/demod/gpudemod/README.md | 31 ++++++++++++++++++++++++++++++ internal/demod/gpudemod/doc.go | 2 ++ internal/demod/gpudemod/kernels.cu | 21 ++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 internal/demod/gpudemod/README.md create mode 100644 internal/demod/gpudemod/kernels.cu diff --git a/internal/demod/gpudemod/README.md b/internal/demod/gpudemod/README.md new file mode 100644 index 0000000..266c8b8 --- /dev/null +++ b/internal/demod/gpudemod/README.md @@ -0,0 +1,31 @@ +# gpudemod + +Phase 1 CUDA demod scaffolding. + +## Current state + +- Standard Go builds use `gpudemod_stub.go` (`!cufft`). +- `cufft` builds allocate GPU buffers and cross the CGO/CUDA launch boundary. +- If CUDA launch wrappers are not backed by compiled kernels yet, the code falls back to CPU DSP. +- The shifted IQ path is already wired so a successful GPU freq-shift result can be copied back and reused immediately. + +## First real kernel + +`kernels.cu` contains the first candidate implementation: +- `gpud_freq_shift_kernel` + +This is **not compiled automatically yet** in the current environment because the machine currently lacks a CUDA compiler toolchain in PATH (`nvcc` not found). + +## Next machine-side step + +On a CUDA-capable dev machine with toolchain installed: + +1. Compile `kernels.cu` into an object file +2. Link it into the `cufft` build +3. Replace `gpud_launch_freq_shift(...)` stub body with the real kernel launch +4. Validate copied-back shifted IQ against `dsp.FreqShift` +5. Only then move the next stage (FM discriminator) onto the GPU + +## Why this is still useful + +The runtime/buffer/recorder/fallback structure is already in place, so once kernel compilation is available, real acceleration can be inserted without another architecture rewrite. diff --git a/internal/demod/gpudemod/doc.go b/internal/demod/gpudemod/doc.go index 25b4d4e..6f8d081 100644 --- a/internal/demod/gpudemod/doc.go +++ b/internal/demod/gpudemod/doc.go @@ -5,6 +5,8 @@ // - cufft builds allocate GPU buffers and cross the CGO/CUDA launch boundary. // - If/when a CUDA freq-shift launch succeeds, the shifted IQ is copied back and // reused by the remaining CPU-side FIR/decimate/NFM pipeline. +// - kernels.cu contains the first real candidate kernel source, ready for +// toolchain-backed integration on a CUDA build machine. // // This keeps Phase 1 incremental and verifiable while later phases replace the // placeholder launch wrappers with real kernels. diff --git a/internal/demod/gpudemod/kernels.cu b/internal/demod/gpudemod/kernels.cu new file mode 100644 index 0000000..eeef3ce --- /dev/null +++ b/internal/demod/gpudemod/kernels.cu @@ -0,0 +1,21 @@ +#include +#include + +extern "C" __global__ void gpud_freq_shift_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n, + double phase_inc, + double phase_start +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + double phase = phase_start + phase_inc * (double)idx; + float si, co; + sincosf((float)phase, &si, &co); + + float2 v = in[idx]; + out[idx].x = v.x * co - v.y * si; + out[idx].y = v.x * si + v.y * co; +}