From f5ab4148479769f9ffa2e5608a5c10d5346a1c58 Mon Sep 17 00:00:00 2001
From: Jan Svabenik <jan@svabi.ch>
Date: Thu, 19 Mar 2026 07:52:10 +0100
Subject: [PATCH] feat: prepare CUDA demod launch boundary

---
 internal/demod/gpudemod/gpudemod.go           | 128 +++++++++++++++++-
 .../demod/gpudemod/gpudemod_cufft_test.go     |  11 ++
 internal/demod/gpudemod/gpudemod_test.go      |   2 +
 3 files changed, 134 insertions(+), 7 deletions(-)
 create mode 100644 internal/demod/gpudemod/gpudemod_cufft_test.go

diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go
index a5e6d72..8e63f5a 100644
--- a/internal/demod/gpudemod/gpudemod.go
+++ b/internal/demod/gpudemod/gpudemod.go
@@ -6,12 +6,49 @@ package gpudemod
 #cgo windows LDFLAGS: -lcufft64_12 -lcudart64_13
 #include <cuda_runtime.h>
 #include <cufft.h>
+
+typedef struct { float x; float y; } gpud_float2;
+
+static int gpud_cuda_malloc(void **ptr, size_t bytes) {
+	return (int)cudaMalloc(ptr, bytes);
+}
+
+static int gpud_cuda_free(void *ptr) {
+	return (int)cudaFree(ptr);
+}
+
+static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) {
+	return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+}
+
+static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) {
+	return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
+}
+
+static int gpud_device_sync() {
+	return (int)cudaDeviceSynchronize();
+}
+
+static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
+	// TODO(phase2): replace with real CUDA kernel launch.
+	// Phase 1b keeps the launch boundary in place without pretending acceleration.
+	(void)in; (void)out; (void)n; (void)phase_inc; (void)phase_start;
+	return -1;
+}
+
+static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
+	// TODO(phase2): replace with real CUDA kernel launch.
+	(void)in; (void)out; (void)n;
+	return -1;
+}
 */
 import "C"
 
 import (
 	"errors"
+	"fmt"
 	"math"
+	"unsafe"
 
 	"sdr-visual-suite/internal/demod"
 	"sdr-visual-suite/internal/dsp"
@@ -34,9 +71,21 @@ type Engine struct {
 	phase      float64
 	bfoPhase   float64
 	firTaps    []float32
+	cudaReady  bool
+	dIQIn      *C.gpud_float2
+	dShifted   *C.gpud_float2
+	dAudio     *C.float
+	iqBytes    C.size_t
+	audioBytes C.size_t
 }
 
-func Available() bool { return true }
+func Available() bool {
+	var count C.int
+	if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
+		return false
+	}
+	return count > 0
+}
 
 func New(maxSamples int, sampleRate int) (*Engine, error) {
 	if maxSamples <= 0 {
@@ -45,7 +94,35 @@ func New(maxSamples int, sampleRate int) (*Engine, error) {
 	if sampleRate <= 0 {
 		return nil, errors.New("invalid sampleRate")
 	}
-	return &Engine{maxSamples: maxSamples, sampleRate: sampleRate}, nil
+	if !Available() {
+		return nil, errors.New("cuda device not available")
+	}
+	e := &Engine{
+		maxSamples: maxSamples,
+		sampleRate: sampleRate,
+		cudaReady:  true,
+		iqBytes:    C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
+		audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
+	}
+	var ptr unsafe.Pointer
+	if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
+		e.Close()
+		return nil, errors.New("cudaMalloc dIQIn failed")
+	}
+	e.dIQIn = (*C.gpud_float2)(ptr)
+	ptr = nil
+	if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
+		e.Close()
+		return nil, errors.New("cudaMalloc dShifted failed")
+	}
+	e.dShifted = (*C.gpud_float2)(ptr)
+	ptr = nil
+	if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
+		e.Close()
+		return nil, errors.New("cudaMalloc dAudio failed")
+	}
+	e.dAudio = (*C.float)(ptr)
+	return e, nil
 }
 
 func (e *Engine) SetFIR(taps []float32) {
@@ -56,10 +133,35 @@ func (e *Engine) SetFIR(taps []float32) {
 	e.firTaps = append(e.firTaps[:0], taps...)
 }
 
+func phaseStatus() string {
+	return "phase1b-launch-boundary"
+}
+
+func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) bool {
+	if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
+		return false
+	}
+	if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), C.size_t(len(iq))*C.size_t(unsafe.Sizeof(complex64(0)))) != C.cudaSuccess {
+		return false
+	}
+	phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
+	if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
+		return false
+	}
+	if C.gpud_device_sync() != C.cudaSuccess {
+		return false
+	}
+	e.phase += phaseInc * float64(len(iq))
+	return true
+}
+
 func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
 	if e == nil {
 		return nil, 0, errors.New("nil CUDA demod engine")
 	}
+	if !e.cudaReady {
+		return nil, 0, errors.New("cuda demod engine is not initialized")
+	}
 	if len(iq) == 0 {
 		return nil, 0, nil
 	}
@@ -70,12 +172,11 @@ func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodT
 		return nil, 0, errors.New("CUDA demod phase 1 currently supports NFM only")
 	}
 
-	// Phase 1b note:
-	// This package now performs real CUDA availability gating and keeps the
-	// runtime/CGO boundary in place, but still intentionally falls back to the
-	// existing CPU DSP math for signal processing. The next phase should replace
-	// the FreqShift + FM discriminator sections below with actual kernel launches.
+	// Real CUDA boundary is now present. If the launch wrappers are not yet backed
+	// by actual kernels, we fall back to the existing CPU DSP path below.
 	_ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz)
+	_ = e.tryCUDAFreqShift(iq, offsetHz)
+
 	shifted := dsp.FreqShift(iq, e.sampleRate, offsetHz)
 	cutoff := bw / 2
 	if cutoff < 200 {
@@ -102,5 +203,18 @@ func (e *Engine) Close() {
 	if e == nil {
 		return
 	}
+	if e.dIQIn != nil {
+		_ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
+		e.dIQIn = nil
+	}
+	if e.dShifted != nil {
+		_ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
+		e.dShifted = nil
+	}
+	if e.dAudio != nil {
+		_ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
+		e.dAudio = nil
+	}
 	e.firTaps = nil
+	e.cudaReady = false
 }
diff --git a/internal/demod/gpudemod/gpudemod_cufft_test.go b/internal/demod/gpudemod/gpudemod_cufft_test.go
new file mode 100644
index 0000000..ce8f4fb
--- /dev/null
+++ b/internal/demod/gpudemod/gpudemod_cufft_test.go
@@ -0,0 +1,11 @@
+//go:build cufft
+
+package gpudemod
+
+import "testing"
+
+func TestDemodTypeConstantsExist(t *testing.T) {
+	if DemodNFM != 0 {
+		t.Fatal("expected DemodNFM constant to be defined")
+	}
+}
diff --git a/internal/demod/gpudemod/gpudemod_test.go b/internal/demod/gpudemod/gpudemod_test.go
index dd90763..05a945a 100644
--- a/internal/demod/gpudemod/gpudemod_test.go
+++ b/internal/demod/gpudemod/gpudemod_test.go
@@ -1,3 +1,5 @@
+//go:build !cufft
+
 package gpudemod
 
 import "testing"