feat: prepare CUDA demod launch boundary

1 månad sedan · f5ab414847
--- a/internal/demod/gpudemod/gpudemod.go
+++ b/internal/demod/gpudemod/gpudemod.go
@@ -6,12 +6,49 @@ package gpudemod
 #cgo windows LDFLAGS: -lcufft64_12 -lcudart64_13
 #include <cuda_runtime.h>
 #include <cufft.h>

 typedef struct { float x; float y; } gpud_float2;

 static int gpud_cuda_malloc(void **ptr, size_t bytes) {
 	return (int)cudaMalloc(ptr, bytes);
 }

 static int gpud_cuda_free(void *ptr) {
 	return (int)cudaFree(ptr);
 }

 static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) {
 	return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
 }

 static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) {
 	return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
 }

 static int gpud_device_sync() {
 	return (int)cudaDeviceSynchronize();
 }

 static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
 	// TODO(phase2): replace with real CUDA kernel launch.
 	// Phase 1b keeps the launch boundary in place without pretending acceleration.
 	(void)in; (void)out; (void)n; (void)phase_inc; (void)phase_start;
 	return -1;
 }

 static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
 	// TODO(phase2): replace with real CUDA kernel launch.
 	(void)in; (void)out; (void)n;
 	return -1;
 }
 */
 import "C"

 import (
 	"errors"
 	"fmt"
 	"math"
 	"unsafe"

 	"sdr-visual-suite/internal/demod"
 	"sdr-visual-suite/internal/dsp"
@@ -34,9 +71,21 @@ type Engine struct {
 	phase      float64
 	bfoPhase   float64
 	firTaps    []float32
 	cudaReady  bool
 	dIQIn      *C.gpud_float2
 	dShifted   *C.gpud_float2
 	dAudio     *C.float
 	iqBytes    C.size_t
 	audioBytes C.size_t
 }

 func Available() bool { return true }
 func Available() bool {
 	var count C.int
 	if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
 		return false
 	}
 	return count > 0
 }

 func New(maxSamples int, sampleRate int) (*Engine, error) {
 	if maxSamples <= 0 {
@@ -45,7 +94,35 @@ func New(maxSamples int, sampleRate int) (*Engine, error) {
 	if sampleRate <= 0 {
 		return nil, errors.New("invalid sampleRate")
 	}
 	return &Engine{maxSamples: maxSamples, sampleRate: sampleRate}, nil
 	if !Available() {
 		return nil, errors.New("cuda device not available")
 	}
 	e := &Engine{
 		maxSamples: maxSamples,
 		sampleRate: sampleRate,
 		cudaReady:  true,
 		iqBytes:    C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
 		audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
 	}
 	var ptr unsafe.Pointer
 	if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
 		e.Close()
 		return nil, errors.New("cudaMalloc dIQIn failed")
 	}
 	e.dIQIn = (*C.gpud_float2)(ptr)
 	ptr = nil
 	if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
 		e.Close()
 		return nil, errors.New("cudaMalloc dShifted failed")
 	}
 	e.dShifted = (*C.gpud_float2)(ptr)
 	ptr = nil
 	if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
 		e.Close()
 		return nil, errors.New("cudaMalloc dAudio failed")
 	}
 	e.dAudio = (*C.float)(ptr)
 	return e, nil
 }

 func (e *Engine) SetFIR(taps []float32) {
@@ -56,10 +133,35 @@ func (e *Engine) SetFIR(taps []float32) {
 	e.firTaps = append(e.firTaps[:0], taps...)
 }

 func phaseStatus() string {
 	return "phase1b-launch-boundary"
 }

 func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) bool {
 	if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
 		return false
 	}
 	if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), C.size_t(len(iq))*C.size_t(unsafe.Sizeof(complex64(0)))) != C.cudaSuccess {
 		return false
 	}
 	phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
 	if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
 		return false
 	}
 	if C.gpud_device_sync() != C.cudaSuccess {
 		return false
 	}
 	e.phase += phaseInc * float64(len(iq))
 	return true
 }

 func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
 	if e == nil {
 		return nil, 0, errors.New("nil CUDA demod engine")
 	}
 	if !e.cudaReady {
 		return nil, 0, errors.New("cuda demod engine is not initialized")
 	}
 	if len(iq) == 0 {
 		return nil, 0, nil
 	}
@@ -70,12 +172,11 @@ func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodT
 		return nil, 0, errors.New("CUDA demod phase 1 currently supports NFM only")
 	}

 	// Phase 1b note:
 	// This package now performs real CUDA availability gating and keeps the
 	// runtime/CGO boundary in place, but still intentionally falls back to the
 	// existing CPU DSP math for signal processing. The next phase should replace
 	// the FreqShift + FM discriminator sections below with actual kernel launches.
 	// Real CUDA boundary is now present. If the launch wrappers are not yet backed
 	// by actual kernels, we fall back to the existing CPU DSP path below.
 	_ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz)
 	_ = e.tryCUDAFreqShift(iq, offsetHz)

 	shifted := dsp.FreqShift(iq, e.sampleRate, offsetHz)
 	cutoff := bw / 2
 	if cutoff < 200 {
@@ -102,5 +203,18 @@ func (e *Engine) Close() {
 	if e == nil {
 		return
 	}
 	if e.dIQIn != nil {
 		_ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
 		e.dIQIn = nil
 	}
 	if e.dShifted != nil {
 		_ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
 		e.dShifted = nil
 	}
 	if e.dAudio != nil {
 		_ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
 		e.dAudio = nil
 	}
 	e.firTaps = nil
 	e.cudaReady = false
 }
--- a/internal/demod/gpudemod/gpudemod_cufft_test.go
+++ b/internal/demod/gpudemod/gpudemod_cufft_test.go
@@ -0,0 +1,11 @@
 //go:build cufft

 package gpudemod

 import "testing"

 func TestDemodTypeConstantsExist(t *testing.T) {
 	if DemodNFM != 0 {
 		t.Fatal("expected DemodNFM constant to be defined")
 	}
 }
--- a/internal/demod/gpudemod/gpudemod_test.go
+++ b/internal/demod/gpudemod/gpudemod_test.go
@@ -1,3 +1,5 @@
 //go:build !cufft

 package gpudemod

 import "testing"