//go:build cufft package gpudemod /* #cgo windows LDFLAGS: -lcufft64_12 -lcudart64_13 #include #include typedef struct { float x; float y; } gpud_float2; static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); } static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); } static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); } static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); } static int gpud_device_sync() { return (int)cudaDeviceSynchronize(); } static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) { // TODO(phase2): replace with real CUDA kernel launch. // Phase 1b keeps the launch boundary in place without pretending acceleration. (void)in; (void)out; (void)n; (void)phase_inc; (void)phase_start; return -1; } static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) { // TODO(phase2): replace with real CUDA kernel launch. (void)in; (void)out; (void)n; return -1; } */ import "C" import ( "errors" "fmt" "math" "unsafe" "sdr-visual-suite/internal/demod" "sdr-visual-suite/internal/dsp" ) type DemodType int const ( DemodNFM DemodType = iota DemodWFM DemodAM DemodUSB DemodLSB DemodCW ) type Engine struct { maxSamples int sampleRate int phase float64 bfoPhase float64 firTaps []float32 cudaReady bool dIQIn *C.gpud_float2 dShifted *C.gpud_float2 dAudio *C.float iqBytes C.size_t audioBytes C.size_t } func Available() bool { var count C.int if C.cudaGetDeviceCount(&count) != C.cudaSuccess { return false } return count > 0 } func New(maxSamples int, sampleRate int) (*Engine, error) { if maxSamples <= 0 { return nil, errors.New("invalid maxSamples") } if sampleRate <= 0 { return nil, errors.New("invalid sampleRate") } if !Available() { return nil, errors.New("cuda device not available") } e := &Engine{ maxSamples: maxSamples, sampleRate: sampleRate, cudaReady: true, iqBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})), audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))), } var ptr unsafe.Pointer if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { e.Close() return nil, errors.New("cudaMalloc dIQIn failed") } e.dIQIn = (*C.gpud_float2)(ptr) ptr = nil if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { e.Close() return nil, errors.New("cudaMalloc dShifted failed") } e.dShifted = (*C.gpud_float2)(ptr) ptr = nil if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess { e.Close() return nil, errors.New("cudaMalloc dAudio failed") } e.dAudio = (*C.float)(ptr) return e, nil } func (e *Engine) SetFIR(taps []float32) { if len(taps) == 0 { e.firTaps = nil return } e.firTaps = append(e.firTaps[:0], taps...) } func phaseStatus() string { return "phase1b-launch-boundary" } func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) bool { if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil { return false } if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), C.size_t(len(iq))*C.size_t(unsafe.Sizeof(complex64(0)))) != C.cudaSuccess { return false } phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate) if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 { return false } if C.gpud_device_sync() != C.cudaSuccess { return false } e.phase += phaseInc * float64(len(iq)) return true } func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) { if e == nil { return nil, 0, errors.New("nil CUDA demod engine") } if !e.cudaReady { return nil, 0, errors.New("cuda demod engine is not initialized") } if len(iq) == 0 { return nil, 0, nil } if len(iq) > e.maxSamples { return nil, 0, errors.New("sample count exceeds engine capacity") } if mode != DemodNFM { return nil, 0, errors.New("CUDA demod phase 1 currently supports NFM only") } // Real CUDA boundary is now present. If the launch wrappers are not yet backed // by actual kernels, we fall back to the existing CPU DSP path below. _ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz) shifted, ok := e.tryCUDAFreqShift(iq, offsetHz) if !ok { shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz) } cutoff := bw / 2 if cutoff < 200 { cutoff = 200 } taps := e.firTaps if len(taps) == 0 { base := dsp.LowpassFIR(cutoff, e.sampleRate, 101) taps = append(make([]float32, 0, len(base)), base...) } filtered := dsp.ApplyFIR(shifted, taps) outRate := demod.NFM{}.OutputSampleRate() decim := int(math.Round(float64(e.sampleRate) / float64(outRate))) if decim < 1 { decim = 1 } dec := dsp.Decimate(filtered, decim) inputRate := e.sampleRate / decim audio := demod.NFM{}.Demod(dec, inputRate) return audio, inputRate, nil } func (e *Engine) Close() { if e == nil { return } if e.dIQIn != nil { _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn)) e.dIQIn = nil } if e.dShifted != nil { _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted)) e.dShifted = nil } if e.dAudio != nil { _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio)) e.dAudio = nil } e.firTaps = nil e.cudaReady = false }