diff --git a/internal/demod/gpudemod/build/kernels.obj b/internal/demod/gpudemod/build/kernels.obj index 062d18f..808846a 100644 Binary files a/internal/demod/gpudemod/build/kernels.obj and b/internal/demod/gpudemod/build/kernels.obj differ diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go index 0b68ba6..cb0b084 100644 --- a/internal/demod/gpudemod/gpudemod.go +++ b/internal/demod/gpudemod/gpudemod.go @@ -229,8 +229,12 @@ func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodT if len(taps) == 0 { base := dsp.LowpassFIR(cutoff, e.sampleRate, 101) taps = append(make([]float32, 0, len(base)), base...) + e.SetFIR(taps) + } + filtered, ok := e.tryCUDAFIR(shifted, len(taps)) + if !ok { + filtered = dsp.ApplyFIR(shifted, taps) } - filtered := dsp.ApplyFIR(shifted, taps) decim := int(math.Round(float64(e.sampleRate) / float64(outRate))) if decim < 1 { decim = 1 @@ -285,3 +289,35 @@ func (e *Engine) Close() { e.firTaps = nil e.cudaReady = false } +odLSB: + return demod.LSB{}.Demod(dec, inputRate), inputRate, nil + case DemodCW: + return demod.CW{}.Demod(dec, inputRate), inputRate, nil + default: + return nil, 0, errors.New("unsupported demod type") + } +} + +func (e *Engine) Close() { + if e == nil { + return + } + if e.dIQIn != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn)) + e.dIQIn = nil + } + if e.dShifted != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted)) + e.dShifted = nil + } + if e.dDecimated != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated)) + e.dDecimated = nil + } + if e.dAudio != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio)) + e.dAudio = nil + } + e.firTaps = nil + e.cudaReady = false +} diff --git a/internal/demod/gpudemod/kernels.cu b/internal/demod/gpudemod/kernels.cu index 5250a91..d29ed54 100644 --- a/internal/demod/gpudemod/kernels.cu +++ b/internal/demod/gpudemod/kernels.cu @@ -72,6 +72,49 @@ extern "C" __global__ void gpud_decimate_kernel( out[idx] = in[idx * factor]; } +extern "C" __constant__ float gpud_fir_taps[256]; + +extern "C" __global__ void gpud_fir_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n, + int num_taps +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + float acc_r = 0.0f; + float acc_i = 0.0f; + for (int k = 0; k < num_taps; ++k) { + int src = idx - k; + if (src < 0) break; + float2 v = in[src]; + float t = gpud_fir_taps[k]; + acc_r += v.x * t; + acc_i += v.y * t; + } + out[idx] = make_float2(acc_r, acc_i); +} + +extern "C" int gpud_upload_fir_taps_cuda(const float* taps, int n) { + if (!taps || n <= 0 || n > 256) return -1; + cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float)); + return (int)err; +} + +extern "C" int gpud_launch_fir_cuda( + const float2* in, + float2* out, + int n, + int num_taps +) { + if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_fir_kernel<<>>(in, out, n, num_taps); + return (int)cudaGetLastError(); +} + extern "C" int gpud_launch_decimate_cuda( const float2* in, float2* out,