diff --git a/internal/demod/gpudemod/batch_runner_test.go b/internal/demod/gpudemod/batch_runner_test.go new file mode 100644 index 0000000..a6ad799 --- /dev/null +++ b/internal/demod/gpudemod/batch_runner_test.go @@ -0,0 +1,80 @@ +package gpudemod + +import ( + "math" + "math/cmplx" + "testing" + + "sdr-visual-suite/internal/dsp" +) + +func TestMixedBandwidthBatch(t *testing.T) { + if !Available() { + t.Skip("no GPU") + } + sampleRate := 2048000 + n := 2048 + iq := makeSyntheticIQ(n, sampleRate, []float64{50e3, -120e3, 300e3, -80e3}) + jobs := []ExtractJob{ + {OffsetHz: 50e3, BW: 12000, OutRate: 48000}, + {OffsetHz: -120e3, BW: 150000, OutRate: 192000}, + {OffsetHz: 300e3, BW: 3000, OutRate: 48000}, + {OffsetHz: -80e3, BW: 500, OutRate: 48000}, + } + cpuOuts := make([][]complex64, len(jobs)) + for i, job := range jobs { + cpuOuts[i] = extractCPU(iq, sampleRate, job) + } + runner, err := NewBatchRunner(n, sampleRate) + if err != nil { + t.Fatalf("NewBatchRunner: %v", err) + } + defer runner.Close() + gpuOuts, rates, err := runner.ShiftFilterDecimateBatch(iq, jobs) + if err != nil { + t.Fatalf("Batch: %v", err) + } + for i := range jobs { + if !complexSliceClose(cpuOuts[i], gpuOuts[i], 1e-3) { + t.Errorf("job %d: GPU/CPU mismatch (rate=%d)", i, rates[i]) + } + } +} + +func makeSyntheticIQ(n int, sr int, freqs []float64) []complex64 { + iq := make([]complex64, n) + for _, f := range freqs { + for i := range iq { + phase := 2 * math.Pi * f * float64(i) / float64(sr) + iq[i] += complex(float32(math.Cos(phase)), float32(math.Sin(phase))) + } + } + return iq +} + +func extractCPU(iq []complex64, sr int, job ExtractJob) []complex64 { + shifted := dsp.FreqShift(iq, sr, job.OffsetHz) + cutoff := job.BW / 2 + if cutoff < 200 { + cutoff = 200 + } + taps := dsp.LowpassFIR(cutoff, sr, 101) + filtered := dsp.ApplyFIR(shifted, taps) + decim := int(math.Round(float64(sr) / float64(job.OutRate))) + if decim < 1 { + decim = 1 + } + return dsp.Decimate(filtered, decim) +} + +func complexSliceClose(a, b []complex64, tol float64) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if cmplx.Abs(complex128(a[i]-b[i])) > tol { + return false + } + } + return true +} diff --git a/internal/demod/gpudemod/batch_runner_windows.go b/internal/demod/gpudemod/batch_runner_windows.go index be1e958..98299db 100644 --- a/internal/demod/gpudemod/batch_runner_windows.go +++ b/internal/demod/gpudemod/batch_runner_windows.go @@ -2,6 +2,11 @@ package gpudemod +/* +#include +*/ +import "C" + import ( "errors" "math" @@ -10,89 +15,175 @@ import ( "sdr-visual-suite/internal/dsp" ) +type slotBuffers struct { + dShifted unsafe.Pointer + dFiltered unsafe.Pointer + dDecimated unsafe.Pointer + dTaps unsafe.Pointer + stream streamHandle +} + +type windowsBatchRunner struct { + *BatchRunner + slotBufs []slotBuffers +} + +func asWindowsBatchRunner(r *BatchRunner) *windowsBatchRunner { + return (*windowsBatchRunner)(unsafe.Pointer(r)) +} + +func (r *windowsBatchRunner) freeSlotBuffers() { + for i := range r.slotBufs { + if r.slotBufs[i].dShifted != nil { + _ = bridgeCudaFree(r.slotBufs[i].dShifted) + r.slotBufs[i].dShifted = nil + } + if r.slotBufs[i].dFiltered != nil { + _ = bridgeCudaFree(r.slotBufs[i].dFiltered) + r.slotBufs[i].dFiltered = nil + } + if r.slotBufs[i].dDecimated != nil { + _ = bridgeCudaFree(r.slotBufs[i].dDecimated) + r.slotBufs[i].dDecimated = nil + } + if r.slotBufs[i].dTaps != nil { + _ = bridgeCudaFree(r.slotBufs[i].dTaps) + r.slotBufs[i].dTaps = nil + } + if r.slotBufs[i].stream != nil { + _ = bridgeStreamDestroy(r.slotBufs[i].stream) + r.slotBufs[i].stream = nil + } + } + r.slotBufs = nil +} + +func (r *windowsBatchRunner) allocSlotBuffers(n int) error { + if len(r.slotBufs) == len(r.slots) && len(r.slotBufs) > 0 { + return nil + } + r.freeSlotBuffers() + if len(r.slots) == 0 { + return nil + } + iqBytes := uintptr(n) * unsafe.Sizeof(complex64(0)) + tapsBytes := uintptr(256) * unsafe.Sizeof(float32(0)) + r.slotBufs = make([]slotBuffers, len(r.slots)) + for i := range r.slotBufs { + for _, ptr := range []*unsafe.Pointer{&r.slotBufs[i].dShifted, &r.slotBufs[i].dFiltered, &r.slotBufs[i].dDecimated} { + if bridgeCudaMalloc(ptr, iqBytes) != 0 { + r.freeSlotBuffers() + return errors.New("cudaMalloc slot buffer failed") + } + } + if bridgeCudaMalloc(&r.slotBufs[i].dTaps, tapsBytes) != 0 { + r.freeSlotBuffers() + return errors.New("cudaMalloc slot taps failed") + } + s, res := bridgeStreamCreate() + if res != 0 { + r.freeSlotBuffers() + return errors.New("cudaStreamCreate failed") + } + r.slotBufs[i].stream = s + } + return nil +} + func (r *BatchRunner) shiftFilterDecimateBatchImpl(iq []complex64) ([][]complex64, []int, error) { + wr := asWindowsBatchRunner(r) + e := r.eng + if e == nil || !e.cudaReady { + return nil, nil, ErrUnavailable + } outs := make([][]complex64, len(r.slots)) rates := make([]int, len(r.slots)) + n := len(iq) + if n == 0 { + return outs, rates, nil + } + if err := wr.allocSlotBuffers(n); err != nil { + return nil, nil, err + } + bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0)) + if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 { + return nil, nil, errors.New("cudaMemcpy H2D failed") + } for i := range r.slots { if !r.slots[i].active { continue } - out, rate, err := r.shiftFilterDecimateSlot(iq, r.slots[i].job, nil) + nOut, rate, err := r.shiftFilterDecimateSlotParallel(iq, r.slots[i].job, wr.slotBufs[i]) if err != nil { return nil, nil, err } - r.slots[i].out = out r.slots[i].rate = rate - outs[i] = out + outs[i] = make([]complex64, nOut) rates[i] = rate } + for i := range r.slots { + if !r.slots[i].active { + continue + } + buf := wr.slotBufs[i] + if bridgeStreamSync(buf.stream) != 0 { + return nil, nil, errors.New("cuda stream sync failed") + } + out := outs[i] + if len(out) == 0 { + continue + } + outBytes := uintptr(len(out)) * unsafe.Sizeof(complex64(0)) + if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), buf.dDecimated, outBytes) != 0 { + return nil, nil, errors.New("cudaMemcpy D2H failed") + } + r.slots[i].out = out + } return outs, rates, nil } -func (r *BatchRunner) shiftFilterDecimateSlot(iq []complex64, job ExtractJob, stream streamHandle) ([]complex64, int, error) { +func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job ExtractJob, buf slotBuffers) (int, int, error) { e := r.eng if e == nil || !e.cudaReady { - return nil, 0, ErrUnavailable + return 0, 0, ErrUnavailable } - if len(iq) == 0 { - return nil, 0, nil + n := len(iq) + if n == 0 { + return 0, 0, nil } cutoff := job.BW / 2 if cutoff < 200 { cutoff = 200 } - base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101) - taps := make([]float32, len(base64)) - for i, v := range base64 { + base := dsp.LowpassFIR(cutoff, e.sampleRate, 101) + taps := make([]float32, len(base)) + for i, v := range base { taps[i] = float32(v) } if len(taps) == 0 { - return nil, 0, errors.New("no FIR taps configured") + return 0, 0, errors.New("no FIR taps configured") } - e.SetFIR(taps) - if stream == nil { - if bridgeDeviceSync() != 0 { - return nil, 0, errors.New("cudaDeviceSynchronize failed") - } + tapsBytes := uintptr(len(taps)) * unsafe.Sizeof(float32(0)) + if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 { + return 0, 0, errors.New("taps H2D failed") } decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate))) if decim < 1 { decim = 1 } - n := len(iq) nOut := n / decim if nOut <= 0 { - return nil, 0, errors.New("not enough output samples after decimation") - } - bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0)) - if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 { - return nil, 0, errors.New("cudaMemcpy H2D failed") + return 0, 0, errors.New("not enough output samples after decimation") } phaseInc := -2.0 * math.Pi * job.OffsetHz / float64(e.sampleRate) - phaseStart := e.phase - if bridgeLaunchFreqShiftStream(e.dIQIn, e.dShifted, n, phaseInc, phaseStart, stream) != 0 { - return nil, 0, errors.New("gpu freq shift failed") - } - if bridgeLaunchFIRStream(e.dShifted, e.dFiltered, n, len(taps), stream) != 0 { - return nil, 0, errors.New("gpu FIR failed") + if bridgeLaunchFreqShiftStream(e.dIQIn, (*gpuFloat2)(buf.dShifted), n, phaseInc, e.phase, buf.stream) != 0 { + return 0, 0, errors.New("gpu freq shift failed") } - if bridgeLaunchDecimateStream(e.dFiltered, e.dDecimated, nOut, decim, stream) != 0 { - return nil, 0, errors.New("gpu decimate failed") - } - if stream != nil { - if bridgeStreamSync(stream) != 0 { - return nil, 0, errors.New("cuda stream sync failed") - } - } else { - if bridgeDeviceSync() != 0 { - return nil, 0, errors.New("cudaDeviceSynchronize failed") - } + if bridgeLaunchFIRv2Stream((*gpuFloat2)(buf.dShifted), (*gpuFloat2)(buf.dFiltered), (*C.float)(buf.dTaps), n, len(taps), buf.stream) != 0 { + return 0, 0, errors.New("gpu FIR v2 failed") } - out := make([]complex64, nOut) - outBytes := uintptr(nOut) * unsafe.Sizeof(complex64(0)) - if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != 0 { - return nil, 0, errors.New("cudaMemcpy D2H failed") + if bridgeLaunchDecimateStream((*gpuFloat2)(buf.dFiltered), (*gpuFloat2)(buf.dDecimated), nOut, decim, buf.stream) != 0 { + return 0, 0, errors.New("gpu decimate failed") } - e.phase = phaseStart + phaseInc*float64(n) - return out, e.sampleRate / decim, nil + return nOut, e.sampleRate / decim, nil } diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.exp b/internal/demod/gpudemod/build/gpudemod_kernels.exp index de59c20..0239b60 100644 Binary files a/internal/demod/gpudemod/build/gpudemod_kernels.exp and b/internal/demod/gpudemod/build/gpudemod_kernels.exp differ diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib index fb57b52..2bed4d2 100644 Binary files a/internal/demod/gpudemod/build/gpudemod_kernels.lib and b/internal/demod/gpudemod/build/gpudemod_kernels.lib differ diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu index f5b741b..3a7e88b 100644 --- a/internal/demod/gpudemod/native/exports.cu +++ b/internal/demod/gpudemod/native/exports.cu @@ -170,6 +170,54 @@ GPUD_API int GPUD_CALL gpud_launch_fir_cuda( return (int)cudaGetLastError(); } +__global__ void gpud_fir_kernel_v2( + const float2* __restrict__ in, + float2* __restrict__ out, + const float* __restrict__ taps, + int n, + int num_taps +) { + extern __shared__ float2 s_data[]; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int lid = threadIdx.x; + int halo = num_taps - 1; + + if (gid < n) s_data[lid + halo] = in[gid]; + else s_data[lid + halo] = make_float2(0.0f, 0.0f); + + if (lid < halo) { + int src = gid - halo; + s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f); + } + __syncthreads(); + if (gid >= n) return; + + float acc_r = 0.0f, acc_i = 0.0f; + for (int k = 0; k < num_taps; ++k) { + float2 v = s_data[lid + halo - k]; + float t = taps[k]; + acc_r += v.x * t; + acc_i += v.y * t; + } + out[gid] = make_float2(acc_r, acc_i); +} + +GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda( + const float2* in, + float2* out, + const float* taps, + int n, + int num_taps, + gpud_stream_handle stream +) { + if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2); + gpud_fir_kernel_v2<<>>(in, out, taps, n, num_taps); + return (int)cudaGetLastError(); +} + GPUD_API int GPUD_CALL gpud_launch_decimate_cuda( const float2* in, float2* out, diff --git a/internal/demod/gpudemod/windows_bridge.go b/internal/demod/gpudemod/windows_bridge.go index 2cf33cd..3371be7 100644 --- a/internal/demod/gpudemod/windows_bridge.go +++ b/internal/demod/gpudemod/windows_bridge.go @@ -20,6 +20,7 @@ typedef int (__stdcall *gpud_launch_freq_shift_stream_fn)(const gpud_float2* in, typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start); typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n); typedef int (__stdcall *gpud_launch_fir_stream_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps, gpud_stream_handle stream); +typedef int (__stdcall *gpud_launch_fir_v2_stream_fn)(const gpud_float2* in, gpud_float2* out, const float* taps, int n, int num_taps, gpud_stream_handle stream); typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps); typedef int (__stdcall *gpud_launch_decimate_stream_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor, gpud_stream_handle stream); typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); @@ -35,6 +36,7 @@ static gpud_launch_freq_shift_stream_fn gpud_p_launch_freq_shift_stream = NULL; static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL; static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL; static gpud_launch_fir_stream_fn gpud_p_launch_fir_stream = NULL; +static gpud_launch_fir_v2_stream_fn gpud_p_launch_fir_v2_stream = NULL; static gpud_launch_fir_fn gpud_p_launch_fir = NULL; static gpud_launch_decimate_stream_fn gpud_p_launch_decimate_stream = NULL; static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; @@ -59,6 +61,7 @@ static int gpud_load_library(const char* path) { gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda"); gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda"); gpud_p_launch_fir_stream = (gpud_launch_fir_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_stream_cuda"); + gpud_p_launch_fir_v2_stream = (gpud_launch_fir_v2_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_v2_stream_cuda"); gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda"); gpud_p_launch_decimate_stream = (gpud_launch_decimate_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_stream_cuda"); gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); @@ -80,6 +83,7 @@ static int gpud_launch_freq_shift_stream(gpud_float2 *in, gpud_float2 *out, int static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) { if (!gpud_p_launch_freq_shift) return -1; return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start); } static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_fm_discrim) return -1; return gpud_p_launch_fm_discrim(in, out, n); } static int gpud_launch_fir_stream(gpud_float2 *in, gpud_float2 *out, int n, int num_taps, gpud_stream_handle stream) { if (!gpud_p_launch_fir_stream) return -1; return gpud_p_launch_fir_stream(in, out, n, num_taps, stream); } +static int gpud_launch_fir_v2_stream(gpud_float2 *in, gpud_float2 *out, const float *taps, int n, int num_taps, gpud_stream_handle stream) { if (!gpud_p_launch_fir_v2_stream) return -1; return gpud_p_launch_fir_v2_stream(in, out, taps, n, num_taps, stream); } static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) { if (!gpud_p_launch_fir) return -1; return gpud_p_launch_fir(in, out, n, num_taps); } static int gpud_launch_decimate_stream(gpud_float2 *in, gpud_float2 *out, int n_out, int factor, gpud_stream_handle stream) { if (!gpud_p_launch_decimate_stream) return -1; return gpud_p_launch_decimate_stream(in, out, n_out, factor, stream); } static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { if (!gpud_p_launch_decimate) return -1; return gpud_p_launch_decimate(in, out, n_out, factor); } @@ -115,6 +119,9 @@ func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int) func bridgeLaunchFIRStream(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int, stream streamHandle) int { return int(C.gpud_launch_fir_stream(in, out, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) } +func bridgeLaunchFIRv2Stream(in *C.gpud_float2, out *C.gpud_float2, taps *C.float, n int, numTaps int, stream streamHandle) int { + return int(C.gpud_launch_fir_v2_stream(in, out, taps, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) +} func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) } func bridgeLaunchDecimateStream(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int, stream streamHandle) int { return int(C.gpud_launch_decimate_stream(in, out, C.int(nOut), C.int(factor), C.gpud_stream_handle(stream)))