From 2363292f4f6df995767007b01d16b6f661e5dc9e Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Thu, 19 Mar 2026 15:58:54 +0100 Subject: [PATCH] feat: parallelize mixed-bandwidth GPU batch demod --- internal/demod/gpudemod/batch_runner_test.go | 80 ++++++++ .../demod/gpudemod/batch_runner_windows.go | 181 +++++++++++++----- .../demod/gpudemod/build/gpudemod_kernels.exp | Bin 2322 -> 2488 bytes .../demod/gpudemod/build/gpudemod_kernels.lib | Bin 4532 -> 4816 bytes internal/demod/gpudemod/native/exports.cu | 48 +++++ internal/demod/gpudemod/windows_bridge.go | 7 + 6 files changed, 271 insertions(+), 45 deletions(-) create mode 100644 internal/demod/gpudemod/batch_runner_test.go diff --git a/internal/demod/gpudemod/batch_runner_test.go b/internal/demod/gpudemod/batch_runner_test.go new file mode 100644 index 0000000..a6ad799 --- /dev/null +++ b/internal/demod/gpudemod/batch_runner_test.go @@ -0,0 +1,80 @@ +package gpudemod + +import ( + "math" + "math/cmplx" + "testing" + + "sdr-visual-suite/internal/dsp" +) + +func TestMixedBandwidthBatch(t *testing.T) { + if !Available() { + t.Skip("no GPU") + } + sampleRate := 2048000 + n := 2048 + iq := makeSyntheticIQ(n, sampleRate, []float64{50e3, -120e3, 300e3, -80e3}) + jobs := []ExtractJob{ + {OffsetHz: 50e3, BW: 12000, OutRate: 48000}, + {OffsetHz: -120e3, BW: 150000, OutRate: 192000}, + {OffsetHz: 300e3, BW: 3000, OutRate: 48000}, + {OffsetHz: -80e3, BW: 500, OutRate: 48000}, + } + cpuOuts := make([][]complex64, len(jobs)) + for i, job := range jobs { + cpuOuts[i] = extractCPU(iq, sampleRate, job) + } + runner, err := NewBatchRunner(n, sampleRate) + if err != nil { + t.Fatalf("NewBatchRunner: %v", err) + } + defer runner.Close() + gpuOuts, rates, err := runner.ShiftFilterDecimateBatch(iq, jobs) + if err != nil { + t.Fatalf("Batch: %v", err) + } + for i := range jobs { + if !complexSliceClose(cpuOuts[i], gpuOuts[i], 1e-3) { + t.Errorf("job %d: GPU/CPU mismatch (rate=%d)", i, rates[i]) + } + } +} + +func makeSyntheticIQ(n int, sr int, freqs []float64) []complex64 { + iq := make([]complex64, n) + for _, f := range freqs { + for i := range iq { + phase := 2 * math.Pi * f * float64(i) / float64(sr) + iq[i] += complex(float32(math.Cos(phase)), float32(math.Sin(phase))) + } + } + return iq +} + +func extractCPU(iq []complex64, sr int, job ExtractJob) []complex64 { + shifted := dsp.FreqShift(iq, sr, job.OffsetHz) + cutoff := job.BW / 2 + if cutoff < 200 { + cutoff = 200 + } + taps := dsp.LowpassFIR(cutoff, sr, 101) + filtered := dsp.ApplyFIR(shifted, taps) + decim := int(math.Round(float64(sr) / float64(job.OutRate))) + if decim < 1 { + decim = 1 + } + return dsp.Decimate(filtered, decim) +} + +func complexSliceClose(a, b []complex64, tol float64) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if cmplx.Abs(complex128(a[i]-b[i])) > tol { + return false + } + } + return true +} diff --git a/internal/demod/gpudemod/batch_runner_windows.go b/internal/demod/gpudemod/batch_runner_windows.go index be1e958..98299db 100644 --- a/internal/demod/gpudemod/batch_runner_windows.go +++ b/internal/demod/gpudemod/batch_runner_windows.go @@ -2,6 +2,11 @@ package gpudemod +/* +#include +*/ +import "C" + import ( "errors" "math" @@ -10,89 +15,175 @@ import ( "sdr-visual-suite/internal/dsp" ) +type slotBuffers struct { + dShifted unsafe.Pointer + dFiltered unsafe.Pointer + dDecimated unsafe.Pointer + dTaps unsafe.Pointer + stream streamHandle +} + +type windowsBatchRunner struct { + *BatchRunner + slotBufs []slotBuffers +} + +func asWindowsBatchRunner(r *BatchRunner) *windowsBatchRunner { + return (*windowsBatchRunner)(unsafe.Pointer(r)) +} + +func (r *windowsBatchRunner) freeSlotBuffers() { + for i := range r.slotBufs { + if r.slotBufs[i].dShifted != nil { + _ = bridgeCudaFree(r.slotBufs[i].dShifted) + r.slotBufs[i].dShifted = nil + } + if r.slotBufs[i].dFiltered != nil { + _ = bridgeCudaFree(r.slotBufs[i].dFiltered) + r.slotBufs[i].dFiltered = nil + } + if r.slotBufs[i].dDecimated != nil { + _ = bridgeCudaFree(r.slotBufs[i].dDecimated) + r.slotBufs[i].dDecimated = nil + } + if r.slotBufs[i].dTaps != nil { + _ = bridgeCudaFree(r.slotBufs[i].dTaps) + r.slotBufs[i].dTaps = nil + } + if r.slotBufs[i].stream != nil { + _ = bridgeStreamDestroy(r.slotBufs[i].stream) + r.slotBufs[i].stream = nil + } + } + r.slotBufs = nil +} + +func (r *windowsBatchRunner) allocSlotBuffers(n int) error { + if len(r.slotBufs) == len(r.slots) && len(r.slotBufs) > 0 { + return nil + } + r.freeSlotBuffers() + if len(r.slots) == 0 { + return nil + } + iqBytes := uintptr(n) * unsafe.Sizeof(complex64(0)) + tapsBytes := uintptr(256) * unsafe.Sizeof(float32(0)) + r.slotBufs = make([]slotBuffers, len(r.slots)) + for i := range r.slotBufs { + for _, ptr := range []*unsafe.Pointer{&r.slotBufs[i].dShifted, &r.slotBufs[i].dFiltered, &r.slotBufs[i].dDecimated} { + if bridgeCudaMalloc(ptr, iqBytes) != 0 { + r.freeSlotBuffers() + return errors.New("cudaMalloc slot buffer failed") + } + } + if bridgeCudaMalloc(&r.slotBufs[i].dTaps, tapsBytes) != 0 { + r.freeSlotBuffers() + return errors.New("cudaMalloc slot taps failed") + } + s, res := bridgeStreamCreate() + if res != 0 { + r.freeSlotBuffers() + return errors.New("cudaStreamCreate failed") + } + r.slotBufs[i].stream = s + } + return nil +} + func (r *BatchRunner) shiftFilterDecimateBatchImpl(iq []complex64) ([][]complex64, []int, error) { + wr := asWindowsBatchRunner(r) + e := r.eng + if e == nil || !e.cudaReady { + return nil, nil, ErrUnavailable + } outs := make([][]complex64, len(r.slots)) rates := make([]int, len(r.slots)) + n := len(iq) + if n == 0 { + return outs, rates, nil + } + if err := wr.allocSlotBuffers(n); err != nil { + return nil, nil, err + } + bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0)) + if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 { + return nil, nil, errors.New("cudaMemcpy H2D failed") + } for i := range r.slots { if !r.slots[i].active { continue } - out, rate, err := r.shiftFilterDecimateSlot(iq, r.slots[i].job, nil) + nOut, rate, err := r.shiftFilterDecimateSlotParallel(iq, r.slots[i].job, wr.slotBufs[i]) if err != nil { return nil, nil, err } - r.slots[i].out = out r.slots[i].rate = rate - outs[i] = out + outs[i] = make([]complex64, nOut) rates[i] = rate } + for i := range r.slots { + if !r.slots[i].active { + continue + } + buf := wr.slotBufs[i] + if bridgeStreamSync(buf.stream) != 0 { + return nil, nil, errors.New("cuda stream sync failed") + } + out := outs[i] + if len(out) == 0 { + continue + } + outBytes := uintptr(len(out)) * unsafe.Sizeof(complex64(0)) + if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), buf.dDecimated, outBytes) != 0 { + return nil, nil, errors.New("cudaMemcpy D2H failed") + } + r.slots[i].out = out + } return outs, rates, nil } -func (r *BatchRunner) shiftFilterDecimateSlot(iq []complex64, job ExtractJob, stream streamHandle) ([]complex64, int, error) { +func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job ExtractJob, buf slotBuffers) (int, int, error) { e := r.eng if e == nil || !e.cudaReady { - return nil, 0, ErrUnavailable + return 0, 0, ErrUnavailable } - if len(iq) == 0 { - return nil, 0, nil + n := len(iq) + if n == 0 { + return 0, 0, nil } cutoff := job.BW / 2 if cutoff < 200 { cutoff = 200 } - base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101) - taps := make([]float32, len(base64)) - for i, v := range base64 { + base := dsp.LowpassFIR(cutoff, e.sampleRate, 101) + taps := make([]float32, len(base)) + for i, v := range base { taps[i] = float32(v) } if len(taps) == 0 { - return nil, 0, errors.New("no FIR taps configured") + return 0, 0, errors.New("no FIR taps configured") } - e.SetFIR(taps) - if stream == nil { - if bridgeDeviceSync() != 0 { - return nil, 0, errors.New("cudaDeviceSynchronize failed") - } + tapsBytes := uintptr(len(taps)) * unsafe.Sizeof(float32(0)) + if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 { + return 0, 0, errors.New("taps H2D failed") } decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate))) if decim < 1 { decim = 1 } - n := len(iq) nOut := n / decim if nOut <= 0 { - return nil, 0, errors.New("not enough output samples after decimation") - } - bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0)) - if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 { - return nil, 0, errors.New("cudaMemcpy H2D failed") + return 0, 0, errors.New("not enough output samples after decimation") } phaseInc := -2.0 * math.Pi * job.OffsetHz / float64(e.sampleRate) - phaseStart := e.phase - if bridgeLaunchFreqShiftStream(e.dIQIn, e.dShifted, n, phaseInc, phaseStart, stream) != 0 { - return nil, 0, errors.New("gpu freq shift failed") - } - if bridgeLaunchFIRStream(e.dShifted, e.dFiltered, n, len(taps), stream) != 0 { - return nil, 0, errors.New("gpu FIR failed") + if bridgeLaunchFreqShiftStream(e.dIQIn, (*gpuFloat2)(buf.dShifted), n, phaseInc, e.phase, buf.stream) != 0 { + return 0, 0, errors.New("gpu freq shift failed") } - if bridgeLaunchDecimateStream(e.dFiltered, e.dDecimated, nOut, decim, stream) != 0 { - return nil, 0, errors.New("gpu decimate failed") - } - if stream != nil { - if bridgeStreamSync(stream) != 0 { - return nil, 0, errors.New("cuda stream sync failed") - } - } else { - if bridgeDeviceSync() != 0 { - return nil, 0, errors.New("cudaDeviceSynchronize failed") - } + if bridgeLaunchFIRv2Stream((*gpuFloat2)(buf.dShifted), (*gpuFloat2)(buf.dFiltered), (*C.float)(buf.dTaps), n, len(taps), buf.stream) != 0 { + return 0, 0, errors.New("gpu FIR v2 failed") } - out := make([]complex64, nOut) - outBytes := uintptr(nOut) * unsafe.Sizeof(complex64(0)) - if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != 0 { - return nil, 0, errors.New("cudaMemcpy D2H failed") + if bridgeLaunchDecimateStream((*gpuFloat2)(buf.dFiltered), (*gpuFloat2)(buf.dDecimated), nOut, decim, buf.stream) != 0 { + return 0, 0, errors.New("gpu decimate failed") } - e.phase = phaseStart + phaseInc*float64(n) - return out, e.sampleRate / decim, nil + return nOut, e.sampleRate / decim, nil } diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.exp b/internal/demod/gpudemod/build/gpudemod_kernels.exp index de59c2015e30844c49b4becf6b0ce16c7f238362..0239b603fc0b6b55506f9bc08388649c47811eb3 100644 GIT binary patch literal 2488 zcmeHI&u`R56n>ix5TJwv+VZ`CA{n z!**^vfCZ*ilynkjeSqET5`e^hzRze!v1{P3BGpp5Ji1_1fd4SwEN3IrH(=;IF}g<< z!Rg?O^?#;)n!W^mP=4~NeV;|!chbOJv7MH! zf>q^*k+R)XIz2(DT(9LMn?#ykTudi%wQlPqRyb{8wr0zgo_1qzvskQtwe^bEOg6c0 zBevDlS})mA9Lm(~zCO#w{LvF%yV8nN<8iITE$xY;soMixE9d!)I!Vn^s1**$3Lx8 zFHyn^5-dx{7hIzvD^{_TdcN!oJGHCY5uAeAVJh#JbRTYmxb%NgfUBMxhdOK~*4WSE z*4$6?SKv?haTwe+`JoE})mD^yJr0*+r)6F8e5I|i_Vm=a*qHtnZ^2qfQt5@FUaifD z^R(a_Vd!gd>w+-k>H^Z*WTie+nW)w#Cd85#h|8f)7N;gf8~?r++^NtV;aivTF$tXLuKESvtd#;{y654 zA-2fLtVt&h?Vfznomw#TAxt9Ua8h=)G4^2$k$<{FnZXZWjC|A`W24j4Br|bfQ^Pmh zuoYE2iM3Ut1HKH6CLSb}YIO)#Yn1NK3BABFbbmPLZIwaan|E6HlAqwnQ21i_7pTD#p_oaOgMaDQo delta 799 zcmZXS%P#{_6o*eAqb-VRJzF9)B;wIhR6_!VZ9Z-#@eJ4-b5pyl~WDq z;xq=@IQhUlP7`1{rvRAGX&RJwH&GBQ;3T(}$*B!2WO3{QWfvw| z2D3SJgKkauIyeo0C7d?FTux^24cFOq%yeq7lvB_FC7!J%qDjdUQN<}H z6}M=&Whxa|vxrN@>yc*=SzZlItt8^%nWd>YFcHSSf)1N_R@@%h{O>=kY((d%49hqH zacpyojU78o!T@bjDBT;>0qVT*M}{!bgVM*QOX%BiqwTHL^Xe cRP&%sF{(Vs3z_|7?f?J) diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib index fb57b526703c6b343fc062624df27a3cb48e32c5..2bed4d2afe1ed22b487f251ee2076ab7a18db649 100644 GIT binary patch delta 528 zcmZvY&nrYx6vw~w%)IBlH}A)=ksp~E5h=q%OqpUw5gTJOHKUkPMhq#5$;QHp&R-xq zE7QnEEG#TU*{Q)aC7LN^X~}us#Kx^p=YGHUobLDD)0X?z{ct8c=yz5^0nh+QML_8S z)Fr?yIrtL54|%SPQK%5(%INIb)J!>OPk@acb+i6X3{1eADhUY1O+j@|$c#0Xj6O@Ipp4Bp7hR0};x9J_X!ozp4DxDHypocaN)Lab&g{X;M^j5iu(@ z4Oo@$^xOaW^##oBF1~J-`uOH5I?!t6bt}HQmY$zYQPvoz4|Z;rt=k?xcay4bsDb{A zM_YMVkBD^?5oJ2jqw>tr*5@GmA{j=;yRlckwOC1c7RrXdK$bR?dL>(13)EPl zyO@MDi&>2**-I(ePP{XttepDw)bBax?LCFgi}wBD?O3d}dNKxpMnFpd`XOKx0XJVI z7zZl&oQ#wE#p7gbc4fps6`Svuv2K1D<7C`qnRI$;W^Q&Nm5xs=(zBk*Mx)P1}NA zwP>Xqe|mWu99E5M!_M3p!U?=-#cP zjEGXB2JGm8*ct5R KwI;d@)&Br?GGcK6 diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu index f5b741b..3a7e88b 100644 --- a/internal/demod/gpudemod/native/exports.cu +++ b/internal/demod/gpudemod/native/exports.cu @@ -170,6 +170,54 @@ GPUD_API int GPUD_CALL gpud_launch_fir_cuda( return (int)cudaGetLastError(); } +__global__ void gpud_fir_kernel_v2( + const float2* __restrict__ in, + float2* __restrict__ out, + const float* __restrict__ taps, + int n, + int num_taps +) { + extern __shared__ float2 s_data[]; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int lid = threadIdx.x; + int halo = num_taps - 1; + + if (gid < n) s_data[lid + halo] = in[gid]; + else s_data[lid + halo] = make_float2(0.0f, 0.0f); + + if (lid < halo) { + int src = gid - halo; + s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f); + } + __syncthreads(); + if (gid >= n) return; + + float acc_r = 0.0f, acc_i = 0.0f; + for (int k = 0; k < num_taps; ++k) { + float2 v = s_data[lid + halo - k]; + float t = taps[k]; + acc_r += v.x * t; + acc_i += v.y * t; + } + out[gid] = make_float2(acc_r, acc_i); +} + +GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda( + const float2* in, + float2* out, + const float* taps, + int n, + int num_taps, + gpud_stream_handle stream +) { + if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2); + gpud_fir_kernel_v2<<>>(in, out, taps, n, num_taps); + return (int)cudaGetLastError(); +} + GPUD_API int GPUD_CALL gpud_launch_decimate_cuda( const float2* in, float2* out, diff --git a/internal/demod/gpudemod/windows_bridge.go b/internal/demod/gpudemod/windows_bridge.go index 2cf33cd..3371be7 100644 --- a/internal/demod/gpudemod/windows_bridge.go +++ b/internal/demod/gpudemod/windows_bridge.go @@ -20,6 +20,7 @@ typedef int (__stdcall *gpud_launch_freq_shift_stream_fn)(const gpud_float2* in, typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start); typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n); typedef int (__stdcall *gpud_launch_fir_stream_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps, gpud_stream_handle stream); +typedef int (__stdcall *gpud_launch_fir_v2_stream_fn)(const gpud_float2* in, gpud_float2* out, const float* taps, int n, int num_taps, gpud_stream_handle stream); typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps); typedef int (__stdcall *gpud_launch_decimate_stream_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor, gpud_stream_handle stream); typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); @@ -35,6 +36,7 @@ static gpud_launch_freq_shift_stream_fn gpud_p_launch_freq_shift_stream = NULL; static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL; static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL; static gpud_launch_fir_stream_fn gpud_p_launch_fir_stream = NULL; +static gpud_launch_fir_v2_stream_fn gpud_p_launch_fir_v2_stream = NULL; static gpud_launch_fir_fn gpud_p_launch_fir = NULL; static gpud_launch_decimate_stream_fn gpud_p_launch_decimate_stream = NULL; static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; @@ -59,6 +61,7 @@ static int gpud_load_library(const char* path) { gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda"); gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda"); gpud_p_launch_fir_stream = (gpud_launch_fir_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_stream_cuda"); + gpud_p_launch_fir_v2_stream = (gpud_launch_fir_v2_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_v2_stream_cuda"); gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda"); gpud_p_launch_decimate_stream = (gpud_launch_decimate_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_stream_cuda"); gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); @@ -80,6 +83,7 @@ static int gpud_launch_freq_shift_stream(gpud_float2 *in, gpud_float2 *out, int static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) { if (!gpud_p_launch_freq_shift) return -1; return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start); } static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_fm_discrim) return -1; return gpud_p_launch_fm_discrim(in, out, n); } static int gpud_launch_fir_stream(gpud_float2 *in, gpud_float2 *out, int n, int num_taps, gpud_stream_handle stream) { if (!gpud_p_launch_fir_stream) return -1; return gpud_p_launch_fir_stream(in, out, n, num_taps, stream); } +static int gpud_launch_fir_v2_stream(gpud_float2 *in, gpud_float2 *out, const float *taps, int n, int num_taps, gpud_stream_handle stream) { if (!gpud_p_launch_fir_v2_stream) return -1; return gpud_p_launch_fir_v2_stream(in, out, taps, n, num_taps, stream); } static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) { if (!gpud_p_launch_fir) return -1; return gpud_p_launch_fir(in, out, n, num_taps); } static int gpud_launch_decimate_stream(gpud_float2 *in, gpud_float2 *out, int n_out, int factor, gpud_stream_handle stream) { if (!gpud_p_launch_decimate_stream) return -1; return gpud_p_launch_decimate_stream(in, out, n_out, factor, stream); } static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { if (!gpud_p_launch_decimate) return -1; return gpud_p_launch_decimate(in, out, n_out, factor); } @@ -115,6 +119,9 @@ func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int) func bridgeLaunchFIRStream(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int, stream streamHandle) int { return int(C.gpud_launch_fir_stream(in, out, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) } +func bridgeLaunchFIRv2Stream(in *C.gpud_float2, out *C.gpud_float2, taps *C.float, n int, numTaps int, stream streamHandle) int { + return int(C.gpud_launch_fir_v2_stream(in, out, taps, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) +} func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) } func bridgeLaunchDecimateStream(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int, stream streamHandle) int { return int(C.gpud_launch_decimate_stream(in, out, C.int(nOut), C.int(factor), C.gpud_stream_handle(stream)))