From 2363292f4f6df995767007b01d16b6f661e5dc9e Mon Sep 17 00:00:00 2001
From: Jan Svabenik <jan@svabi.ch>
Date: Thu, 19 Mar 2026 15:58:54 +0100
Subject: [PATCH] feat: parallelize mixed-bandwidth GPU batch demod

---
 internal/demod/gpudemod/batch_runner_test.go  |  80 ++++++++
 .../demod/gpudemod/batch_runner_windows.go    | 181 +++++++++++++-----
 .../demod/gpudemod/build/gpudemod_kernels.exp | Bin 2322 -> 2488 bytes
 .../demod/gpudemod/build/gpudemod_kernels.lib | Bin 4532 -> 4816 bytes
 internal/demod/gpudemod/native/exports.cu     |  48 +++++
 internal/demod/gpudemod/windows_bridge.go     |   7 +
 6 files changed, 271 insertions(+), 45 deletions(-)
 create mode 100644 internal/demod/gpudemod/batch_runner_test.go

diff --git a/internal/demod/gpudemod/batch_runner_test.go b/internal/demod/gpudemod/batch_runner_test.go
new file mode 100644
index 0000000..a6ad799
--- /dev/null
+++ b/internal/demod/gpudemod/batch_runner_test.go
@@ -0,0 +1,80 @@
+package gpudemod
+
+import (
+	"math"
+	"math/cmplx"
+	"testing"
+
+	"sdr-visual-suite/internal/dsp"
+)
+
+func TestMixedBandwidthBatch(t *testing.T) {
+	if !Available() {
+		t.Skip("no GPU")
+	}
+	sampleRate := 2048000
+	n := 2048
+	iq := makeSyntheticIQ(n, sampleRate, []float64{50e3, -120e3, 300e3, -80e3})
+	jobs := []ExtractJob{
+		{OffsetHz: 50e3, BW: 12000, OutRate: 48000},
+		{OffsetHz: -120e3, BW: 150000, OutRate: 192000},
+		{OffsetHz: 300e3, BW: 3000, OutRate: 48000},
+		{OffsetHz: -80e3, BW: 500, OutRate: 48000},
+	}
+	cpuOuts := make([][]complex64, len(jobs))
+	for i, job := range jobs {
+		cpuOuts[i] = extractCPU(iq, sampleRate, job)
+	}
+	runner, err := NewBatchRunner(n, sampleRate)
+	if err != nil {
+		t.Fatalf("NewBatchRunner: %v", err)
+	}
+	defer runner.Close()
+	gpuOuts, rates, err := runner.ShiftFilterDecimateBatch(iq, jobs)
+	if err != nil {
+		t.Fatalf("Batch: %v", err)
+	}
+	for i := range jobs {
+		if !complexSliceClose(cpuOuts[i], gpuOuts[i], 1e-3) {
+			t.Errorf("job %d: GPU/CPU mismatch (rate=%d)", i, rates[i])
+		}
+	}
+}
+
+func makeSyntheticIQ(n int, sr int, freqs []float64) []complex64 {
+	iq := make([]complex64, n)
+	for _, f := range freqs {
+		for i := range iq {
+			phase := 2 * math.Pi * f * float64(i) / float64(sr)
+			iq[i] += complex(float32(math.Cos(phase)), float32(math.Sin(phase)))
+		}
+	}
+	return iq
+}
+
+func extractCPU(iq []complex64, sr int, job ExtractJob) []complex64 {
+	shifted := dsp.FreqShift(iq, sr, job.OffsetHz)
+	cutoff := job.BW / 2
+	if cutoff < 200 {
+		cutoff = 200
+	}
+	taps := dsp.LowpassFIR(cutoff, sr, 101)
+	filtered := dsp.ApplyFIR(shifted, taps)
+	decim := int(math.Round(float64(sr) / float64(job.OutRate)))
+	if decim < 1 {
+		decim = 1
+	}
+	return dsp.Decimate(filtered, decim)
+}
+
+func complexSliceClose(a, b []complex64, tol float64) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if cmplx.Abs(complex128(a[i]-b[i])) > tol {
+			return false
+		}
+	}
+	return true
+}
diff --git a/internal/demod/gpudemod/batch_runner_windows.go b/internal/demod/gpudemod/batch_runner_windows.go
index be1e958..98299db 100644
--- a/internal/demod/gpudemod/batch_runner_windows.go
+++ b/internal/demod/gpudemod/batch_runner_windows.go
@@ -2,6 +2,11 @@
 
 package gpudemod
 
+/*
+#include <cuda_runtime.h>
+*/
+import "C"
+
 import (
 	"errors"
 	"math"
@@ -10,89 +15,175 @@ import (
 	"sdr-visual-suite/internal/dsp"
 )
 
+type slotBuffers struct {
+	dShifted   unsafe.Pointer
+	dFiltered  unsafe.Pointer
+	dDecimated unsafe.Pointer
+	dTaps      unsafe.Pointer
+	stream     streamHandle
+}
+
+type windowsBatchRunner struct {
+	*BatchRunner
+	slotBufs []slotBuffers
+}
+
+func asWindowsBatchRunner(r *BatchRunner) *windowsBatchRunner {
+	return (*windowsBatchRunner)(unsafe.Pointer(r))
+}
+
+func (r *windowsBatchRunner) freeSlotBuffers() {
+	for i := range r.slotBufs {
+		if r.slotBufs[i].dShifted != nil {
+			_ = bridgeCudaFree(r.slotBufs[i].dShifted)
+			r.slotBufs[i].dShifted = nil
+		}
+		if r.slotBufs[i].dFiltered != nil {
+			_ = bridgeCudaFree(r.slotBufs[i].dFiltered)
+			r.slotBufs[i].dFiltered = nil
+		}
+		if r.slotBufs[i].dDecimated != nil {
+			_ = bridgeCudaFree(r.slotBufs[i].dDecimated)
+			r.slotBufs[i].dDecimated = nil
+		}
+		if r.slotBufs[i].dTaps != nil {
+			_ = bridgeCudaFree(r.slotBufs[i].dTaps)
+			r.slotBufs[i].dTaps = nil
+		}
+		if r.slotBufs[i].stream != nil {
+			_ = bridgeStreamDestroy(r.slotBufs[i].stream)
+			r.slotBufs[i].stream = nil
+		}
+	}
+	r.slotBufs = nil
+}
+
+func (r *windowsBatchRunner) allocSlotBuffers(n int) error {
+	if len(r.slotBufs) == len(r.slots) && len(r.slotBufs) > 0 {
+		return nil
+	}
+	r.freeSlotBuffers()
+	if len(r.slots) == 0 {
+		return nil
+	}
+	iqBytes := uintptr(n) * unsafe.Sizeof(complex64(0))
+	tapsBytes := uintptr(256) * unsafe.Sizeof(float32(0))
+	r.slotBufs = make([]slotBuffers, len(r.slots))
+	for i := range r.slotBufs {
+		for _, ptr := range []*unsafe.Pointer{&r.slotBufs[i].dShifted, &r.slotBufs[i].dFiltered, &r.slotBufs[i].dDecimated} {
+			if bridgeCudaMalloc(ptr, iqBytes) != 0 {
+				r.freeSlotBuffers()
+				return errors.New("cudaMalloc slot buffer failed")
+			}
+		}
+		if bridgeCudaMalloc(&r.slotBufs[i].dTaps, tapsBytes) != 0 {
+			r.freeSlotBuffers()
+			return errors.New("cudaMalloc slot taps failed")
+		}
+		s, res := bridgeStreamCreate()
+		if res != 0 {
+			r.freeSlotBuffers()
+			return errors.New("cudaStreamCreate failed")
+		}
+		r.slotBufs[i].stream = s
+	}
+	return nil
+}
+
 func (r *BatchRunner) shiftFilterDecimateBatchImpl(iq []complex64) ([][]complex64, []int, error) {
+	wr := asWindowsBatchRunner(r)
+	e := r.eng
+	if e == nil || !e.cudaReady {
+		return nil, nil, ErrUnavailable
+	}
 	outs := make([][]complex64, len(r.slots))
 	rates := make([]int, len(r.slots))
+	n := len(iq)
+	if n == 0 {
+		return outs, rates, nil
+	}
+	if err := wr.allocSlotBuffers(n); err != nil {
+		return nil, nil, err
+	}
+	bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0))
+	if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 {
+		return nil, nil, errors.New("cudaMemcpy H2D failed")
+	}
 	for i := range r.slots {
 		if !r.slots[i].active {
 			continue
 		}
-		out, rate, err := r.shiftFilterDecimateSlot(iq, r.slots[i].job, nil)
+		nOut, rate, err := r.shiftFilterDecimateSlotParallel(iq, r.slots[i].job, wr.slotBufs[i])
 		if err != nil {
 			return nil, nil, err
 		}
-		r.slots[i].out = out
 		r.slots[i].rate = rate
-		outs[i] = out
+		outs[i] = make([]complex64, nOut)
 		rates[i] = rate
 	}
+	for i := range r.slots {
+		if !r.slots[i].active {
+			continue
+		}
+		buf := wr.slotBufs[i]
+		if bridgeStreamSync(buf.stream) != 0 {
+			return nil, nil, errors.New("cuda stream sync failed")
+		}
+		out := outs[i]
+		if len(out) == 0 {
+			continue
+		}
+		outBytes := uintptr(len(out)) * unsafe.Sizeof(complex64(0))
+		if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), buf.dDecimated, outBytes) != 0 {
+			return nil, nil, errors.New("cudaMemcpy D2H failed")
+		}
+		r.slots[i].out = out
+	}
 	return outs, rates, nil
 }
 
-func (r *BatchRunner) shiftFilterDecimateSlot(iq []complex64, job ExtractJob, stream streamHandle) ([]complex64, int, error) {
+func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job ExtractJob, buf slotBuffers) (int, int, error) {
 	e := r.eng
 	if e == nil || !e.cudaReady {
-		return nil, 0, ErrUnavailable
+		return 0, 0, ErrUnavailable
 	}
-	if len(iq) == 0 {
-		return nil, 0, nil
+	n := len(iq)
+	if n == 0 {
+		return 0, 0, nil
 	}
 	cutoff := job.BW / 2
 	if cutoff < 200 {
 		cutoff = 200
 	}
-	base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
-	taps := make([]float32, len(base64))
-	for i, v := range base64 {
+	base := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
+	taps := make([]float32, len(base))
+	for i, v := range base {
 		taps[i] = float32(v)
 	}
 	if len(taps) == 0 {
-		return nil, 0, errors.New("no FIR taps configured")
+		return 0, 0, errors.New("no FIR taps configured")
 	}
-	e.SetFIR(taps)
-	if stream == nil {
-		if bridgeDeviceSync() != 0 {
-			return nil, 0, errors.New("cudaDeviceSynchronize failed")
-		}
+	tapsBytes := uintptr(len(taps)) * unsafe.Sizeof(float32(0))
+	if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 {
+		return 0, 0, errors.New("taps H2D failed")
 	}
 	decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate)))
 	if decim < 1 {
 		decim = 1
 	}
-	n := len(iq)
 	nOut := n / decim
 	if nOut <= 0 {
-		return nil, 0, errors.New("not enough output samples after decimation")
-	}
-	bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0))
-	if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 {
-		return nil, 0, errors.New("cudaMemcpy H2D failed")
+		return 0, 0, errors.New("not enough output samples after decimation")
 	}
 	phaseInc := -2.0 * math.Pi * job.OffsetHz / float64(e.sampleRate)
-	phaseStart := e.phase
-	if bridgeLaunchFreqShiftStream(e.dIQIn, e.dShifted, n, phaseInc, phaseStart, stream) != 0 {
-		return nil, 0, errors.New("gpu freq shift failed")
-	}
-	if bridgeLaunchFIRStream(e.dShifted, e.dFiltered, n, len(taps), stream) != 0 {
-		return nil, 0, errors.New("gpu FIR failed")
+	if bridgeLaunchFreqShiftStream(e.dIQIn, (*gpuFloat2)(buf.dShifted), n, phaseInc, e.phase, buf.stream) != 0 {
+		return 0, 0, errors.New("gpu freq shift failed")
 	}
-	if bridgeLaunchDecimateStream(e.dFiltered, e.dDecimated, nOut, decim, stream) != 0 {
-		return nil, 0, errors.New("gpu decimate failed")
-	}
-	if stream != nil {
-		if bridgeStreamSync(stream) != 0 {
-			return nil, 0, errors.New("cuda stream sync failed")
-		}
-	} else {
-		if bridgeDeviceSync() != 0 {
-			return nil, 0, errors.New("cudaDeviceSynchronize failed")
-		}
+	if bridgeLaunchFIRv2Stream((*gpuFloat2)(buf.dShifted), (*gpuFloat2)(buf.dFiltered), (*C.float)(buf.dTaps), n, len(taps), buf.stream) != 0 {
+		return 0, 0, errors.New("gpu FIR v2 failed")
 	}
-	out := make([]complex64, nOut)
-	outBytes := uintptr(nOut) * unsafe.Sizeof(complex64(0))
-	if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != 0 {
-		return nil, 0, errors.New("cudaMemcpy D2H failed")
+	if bridgeLaunchDecimateStream((*gpuFloat2)(buf.dFiltered), (*gpuFloat2)(buf.dDecimated), nOut, decim, buf.stream) != 0 {
+		return 0, 0, errors.New("gpu decimate failed")
 	}
-	e.phase = phaseStart + phaseInc*float64(n)
-	return out, e.sampleRate / decim, nil
+	return nOut, e.sampleRate / decim, nil
 }
diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.exp b/internal/demod/gpudemod/build/gpudemod_kernels.exp
index de59c2015e30844c49b4becf6b0ce16c7f238362..0239b603fc0b6b55506f9bc08388649c47811eb3 100644
GIT binary patch
literal 2488
zcmeHI&u`R56n>ix5TJwv+VZ<GG)RdktKCfqNt=olAVft7r~#D2NXGHZCN5sv_1MYs
z<G=w`mHH2G<kTK|=FA<53iZ~%010vFl?%N0Cf@AYi3Ar;jCQ~I=6lcM8T)->`CA{n
z!**^vfCZ*ilynkjeSqET5`e^hzRze!v1{P3BGpp5Ji1_1fd4SwEN3IrH(=;IF}g<<
z!Rg?O^?#;)n!W^mP=<ck2HRl=48Ts<1-oH6N~LOr(!QhOK>4~NeV;|!chbOJv7MH!
zf>q^*k+R)XIz2(DT(9LMn?#ykTudi%wQlPqRyb{8wr0zgo_1qzvskQtwe^bEOg6c0
zBevDlS})mA9Lm(~zCO#w<s`~%Y+aHHNqDzM({}^c2x;Vpj<jhzi4$qFB6_R+_%8$1
z$TnscA{G~Fqli10jUj%?Y!PvQ*>{LvF%yV8nN<<LW_BBK7qdyk1I(x<cQcztJjiSX
zkycTLS;Rxke8fG>8i<FPMTmpUE+QUbmLTqBb{Ua&P=+?*5VJYNW6T~P4l}!s_zkm%
zi2In`KpbKA2ys8Ne#HOr{{kK2K#4Af3*U>ITE$xY;soMixE9d!)I!Vn^s1**$3Lx8
zFHyn^5-dx{7hIzvD^{_TdcN!oJGHCY5uAeAVJh#JbRTYmxb%NgfUBMxhdOK~*4WSE
z*4$6?SKv?haTwe+`JoE})mD^yJr0*+r)6F8e5I|i_Vm=a*qHtnZ^2qfQt5@FUaifD
z^R(a_Vd!gd>w+-k>H^Z*WTie+nW)w#Cd85#h|8f)7N;gf8~?r++^NtV;aiv<rjvsn
zz?R0eFbSV=Pg1Wzk6;F!QQgyYjCvA-^se&3oMBI38Wnd>TF$tXLuKESvtd#;{y654
zA-2fLtVt&h?Vfznomw#TAxt9Ua8h=)G4^2$k$<{FnZXZWjC|A`W24j4Br|bfQ^Pmh
zuoYE2iM3Ut1HKH6CLSb}YIO)#Yn1NK3BABFbbmPLZIwaan|E6HlAqwn<iyL-Dh^<*
zOEFn=WMnyO*J26qdq)|auVSmg)0|R=e{;$NyvZq(P{L`=%x0c5VC9r4n9V6?VKJvn
z!~L8x1CM)^S{44zDK&VOQ|j=tqu^5cV5OcKJvD&ij0TMILqVY<Dr+Vu%!b$-<&T2$
jS3!ARP+k?3w*_SocaSx6uaO>Q21i_7pTD#p_oaOgMaDQo

delta 799
zcmZXS%P#{_6o*eAqb-VRJzF9)B;wIhR6<x-Sa?4Y5#rURB(zc$B$6T`mg**U*s%~B
zHZ~S^{s&g}h?Ru}zk9|A<D~Q5-~DbgbI+Z*aUSN{LwiR0L_I1*r73$s?VW)rL`0QF
z09A~u6TiApcy=}79hOek71pa$gH=ifCK;*ih182nm<>_!VZ9Z-#@eJ4-b5pyl~WDq
z;xq=@IQhUlP7`1{rvRAGX&RJwH&GBQ;3T(}$*B!2<TMZ3Idy_XoEB#>WO3{QWfvw|
z2D3SJgKka<Fo#nQ=;5>uIyeo0C7d?FTux^24cFOq%yeq7lvB_FC7!J%qDjdUQN<}H
z6}M=&Whxa|vxrN@>yc*=SzZlItt8^%nWd>YFcHSSf)1N_R@@%h{O>=kY((d%49hqH
zacpyoj<f;8kvK@JNXj$CTS7ze$Xb6KN3!)}ynQ~DpDuK!f$nvtk)CuWK<_%!L{2<N
zeU^IXW~v|8sTLa2nIOe<rj>U78o!T@bjDBT;>0qVT*M}{!bgVM*QOX%BiqwTHL^Xe
cRP&%sF{<YEpW)uVt&xhcqlnnGj>(Vs3z_|7?f?J)

diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib
index fb57b526703c6b343fc062624df27a3cb48e32c5..2bed4d2afe1ed22b487f251ee2076ab7a18db649 100644
GIT binary patch
delta 528
zcmZvY&nrYx6vw~w%)IBlH}A)=ksp~E5h=q%OqpUw5gTJOHKUkPMhq#5$;QHp&R-xq
zE7QnEEG#TU*{Q)aC7LN^X~}us#Kx^p=YGHUobLDD)0X?z{ct8c=yz5^0nh+QML_8S
z)Fr?yIrtL54|%SPQK%5(%INIb)J!>OPk@acb<nuS#>+i6X3{1eADhUY1O+j<kdCig
zbnL44IJ-MnGrbEs6@{Bjxce`m0iY-WU7U4E1xhB6@Bw+Iz=ts=uLDyxz>@|<c^~7N
z_X0`o<>$c#0Xj6O@Ipp4Bp7hR0};x9J_X!ozp4DxDHypocaN)Lab&g{X;M^j5iu(@
z4Oo@$^xOaW^##oBF1~J-`uOH5I?!t6bt}HQmY$zYQPvoz4|Z;rt=k?xcay4bsDb{A
zM_YMVkBD^?5oJ2jqw>tr*5@GmA{j=;yRlckwOC1<orbFdbZ$(m!Nc-HJDb%!y2GQr
m=Cpi0wEfb-o=gYkihTJNDzvduNcE-U-K){(5G$2-e7)b-jdQ91

delta 486
zcmZ{f%_~Gv7{;G#?uYw5XJ&{{<RdmTgJK~!M#f0KhK0>c7RrXdK$bR?dL>(13)EPl
zyO@MDi&>2**-I(ePP{XttepDw)bBax?LCFgi}wBD?O3d}dNKxpMnFpd`XOKx0XJVI
z7zZl&oQ#wE#p7gbc4fps6`Svuv2K1D<7C`qnRI$;W^Q&Nm5xs=(zBk*Mx<MbK4gbB
zREsjILnS#xd0EhUU~hY<|8GNwS4?QYp$-&z-@FUNgMiQExbzs`1_lIs&KnCDW0HK2
zzXl04uwlW3j#_ANAVYx);~yJ9nzrQA?406B*)#P~qg^**CYr-9QJxlnOvh>)P1}NA
zwP>Xqe|mW<cQSXr)Ii^oPI-6P{JxucA7Zamd<RY16J*#C>u99E5M!_M3p!U?=-#cP
zjEGXB2<zU;)k8Oz8V!m~B`39h_E1!W=~}FsC%rqRF79+5=xOyd8d@^f>JGm8*ct5R
KwI;d@)&Br?GGcK6

diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu
index f5b741b..3a7e88b 100644
--- a/internal/demod/gpudemod/native/exports.cu
+++ b/internal/demod/gpudemod/native/exports.cu
@@ -170,6 +170,54 @@ GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
     return (int)cudaGetLastError();
 }
 
+__global__ void gpud_fir_kernel_v2(
+    const float2* __restrict__ in,
+    float2* __restrict__ out,
+    const float* __restrict__ taps,
+    int n,
+    int num_taps
+) {
+    extern __shared__ float2 s_data[];
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    int lid = threadIdx.x;
+    int halo = num_taps - 1;
+
+    if (gid < n) s_data[lid + halo] = in[gid];
+    else s_data[lid + halo] = make_float2(0.0f, 0.0f);
+
+    if (lid < halo) {
+        int src = gid - halo;
+        s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
+    }
+    __syncthreads();
+    if (gid >= n) return;
+
+    float acc_r = 0.0f, acc_i = 0.0f;
+    for (int k = 0; k < num_taps; ++k) {
+        float2 v = s_data[lid + halo - k];
+        float t = taps[k];
+        acc_r += v.x * t;
+        acc_i += v.y * t;
+    }
+    out[gid] = make_float2(acc_r, acc_i);
+}
+
+GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda(
+    const float2* in,
+    float2* out,
+    const float* taps,
+    int n,
+    int num_taps,
+    gpud_stream_handle stream
+) {
+    if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
+    const int block = 256;
+    const int grid = (n + block - 1) / block;
+    size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
+    gpud_fir_kernel_v2<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, taps, n, num_taps);
+    return (int)cudaGetLastError();
+}
+
 GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
     const float2* in,
     float2* out,
diff --git a/internal/demod/gpudemod/windows_bridge.go b/internal/demod/gpudemod/windows_bridge.go
index 2cf33cd..3371be7 100644
--- a/internal/demod/gpudemod/windows_bridge.go
+++ b/internal/demod/gpudemod/windows_bridge.go
@@ -20,6 +20,7 @@ typedef int (__stdcall *gpud_launch_freq_shift_stream_fn)(const gpud_float2* in,
 typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
 typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n);
 typedef int (__stdcall *gpud_launch_fir_stream_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps, gpud_stream_handle stream);
+typedef int (__stdcall *gpud_launch_fir_v2_stream_fn)(const gpud_float2* in, gpud_float2* out, const float* taps, int n, int num_taps, gpud_stream_handle stream);
 typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
 typedef int (__stdcall *gpud_launch_decimate_stream_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor, gpud_stream_handle stream);
 typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
@@ -35,6 +36,7 @@ static gpud_launch_freq_shift_stream_fn gpud_p_launch_freq_shift_stream = NULL;
 static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL;
 static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL;
 static gpud_launch_fir_stream_fn gpud_p_launch_fir_stream = NULL;
+static gpud_launch_fir_v2_stream_fn gpud_p_launch_fir_v2_stream = NULL;
 static gpud_launch_fir_fn gpud_p_launch_fir = NULL;
 static gpud_launch_decimate_stream_fn gpud_p_launch_decimate_stream = NULL;
 static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL;
@@ -59,6 +61,7 @@ static int gpud_load_library(const char* path) {
 	gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda");
 	gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda");
 	gpud_p_launch_fir_stream = (gpud_launch_fir_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_stream_cuda");
+	gpud_p_launch_fir_v2_stream = (gpud_launch_fir_v2_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_v2_stream_cuda");
 	gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda");
 	gpud_p_launch_decimate_stream = (gpud_launch_decimate_stream_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_stream_cuda");
 	gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda");
@@ -80,6 +83,7 @@ static int gpud_launch_freq_shift_stream(gpud_float2 *in, gpud_float2 *out, int
 static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) { if (!gpud_p_launch_freq_shift) return -1; return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start); }
 static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_fm_discrim) return -1; return gpud_p_launch_fm_discrim(in, out, n); }
 static int gpud_launch_fir_stream(gpud_float2 *in, gpud_float2 *out, int n, int num_taps, gpud_stream_handle stream) { if (!gpud_p_launch_fir_stream) return -1; return gpud_p_launch_fir_stream(in, out, n, num_taps, stream); }
+static int gpud_launch_fir_v2_stream(gpud_float2 *in, gpud_float2 *out, const float *taps, int n, int num_taps, gpud_stream_handle stream) { if (!gpud_p_launch_fir_v2_stream) return -1; return gpud_p_launch_fir_v2_stream(in, out, taps, n, num_taps, stream); }
 static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) { if (!gpud_p_launch_fir) return -1; return gpud_p_launch_fir(in, out, n, num_taps); }
 static int gpud_launch_decimate_stream(gpud_float2 *in, gpud_float2 *out, int n_out, int factor, gpud_stream_handle stream) { if (!gpud_p_launch_decimate_stream) return -1; return gpud_p_launch_decimate_stream(in, out, n_out, factor, stream); }
 static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { if (!gpud_p_launch_decimate) return -1; return gpud_p_launch_decimate(in, out, n_out, factor); }
@@ -115,6 +119,9 @@ func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int)
 func bridgeLaunchFIRStream(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int, stream streamHandle) int {
 	return int(C.gpud_launch_fir_stream(in, out, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream)))
 }
+func bridgeLaunchFIRv2Stream(in *C.gpud_float2, out *C.gpud_float2, taps *C.float, n int, numTaps int, stream streamHandle) int {
+	return int(C.gpud_launch_fir_v2_stream(in, out, taps, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream)))
+}
 func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) }
 func bridgeLaunchDecimateStream(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int, stream streamHandle) int {
 	return int(C.gpud_launch_decimate_stream(in, out, C.int(nOut), C.int(factor), C.gpud_stream_handle(stream)))