diff --git a/internal/demod/gpudemod/gpudemod_windows.go b/internal/demod/gpudemod/gpudemod_windows.go
index d2b173c..07d0f6e 100644
--- a/internal/demod/gpudemod/gpudemod_windows.go
+++ b/internal/demod/gpudemod/gpudemod_windows.go
@@ -18,7 +18,9 @@ typedef int (__stdcall *gpud_stream_sync_fn)(gpud_stream_handle stream);
 typedef int (__stdcall *gpud_upload_fir_taps_fn)(const float* taps, int n);
 typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
 typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n);
+typedef int (__stdcall *gpud_launch_fir_stream_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps, gpud_stream_handle stream); // same arguments as gpud_launch_fir_fn plus a trailing stream handle
 typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
+typedef int (__stdcall *gpud_launch_decimate_stream_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor, gpud_stream_handle stream); // same arguments as gpud_launch_decimate_fn plus a trailing stream handle
 typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
 typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n);
 typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu
index 85f0ffc..f5b741b 100644
--- a/internal/demod/gpudemod/native/exports.cu
+++ b/internal/demod/gpudemod/native/exports.cu
@@ -49,20 +49,31 @@ __global__ void gpud_freq_shift_kernel(
     out[idx].y = v.x * si + v.y * co;
 }
 
-GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
+GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda( // launches gpud_freq_shift_kernel on `stream`; returns 0 when n <= 0, else the cudaGetLastError() code as int
     const float2* in,
     float2* out,
     int n,
     double phase_inc,
-    double phase_start
+    double phase_start,
+    gpud_stream_handle stream // cast to cudaStream_t for the kernel launch below
 ) {
     if (n <= 0) return 0;
     const int block = 256;
     const int grid = (n + block - 1) / block;
-    gpud_freq_shift_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
+    gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start); // third launch argument 0: no dynamic shared memory; launch is enqueued on the given stream
     return (int)cudaGetLastError();
 }
 
+GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda( // stream-less entry point: forwards to gpud_launch_freq_shift_stream_cuda
+    const float2* in,
+    float2* out,
+    int n,
+    double phase_inc,
+    double phase_start
+) {
+    return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0); // stream argument 0 selects the default stream; the result code is passed through unchanged
+}
+
 __global__ void gpud_fm_discrim_kernel(
     const float2* __restrict__ in,
     float* __restrict__ out,