diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu
index 64269a8..2114b65 100644
--- a/internal/demod/gpudemod/native/exports.cu
+++ b/internal/demod/gpudemod/native/exports.cu
@@ -9,7 +9,7 @@
 #define GPUD_CALL
 #endif
 
-GPUD_API __global__ void gpud_freq_shift_kernel(
+__global__ void gpud_freq_shift_kernel(
     const float2* __restrict__ in,
     float2* __restrict__ out,
     int n,
@@ -42,7 +42,7 @@ GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
     return (int)cudaGetLastError();
 }
 
-GPUD_API __global__ void gpud_fm_discrim_kernel(
+__global__ void gpud_fm_discrim_kernel(
     const float2* __restrict__ in,
     float* __restrict__ out,
     int n
@@ -69,7 +69,7 @@ GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
     return (int)cudaGetLastError();
 }
 
-GPUD_API __global__ void gpud_decimate_kernel(
+__global__ void gpud_decimate_kernel(
     const float2* __restrict__ in,
     float2* __restrict__ out,
     int n_out,
@@ -88,20 +88,34 @@ __global__ void gpud_fir_kernel(
     int n,
     int num_taps
 ) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= n) return;
+    extern __shared__ float2 s_data[];
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    int lid = threadIdx.x;
+    int halo = num_taps - 1;
+
+    if (gid < n) {
+        s_data[lid + halo] = in[gid];
+    } else {
+        s_data[lid + halo] = make_float2(0.0f, 0.0f);
+    }
+
+    if (lid < halo) {
+        int src = gid - halo;
+        s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
+    }
+    __syncthreads();
+
+    if (gid >= n) return;
 
     float acc_r = 0.0f;
     float acc_i = 0.0f;
     for (int k = 0; k < num_taps; ++k) {
-        int src = idx - k;
-        if (src < 0) break;
-        float2 v = in[src];
+        float2 v = s_data[lid + halo - k];
         float t = gpud_fir_taps[k];
         acc_r += v.x * t;
         acc_i += v.y * t;
     }
-    out[idx] = make_float2(acc_r, acc_i);
+    out[gid] = make_float2(acc_r, acc_i);
 }
 
 GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
@@ -119,7 +133,8 @@ GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
     if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
     const int block = 256;
     const int grid = (n + block - 1) / block;
-    gpud_fir_kernel<<<grid, block>>>(in, out, n, num_taps);
+    size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
+    gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
     return (int)cudaGetLastError();
 }
 
@@ -136,7 +151,7 @@ GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
     return (int)cudaGetLastError();
 }
 
-GPUD_API __global__ void gpud_am_envelope_kernel(
+__global__ void gpud_am_envelope_kernel(
     const float2* __restrict__ in,
     float* __restrict__ out,
     int n
@@ -159,7 +174,7 @@ GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
     return (int)cudaGetLastError();
 }
 
-GPUD_API __global__ void gpud_ssb_product_kernel(
+__global__ void gpud_ssb_product_kernel(
     const float2* __restrict__ in,
     float* __restrict__ out,
     int n,