diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu index 64269a8..2114b65 100644 --- a/internal/demod/gpudemod/native/exports.cu +++ b/internal/demod/gpudemod/native/exports.cu @@ -9,7 +9,7 @@ #define GPUD_CALL #endif -GPUD_API __global__ void gpud_freq_shift_kernel( +__global__ void gpud_freq_shift_kernel( const float2* __restrict__ in, float2* __restrict__ out, int n, @@ -42,7 +42,7 @@ GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda( return (int)cudaGetLastError(); } -GPUD_API __global__ void gpud_fm_discrim_kernel( +__global__ void gpud_fm_discrim_kernel( const float2* __restrict__ in, float* __restrict__ out, int n @@ -69,7 +69,7 @@ GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda( return (int)cudaGetLastError(); } -GPUD_API __global__ void gpud_decimate_kernel( +__global__ void gpud_decimate_kernel( const float2* __restrict__ in, float2* __restrict__ out, int n_out, @@ -88,20 +88,34 @@ __global__ void gpud_fir_kernel( int n, int num_taps ) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= n) return; + extern __shared__ float2 s_data[]; + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int lid = threadIdx.x; + int halo = num_taps - 1; + + if (gid < n) { + s_data[lid + halo] = in[gid]; + } else { + s_data[lid + halo] = make_float2(0.0f, 0.0f); + } + + if (lid < halo) { + int src = gid - halo; + s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f); + } + __syncthreads(); + + if (gid >= n) return; float acc_r = 0.0f; float acc_i = 0.0f; for (int k = 0; k < num_taps; ++k) { - int src = idx - k; - if (src < 0) break; - float2 v = in[src]; + float2 v = s_data[lid + halo - k]; float t = gpud_fir_taps[k]; acc_r += v.x * t; acc_i += v.y * t; } - out[idx] = make_float2(acc_r, acc_i); + out[gid] = make_float2(acc_r, acc_i); } GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) { @@ -119,7 +133,8 @@ GPUD_API int GPUD_CALL gpud_launch_fir_cuda( if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; const int block = 256; const int grid = (n + block - 1) / block; - gpud_fir_kernel<<>>(in, out, n, num_taps); + size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2); + gpud_fir_kernel<<>>(in, out, n, num_taps); return (int)cudaGetLastError(); } @@ -136,7 +151,7 @@ GPUD_API int GPUD_CALL gpud_launch_decimate_cuda( return (int)cudaGetLastError(); } -GPUD_API __global__ void gpud_am_envelope_kernel( +__global__ void gpud_am_envelope_kernel( const float2* __restrict__ in, float* __restrict__ out, int n @@ -159,7 +174,7 @@ GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda( return (int)cudaGetLastError(); } -GPUD_API __global__ void gpud_ssb_product_kernel( +__global__ void gpud_ssb_product_kernel( const float2* __restrict__ in, float* __restrict__ out, int n,