| @@ -9,7 +9,7 @@ | |||||
| #define GPUD_CALL | #define GPUD_CALL | ||||
| #endif | #endif | ||||
| GPUD_API __global__ void gpud_freq_shift_kernel( | |||||
| __global__ void gpud_freq_shift_kernel( | |||||
| const float2* __restrict__ in, | const float2* __restrict__ in, | ||||
| float2* __restrict__ out, | float2* __restrict__ out, | ||||
| int n, | int n, | ||||
| @@ -42,7 +42,7 @@ GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda( | |||||
| return (int)cudaGetLastError(); | return (int)cudaGetLastError(); | ||||
| } | } | ||||
| GPUD_API __global__ void gpud_fm_discrim_kernel( | |||||
| __global__ void gpud_fm_discrim_kernel( | |||||
| const float2* __restrict__ in, | const float2* __restrict__ in, | ||||
| float* __restrict__ out, | float* __restrict__ out, | ||||
| int n | int n | ||||
| @@ -69,7 +69,7 @@ GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda( | |||||
| return (int)cudaGetLastError(); | return (int)cudaGetLastError(); | ||||
| } | } | ||||
| GPUD_API __global__ void gpud_decimate_kernel( | |||||
| __global__ void gpud_decimate_kernel( | |||||
| const float2* __restrict__ in, | const float2* __restrict__ in, | ||||
| float2* __restrict__ out, | float2* __restrict__ out, | ||||
| int n_out, | int n_out, | ||||
| @@ -88,20 +88,34 @@ __global__ void gpud_fir_kernel( | |||||
| int n, | int n, | ||||
| int num_taps | int num_taps | ||||
| ) { | ) { | ||||
| int idx = blockIdx.x * blockDim.x + threadIdx.x; | |||||
| if (idx >= n) return; | |||||
| extern __shared__ float2 s_data[]; | |||||
| int gid = blockIdx.x * blockDim.x + threadIdx.x; | |||||
| int lid = threadIdx.x; | |||||
| int halo = num_taps - 1; | |||||
| if (gid < n) { | |||||
| s_data[lid + halo] = in[gid]; | |||||
| } else { | |||||
| s_data[lid + halo] = make_float2(0.0f, 0.0f); | |||||
| } | |||||
| if (lid < halo) { | |||||
| int src = gid - halo; | |||||
| s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f); | |||||
| } | |||||
| __syncthreads(); | |||||
| if (gid >= n) return; | |||||
| float acc_r = 0.0f; | float acc_r = 0.0f; | ||||
| float acc_i = 0.0f; | float acc_i = 0.0f; | ||||
| for (int k = 0; k < num_taps; ++k) { | for (int k = 0; k < num_taps; ++k) { | ||||
| int src = idx - k; | |||||
| if (src < 0) break; | |||||
| float2 v = in[src]; | |||||
| float2 v = s_data[lid + halo - k]; | |||||
| float t = gpud_fir_taps[k]; | float t = gpud_fir_taps[k]; | ||||
| acc_r += v.x * t; | acc_r += v.x * t; | ||||
| acc_i += v.y * t; | acc_i += v.y * t; | ||||
| } | } | ||||
| out[idx] = make_float2(acc_r, acc_i); | |||||
| out[gid] = make_float2(acc_r, acc_i); | |||||
| } | } | ||||
| GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) { | GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) { | ||||
| @@ -119,7 +133,8 @@ GPUD_API int GPUD_CALL gpud_launch_fir_cuda( | |||||
| if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; | if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; | ||||
| const int block = 256; | const int block = 256; | ||||
| const int grid = (n + block - 1) / block; | const int grid = (n + block - 1) / block; | ||||
| gpud_fir_kernel<<<grid, block>>>(in, out, n, num_taps); | |||||
| size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2); | |||||
| gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps); | |||||
| return (int)cudaGetLastError(); | return (int)cudaGetLastError(); | ||||
| } | } | ||||
| @@ -136,7 +151,7 @@ GPUD_API int GPUD_CALL gpud_launch_decimate_cuda( | |||||
| return (int)cudaGetLastError(); | return (int)cudaGetLastError(); | ||||
| } | } | ||||
| GPUD_API __global__ void gpud_am_envelope_kernel( | |||||
| __global__ void gpud_am_envelope_kernel( | |||||
| const float2* __restrict__ in, | const float2* __restrict__ in, | ||||
| float* __restrict__ out, | float* __restrict__ out, | ||||
| int n | int n | ||||
| @@ -159,7 +174,7 @@ GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda( | |||||
| return (int)cudaGetLastError(); | return (int)cudaGetLastError(); | ||||
| } | } | ||||
| GPUD_API __global__ void gpud_ssb_product_kernel( | |||||
| __global__ void gpud_ssb_product_kernel( | |||||
| const float2* __restrict__ in, | const float2* __restrict__ in, | ||||
| float* __restrict__ out, | float* __restrict__ out, | ||||
| int n, | int n, | ||||