You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

286 lines
7.4KB

  1. #include <cuda_runtime.h>
  2. #include <math.h>
  3. #if defined(_WIN32)
  4. #define GPUD_API extern "C" __declspec(dllexport)
  5. #define GPUD_CALL __stdcall
  6. #else
  7. #define GPUD_API extern "C"
  8. #define GPUD_CALL
  9. #endif
  10. typedef void* gpud_stream_handle;
  11. GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) {
  12. if (!out) return -1;
  13. cudaStream_t stream;
  14. cudaError_t err = cudaStreamCreate(&stream);
  15. if (err != cudaSuccess) return (int)err;
  16. *out = (gpud_stream_handle)stream;
  17. return 0;
  18. }
  19. GPUD_API int GPUD_CALL gpud_stream_destroy(gpud_stream_handle stream) {
  20. if (!stream) return 0;
  21. return (int)cudaStreamDestroy((cudaStream_t)stream);
  22. }
  23. GPUD_API int GPUD_CALL gpud_stream_sync(gpud_stream_handle stream) {
  24. if (!stream) return (int)cudaDeviceSynchronize();
  25. return (int)cudaStreamSynchronize((cudaStream_t)stream);
  26. }
  27. __global__ void gpud_freq_shift_kernel(
  28. const float2* __restrict__ in,
  29. float2* __restrict__ out,
  30. int n,
  31. double phase_inc,
  32. double phase_start
  33. ) {
  34. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  35. if (idx >= n) return;
  36. double phase = phase_start + phase_inc * (double)idx;
  37. float si, co;
  38. sincosf((float)phase, &si, &co);
  39. float2 v = in[idx];
  40. out[idx].x = v.x * co - v.y * si;
  41. out[idx].y = v.x * si + v.y * co;
  42. }
  43. GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda(
  44. const float2* in,
  45. float2* out,
  46. int n,
  47. double phase_inc,
  48. double phase_start,
  49. gpud_stream_handle stream
  50. ) {
  51. if (n <= 0) return 0;
  52. const int block = 256;
  53. const int grid = (n + block - 1) / block;
  54. gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start);
  55. return (int)cudaGetLastError();
  56. }
  57. GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
  58. const float2* in,
  59. float2* out,
  60. int n,
  61. double phase_inc,
  62. double phase_start
  63. ) {
  64. return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0);
  65. }
  66. __global__ void gpud_fm_discrim_kernel(
  67. const float2* __restrict__ in,
  68. float* __restrict__ out,
  69. int n
  70. ) {
  71. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  72. if (idx >= n - 1) return;
  73. float2 prev = in[idx];
  74. float2 curr = in[idx + 1];
  75. float re = prev.x * curr.x + prev.y * curr.y;
  76. float im = prev.x * curr.y - prev.y * curr.x;
  77. out[idx] = atan2f(im, re);
  78. }
  79. GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
  80. const float2* in,
  81. float* out,
  82. int n
  83. ) {
  84. if (n <= 1) return 0;
  85. const int block = 256;
  86. const int grid = (n + block - 1) / block;
  87. gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n);
  88. return (int)cudaGetLastError();
  89. }
  90. __global__ void gpud_decimate_kernel(
  91. const float2* __restrict__ in,
  92. float2* __restrict__ out,
  93. int n_out,
  94. int factor
  95. ) {
  96. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  97. if (idx >= n_out) return;
  98. out[idx] = in[idx * factor];
  99. }
  100. __device__ __constant__ float gpud_fir_taps[256];
  101. __global__ void gpud_fir_kernel(
  102. const float2* __restrict__ in,
  103. float2* __restrict__ out,
  104. int n,
  105. int num_taps
  106. ) {
  107. extern __shared__ float2 s_data[];
  108. int gid = blockIdx.x * blockDim.x + threadIdx.x;
  109. int lid = threadIdx.x;
  110. int halo = num_taps - 1;
  111. if (gid < n) {
  112. s_data[lid + halo] = in[gid];
  113. } else {
  114. s_data[lid + halo] = make_float2(0.0f, 0.0f);
  115. }
  116. if (lid < halo) {
  117. int src = gid - halo;
  118. s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
  119. }
  120. __syncthreads();
  121. if (gid >= n) return;
  122. float acc_r = 0.0f;
  123. float acc_i = 0.0f;
  124. for (int k = 0; k < num_taps; ++k) {
  125. float2 v = s_data[lid + halo - k];
  126. float t = gpud_fir_taps[k];
  127. acc_r += v.x * t;
  128. acc_i += v.y * t;
  129. }
  130. out[gid] = make_float2(acc_r, acc_i);
  131. }
  132. GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
  133. if (!taps || n <= 0 || n > 256) return -1;
  134. cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float));
  135. return (int)err;
  136. }
  137. GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
  138. const float2* in,
  139. float2* out,
  140. int n,
  141. int num_taps
  142. ) {
  143. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  144. const int block = 256;
  145. const int grid = (n + block - 1) / block;
  146. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  147. gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
  148. return (int)cudaGetLastError();
  149. }
  150. __global__ void gpud_fir_kernel_v2(
  151. const float2* __restrict__ in,
  152. float2* __restrict__ out,
  153. const float* __restrict__ taps,
  154. int n,
  155. int num_taps
  156. ) {
  157. extern __shared__ float2 s_data[];
  158. int gid = blockIdx.x * blockDim.x + threadIdx.x;
  159. int lid = threadIdx.x;
  160. int halo = num_taps - 1;
  161. if (gid < n) s_data[lid + halo] = in[gid];
  162. else s_data[lid + halo] = make_float2(0.0f, 0.0f);
  163. if (lid < halo) {
  164. int src = gid - halo;
  165. s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
  166. }
  167. __syncthreads();
  168. if (gid >= n) return;
  169. float acc_r = 0.0f, acc_i = 0.0f;
  170. for (int k = 0; k < num_taps; ++k) {
  171. float2 v = s_data[lid + halo - k];
  172. float t = taps[k];
  173. acc_r += v.x * t;
  174. acc_i += v.y * t;
  175. }
  176. out[gid] = make_float2(acc_r, acc_i);
  177. }
  178. GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda(
  179. const float2* in,
  180. float2* out,
  181. const float* taps,
  182. int n,
  183. int num_taps,
  184. gpud_stream_handle stream
  185. ) {
  186. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  187. const int block = 256;
  188. const int grid = (n + block - 1) / block;
  189. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  190. gpud_fir_kernel_v2<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, taps, n, num_taps);
  191. return (int)cudaGetLastError();
  192. }
  193. GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
  194. const float2* in,
  195. float2* out,
  196. int n_out,
  197. int factor
  198. ) {
  199. if (n_out <= 0 || factor <= 0) return 0;
  200. const int block = 256;
  201. const int grid = (n_out + block - 1) / block;
  202. gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor);
  203. return (int)cudaGetLastError();
  204. }
  205. __global__ void gpud_am_envelope_kernel(
  206. const float2* __restrict__ in,
  207. float* __restrict__ out,
  208. int n
  209. ) {
  210. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  211. if (idx >= n) return;
  212. float2 v = in[idx];
  213. out[idx] = sqrtf(v.x * v.x + v.y * v.y);
  214. }
  215. GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
  216. const float2* in,
  217. float* out,
  218. int n
  219. ) {
  220. if (n <= 0) return 0;
  221. const int block = 256;
  222. const int grid = (n + block - 1) / block;
  223. gpud_am_envelope_kernel<<<grid, block>>>(in, out, n);
  224. return (int)cudaGetLastError();
  225. }
  226. __global__ void gpud_ssb_product_kernel(
  227. const float2* __restrict__ in,
  228. float* __restrict__ out,
  229. int n,
  230. double phase_inc,
  231. double phase_start
  232. ) {
  233. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  234. if (idx >= n) return;
  235. double phase = phase_start + phase_inc * (double)idx;
  236. float si, co;
  237. sincosf((float)phase, &si, &co);
  238. float2 v = in[idx];
  239. out[idx] = v.x * co - v.y * si;
  240. }
  241. GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda(
  242. const float2* in,
  243. float* out,
  244. int n,
  245. double phase_inc,
  246. double phase_start
  247. ) {
  248. if (n <= 0) return 0;
  249. const int block = 256;
  250. const int grid = (n + block - 1) / block;
  251. gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
  252. return (int)cudaGetLastError();
  253. }