No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

315 líneas
8.3KB

  1. #include <cuda_runtime.h>
  2. #include <math.h>
  3. #if defined(_WIN32)
  4. #define GPUD_API extern "C" __declspec(dllexport)
  5. #define GPUD_CALL __stdcall
  6. #else
  7. #define GPUD_API extern "C"
  8. #define GPUD_CALL
  9. #endif
  10. typedef void* gpud_stream_handle;
  11. GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) {
  12. if (!out) return -1;
  13. cudaStream_t stream;
  14. cudaError_t err = cudaStreamCreate(&stream);
  15. if (err != cudaSuccess) return (int)err;
  16. *out = (gpud_stream_handle)stream;
  17. return 0;
  18. }
  19. GPUD_API int GPUD_CALL gpud_stream_destroy(gpud_stream_handle stream) {
  20. if (!stream) return 0;
  21. return (int)cudaStreamDestroy((cudaStream_t)stream);
  22. }
  23. GPUD_API int GPUD_CALL gpud_stream_sync(gpud_stream_handle stream) {
  24. if (!stream) return (int)cudaDeviceSynchronize();
  25. return (int)cudaStreamSynchronize((cudaStream_t)stream);
  26. }
  27. __global__ void gpud_freq_shift_kernel(
  28. const float2* __restrict__ in,
  29. float2* __restrict__ out,
  30. int n,
  31. double phase_inc,
  32. double phase_start
  33. ) {
  34. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  35. if (idx >= n) return;
  36. double phase = phase_start + phase_inc * (double)idx;
  37. float si, co;
  38. sincosf((float)phase, &si, &co);
  39. float2 v = in[idx];
  40. out[idx].x = v.x * co - v.y * si;
  41. out[idx].y = v.x * si + v.y * co;
  42. }
  43. GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda(
  44. const float2* in,
  45. float2* out,
  46. int n,
  47. double phase_inc,
  48. double phase_start,
  49. gpud_stream_handle stream
  50. ) {
  51. if (n <= 0) return 0;
  52. const int block = 256;
  53. const int grid = (n + block - 1) / block;
  54. gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start);
  55. return (int)cudaGetLastError();
  56. }
  57. GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
  58. const float2* in,
  59. float2* out,
  60. int n,
  61. double phase_inc,
  62. double phase_start
  63. ) {
  64. return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0);
  65. }
  66. __global__ void gpud_fm_discrim_kernel(
  67. const float2* __restrict__ in,
  68. float* __restrict__ out,
  69. int n
  70. ) {
  71. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  72. if (idx >= n - 1) return;
  73. float2 prev = in[idx];
  74. float2 curr = in[idx + 1];
  75. float re = prev.x * curr.x + prev.y * curr.y;
  76. float im = prev.x * curr.y - prev.y * curr.x;
  77. out[idx] = atan2f(im, re);
  78. }
  79. GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
  80. const float2* in,
  81. float* out,
  82. int n
  83. ) {
  84. if (n <= 1) return 0;
  85. const int block = 256;
  86. const int grid = (n + block - 1) / block;
  87. gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n);
  88. return (int)cudaGetLastError();
  89. }
  90. __global__ void gpud_decimate_kernel(
  91. const float2* __restrict__ in,
  92. float2* __restrict__ out,
  93. int n_out,
  94. int factor
  95. ) {
  96. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  97. if (idx >= n_out) return;
  98. out[idx] = in[idx * factor];
  99. }
// Filter coefficients shared by all gpud_fir_kernel launches; written
// from the host via gpud_upload_fir_taps_cuda (cudaMemcpyToSymbol).
// Capacity caps the supported FIR length at 256 taps.
__device__ __constant__ float gpud_fir_taps[256];
// Complex-input, real-tap FIR filter reading coefficients from the
// __constant__ array gpud_fir_taps (uploaded via gpud_upload_fir_taps_cuda).
//
// Dynamic shared-memory layout (sized by the launcher to
// (blockDim.x + num_taps - 1) * sizeof(float2)):
//   s_data[0 .. halo-1]                : left halo (samples before this tile)
//   s_data[halo .. halo+blockDim.x-1]  : this block's tile of samples
// Samples before the start of the input are treated as zero, so the first
// num_taps-1 outputs are computed against an implicit zero history.
//
// Precondition (enforced by the launchers): 1 <= num_taps <= 256 with
// blockDim.x == 256, so `lid < halo` (halo <= 255) fully covers the halo
// load with one thread per halo element. The halo read stays in bounds:
// src = blockIdx.x*blockDim.x + lid - halo < blockIdx.x*blockDim.x <= n-ish,
// and src < 0 is replaced by zero.
__global__ void gpud_fir_kernel(
const float2* __restrict__ in,
float2* __restrict__ out,
int n,
int num_taps
) {
extern __shared__ float2 s_data[];
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int lid = threadIdx.x;
int halo = num_taps - 1;
// Stage this thread's sample; out-of-range threads stage zero so the
// tail block's shared tile is fully initialized before the barrier.
if (gid < n) {
s_data[lid + halo] = in[gid];
} else {
s_data[lid + halo] = make_float2(0.0f, 0.0f);
}
// First `halo` threads also stage the left halo (zero before input start).
if (lid < halo) {
int src = gid - halo;
s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
}
// All threads reach this barrier (the guards above do not return early).
__syncthreads();
if (gid >= n) return;
float acc_r = 0.0f;
float acc_i = 0.0f;
// Direct-form convolution: tap k multiplies the sample k steps back.
for (int k = 0; k < num_taps; ++k) {
float2 v = s_data[lid + halo - k];
float t = gpud_fir_taps[k];
acc_r += v.x * t;
acc_i += v.y * t;
}
out[gid] = make_float2(acc_r, acc_i);
}
  132. GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
  133. if (!taps || n <= 0 || n > 256) return -1;
  134. cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float));
  135. return (int)err;
  136. }
  137. GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
  138. const float2* in,
  139. float2* out,
  140. int n,
  141. int num_taps
  142. ) {
  143. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  144. const int block = 256;
  145. const int grid = (n + block - 1) / block;
  146. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  147. gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
  148. return (int)cudaGetLastError();
  149. }
  150. GPUD_API int GPUD_CALL gpud_launch_fir_stream_cuda(
  151. const float2* in,
  152. float2* out,
  153. int n,
  154. int num_taps,
  155. gpud_stream_handle stream
  156. ) {
  157. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  158. const int block = 256;
  159. const int grid = (n + block - 1) / block;
  160. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  161. gpud_fir_kernel<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, n, num_taps);
  162. return (int)cudaGetLastError();
  163. }
// Variant of gpud_fir_kernel that reads coefficients from a device-memory
// array (`taps`) instead of __constant__ memory, so multiple filters can
// run concurrently without sharing the single constant tap bank.
//
// Same shared-memory layout and preconditions as gpud_fir_kernel:
// dynamic shared memory of (blockDim.x + num_taps - 1) float2s, halo
// (num_taps - 1) < blockDim.x, zero-padding before the input start.
__global__ void gpud_fir_kernel_v2(
const float2* __restrict__ in,
float2* __restrict__ out,
const float* __restrict__ taps,
int n,
int num_taps
) {
extern __shared__ float2 s_data[];
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int lid = threadIdx.x;
int halo = num_taps - 1;
// Stage this thread's sample (zero for out-of-range tail threads).
if (gid < n) s_data[lid + halo] = in[gid];
else s_data[lid + halo] = make_float2(0.0f, 0.0f);
// First `halo` threads also stage the left halo (zero before input start).
if (lid < halo) {
int src = gid - halo;
s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
}
// All threads reach the barrier; no early returns occur before it.
__syncthreads();
if (gid >= n) return;
float acc_r = 0.0f, acc_i = 0.0f;
// Direct-form convolution: tap k multiplies the sample k steps back.
for (int k = 0; k < num_taps; ++k) {
float2 v = s_data[lid + halo - k];
float t = taps[k];
acc_r += v.x * t;
acc_i += v.y * t;
}
out[gid] = make_float2(acc_r, acc_i);
}
  192. GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda(
  193. const float2* in,
  194. float2* out,
  195. const float* taps,
  196. int n,
  197. int num_taps,
  198. gpud_stream_handle stream
  199. ) {
  200. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  201. const int block = 256;
  202. const int grid = (n + block - 1) / block;
  203. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  204. gpud_fir_kernel_v2<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, taps, n, num_taps);
  205. return (int)cudaGetLastError();
  206. }
  207. GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
  208. const float2* in,
  209. float2* out,
  210. int n_out,
  211. int factor
  212. ) {
  213. if (n_out <= 0 || factor <= 0) return 0;
  214. const int block = 256;
  215. const int grid = (n_out + block - 1) / block;
  216. gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor);
  217. return (int)cudaGetLastError();
  218. }
  219. GPUD_API int GPUD_CALL gpud_launch_decimate_stream_cuda(
  220. const float2* in,
  221. float2* out,
  222. int n_out,
  223. int factor,
  224. gpud_stream_handle stream
  225. ) {
  226. if (n_out <= 0 || factor <= 0) return 0;
  227. const int block = 256;
  228. const int grid = (n_out + block - 1) / block;
  229. gpud_decimate_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n_out, factor);
  230. return (int)cudaGetLastError();
  231. }
  232. __global__ void gpud_am_envelope_kernel(
  233. const float2* __restrict__ in,
  234. float* __restrict__ out,
  235. int n
  236. ) {
  237. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  238. if (idx >= n) return;
  239. float2 v = in[idx];
  240. out[idx] = sqrtf(v.x * v.x + v.y * v.y);
  241. }
  242. GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
  243. const float2* in,
  244. float* out,
  245. int n
  246. ) {
  247. if (n <= 0) return 0;
  248. const int block = 256;
  249. const int grid = (n + block - 1) / block;
  250. gpud_am_envelope_kernel<<<grid, block>>>(in, out, n);
  251. return (int)cudaGetLastError();
  252. }
  253. __global__ void gpud_ssb_product_kernel(
  254. const float2* __restrict__ in,
  255. float* __restrict__ out,
  256. int n,
  257. double phase_inc,
  258. double phase_start
  259. ) {
  260. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  261. if (idx >= n) return;
  262. double phase = phase_start + phase_inc * (double)idx;
  263. float si, co;
  264. sincosf((float)phase, &si, &co);
  265. float2 v = in[idx];
  266. out[idx] = v.x * co - v.y * si;
  267. }
  268. GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda(
  269. const float2* in,
  270. float* out,
  271. int n,
  272. double phase_inc,
  273. double phase_start
  274. ) {
  275. if (n <= 0) return 0;
  276. const int block = 256;
  277. const int grid = (n + block - 1) / block;
  278. gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
  279. return (int)cudaGetLastError();
  280. }