No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

238 líneas
6.1KB

  1. #include <cuda_runtime.h>
  2. #include <math.h>
  3. #if defined(_WIN32)
  4. #define GPUD_API extern "C" __declspec(dllexport)
  5. #define GPUD_CALL __stdcall
  6. #else
  7. #define GPUD_API extern "C"
  8. #define GPUD_CALL
  9. #endif
  10. typedef void* gpud_stream_handle;
  11. GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) {
  12. if (!out) return -1;
  13. cudaStream_t stream;
  14. cudaError_t err = cudaStreamCreate(&stream);
  15. if (err != cudaSuccess) return (int)err;
  16. *out = (gpud_stream_handle)stream;
  17. return 0;
  18. }
  19. GPUD_API int GPUD_CALL gpud_stream_destroy(gpud_stream_handle stream) {
  20. if (!stream) return 0;
  21. return (int)cudaStreamDestroy((cudaStream_t)stream);
  22. }
  23. GPUD_API int GPUD_CALL gpud_stream_sync(gpud_stream_handle stream) {
  24. if (!stream) return (int)cudaDeviceSynchronize();
  25. return (int)cudaStreamSynchronize((cudaStream_t)stream);
  26. }
  27. __global__ void gpud_freq_shift_kernel(
  28. const float2* __restrict__ in,
  29. float2* __restrict__ out,
  30. int n,
  31. double phase_inc,
  32. double phase_start
  33. ) {
  34. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  35. if (idx >= n) return;
  36. double phase = phase_start + phase_inc * (double)idx;
  37. float si, co;
  38. sincosf((float)phase, &si, &co);
  39. float2 v = in[idx];
  40. out[idx].x = v.x * co - v.y * si;
  41. out[idx].y = v.x * si + v.y * co;
  42. }
  43. GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda(
  44. const float2* in,
  45. float2* out,
  46. int n,
  47. double phase_inc,
  48. double phase_start,
  49. gpud_stream_handle stream
  50. ) {
  51. if (n <= 0) return 0;
  52. const int block = 256;
  53. const int grid = (n + block - 1) / block;
  54. gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start);
  55. return (int)cudaGetLastError();
  56. }
  57. GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
  58. const float2* in,
  59. float2* out,
  60. int n,
  61. double phase_inc,
  62. double phase_start
  63. ) {
  64. return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0);
  65. }
  66. __global__ void gpud_fm_discrim_kernel(
  67. const float2* __restrict__ in,
  68. float* __restrict__ out,
  69. int n
  70. ) {
  71. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  72. if (idx >= n - 1) return;
  73. float2 prev = in[idx];
  74. float2 curr = in[idx + 1];
  75. float re = prev.x * curr.x + prev.y * curr.y;
  76. float im = prev.x * curr.y - prev.y * curr.x;
  77. out[idx] = atan2f(im, re);
  78. }
  79. GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
  80. const float2* in,
  81. float* out,
  82. int n
  83. ) {
  84. if (n <= 1) return 0;
  85. const int block = 256;
  86. const int grid = (n + block - 1) / block;
  87. gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n);
  88. return (int)cudaGetLastError();
  89. }
  90. __global__ void gpud_decimate_kernel(
  91. const float2* __restrict__ in,
  92. float2* __restrict__ out,
  93. int n_out,
  94. int factor
  95. ) {
  96. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  97. if (idx >= n_out) return;
  98. out[idx] = in[idx * factor];
  99. }
  100. __device__ __constant__ float gpud_fir_taps[256];
  101. __global__ void gpud_fir_kernel(
  102. const float2* __restrict__ in,
  103. float2* __restrict__ out,
  104. int n,
  105. int num_taps
  106. ) {
  107. extern __shared__ float2 s_data[];
  108. int gid = blockIdx.x * blockDim.x + threadIdx.x;
  109. int lid = threadIdx.x;
  110. int halo = num_taps - 1;
  111. if (gid < n) {
  112. s_data[lid + halo] = in[gid];
  113. } else {
  114. s_data[lid + halo] = make_float2(0.0f, 0.0f);
  115. }
  116. if (lid < halo) {
  117. int src = gid - halo;
  118. s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
  119. }
  120. __syncthreads();
  121. if (gid >= n) return;
  122. float acc_r = 0.0f;
  123. float acc_i = 0.0f;
  124. for (int k = 0; k < num_taps; ++k) {
  125. float2 v = s_data[lid + halo - k];
  126. float t = gpud_fir_taps[k];
  127. acc_r += v.x * t;
  128. acc_i += v.y * t;
  129. }
  130. out[gid] = make_float2(acc_r, acc_i);
  131. }
  132. GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
  133. if (!taps || n <= 0 || n > 256) return -1;
  134. cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float));
  135. return (int)err;
  136. }
  137. GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
  138. const float2* in,
  139. float2* out,
  140. int n,
  141. int num_taps
  142. ) {
  143. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  144. const int block = 256;
  145. const int grid = (n + block - 1) / block;
  146. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  147. gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
  148. return (int)cudaGetLastError();
  149. }
  150. GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
  151. const float2* in,
  152. float2* out,
  153. int n_out,
  154. int factor
  155. ) {
  156. if (n_out <= 0 || factor <= 0) return 0;
  157. const int block = 256;
  158. const int grid = (n_out + block - 1) / block;
  159. gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor);
  160. return (int)cudaGetLastError();
  161. }
  162. __global__ void gpud_am_envelope_kernel(
  163. const float2* __restrict__ in,
  164. float* __restrict__ out,
  165. int n
  166. ) {
  167. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  168. if (idx >= n) return;
  169. float2 v = in[idx];
  170. out[idx] = sqrtf(v.x * v.x + v.y * v.y);
  171. }
  172. GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
  173. const float2* in,
  174. float* out,
  175. int n
  176. ) {
  177. if (n <= 0) return 0;
  178. const int block = 256;
  179. const int grid = (n + block - 1) / block;
  180. gpud_am_envelope_kernel<<<grid, block>>>(in, out, n);
  181. return (int)cudaGetLastError();
  182. }
  183. __global__ void gpud_ssb_product_kernel(
  184. const float2* __restrict__ in,
  185. float* __restrict__ out,
  186. int n,
  187. double phase_inc,
  188. double phase_start
  189. ) {
  190. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  191. if (idx >= n) return;
  192. double phase = phase_start + phase_inc * (double)idx;
  193. float si, co;
  194. sincosf((float)phase, &si, &co);
  195. float2 v = in[idx];
  196. out[idx] = v.x * co - v.y * si;
  197. }
  198. GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda(
  199. const float2* in,
  200. float* out,
  201. int n,
  202. double phase_inc,
  203. double phase_start
  204. ) {
  205. if (n <= 0) return 0;
  206. const int block = 256;
  207. const int grid = (n + block - 1) / block;
  208. gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
  209. return (int)cudaGetLastError();
  210. }