Wideband autonomous SDR analysis engine forked from sdr-visual-suite
You cannot select more than 25 topics. Topic names must start with a letter or a number, may contain dashes ('-'), and can be up to 35 characters long.

323 lines
8.8KB

  1. #include <cuda_runtime.h>
  2. #include <math.h>
  3. #if defined(_WIN32)
  4. #define GPUD_API extern "C" __declspec(dllexport)
  5. #define GPUD_CALL __stdcall
  6. #else
  7. #define GPUD_API extern "C"
  8. #define GPUD_CALL
  9. #endif
  10. typedef void* gpud_stream_handle;
  11. GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) {
  12. if (!out) return -1;
  13. cudaStream_t stream;
  14. cudaError_t err = cudaStreamCreate(&stream);
  15. if (err != cudaSuccess) return (int)err;
  16. *out = (gpud_stream_handle)stream;
  17. return 0;
  18. }
  19. GPUD_API int GPUD_CALL gpud_stream_destroy(gpud_stream_handle stream) {
  20. if (!stream) return 0;
  21. return (int)cudaStreamDestroy((cudaStream_t)stream);
  22. }
  23. GPUD_API int GPUD_CALL gpud_stream_sync(gpud_stream_handle stream) {
  24. if (!stream) return (int)cudaDeviceSynchronize();
  25. return (int)cudaStreamSynchronize((cudaStream_t)stream);
  26. }
  27. __global__ void gpud_freq_shift_kernel(
  28. const float2* __restrict__ in,
  29. float2* __restrict__ out,
  30. int n,
  31. double phase_inc,
  32. double phase_start
  33. ) {
  34. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  35. if (idx >= n) return;
  36. double phase = phase_start + phase_inc * (double)idx;
  37. // Reduce phase to [-pi, pi) BEFORE float cast to preserve precision.
  38. // Without this, phase accumulates to millions of radians and the
  39. // (float) cast loses ~0.03-0.1 rad, causing audible clicks at
  40. // frame boundaries in streaming audio.
  41. const double TWO_PI = 6.283185307179586;
  42. phase = phase - rint(phase / TWO_PI) * TWO_PI;
  43. float si, co;
  44. sincosf((float)phase, &si, &co);
  45. float2 v = in[idx];
  46. out[idx].x = v.x * co - v.y * si;
  47. out[idx].y = v.x * si + v.y * co;
  48. }
  49. GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda(
  50. const float2* in,
  51. float2* out,
  52. int n,
  53. double phase_inc,
  54. double phase_start,
  55. gpud_stream_handle stream
  56. ) {
  57. if (n <= 0) return 0;
  58. const int block = 256;
  59. const int grid = (n + block - 1) / block;
  60. gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start);
  61. return (int)cudaGetLastError();
  62. }
  63. GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
  64. const float2* in,
  65. float2* out,
  66. int n,
  67. double phase_inc,
  68. double phase_start
  69. ) {
  70. return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0);
  71. }
  72. __global__ void gpud_fm_discrim_kernel(
  73. const float2* __restrict__ in,
  74. float* __restrict__ out,
  75. int n
  76. ) {
  77. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  78. if (idx >= n - 1) return;
  79. float2 prev = in[idx];
  80. float2 curr = in[idx + 1];
  81. float re = prev.x * curr.x + prev.y * curr.y;
  82. float im = prev.x * curr.y - prev.y * curr.x;
  83. out[idx] = atan2f(im, re);
  84. }
  85. GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
  86. const float2* in,
  87. float* out,
  88. int n
  89. ) {
  90. if (n <= 1) return 0;
  91. const int block = 256;
  92. const int grid = (n + block - 1) / block;
  93. gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n);
  94. return (int)cudaGetLastError();
  95. }
  96. __global__ void gpud_decimate_kernel(
  97. const float2* __restrict__ in,
  98. float2* __restrict__ out,
  99. int n_out,
  100. int factor
  101. ) {
  102. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  103. if (idx >= n_out) return;
  104. out[idx] = in[idx * factor];
  105. }
// FIR coefficient table in constant memory (set from the host via
// gpud_upload_fir_taps_cuda / cudaMemcpyToSymbol). Constant memory
// broadcasts when all lanes read the same address, which is exactly the
// access pattern in the tap loop below. Capacity caps num_taps at 256.
__device__ __constant__ float gpud_fir_taps[256];
// Direct-form complex FIR: out[g] = sum_k taps[k] * in[g - k], with the
// input zero-padded on the left edge (g - k < 0).
// Launch requirements:
//   - 1D grid covering n, dynamic shared memory of
//     (blockDim.x + num_taps - 1) * sizeof(float2) bytes;
//   - num_taps - 1 < blockDim.x, so the halo load by threads lid < halo
//     covers the whole halo (launchers use block = 256 and num_taps <= 256).
__global__ void gpud_fir_kernel(
const float2* __restrict__ in,
float2* __restrict__ out,
int n,
int num_taps
) {
// Shared staging tile: [0, halo) holds samples preceding this block's
// range, [halo, halo + blockDim.x) holds the block's own samples.
extern __shared__ float2 s_data[];
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int lid = threadIdx.x;
int halo = num_taps - 1;
// Each thread stages its own sample; threads past the end of the input
// store zeros so the tail block's reads stay well-defined.
if (gid < n) {
s_data[lid + halo] = in[gid];
} else {
s_data[lid + halo] = make_float2(0.0f, 0.0f);
}
// First `halo` threads also fetch the samples just before this block's
// tile, zero-padding before the start of the buffer (src < 0).
if (lid < halo) {
int src = gid - halo;
s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
}
// Barrier sits outside any divergent branch — every thread reaches it.
__syncthreads();
if (gid >= n) return;
float acc_r = 0.0f;
float acc_i = 0.0f;
// Real taps applied to complex samples: filter I and Q independently.
for (int k = 0; k < num_taps; ++k) {
float2 v = s_data[lid + halo - k];
float t = gpud_fir_taps[k];
acc_r += v.x * t;
acc_i += v.y * t;
}
out[gid] = make_float2(acc_r, acc_i);
}
  138. GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
  139. if (!taps || n <= 0 || n > 256) return -1;
  140. cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float));
  141. return (int)err;
  142. }
  143. GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
  144. const float2* in,
  145. float2* out,
  146. int n,
  147. int num_taps
  148. ) {
  149. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  150. const int block = 256;
  151. const int grid = (n + block - 1) / block;
  152. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  153. gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
  154. return (int)cudaGetLastError();
  155. }
  156. GPUD_API int GPUD_CALL gpud_launch_fir_stream_cuda(
  157. const float2* in,
  158. float2* out,
  159. int n,
  160. int num_taps,
  161. gpud_stream_handle stream
  162. ) {
  163. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  164. const int block = 256;
  165. const int grid = (n + block - 1) / block;
  166. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  167. gpud_fir_kernel<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, n, num_taps);
  168. return (int)cudaGetLastError();
  169. }
  170. __global__ void gpud_fir_kernel_v2(
  171. const float2* __restrict__ in,
  172. float2* __restrict__ out,
  173. const float* __restrict__ taps,
  174. int n,
  175. int num_taps
  176. ) {
  177. extern __shared__ float2 s_data[];
  178. int gid = blockIdx.x * blockDim.x + threadIdx.x;
  179. int lid = threadIdx.x;
  180. int halo = num_taps - 1;
  181. if (gid < n) s_data[lid + halo] = in[gid];
  182. else s_data[lid + halo] = make_float2(0.0f, 0.0f);
  183. if (lid < halo) {
  184. int src = gid - halo;
  185. s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
  186. }
  187. __syncthreads();
  188. if (gid >= n) return;
  189. float acc_r = 0.0f, acc_i = 0.0f;
  190. for (int k = 0; k < num_taps; ++k) {
  191. float2 v = s_data[lid + halo - k];
  192. float t = taps[k];
  193. acc_r += v.x * t;
  194. acc_i += v.y * t;
  195. }
  196. out[gid] = make_float2(acc_r, acc_i);
  197. }
  198. GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda(
  199. const float2* in,
  200. float2* out,
  201. const float* taps,
  202. int n,
  203. int num_taps,
  204. gpud_stream_handle stream
  205. ) {
  206. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  207. const int block = 256;
  208. const int grid = (n + block - 1) / block;
  209. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  210. gpud_fir_kernel_v2<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, taps, n, num_taps);
  211. return (int)cudaGetLastError();
  212. }
  213. GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
  214. const float2* in,
  215. float2* out,
  216. int n_out,
  217. int factor
  218. ) {
  219. if (n_out <= 0 || factor <= 0) return 0;
  220. const int block = 256;
  221. const int grid = (n_out + block - 1) / block;
  222. gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor);
  223. return (int)cudaGetLastError();
  224. }
  225. GPUD_API int GPUD_CALL gpud_launch_decimate_stream_cuda(
  226. const float2* in,
  227. float2* out,
  228. int n_out,
  229. int factor,
  230. gpud_stream_handle stream
  231. ) {
  232. if (n_out <= 0 || factor <= 0) return 0;
  233. const int block = 256;
  234. const int grid = (n_out + block - 1) / block;
  235. gpud_decimate_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n_out, factor);
  236. return (int)cudaGetLastError();
  237. }
  238. __global__ void gpud_am_envelope_kernel(
  239. const float2* __restrict__ in,
  240. float* __restrict__ out,
  241. int n
  242. ) {
  243. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  244. if (idx >= n) return;
  245. float2 v = in[idx];
  246. out[idx] = sqrtf(v.x * v.x + v.y * v.y);
  247. }
  248. GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
  249. const float2* in,
  250. float* out,
  251. int n
  252. ) {
  253. if (n <= 0) return 0;
  254. const int block = 256;
  255. const int grid = (n + block - 1) / block;
  256. gpud_am_envelope_kernel<<<grid, block>>>(in, out, n);
  257. return (int)cudaGetLastError();
  258. }
  259. __global__ void gpud_ssb_product_kernel(
  260. const float2* __restrict__ in,
  261. float* __restrict__ out,
  262. int n,
  263. double phase_inc,
  264. double phase_start
  265. ) {
  266. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  267. if (idx >= n) return;
  268. double phase = phase_start + phase_inc * (double)idx;
  269. const double TWO_PI = 6.283185307179586;
  270. phase = phase - rint(phase / TWO_PI) * TWO_PI;
  271. float si, co;
  272. sincosf((float)phase, &si, &co);
  273. float2 v = in[idx];
  274. out[idx] = v.x * co - v.y * si;
  275. }
  276. GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda(
  277. const float2* in,
  278. float* out,
  279. int n,
  280. double phase_inc,
  281. double phase_start
  282. ) {
  283. if (n <= 0) return 0;
  284. const int block = 256;
  285. const int grid = (n + block - 1) / block;
  286. gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
  287. return (int)cudaGetLastError();
  288. }