Wideband autonomous SDR analysis engine forked from sdr-visual-suite
You can select up to 25 topics; each topic must start with a letter or number, may contain hyphens (-), and must be no longer than 35 characters.

452 lines, 13 KB

#include <cuda_runtime.h>
#include <math.h>
#include <stdlib.h>
// Export/calling-convention macros for the C ABI of this library.
// On Windows the entry points are exported from the DLL with __stdcall;
// elsewhere they are plain extern "C" symbols.
#if defined(_WIN32)
#define GPUD_API extern "C" __declspec(dllexport)
#define GPUD_CALL __stdcall
#else
#define GPUD_API extern "C"
#define GPUD_CALL
#endif
// Opaque handle wrapping a cudaStream_t so callers never see CUDA types.
typedef void* gpud_stream_handle;
  11. GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) {
  12. if (!out) return -1;
  13. cudaStream_t stream;
  14. cudaError_t err = cudaStreamCreate(&stream);
  15. if (err != cudaSuccess) return (int)err;
  16. *out = (gpud_stream_handle)stream;
  17. return 0;
  18. }
  19. GPUD_API int GPUD_CALL gpud_stream_destroy(gpud_stream_handle stream) {
  20. if (!stream) return 0;
  21. return (int)cudaStreamDestroy((cudaStream_t)stream);
  22. }
  23. GPUD_API int GPUD_CALL gpud_stream_sync(gpud_stream_handle stream) {
  24. if (!stream) return (int)cudaDeviceSynchronize();
  25. return (int)cudaStreamSynchronize((cudaStream_t)stream);
  26. }
  27. __global__ void gpud_freq_shift_kernel(
  28. const float2* __restrict__ in,
  29. float2* __restrict__ out,
  30. int n,
  31. double phase_inc,
  32. double phase_start
  33. ) {
  34. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  35. if (idx >= n) return;
  36. double phase = phase_start + phase_inc * (double)idx;
  37. // Reduce phase to [-pi, pi) BEFORE float cast to preserve precision.
  38. // Without this, phase accumulates to millions of radians and the
  39. // (float) cast loses ~0.03-0.1 rad, causing audible clicks at
  40. // frame boundaries in streaming audio.
  41. const double TWO_PI = 6.283185307179586;
  42. phase = phase - rint(phase / TWO_PI) * TWO_PI;
  43. float si, co;
  44. sincosf((float)phase, &si, &co);
  45. float2 v = in[idx];
  46. out[idx].x = v.x * co - v.y * si;
  47. out[idx].y = v.x * si + v.y * co;
  48. }
  49. GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda(
  50. const float2* in,
  51. float2* out,
  52. int n,
  53. double phase_inc,
  54. double phase_start,
  55. gpud_stream_handle stream
  56. ) {
  57. if (n <= 0) return 0;
  58. const int block = 256;
  59. const int grid = (n + block - 1) / block;
  60. gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start);
  61. return (int)cudaGetLastError();
  62. }
  63. GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
  64. const float2* in,
  65. float2* out,
  66. int n,
  67. double phase_inc,
  68. double phase_start
  69. ) {
  70. return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0);
  71. }
  72. __global__ void gpud_fm_discrim_kernel(
  73. const float2* __restrict__ in,
  74. float* __restrict__ out,
  75. int n
  76. ) {
  77. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  78. if (idx >= n - 1) return;
  79. float2 prev = in[idx];
  80. float2 curr = in[idx + 1];
  81. float re = prev.x * curr.x + prev.y * curr.y;
  82. float im = prev.x * curr.y - prev.y * curr.x;
  83. out[idx] = atan2f(im, re);
  84. }
  85. GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
  86. const float2* in,
  87. float* out,
  88. int n
  89. ) {
  90. if (n <= 1) return 0;
  91. const int block = 256;
  92. const int grid = (n + block - 1) / block;
  93. gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n);
  94. return (int)cudaGetLastError();
  95. }
  96. __global__ void gpud_decimate_kernel(
  97. const float2* __restrict__ in,
  98. float2* __restrict__ out,
  99. int n_out,
  100. int factor
  101. ) {
  102. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  103. if (idx >= n_out) return;
  104. out[idx] = in[idx * factor];
  105. }
// FIR tap bank in constant memory (capacity 256 taps); all lanes of a warp
// read the same tap index in gpud_fir_kernel, the broadcast-friendly case.
// Filled from the host by gpud_upload_fir_taps_cuda.
__device__ __constant__ float gpud_fir_taps[256];
// Causal complex FIR filter using the constant-memory taps:
//   out[gid] = sum_{k=0}^{num_taps-1} taps[k] * in[gid - k]
// with zero-padding before index 0 and past index n-1.
// Launch requirements (see gpud_launch_fir_cuda / _stream_cuda):
//   - dynamic shared memory of (blockDim.x + num_taps - 1) float2 slots
//   - blockDim.x >= num_taps - 1, so the halo staging below covers the
//     whole halo (launchers use block = 256 and cap num_taps at 256)
__global__ void gpud_fir_kernel(
    const float2* __restrict__ in,
    float2* __restrict__ out,
    int n,
    int num_taps
) {
    // Shared tile layout: [0, halo) = samples preceding this block,
    // [halo, halo + blockDim.x) = this block's own samples.
    extern __shared__ float2 s_data[];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int lid = threadIdx.x;
    int halo = num_taps - 1;
    // Stage this thread's sample; zero-pad past the end of the input.
    if (gid < n) {
    s_data[lid + halo] = in[gid];
    } else {
    s_data[lid + halo] = make_float2(0.0f, 0.0f);
    }
    // The first `halo` threads also stage the samples just before the
    // block's window (zero-padded below index 0). For the last block the
    // source index stays < n because the block's first sample is in range.
    if (lid < halo) {
    int src = gid - halo;
    s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
    }
    // Barrier is at block scope (outside any divergent early-return), so
    // every thread reaches it; staging is complete before any reads.
    __syncthreads();
    if (gid >= n) return;
    float acc_r = 0.0f;
    float acc_i = 0.0f;
    // Convolve: newest sample pairs with taps[0], oldest with the last tap.
    for (int k = 0; k < num_taps; ++k) {
    float2 v = s_data[lid + halo - k];
    float t = gpud_fir_taps[k];
    acc_r += v.x * t;
    acc_i += v.y * t;
    }
    out[gid] = make_float2(acc_r, acc_i);
}
  138. GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
  139. if (!taps || n <= 0 || n > 256) return -1;
  140. cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float));
  141. return (int)err;
  142. }
  143. GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
  144. const float2* in,
  145. float2* out,
  146. int n,
  147. int num_taps
  148. ) {
  149. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  150. const int block = 256;
  151. const int grid = (n + block - 1) / block;
  152. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  153. gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
  154. return (int)cudaGetLastError();
  155. }
  156. GPUD_API int GPUD_CALL gpud_launch_fir_stream_cuda(
  157. const float2* in,
  158. float2* out,
  159. int n,
  160. int num_taps,
  161. gpud_stream_handle stream
  162. ) {
  163. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  164. const int block = 256;
  165. const int grid = (n + block - 1) / block;
  166. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  167. gpud_fir_kernel<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, n, num_taps);
  168. return (int)cudaGetLastError();
  169. }
// Variant of gpud_fir_kernel that reads taps from a global-memory pointer
// instead of the shared constant bank, so multiple filters can run
// concurrently without re-uploading taps. Same contract:
//   out[gid] = sum_k taps[k] * in[gid - k], zero-padded at both ends.
// Launch requirements: dynamic shared memory of
// (blockDim.x + num_taps - 1) float2 slots and blockDim.x >= num_taps - 1
// (the launcher uses block = 256 and caps num_taps at 256).
__global__ void gpud_fir_kernel_v2(
    const float2* __restrict__ in,
    float2* __restrict__ out,
    const float* __restrict__ taps,
    int n,
    int num_taps
) {
    // Shared tile layout: [0, halo) = preceding samples, then the block's own.
    extern __shared__ float2 s_data[];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int lid = threadIdx.x;
    int halo = num_taps - 1;
    // Stage this thread's sample; zero-pad past the end of the input.
    if (gid < n) s_data[lid + halo] = in[gid];
    else s_data[lid + halo] = make_float2(0.0f, 0.0f);
    // First `halo` threads stage the samples preceding the block's window.
    if (lid < halo) {
    int src = gid - halo;
    s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
    }
    // Block-scope barrier (before any divergent return): staging must be
    // complete before any thread reads its neighborhood.
    __syncthreads();
    if (gid >= n) return;
    float acc_r = 0.0f, acc_i = 0.0f;
    // Convolve: newest sample pairs with taps[0].
    for (int k = 0; k < num_taps; ++k) {
    float2 v = s_data[lid + halo - k];
    float t = taps[k];
    acc_r += v.x * t;
    acc_i += v.y * t;
    }
    out[gid] = make_float2(acc_r, acc_i);
}
  198. GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda(
  199. const float2* in,
  200. float2* out,
  201. const float* taps,
  202. int n,
  203. int num_taps,
  204. gpud_stream_handle stream
  205. ) {
  206. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  207. const int block = 256;
  208. const int grid = (n + block - 1) / block;
  209. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  210. gpud_fir_kernel_v2<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, taps, n, num_taps);
  211. return (int)cudaGetLastError();
  212. }
  213. GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
  214. const float2* in,
  215. float2* out,
  216. int n_out,
  217. int factor
  218. ) {
  219. if (n_out <= 0 || factor <= 0) return 0;
  220. const int block = 256;
  221. const int grid = (n_out + block - 1) / block;
  222. gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor);
  223. return (int)cudaGetLastError();
  224. }
  225. GPUD_API int GPUD_CALL gpud_launch_decimate_stream_cuda(
  226. const float2* in,
  227. float2* out,
  228. int n_out,
  229. int factor,
  230. gpud_stream_handle stream
  231. ) {
  232. if (n_out <= 0 || factor <= 0) return 0;
  233. const int block = 256;
  234. const int grid = (n_out + block - 1) / block;
  235. gpud_decimate_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n_out, factor);
  236. return (int)cudaGetLastError();
  237. }
  238. __global__ void gpud_am_envelope_kernel(
  239. const float2* __restrict__ in,
  240. float* __restrict__ out,
  241. int n
  242. ) {
  243. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  244. if (idx >= n) return;
  245. float2 v = in[idx];
  246. out[idx] = sqrtf(v.x * v.x + v.y * v.y);
  247. }
  248. GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
  249. const float2* in,
  250. float* out,
  251. int n
  252. ) {
  253. if (n <= 0) return 0;
  254. const int block = 256;
  255. const int grid = (n + block - 1) / block;
  256. gpud_am_envelope_kernel<<<grid, block>>>(in, out, n);
  257. return (int)cudaGetLastError();
  258. }
  259. __global__ void gpud_ssb_product_kernel(
  260. const float2* __restrict__ in,
  261. float* __restrict__ out,
  262. int n,
  263. double phase_inc,
  264. double phase_start
  265. ) {
  266. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  267. if (idx >= n) return;
  268. double phase = phase_start + phase_inc * (double)idx;
  269. const double TWO_PI = 6.283185307179586;
  270. phase = phase - rint(phase / TWO_PI) * TWO_PI;
  271. float si, co;
  272. sincosf((float)phase, &si, &co);
  273. float2 v = in[idx];
  274. out[idx] = v.x * co - v.y * si;
  275. }
  276. GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda(
  277. const float2* in,
  278. float* out,
  279. int n,
  280. double phase_inc,
  281. double phase_start
  282. ) {
  283. if (n <= 0) return 0;
  284. const int block = 256;
  285. const int grid = (n + block - 1) / block;
  286. gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
  287. return (int)cudaGetLastError();
  288. }
  289. GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_prepare_cuda(
  290. const float2* in_new,
  291. int n_new,
  292. const float2* history_in,
  293. int history_len,
  294. const float* polyphase_taps,
  295. int polyphase_len,
  296. int decim,
  297. int num_taps,
  298. int phase_count_in,
  299. double phase_start,
  300. double phase_inc,
  301. float2* out,
  302. int* n_out,
  303. int* phase_count_out,
  304. double* phase_end_out,
  305. float2* history_out
  306. ) {
  307. if (!in_new || n_new < 0 || !polyphase_taps || polyphase_len <= 0 || decim <= 0 || num_taps <= 0) return -1;
  308. const int phase_len = (num_taps + decim - 1) / decim;
  309. if (polyphase_len < decim * phase_len) return -2;
  310. const int combined_len = history_len + n_new;
  311. float2* shifted = NULL;
  312. float2* combined = NULL;
  313. cudaError_t err = cudaMalloc((void**)&shifted, (size_t)max(1, n_new) * sizeof(float2));
  314. if (err != cudaSuccess) return (int)err;
  315. err = cudaMalloc((void**)&combined, (size_t)max(1, combined_len) * sizeof(float2));
  316. if (err != cudaSuccess) {
  317. cudaFree(shifted);
  318. return (int)err;
  319. }
  320. const int block = 256;
  321. const int grid_shift = (n_new + block - 1) / block;
  322. if (n_new > 0) {
  323. gpud_freq_shift_kernel<<<grid_shift, block>>>(in_new, shifted, n_new, phase_inc, phase_start);
  324. err = cudaGetLastError();
  325. if (err != cudaSuccess) {
  326. cudaFree(shifted);
  327. cudaFree(combined);
  328. return (int)err;
  329. }
  330. }
  331. if (history_len > 0 && history_in) {
  332. err = cudaMemcpy(combined, history_in, (size_t)history_len * sizeof(float2), cudaMemcpyDeviceToDevice);
  333. if (err != cudaSuccess) {
  334. cudaFree(shifted);
  335. cudaFree(combined);
  336. return (int)err;
  337. }
  338. }
  339. if (n_new > 0) {
  340. err = cudaMemcpy(combined + history_len, shifted, (size_t)n_new * sizeof(float2), cudaMemcpyDeviceToDevice);
  341. if (err != cudaSuccess) {
  342. cudaFree(shifted);
  343. cudaFree(combined);
  344. return (int)err;
  345. }
  346. }
  347. int out_count = 0;
  348. int phase_count = phase_count_in;
  349. for (int i = 0; i < n_new; ++i) {
  350. phase_count++;
  351. if (phase_count == decim) {
  352. float2 acc = make_float2(0.0f, 0.0f);
  353. int newest = history_len + i;
  354. for (int p = 0; p < decim; ++p) {
  355. for (int k = 0; k < phase_len; ++k) {
  356. int tap_idx = p * phase_len + k;
  357. if (tap_idx >= polyphase_len) continue;
  358. float tap;
  359. err = cudaMemcpy(&tap, polyphase_taps + tap_idx, sizeof(float), cudaMemcpyDeviceToHost);
  360. if (err != cudaSuccess) {
  361. cudaFree(shifted);
  362. cudaFree(combined);
  363. return (int)err;
  364. }
  365. if (tap == 0.0f) continue;
  366. int src_back = p + k * decim;
  367. int src_idx = newest - src_back;
  368. if (src_idx < 0) continue;
  369. float2 sample;
  370. err = cudaMemcpy(&sample, combined + src_idx, sizeof(float2), cudaMemcpyDeviceToHost);
  371. if (err != cudaSuccess) {
  372. cudaFree(shifted);
  373. cudaFree(combined);
  374. return (int)err;
  375. }
  376. acc.x += sample.x * tap;
  377. acc.y += sample.y * tap;
  378. }
  379. }
  380. err = cudaMemcpy(out + out_count, &acc, sizeof(float2), cudaMemcpyHostToDevice);
  381. if (err != cudaSuccess) {
  382. cudaFree(shifted);
  383. cudaFree(combined);
  384. return (int)err;
  385. }
  386. out_count++;
  387. phase_count = 0;
  388. }
  389. }
  390. const int keep = num_taps > 1 ? num_taps - 1 : 0;
  391. if (history_out && keep > 0) {
  392. int copy = keep;
  393. if (combined_len < copy) copy = combined_len;
  394. if (copy > 0) {
  395. err = cudaMemcpy(history_out, combined + (combined_len - copy), (size_t)copy * sizeof(float2), cudaMemcpyDeviceToDevice);
  396. if (err != cudaSuccess) {
  397. cudaFree(shifted);
  398. cudaFree(combined);
  399. return (int)err;
  400. }
  401. }
  402. }
  403. if (n_out) *n_out = out_count;
  404. if (phase_count_out) *phase_count_out = phase_count;
  405. if (phase_end_out) *phase_end_out = phase_start + phase_inc * (double)n_new;
  406. cudaFree(shifted);
  407. cudaFree(combined);
  408. return 0;
  409. }