Wideband autonomous SDR analysis engine forked from sdr-visual-suite
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

632 lines
19KB

  1. #include <cuda_runtime.h>
  2. #include <math.h>
  3. #if defined(_WIN32)
  4. #define GPUD_API extern "C" __declspec(dllexport)
  5. #define GPUD_CALL __stdcall
  6. #else
  7. #define GPUD_API extern "C"
  8. #define GPUD_CALL
  9. #endif
  10. typedef void* gpud_stream_handle;
  11. static __forceinline__ int gpud_max_i(int a, int b) {
  12. return a > b ? a : b;
  13. }
  14. GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) {
  15. if (!out) return -1;
  16. cudaStream_t stream;
  17. cudaError_t err = cudaStreamCreate(&stream);
  18. if (err != cudaSuccess) return (int)err;
  19. *out = (gpud_stream_handle)stream;
  20. return 0;
  21. }
  22. GPUD_API int GPUD_CALL gpud_stream_destroy(gpud_stream_handle stream) {
  23. if (!stream) return 0;
  24. return (int)cudaStreamDestroy((cudaStream_t)stream);
  25. }
  26. GPUD_API int GPUD_CALL gpud_stream_sync(gpud_stream_handle stream) {
  27. if (!stream) return (int)cudaDeviceSynchronize();
  28. return (int)cudaStreamSynchronize((cudaStream_t)stream);
  29. }
  30. __global__ void gpud_freq_shift_kernel(
  31. const float2* __restrict__ in,
  32. float2* __restrict__ out,
  33. int n,
  34. double phase_inc,
  35. double phase_start
  36. ) {
  37. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  38. if (idx >= n) return;
  39. double phase = phase_start + phase_inc * (double)idx;
  40. // Reduce phase to [-pi, pi) BEFORE float cast to preserve precision.
  41. // Without this, phase accumulates to millions of radians and the
  42. // (float) cast loses ~0.03-0.1 rad, causing audible clicks at
  43. // frame boundaries in streaming audio.
  44. const double TWO_PI = 6.283185307179586;
  45. phase = phase - rint(phase / TWO_PI) * TWO_PI;
  46. float si, co;
  47. sincosf((float)phase, &si, &co);
  48. float2 v = in[idx];
  49. out[idx].x = v.x * co - v.y * si;
  50. out[idx].y = v.x * si + v.y * co;
  51. }
  52. GPUD_API int GPUD_CALL gpud_launch_freq_shift_stream_cuda(
  53. const float2* in,
  54. float2* out,
  55. int n,
  56. double phase_inc,
  57. double phase_start,
  58. gpud_stream_handle stream
  59. ) {
  60. if (n <= 0) return 0;
  61. const int block = 256;
  62. const int grid = (n + block - 1) / block;
  63. gpud_freq_shift_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n, phase_inc, phase_start);
  64. return (int)cudaGetLastError();
  65. }
  66. GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda(
  67. const float2* in,
  68. float2* out,
  69. int n,
  70. double phase_inc,
  71. double phase_start
  72. ) {
  73. return gpud_launch_freq_shift_stream_cuda(in, out, n, phase_inc, phase_start, 0);
  74. }
  75. __global__ void gpud_fm_discrim_kernel(
  76. const float2* __restrict__ in,
  77. float* __restrict__ out,
  78. int n
  79. ) {
  80. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  81. if (idx >= n - 1) return;
  82. float2 prev = in[idx];
  83. float2 curr = in[idx + 1];
  84. float re = prev.x * curr.x + prev.y * curr.y;
  85. float im = prev.x * curr.y - prev.y * curr.x;
  86. out[idx] = atan2f(im, re);
  87. }
  88. GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda(
  89. const float2* in,
  90. float* out,
  91. int n
  92. ) {
  93. if (n <= 1) return 0;
  94. const int block = 256;
  95. const int grid = (n + block - 1) / block;
  96. gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n);
  97. return (int)cudaGetLastError();
  98. }
  99. __global__ void gpud_decimate_kernel(
  100. const float2* __restrict__ in,
  101. float2* __restrict__ out,
  102. int n_out,
  103. int factor
  104. ) {
  105. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  106. if (idx >= n_out) return;
  107. out[idx] = in[idx * factor];
  108. }
// FIR coefficients held in constant memory (fast broadcast reads when all
// lanes fetch the same tap). Capacity of 256 is the num_taps ceiling
// enforced by the upload/launch entrypoints; filled via
// gpud_upload_fir_taps_cuda.
__device__ __constant__ float gpud_fir_taps[256];
// Complex FIR filter with real taps read from gpud_fir_taps constant memory.
// Launch contract (see gpud_launch_fir_cuda / gpud_launch_fir_stream_cuda):
// 1-D grid, one thread per output sample, dynamic shared memory of
// (blockDim.x + num_taps - 1) * sizeof(float2) bytes.
// Each block stages its own samples plus a left halo of num_taps-1 earlier
// samples in shared memory; indices before in[0] read as complex zero
// (zero-padded start-up transient).
__global__ void gpud_fir_kernel(
    const float2* __restrict__ in,
    float2* __restrict__ out,
    int n,
    int num_taps
) {
    extern __shared__ float2 s_data[];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int lid = threadIdx.x;
    int halo = num_taps - 1;
    // Stage this thread's sample at s_data[lid + halo]; threads past the end
    // of the input stage zeros so later reads stay well-defined.
    if (gid < n) {
    s_data[lid + halo] = in[gid];
    } else {
    s_data[lid + halo] = make_float2(0.0f, 0.0f);
    }
    // The first `halo` threads also stage the left halo (tail of the
    // previous block's span). Requires num_taps - 1 <= blockDim.x, which the
    // launchers guarantee (num_taps <= 256, block = 256).
    if (lid < halo) {
    int src = gid - halo;
    s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
    }
    // Barrier before any thread reads shared memory written by others.
    // Reached by all threads: no early return occurs above this point.
    __syncthreads();
    if (gid >= n) return;
    float acc_r = 0.0f;
    float acc_i = 0.0f;
    // Direct-form convolution: y[gid] = sum_k taps[k] * x[gid - k].
    for (int k = 0; k < num_taps; ++k) {
    float2 v = s_data[lid + halo - k];
    float t = gpud_fir_taps[k];
    acc_r += v.x * t;
    acc_i += v.y * t;
    }
    out[gid] = make_float2(acc_r, acc_i);
}
  141. GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) {
  142. if (!taps || n <= 0 || n > 256) return -1;
  143. cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float));
  144. return (int)err;
  145. }
  146. GPUD_API int GPUD_CALL gpud_launch_fir_cuda(
  147. const float2* in,
  148. float2* out,
  149. int n,
  150. int num_taps
  151. ) {
  152. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  153. const int block = 256;
  154. const int grid = (n + block - 1) / block;
  155. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  156. gpud_fir_kernel<<<grid, block, sharedBytes>>>(in, out, n, num_taps);
  157. return (int)cudaGetLastError();
  158. }
  159. GPUD_API int GPUD_CALL gpud_launch_fir_stream_cuda(
  160. const float2* in,
  161. float2* out,
  162. int n,
  163. int num_taps,
  164. gpud_stream_handle stream
  165. ) {
  166. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  167. const int block = 256;
  168. const int grid = (n + block - 1) / block;
  169. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  170. gpud_fir_kernel<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, n, num_taps);
  171. return (int)cudaGetLastError();
  172. }
// Variant of gpud_fir_kernel that reads taps from a caller-supplied
// global-memory array instead of the fixed constant-memory table.
// Same launch contract: 1-D grid, one thread per output sample, and
// (blockDim.x + num_taps - 1) * sizeof(float2) bytes of dynamic shared
// memory; indices before in[0] read as complex zero.
__global__ void gpud_fir_kernel_v2(
    const float2* __restrict__ in,
    float2* __restrict__ out,
    const float* __restrict__ taps,
    int n,
    int num_taps
) {
    extern __shared__ float2 s_data[];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int lid = threadIdx.x;
    int halo = num_taps - 1;
    // Stage this thread's sample (zero-padded past the end of the input).
    if (gid < n) s_data[lid + halo] = in[gid];
    else s_data[lid + halo] = make_float2(0.0f, 0.0f);
    // First `halo` threads stage the left halo; requires num_taps - 1 <=
    // blockDim.x, which the launcher guarantees (num_taps <= 256, block 256).
    if (lid < halo) {
    int src = gid - halo;
    s_data[lid] = (src >= 0) ? in[src] : make_float2(0.0f, 0.0f);
    }
    // Barrier before cross-thread shared-memory reads; reached by all
    // threads (no early return above).
    __syncthreads();
    if (gid >= n) return;
    float acc_r = 0.0f, acc_i = 0.0f;
    // Direct-form convolution: y[gid] = sum_k taps[k] * x[gid - k].
    for (int k = 0; k < num_taps; ++k) {
    float2 v = s_data[lid + halo - k];
    float t = taps[k];
    acc_r += v.x * t;
    acc_i += v.y * t;
    }
    out[gid] = make_float2(acc_r, acc_i);
}
  201. GPUD_API int GPUD_CALL gpud_launch_fir_v2_stream_cuda(
  202. const float2* in,
  203. float2* out,
  204. const float* taps,
  205. int n,
  206. int num_taps,
  207. gpud_stream_handle stream
  208. ) {
  209. if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0;
  210. const int block = 256;
  211. const int grid = (n + block - 1) / block;
  212. size_t sharedBytes = (size_t)(block + num_taps - 1) * sizeof(float2);
  213. gpud_fir_kernel_v2<<<grid, block, sharedBytes, (cudaStream_t)stream>>>(in, out, taps, n, num_taps);
  214. return (int)cudaGetLastError();
  215. }
  216. GPUD_API int GPUD_CALL gpud_launch_decimate_cuda(
  217. const float2* in,
  218. float2* out,
  219. int n_out,
  220. int factor
  221. ) {
  222. if (n_out <= 0 || factor <= 0) return 0;
  223. const int block = 256;
  224. const int grid = (n_out + block - 1) / block;
  225. gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor);
  226. return (int)cudaGetLastError();
  227. }
  228. GPUD_API int GPUD_CALL gpud_launch_decimate_stream_cuda(
  229. const float2* in,
  230. float2* out,
  231. int n_out,
  232. int factor,
  233. gpud_stream_handle stream
  234. ) {
  235. if (n_out <= 0 || factor <= 0) return 0;
  236. const int block = 256;
  237. const int grid = (n_out + block - 1) / block;
  238. gpud_decimate_kernel<<<grid, block, 0, (cudaStream_t)stream>>>(in, out, n_out, factor);
  239. return (int)cudaGetLastError();
  240. }
  241. __global__ void gpud_am_envelope_kernel(
  242. const float2* __restrict__ in,
  243. float* __restrict__ out,
  244. int n
  245. ) {
  246. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  247. if (idx >= n) return;
  248. float2 v = in[idx];
  249. out[idx] = sqrtf(v.x * v.x + v.y * v.y);
  250. }
  251. GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda(
  252. const float2* in,
  253. float* out,
  254. int n
  255. ) {
  256. if (n <= 0) return 0;
  257. const int block = 256;
  258. const int grid = (n + block - 1) / block;
  259. gpud_am_envelope_kernel<<<grid, block>>>(in, out, n);
  260. return (int)cudaGetLastError();
  261. }
  262. __global__ void gpud_ssb_product_kernel(
  263. const float2* __restrict__ in,
  264. float* __restrict__ out,
  265. int n,
  266. double phase_inc,
  267. double phase_start
  268. ) {
  269. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  270. if (idx >= n) return;
  271. double phase = phase_start + phase_inc * (double)idx;
  272. const double TWO_PI = 6.283185307179586;
  273. phase = phase - rint(phase / TWO_PI) * TWO_PI;
  274. float si, co;
  275. sincosf((float)phase, &si, &co);
  276. float2 v = in[idx];
  277. out[idx] = v.x * co - v.y * si;
  278. }
  279. GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda(
  280. const float2* in,
  281. float* out,
  282. int n,
  283. double phase_inc,
  284. double phase_start
  285. ) {
  286. if (n <= 0) return 0;
  287. const int block = 256;
  288. const int grid = (n + block - 1) / block;
  289. gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start);
  290. return (int)cudaGetLastError();
  291. }
// Forward declarations for the streaming polyphase path; the definitions
// follow the legacy prepare entrypoint below.
__global__ void gpud_streaming_polyphase_accum_kernel(
    const float2* __restrict__ history_state,
    int history_len,
    const float2* __restrict__ shifted_new,
    int n_new,
    const float* __restrict__ polyphase_taps,
    int polyphase_len,
    int decim,
    int phase_len,
    int start_idx,
    int n_out,
    float2* __restrict__ out
);
__global__ void gpud_streaming_history_tail_kernel(
    const float2* __restrict__ history_state,
    int history_len,
    const float2* __restrict__ shifted_new,
    int n_new,
    int keep,
    float2* __restrict__ history_out
);
static __forceinline__ double gpud_reduce_phase(double phase);
  314. // Transitional legacy entrypoint retained for bring-up and comparison.
  315. // The production-native streaming path is gpud_launch_streaming_polyphase_stateful_cuda,
  316. // which preserves per-signal carry state across NEW-samples-only chunks.
// One-shot streaming step (legacy): frequency-shift the n_new samples, run
// the polyphase decimator over the logical stream [history_in | shifted],
// and report the decimated output plus the carry state (phase count,
// oscillator phase, history tail) through the out-parameters.
// Returns 0 on success, a negative code for argument errors, or a positive
// cudaError_t value. Allocates and frees a temporary device buffer for the
// shifted samples on every call. All kernels run on the default stream.
GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_prepare_cuda(
    const float2* in_new,
    int n_new,
    const float2* history_in,
    int history_len,
    const float* polyphase_taps,
    int polyphase_len,
    int decim,
    int num_taps,
    int phase_count_in,
    double phase_start,
    double phase_inc,
    float2* out,
    int* n_out,
    int* phase_count_out,
    double* phase_end_out,
    float2* history_out
) {
    if (n_new < 0 || !polyphase_taps || polyphase_len <= 0 || decim <= 0 || num_taps <= 0) return -1;
    // Taps are phase-major; the table must cover decim full phases of
    // ceil(num_taps / decim) entries each.
    const int phase_len = (num_taps + decim - 1) / decim;
    if (polyphase_len < decim * phase_len) return -2;
    // The FIR needs at most the last num_taps-1 samples as carried history.
    const int keep = num_taps > 1 ? num_taps - 1 : 0;
    int clamped_history_len = history_len;
    if (clamped_history_len < 0) clamped_history_len = 0;
    if (clamped_history_len > keep) clamped_history_len = keep;
    if (clamped_history_len > 0 && !history_in) return -5;
    // Temporary device buffer for the frequency-shifted new samples; freed
    // on every exit path below.
    float2* shifted = NULL;
    cudaError_t err = cudaSuccess;
    if (n_new > 0) {
    if (!in_new) return -3;
    err = cudaMalloc((void**)&shifted, (size_t)gpud_max_i(1, n_new) * sizeof(float2));
    if (err != cudaSuccess) return (int)err;
    const int block = 256;
    const int grid_shift = (n_new + block - 1) / block;
    gpud_freq_shift_kernel<<<grid_shift, block>>>(in_new, shifted, n_new, phase_inc, phase_start);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
    cudaFree(shifted);
    return (int)err;
    }
    }
    // Clamp the carry-in phase count into [0, decim).
    int phase_count = phase_count_in;
    if (phase_count < 0) phase_count = 0;
    if (phase_count >= decim) phase_count %= decim;
    // Every decim accumulated input samples complete one decimated output.
    const int total_phase = phase_count + n_new;
    const int out_count = total_phase / decim;
    if (out_count > 0) {
    if (!out) {
    cudaFree(shifted);
    return -4;
    }
    const int block = 256;
    const int grid = (out_count + block - 1) / block;
    // Index (within the new chunk) of the first output's newest sample.
    const int start_idx = decim - phase_count - 1;
    gpud_streaming_polyphase_accum_kernel<<<grid, block>>>(
    history_in,
    clamped_history_len,
    shifted,
    n_new,
    polyphase_taps,
    polyphase_len,
    decim,
    phase_len,
    start_idx,
    out_count,
    out
    );
    err = cudaGetLastError();
    if (err != cudaSuccess) {
    cudaFree(shifted);
    return (int)err;
    }
    }
    // Emit the next chunk's history: the last min(history + n_new, keep)
    // samples of the concatenated [history | shifted] stream.
    if (history_out && keep > 0) {
    const int new_history_len = clamped_history_len + n_new < keep ? clamped_history_len + n_new : keep;
    if (new_history_len > 0) {
    const int block = 256;
    const int grid = (new_history_len + block - 1) / block;
    gpud_streaming_history_tail_kernel<<<grid, block>>>(
    history_in,
    clamped_history_len,
    shifted,
    n_new,
    new_history_len,
    history_out
    );
    err = cudaGetLastError();
    if (err != cudaSuccess) {
    cudaFree(shifted);
    return (int)err;
    }
    }
    }
    // Publish the carry-out state; each out-parameter is optional.
    if (n_out) *n_out = out_count;
    if (phase_count_out) *phase_count_out = total_phase % decim;
    if (phase_end_out) *phase_end_out = gpud_reduce_phase(phase_start + phase_inc * (double)n_new);
    // cudaFree waits for prior device work, so the kernels above are done
    // with `shifted` before it is released.
    if (shifted) cudaFree(shifted);
    return 0;
}
  416. static __device__ __forceinline__ float2 gpud_stream_sample_at(
  417. const float2* __restrict__ history_state,
  418. int history_len,
  419. const float2* __restrict__ shifted_new,
  420. int n_new,
  421. int idx
  422. ) {
  423. if (idx < 0) return make_float2(0.0f, 0.0f);
  424. if (idx < history_len) return history_state[idx];
  425. int shifted_idx = idx - history_len;
  426. if (shifted_idx < 0 || shifted_idx >= n_new) return make_float2(0.0f, 0.0f);
  427. return shifted_new[shifted_idx];
  428. }
// Polyphase decimating FIR over the logical stream [history | shifted_new].
// Launch: 1-D grid, one thread per decimated output sample. `newest` is the
// index, within the concatenated stream, of the most recent input sample
// contributing to this output; taps are walked phase-by-phase (p = offset
// within the decimation stride, k = tap index within that phase), reading
// backwards from `newest`. Out-of-range reads resolve to complex zero via
// gpud_stream_sample_at.
__global__ void gpud_streaming_polyphase_accum_kernel(
    const float2* __restrict__ history_state,
    int history_len,
    const float2* __restrict__ shifted_new,
    int n_new,
    const float* __restrict__ polyphase_taps,
    int polyphase_len,
    int decim,
    int phase_len,
    int start_idx,
    int n_out,
    float2* __restrict__ out
) {
    int out_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (out_idx >= n_out) return;
    // Outputs are decim samples apart; start_idx shifts the first output by
    // the carry-in phase count (the call sites pass decim - phase_count - 1).
    int newest = history_len + start_idx + out_idx * decim;
    float acc_r = 0.0f;
    float acc_i = 0.0f;
    for (int p = 0; p < decim; ++p) {
    for (int k = 0; k < phase_len; ++k) {
    // Taps are stored phase-major: entry p * phase_len + k.
    int tap_idx = p * phase_len + k;
    if (tap_idx >= polyphase_len) continue;
    float tap = polyphase_taps[tap_idx];
    // Skip zero entries (e.g. padding when num_taps is not a multiple
    // of decim) -- the sample fetch is the expensive part.
    if (tap == 0.0f) continue;
    // This tap touches the sample p + k*decim steps back from `newest`.
    int src_back = p + k * decim;
    int src_idx = newest - src_back;
    float2 sample = gpud_stream_sample_at(history_state, history_len, shifted_new, n_new, src_idx);
    acc_r += sample.x * tap;
    acc_i += sample.y * tap;
    }
    }
    out[out_idx] = make_float2(acc_r, acc_i);
}
  462. __global__ void gpud_streaming_history_tail_kernel(
  463. const float2* __restrict__ history_state,
  464. int history_len,
  465. const float2* __restrict__ shifted_new,
  466. int n_new,
  467. int keep,
  468. float2* __restrict__ history_out
  469. ) {
  470. int idx = blockIdx.x * blockDim.x + threadIdx.x;
  471. if (idx >= keep) return;
  472. int combined_len = history_len + n_new;
  473. int src_idx = combined_len - keep + idx;
  474. history_out[idx] = gpud_stream_sample_at(history_state, history_len, shifted_new, n_new, src_idx);
  475. }
  476. static __forceinline__ double gpud_reduce_phase(double phase) {
  477. const double TWO_PI = 6.283185307179586;
  478. return phase - rint(phase / TWO_PI) * TWO_PI;
  479. }
// Production-native candidate entrypoint for the stateful streaming extractor.
// Callers provide only NEW samples; overlap+trim is intentionally not part of this path.
// All buffers are caller-owned device memory (no allocation happens here):
//   in_new/n_new           new complex samples for this chunk
//   shifted_new_tmp        scratch for the frequency-shifted samples (>= n_new)
//   polyphase_taps         phase-major tap table, >= decim * phase_len entries
//   history_state          persistent filter tail, capacity history_cap
//   history_scratch        scratch for rebuilding the tail (>= history_cap)
//   history_len_io         in/out: valid samples in history_state
//   phase_count_state      in/out: samples accumulated toward the next output
//   phase_state            in/out: oscillator phase (wrapped on commit)
//   out/out_cap/n_out      decimated output buffer, its capacity, and the
//                          produced count
// Returns 0 on success, -10..-16 for argument/capacity errors, or a positive
// cudaError_t value. Kernels run on the default stream; state is committed
// only after every stage succeeded.
GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_stateful_cuda(
    const float2* in_new,
    int n_new,
    float2* shifted_new_tmp,
    const float* polyphase_taps,
    int polyphase_len,
    int decim,
    int num_taps,
    float2* history_state,
    float2* history_scratch,
    int history_cap,
    int* history_len_io,
    int* phase_count_state,
    double* phase_state,
    double phase_inc,
    float2* out,
    int out_cap,
    int* n_out
) {
    // Negative codes distinguish argument errors from positive cudaError_t.
    if (!polyphase_taps || decim <= 0 || num_taps <= 0 || !history_len_io || !phase_count_state || !phase_state || !n_out) return -10;
    if (n_new < 0 || out_cap < 0 || history_cap < 0) return -11;
    const int phase_len = (num_taps + decim - 1) / decim;
    if (polyphase_len < decim * phase_len) return -12;
    // Clamp carried state into valid ranges so a corrupted caller value
    // cannot drive an out-of-bounds index.
    int history_len = *history_len_io;
    if (history_len < 0) history_len = 0;
    if (history_len > history_cap) history_len = history_cap;
    int phase_count = *phase_count_state;
    if (phase_count < 0) phase_count = 0;
    if (phase_count >= decim) phase_count %= decim;
    double phase_start = *phase_state;
    // Stage 1: frequency-shift the new samples into the caller's scratch.
    if (n_new > 0) {
    if (!in_new || !shifted_new_tmp) return -13;
    const int block = 256;
    const int grid = (n_new + block - 1) / block;
    gpud_freq_shift_kernel<<<grid, block>>>(in_new, shifted_new_tmp, n_new, phase_inc, phase_start);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) return (int)err;
    }
    // Stage 2: polyphase accumulation. Every decim accumulated samples
    // complete one output; out_count of them finish during this chunk.
    const int total_phase = phase_count + n_new;
    const int out_count = total_phase / decim;
    if (out_count > out_cap) return -14;
    if (out_count > 0) {
    if (!out) return -15;
    const int block = 256;
    const int grid = (out_count + block - 1) / block;
    // Index (within the new chunk) of the first output's newest sample.
    const int start_idx = decim - phase_count - 1;
    gpud_streaming_polyphase_accum_kernel<<<grid, block>>>(
    history_state,
    history_len,
    shifted_new_tmp,
    n_new,
    polyphase_taps,
    polyphase_len,
    decim,
    phase_len,
    start_idx,
    out_count,
    out
    );
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) return (int)err;
    }
    // Stage 3: rebuild the history tail into scratch, then copy it back into
    // history_state. The scratch buffer avoids the tail kernel reading and
    // writing the same memory; the synchronous default-stream D2D memcpy
    // orders after the kernels above.
    int new_history_len = history_len;
    if (history_cap > 0) {
    new_history_len = history_len + n_new;
    if (new_history_len > history_cap) new_history_len = history_cap;
    if (new_history_len > 0) {
    if (!history_state || !history_scratch) return -16;
    const int block = 256;
    const int grid = (new_history_len + block - 1) / block;
    gpud_streaming_history_tail_kernel<<<grid, block>>>(
    history_state,
    history_len,
    shifted_new_tmp,
    n_new,
    new_history_len,
    history_scratch
    );
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) return (int)err;
    err = cudaMemcpy(history_state, history_scratch, (size_t)new_history_len * sizeof(float2), cudaMemcpyDeviceToDevice);
    if (err != cudaSuccess) return (int)err;
    }
    } else {
    new_history_len = 0;
    }
    // Commit the carried state only after every stage succeeded, so a failed
    // call leaves the caller's state untouched.
    *history_len_io = new_history_len;
    *phase_count_state = total_phase % decim;
    *phase_state = gpud_reduce_phase(phase_start + phase_inc * (double)n_new);
    *n_out = out_count;
    return 0;
}