Wideband autonomous SDR analysis engine forked from sdr-visual-suite
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

685 line
22KB

  1. //go:build cufft && windows
  2. package gpudemod
  3. /*
  4. #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include"
  5. #cgo windows LDFLAGS: -lcudart64_13 -lkernel32
  6. #include <windows.h>
  7. #include <stdlib.h>
  8. #include <cuda_runtime.h>
  9. typedef struct { float x; float y; } gpud_float2;
  10. typedef int (__stdcall *gpud_upload_fir_taps_fn)(const float* taps, int n);
  11. typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
  12. typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n);
  13. typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
  14. typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
  15. typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n);
  16. typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
  17. static HMODULE gpud_mod = NULL;
  18. static gpud_upload_fir_taps_fn gpud_p_upload_fir_taps = NULL;
  19. static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL;
  20. static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL;
  21. static gpud_launch_fir_fn gpud_p_launch_fir = NULL;
  22. static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL;
  23. static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL;
  24. static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL;
  25. static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); }
  26. static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); }
  27. static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); }
  28. static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); }
  29. static int gpud_device_sync() { return (int)cudaDeviceSynchronize(); }
  30. static int gpud_load_library(const char* path) {
  31. if (gpud_mod != NULL) return 0;
  32. gpud_mod = LoadLibraryA(path);
  33. if (gpud_mod == NULL) return -1;
  34. gpud_p_upload_fir_taps = (gpud_upload_fir_taps_fn)GetProcAddress(gpud_mod, "gpud_upload_fir_taps_cuda");
  35. gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda");
  36. gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda");
  37. gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda");
  38. gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda");
  39. gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda");
  40. gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda");
  41. if (!gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) {
  42. FreeLibrary(gpud_mod);
  43. gpud_mod = NULL;
  44. return -2;
  45. }
  46. return 0;
  47. }
  48. static int gpud_upload_fir_taps(const float* taps, int n) {
  49. if (!gpud_p_upload_fir_taps) return -1;
  50. return gpud_p_upload_fir_taps(taps, n);
  51. }
  52. static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
  53. if (!gpud_p_launch_freq_shift) return -1;
  54. return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start);
  55. }
  56. static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
  57. if (!gpud_p_launch_fm_discrim) return -1;
  58. return gpud_p_launch_fm_discrim(in, out, n);
  59. }
  60. static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) {
  61. if (!gpud_p_launch_fir) return -1;
  62. return gpud_p_launch_fir(in, out, n, num_taps);
  63. }
  64. static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) {
  65. if (!gpud_p_launch_decimate) return -1;
  66. return gpud_p_launch_decimate(in, out, n_out, factor);
  67. }
  68. static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) {
  69. if (!gpud_p_launch_am_envelope) return -1;
  70. return gpud_p_launch_am_envelope(in, out, n);
  71. }
  72. static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) {
  73. if (!gpud_p_launch_ssb_product) return -1;
  74. return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start);
  75. }
  76. */
  77. import "C"
  78. import (
  79. "errors"
  80. "fmt"
  81. "math"
  82. "os"
  83. "path/filepath"
  84. "sync"
  85. "unsafe"
  86. "sdr-visual-suite/internal/demod"
  87. "sdr-visual-suite/internal/dsp"
  88. )
  89. type DemodType int
  90. const (
  91. DemodNFM DemodType = iota
  92. DemodWFM
  93. DemodAM
  94. DemodUSB
  95. DemodLSB
  96. DemodCW
  97. )
  98. var loadOnce sync.Once
  99. var loadErr error
  100. func ensureDLLLoaded() error {
  101. loadOnce.Do(func() {
  102. candidates := []string{}
  103. if exe, err := os.Executable(); err == nil {
  104. dir := filepath.Dir(exe)
  105. candidates = append(candidates, filepath.Join(dir, "gpudemod_kernels.dll"))
  106. }
  107. if wd, err := os.Getwd(); err == nil {
  108. candidates = append(candidates,
  109. filepath.Join(wd, "gpudemod_kernels.dll"),
  110. filepath.Join(wd, "internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"),
  111. )
  112. }
  113. if env := os.Getenv("GPUMOD_DLL"); env != "" {
  114. candidates = append([]string{env}, candidates...)
  115. }
  116. seen := map[string]bool{}
  117. for _, p := range candidates {
  118. if p == "" || seen[p] {
  119. continue
  120. }
  121. seen[p] = true
  122. if _, err := os.Stat(p); err == nil {
  123. cp := C.CString(p)
  124. res := C.gpud_load_library(cp)
  125. C.free(unsafe.Pointer(cp))
  126. if res == 0 {
  127. loadErr = nil
  128. fmt.Fprintf(os.Stderr, "gpudemod: loaded DLL %s\n", p)
  129. return
  130. }
  131. loadErr = fmt.Errorf("failed to load gpudemod DLL: %s (code %d)", p, int(res))
  132. fmt.Fprintf(os.Stderr, "gpudemod: DLL load failed for %s (code %d)\n", p, int(res))
  133. }
  134. }
  135. if loadErr == nil {
  136. loadErr = errors.New("gpudemod_kernels.dll not found")
  137. fmt.Fprintln(os.Stderr, "gpudemod: gpudemod_kernels.dll not found in search paths")
  138. }
  139. })
  140. return loadErr
  141. }
  142. type Engine struct {
  143. maxSamples int
  144. sampleRate int
  145. phase float64
  146. bfoPhase float64
  147. firTaps []float32
  148. cudaReady bool
  149. lastShiftUsedGPU bool
  150. lastFIRUsedGPU bool
  151. lastDecimUsedGPU bool
  152. lastDemodUsedGPU bool
  153. dIQIn *C.gpud_float2
  154. dShifted *C.gpud_float2
  155. dFiltered *C.gpud_float2
  156. dDecimated *C.gpud_float2
  157. dAudio *C.float
  158. iqBytes C.size_t
  159. audioBytes C.size_t
  160. }
  161. func Available() bool {
  162. if ensureDLLLoaded() != nil {
  163. return false
  164. }
  165. var count C.int
  166. if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
  167. return false
  168. }
  169. return count > 0
  170. }
  171. func New(maxSamples int, sampleRate int) (*Engine, error) {
  172. if maxSamples <= 0 {
  173. return nil, errors.New("invalid maxSamples")
  174. }
  175. if sampleRate <= 0 {
  176. return nil, errors.New("invalid sampleRate")
  177. }
  178. if err := ensureDLLLoaded(); err != nil {
  179. return nil, err
  180. }
  181. if !Available() {
  182. return nil, errors.New("cuda device not available")
  183. }
  184. e := &Engine{
  185. maxSamples: maxSamples,
  186. sampleRate: sampleRate,
  187. cudaReady: true,
  188. iqBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
  189. audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
  190. }
  191. var ptr unsafe.Pointer
  192. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  193. e.Close()
  194. return nil, errors.New("cudaMalloc dIQIn failed")
  195. }
  196. e.dIQIn = (*C.gpud_float2)(ptr)
  197. ptr = nil
  198. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  199. e.Close()
  200. return nil, errors.New("cudaMalloc dShifted failed")
  201. }
  202. e.dShifted = (*C.gpud_float2)(ptr)
  203. ptr = nil
  204. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  205. e.Close()
  206. return nil, errors.New("cudaMalloc dFiltered failed")
  207. }
  208. e.dFiltered = (*C.gpud_float2)(ptr)
  209. ptr = nil
  210. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  211. e.Close()
  212. return nil, errors.New("cudaMalloc dDecimated failed")
  213. }
  214. e.dDecimated = (*C.gpud_float2)(ptr)
  215. ptr = nil
  216. if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
  217. e.Close()
  218. return nil, errors.New("cudaMalloc dAudio failed")
  219. }
  220. e.dAudio = (*C.float)(ptr)
  221. return e, nil
  222. }
  223. func (e *Engine) SetFIR(taps []float32) {
  224. if len(taps) == 0 {
  225. e.firTaps = nil
  226. return
  227. }
  228. if len(taps) > 256 {
  229. taps = taps[:256]
  230. }
  231. e.firTaps = append(e.firTaps[:0], taps...)
  232. if e.cudaReady {
  233. _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps)))
  234. }
  235. }
  236. func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU }
  237. func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU }
  238. func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) {
  239. if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
  240. return nil, false
  241. }
  242. bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  243. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess {
  244. return nil, false
  245. }
  246. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  247. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
  248. return nil, false
  249. }
  250. if C.gpud_device_sync() != C.cudaSuccess {
  251. return nil, false
  252. }
  253. out := make([]complex64, len(iq))
  254. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess {
  255. return nil, false
  256. }
  257. e.phase += phaseInc * float64(len(iq))
  258. return out, true
  259. }
  260. func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) {
  261. if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil {
  262. return nil, false
  263. }
  264. iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  265. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess {
  266. return nil, false
  267. }
  268. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 {
  269. return nil, false
  270. }
  271. if C.gpud_device_sync() != C.cudaSuccess {
  272. return nil, false
  273. }
  274. out := make([]complex64, len(iq))
  275. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess {
  276. return nil, false
  277. }
  278. return out, true
  279. }
  280. func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) {
  281. if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil {
  282. return nil, false
  283. }
  284. nOut := len(filtered) / factor
  285. if nOut <= 0 {
  286. return nil, false
  287. }
  288. iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0)))
  289. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess {
  290. return nil, false
  291. }
  292. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 {
  293. return nil, false
  294. }
  295. if C.gpud_device_sync() != C.cudaSuccess {
  296. return nil, false
  297. }
  298. out := make([]complex64, nOut)
  299. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  300. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  301. return nil, false
  302. }
  303. return out, true
  304. }
  305. func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) {
  306. if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil {
  307. return nil, false
  308. }
  309. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  310. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  311. return nil, false
  312. }
  313. if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  314. return nil, false
  315. }
  316. if C.gpud_device_sync() != C.cudaSuccess {
  317. return nil, false
  318. }
  319. out := make([]float32, len(shifted)-1)
  320. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  321. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  322. return nil, false
  323. }
  324. return out, true
  325. }
  326. func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) {
  327. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  328. return nil, false
  329. }
  330. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  331. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  332. return nil, false
  333. }
  334. if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  335. return nil, false
  336. }
  337. if C.gpud_device_sync() != C.cudaSuccess {
  338. return nil, false
  339. }
  340. out := make([]float32, len(shifted))
  341. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  342. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  343. return nil, false
  344. }
  345. return out, true
  346. }
  347. func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) {
  348. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  349. return nil, false
  350. }
  351. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  352. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  353. return nil, false
  354. }
  355. phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  356. if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 {
  357. return nil, false
  358. }
  359. if C.gpud_device_sync() != C.cudaSuccess {
  360. return nil, false
  361. }
  362. out := make([]float32, len(shifted))
  363. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  364. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  365. return nil, false
  366. }
  367. e.bfoPhase += phaseInc * float64(len(shifted))
  368. return out, true
  369. }
  370. func (e *Engine) DemodFused(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  371. if e == nil {
  372. return nil, 0, errors.New("nil CUDA demod engine")
  373. }
  374. if !e.cudaReady {
  375. return nil, 0, errors.New("cuda demod engine is not initialized")
  376. }
  377. if len(iq) == 0 {
  378. return nil, 0, nil
  379. }
  380. e.lastShiftUsedGPU = false
  381. e.lastFIRUsedGPU = false
  382. e.lastDecimUsedGPU = false
  383. e.lastDemodUsedGPU = false
  384. if len(iq) > e.maxSamples {
  385. return nil, 0, errors.New("sample count exceeds engine capacity")
  386. }
  387. var outRate int
  388. switch mode {
  389. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  390. outRate = 48000
  391. case DemodWFM:
  392. outRate = 192000
  393. default:
  394. return nil, 0, errors.New("unsupported demod type")
  395. }
  396. cutoff := bw / 2
  397. if cutoff < 200 {
  398. cutoff = 200
  399. }
  400. taps := e.firTaps
  401. if len(taps) == 0 {
  402. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  403. taps = make([]float32, len(base64))
  404. for i, v := range base64 {
  405. taps[i] = float32(v)
  406. }
  407. e.SetFIR(taps)
  408. }
  409. if len(taps) == 0 {
  410. return nil, 0, errors.New("no FIR taps configured")
  411. }
  412. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  413. if decim < 1 {
  414. decim = 1
  415. }
  416. n := len(iq)
  417. nOut := n / decim
  418. if nOut <= 1 {
  419. return nil, 0, errors.New("not enough output samples after decimation")
  420. }
  421. bytesIn := C.size_t(n) * C.size_t(unsafe.Sizeof(complex64(0)))
  422. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != C.cudaSuccess {
  423. return nil, 0, errors.New("cudaMemcpy H2D failed")
  424. }
  425. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  426. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(n), C.double(phaseInc), C.double(e.phase)) != 0 {
  427. return nil, 0, errors.New("gpu freq shift failed")
  428. }
  429. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(n), C.int(len(taps))) != 0 {
  430. return nil, 0, errors.New("gpu FIR failed")
  431. }
  432. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(decim)) != 0 {
  433. return nil, 0, errors.New("gpu decimate failed")
  434. }
  435. e.lastShiftUsedGPU = true
  436. e.lastFIRUsedGPU = true
  437. e.lastDecimUsedGPU = true
  438. e.lastDemodUsedGPU = false
  439. switch mode {
  440. case DemodNFM, DemodWFM:
  441. if C.gpud_launch_fm_discrim(e.dDecimated, e.dAudio, C.int(nOut)) != 0 {
  442. return nil, 0, errors.New("gpu FM discrim failed")
  443. }
  444. out := make([]float32, nOut-1)
  445. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  446. if C.gpud_device_sync() != C.cudaSuccess {
  447. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  448. }
  449. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  450. return nil, 0, errors.New("cudaMemcpy D2H failed")
  451. }
  452. e.phase += phaseInc * float64(n)
  453. e.lastDemodUsedGPU = true
  454. return out, e.sampleRate / decim, nil
  455. case DemodAM:
  456. if C.gpud_launch_am_envelope(e.dDecimated, e.dAudio, C.int(nOut)) != 0 {
  457. return nil, 0, errors.New("gpu AM envelope failed")
  458. }
  459. out := make([]float32, nOut)
  460. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  461. if C.gpud_device_sync() != C.cudaSuccess {
  462. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  463. }
  464. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  465. return nil, 0, errors.New("cudaMemcpy D2H failed")
  466. }
  467. e.phase += phaseInc * float64(n)
  468. e.lastDemodUsedGPU = true
  469. return out, e.sampleRate / decim, nil
  470. case DemodUSB, DemodLSB, DemodCW:
  471. bfoHz := 700.0
  472. if mode == DemodLSB {
  473. bfoHz = -700.0
  474. }
  475. phaseBFO := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  476. if C.gpud_launch_ssb_product(e.dDecimated, e.dAudio, C.int(nOut), C.double(phaseBFO), C.double(e.bfoPhase)) != 0 {
  477. return nil, 0, errors.New("gpu SSB product failed")
  478. }
  479. out := make([]float32, nOut)
  480. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  481. if C.gpud_device_sync() != C.cudaSuccess {
  482. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  483. }
  484. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  485. return nil, 0, errors.New("cudaMemcpy D2H failed")
  486. }
  487. e.phase += phaseInc * float64(n)
  488. e.bfoPhase += phaseBFO * float64(nOut)
  489. e.lastDemodUsedGPU = true
  490. return out, e.sampleRate / decim, nil
  491. default:
  492. return nil, 0, errors.New("unsupported demod type")
  493. }
  494. }
  495. func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  496. if e == nil {
  497. return nil, 0, errors.New("nil CUDA demod engine")
  498. }
  499. if !e.cudaReady {
  500. return nil, 0, errors.New("cuda demod engine is not initialized")
  501. }
  502. if len(iq) == 0 {
  503. return nil, 0, nil
  504. }
  505. if len(iq) > e.maxSamples {
  506. return nil, 0, errors.New("sample count exceeds engine capacity")
  507. }
  508. shifted, ok := e.tryCUDAFreqShift(iq, offsetHz)
  509. e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3)
  510. if !e.lastShiftUsedGPU {
  511. shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz)
  512. }
  513. var outRate int
  514. switch mode {
  515. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  516. outRate = 48000
  517. case DemodWFM:
  518. outRate = 192000
  519. default:
  520. return nil, 0, errors.New("unsupported demod type")
  521. }
  522. cutoff := bw / 2
  523. if cutoff < 200 {
  524. cutoff = 200
  525. }
  526. taps := e.firTaps
  527. if len(taps) == 0 {
  528. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  529. taps = make([]float32, len(base64))
  530. for i, v := range base64 {
  531. taps[i] = float32(v)
  532. }
  533. e.SetFIR(taps)
  534. }
  535. filtered, ok := e.tryCUDAFIR(shifted, len(taps))
  536. if ok {
  537. if validationEnabled() {
  538. e.lastFIRUsedGPU = ValidateFIR(shifted, taps, filtered, 1e-3)
  539. if !e.lastFIRUsedGPU {
  540. ftaps := make([]float64, len(taps))
  541. for i, v := range taps {
  542. ftaps[i] = float64(v)
  543. }
  544. filtered = dsp.ApplyFIR(shifted, ftaps)
  545. }
  546. } else {
  547. e.lastFIRUsedGPU = true
  548. }
  549. }
  550. if filtered == nil {
  551. ftaps := make([]float64, len(taps))
  552. for i, v := range taps {
  553. ftaps[i] = float64(v)
  554. }
  555. filtered = dsp.ApplyFIR(shifted, ftaps)
  556. }
  557. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  558. if decim < 1 {
  559. decim = 1
  560. }
  561. dec, ok := e.tryCUDADecimate(filtered, decim)
  562. if ok {
  563. if validationEnabled() {
  564. e.lastDecimUsedGPU = ValidateDecimate(filtered, decim, dec, 1e-3)
  565. if !e.lastDecimUsedGPU {
  566. dec = dsp.Decimate(filtered, decim)
  567. }
  568. } else {
  569. e.lastDecimUsedGPU = true
  570. }
  571. }
  572. if dec == nil {
  573. dec = dsp.Decimate(filtered, decim)
  574. }
  575. inputRate := e.sampleRate / decim
  576. e.lastDemodUsedGPU = false
  577. switch mode {
  578. case DemodNFM:
  579. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  580. e.lastDemodUsedGPU = true
  581. return gpuAudio, inputRate, nil
  582. }
  583. return demod.NFM{}.Demod(dec, inputRate), inputRate, nil
  584. case DemodWFM:
  585. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  586. e.lastDemodUsedGPU = true
  587. return gpuAudio, inputRate, nil
  588. }
  589. return demod.WFM{}.Demod(dec, inputRate), inputRate, nil
  590. case DemodAM:
  591. if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok {
  592. e.lastDemodUsedGPU = true
  593. return gpuAudio, inputRate, nil
  594. }
  595. return demod.AM{}.Demod(dec, inputRate), inputRate, nil
  596. case DemodUSB:
  597. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  598. e.lastDemodUsedGPU = true
  599. return gpuAudio, inputRate, nil
  600. }
  601. return demod.USB{}.Demod(dec, inputRate), inputRate, nil
  602. case DemodLSB:
  603. if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok {
  604. e.lastDemodUsedGPU = true
  605. return gpuAudio, inputRate, nil
  606. }
  607. return demod.LSB{}.Demod(dec, inputRate), inputRate, nil
  608. case DemodCW:
  609. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  610. e.lastDemodUsedGPU = true
  611. return gpuAudio, inputRate, nil
  612. }
  613. return demod.CW{}.Demod(dec, inputRate), inputRate, nil
  614. default:
  615. return nil, 0, errors.New("unsupported demod type")
  616. }
  617. }
  618. func (e *Engine) Close() {
  619. if e == nil {
  620. return
  621. }
  622. if e.dIQIn != nil {
  623. _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
  624. e.dIQIn = nil
  625. }
  626. if e.dShifted != nil {
  627. _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
  628. e.dShifted = nil
  629. }
  630. if e.dFiltered != nil {
  631. _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered))
  632. e.dFiltered = nil
  633. }
  634. if e.dDecimated != nil {
  635. _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
  636. e.dDecimated = nil
  637. }
  638. if e.dAudio != nil {
  639. _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
  640. e.dAudio = nil
  641. }
  642. e.firTaps = nil
  643. e.cudaReady = false
  644. }