Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

767 linhas
24KB

  1. //go:build cufft && windows
  2. package gpudemod
  3. /*
  4. #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include"
  5. #cgo windows LDFLAGS: -lcudart64_13 -lkernel32
  6. #include <windows.h>
  7. #include <stdlib.h>
  8. #include <cuda_runtime.h>
  9. typedef struct { float x; float y; } gpud_float2;
  10. typedef void* gpud_stream_handle;
  11. typedef int (__stdcall *gpud_stream_create_fn)(gpud_stream_handle* out);
  12. typedef int (__stdcall *gpud_stream_destroy_fn)(gpud_stream_handle stream);
  13. typedef int (__stdcall *gpud_stream_sync_fn)(gpud_stream_handle stream);
  14. typedef int (__stdcall *gpud_upload_fir_taps_fn)(const float* taps, int n);
  15. typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
  16. typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n);
  17. typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
  18. typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
  19. typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n);
  20. typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
  21. static HMODULE gpud_mod = NULL;
  22. static gpud_upload_fir_taps_fn gpud_p_upload_fir_taps = NULL;
  23. static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL;
  24. static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL;
  25. static gpud_launch_fir_fn gpud_p_launch_fir = NULL;
  26. static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL;
  27. static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL;
  28. static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL;
  29. static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); }
  30. static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); }
  31. static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); }
  32. static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); }
  33. static int gpud_device_sync() { return (int)cudaDeviceSynchronize(); }
  34. static int gpud_load_library(const char* path) {
  35. if (gpud_mod != NULL) return 0;
  36. gpud_mod = LoadLibraryA(path);
  37. if (gpud_mod == NULL) return -1;
  38. gpud_p_upload_fir_taps = (gpud_upload_fir_taps_fn)GetProcAddress(gpud_mod, "gpud_upload_fir_taps_cuda");
  39. gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda");
  40. gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda");
  41. gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda");
  42. gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda");
  43. gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda");
  44. gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda");
  45. if (!gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) {
  46. FreeLibrary(gpud_mod);
  47. gpud_mod = NULL;
  48. return -2;
  49. }
  50. return 0;
  51. }
  52. static int gpud_upload_fir_taps(const float* taps, int n) {
  53. if (!gpud_p_upload_fir_taps) return -1;
  54. return gpud_p_upload_fir_taps(taps, n);
  55. }
  56. static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
  57. if (!gpud_p_launch_freq_shift) return -1;
  58. return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start);
  59. }
  60. static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
  61. if (!gpud_p_launch_fm_discrim) return -1;
  62. return gpud_p_launch_fm_discrim(in, out, n);
  63. }
  64. static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) {
  65. if (!gpud_p_launch_fir) return -1;
  66. return gpud_p_launch_fir(in, out, n, num_taps);
  67. }
  68. static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) {
  69. if (!gpud_p_launch_decimate) return -1;
  70. return gpud_p_launch_decimate(in, out, n_out, factor);
  71. }
  72. static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) {
  73. if (!gpud_p_launch_am_envelope) return -1;
  74. return gpud_p_launch_am_envelope(in, out, n);
  75. }
  76. static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) {
  77. if (!gpud_p_launch_ssb_product) return -1;
  78. return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start);
  79. }
  80. */
  81. import "C"
  82. import (
  83. "errors"
  84. "fmt"
  85. "math"
  86. "os"
  87. "path/filepath"
  88. "sync"
  89. "unsafe"
  90. "sdr-visual-suite/internal/demod"
  91. "sdr-visual-suite/internal/dsp"
  92. )
  // DemodType selects the demodulation mode handled by Demod and DemodFused.
  type DemodType int

  // Supported modes. The numeric values follow declaration order from zero.
  const (
  	DemodNFM DemodType = iota // FM discriminator, 48 kHz target rate
  	DemodWFM                  // FM discriminator, 192 kHz target rate
  	DemodAM                   // AM envelope, 48 kHz target rate
  	DemodUSB                  // SSB product with a +700 Hz BFO
  	DemodLSB                  // SSB product with a -700 Hz BFO
  	DemodCW                   // SSB product with a +700 Hz BFO
  )
  102. var loadOnce sync.Once
  103. var loadErr error
  104. func ensureDLLLoaded() error {
  105. loadOnce.Do(func() {
  106. candidates := []string{}
  107. if exe, err := os.Executable(); err == nil {
  108. dir := filepath.Dir(exe)
  109. candidates = append(candidates, filepath.Join(dir, "gpudemod_kernels.dll"))
  110. }
  111. if wd, err := os.Getwd(); err == nil {
  112. candidates = append(candidates,
  113. filepath.Join(wd, "gpudemod_kernels.dll"),
  114. filepath.Join(wd, "internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"),
  115. )
  116. }
  117. if env := os.Getenv("GPUMOD_DLL"); env != "" {
  118. candidates = append([]string{env}, candidates...)
  119. }
  120. seen := map[string]bool{}
  121. for _, p := range candidates {
  122. if p == "" || seen[p] {
  123. continue
  124. }
  125. seen[p] = true
  126. if _, err := os.Stat(p); err == nil {
  127. cp := C.CString(p)
  128. res := C.gpud_load_library(cp)
  129. C.free(unsafe.Pointer(cp))
  130. if res == 0 {
  131. loadErr = nil
  132. fmt.Fprintf(os.Stderr, "gpudemod: loaded DLL %s\n", p)
  133. return
  134. }
  135. loadErr = fmt.Errorf("failed to load gpudemod DLL: %s (code %d)", p, int(res))
  136. fmt.Fprintf(os.Stderr, "gpudemod: DLL load failed for %s (code %d)\n", p, int(res))
  137. }
  138. }
  139. if loadErr == nil {
  140. loadErr = errors.New("gpudemod_kernels.dll not found")
  141. fmt.Fprintln(os.Stderr, "gpudemod: gpudemod_kernels.dll not found in search paths")
  142. }
  143. })
  144. return loadErr
  145. }
  146. type Engine struct {
  147. maxSamples int
  148. sampleRate int
  149. phase float64
  150. bfoPhase float64
  151. firTaps []float32
  152. cudaReady bool
  153. lastShiftUsedGPU bool
  154. lastFIRUsedGPU bool
  155. lastDecimUsedGPU bool
  156. lastDemodUsedGPU bool
  157. dIQIn *C.gpud_float2
  158. dShifted *C.gpud_float2
  159. dFiltered *C.gpud_float2
  160. dDecimated *C.gpud_float2
  161. dAudio *C.float
  162. iqBytes C.size_t
  163. audioBytes C.size_t
  164. }
  165. func Available() bool {
  166. if ensureDLLLoaded() != nil {
  167. return false
  168. }
  169. var count C.int
  170. if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
  171. return false
  172. }
  173. return count > 0
  174. }
  175. func New(maxSamples int, sampleRate int) (*Engine, error) {
  176. if maxSamples <= 0 {
  177. return nil, errors.New("invalid maxSamples")
  178. }
  179. if sampleRate <= 0 {
  180. return nil, errors.New("invalid sampleRate")
  181. }
  182. if err := ensureDLLLoaded(); err != nil {
  183. return nil, err
  184. }
  185. if !Available() {
  186. return nil, errors.New("cuda device not available")
  187. }
  188. e := &Engine{
  189. maxSamples: maxSamples,
  190. sampleRate: sampleRate,
  191. cudaReady: true,
  192. iqBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
  193. audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
  194. }
  195. var ptr unsafe.Pointer
  196. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  197. e.Close()
  198. return nil, errors.New("cudaMalloc dIQIn failed")
  199. }
  200. e.dIQIn = (*C.gpud_float2)(ptr)
  201. ptr = nil
  202. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  203. e.Close()
  204. return nil, errors.New("cudaMalloc dShifted failed")
  205. }
  206. e.dShifted = (*C.gpud_float2)(ptr)
  207. ptr = nil
  208. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  209. e.Close()
  210. return nil, errors.New("cudaMalloc dFiltered failed")
  211. }
  212. e.dFiltered = (*C.gpud_float2)(ptr)
  213. ptr = nil
  214. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  215. e.Close()
  216. return nil, errors.New("cudaMalloc dDecimated failed")
  217. }
  218. e.dDecimated = (*C.gpud_float2)(ptr)
  219. ptr = nil
  220. if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
  221. e.Close()
  222. return nil, errors.New("cudaMalloc dAudio failed")
  223. }
  224. e.dAudio = (*C.float)(ptr)
  225. return e, nil
  226. }
  227. func (e *Engine) SetFIR(taps []float32) {
  228. if len(taps) == 0 {
  229. e.firTaps = nil
  230. return
  231. }
  232. if len(taps) > 256 {
  233. taps = taps[:256]
  234. }
  235. e.firTaps = append(e.firTaps[:0], taps...)
  236. if e.cudaReady {
  237. _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps)))
  238. }
  239. }
  240. func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU }
  241. func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU }
  242. func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) {
  243. if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
  244. return nil, false
  245. }
  246. bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  247. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess {
  248. return nil, false
  249. }
  250. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  251. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
  252. return nil, false
  253. }
  254. if C.gpud_device_sync() != C.cudaSuccess {
  255. return nil, false
  256. }
  257. out := make([]complex64, len(iq))
  258. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess {
  259. return nil, false
  260. }
  261. e.phase += phaseInc * float64(len(iq))
  262. return out, true
  263. }
  264. func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) {
  265. if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil {
  266. return nil, false
  267. }
  268. iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  269. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess {
  270. return nil, false
  271. }
  272. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 {
  273. return nil, false
  274. }
  275. if C.gpud_device_sync() != C.cudaSuccess {
  276. return nil, false
  277. }
  278. out := make([]complex64, len(iq))
  279. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess {
  280. return nil, false
  281. }
  282. return out, true
  283. }
  284. func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) {
  285. if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil {
  286. return nil, false
  287. }
  288. nOut := len(filtered) / factor
  289. if nOut <= 0 {
  290. return nil, false
  291. }
  292. iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0)))
  293. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess {
  294. return nil, false
  295. }
  296. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 {
  297. return nil, false
  298. }
  299. if C.gpud_device_sync() != C.cudaSuccess {
  300. return nil, false
  301. }
  302. out := make([]complex64, nOut)
  303. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  304. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  305. return nil, false
  306. }
  307. return out, true
  308. }
  309. func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) {
  310. if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil {
  311. return nil, false
  312. }
  313. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  314. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  315. return nil, false
  316. }
  317. if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  318. return nil, false
  319. }
  320. if C.gpud_device_sync() != C.cudaSuccess {
  321. return nil, false
  322. }
  323. out := make([]float32, len(shifted)-1)
  324. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  325. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  326. return nil, false
  327. }
  328. return out, true
  329. }
  330. func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) {
  331. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  332. return nil, false
  333. }
  334. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  335. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  336. return nil, false
  337. }
  338. if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  339. return nil, false
  340. }
  341. if C.gpud_device_sync() != C.cudaSuccess {
  342. return nil, false
  343. }
  344. out := make([]float32, len(shifted))
  345. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  346. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  347. return nil, false
  348. }
  349. return out, true
  350. }
  351. func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) {
  352. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  353. return nil, false
  354. }
  355. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  356. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  357. return nil, false
  358. }
  359. phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  360. if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 {
  361. return nil, false
  362. }
  363. if C.gpud_device_sync() != C.cudaSuccess {
  364. return nil, false
  365. }
  366. out := make([]float32, len(shifted))
  367. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  368. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  369. return nil, false
  370. }
  371. e.bfoPhase += phaseInc * float64(len(shifted))
  372. return out, true
  373. }
  374. func (e *Engine) ShiftFilterDecimate(iq []complex64, offsetHz float64, bw float64, outRate int) ([]complex64, int, error) {
  375. if e == nil {
  376. return nil, 0, errors.New("nil CUDA demod engine")
  377. }
  378. if !e.cudaReady {
  379. return nil, 0, errors.New("cuda demod engine is not initialized")
  380. }
  381. if len(iq) == 0 {
  382. return nil, 0, nil
  383. }
  384. if len(iq) > e.maxSamples {
  385. return nil, 0, errors.New("sample count exceeds engine capacity")
  386. }
  387. if outRate <= 0 {
  388. return nil, 0, errors.New("invalid output sample rate")
  389. }
  390. e.lastShiftUsedGPU = false
  391. e.lastFIRUsedGPU = false
  392. e.lastDecimUsedGPU = false
  393. e.lastDemodUsedGPU = false
  394. cutoff := bw / 2
  395. if cutoff < 200 {
  396. cutoff = 200
  397. }
  398. taps := e.firTaps
  399. if len(taps) == 0 {
  400. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  401. taps = make([]float32, len(base64))
  402. for i, v := range base64 {
  403. taps[i] = float32(v)
  404. }
  405. e.SetFIR(taps)
  406. }
  407. if len(taps) == 0 {
  408. return nil, 0, errors.New("no FIR taps configured")
  409. }
  410. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  411. if decim < 1 {
  412. decim = 1
  413. }
  414. n := len(iq)
  415. nOut := n / decim
  416. if nOut <= 0 {
  417. return nil, 0, errors.New("not enough output samples after decimation")
  418. }
  419. bytesIn := C.size_t(n) * C.size_t(unsafe.Sizeof(complex64(0)))
  420. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != C.cudaSuccess {
  421. return nil, 0, errors.New("cudaMemcpy H2D failed")
  422. }
  423. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  424. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(n), C.double(phaseInc), C.double(e.phase)) != 0 {
  425. return nil, 0, errors.New("gpu freq shift failed")
  426. }
  427. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(n), C.int(len(taps))) != 0 {
  428. return nil, 0, errors.New("gpu FIR failed")
  429. }
  430. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(decim)) != 0 {
  431. return nil, 0, errors.New("gpu decimate failed")
  432. }
  433. if C.gpud_device_sync() != C.cudaSuccess {
  434. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  435. }
  436. out := make([]complex64, nOut)
  437. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  438. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  439. return nil, 0, errors.New("cudaMemcpy D2H failed")
  440. }
  441. e.phase += phaseInc * float64(n)
  442. e.lastShiftUsedGPU = true
  443. e.lastFIRUsedGPU = true
  444. e.lastDecimUsedGPU = true
  445. return out, e.sampleRate / decim, nil
  446. }
  447. func (e *Engine) DemodFused(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  448. if e == nil {
  449. return nil, 0, errors.New("nil CUDA demod engine")
  450. }
  451. if !e.cudaReady {
  452. return nil, 0, errors.New("cuda demod engine is not initialized")
  453. }
  454. if len(iq) == 0 {
  455. return nil, 0, nil
  456. }
  457. e.lastShiftUsedGPU = false
  458. e.lastFIRUsedGPU = false
  459. e.lastDecimUsedGPU = false
  460. e.lastDemodUsedGPU = false
  461. if len(iq) > e.maxSamples {
  462. return nil, 0, errors.New("sample count exceeds engine capacity")
  463. }
  464. var outRate int
  465. switch mode {
  466. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  467. outRate = 48000
  468. case DemodWFM:
  469. outRate = 192000
  470. default:
  471. return nil, 0, errors.New("unsupported demod type")
  472. }
  473. cutoff := bw / 2
  474. if cutoff < 200 {
  475. cutoff = 200
  476. }
  477. taps := e.firTaps
  478. if len(taps) == 0 {
  479. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  480. taps = make([]float32, len(base64))
  481. for i, v := range base64 {
  482. taps[i] = float32(v)
  483. }
  484. e.SetFIR(taps)
  485. }
  486. if len(taps) == 0 {
  487. return nil, 0, errors.New("no FIR taps configured")
  488. }
  489. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  490. if decim < 1 {
  491. decim = 1
  492. }
  493. n := len(iq)
  494. nOut := n / decim
  495. if nOut <= 1 {
  496. return nil, 0, errors.New("not enough output samples after decimation")
  497. }
  498. bytesIn := C.size_t(n) * C.size_t(unsafe.Sizeof(complex64(0)))
  499. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != C.cudaSuccess {
  500. return nil, 0, errors.New("cudaMemcpy H2D failed")
  501. }
  502. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  503. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(n), C.double(phaseInc), C.double(e.phase)) != 0 {
  504. return nil, 0, errors.New("gpu freq shift failed")
  505. }
  506. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(n), C.int(len(taps))) != 0 {
  507. return nil, 0, errors.New("gpu FIR failed")
  508. }
  509. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(decim)) != 0 {
  510. return nil, 0, errors.New("gpu decimate failed")
  511. }
  512. e.lastShiftUsedGPU = true
  513. e.lastFIRUsedGPU = true
  514. e.lastDecimUsedGPU = true
  515. e.lastDemodUsedGPU = false
  516. switch mode {
  517. case DemodNFM, DemodWFM:
  518. if C.gpud_launch_fm_discrim(e.dDecimated, e.dAudio, C.int(nOut)) != 0 {
  519. return nil, 0, errors.New("gpu FM discrim failed")
  520. }
  521. out := make([]float32, nOut-1)
  522. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  523. if C.gpud_device_sync() != C.cudaSuccess {
  524. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  525. }
  526. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  527. return nil, 0, errors.New("cudaMemcpy D2H failed")
  528. }
  529. e.phase += phaseInc * float64(n)
  530. e.lastDemodUsedGPU = true
  531. return out, e.sampleRate / decim, nil
  532. case DemodAM:
  533. if C.gpud_launch_am_envelope(e.dDecimated, e.dAudio, C.int(nOut)) != 0 {
  534. return nil, 0, errors.New("gpu AM envelope failed")
  535. }
  536. out := make([]float32, nOut)
  537. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  538. if C.gpud_device_sync() != C.cudaSuccess {
  539. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  540. }
  541. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  542. return nil, 0, errors.New("cudaMemcpy D2H failed")
  543. }
  544. e.phase += phaseInc * float64(n)
  545. e.lastDemodUsedGPU = true
  546. return out, e.sampleRate / decim, nil
  547. case DemodUSB, DemodLSB, DemodCW:
  548. bfoHz := 700.0
  549. if mode == DemodLSB {
  550. bfoHz = -700.0
  551. }
  552. phaseBFO := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  553. if C.gpud_launch_ssb_product(e.dDecimated, e.dAudio, C.int(nOut), C.double(phaseBFO), C.double(e.bfoPhase)) != 0 {
  554. return nil, 0, errors.New("gpu SSB product failed")
  555. }
  556. out := make([]float32, nOut)
  557. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  558. if C.gpud_device_sync() != C.cudaSuccess {
  559. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  560. }
  561. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  562. return nil, 0, errors.New("cudaMemcpy D2H failed")
  563. }
  564. e.phase += phaseInc * float64(n)
  565. e.bfoPhase += phaseBFO * float64(nOut)
  566. e.lastDemodUsedGPU = true
  567. return out, e.sampleRate / decim, nil
  568. default:
  569. return nil, 0, errors.New("unsupported demod type")
  570. }
  571. }
  572. func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  573. if e == nil {
  574. return nil, 0, errors.New("nil CUDA demod engine")
  575. }
  576. if !e.cudaReady {
  577. return nil, 0, errors.New("cuda demod engine is not initialized")
  578. }
  579. if len(iq) == 0 {
  580. return nil, 0, nil
  581. }
  582. if len(iq) > e.maxSamples {
  583. return nil, 0, errors.New("sample count exceeds engine capacity")
  584. }
  585. shifted, ok := e.tryCUDAFreqShift(iq, offsetHz)
  586. e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3)
  587. if !e.lastShiftUsedGPU {
  588. shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz)
  589. }
  590. var outRate int
  591. switch mode {
  592. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  593. outRate = 48000
  594. case DemodWFM:
  595. outRate = 192000
  596. default:
  597. return nil, 0, errors.New("unsupported demod type")
  598. }
  599. cutoff := bw / 2
  600. if cutoff < 200 {
  601. cutoff = 200
  602. }
  603. taps := e.firTaps
  604. if len(taps) == 0 {
  605. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  606. taps = make([]float32, len(base64))
  607. for i, v := range base64 {
  608. taps[i] = float32(v)
  609. }
  610. e.SetFIR(taps)
  611. }
  612. filtered, ok := e.tryCUDAFIR(shifted, len(taps))
  613. if ok {
  614. if validationEnabled() {
  615. e.lastFIRUsedGPU = ValidateFIR(shifted, taps, filtered, 1e-3)
  616. if !e.lastFIRUsedGPU {
  617. ftaps := make([]float64, len(taps))
  618. for i, v := range taps {
  619. ftaps[i] = float64(v)
  620. }
  621. filtered = dsp.ApplyFIR(shifted, ftaps)
  622. }
  623. } else {
  624. e.lastFIRUsedGPU = true
  625. }
  626. }
  627. if filtered == nil {
  628. ftaps := make([]float64, len(taps))
  629. for i, v := range taps {
  630. ftaps[i] = float64(v)
  631. }
  632. filtered = dsp.ApplyFIR(shifted, ftaps)
  633. }
  634. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  635. if decim < 1 {
  636. decim = 1
  637. }
  638. dec, ok := e.tryCUDADecimate(filtered, decim)
  639. if ok {
  640. if validationEnabled() {
  641. e.lastDecimUsedGPU = ValidateDecimate(filtered, decim, dec, 1e-3)
  642. if !e.lastDecimUsedGPU {
  643. dec = dsp.Decimate(filtered, decim)
  644. }
  645. } else {
  646. e.lastDecimUsedGPU = true
  647. }
  648. }
  649. if dec == nil {
  650. dec = dsp.Decimate(filtered, decim)
  651. }
  652. inputRate := e.sampleRate / decim
  653. e.lastDemodUsedGPU = false
  654. switch mode {
  655. case DemodNFM:
  656. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  657. e.lastDemodUsedGPU = true
  658. return gpuAudio, inputRate, nil
  659. }
  660. return demod.NFM{}.Demod(dec, inputRate), inputRate, nil
  661. case DemodWFM:
  662. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  663. e.lastDemodUsedGPU = true
  664. return gpuAudio, inputRate, nil
  665. }
  666. return demod.WFM{}.Demod(dec, inputRate), inputRate, nil
  667. case DemodAM:
  668. if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok {
  669. e.lastDemodUsedGPU = true
  670. return gpuAudio, inputRate, nil
  671. }
  672. return demod.AM{}.Demod(dec, inputRate), inputRate, nil
  673. case DemodUSB:
  674. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  675. e.lastDemodUsedGPU = true
  676. return gpuAudio, inputRate, nil
  677. }
  678. return demod.USB{}.Demod(dec, inputRate), inputRate, nil
  679. case DemodLSB:
  680. if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok {
  681. e.lastDemodUsedGPU = true
  682. return gpuAudio, inputRate, nil
  683. }
  684. return demod.LSB{}.Demod(dec, inputRate), inputRate, nil
  685. case DemodCW:
  686. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  687. e.lastDemodUsedGPU = true
  688. return gpuAudio, inputRate, nil
  689. }
  690. return demod.CW{}.Demod(dec, inputRate), inputRate, nil
  691. default:
  692. return nil, 0, errors.New("unsupported demod type")
  693. }
  694. }
  695. func (e *Engine) Close() {
  696. if e == nil {
  697. return
  698. }
  699. if e.dIQIn != nil {
  700. _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
  701. e.dIQIn = nil
  702. }
  703. if e.dShifted != nil {
  704. _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
  705. e.dShifted = nil
  706. }
  707. if e.dFiltered != nil {
  708. _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered))
  709. e.dFiltered = nil
  710. }
  711. if e.dDecimated != nil {
  712. _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
  713. e.dDecimated = nil
  714. }
  715. if e.dAudio != nil {
  716. _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
  717. e.dAudio = nil
  718. }
  719. e.firTaps = nil
  720. e.cudaReady = false
  721. }