You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

763 line
24KB

  1. //go:build cufft && windows
  2. package gpudemod
  3. /*
  4. #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include"
  5. #cgo windows LDFLAGS: -lcudart64_13 -lkernel32
  6. #include <windows.h>
  7. #include <stdlib.h>
  8. #include <cuda_runtime.h>
  9. typedef struct { float x; float y; } gpud_float2;
  10. typedef int (__stdcall *gpud_upload_fir_taps_fn)(const float* taps, int n);
  11. typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
  12. typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n);
  13. typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
  14. typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
  15. typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n);
  16. typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
  17. static HMODULE gpud_mod = NULL;
  18. static gpud_upload_fir_taps_fn gpud_p_upload_fir_taps = NULL;
  19. static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL;
  20. static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL;
  21. static gpud_launch_fir_fn gpud_p_launch_fir = NULL;
  22. static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL;
  23. static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL;
  24. static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL;
  25. static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); }
  26. static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); }
  27. static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); }
  28. static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); }
  29. static int gpud_device_sync() { return (int)cudaDeviceSynchronize(); }
  30. static int gpud_load_library(const char* path) {
  31. if (gpud_mod != NULL) return 0;
  32. gpud_mod = LoadLibraryA(path);
  33. if (gpud_mod == NULL) return -1;
  34. gpud_p_upload_fir_taps = (gpud_upload_fir_taps_fn)GetProcAddress(gpud_mod, "gpud_upload_fir_taps_cuda");
  35. gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda");
  36. gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda");
  37. gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda");
  38. gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda");
  39. gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda");
  40. gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda");
  41. if (!gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) {
  42. FreeLibrary(gpud_mod);
  43. gpud_mod = NULL;
  44. return -2;
  45. }
  46. return 0;
  47. }
  48. static int gpud_upload_fir_taps(const float* taps, int n) {
  49. if (!gpud_p_upload_fir_taps) return -1;
  50. return gpud_p_upload_fir_taps(taps, n);
  51. }
  52. static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
  53. if (!gpud_p_launch_freq_shift) return -1;
  54. return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start);
  55. }
  56. static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
  57. if (!gpud_p_launch_fm_discrim) return -1;
  58. return gpud_p_launch_fm_discrim(in, out, n);
  59. }
  60. static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) {
  61. if (!gpud_p_launch_fir) return -1;
  62. return gpud_p_launch_fir(in, out, n, num_taps);
  63. }
  64. static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) {
  65. if (!gpud_p_launch_decimate) return -1;
  66. return gpud_p_launch_decimate(in, out, n_out, factor);
  67. }
  68. static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) {
  69. if (!gpud_p_launch_am_envelope) return -1;
  70. return gpud_p_launch_am_envelope(in, out, n);
  71. }
  72. static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) {
  73. if (!gpud_p_launch_ssb_product) return -1;
  74. return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start);
  75. }
  76. */
  77. import "C"
  78. import (
  79. "errors"
  80. "fmt"
  81. "math"
  82. "os"
  83. "path/filepath"
  84. "sync"
  85. "unsafe"
  86. "sdr-visual-suite/internal/demod"
  87. "sdr-visual-suite/internal/dsp"
  88. )
  89. type DemodType int
  90. const (
  91. DemodNFM DemodType = iota
  92. DemodWFM
  93. DemodAM
  94. DemodUSB
  95. DemodLSB
  96. DemodCW
  97. )
  98. var loadOnce sync.Once
  99. var loadErr error
  100. func ensureDLLLoaded() error {
  101. loadOnce.Do(func() {
  102. candidates := []string{}
  103. if exe, err := os.Executable(); err == nil {
  104. dir := filepath.Dir(exe)
  105. candidates = append(candidates, filepath.Join(dir, "gpudemod_kernels.dll"))
  106. }
  107. if wd, err := os.Getwd(); err == nil {
  108. candidates = append(candidates,
  109. filepath.Join(wd, "gpudemod_kernels.dll"),
  110. filepath.Join(wd, "internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"),
  111. )
  112. }
  113. if env := os.Getenv("GPUMOD_DLL"); env != "" {
  114. candidates = append([]string{env}, candidates...)
  115. }
  116. seen := map[string]bool{}
  117. for _, p := range candidates {
  118. if p == "" || seen[p] {
  119. continue
  120. }
  121. seen[p] = true
  122. if _, err := os.Stat(p); err == nil {
  123. cp := C.CString(p)
  124. res := C.gpud_load_library(cp)
  125. C.free(unsafe.Pointer(cp))
  126. if res == 0 {
  127. loadErr = nil
  128. fmt.Fprintf(os.Stderr, "gpudemod: loaded DLL %s\n", p)
  129. return
  130. }
  131. loadErr = fmt.Errorf("failed to load gpudemod DLL: %s (code %d)", p, int(res))
  132. fmt.Fprintf(os.Stderr, "gpudemod: DLL load failed for %s (code %d)\n", p, int(res))
  133. }
  134. }
  135. if loadErr == nil {
  136. loadErr = errors.New("gpudemod_kernels.dll not found")
  137. fmt.Fprintln(os.Stderr, "gpudemod: gpudemod_kernels.dll not found in search paths")
  138. }
  139. })
  140. return loadErr
  141. }
  142. type Engine struct {
  143. maxSamples int
  144. sampleRate int
  145. phase float64
  146. bfoPhase float64
  147. firTaps []float32
  148. cudaReady bool
  149. lastShiftUsedGPU bool
  150. lastFIRUsedGPU bool
  151. lastDecimUsedGPU bool
  152. lastDemodUsedGPU bool
  153. dIQIn *C.gpud_float2
  154. dShifted *C.gpud_float2
  155. dFiltered *C.gpud_float2
  156. dDecimated *C.gpud_float2
  157. dAudio *C.float
  158. iqBytes C.size_t
  159. audioBytes C.size_t
  160. }
  161. func Available() bool {
  162. if ensureDLLLoaded() != nil {
  163. return false
  164. }
  165. var count C.int
  166. if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
  167. return false
  168. }
  169. return count > 0
  170. }
  171. func New(maxSamples int, sampleRate int) (*Engine, error) {
  172. if maxSamples <= 0 {
  173. return nil, errors.New("invalid maxSamples")
  174. }
  175. if sampleRate <= 0 {
  176. return nil, errors.New("invalid sampleRate")
  177. }
  178. if err := ensureDLLLoaded(); err != nil {
  179. return nil, err
  180. }
  181. if !Available() {
  182. return nil, errors.New("cuda device not available")
  183. }
  184. e := &Engine{
  185. maxSamples: maxSamples,
  186. sampleRate: sampleRate,
  187. cudaReady: true,
  188. iqBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
  189. audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
  190. }
  191. var ptr unsafe.Pointer
  192. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  193. e.Close()
  194. return nil, errors.New("cudaMalloc dIQIn failed")
  195. }
  196. e.dIQIn = (*C.gpud_float2)(ptr)
  197. ptr = nil
  198. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  199. e.Close()
  200. return nil, errors.New("cudaMalloc dShifted failed")
  201. }
  202. e.dShifted = (*C.gpud_float2)(ptr)
  203. ptr = nil
  204. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  205. e.Close()
  206. return nil, errors.New("cudaMalloc dFiltered failed")
  207. }
  208. e.dFiltered = (*C.gpud_float2)(ptr)
  209. ptr = nil
  210. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  211. e.Close()
  212. return nil, errors.New("cudaMalloc dDecimated failed")
  213. }
  214. e.dDecimated = (*C.gpud_float2)(ptr)
  215. ptr = nil
  216. if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
  217. e.Close()
  218. return nil, errors.New("cudaMalloc dAudio failed")
  219. }
  220. e.dAudio = (*C.float)(ptr)
  221. return e, nil
  222. }
  223. func (e *Engine) SetFIR(taps []float32) {
  224. if len(taps) == 0 {
  225. e.firTaps = nil
  226. return
  227. }
  228. if len(taps) > 256 {
  229. taps = taps[:256]
  230. }
  231. e.firTaps = append(e.firTaps[:0], taps...)
  232. if e.cudaReady {
  233. _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps)))
  234. }
  235. }
  236. func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU }
  237. func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU }
  238. func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) {
  239. if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
  240. return nil, false
  241. }
  242. bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  243. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess {
  244. return nil, false
  245. }
  246. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  247. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
  248. return nil, false
  249. }
  250. if C.gpud_device_sync() != C.cudaSuccess {
  251. return nil, false
  252. }
  253. out := make([]complex64, len(iq))
  254. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess {
  255. return nil, false
  256. }
  257. e.phase += phaseInc * float64(len(iq))
  258. return out, true
  259. }
  260. func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) {
  261. if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil {
  262. return nil, false
  263. }
  264. iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  265. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess {
  266. return nil, false
  267. }
  268. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 {
  269. return nil, false
  270. }
  271. if C.gpud_device_sync() != C.cudaSuccess {
  272. return nil, false
  273. }
  274. out := make([]complex64, len(iq))
  275. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess {
  276. return nil, false
  277. }
  278. return out, true
  279. }
  280. func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) {
  281. if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil {
  282. return nil, false
  283. }
  284. nOut := len(filtered) / factor
  285. if nOut <= 0 {
  286. return nil, false
  287. }
  288. iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0)))
  289. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess {
  290. return nil, false
  291. }
  292. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 {
  293. return nil, false
  294. }
  295. if C.gpud_device_sync() != C.cudaSuccess {
  296. return nil, false
  297. }
  298. out := make([]complex64, nOut)
  299. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  300. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  301. return nil, false
  302. }
  303. return out, true
  304. }
  305. func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) {
  306. if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil {
  307. return nil, false
  308. }
  309. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  310. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  311. return nil, false
  312. }
  313. if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  314. return nil, false
  315. }
  316. if C.gpud_device_sync() != C.cudaSuccess {
  317. return nil, false
  318. }
  319. out := make([]float32, len(shifted)-1)
  320. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  321. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  322. return nil, false
  323. }
  324. return out, true
  325. }
  326. func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) {
  327. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  328. return nil, false
  329. }
  330. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  331. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  332. return nil, false
  333. }
  334. if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  335. return nil, false
  336. }
  337. if C.gpud_device_sync() != C.cudaSuccess {
  338. return nil, false
  339. }
  340. out := make([]float32, len(shifted))
  341. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  342. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  343. return nil, false
  344. }
  345. return out, true
  346. }
  347. func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) {
  348. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  349. return nil, false
  350. }
  351. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  352. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  353. return nil, false
  354. }
  355. phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  356. if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 {
  357. return nil, false
  358. }
  359. if C.gpud_device_sync() != C.cudaSuccess {
  360. return nil, false
  361. }
  362. out := make([]float32, len(shifted))
  363. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  364. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  365. return nil, false
  366. }
  367. e.bfoPhase += phaseInc * float64(len(shifted))
  368. return out, true
  369. }
  370. func (e *Engine) ShiftFilterDecimate(iq []complex64, offsetHz float64, bw float64, outRate int) ([]complex64, int, error) {
  371. if e == nil {
  372. return nil, 0, errors.New("nil CUDA demod engine")
  373. }
  374. if !e.cudaReady {
  375. return nil, 0, errors.New("cuda demod engine is not initialized")
  376. }
  377. if len(iq) == 0 {
  378. return nil, 0, nil
  379. }
  380. if len(iq) > e.maxSamples {
  381. return nil, 0, errors.New("sample count exceeds engine capacity")
  382. }
  383. if outRate <= 0 {
  384. return nil, 0, errors.New("invalid output sample rate")
  385. }
  386. e.lastShiftUsedGPU = false
  387. e.lastFIRUsedGPU = false
  388. e.lastDecimUsedGPU = false
  389. e.lastDemodUsedGPU = false
  390. cutoff := bw / 2
  391. if cutoff < 200 {
  392. cutoff = 200
  393. }
  394. taps := e.firTaps
  395. if len(taps) == 0 {
  396. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  397. taps = make([]float32, len(base64))
  398. for i, v := range base64 {
  399. taps[i] = float32(v)
  400. }
  401. e.SetFIR(taps)
  402. }
  403. if len(taps) == 0 {
  404. return nil, 0, errors.New("no FIR taps configured")
  405. }
  406. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  407. if decim < 1 {
  408. decim = 1
  409. }
  410. n := len(iq)
  411. nOut := n / decim
  412. if nOut <= 0 {
  413. return nil, 0, errors.New("not enough output samples after decimation")
  414. }
  415. bytesIn := C.size_t(n) * C.size_t(unsafe.Sizeof(complex64(0)))
  416. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != C.cudaSuccess {
  417. return nil, 0, errors.New("cudaMemcpy H2D failed")
  418. }
  419. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  420. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(n), C.double(phaseInc), C.double(e.phase)) != 0 {
  421. return nil, 0, errors.New("gpu freq shift failed")
  422. }
  423. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(n), C.int(len(taps))) != 0 {
  424. return nil, 0, errors.New("gpu FIR failed")
  425. }
  426. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(decim)) != 0 {
  427. return nil, 0, errors.New("gpu decimate failed")
  428. }
  429. if C.gpud_device_sync() != C.cudaSuccess {
  430. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  431. }
  432. out := make([]complex64, nOut)
  433. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  434. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  435. return nil, 0, errors.New("cudaMemcpy D2H failed")
  436. }
  437. e.phase += phaseInc * float64(n)
  438. e.lastShiftUsedGPU = true
  439. e.lastFIRUsedGPU = true
  440. e.lastDecimUsedGPU = true
  441. return out, e.sampleRate / decim, nil
  442. }
  443. func (e *Engine) DemodFused(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  444. if e == nil {
  445. return nil, 0, errors.New("nil CUDA demod engine")
  446. }
  447. if !e.cudaReady {
  448. return nil, 0, errors.New("cuda demod engine is not initialized")
  449. }
  450. if len(iq) == 0 {
  451. return nil, 0, nil
  452. }
  453. e.lastShiftUsedGPU = false
  454. e.lastFIRUsedGPU = false
  455. e.lastDecimUsedGPU = false
  456. e.lastDemodUsedGPU = false
  457. if len(iq) > e.maxSamples {
  458. return nil, 0, errors.New("sample count exceeds engine capacity")
  459. }
  460. var outRate int
  461. switch mode {
  462. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  463. outRate = 48000
  464. case DemodWFM:
  465. outRate = 192000
  466. default:
  467. return nil, 0, errors.New("unsupported demod type")
  468. }
  469. cutoff := bw / 2
  470. if cutoff < 200 {
  471. cutoff = 200
  472. }
  473. taps := e.firTaps
  474. if len(taps) == 0 {
  475. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  476. taps = make([]float32, len(base64))
  477. for i, v := range base64 {
  478. taps[i] = float32(v)
  479. }
  480. e.SetFIR(taps)
  481. }
  482. if len(taps) == 0 {
  483. return nil, 0, errors.New("no FIR taps configured")
  484. }
  485. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  486. if decim < 1 {
  487. decim = 1
  488. }
  489. n := len(iq)
  490. nOut := n / decim
  491. if nOut <= 1 {
  492. return nil, 0, errors.New("not enough output samples after decimation")
  493. }
  494. bytesIn := C.size_t(n) * C.size_t(unsafe.Sizeof(complex64(0)))
  495. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != C.cudaSuccess {
  496. return nil, 0, errors.New("cudaMemcpy H2D failed")
  497. }
  498. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  499. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(n), C.double(phaseInc), C.double(e.phase)) != 0 {
  500. return nil, 0, errors.New("gpu freq shift failed")
  501. }
  502. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(n), C.int(len(taps))) != 0 {
  503. return nil, 0, errors.New("gpu FIR failed")
  504. }
  505. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(decim)) != 0 {
  506. return nil, 0, errors.New("gpu decimate failed")
  507. }
  508. e.lastShiftUsedGPU = true
  509. e.lastFIRUsedGPU = true
  510. e.lastDecimUsedGPU = true
  511. e.lastDemodUsedGPU = false
  512. switch mode {
  513. case DemodNFM, DemodWFM:
  514. if C.gpud_launch_fm_discrim(e.dDecimated, e.dAudio, C.int(nOut)) != 0 {
  515. return nil, 0, errors.New("gpu FM discrim failed")
  516. }
  517. out := make([]float32, nOut-1)
  518. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  519. if C.gpud_device_sync() != C.cudaSuccess {
  520. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  521. }
  522. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  523. return nil, 0, errors.New("cudaMemcpy D2H failed")
  524. }
  525. e.phase += phaseInc * float64(n)
  526. e.lastDemodUsedGPU = true
  527. return out, e.sampleRate / decim, nil
  528. case DemodAM:
  529. if C.gpud_launch_am_envelope(e.dDecimated, e.dAudio, C.int(nOut)) != 0 {
  530. return nil, 0, errors.New("gpu AM envelope failed")
  531. }
  532. out := make([]float32, nOut)
  533. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  534. if C.gpud_device_sync() != C.cudaSuccess {
  535. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  536. }
  537. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  538. return nil, 0, errors.New("cudaMemcpy D2H failed")
  539. }
  540. e.phase += phaseInc * float64(n)
  541. e.lastDemodUsedGPU = true
  542. return out, e.sampleRate / decim, nil
  543. case DemodUSB, DemodLSB, DemodCW:
  544. bfoHz := 700.0
  545. if mode == DemodLSB {
  546. bfoHz = -700.0
  547. }
  548. phaseBFO := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  549. if C.gpud_launch_ssb_product(e.dDecimated, e.dAudio, C.int(nOut), C.double(phaseBFO), C.double(e.bfoPhase)) != 0 {
  550. return nil, 0, errors.New("gpu SSB product failed")
  551. }
  552. out := make([]float32, nOut)
  553. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  554. if C.gpud_device_sync() != C.cudaSuccess {
  555. return nil, 0, errors.New("cudaDeviceSynchronize failed")
  556. }
  557. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  558. return nil, 0, errors.New("cudaMemcpy D2H failed")
  559. }
  560. e.phase += phaseInc * float64(n)
  561. e.bfoPhase += phaseBFO * float64(nOut)
  562. e.lastDemodUsedGPU = true
  563. return out, e.sampleRate / decim, nil
  564. default:
  565. return nil, 0, errors.New("unsupported demod type")
  566. }
  567. }
  568. func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  569. if e == nil {
  570. return nil, 0, errors.New("nil CUDA demod engine")
  571. }
  572. if !e.cudaReady {
  573. return nil, 0, errors.New("cuda demod engine is not initialized")
  574. }
  575. if len(iq) == 0 {
  576. return nil, 0, nil
  577. }
  578. if len(iq) > e.maxSamples {
  579. return nil, 0, errors.New("sample count exceeds engine capacity")
  580. }
  581. shifted, ok := e.tryCUDAFreqShift(iq, offsetHz)
  582. e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3)
  583. if !e.lastShiftUsedGPU {
  584. shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz)
  585. }
  586. var outRate int
  587. switch mode {
  588. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  589. outRate = 48000
  590. case DemodWFM:
  591. outRate = 192000
  592. default:
  593. return nil, 0, errors.New("unsupported demod type")
  594. }
  595. cutoff := bw / 2
  596. if cutoff < 200 {
  597. cutoff = 200
  598. }
  599. taps := e.firTaps
  600. if len(taps) == 0 {
  601. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  602. taps = make([]float32, len(base64))
  603. for i, v := range base64 {
  604. taps[i] = float32(v)
  605. }
  606. e.SetFIR(taps)
  607. }
  608. filtered, ok := e.tryCUDAFIR(shifted, len(taps))
  609. if ok {
  610. if validationEnabled() {
  611. e.lastFIRUsedGPU = ValidateFIR(shifted, taps, filtered, 1e-3)
  612. if !e.lastFIRUsedGPU {
  613. ftaps := make([]float64, len(taps))
  614. for i, v := range taps {
  615. ftaps[i] = float64(v)
  616. }
  617. filtered = dsp.ApplyFIR(shifted, ftaps)
  618. }
  619. } else {
  620. e.lastFIRUsedGPU = true
  621. }
  622. }
  623. if filtered == nil {
  624. ftaps := make([]float64, len(taps))
  625. for i, v := range taps {
  626. ftaps[i] = float64(v)
  627. }
  628. filtered = dsp.ApplyFIR(shifted, ftaps)
  629. }
  630. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  631. if decim < 1 {
  632. decim = 1
  633. }
  634. dec, ok := e.tryCUDADecimate(filtered, decim)
  635. if ok {
  636. if validationEnabled() {
  637. e.lastDecimUsedGPU = ValidateDecimate(filtered, decim, dec, 1e-3)
  638. if !e.lastDecimUsedGPU {
  639. dec = dsp.Decimate(filtered, decim)
  640. }
  641. } else {
  642. e.lastDecimUsedGPU = true
  643. }
  644. }
  645. if dec == nil {
  646. dec = dsp.Decimate(filtered, decim)
  647. }
  648. inputRate := e.sampleRate / decim
  649. e.lastDemodUsedGPU = false
  650. switch mode {
  651. case DemodNFM:
  652. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  653. e.lastDemodUsedGPU = true
  654. return gpuAudio, inputRate, nil
  655. }
  656. return demod.NFM{}.Demod(dec, inputRate), inputRate, nil
  657. case DemodWFM:
  658. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  659. e.lastDemodUsedGPU = true
  660. return gpuAudio, inputRate, nil
  661. }
  662. return demod.WFM{}.Demod(dec, inputRate), inputRate, nil
  663. case DemodAM:
  664. if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok {
  665. e.lastDemodUsedGPU = true
  666. return gpuAudio, inputRate, nil
  667. }
  668. return demod.AM{}.Demod(dec, inputRate), inputRate, nil
  669. case DemodUSB:
  670. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  671. e.lastDemodUsedGPU = true
  672. return gpuAudio, inputRate, nil
  673. }
  674. return demod.USB{}.Demod(dec, inputRate), inputRate, nil
  675. case DemodLSB:
  676. if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok {
  677. e.lastDemodUsedGPU = true
  678. return gpuAudio, inputRate, nil
  679. }
  680. return demod.LSB{}.Demod(dec, inputRate), inputRate, nil
  681. case DemodCW:
  682. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  683. e.lastDemodUsedGPU = true
  684. return gpuAudio, inputRate, nil
  685. }
  686. return demod.CW{}.Demod(dec, inputRate), inputRate, nil
  687. default:
  688. return nil, 0, errors.New("unsupported demod type")
  689. }
  690. }
  691. func (e *Engine) Close() {
  692. if e == nil {
  693. return
  694. }
  695. if e.dIQIn != nil {
  696. _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
  697. e.dIQIn = nil
  698. }
  699. if e.dShifted != nil {
  700. _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
  701. e.dShifted = nil
  702. }
  703. if e.dFiltered != nil {
  704. _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered))
  705. e.dFiltered = nil
  706. }
  707. if e.dDecimated != nil {
  708. _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
  709. e.dDecimated = nil
  710. }
  711. if e.dAudio != nil {
  712. _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
  713. e.dAudio = nil
  714. }
  715. e.firTaps = nil
  716. e.cudaReady = false
  717. }