You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

465 line
14KB

  1. //go:build cufft
  2. package gpudemod
  3. /*
  4. #cgo windows LDFLAGS: -L${SRCDIR}/../../../cuda-mingw -L${SRCDIR}/build -lgpudemod_kernels -lcufft64_12 -lcudart64_13
  5. #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include"
  6. #include <cuda_runtime.h>
  7. #include <cufft.h>
  8. typedef struct { float x; float y; } gpud_float2;
  9. static int gpud_cuda_malloc(void **ptr, size_t bytes) {
  10. return (int)cudaMalloc(ptr, bytes);
  11. }
  12. static int gpud_cuda_free(void *ptr) {
  13. return (int)cudaFree(ptr);
  14. }
  15. static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) {
  16. return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
  17. }
  18. static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) {
  19. return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
  20. }
  21. static int gpud_device_sync() {
  22. return (int)cudaDeviceSynchronize();
  23. }
  24. extern int gpud_launch_freq_shift_cuda(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
  25. extern int gpud_launch_fm_discrim_cuda(const gpud_float2* in, float* out, int n);
  26. extern int gpud_upload_fir_taps_cuda(const float* taps, int n);
  27. extern int gpud_launch_fir_cuda(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
  28. extern int gpud_launch_decimate_cuda(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
  29. extern int gpud_launch_am_envelope_cuda(const gpud_float2* in, float* out, int n);
  30. extern int gpud_launch_ssb_product_cuda(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
  31. static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
  32. return gpud_launch_freq_shift_cuda(in, out, n, phase_inc, phase_start);
  33. }
  34. static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
  35. return gpud_launch_fm_discrim_cuda(in, out, n);
  36. }
  37. static int gpud_upload_fir_taps(const float* taps, int n) {
  38. return gpud_upload_fir_taps_cuda(taps, n);
  39. }
  40. static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) {
  41. return gpud_launch_fir_cuda(in, out, n, num_taps);
  42. }
  43. static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) {
  44. return gpud_launch_decimate_cuda(in, out, n_out, factor);
  45. }
  46. static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) {
  47. return gpud_launch_am_envelope_cuda(in, out, n);
  48. }
  49. static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) {
  50. return gpud_launch_ssb_product_cuda(in, out, n, phase_inc, phase_start);
  51. }
  52. */
  53. import "C"
  54. import (
  55. "errors"
  56. "fmt"
  57. "math"
  58. "unsafe"
  59. "sdr-visual-suite/internal/demod"
  60. "sdr-visual-suite/internal/dsp"
  61. )
  62. type DemodType int
  63. const (
  64. DemodNFM DemodType = iota
  65. DemodWFM
  66. DemodAM
  67. DemodUSB
  68. DemodLSB
  69. DemodCW
  70. )
  71. type Engine struct {
  72. maxSamples int
  73. sampleRate int
  74. phase float64
  75. bfoPhase float64
  76. firTaps []float32
  77. cudaReady bool
  78. lastShiftUsedGPU bool
  79. lastFIRUsedGPU bool
  80. lastDecimUsedGPU bool
  81. lastDemodUsedGPU bool
  82. dIQIn *C.gpud_float2
  83. dShifted *C.gpud_float2
  84. dFiltered *C.gpud_float2
  85. dDecimated *C.gpud_float2
  86. dAudio *C.float
  87. iqBytes C.size_t
  88. audioBytes C.size_t
  89. }
  90. func Available() bool {
  91. var count C.int
  92. if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
  93. return false
  94. }
  95. return count > 0
  96. }
  97. func New(maxSamples int, sampleRate int) (*Engine, error) {
  98. if maxSamples <= 0 {
  99. return nil, errors.New("invalid maxSamples")
  100. }
  101. if sampleRate <= 0 {
  102. return nil, errors.New("invalid sampleRate")
  103. }
  104. if !Available() {
  105. return nil, errors.New("cuda device not available")
  106. }
  107. e := &Engine{
  108. maxSamples: maxSamples,
  109. sampleRate: sampleRate,
  110. cudaReady: true,
  111. iqBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
  112. audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
  113. }
  114. var ptr unsafe.Pointer
  115. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  116. e.Close()
  117. return nil, errors.New("cudaMalloc dIQIn failed")
  118. }
  119. e.dIQIn = (*C.gpud_float2)(ptr)
  120. ptr = nil
  121. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  122. e.Close()
  123. return nil, errors.New("cudaMalloc dShifted failed")
  124. }
  125. e.dShifted = (*C.gpud_float2)(ptr)
  126. ptr = nil
  127. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  128. e.Close()
  129. return nil, errors.New("cudaMalloc dFiltered failed")
  130. }
  131. e.dFiltered = (*C.gpud_float2)(ptr)
  132. ptr = nil
  133. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  134. e.Close()
  135. return nil, errors.New("cudaMalloc dDecimated failed")
  136. }
  137. e.dDecimated = (*C.gpud_float2)(ptr)
  138. ptr = nil
  139. if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
  140. e.Close()
  141. return nil, errors.New("cudaMalloc dAudio failed")
  142. }
  143. e.dAudio = (*C.float)(ptr)
  144. return e, nil
  145. }
  146. func (e *Engine) SetFIR(taps []float32) {
  147. if len(taps) == 0 {
  148. e.firTaps = nil
  149. return
  150. }
  151. if len(taps) > 256 {
  152. taps = taps[:256]
  153. }
  154. e.firTaps = append(e.firTaps[:0], taps...)
  155. if e.cudaReady {
  156. _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps)))
  157. }
  158. }
  159. func phaseStatus() string { return "phase1c-validated-shift" }
  160. func (e *Engine) LastShiftUsedGPU() bool {
  161. return e != nil && e.lastShiftUsedGPU
  162. }
  163. func (e *Engine) LastDemodUsedGPU() bool {
  164. return e != nil && e.lastDemodUsedGPU
  165. }
  166. func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) {
  167. if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
  168. return nil, false
  169. }
  170. bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  171. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess {
  172. return nil, false
  173. }
  174. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  175. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
  176. return nil, false
  177. }
  178. if C.gpud_device_sync() != C.cudaSuccess {
  179. return nil, false
  180. }
  181. out := make([]complex64, len(iq))
  182. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess {
  183. return nil, false
  184. }
  185. e.phase += phaseInc * float64(len(iq))
  186. return out, true
  187. }
  188. func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) {
  189. if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil {
  190. return nil, false
  191. }
  192. iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  193. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess {
  194. return nil, false
  195. }
  196. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 {
  197. return nil, false
  198. }
  199. if C.gpud_device_sync() != C.cudaSuccess {
  200. return nil, false
  201. }
  202. out := make([]complex64, len(iq))
  203. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess {
  204. return nil, false
  205. }
  206. return out, true
  207. }
  208. func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) {
  209. if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil {
  210. return nil, false
  211. }
  212. nOut := len(filtered) / factor
  213. if nOut <= 0 {
  214. return nil, false
  215. }
  216. iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0)))
  217. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess {
  218. return nil, false
  219. }
  220. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 {
  221. return nil, false
  222. }
  223. if C.gpud_device_sync() != C.cudaSuccess {
  224. return nil, false
  225. }
  226. out := make([]complex64, nOut)
  227. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  228. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  229. return nil, false
  230. }
  231. return out, true
  232. }
  233. func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) {
  234. if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil {
  235. return nil, false
  236. }
  237. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  238. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  239. return nil, false
  240. }
  241. if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  242. return nil, false
  243. }
  244. if C.gpud_device_sync() != C.cudaSuccess {
  245. return nil, false
  246. }
  247. out := make([]float32, len(shifted)-1)
  248. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  249. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  250. return nil, false
  251. }
  252. return out, true
  253. }
  254. func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) {
  255. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  256. return nil, false
  257. }
  258. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  259. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  260. return nil, false
  261. }
  262. if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  263. return nil, false
  264. }
  265. if C.gpud_device_sync() != C.cudaSuccess {
  266. return nil, false
  267. }
  268. out := make([]float32, len(shifted))
  269. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  270. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  271. return nil, false
  272. }
  273. return out, true
  274. }
  275. func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) {
  276. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  277. return nil, false
  278. }
  279. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  280. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  281. return nil, false
  282. }
  283. phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  284. if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 {
  285. return nil, false
  286. }
  287. if C.gpud_device_sync() != C.cudaSuccess {
  288. return nil, false
  289. }
  290. out := make([]float32, len(shifted))
  291. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  292. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  293. return nil, false
  294. }
  295. e.bfoPhase += phaseInc * float64(len(shifted))
  296. return out, true
  297. }
  298. func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  299. if e == nil {
  300. return nil, 0, errors.New("nil CUDA demod engine")
  301. }
  302. if !e.cudaReady {
  303. return nil, 0, errors.New("cuda demod engine is not initialized")
  304. }
  305. if len(iq) == 0 {
  306. return nil, 0, nil
  307. }
  308. if len(iq) > e.maxSamples {
  309. return nil, 0, errors.New("sample count exceeds engine capacity")
  310. }
  311. _ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz)
  312. shifted, ok := e.tryCUDAFreqShift(iq, offsetHz)
  313. e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3)
  314. if !e.lastShiftUsedGPU {
  315. shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz)
  316. }
  317. var outRate int
  318. switch mode {
  319. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  320. outRate = 48000
  321. case DemodWFM:
  322. outRate = 192000
  323. default:
  324. return nil, 0, errors.New("unsupported demod type")
  325. }
  326. cutoff := bw / 2
  327. if cutoff < 200 {
  328. cutoff = 200
  329. }
  330. taps := e.firTaps
  331. if len(taps) == 0 {
  332. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  333. taps = make([]float32, len(base64))
  334. for i, v := range base64 {
  335. taps[i] = float32(v)
  336. }
  337. e.SetFIR(taps)
  338. }
  339. filtered, ok := e.tryCUDAFIR(shifted, len(taps))
  340. e.lastFIRUsedGPU = ok && ValidateFIR(shifted, taps, filtered, 1e-3)
  341. if !e.lastFIRUsedGPU {
  342. ftaps := make([]float64, len(taps))
  343. for i, v := range taps {
  344. ftaps[i] = float64(v)
  345. }
  346. filtered = dsp.ApplyFIR(shifted, ftaps)
  347. }
  348. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  349. if decim < 1 {
  350. decim = 1
  351. }
  352. dec, ok := e.tryCUDADecimate(filtered, decim)
  353. e.lastDecimUsedGPU = ok && ValidateDecimate(filtered, decim, dec, 1e-3)
  354. if !e.lastDecimUsedGPU {
  355. dec = dsp.Decimate(filtered, decim)
  356. }
  357. inputRate := e.sampleRate / decim
  358. e.lastDemodUsedGPU = false
  359. switch mode {
  360. case DemodNFM:
  361. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  362. e.lastDemodUsedGPU = true
  363. return gpuAudio, inputRate, nil
  364. }
  365. return demod.NFM{}.Demod(dec, inputRate), inputRate, nil
  366. case DemodWFM:
  367. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  368. e.lastDemodUsedGPU = true
  369. return gpuAudio, inputRate, nil
  370. }
  371. return demod.WFM{}.Demod(dec, inputRate), inputRate, nil
  372. case DemodAM:
  373. if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok {
  374. e.lastDemodUsedGPU = true
  375. return gpuAudio, inputRate, nil
  376. }
  377. return demod.AM{}.Demod(dec, inputRate), inputRate, nil
  378. case DemodUSB:
  379. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  380. e.lastDemodUsedGPU = true
  381. return gpuAudio, inputRate, nil
  382. }
  383. return demod.USB{}.Demod(dec, inputRate), inputRate, nil
  384. case DemodLSB:
  385. if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok {
  386. e.lastDemodUsedGPU = true
  387. return gpuAudio, inputRate, nil
  388. }
  389. return demod.LSB{}.Demod(dec, inputRate), inputRate, nil
  390. case DemodCW:
  391. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  392. e.lastDemodUsedGPU = true
  393. return gpuAudio, inputRate, nil
  394. }
  395. return demod.CW{}.Demod(dec, inputRate), inputRate, nil
  396. default:
  397. return nil, 0, errors.New("unsupported demod type")
  398. }
  399. }
  400. func (e *Engine) Close() {
  401. if e == nil {
  402. return
  403. }
  404. if e.dIQIn != nil {
  405. _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
  406. e.dIQIn = nil
  407. }
  408. if e.dShifted != nil {
  409. _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
  410. e.dShifted = nil
  411. }
  412. if e.dFiltered != nil {
  413. _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered))
  414. e.dFiltered = nil
  415. }
  416. if e.dDecimated != nil {
  417. _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
  418. e.dDecimated = nil
  419. }
  420. if e.dAudio != nil {
  421. _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
  422. e.dAudio = nil
  423. }
  424. e.firTaps = nil
  425. e.cudaReady = false
  426. }