Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

490 lignes
15KB

  1. //go:build cufft && !windows
  2. package gpudemod
  3. /*
  4. #cgo windows LDFLAGS: -L${SRCDIR}/../../../cuda-mingw -L${SRCDIR}/build -lgpudemod_kernels -lcufft64_12 -lcudart64_13 -lstdc++
  5. #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include"
  6. #include <cuda_runtime.h>
  7. #include <cufft.h>
  8. typedef struct { float x; float y; } gpud_float2;
  9. static int gpud_cuda_malloc(void **ptr, size_t bytes) {
  10. return (int)cudaMalloc(ptr, bytes);
  11. }
  12. static int gpud_cuda_free(void *ptr) {
  13. return (int)cudaFree(ptr);
  14. }
  15. static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) {
  16. return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
  17. }
  18. static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) {
  19. return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost);
  20. }
  21. static int gpud_device_sync() {
  22. return (int)cudaDeviceSynchronize();
  23. }
  24. extern int gpud_launch_freq_shift_cuda(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
  25. extern int gpud_launch_fm_discrim_cuda(const gpud_float2* in, float* out, int n);
  26. extern int gpud_upload_fir_taps_cuda(const float* taps, int n);
  27. extern int gpud_launch_fir_cuda(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
  28. extern int gpud_launch_decimate_cuda(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
  29. extern int gpud_launch_am_envelope_cuda(const gpud_float2* in, float* out, int n);
  30. extern int gpud_launch_ssb_product_cuda(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
  31. static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
  32. return gpud_launch_freq_shift_cuda(in, out, n, phase_inc, phase_start);
  33. }
  34. static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
  35. return gpud_launch_fm_discrim_cuda(in, out, n);
  36. }
  37. static int gpud_upload_fir_taps(const float* taps, int n) {
  38. return gpud_upload_fir_taps_cuda(taps, n);
  39. }
  40. static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) {
  41. return gpud_launch_fir_cuda(in, out, n, num_taps);
  42. }
  43. static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) {
  44. return gpud_launch_decimate_cuda(in, out, n_out, factor);
  45. }
  46. static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) {
  47. return gpud_launch_am_envelope_cuda(in, out, n);
  48. }
  49. static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) {
  50. return gpud_launch_ssb_product_cuda(in, out, n, phase_inc, phase_start);
  51. }
  52. */
  53. import "C"
  54. import (
  55. "errors"
  56. "fmt"
  57. "math"
  58. "unsafe"
  59. "sdr-visual-suite/internal/demod"
  60. "sdr-visual-suite/internal/dsp"
  61. )
  62. type DemodType int
  63. const (
  64. DemodNFM DemodType = iota
  65. DemodWFM
  66. DemodAM
  67. DemodUSB
  68. DemodLSB
  69. DemodCW
  70. )
  71. type Engine struct {
  72. maxSamples int
  73. sampleRate int
  74. phase float64
  75. bfoPhase float64
  76. firTaps []float32
  77. cudaReady bool
  78. lastShiftUsedGPU bool
  79. lastFIRUsedGPU bool
  80. lastDecimUsedGPU bool
  81. lastDemodUsedGPU bool
  82. dIQIn *C.gpud_float2
  83. dShifted *C.gpud_float2
  84. dFiltered *C.gpud_float2
  85. dDecimated *C.gpud_float2
  86. dAudio *C.float
  87. iqBytes C.size_t
  88. audioBytes C.size_t
  89. }
  90. func Available() bool {
  91. var count C.int
  92. if C.cudaGetDeviceCount(&count) != C.cudaSuccess {
  93. return false
  94. }
  95. return count > 0
  96. }
  97. func New(maxSamples int, sampleRate int) (*Engine, error) {
  98. if maxSamples <= 0 {
  99. return nil, errors.New("invalid maxSamples")
  100. }
  101. if sampleRate <= 0 {
  102. return nil, errors.New("invalid sampleRate")
  103. }
  104. if !Available() {
  105. return nil, errors.New("cuda device not available")
  106. }
  107. e := &Engine{
  108. maxSamples: maxSamples,
  109. sampleRate: sampleRate,
  110. cudaReady: true,
  111. iqBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.gpud_float2{})),
  112. audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))),
  113. }
  114. var ptr unsafe.Pointer
  115. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  116. e.Close()
  117. return nil, errors.New("cudaMalloc dIQIn failed")
  118. }
  119. e.dIQIn = (*C.gpud_float2)(ptr)
  120. ptr = nil
  121. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  122. e.Close()
  123. return nil, errors.New("cudaMalloc dShifted failed")
  124. }
  125. e.dShifted = (*C.gpud_float2)(ptr)
  126. ptr = nil
  127. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  128. e.Close()
  129. return nil, errors.New("cudaMalloc dFiltered failed")
  130. }
  131. e.dFiltered = (*C.gpud_float2)(ptr)
  132. ptr = nil
  133. if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
  134. e.Close()
  135. return nil, errors.New("cudaMalloc dDecimated failed")
  136. }
  137. e.dDecimated = (*C.gpud_float2)(ptr)
  138. ptr = nil
  139. if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
  140. e.Close()
  141. return nil, errors.New("cudaMalloc dAudio failed")
  142. }
  143. e.dAudio = (*C.float)(ptr)
  144. return e, nil
  145. }
  146. func (e *Engine) SetFIR(taps []float32) {
  147. if len(taps) == 0 {
  148. e.firTaps = nil
  149. return
  150. }
  151. if len(taps) > 256 {
  152. taps = taps[:256]
  153. }
  154. e.firTaps = append(e.firTaps[:0], taps...)
  155. if e.cudaReady {
  156. _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps)))
  157. }
  158. }
  159. func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU }
  160. func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU }
  161. func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) {
  162. if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil {
  163. return nil, false
  164. }
  165. bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  166. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess {
  167. return nil, false
  168. }
  169. phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate)
  170. if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 {
  171. return nil, false
  172. }
  173. if C.gpud_device_sync() != C.cudaSuccess {
  174. return nil, false
  175. }
  176. out := make([]complex64, len(iq))
  177. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess {
  178. return nil, false
  179. }
  180. e.phase += phaseInc * float64(len(iq))
  181. return out, true
  182. }
  183. func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) {
  184. if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil {
  185. return nil, false
  186. }
  187. iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
  188. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess {
  189. return nil, false
  190. }
  191. if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 {
  192. return nil, false
  193. }
  194. if C.gpud_device_sync() != C.cudaSuccess {
  195. return nil, false
  196. }
  197. out := make([]complex64, len(iq))
  198. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess {
  199. return nil, false
  200. }
  201. return out, true
  202. }
  203. func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) {
  204. if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil {
  205. return nil, false
  206. }
  207. nOut := len(filtered) / factor
  208. if nOut <= 0 {
  209. return nil, false
  210. }
  211. iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0)))
  212. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess {
  213. return nil, false
  214. }
  215. if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 {
  216. return nil, false
  217. }
  218. if C.gpud_device_sync() != C.cudaSuccess {
  219. return nil, false
  220. }
  221. out := make([]complex64, nOut)
  222. outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
  223. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
  224. return nil, false
  225. }
  226. return out, true
  227. }
  228. func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) {
  229. if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil {
  230. return nil, false
  231. }
  232. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  233. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  234. return nil, false
  235. }
  236. if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  237. return nil, false
  238. }
  239. if C.gpud_device_sync() != C.cudaSuccess {
  240. return nil, false
  241. }
  242. out := make([]float32, len(shifted)-1)
  243. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  244. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  245. return nil, false
  246. }
  247. return out, true
  248. }
  249. func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) {
  250. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  251. return nil, false
  252. }
  253. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  254. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  255. return nil, false
  256. }
  257. if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 {
  258. return nil, false
  259. }
  260. if C.gpud_device_sync() != C.cudaSuccess {
  261. return nil, false
  262. }
  263. out := make([]float32, len(shifted))
  264. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  265. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  266. return nil, false
  267. }
  268. return out, true
  269. }
  270. func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) {
  271. if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil {
  272. return nil, false
  273. }
  274. iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0)))
  275. if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess {
  276. return nil, false
  277. }
  278. phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate)
  279. if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 {
  280. return nil, false
  281. }
  282. if C.gpud_device_sync() != C.cudaSuccess {
  283. return nil, false
  284. }
  285. out := make([]float32, len(shifted))
  286. outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0)))
  287. if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess {
  288. return nil, false
  289. }
  290. e.bfoPhase += phaseInc * float64(len(shifted))
  291. return out, true
  292. }
  293. func (e *Engine) ShiftFilterDecimate(iq []complex64, offsetHz float64, bw float64, outRate int) ([]complex64, int, error) {
  294. if e == nil {
  295. return nil, 0, errors.New("nil CUDA demod engine")
  296. }
  297. if len(iq) == 0 {
  298. return nil, 0, nil
  299. }
  300. if outRate <= 0 {
  301. return nil, 0, errors.New("invalid output sample rate")
  302. }
  303. shifted := dsp.FreqShift(iq, e.sampleRate, offsetHz)
  304. cutoff := bw / 2
  305. if cutoff < 200 {
  306. cutoff = 200
  307. }
  308. if cutoff > float64(e.sampleRate)/2-1 {
  309. cutoff = float64(e.sampleRate)/2 - 1
  310. }
  311. ftaps := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  312. filtered := dsp.ApplyFIR(shifted, ftaps)
  313. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  314. if decim < 1 {
  315. decim = 1
  316. }
  317. return dsp.Decimate(filtered, decim), e.sampleRate / decim, nil
  318. }
  319. func (e *Engine) DemodFused(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  320. return e.Demod(iq, offsetHz, bw, mode)
  321. }
  322. func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) {
  323. if e == nil {
  324. return nil, 0, errors.New("nil CUDA demod engine")
  325. }
  326. if !e.cudaReady {
  327. return nil, 0, errors.New("cuda demod engine is not initialized")
  328. }
  329. if len(iq) == 0 {
  330. return nil, 0, nil
  331. }
  332. if len(iq) > e.maxSamples {
  333. return nil, 0, errors.New("sample count exceeds engine capacity")
  334. }
  335. shifted, ok := e.tryCUDAFreqShift(iq, offsetHz)
  336. e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3)
  337. if !e.lastShiftUsedGPU {
  338. shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz)
  339. }
  340. var outRate int
  341. switch mode {
  342. case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW:
  343. outRate = 48000
  344. case DemodWFM:
  345. outRate = 192000
  346. default:
  347. return nil, 0, errors.New("unsupported demod type")
  348. }
  349. cutoff := bw / 2
  350. if cutoff < 200 {
  351. cutoff = 200
  352. }
  353. taps := e.firTaps
  354. if len(taps) == 0 {
  355. base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  356. taps = make([]float32, len(base64))
  357. for i, v := range base64 {
  358. taps[i] = float32(v)
  359. }
  360. e.SetFIR(taps)
  361. }
  362. filtered, ok := e.tryCUDAFIR(shifted, len(taps))
  363. e.lastFIRUsedGPU = ok && ValidateFIR(shifted, taps, filtered, 1e-3)
  364. if !e.lastFIRUsedGPU {
  365. ftaps := make([]float64, len(taps))
  366. for i, v := range taps {
  367. ftaps[i] = float64(v)
  368. }
  369. filtered = dsp.ApplyFIR(shifted, ftaps)
  370. }
  371. decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
  372. if decim < 1 {
  373. decim = 1
  374. }
  375. dec, ok := e.tryCUDADecimate(filtered, decim)
  376. e.lastDecimUsedGPU = ok && ValidateDecimate(filtered, decim, dec, 1e-3)
  377. if !e.lastDecimUsedGPU {
  378. dec = dsp.Decimate(filtered, decim)
  379. }
  380. inputRate := e.sampleRate / decim
  381. e.lastDemodUsedGPU = false
  382. switch mode {
  383. case DemodNFM:
  384. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  385. e.lastDemodUsedGPU = true
  386. return gpuAudio, inputRate, nil
  387. }
  388. return demod.NFM{}.Demod(dec, inputRate), inputRate, nil
  389. case DemodWFM:
  390. if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok {
  391. e.lastDemodUsedGPU = true
  392. return gpuAudio, inputRate, nil
  393. }
  394. return demod.WFM{}.Demod(dec, inputRate), inputRate, nil
  395. case DemodAM:
  396. if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok {
  397. e.lastDemodUsedGPU = true
  398. return gpuAudio, inputRate, nil
  399. }
  400. return demod.AM{}.Demod(dec, inputRate), inputRate, nil
  401. case DemodUSB:
  402. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  403. e.lastDemodUsedGPU = true
  404. return gpuAudio, inputRate, nil
  405. }
  406. return demod.USB{}.Demod(dec, inputRate), inputRate, nil
  407. case DemodLSB:
  408. if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok {
  409. e.lastDemodUsedGPU = true
  410. return gpuAudio, inputRate, nil
  411. }
  412. return demod.LSB{}.Demod(dec, inputRate), inputRate, nil
  413. case DemodCW:
  414. if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok {
  415. e.lastDemodUsedGPU = true
  416. return gpuAudio, inputRate, nil
  417. }
  418. return demod.CW{}.Demod(dec, inputRate), inputRate, nil
  419. default:
  420. return nil, 0, errors.New("unsupported demod type")
  421. }
  422. }
  423. func (e *Engine) Close() {
  424. if e == nil {
  425. return
  426. }
  427. if e.dIQIn != nil {
  428. _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
  429. e.dIQIn = nil
  430. }
  431. if e.dShifted != nil {
  432. _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
  433. e.dShifted = nil
  434. }
  435. if e.dFiltered != nil {
  436. _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered))
  437. e.dFiltered = nil
  438. }
  439. if e.dDecimated != nil {
  440. _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
  441. e.dDecimated = nil
  442. }
  443. if e.dAudio != nil {
  444. _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
  445. e.dAudio = nil
  446. }
  447. e.firTaps = nil
  448. e.cudaReady = false
  449. }