Wideband autonomous SDR analysis engine forked from sdr-visual-suite
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

183 строки
4.9KB

  1. //go:build cufft && windows
  2. package gpudemod
  3. /*
  4. #include <cuda_runtime.h>
  5. */
  6. import "C"
  7. import (
  8. "errors"
  9. "math"
  10. "unsafe"
  11. "sdr-visual-suite/internal/dsp"
  12. )
  13. type slotBuffers struct {
  14. dShifted unsafe.Pointer
  15. dFiltered unsafe.Pointer
  16. dDecimated unsafe.Pointer
  17. dTaps unsafe.Pointer
  18. stream streamHandle
  19. }
  20. func (r *BatchRunner) freeSlotBuffers() {
  21. for i := range r.slotBufs {
  22. if r.slotBufs[i].dShifted != nil {
  23. _ = bridgeCudaFree(r.slotBufs[i].dShifted)
  24. r.slotBufs[i].dShifted = nil
  25. }
  26. if r.slotBufs[i].dFiltered != nil {
  27. _ = bridgeCudaFree(r.slotBufs[i].dFiltered)
  28. r.slotBufs[i].dFiltered = nil
  29. }
  30. if r.slotBufs[i].dDecimated != nil {
  31. _ = bridgeCudaFree(r.slotBufs[i].dDecimated)
  32. r.slotBufs[i].dDecimated = nil
  33. }
  34. if r.slotBufs[i].dTaps != nil {
  35. _ = bridgeCudaFree(r.slotBufs[i].dTaps)
  36. r.slotBufs[i].dTaps = nil
  37. }
  38. if r.slotBufs[i].stream != nil {
  39. _ = bridgeStreamDestroy(r.slotBufs[i].stream)
  40. r.slotBufs[i].stream = nil
  41. }
  42. }
  43. r.slotBufs = nil
  44. }
  45. func (r *BatchRunner) allocSlotBuffers(n int) error {
  46. // Re-allocate if slot count changed OR if buffer size grew
  47. needRealloc := len(r.slotBufs) != len(r.slots) || n > r.slotBufSize
  48. if !needRealloc && len(r.slotBufs) > 0 {
  49. return nil
  50. }
  51. r.freeSlotBuffers()
  52. if len(r.slots) == 0 {
  53. return nil
  54. }
  55. iqBytes := uintptr(n) * unsafe.Sizeof(complex64(0))
  56. tapsBytes := uintptr(256) * unsafe.Sizeof(float32(0))
  57. r.slotBufs = make([]slotBuffers, len(r.slots))
  58. for i := range r.slotBufs {
  59. for _, ptr := range []*unsafe.Pointer{&r.slotBufs[i].dShifted, &r.slotBufs[i].dFiltered, &r.slotBufs[i].dDecimated} {
  60. if bridgeCudaMalloc(ptr, iqBytes) != 0 {
  61. r.freeSlotBuffers()
  62. return errors.New("cudaMalloc slot buffer failed")
  63. }
  64. }
  65. if bridgeCudaMalloc(&r.slotBufs[i].dTaps, tapsBytes) != 0 {
  66. r.freeSlotBuffers()
  67. return errors.New("cudaMalloc slot taps failed")
  68. }
  69. s, res := bridgeStreamCreate()
  70. if res != 0 {
  71. r.freeSlotBuffers()
  72. return errors.New("cudaStreamCreate failed")
  73. }
  74. r.slotBufs[i].stream = s
  75. }
  76. r.slotBufSize = n
  77. return nil
  78. }
  79. func (r *BatchRunner) shiftFilterDecimateBatchImpl(iq []complex64) ([][]complex64, []int, error) {
  80. e := r.eng
  81. if e == nil || !e.cudaReady {
  82. return nil, nil, ErrUnavailable
  83. }
  84. outs := make([][]complex64, len(r.slots))
  85. rates := make([]int, len(r.slots))
  86. n := len(iq)
  87. if n == 0 {
  88. return outs, rates, nil
  89. }
  90. if err := r.allocSlotBuffers(n); err != nil {
  91. return nil, nil, err
  92. }
  93. bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0))
  94. if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 {
  95. return nil, nil, errors.New("cudaMemcpy H2D failed")
  96. }
  97. for i := range r.slots {
  98. if !r.slots[i].active {
  99. continue
  100. }
  101. nOut, rate, err := r.shiftFilterDecimateSlotParallel(iq, r.slots[i].job, r.slotBufs[i])
  102. if err != nil {
  103. return nil, nil, err
  104. }
  105. r.slots[i].rate = rate
  106. outs[i] = make([]complex64, nOut)
  107. rates[i] = rate
  108. }
  109. for i := range r.slots {
  110. if !r.slots[i].active {
  111. continue
  112. }
  113. buf := r.slotBufs[i]
  114. if bridgeStreamSync(buf.stream) != 0 {
  115. return nil, nil, errors.New("cuda stream sync failed")
  116. }
  117. out := outs[i]
  118. if len(out) == 0 {
  119. continue
  120. }
  121. outBytes := uintptr(len(out)) * unsafe.Sizeof(complex64(0))
  122. if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), buf.dDecimated, outBytes) != 0 {
  123. return nil, nil, errors.New("cudaMemcpy D2H failed")
  124. }
  125. r.slots[i].out = out
  126. }
  127. return outs, rates, nil
  128. }
  129. func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job ExtractJob, buf slotBuffers) (int, int, error) {
  130. e := r.eng
  131. if e == nil || !e.cudaReady {
  132. return 0, 0, ErrUnavailable
  133. }
  134. n := len(iq)
  135. if n == 0 {
  136. return 0, 0, nil
  137. }
  138. cutoff := job.BW / 2
  139. if cutoff < 200 {
  140. cutoff = 200
  141. }
  142. base := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  143. taps := make([]float32, len(base))
  144. for i, v := range base {
  145. taps[i] = float32(v)
  146. }
  147. if len(taps) == 0 {
  148. return 0, 0, errors.New("no FIR taps configured")
  149. }
  150. tapsBytes := uintptr(len(taps)) * unsafe.Sizeof(float32(0))
  151. if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 {
  152. return 0, 0, errors.New("taps H2D failed")
  153. }
  154. decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate)))
  155. if decim < 1 {
  156. decim = 1
  157. }
  158. nOut := n / decim
  159. if nOut <= 0 {
  160. return 0, 0, errors.New("not enough output samples after decimation")
  161. }
  162. phaseInc := -2.0 * math.Pi * job.OffsetHz / float64(e.sampleRate)
  163. if bridgeLaunchFreqShiftStream(e.dIQIn, (*gpuFloat2)(buf.dShifted), n, phaseInc, job.PhaseStart, buf.stream) != 0 {
  164. return 0, 0, errors.New("gpu freq shift failed")
  165. }
  166. if bridgeLaunchFIRv2Stream((*gpuFloat2)(buf.dShifted), (*gpuFloat2)(buf.dFiltered), (*C.float)(buf.dTaps), n, len(taps), buf.stream) != 0 {
  167. return 0, 0, errors.New("gpu FIR v2 failed")
  168. }
  169. if bridgeLaunchDecimateStream((*gpuFloat2)(buf.dFiltered), (*gpuFloat2)(buf.dDecimated), nOut, decim, buf.stream) != 0 {
  170. return 0, 0, errors.New("gpu decimate failed")
  171. }
  172. return nOut, e.sampleRate / decim, nil
  173. }