Du kan inte välja fler än 25 ämnen. Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

190 lines
5.0KB

  1. //go:build cufft && windows
  2. package gpudemod
  3. /*
  4. #include <cuda_runtime.h>
  5. */
  6. import "C"
  import (
  	"errors"
  	"math"
  	"sync"
  	"unsafe"

  	"sdr-visual-suite/internal/dsp"
  )
  13. type slotBuffers struct {
  14. dShifted unsafe.Pointer
  15. dFiltered unsafe.Pointer
  16. dDecimated unsafe.Pointer
  17. dTaps unsafe.Pointer
  18. stream streamHandle
  19. }
  20. type windowsBatchRunner struct {
  21. *BatchRunner
  22. slotBufs []slotBuffers
  23. }
  24. func asWindowsBatchRunner(r *BatchRunner) *windowsBatchRunner {
  25. return (*windowsBatchRunner)(unsafe.Pointer(r))
  26. }
  27. func (r *windowsBatchRunner) freeSlotBuffers() {
  28. for i := range r.slotBufs {
  29. if r.slotBufs[i].dShifted != nil {
  30. _ = bridgeCudaFree(r.slotBufs[i].dShifted)
  31. r.slotBufs[i].dShifted = nil
  32. }
  33. if r.slotBufs[i].dFiltered != nil {
  34. _ = bridgeCudaFree(r.slotBufs[i].dFiltered)
  35. r.slotBufs[i].dFiltered = nil
  36. }
  37. if r.slotBufs[i].dDecimated != nil {
  38. _ = bridgeCudaFree(r.slotBufs[i].dDecimated)
  39. r.slotBufs[i].dDecimated = nil
  40. }
  41. if r.slotBufs[i].dTaps != nil {
  42. _ = bridgeCudaFree(r.slotBufs[i].dTaps)
  43. r.slotBufs[i].dTaps = nil
  44. }
  45. if r.slotBufs[i].stream != nil {
  46. _ = bridgeStreamDestroy(r.slotBufs[i].stream)
  47. r.slotBufs[i].stream = nil
  48. }
  49. }
  50. r.slotBufs = nil
  51. }
  52. func (r *windowsBatchRunner) allocSlotBuffers(n int) error {
  53. if len(r.slotBufs) == len(r.slots) && len(r.slotBufs) > 0 {
  54. return nil
  55. }
  56. r.freeSlotBuffers()
  57. if len(r.slots) == 0 {
  58. return nil
  59. }
  60. iqBytes := uintptr(n) * unsafe.Sizeof(complex64(0))
  61. tapsBytes := uintptr(256) * unsafe.Sizeof(float32(0))
  62. r.slotBufs = make([]slotBuffers, len(r.slots))
  63. for i := range r.slotBufs {
  64. for _, ptr := range []*unsafe.Pointer{&r.slotBufs[i].dShifted, &r.slotBufs[i].dFiltered, &r.slotBufs[i].dDecimated} {
  65. if bridgeCudaMalloc(ptr, iqBytes) != 0 {
  66. r.freeSlotBuffers()
  67. return errors.New("cudaMalloc slot buffer failed")
  68. }
  69. }
  70. if bridgeCudaMalloc(&r.slotBufs[i].dTaps, tapsBytes) != 0 {
  71. r.freeSlotBuffers()
  72. return errors.New("cudaMalloc slot taps failed")
  73. }
  74. s, res := bridgeStreamCreate()
  75. if res != 0 {
  76. r.freeSlotBuffers()
  77. return errors.New("cudaStreamCreate failed")
  78. }
  79. r.slotBufs[i].stream = s
  80. }
  81. return nil
  82. }
  83. func (r *BatchRunner) shiftFilterDecimateBatchImpl(iq []complex64) ([][]complex64, []int, error) {
  84. wr := asWindowsBatchRunner(r)
  85. e := r.eng
  86. if e == nil || !e.cudaReady {
  87. return nil, nil, ErrUnavailable
  88. }
  89. outs := make([][]complex64, len(r.slots))
  90. rates := make([]int, len(r.slots))
  91. n := len(iq)
  92. if n == 0 {
  93. return outs, rates, nil
  94. }
  95. if err := wr.allocSlotBuffers(n); err != nil {
  96. return nil, nil, err
  97. }
  98. bytesIn := uintptr(n) * unsafe.Sizeof(complex64(0))
  99. if bridgeMemcpyH2D(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytesIn) != 0 {
  100. return nil, nil, errors.New("cudaMemcpy H2D failed")
  101. }
  102. for i := range r.slots {
  103. if !r.slots[i].active {
  104. continue
  105. }
  106. nOut, rate, err := r.shiftFilterDecimateSlotParallel(iq, r.slots[i].job, wr.slotBufs[i])
  107. if err != nil {
  108. return nil, nil, err
  109. }
  110. r.slots[i].rate = rate
  111. outs[i] = make([]complex64, nOut)
  112. rates[i] = rate
  113. }
  114. for i := range r.slots {
  115. if !r.slots[i].active {
  116. continue
  117. }
  118. buf := wr.slotBufs[i]
  119. if bridgeStreamSync(buf.stream) != 0 {
  120. return nil, nil, errors.New("cuda stream sync failed")
  121. }
  122. out := outs[i]
  123. if len(out) == 0 {
  124. continue
  125. }
  126. outBytes := uintptr(len(out)) * unsafe.Sizeof(complex64(0))
  127. if bridgeMemcpyD2H(unsafe.Pointer(&out[0]), buf.dDecimated, outBytes) != 0 {
  128. return nil, nil, errors.New("cudaMemcpy D2H failed")
  129. }
  130. r.slots[i].out = out
  131. }
  132. return outs, rates, nil
  133. }
  134. func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job ExtractJob, buf slotBuffers) (int, int, error) {
  135. e := r.eng
  136. if e == nil || !e.cudaReady {
  137. return 0, 0, ErrUnavailable
  138. }
  139. n := len(iq)
  140. if n == 0 {
  141. return 0, 0, nil
  142. }
  143. cutoff := job.BW / 2
  144. if cutoff < 200 {
  145. cutoff = 200
  146. }
  147. base := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
  148. taps := make([]float32, len(base))
  149. for i, v := range base {
  150. taps[i] = float32(v)
  151. }
  152. if len(taps) == 0 {
  153. return 0, 0, errors.New("no FIR taps configured")
  154. }
  155. tapsBytes := uintptr(len(taps)) * unsafe.Sizeof(float32(0))
  156. if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 {
  157. return 0, 0, errors.New("taps H2D failed")
  158. }
  159. decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate)))
  160. if decim < 1 {
  161. decim = 1
  162. }
  163. nOut := n / decim
  164. if nOut <= 0 {
  165. return 0, 0, errors.New("not enough output samples after decimation")
  166. }
  167. phaseInc := -2.0 * math.Pi * job.OffsetHz / float64(e.sampleRate)
  168. if bridgeLaunchFreqShiftStream(e.dIQIn, (*gpuFloat2)(buf.dShifted), n, phaseInc, e.phase, buf.stream) != 0 {
  169. return 0, 0, errors.New("gpu freq shift failed")
  170. }
  171. if bridgeLaunchFIRv2Stream((*gpuFloat2)(buf.dShifted), (*gpuFloat2)(buf.dFiltered), (*C.float)(buf.dTaps), n, len(taps), buf.stream) != 0 {
  172. return 0, 0, errors.New("gpu FIR v2 failed")
  173. }
  174. if bridgeLaunchDecimateStream((*gpuFloat2)(buf.dFiltered), (*gpuFloat2)(buf.dDecimated), nOut, decim, buf.stream) != 0 {
  175. return 0, 0, errors.New("gpu decimate failed")
  176. }
  177. return nOut, e.sampleRate / decim, nil
  178. }