diff --git a/README.md b/README.md index 73bd73c..9466dc5 100644 --- a/README.md +++ b/README.md @@ -42,15 +42,14 @@ powershell -ExecutionPolicy Bypass -File .\build-sdrplay.ps1 ``` This path uses: -- MinGW GCC/G++ for the Go/CGO toolchain -- `nvcc` with MinGW `g++` as the host compiler for `gpudemod` kernels -- MinGW-compatible CUDA import libs from `cuda-mingw/` +- `nvcc` + MSVC to build `gpudemod_kernels.dll` +- MinGW GCC/G++ for the Go/CGO application build +- runtime DLL loading for the Windows `gpudemod` path Important: -- the kernel archive is generated as `internal/demod/gpudemod/build/libgpudemod_kernels.a` -- `-lstdc++` is linked explicitly for CUDA host-side C++ runtime references -- CUDA 13.x no longer supports older targets like `sm_50`/`sm_60`, so the script builds for `sm_75+` -- if `nvcc` is missing, CUDA kernel preparation will fail +- `gpudemod_kernels.dll` must be present next to `sdrd.exe` or in `internal/demod/gpudemod/build/` +- `build-sdrplay.ps1` copies the DLL to the repo root after a successful app build when available +- this avoids directly linking MSVC CUDA kernel objects into the MinGW-linked Go binary ### Linux ```bash diff --git a/build-cuda-windows.ps1 b/build-cuda-windows.ps1 index fb2f2e3..be408ef 100644 --- a/build-cuda-windows.ps1 +++ b/build-cuda-windows.ps1 @@ -1,19 +1,6 @@ $ErrorActionPreference = 'Stop' -$mingw = 'C:\msys64\mingw64\bin' -if (-not (Test-Path (Join-Path $mingw 'g++.exe'))) { - throw "MinGW g++ not found at $mingw" -} - -$cudaBin = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin' -if (-not (Test-Path (Join-Path $cudaBin 'nvcc.exe'))) { - throw "nvcc.exe not found at $cudaBin" -} - -$env:PATH = "$mingw;$cudaBin;" + $env:PATH - -Write-Host 'Preparing Windows CUDA environment for gpudemod (MinGW host compiler)...' -ForegroundColor Cyan -powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -if ($LASTEXITCODE -ne 0) { throw 'kernel build failed' } - -Write-Host 'Done. 
GNU-compatible gpudemod kernel library prepared.' -ForegroundColor Green +Write-Host 'Preparing Windows CUDA DLL for gpudemod (MSVC/nvcc path)...' -ForegroundColor Cyan +powershell -ExecutionPolicy Bypass -File .\build-gpudemod-dll.ps1 +if ($LASTEXITCODE -ne 0) { throw 'gpudemod DLL build failed' } +Write-Host 'Done. gpudemod_kernels.dll is ready.' -ForegroundColor Green diff --git a/build-gpudemod-dll.ps1 b/build-gpudemod-dll.ps1 new file mode 100644 index 0000000..47f021c --- /dev/null +++ b/build-gpudemod-dll.ps1 @@ -0,0 +1,27 @@ +$ErrorActionPreference = 'Stop' + +$vcvars = 'C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat' +$cudaRoot = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2' +$nvcc = Join-Path $cudaRoot 'bin\nvcc.exe' +$src = Join-Path $PSScriptRoot 'internal\demod\gpudemod\native\exports.cu' +$outDir = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build' +$dll = Join-Path $outDir 'gpudemod_kernels.dll' +$lib = Join-Path $outDir 'gpudemod_kernels.lib' +$exp = Join-Path $outDir 'gpudemod_kernels.exp' + +if (!(Test-Path $vcvars)) { throw "vcvars64.bat not found at $vcvars" } +if (!(Test-Path $nvcc)) { throw "nvcc.exe not found at $nvcc" } +if (!(Test-Path $src)) { throw "CUDA source not found at $src" } +if (!(Test-Path $outDir)) { New-Item -ItemType Directory -Path $outDir | Out-Null } + +Remove-Item $dll,$lib,$exp -Force -ErrorAction SilentlyContinue + +$cmd = @" +call "$vcvars" && "$nvcc" -shared "$src" -o "$dll" -Xcompiler "/MD" -arch=sm_75 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 +"@ + +Write-Host 'Building gpudemod CUDA DLL...' 
-ForegroundColor Cyan +cmd.exe /c $cmd +if ($LASTEXITCODE -ne 0) { throw 'gpudemod DLL build failed' } + +Write-Host "Built: $dll" -ForegroundColor Green diff --git a/build-sdrplay.ps1 b/build-sdrplay.ps1 index 6f8e56b..a731d63 100644 --- a/build-sdrplay.ps1 +++ b/build-sdrplay.ps1 @@ -15,36 +15,27 @@ $env:CXX = 'g++' $env:CGO_CFLAGS = '-IC:\PROGRA~1\SDRplay\API\inc' $env:CGO_LDFLAGS = '-LC:\PROGRA~1\SDRplay\API\x64 -lsdrplay_api' -# CUDA (cuFFT) +# CUDA runtime / cuFFT $cudaInc = 'C:\CUDA\include' $cudaBin = 'C:\CUDA\bin' -if (-not (Test-Path $cudaInc)) { - $cudaInc = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include' - $cudaBin = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin' -} -if (Test-Path $cudaInc) { - $env:CGO_CFLAGS = "$env:CGO_CFLAGS -I$cudaInc" -} -if (Test-Path $cudaBin) { - $env:PATH = "$cudaBin;" + $env:PATH -} - +if (-not (Test-Path $cudaInc)) { $cudaInc = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\include' } +if (-not (Test-Path $cudaBin)) { $cudaBin = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\bin' } $cudaMingw = Join-Path $PSScriptRoot 'cuda-mingw' -$gpuDemodBuild = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build' -if (Test-Path $cudaMingw) { - $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw" -} -if (Test-Path $gpuDemodBuild) { - $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$gpuDemodBuild" -} -$env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -lgpudemod_kernels -lcufft64_12 -lcudart64_13 -lstdc++" - -Write-Host 'Building with SDRplay + cuFFT support (MinGW-host CUDA path)...' -ForegroundColor Cyan -Write-Host 'Preparing GNU-compatible CUDA kernel artifacts...' 
-ForegroundColor Cyan -powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -if ($LASTEXITCODE -ne 0) { throw 'kernel build failed' } +if (Test-Path $cudaInc) { $env:CGO_CFLAGS = "$env:CGO_CFLAGS -I$cudaInc" } +if (Test-Path $cudaBin) { $env:PATH = "$cudaBin;" + $env:PATH } +if (Test-Path $cudaMingw) { $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw -lcudart64_13 -lcufft64_12 -lkernel32" } +Write-Host 'Building SDRplay + cuFFT app (Windows DLL path)...' -ForegroundColor Cyan go build -tags "sdrplay,cufft" ./cmd/sdrd if ($LASTEXITCODE -ne 0) { throw 'build failed' } +$dllSrc = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build\gpudemod_kernels.dll' +$dllDst = Join-Path $PSScriptRoot 'gpudemod_kernels.dll' +if (Test-Path $dllSrc) { + Copy-Item $dllSrc $dllDst -Force + Write-Host "Copied DLL to $dllDst" -ForegroundColor Green +} else { + Write-Host 'WARNING: gpudemod_kernels.dll not found; build succeeded but runtime GPU demod will not load.' -ForegroundColor Yellow +} + Write-Host 'Done.' -ForegroundColor Green diff --git a/docs/build-cuda.md b/docs/build-cuda.md index 21c53d4..938bc73 100644 --- a/docs/build-cuda.md +++ b/docs/build-cuda.md @@ -1,55 +1,38 @@ # CUDA Build Strategy -## Windows: MinGW-host NVCC path +## Windows: gpudemod DLL split -The recommended Windows CUDA build path for this repository is: +The recommended Windows CUDA path is now a DLL split for `gpudemod`: -1. Compile `internal/demod/gpudemod/kernels.cu` with `nvcc` using MinGW `g++` as the host compiler -2. Archive the result as `internal/demod/gpudemod/build/libgpudemod_kernels.a` -3. Build the Go app with MinGW GCC/G++ via CGO +1. Build `internal/demod/gpudemod/native/exports.cu` into `gpudemod_kernels.dll` using `nvcc` + MSVC +2. Build the Go app with MinGW GCC/G++ via CGO +3. Load `gpudemod_kernels.dll` at runtime from Go on Windows -This keeps the CUDA demod kernel library in a GNU-compatible format so Go's MinGW CGO linker can consume it. 
+This avoids direct static linking of MSVC-built CUDA objects into the MinGW-linked Go binary. -### Why +## Why -The previous failing path mixed: -- `nvcc` + default MSVC host compiler (`cl.exe`) for CUDA kernels -- MinGW GCC/LD for the final Go/CGO link +The previous failing paths mixed incompatible toolchains at final link time: +- MSVC-host CUDA object/library generation +- MinGW GCC/LD for the Go executable -That produced unresolved MSVC runtime symbols such as: -- `__GSHandlerCheck` -- `__security_cookie` -- `_Init_thread_epoch` +The DLL split keeps that boundary at runtime instead of link time. -### Current Windows build flow +## Current Windows build flow ```powershell powershell -ExecutionPolicy Bypass -File .\build-cuda-windows.ps1 powershell -ExecutionPolicy Bypass -File .\build-sdrplay.ps1 ``` -### Critical details +## Runtime expectation -- CUDA kernel archive must be named `libgpudemod_kernels.a` -- `nvcc` must be invoked with `-ccbin C:\msys64\mingw64\bin\g++.exe` -- Windows CGO link uses: - - SDRplay API import lib - - MinGW CUDA import libs from `cuda-mingw/` - - `-lgpudemod_kernels` - - `-lcufft64_12` - - `-lcudart64_13` - - `-lstdc++` +`gpudemod_kernels.dll` must be available either: +- next to `sdrd.exe`, or +- in `internal/demod/gpudemod/build/` during local runs from the repo -### Caveat - -`nvcc` + MinGW on Windows is not officially supported by NVIDIA. For the kernel launcher style used here (`extern "C"` functions, limited host C++ surface), it is the most practical path. - -CUDA 13.x also drops older GPU targets such as `sm_50` and `sm_60`, so the kernel build script targets `sm_75+`. +The Windows `gpudemod` loader searches both locations. ## Linux -Linux remains the cleanest end-to-end CUDA path: - -1. Build CUDA kernels with `nvcc` + GCC -2. Link via standard CGO/GCC flow -3. Avoid Windows toolchain mismatch entirely +Linux remains the simpler direct-link path and still avoids the Windows mixed-toolchain problem entirely. 
diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.exp b/internal/demod/gpudemod/build/gpudemod_kernels.exp new file mode 100644 index 0000000..979be64 Binary files /dev/null and b/internal/demod/gpudemod/build/gpudemod_kernels.exp differ diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib index 1776fcd..948c17b 100644 Binary files a/internal/demod/gpudemod/build/gpudemod_kernels.lib and b/internal/demod/gpudemod/build/gpudemod_kernels.lib differ diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go index 3559971..3b4a6a3 100644 --- a/internal/demod/gpudemod/gpudemod.go +++ b/internal/demod/gpudemod/gpudemod.go @@ -1,4 +1,4 @@ -//go:build cufft +//go:build cufft && !windows package gpudemod diff --git a/internal/demod/gpudemod/gpudemod_windows.go b/internal/demod/gpudemod/gpudemod_windows.go new file mode 100644 index 0000000..daae728 --- /dev/null +++ b/internal/demod/gpudemod/gpudemod_windows.go @@ -0,0 +1,525 @@ +//go:build cufft && windows + +package gpudemod + +/* +#cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" +#cgo windows LDFLAGS: -lcudart64_13 -lkernel32 +#include <cuda_runtime.h> +#include <windows.h> +#include <stdlib.h> + +typedef struct { float x; float y; } gpud_float2; + +typedef int (__stdcall *gpud_upload_fir_taps_fn)(const float* taps, int n); +typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start); +typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n); +typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps); +typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); +typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n); +typedef int (__stdcall
*gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start); + +static HMODULE gpud_mod = NULL; +static gpud_upload_fir_taps_fn gpud_p_upload_fir_taps = NULL; +static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL; +static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL; +static gpud_launch_fir_fn gpud_p_launch_fir = NULL; +static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; +static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL; +static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL; + +static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); } +static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); } +static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); } +static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); } +static int gpud_device_sync() { return (int)cudaDeviceSynchronize(); } + +static int gpud_load_library(const char* path) { + if (gpud_mod != NULL) return 0; + gpud_mod = LoadLibraryA(path); + if (gpud_mod == NULL) return -1; + gpud_p_upload_fir_taps = (gpud_upload_fir_taps_fn)GetProcAddress(gpud_mod, "gpud_upload_fir_taps_cuda"); + gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda"); + gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda"); + gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda"); + gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); + gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda"); + gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, 
"gpud_launch_ssb_product_cuda"); + if (!gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) { + FreeLibrary(gpud_mod); + gpud_mod = NULL; + return -2; + } + return 0; +} + +static int gpud_upload_fir_taps(const float* taps, int n) { + if (!gpud_p_upload_fir_taps) return -1; + return gpud_p_upload_fir_taps(taps, n); +} +static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) { + if (!gpud_p_launch_freq_shift) return -1; + return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start); +} +static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) { + if (!gpud_p_launch_fm_discrim) return -1; + return gpud_p_launch_fm_discrim(in, out, n); +} +static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) { + if (!gpud_p_launch_fir) return -1; + return gpud_p_launch_fir(in, out, n, num_taps); +} +static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { + if (!gpud_p_launch_decimate) return -1; + return gpud_p_launch_decimate(in, out, n_out, factor); +} +static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) { + if (!gpud_p_launch_am_envelope) return -1; + return gpud_p_launch_am_envelope(in, out, n); +} +static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) { + if (!gpud_p_launch_ssb_product) return -1; + return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start); +} +*/ +import "C" + +import ( + "errors" + "fmt" + "math" + "os" + "path/filepath" + "sync" + "unsafe" + + "sdr-visual-suite/internal/demod" + "sdr-visual-suite/internal/dsp" +) + +type DemodType int + +const ( + DemodNFM DemodType = iota + DemodWFM + DemodAM + DemodUSB + DemodLSB + DemodCW +) + +var loadOnce sync.Once +var loadErr error + +func ensureDLLLoaded() 
error { + loadOnce.Do(func() { + candidates := []string{} + if exe, err := os.Executable(); err == nil { + dir := filepath.Dir(exe) + candidates = append(candidates, filepath.Join(dir, "gpudemod_kernels.dll")) + } + if wd, err := os.Getwd(); err == nil { + candidates = append(candidates, + filepath.Join(wd, "gpudemod_kernels.dll"), + filepath.Join(wd, "internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"), + ) + } + seen := map[string]bool{} + for _, p := range candidates { + if p == "" || seen[p] { + continue + } + seen[p] = true + if _, err := os.Stat(p); err == nil { + cp := C.CString(p) + res := C.gpud_load_library(cp) + C.free(unsafe.Pointer(cp)) + if res == 0 { + loadErr = nil + return + } + loadErr = fmt.Errorf("failed to load gpudemod DLL: %s (code %d)", p, int(res)) + } + } + if loadErr == nil { + loadErr = errors.New("gpudemod_kernels.dll not found") + } + }) + return loadErr +} + +type Engine struct { + maxSamples int + sampleRate int + phase float64 + bfoPhase float64 + firTaps []float32 + cudaReady bool + lastShiftUsedGPU bool + lastFIRUsedGPU bool + lastDecimUsedGPU bool + lastDemodUsedGPU bool + dIQIn *C.gpud_float2 + dShifted *C.gpud_float2 + dFiltered *C.gpud_float2 + dDecimated *C.gpud_float2 + dAudio *C.float + iqBytes C.size_t + audioBytes C.size_t +} + +func Available() bool { + if ensureDLLLoaded() != nil { + return false + } + var count C.int + if C.cudaGetDeviceCount(&count) != C.cudaSuccess { + return false + } + return count > 0 +} + +func New(maxSamples int, sampleRate int) (*Engine, error) { + if maxSamples <= 0 { + return nil, errors.New("invalid maxSamples") + } + if sampleRate <= 0 { + return nil, errors.New("invalid sampleRate") + } + if err := ensureDLLLoaded(); err != nil { + return nil, err + } + if !Available() { + return nil, errors.New("cuda device not available") + } + e := &Engine{ + maxSamples: maxSamples, + sampleRate: sampleRate, + cudaReady: true, + iqBytes: C.size_t(maxSamples) * 
C.size_t(unsafe.Sizeof(C.gpud_float2{})), + audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))), + } + var ptr unsafe.Pointer + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dIQIn failed") + } + e.dIQIn = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dShifted failed") + } + e.dShifted = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dFiltered failed") + } + e.dFiltered = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dDecimated failed") + } + e.dDecimated = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dAudio failed") + } + e.dAudio = (*C.float)(ptr) + return e, nil +} + +func (e *Engine) SetFIR(taps []float32) { + if len(taps) == 0 { + e.firTaps = nil + return + } + if len(taps) > 256 { + taps = taps[:256] + } + e.firTaps = append(e.firTaps[:0], taps...) 
+ if e.cudaReady { + _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps))) + } +} + +func phaseStatus() string { return "phase1c-validated-shift" } +func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU } +func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU } + +func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) { + if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil { + return nil, false + } + bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess { + return nil, false + } + phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate) + if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]complex64, len(iq)) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess { + return nil, false + } + e.phase += phaseInc * float64(len(iq)) + return out, true +} + +func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) { + if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil { + return nil, false + } + iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]complex64, len(iq)) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess { + return nil, false + 
} + return out, true +} + +func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) { + if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil { + return nil, false + } + nOut := len(filtered) / factor + if nOut <= 0 { + return nil, false + } + iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]complex64, nOut) + outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess { + return nil, false + } + return out, true +} + +func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) { + if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil { + return nil, false + } + iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]float32, len(shifted)-1) + outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess { + return nil, false + } + return out, true +} + +func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) { + if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil { + return nil, 
false + } + iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]float32, len(shifted)) + outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess { + return nil, false + } + return out, true +} + +func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) { + if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil { + return nil, false + } + iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate) + if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]float32, len(shifted)) + outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess { + return nil, false + } + e.bfoPhase += phaseInc * float64(len(shifted)) + return out, true +} + +func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) { + if e == nil { + return nil, 0, errors.New("nil CUDA demod engine") + } + if !e.cudaReady { + return nil, 0, errors.New("cuda demod engine is not initialized") + } + if len(iq) == 0 { + return nil, 0, nil + } + if len(iq) > 
e.maxSamples { + return nil, 0, errors.New("sample count exceeds engine capacity") + } + + _ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz) + shifted, ok := e.tryCUDAFreqShift(iq, offsetHz) + e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3) + if !e.lastShiftUsedGPU { + shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz) + } + + var outRate int + switch mode { + case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW: + outRate = 48000 + case DemodWFM: + outRate = 192000 + default: + return nil, 0, errors.New("unsupported demod type") + } + + cutoff := bw / 2 + if cutoff < 200 { + cutoff = 200 + } + taps := e.firTaps + if len(taps) == 0 { + base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101) + taps = make([]float32, len(base64)) + for i, v := range base64 { + taps[i] = float32(v) + } + e.SetFIR(taps) + } + filtered, ok := e.tryCUDAFIR(shifted, len(taps)) + e.lastFIRUsedGPU = ok && ValidateFIR(shifted, taps, filtered, 1e-3) + if !e.lastFIRUsedGPU { + ftaps := make([]float64, len(taps)) + for i, v := range taps { + ftaps[i] = float64(v) + } + filtered = dsp.ApplyFIR(shifted, ftaps) + } + + decim := int(math.Round(float64(e.sampleRate) / float64(outRate))) + if decim < 1 { + decim = 1 + } + dec, ok := e.tryCUDADecimate(filtered, decim) + e.lastDecimUsedGPU = ok && ValidateDecimate(filtered, decim, dec, 1e-3) + if !e.lastDecimUsedGPU { + dec = dsp.Decimate(filtered, decim) + } + inputRate := e.sampleRate / decim + + e.lastDemodUsedGPU = false + switch mode { + case DemodNFM: + if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.NFM{}.Demod(dec, inputRate), inputRate, nil + case DemodWFM: + if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.WFM{}.Demod(dec, inputRate), inputRate, nil + case DemodAM: + if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok { + e.lastDemodUsedGPU = true 
+ return gpuAudio, inputRate, nil + } + return demod.AM{}.Demod(dec, inputRate), inputRate, nil + case DemodUSB: + if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.USB{}.Demod(dec, inputRate), inputRate, nil + case DemodLSB: + if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.LSB{}.Demod(dec, inputRate), inputRate, nil + case DemodCW: + if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.CW{}.Demod(dec, inputRate), inputRate, nil + default: + return nil, 0, errors.New("unsupported demod type") + } +} + +func (e *Engine) Close() { + if e == nil { + return + } + if e.dIQIn != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn)) + e.dIQIn = nil + } + if e.dShifted != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted)) + e.dShifted = nil + } + if e.dFiltered != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered)) + e.dFiltered = nil + } + if e.dDecimated != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated)) + e.dDecimated = nil + } + if e.dAudio != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio)) + e.dAudio = nil + } + e.firTaps = nil + e.cudaReady = false +} diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu new file mode 100644 index 0000000..1278e61 --- /dev/null +++ b/internal/demod/gpudemod/native/exports.cu @@ -0,0 +1,190 @@ +#include <cuda_runtime.h> +#include <math.h> + +#if defined(_WIN32) +#define GPUD_API extern "C" __declspec(dllexport) +#define GPUD_CALL __stdcall +#else +#define GPUD_API extern "C" +#define GPUD_CALL +#endif + +GPUD_API __global__ void gpud_freq_shift_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n, + double phase_inc, + double phase_start +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx
>= n) return; + + double phase = phase_start + phase_inc * (double)idx; + float si, co; + sincosf((float)phase, &si, &co); + + float2 v = in[idx]; + out[idx].x = v.x * co - v.y * si; + out[idx].y = v.x * si + v.y * co; +} + +GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda( + const float2* in, + float2* out, + int n, + double phase_inc, + double phase_start +) { + if (n <= 0) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_freq_shift_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_fm_discrim_kernel( + const float2* __restrict__ in, + float* __restrict__ out, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n - 1) return; + + float2 prev = in[idx]; + float2 curr = in[idx + 1]; + float re = prev.x * curr.x + prev.y * curr.y; + float im = prev.x * curr.y - prev.y * curr.x; + out[idx] = atan2f(im, re); +} + +GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda( + const float2* in, + float* out, + int n +) { + if (n <= 1) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_decimate_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n_out, + int factor +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n_out) return; + out[idx] = in[idx * factor]; +} + +__device__ __constant__ float gpud_fir_taps[256]; + +GPUD_API __global__ void gpud_fir_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n, + int num_taps +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + float acc_r = 0.0f; + float acc_i = 0.0f; + for (int k = 0; k < num_taps; ++k) { + int src = idx - k; + if (src < 0) break; + float2 v = in[src]; + float t = gpud_fir_taps[k]; + acc_r += v.x * t; + acc_i += v.y * t; + } + out[idx] = make_float2(acc_r,
acc_i); +} + +GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) { + if (!taps || n <= 0 || n > 256) return -1; + cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float)); + return (int)err; +} + +GPUD_API int GPUD_CALL gpud_launch_fir_cuda( + const float2* in, + float2* out, + int n, + int num_taps +) { + if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_fir_kernel<<<grid, block>>>(in, out, n, num_taps); + return (int)cudaGetLastError(); +} + +GPUD_API int GPUD_CALL gpud_launch_decimate_cuda( + const float2* in, + float2* out, + int n_out, + int factor +) { + if (n_out <= 0 || factor <= 0) return 0; + const int block = 256; + const int grid = (n_out + block - 1) / block; + gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_am_envelope_kernel( + const float2* __restrict__ in, + float* __restrict__ out, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + float2 v = in[idx]; + out[idx] = sqrtf(v.x * v.x + v.y * v.y); +} + +GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda( + const float2* in, + float* out, + int n +) { + if (n <= 0) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_am_envelope_kernel<<<grid, block>>>(in, out, n); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_ssb_product_kernel( + const float2* __restrict__ in, + float* __restrict__ out, + int n, + double phase_inc, + double phase_start +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + double phase = phase_start + phase_inc * (double)idx; + float si, co; + sincosf((float)phase, &si, &co); + float2 v = in[idx]; + out[idx] = v.x * co - v.y * si; +} + +GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda( + const float2* in, + float* out, + int n, + double phase_inc, + double phase_start +) { + if (n
<= 0) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start); + return (int)cudaGetLastError(); +}