From 8d0bea214ce2c4d55b4c510fe668e81dff9b96c7 Mon Sep 17 00:00:00 2001 From: Jan Svabenik Date: Thu, 19 Mar 2026 11:07:39 +0100 Subject: [PATCH] Attempt MinGW-host CUDA build path for Windows --- README.md | 25 ++++--- build-cuda-windows.ps1 | 15 ++--- build-sdrplay.ps1 | 40 ++++------- docs/build-cuda.md | 64 ++++++++++-------- internal/demod/gpudemod/gpudemod.go | 10 +-- tools/build-gpudemod-kernel.ps1 | 100 +++++++++++++--------------- 6 files changed, 120 insertions(+), 134 deletions(-) diff --git a/README.md b/README.md index 50f544b..73bd73c 100644 --- a/README.md +++ b/README.md @@ -33,19 +33,24 @@ go build -tags sdrplay ./cmd/sdrd .\sdrd.exe -config config.yaml ``` -#### Windows (GPU / CUDA status) -Windows CUDA support in this repository is currently split into separate steps: +#### Windows (GPU / CUDA + SDRplay) +Recommended build path: -- `build-windows-default.ps1` → reliable default Windows app build -- `build-cuda-windows.ps1` → builds CUDA kernel artifacts (`kernels.obj`, `gpudemod_kernels.lib`) -- `build-windows-cuda-app.ps1` → experimental full Windows CUDA app build path +```powershell +powershell -ExecutionPolicy Bypass -File .\build-cuda-windows.ps1 +powershell -ExecutionPolicy Bypass -File .\build-sdrplay.ps1 +``` -Important: -- the original invalid `#cgo LDFLAGS` CUDA integration issue has been fixed -- CUDA kernel artifact preparation works on Jan's machine -- a full end-to-end Windows CUDA app build is still blocked by Go/CGO + Windows toolchain behavior (see `docs/build-cuda.md` and `docs/windows-cgo-msvc-note.md`) +This path uses: +- MinGW GCC/G++ for the Go/CGO toolchain +- `nvcc` with MinGW `g++` as the host compiler for `gpudemod` kernels +- MinGW-compatible CUDA import libs from `cuda-mingw/` -Use the scripts above instead of the older manual one-liner. 
+Important: +- the kernel archive is generated as `internal/demod/gpudemod/build/libgpudemod_kernels.a` +- `-lstdc++` is linked explicitly for CUDA host-side C++ runtime references +- CUDA 13.x no longer supports older targets like `sm_50`/`sm_60`, so the script builds for `sm_75+` +- if `nvcc` is missing, CUDA kernel preparation will fail ### Linux ```bash diff --git a/build-cuda-windows.ps1 b/build-cuda-windows.ps1 index 2294d21..fb2f2e3 100644 --- a/build-cuda-windows.ps1 +++ b/build-cuda-windows.ps1 @@ -1,8 +1,8 @@ $ErrorActionPreference = 'Stop' -$msvcCl = 'C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.29.30133\bin\Hostx64\x64' -if (-not (Test-Path (Join-Path $msvcCl 'cl.exe'))) { - throw "cl.exe not found at $msvcCl" +$mingw = 'C:\msys64\mingw64\bin' +if (-not (Test-Path (Join-Path $mingw 'g++.exe'))) { + throw "MinGW g++ not found at $mingw" } $cudaBin = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin' @@ -10,11 +10,10 @@ if (-not (Test-Path (Join-Path $cudaBin 'nvcc.exe'))) { throw "nvcc.exe not found at $cudaBin" } -$env:PATH = "$msvcCl;$cudaBin;" + $env:PATH +$env:PATH = "$mingw;$cudaBin;" + $env:PATH -Write-Host "Building CUDA kernel artifacts for Windows..." -ForegroundColor Cyan +Write-Host 'Preparing Windows CUDA environment for gpudemod (MinGW host compiler)...' -ForegroundColor Cyan powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -if ($LASTEXITCODE -ne 0) { throw "kernel build failed" } +if ($LASTEXITCODE -ne 0) { throw 'kernel build failed' } -Write-Host "Done. Kernel artifacts prepared." -ForegroundColor Green -Write-Host "Note: final full-app linking may still require an MSVC-compatible CGO/link strategy, not the current MinGW flow." -ForegroundColor Yellow +Write-Host 'Done. GNU-compatible gpudemod kernel library prepared.' 
-ForegroundColor Green diff --git a/build-sdrplay.ps1 b/build-sdrplay.ps1 index d2e4e88..6f8e56b 100644 --- a/build-sdrplay.ps1 +++ b/build-sdrplay.ps1 @@ -3,26 +3,23 @@ $gcc = 'C:\msys64\mingw64\bin' if (-not (Test-Path (Join-Path $gcc 'gcc.exe'))) { throw "gcc not found at $gcc" } -$msvcCl = 'C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Tools\MSVC\14.29.30133\bin\Hostx64\x64' -if (-not (Test-Path (Join-Path $msvcCl 'cl.exe'))) { - throw "cl.exe not found at $msvcCl" +if (-not (Test-Path (Join-Path $gcc 'g++.exe'))) { + throw "g++ not found at $gcc" } -$env:PATH = "$gcc;$msvcCl;" + $env:PATH +$env:PATH = "$gcc;" + $env:PATH $env:CGO_ENABLED = '1' +$env:CC = 'gcc' +$env:CXX = 'g++' # SDRplay $env:CGO_CFLAGS = '-IC:\PROGRA~1\SDRplay\API\inc' $env:CGO_LDFLAGS = '-LC:\PROGRA~1\SDRplay\API\x64 -lsdrplay_api' # CUDA (cuFFT) -# Prefer C:\CUDA if present (no spaces) $cudaInc = 'C:\CUDA\include' -$cudaLib = 'C:\CUDA\lib\x64' $cudaBin = 'C:\CUDA\bin' - if (-not (Test-Path $cudaInc)) { $cudaInc = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include' - $cudaLib = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\lib\x64' $cudaBin = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin' } if (Test-Path $cudaInc) { @@ -33,30 +30,21 @@ if (Test-Path $cudaBin) { } $cudaMingw = Join-Path $PSScriptRoot 'cuda-mingw' +$gpuDemodBuild = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build' if (Test-Path $cudaMingw) { - # Use MinGW import libs to avoid MSVC .lib linking issues $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw" -} elseif (Test-Path $cudaLib) { - # Fallback to CUDA lib path (requires compatible toolchain) - $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaLib -lcufft -lcudart" } - -Write-Host "Building with SDRplay + cuFFT support..." -ForegroundColor Cyan -Write-Host "WARNING: this path still performs final Go linking through MinGW GCC." 
-ForegroundColor Yellow -Write-Host "If CUDA kernel artifacts are MSVC-built, final link may fail due to mixed toolchains." -ForegroundColor Yellow -Write-Host "Use build-cuda-windows.ps1 for CUDA artifact prep; use this script for the current MinGW-oriented app build path." -ForegroundColor Yellow - -$gccHost = Join-Path $gcc 'g++.exe' -if (!(Test-Path $gccHost)) { - throw "g++.exe not found at $gccHost" +if (Test-Path $gpuDemodBuild) { + $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$gpuDemodBuild" } +$env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -lgpudemod_kernels -lcufft64_12 -lcudart64_13 -lstdc++" -# Kernel build currently relies on nvcc + MSVC host compiler availability. +Write-Host 'Building with SDRplay + cuFFT support (MinGW-host CUDA path)...' -ForegroundColor Cyan +Write-Host 'Preparing GNU-compatible CUDA kernel artifacts...' -ForegroundColor Cyan powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -if ($LASTEXITCODE -ne 0) { throw "kernel build failed" } +if ($LASTEXITCODE -ne 0) { throw 'kernel build failed' } go build -tags "sdrplay,cufft" ./cmd/sdrd +if ($LASTEXITCODE -ne 0) { throw 'build failed' } -if ($LASTEXITCODE -ne 0) { throw "build failed" } - -Write-Host "Done." -ForegroundColor Green +Write-Host 'Done.' -ForegroundColor Green diff --git a/docs/build-cuda.md b/docs/build-cuda.md index 66e994a..21c53d4 100644 --- a/docs/build-cuda.md +++ b/docs/build-cuda.md @@ -1,47 +1,55 @@ # CUDA Build Strategy -## Problem statement +## Windows: MinGW-host NVCC path -The repository currently mixes two Windows toolchain worlds: +The recommended Windows CUDA build path for this repository is: -- Go/CGO final link often goes through MinGW GCC/LD -- CUDA kernel compilation via `nvcc` on Windows prefers MSVC (`cl.exe`) +1. Compile `internal/demod/gpudemod/kernels.cu` with `nvcc` using MinGW `g++` as the host compiler +2. Archive the result as `internal/demod/gpudemod/build/libgpudemod_kernels.a` +3. 
Build the Go app with MinGW GCC/G++ via CGO -This works for isolated package tests, but full application builds can fail when an MSVC-built CUDA library is linked by MinGW, producing unresolved symbols such as: +This keeps the CUDA demod kernel library in a GNU-compatible format so Go's MinGW CGO linker can consume it. +### Why + +The previous failing path mixed: +- `nvcc` + default MSVC host compiler (`cl.exe`) for CUDA kernels +- MinGW GCC/LD for the final Go/CGO link + +That produced unresolved MSVC runtime symbols such as: - `__GSHandlerCheck` - `__security_cookie` - `_Init_thread_epoch` -## Recommended split +### Current Windows build flow -### Windows +```powershell +powershell -ExecutionPolicy Bypass -File .\build-cuda-windows.ps1 +powershell -ExecutionPolicy Bypass -File .\build-sdrplay.ps1 +``` -Use an explicitly Windows-oriented build path: +### Critical details -1. Prepare CUDA kernel artifacts with `nvcc` -2. Keep the resulting CUDA linkage path clearly separated from MinGW-based fallback builds -3. Do not assume that a MinGW-linked Go binary can always consume MSVC-built CUDA archives +- CUDA kernel archive must be named `libgpudemod_kernels.a` +- `nvcc` must be invoked with `-ccbin C:\msys64\mingw64\bin\g++.exe` +- Windows CGO link uses: + - SDRplay API import lib + - MinGW CUDA import libs from `cuda-mingw/` + - `-lgpudemod_kernels` + - `-lcufft64_12` + - `-lcudart64_13` + - `-lstdc++` -### Linux +### Caveat -Prefer a GCC/NVCC-oriented build path: +`nvcc` + MinGW on Windows is not officially supported by NVIDIA. For the kernel launcher style used here (`extern "C"` functions, limited host C++ surface), it is the most practical path. -1. Build CUDA kernels with `nvcc` + GCC -2. Link through the normal Linux CGO flow -3. Avoid Windows-specific import-lib and MSVC runtime assumptions entirely +CUDA 13.x also drops older GPU targets such as `sm_50` and `sm_60`, so the kernel build script targets `sm_75+`. 
-## Repository design guidance +## Linux -- Keep `internal/demod/gpudemod/` platform-neutral at the Go API level -- Keep CUDA kernels in `kernels.cu` -- Use OS-specific build scripts for orchestration -- Avoid embedding Windows-only build assumptions into shared Go code when possible +Linux remains the cleanest end-to-end CUDA path: -## Current practical status - -- `go test ./...` passes -- `go test -tags cufft ./internal/demod/gpudemod` passes with NVCC/MSVC setup -- `build-sdrplay.ps1` has progressed past the original invalid `#cgo LDFLAGS` issue -- Remaining Windows blocker in the default path is a toolchain mismatch between MSVC-built CUDA artifacts and MinGW final linking -- Experimental full-MSVC CGO path (`build-windows-cuda-app.ps1`) also currently blocks because even `go build runtime/cgo` emits GCC-style flags (`-Wall`, `-Werror`, `-fno-stack-protector`) that `cl.exe` rejects in this environment; this is a toolchain/Go integration issue, not a project-specific one +1. Build CUDA kernels with `nvcc` + GCC +2. Link via standard CGO/GCC flow +3. 
Avoid Windows toolchain mismatch entirely diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go index f42cd2b..3559971 100644 --- a/internal/demod/gpudemod/gpudemod.go +++ b/internal/demod/gpudemod/gpudemod.go @@ -3,7 +3,7 @@ package gpudemod /* -#cgo windows LDFLAGS: -L${SRCDIR}/../../../cuda-mingw -L${SRCDIR}/build -lgpudemod_kernels -lcufft64_12 -lcudart64_13 +#cgo windows LDFLAGS: -L${SRCDIR}/../../../cuda-mingw -L${SRCDIR}/build -lgpudemod_kernels -lcufft64_12 -lcudart64_13 -lstdc++ #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" #include #include @@ -182,12 +182,8 @@ func (e *Engine) SetFIR(taps []float32) { } func phaseStatus() string { return "phase1c-validated-shift" } -func (e *Engine) LastShiftUsedGPU() bool { - return e != nil && e.lastShiftUsedGPU -} -func (e *Engine) LastDemodUsedGPU() bool { - return e != nil && e.lastDemodUsedGPU -} +func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU } +func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU } func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) { if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil { diff --git a/tools/build-gpudemod-kernel.ps1 b/tools/build-gpudemod-kernel.ps1 index f09276f..dd0a329 100644 --- a/tools/build-gpudemod-kernel.ps1 +++ b/tools/build-gpudemod-kernel.ps1 @@ -1,63 +1,53 @@ -param( - [string]$CudaRoot = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2', - [string]$Source = 'internal/demod/gpudemod/kernels.cu', - [string]$OutDir = 'internal/demod/gpudemod/build' -) - $ErrorActionPreference = 'Stop' -$repo = Split-Path -Parent $PSScriptRoot -Set-Location $repo -$nvcc = Join-Path $CudaRoot 'bin\nvcc.exe' -if (!(Test-Path $nvcc)) { - throw "nvcc not found at $nvcc" +$nvcc = (Get-Command nvcc -ErrorAction SilentlyContinue).Path +if (-not $nvcc) { + $nvcc = 'C:\Program 
Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin\nvcc.exe' } - -New-Item -ItemType Directory -Force -Path $OutDir | Out-Null -$outObj = Join-Path $OutDir 'kernels.obj' -$outLib = Join-Path $OutDir 'gpudemod_kernels.lib' -if (Test-Path $outObj) { Remove-Item $outObj -Force } -if (Test-Path $outLib) { Remove-Item $outLib -Force } - -Write-Host "Using nvcc: $nvcc" -Write-Host "Building $Source -> $outObj" - -$nvccArgs = @('-c', $Source, '-o', $outObj, '-I', (Join-Path $CudaRoot 'include')) -if ($HostCompiler) { - Write-Host "Using host compiler: $HostCompiler" - $hostDir = Split-Path -Parent $HostCompiler - $nvccArgs += @('-ccbin', $hostDir) -} else { - $nvccArgs += @('-Xcompiler', '/EHsc') +if (-not (Test-Path $nvcc)) { + Write-Host 'nvcc not found — skipping kernel build' -ForegroundColor Yellow + exit 0 } -& $nvcc @nvccArgs -if ($LASTEXITCODE -ne 0) { - throw "nvcc failed with exit code $LASTEXITCODE" +$mingwRoot = 'C:\msys64\mingw64\bin' +$mingwGpp = Join-Path $mingwRoot 'g++.exe' +$ar = Join-Path $mingwRoot 'ar.exe' +if (-not (Test-Path $mingwGpp)) { + throw 'MinGW g++ not found' } - -if ($HostCompiler) { - $ar = Get-Command ar.exe -ErrorAction SilentlyContinue - if (-not $ar) { - throw "ar.exe not found in PATH; required for MinGW-compatible archive" - } - Write-Host "Archiving $outObj -> $outLib with ar.exe" - if (Test-Path $outLib) { Remove-Item $outLib -Force } - & $ar 'rcs' $outLib $outObj - if ($LASTEXITCODE -ne 0) { - throw "ar.exe failed with exit code $LASTEXITCODE" - } -} else { - $libexe = Get-Command lib.exe -ErrorAction SilentlyContinue - if (-not $libexe) { - throw "lib.exe not found in PATH; run from vcvars64.bat environment" - } - Write-Host "Archiving $outObj -> $outLib with lib.exe" - & $libexe /nologo /OUT:$outLib $outObj - if ($LASTEXITCODE -ne 0) { - throw "lib.exe failed with exit code $LASTEXITCODE" - } +if (-not (Test-Path $ar)) { + throw 'MinGW ar not found' } -Write-Host "Built: $outObj" -Write-Host "Archived: $outLib" +$kernelSrc = 
Join-Path $PSScriptRoot '..\internal\demod\gpudemod\kernels.cu' +$buildDir = Join-Path $PSScriptRoot '..\internal\demod\gpudemod\build' +if (-not (Test-Path $buildDir)) { New-Item -ItemType Directory -Path $buildDir | Out-Null } + +$objFile = Join-Path $buildDir 'kernels.o' +$libFile = Join-Path $buildDir 'libgpudemod_kernels.a' +$legacyLib = Join-Path $buildDir 'gpudemod_kernels.lib' + +if (Test-Path $objFile) { Remove-Item $objFile -Force } +if (Test-Path $libFile) { Remove-Item $libFile -Force } +if (Test-Path $legacyLib) { Remove-Item $legacyLib -Force } + +Write-Host 'Compiling CUDA kernels with MinGW host...' -ForegroundColor Cyan +& $nvcc -ccbin $mingwGpp -c $kernelSrc -o $objFile ` + --compiler-options=-fno-exceptions ` + -gencode arch=compute_75,code=sm_75 ` + -gencode arch=compute_80,code=sm_80 ` + -gencode arch=compute_86,code=sm_86 ` + -gencode arch=compute_87,code=sm_87 ` + -gencode arch=compute_89,code=sm_89 ` + -gencode arch=compute_90,code=sm_90 + +if ($LASTEXITCODE -ne 0) { throw 'nvcc compilation failed' } + +Write-Host 'Archiving GNU-compatible CUDA kernel library...' -ForegroundColor Cyan +& $ar rcs $libFile $objFile +if ($LASTEXITCODE -ne 0) { throw 'ar archive failed' } + +Write-Host "Kernel object: $objFile" -ForegroundColor Green +Write-Host "Kernel library: $libFile" -ForegroundColor Green