diff --git a/README.md b/README.md index 73bd73c..9466dc5 100644 --- a/README.md +++ b/README.md @@ -42,15 +42,14 @@ powershell -ExecutionPolicy Bypass -File .\build-sdrplay.ps1 ``` This path uses: -- MinGW GCC/G++ for the Go/CGO toolchain -- `nvcc` with MinGW `g++` as the host compiler for `gpudemod` kernels -- MinGW-compatible CUDA import libs from `cuda-mingw/` +- `nvcc` + MSVC to build `gpudemod_kernels.dll` +- MinGW GCC/G++ for the Go/CGO application build +- runtime DLL loading for the Windows `gpudemod` path Important: -- the kernel archive is generated as `internal/demod/gpudemod/build/libgpudemod_kernels.a` -- `-lstdc++` is linked explicitly for CUDA host-side C++ runtime references -- CUDA 13.x no longer supports older targets like `sm_50`/`sm_60`, so the script builds for `sm_75+` -- if `nvcc` is missing, CUDA kernel preparation will fail +- `gpudemod_kernels.dll` must be present next to `sdrd.exe` or in `internal/demod/gpudemod/build/` +- `build-sdrplay.ps1` copies the DLL to the repo root after a successful app build when available +- this avoids directly linking MSVC CUDA kernel objects into the MinGW-linked Go binary ### Linux ```bash diff --git a/build-cuda-windows.ps1 b/build-cuda-windows.ps1 index fb2f2e3..be408ef 100644 --- a/build-cuda-windows.ps1 +++ b/build-cuda-windows.ps1 @@ -1,19 +1,6 @@ $ErrorActionPreference = 'Stop' -$mingw = 'C:\msys64\mingw64\bin' -if (-not (Test-Path (Join-Path $mingw 'g++.exe'))) { - throw "MinGW g++ not found at $mingw" -} - -$cudaBin = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin' -if (-not (Test-Path (Join-Path $cudaBin 'nvcc.exe'))) { - throw "nvcc.exe not found at $cudaBin" -} - -$env:PATH = "$mingw;$cudaBin;" + $env:PATH - -Write-Host 'Preparing Windows CUDA environment for gpudemod (MinGW host compiler)...' -ForegroundColor Cyan -powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -if ($LASTEXITCODE -ne 0) { throw 'kernel build failed' } - -Write-Host 'Done. 
GNU-compatible gpudemod kernel library prepared.' -ForegroundColor Green +Write-Host 'Preparing Windows CUDA DLL for gpudemod (MSVC/nvcc path)...' -ForegroundColor Cyan +powershell -ExecutionPolicy Bypass -File .\build-gpudemod-dll.ps1 +if ($LASTEXITCODE -ne 0) { throw 'gpudemod DLL build failed' } +Write-Host 'Done. gpudemod_kernels.dll is ready.' -ForegroundColor Green diff --git a/build-gpudemod-dll.ps1 b/build-gpudemod-dll.ps1 new file mode 100644 index 0000000..47f021c --- /dev/null +++ b/build-gpudemod-dll.ps1 @@ -0,0 +1,27 @@ +$ErrorActionPreference = 'Stop' + +$vcvars = 'C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat' +$cudaRoot = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2' +$nvcc = Join-Path $cudaRoot 'bin\nvcc.exe' +$src = Join-Path $PSScriptRoot 'internal\demod\gpudemod\native\exports.cu' +$outDir = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build' +$dll = Join-Path $outDir 'gpudemod_kernels.dll' +$lib = Join-Path $outDir 'gpudemod_kernels.lib' +$exp = Join-Path $outDir 'gpudemod_kernels.exp' + +if (!(Test-Path $vcvars)) { throw "vcvars64.bat not found at $vcvars" } +if (!(Test-Path $nvcc)) { throw "nvcc.exe not found at $nvcc" } +if (!(Test-Path $src)) { throw "CUDA source not found at $src" } +if (!(Test-Path $outDir)) { New-Item -ItemType Directory -Path $outDir | Out-Null } + +Remove-Item $dll,$lib,$exp -Force -ErrorAction SilentlyContinue + +$cmd = @" +call "$vcvars" && "$nvcc" -shared "$src" -o "$dll" -Xcompiler "/MD" -arch=sm_75 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 +"@ + +Write-Host 'Building gpudemod CUDA DLL...' 
-ForegroundColor Cyan +cmd.exe /c $cmd +if ($LASTEXITCODE -ne 0) { throw 'gpudemod DLL build failed' } + +Write-Host "Built: $dll" -ForegroundColor Green diff --git a/build-sdrplay.ps1 b/build-sdrplay.ps1 index 6f8e56b..a731d63 100644 --- a/build-sdrplay.ps1 +++ b/build-sdrplay.ps1 @@ -15,36 +15,27 @@ $env:CXX = 'g++' $env:CGO_CFLAGS = '-IC:\PROGRA~1\SDRplay\API\inc' $env:CGO_LDFLAGS = '-LC:\PROGRA~1\SDRplay\API\x64 -lsdrplay_api' -# CUDA (cuFFT) +# CUDA runtime / cuFFT $cudaInc = 'C:\CUDA\include' $cudaBin = 'C:\CUDA\bin' -if (-not (Test-Path $cudaInc)) { - $cudaInc = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include' - $cudaBin = 'C:\PROGRA~1\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin' -} -if (Test-Path $cudaInc) { - $env:CGO_CFLAGS = "$env:CGO_CFLAGS -I$cudaInc" -} -if (Test-Path $cudaBin) { - $env:PATH = "$cudaBin;" + $env:PATH -} - +if (-not (Test-Path $cudaInc)) { $cudaInc = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\include' } +if (-not (Test-Path $cudaBin)) { $cudaBin = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\bin' } $cudaMingw = Join-Path $PSScriptRoot 'cuda-mingw' -$gpuDemodBuild = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build' -if (Test-Path $cudaMingw) { - $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw" -} -if (Test-Path $gpuDemodBuild) { - $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$gpuDemodBuild" -} -$env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -lgpudemod_kernels -lcufft64_12 -lcudart64_13 -lstdc++" - -Write-Host 'Building with SDRplay + cuFFT support (MinGW-host CUDA path)...' -ForegroundColor Cyan -Write-Host 'Preparing GNU-compatible CUDA kernel artifacts...' 
-ForegroundColor Cyan -powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -if ($LASTEXITCODE -ne 0) { throw 'kernel build failed' } +if (Test-Path $cudaInc) { $env:CGO_CFLAGS = "$env:CGO_CFLAGS -I$cudaInc" } +if (Test-Path $cudaBin) { $env:PATH = "$cudaBin;" + $env:PATH } +if (Test-Path $cudaMingw) { $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw -lcudart64_13 -lcufft64_12 -lkernel32" } +Write-Host 'Building SDRplay + cuFFT app (Windows DLL path)...' -ForegroundColor Cyan go build -tags "sdrplay,cufft" ./cmd/sdrd if ($LASTEXITCODE -ne 0) { throw 'build failed' } +$dllSrc = Join-Path $PSScriptRoot 'internal\demod\gpudemod\build\gpudemod_kernels.dll' +$dllDst = Join-Path $PSScriptRoot 'gpudemod_kernels.dll' +if (Test-Path $dllSrc) { + Copy-Item $dllSrc $dllDst -Force + Write-Host "Copied DLL to $dllDst" -ForegroundColor Green +} else { + Write-Host 'WARNING: gpudemod_kernels.dll not found; build succeeded but runtime GPU demod will not load.' -ForegroundColor Yellow +} + Write-Host 'Done.' -ForegroundColor Green diff --git a/docs/build-cuda.md b/docs/build-cuda.md index 21c53d4..938bc73 100644 --- a/docs/build-cuda.md +++ b/docs/build-cuda.md @@ -1,55 +1,38 @@ # CUDA Build Strategy -## Windows: MinGW-host NVCC path +## Windows: gpudemod DLL split -The recommended Windows CUDA build path for this repository is: +The recommended Windows CUDA path is now a DLL split for `gpudemod`: -1. Compile `internal/demod/gpudemod/kernels.cu` with `nvcc` using MinGW `g++` as the host compiler -2. Archive the result as `internal/demod/gpudemod/build/libgpudemod_kernels.a` -3. Build the Go app with MinGW GCC/G++ via CGO +1. Build `internal/demod/gpudemod/native/exports.cu` into `gpudemod_kernels.dll` using `nvcc` + MSVC +2. Build the Go app with MinGW GCC/G++ via CGO +3. Load `gpudemod_kernels.dll` at runtime from Go on Windows -This keeps the CUDA demod kernel library in a GNU-compatible format so Go's MinGW CGO linker can consume it. 
+This avoids direct static linking of MSVC-built CUDA objects into the MinGW-linked Go binary. -### Why +## Why -The previous failing path mixed: -- `nvcc` + default MSVC host compiler (`cl.exe`) for CUDA kernels -- MinGW GCC/LD for the final Go/CGO link +The previous failing paths mixed incompatible toolchains at final link time: +- MSVC-host CUDA object/library generation +- MinGW GCC/LD for the Go executable -That produced unresolved MSVC runtime symbols such as: -- `__GSHandlerCheck` -- `__security_cookie` -- `_Init_thread_epoch` +The DLL split keeps that boundary at runtime instead of link time. -### Current Windows build flow +## Current Windows build flow ```powershell powershell -ExecutionPolicy Bypass -File .\build-cuda-windows.ps1 powershell -ExecutionPolicy Bypass -File .\build-sdrplay.ps1 ``` -### Critical details +## Runtime expectation -- CUDA kernel archive must be named `libgpudemod_kernels.a` -- `nvcc` must be invoked with `-ccbin C:\msys64\mingw64\bin\g++.exe` -- Windows CGO link uses: - - SDRplay API import lib - - MinGW CUDA import libs from `cuda-mingw/` - - `-lgpudemod_kernels` - - `-lcufft64_12` - - `-lcudart64_13` - - `-lstdc++` +`gpudemod_kernels.dll` must be available either: +- next to `sdrd.exe`, or +- in `internal/demod/gpudemod/build/` during local runs from the repo -### Caveat - -`nvcc` + MinGW on Windows is not officially supported by NVIDIA. For the kernel launcher style used here (`extern "C"` functions, limited host C++ surface), it is the most practical path. - -CUDA 13.x also drops older GPU targets such as `sm_50` and `sm_60`, so the kernel build script targets `sm_75+`. +The Windows `gpudemod` loader searches both locations. ## Linux -Linux remains the cleanest end-to-end CUDA path: - -1. Build CUDA kernels with `nvcc` + GCC -2. Link via standard CGO/GCC flow -3. Avoid Windows toolchain mismatch entirely +Linux remains the simpler direct-link path and still avoids the Windows mixed-toolchain problem entirely. 
diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.exp b/internal/demod/gpudemod/build/gpudemod_kernels.exp new file mode 100644 index 0000000..979be64 Binary files /dev/null and b/internal/demod/gpudemod/build/gpudemod_kernels.exp differ diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib index 1776fcd..948c17b 100644 Binary files a/internal/demod/gpudemod/build/gpudemod_kernels.lib and b/internal/demod/gpudemod/build/gpudemod_kernels.lib differ diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go index 3559971..3b4a6a3 100644 --- a/internal/demod/gpudemod/gpudemod.go +++ b/internal/demod/gpudemod/gpudemod.go @@ -1,4 +1,4 @@ -//go:build cufft +//go:build cufft && !windows package gpudemod diff --git a/internal/demod/gpudemod/gpudemod_windows.go b/internal/demod/gpudemod/gpudemod_windows.go new file mode 100644 index 0000000..daae728 --- /dev/null +++ b/internal/demod/gpudemod/gpudemod_windows.go @@ -0,0 +1,525 @@ +//go:build cufft && windows + +package gpudemod + +/* +#cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" +#cgo windows LDFLAGS: -lcudart64_13 -lkernel32 +#include <cuda_runtime.h> +#include <windows.h> +#include <stdlib.h> + +typedef struct { float x; float y; } gpud_float2; + +typedef int (__stdcall *gpud_upload_fir_taps_fn)(const float* taps, int n); +typedef int (__stdcall *gpud_launch_freq_shift_fn)(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start); +typedef int (__stdcall *gpud_launch_fm_discrim_fn)(const gpud_float2* in, float* out, int n); +typedef int (__stdcall *gpud_launch_fir_fn)(const gpud_float2* in, gpud_float2* out, int n, int num_taps); +typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); +typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n); +typedef int (__stdcall
*gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start); + +static HMODULE gpud_mod = NULL; +static gpud_upload_fir_taps_fn gpud_p_upload_fir_taps = NULL; +static gpud_launch_freq_shift_fn gpud_p_launch_freq_shift = NULL; +static gpud_launch_fm_discrim_fn gpud_p_launch_fm_discrim = NULL; +static gpud_launch_fir_fn gpud_p_launch_fir = NULL; +static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; +static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL; +static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL; + +static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); } +static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); } +static int gpud_memcpy_h2d(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); } +static int gpud_memcpy_d2h(void *dst, const void *src, size_t bytes) { return (int)cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost); } +static int gpud_device_sync() { return (int)cudaDeviceSynchronize(); } + +static int gpud_load_library(const char* path) { + if (gpud_mod != NULL) return 0; + gpud_mod = LoadLibraryA(path); + if (gpud_mod == NULL) return -1; + gpud_p_upload_fir_taps = (gpud_upload_fir_taps_fn)GetProcAddress(gpud_mod, "gpud_upload_fir_taps_cuda"); + gpud_p_launch_freq_shift = (gpud_launch_freq_shift_fn)GetProcAddress(gpud_mod, "gpud_launch_freq_shift_cuda"); + gpud_p_launch_fm_discrim = (gpud_launch_fm_discrim_fn)GetProcAddress(gpud_mod, "gpud_launch_fm_discrim_cuda"); + gpud_p_launch_fir = (gpud_launch_fir_fn)GetProcAddress(gpud_mod, "gpud_launch_fir_cuda"); + gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); + gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda"); + gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, 
"gpud_launch_ssb_product_cuda"); + if (!gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) { + FreeLibrary(gpud_mod); + gpud_mod = NULL; + return -2; + } + return 0; +} + +static int gpud_upload_fir_taps(const float* taps, int n) { + if (!gpud_p_upload_fir_taps) return -1; + return gpud_p_upload_fir_taps(taps, n); +} +static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) { + if (!gpud_p_launch_freq_shift) return -1; + return gpud_p_launch_freq_shift(in, out, n, phase_inc, phase_start); +} +static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) { + if (!gpud_p_launch_fm_discrim) return -1; + return gpud_p_launch_fm_discrim(in, out, n); +} +static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) { + if (!gpud_p_launch_fir) return -1; + return gpud_p_launch_fir(in, out, n, num_taps); +} +static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { + if (!gpud_p_launch_decimate) return -1; + return gpud_p_launch_decimate(in, out, n_out, factor); +} +static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) { + if (!gpud_p_launch_am_envelope) return -1; + return gpud_p_launch_am_envelope(in, out, n); +} +static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) { + if (!gpud_p_launch_ssb_product) return -1; + return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start); +} +*/ +import "C" + +import ( + "errors" + "fmt" + "math" + "os" + "path/filepath" + "sync" + "unsafe" + + "sdr-visual-suite/internal/demod" + "sdr-visual-suite/internal/dsp" +) + +type DemodType int + +const ( + DemodNFM DemodType = iota + DemodWFM + DemodAM + DemodUSB + DemodLSB + DemodCW +) + +var loadOnce sync.Once +var loadErr error + +func ensureDLLLoaded() 
error { + loadOnce.Do(func() { + candidates := []string{} + if exe, err := os.Executable(); err == nil { + dir := filepath.Dir(exe) + candidates = append(candidates, filepath.Join(dir, "gpudemod_kernels.dll")) + } + if wd, err := os.Getwd(); err == nil { + candidates = append(candidates, + filepath.Join(wd, "gpudemod_kernels.dll"), + filepath.Join(wd, "internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"), + ) + } + seen := map[string]bool{} + for _, p := range candidates { + if p == "" || seen[p] { + continue + } + seen[p] = true + if _, err := os.Stat(p); err == nil { + cp := C.CString(p) + res := C.gpud_load_library(cp) + C.free(unsafe.Pointer(cp)) + if res == 0 { + loadErr = nil + return + } + loadErr = fmt.Errorf("failed to load gpudemod DLL: %s (code %d)", p, int(res)) + } + } + if loadErr == nil { + loadErr = errors.New("gpudemod_kernels.dll not found") + } + }) + return loadErr +} + +type Engine struct { + maxSamples int + sampleRate int + phase float64 + bfoPhase float64 + firTaps []float32 + cudaReady bool + lastShiftUsedGPU bool + lastFIRUsedGPU bool + lastDecimUsedGPU bool + lastDemodUsedGPU bool + dIQIn *C.gpud_float2 + dShifted *C.gpud_float2 + dFiltered *C.gpud_float2 + dDecimated *C.gpud_float2 + dAudio *C.float + iqBytes C.size_t + audioBytes C.size_t +} + +func Available() bool { + if ensureDLLLoaded() != nil { + return false + } + var count C.int + if C.cudaGetDeviceCount(&count) != C.cudaSuccess { + return false + } + return count > 0 +} + +func New(maxSamples int, sampleRate int) (*Engine, error) { + if maxSamples <= 0 { + return nil, errors.New("invalid maxSamples") + } + if sampleRate <= 0 { + return nil, errors.New("invalid sampleRate") + } + if err := ensureDLLLoaded(); err != nil { + return nil, err + } + if !Available() { + return nil, errors.New("cuda device not available") + } + e := &Engine{ + maxSamples: maxSamples, + sampleRate: sampleRate, + cudaReady: true, + iqBytes: C.size_t(maxSamples) * 
C.size_t(unsafe.Sizeof(C.gpud_float2{})), + audioBytes: C.size_t(maxSamples) * C.size_t(unsafe.Sizeof(C.float(0))), + } + var ptr unsafe.Pointer + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dIQIn failed") + } + e.dIQIn = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dShifted failed") + } + e.dShifted = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dFiltered failed") + } + e.dFiltered = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dDecimated failed") + } + e.dDecimated = (*C.gpud_float2)(ptr) + ptr = nil + if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess { + e.Close() + return nil, errors.New("cudaMalloc dAudio failed") + } + e.dAudio = (*C.float)(ptr) + return e, nil +} + +func (e *Engine) SetFIR(taps []float32) { + if len(taps) == 0 { + e.firTaps = nil + return + } + if len(taps) > 256 { + taps = taps[:256] + } + e.firTaps = append(e.firTaps[:0], taps...) 
+ if e.cudaReady { + _ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps))) + } +} + +func phaseStatus() string { return "phase1c-validated-shift" } +func (e *Engine) LastShiftUsedGPU() bool { return e != nil && e.lastShiftUsedGPU } +func (e *Engine) LastDemodUsedGPU() bool { return e != nil && e.lastDemodUsedGPU } + +func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) { + if e == nil || !e.cudaReady || len(iq) == 0 || e.dIQIn == nil || e.dShifted == nil { + return nil, false + } + bytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dIQIn), unsafe.Pointer(&iq[0]), bytes) != C.cudaSuccess { + return nil, false + } + phaseInc := -2.0 * math.Pi * offsetHz / float64(e.sampleRate) + if C.gpud_launch_freq_shift(e.dIQIn, e.dShifted, C.int(len(iq)), C.double(phaseInc), C.double(e.phase)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]complex64, len(iq)) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dShifted), bytes) != C.cudaSuccess { + return nil, false + } + e.phase += phaseInc * float64(len(iq)) + return out, true +} + +func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) { + if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil { + return nil, false + } + iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]complex64, len(iq)) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess { + return nil, false + 
} + return out, true +} + +func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) { + if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil { + return nil, false + } + nOut := len(filtered) / factor + if nOut <= 0 { + return nil, false + } + iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]complex64, nOut) + outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess { + return nil, false + } + return out, true +} + +func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) { + if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil { + return nil, false + } + iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_fm_discrim(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]float32, len(shifted)-1) + outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess { + return nil, false + } + return out, true +} + +func (e *Engine) tryCUDAAMEnvelope(shifted []complex64) ([]float32, bool) { + if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil { + return nil, 
false + } + iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + if C.gpud_launch_am_envelope(e.dShifted, e.dAudio, C.int(len(shifted))) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]float32, len(shifted)) + outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess { + return nil, false + } + return out, true +} + +func (e *Engine) tryCUDASSBProduct(shifted []complex64, bfoHz float64) ([]float32, bool) { + if e == nil || !e.cudaReady || len(shifted) == 0 || e.dShifted == nil || e.dAudio == nil { + return nil, false + } + iqBytes := C.size_t(len(shifted)) * C.size_t(unsafe.Sizeof(complex64(0))) + if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&shifted[0]), iqBytes) != C.cudaSuccess { + return nil, false + } + phaseInc := 2.0 * math.Pi * bfoHz / float64(e.sampleRate) + if C.gpud_launch_ssb_product(e.dShifted, e.dAudio, C.int(len(shifted)), C.double(phaseInc), C.double(e.bfoPhase)) != 0 { + return nil, false + } + if C.gpud_device_sync() != C.cudaSuccess { + return nil, false + } + out := make([]float32, len(shifted)) + outBytes := C.size_t(len(out)) * C.size_t(unsafe.Sizeof(float32(0))) + if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dAudio), outBytes) != C.cudaSuccess { + return nil, false + } + e.bfoPhase += phaseInc * float64(len(shifted)) + return out, true +} + +func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodType) ([]float32, int, error) { + if e == nil { + return nil, 0, errors.New("nil CUDA demod engine") + } + if !e.cudaReady { + return nil, 0, errors.New("cuda demod engine is not initialized") + } + if len(iq) == 0 { + return nil, 0, nil + } + if len(iq) > 
e.maxSamples { + return nil, 0, errors.New("sample count exceeds engine capacity") + } + + _ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz) + shifted, ok := e.tryCUDAFreqShift(iq, offsetHz) + e.lastShiftUsedGPU = ok && ValidateFreqShift(iq, e.sampleRate, offsetHz, shifted, 1e-3) + if !e.lastShiftUsedGPU { + shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz) + } + + var outRate int + switch mode { + case DemodNFM, DemodAM, DemodUSB, DemodLSB, DemodCW: + outRate = 48000 + case DemodWFM: + outRate = 192000 + default: + return nil, 0, errors.New("unsupported demod type") + } + + cutoff := bw / 2 + if cutoff < 200 { + cutoff = 200 + } + taps := e.firTaps + if len(taps) == 0 { + base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101) + taps = make([]float32, len(base64)) + for i, v := range base64 { + taps[i] = float32(v) + } + e.SetFIR(taps) + } + filtered, ok := e.tryCUDAFIR(shifted, len(taps)) + e.lastFIRUsedGPU = ok && ValidateFIR(shifted, taps, filtered, 1e-3) + if !e.lastFIRUsedGPU { + ftaps := make([]float64, len(taps)) + for i, v := range taps { + ftaps[i] = float64(v) + } + filtered = dsp.ApplyFIR(shifted, ftaps) + } + + decim := int(math.Round(float64(e.sampleRate) / float64(outRate))) + if decim < 1 { + decim = 1 + } + dec, ok := e.tryCUDADecimate(filtered, decim) + e.lastDecimUsedGPU = ok && ValidateDecimate(filtered, decim, dec, 1e-3) + if !e.lastDecimUsedGPU { + dec = dsp.Decimate(filtered, decim) + } + inputRate := e.sampleRate / decim + + e.lastDemodUsedGPU = false + switch mode { + case DemodNFM: + if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.NFM{}.Demod(dec, inputRate), inputRate, nil + case DemodWFM: + if gpuAudio, ok := e.tryCUDAFMDiscrim(dec); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.WFM{}.Demod(dec, inputRate), inputRate, nil + case DemodAM: + if gpuAudio, ok := e.tryCUDAAMEnvelope(dec); ok { + e.lastDemodUsedGPU = true 
+ return gpuAudio, inputRate, nil + } + return demod.AM{}.Demod(dec, inputRate), inputRate, nil + case DemodUSB: + if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.USB{}.Demod(dec, inputRate), inputRate, nil + case DemodLSB: + if gpuAudio, ok := e.tryCUDASSBProduct(dec, -700.0); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.LSB{}.Demod(dec, inputRate), inputRate, nil + case DemodCW: + if gpuAudio, ok := e.tryCUDASSBProduct(dec, 700.0); ok { + e.lastDemodUsedGPU = true + return gpuAudio, inputRate, nil + } + return demod.CW{}.Demod(dec, inputRate), inputRate, nil + default: + return nil, 0, errors.New("unsupported demod type") + } +} + +func (e *Engine) Close() { + if e == nil { + return + } + if e.dIQIn != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn)) + e.dIQIn = nil + } + if e.dShifted != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted)) + e.dShifted = nil + } + if e.dFiltered != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered)) + e.dFiltered = nil + } + if e.dDecimated != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated)) + e.dDecimated = nil + } + if e.dAudio != nil { + _ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio)) + e.dAudio = nil + } + e.firTaps = nil + e.cudaReady = false +} diff --git a/internal/demod/gpudemod/native/exports.cu b/internal/demod/gpudemod/native/exports.cu new file mode 100644 index 0000000..1278e61 --- /dev/null +++ b/internal/demod/gpudemod/native/exports.cu @@ -0,0 +1,190 @@ +#include <cuda_runtime.h> +#include <math.h> + +#if defined(_WIN32) +#define GPUD_API extern "C" __declspec(dllexport) +#define GPUD_CALL __stdcall +#else +#define GPUD_API extern "C" +#define GPUD_CALL +#endif + +GPUD_API __global__ void gpud_freq_shift_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n, + double phase_inc, + double phase_start +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx
>= n) return; + + double phase = phase_start + phase_inc * (double)idx; + float si, co; + sincosf((float)phase, &si, &co); + + float2 v = in[idx]; + out[idx].x = v.x * co - v.y * si; + out[idx].y = v.x * si + v.y * co; +} + +GPUD_API int GPUD_CALL gpud_launch_freq_shift_cuda( + const float2* in, + float2* out, + int n, + double phase_inc, + double phase_start +) { + if (n <= 0) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_freq_shift_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_fm_discrim_kernel( + const float2* __restrict__ in, + float* __restrict__ out, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n - 1) return; + + float2 prev = in[idx]; + float2 curr = in[idx + 1]; + float re = prev.x * curr.x + prev.y * curr.y; + float im = prev.x * curr.y - prev.y * curr.x; + out[idx] = atan2f(im, re); +} + +GPUD_API int GPUD_CALL gpud_launch_fm_discrim_cuda( + const float2* in, + float* out, + int n +) { + if (n <= 1) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_fm_discrim_kernel<<<grid, block>>>(in, out, n); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_decimate_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n_out, + int factor +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n_out) return; + out[idx] = in[idx * factor]; +} + +__device__ __constant__ float gpud_fir_taps[256]; + +GPUD_API __global__ void gpud_fir_kernel( + const float2* __restrict__ in, + float2* __restrict__ out, + int n, + int num_taps +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + float acc_r = 0.0f; + float acc_i = 0.0f; + for (int k = 0; k < num_taps; ++k) { + int src = idx - k; + if (src < 0) break; + float2 v = in[src]; + float t = gpud_fir_taps[k]; + acc_r += v.x * t; + acc_i += v.y * t; + } + out[idx] = make_float2(acc_r,
acc_i); +} + +GPUD_API int GPUD_CALL gpud_upload_fir_taps_cuda(const float* taps, int n) { + if (!taps || n <= 0 || n > 256) return -1; + cudaError_t err = cudaMemcpyToSymbol(gpud_fir_taps, taps, (size_t)n * sizeof(float)); + return (int)err; +} + +GPUD_API int GPUD_CALL gpud_launch_fir_cuda( + const float2* in, + float2* out, + int n, + int num_taps +) { + if (n <= 0 || num_taps <= 0 || num_taps > 256) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_fir_kernel<<<grid, block>>>(in, out, n, num_taps); + return (int)cudaGetLastError(); +} + +GPUD_API int GPUD_CALL gpud_launch_decimate_cuda( + const float2* in, + float2* out, + int n_out, + int factor +) { + if (n_out <= 0 || factor <= 0) return 0; + const int block = 256; + const int grid = (n_out + block - 1) / block; + gpud_decimate_kernel<<<grid, block>>>(in, out, n_out, factor); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_am_envelope_kernel( + const float2* __restrict__ in, + float* __restrict__ out, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + float2 v = in[idx]; + out[idx] = sqrtf(v.x * v.x + v.y * v.y); +} + +GPUD_API int GPUD_CALL gpud_launch_am_envelope_cuda( + const float2* in, + float* out, + int n +) { + if (n <= 0) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_am_envelope_kernel<<<grid, block>>>(in, out, n); + return (int)cudaGetLastError(); +} + +GPUD_API __global__ void gpud_ssb_product_kernel( + const float2* __restrict__ in, + float* __restrict__ out, + int n, + double phase_inc, + double phase_start +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + double phase = phase_start + phase_inc * (double)idx; + float si, co; + sincosf((float)phase, &si, &co); + float2 v = in[idx]; + out[idx] = v.x * co - v.y * si; +} + +GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda( + const float2* in, + float* out, + int n, + double phase_inc, + double phase_start +) { + if (n
<= 0) return 0; + const int block = 256; + const int grid = (n + block - 1) / block; + gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start); + return (int)cudaGetLastError(); +}