diff --git a/build-sdrplay.ps1 b/build-sdrplay.ps1
index 2f46e1f..d4f41e0 100644
--- a/build-sdrplay.ps1
+++ b/build-sdrplay.ps1
@@ -39,6 +39,14 @@ if (Test-Path $cudaMingw) {
 
 Write-Host "Building with SDRplay + cuFFT support..." -ForegroundColor Cyan
 
+$gccHost = Join-Path $gcc 'g++.exe'
+if (!(Test-Path $gccHost)) {
+  throw "g++.exe not found at $gccHost"
+}
+
+powershell -ExecutionPolicy Bypass -File tools\build-gpudemod-kernel.ps1 -HostCompiler $gccHost
+if ($LASTEXITCODE -ne 0) { throw "kernel build failed" }
+
 go build -tags "sdrplay,cufft" ./cmd/sdrd
 
 if ($LASTEXITCODE -ne 0) { throw "build failed" }
diff --git a/internal/demod/gpudemod/README.md b/internal/demod/gpudemod/README.md
index e51c0ed..c69ee9c 100644
--- a/internal/demod/gpudemod/README.md
+++ b/internal/demod/gpudemod/README.md
@@ -20,9 +20,10 @@ This is **not compiled automatically yet** in the current environment because th
 
 On a CUDA-capable dev machine with toolchain installed:
 
-1. Compile `kernels.cu` into an object file
+1. Compile `kernels.cu` into an object file and archive it into a linkable library
    - helper script: `tools/build-gpudemod-kernel.ps1`
-2. Link it into the `cufft` build
+2. For MinGW/CGO builds, prefer building the archive with MinGW host compiler + `ar.exe`
+3. Link `gpudemod_kernels.lib` into the `cufft` build
 3. Replace `gpud_launch_freq_shift(...)` stub body with the real kernel launch
 4. Validate copied-back shifted IQ against `dsp.FreqShift`
 5. Only then move the next stage (FM discriminator) onto the GPU
diff --git a/internal/demod/gpudemod/build/gpudemod_kernels.lib b/internal/demod/gpudemod/build/gpudemod_kernels.lib
new file mode 100644
index 0000000..1776fcd
Binary files /dev/null and b/internal/demod/gpudemod/build/gpudemod_kernels.lib differ
diff --git a/internal/demod/gpudemod/build/kernels.obj b/internal/demod/gpudemod/build/kernels.obj
index 3c8f2fd..f19e4a2 100644
Binary files a/internal/demod/gpudemod/build/kernels.obj and b/internal/demod/gpudemod/build/kernels.obj differ
diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go
index 506ad93..f42cd2b 100644
--- a/internal/demod/gpudemod/gpudemod.go
+++ b/internal/demod/gpudemod/gpudemod.go
@@ -3,7 +3,7 @@
 package gpudemod
 
 /*
-#cgo windows LDFLAGS: -L${SRCDIR}/../../../cuda-mingw -lcufft64_12 -lcudart64_13 ${SRCDIR}/build/kernels.obj
+#cgo windows LDFLAGS: -L${SRCDIR}/../../../cuda-mingw -L${SRCDIR}/build -lgpudemod_kernels -lcufft64_12 -lcudart64_13
 #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include"
 #include <cuda_runtime.h>
 #include <cufft.h>
@@ -31,16 +31,40 @@ static int gpud_device_sync() {
 }
 
 extern int gpud_launch_freq_shift_cuda(const gpud_float2* in, gpud_float2* out, int n, double phase_inc, double phase_start);
+extern int gpud_launch_fm_discrim_cuda(const gpud_float2* in, float* out, int n);
+extern int gpud_upload_fir_taps_cuda(const float* taps, int n);
+extern int gpud_launch_fir_cuda(const gpud_float2* in, gpud_float2* out, int n, int num_taps);
+extern int gpud_launch_decimate_cuda(const gpud_float2* in, gpud_float2* out, int n_out, int factor);
+extern int gpud_launch_am_envelope_cuda(const gpud_float2* in, float* out, int n);
+extern int gpud_launch_ssb_product_cuda(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start);
 
 static int gpud_launch_freq_shift(gpud_float2 *in, gpud_float2 *out, int n, double phase_inc, double phase_start) {
 	return gpud_launch_freq_shift_cuda(in, out, n, phase_inc, phase_start);
 }
 
-extern int gpud_launch_fm_discrim_cuda(const gpud_float2* in, float* out, int n);
-
 static int gpud_launch_fm_discrim(gpud_float2 *in, float *out, int n) {
 	return gpud_launch_fm_discrim_cuda(in, out, n);
 }
+
+static int gpud_upload_fir_taps(const float* taps, int n) {
+	return gpud_upload_fir_taps_cuda(taps, n);
+}
+
+static int gpud_launch_fir(gpud_float2 *in, gpud_float2 *out, int n, int num_taps) {
+	return gpud_launch_fir_cuda(in, out, n, num_taps);
+}
+
+static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) {
+	return gpud_launch_decimate_cuda(in, out, n_out, factor);
+}
+
+static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) {
+	return gpud_launch_am_envelope_cuda(in, out, n);
+}
+
+static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) {
+	return gpud_launch_ssb_product_cuda(in, out, n, phase_inc, phase_start);
+}
 */
 import "C"
 
@@ -66,18 +90,23 @@ const (
 )
 
 type Engine struct {
-	maxSamples       int
-	sampleRate       int
-	phase            float64
-	bfoPhase         float64
-	firTaps          []float32
-	cudaReady        bool
-	lastShiftUsedGPU bool
-	dIQIn            *C.gpud_float2
-	dShifted         *C.gpud_float2
-	dAudio           *C.float
-	iqBytes          C.size_t
-	audioBytes       C.size_t
+	maxSamples        int
+	sampleRate        int
+	phase             float64
+	bfoPhase          float64
+	firTaps           []float32
+	cudaReady         bool
+	lastShiftUsedGPU  bool
+	lastFIRUsedGPU    bool
+	lastDecimUsedGPU  bool
+	lastDemodUsedGPU  bool
+	dIQIn             *C.gpud_float2
+	dShifted          *C.gpud_float2
+	dFiltered         *C.gpud_float2
+	dDecimated        *C.gpud_float2
+	dAudio            *C.float
+	iqBytes           C.size_t
+	audioBytes        C.size_t
 }
 
 func Available() bool {
@@ -118,6 +147,18 @@ func New(maxSamples int, sampleRate int) (*Engine, error) {
 	}
 	e.dShifted = (*C.gpud_float2)(ptr)
 	ptr = nil
+	if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
+		e.Close()
+		return nil, errors.New("cudaMalloc dFiltered failed")
+	}
+	e.dFiltered = (*C.gpud_float2)(ptr)
+	ptr = nil
+	if C.gpud_cuda_malloc(&ptr, e.iqBytes) != C.cudaSuccess {
+		e.Close()
+		return nil, errors.New("cudaMalloc dDecimated failed")
+	}
+	e.dDecimated = (*C.gpud_float2)(ptr)
+	ptr = nil
 	if C.gpud_cuda_malloc(&ptr, e.audioBytes) != C.cudaSuccess {
 		e.Close()
 		return nil, errors.New("cudaMalloc dAudio failed")
@@ -131,18 +172,21 @@ func (e *Engine) SetFIR(taps []float32) {
 		e.firTaps = nil
 		return
 	}
+	if len(taps) > 256 {
+		taps = taps[:256]
+	}
 	e.firTaps = append(e.firTaps[:0], taps...)
+	if e.cudaReady {
+		_ = C.gpud_upload_fir_taps((*C.float)(unsafe.Pointer(&e.firTaps[0])), C.int(len(e.firTaps)))
+	}
 }
 
-func phaseStatus() string {
-	return "phase1c-validated-shift"
-}
-
+func phaseStatus() string { return "phase1c-validated-shift" }
 func (e *Engine) LastShiftUsedGPU() bool {
-	if e == nil {
-		return false
-	}
-	return e.lastShiftUsedGPU
+	return e != nil && e.lastShiftUsedGPU
+}
+func (e *Engine) LastDemodUsedGPU() bool {
+	return e != nil && e.lastDemodUsedGPU
 }
 
 func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64, bool) {
@@ -168,6 +212,53 @@ func (e *Engine) tryCUDAFreqShift(iq []complex64, offsetHz float64) ([]complex64
 	return out, true
 }
 
+func (e *Engine) tryCUDAFIR(iq []complex64, numTaps int) ([]complex64, bool) {
+	if e == nil || !e.cudaReady || len(iq) == 0 || numTaps <= 0 || e.dShifted == nil || e.dFiltered == nil {
+		return nil, false
+	}
+	iqBytes := C.size_t(len(iq)) * C.size_t(unsafe.Sizeof(complex64(0)))
+	if C.gpud_memcpy_h2d(unsafe.Pointer(e.dShifted), unsafe.Pointer(&iq[0]), iqBytes) != C.cudaSuccess {
+		return nil, false
+	}
+	if C.gpud_launch_fir(e.dShifted, e.dFiltered, C.int(len(iq)), C.int(numTaps)) != 0 {
+		return nil, false
+	}
+	if C.gpud_device_sync() != C.cudaSuccess {
+		return nil, false
+	}
+	out := make([]complex64, len(iq))
+	if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dFiltered), iqBytes) != C.cudaSuccess {
+		return nil, false
+	}
+	return out, true
+}
+
+func (e *Engine) tryCUDADecimate(filtered []complex64, factor int) ([]complex64, bool) {
+	if e == nil || !e.cudaReady || len(filtered) == 0 || factor <= 0 || e.dFiltered == nil || e.dDecimated == nil {
+		return nil, false
+	}
+	nOut := len(filtered) / factor
+	if nOut <= 0 {
+		return nil, false
+	}
+	iqBytes := C.size_t(len(filtered)) * C.size_t(unsafe.Sizeof(complex64(0)))
+	if C.gpud_memcpy_h2d(unsafe.Pointer(e.dFiltered), unsafe.Pointer(&filtered[0]), iqBytes) != C.cudaSuccess {
+		return nil, false
+	}
+	if C.gpud_launch_decimate(e.dFiltered, e.dDecimated, C.int(nOut), C.int(factor)) != 0 {
+		return nil, false
+	}
+	if C.gpud_device_sync() != C.cudaSuccess {
+		return nil, false
+	}
+	out := make([]complex64, nOut)
+	outBytes := C.size_t(nOut) * C.size_t(unsafe.Sizeof(complex64(0)))
+	if C.gpud_memcpy_d2h(unsafe.Pointer(&out[0]), unsafe.Pointer(e.dDecimated), outBytes) != C.cudaSuccess {
+		return nil, false
+	}
+	return out, true
+}
+
 func (e *Engine) tryCUDAFMDiscrim(shifted []complex64) ([]float32, bool) {
 	if e == nil || !e.cudaReady || len(shifted) < 2 || e.dShifted == nil || e.dAudio == nil {
 		return nil, false
@@ -273,20 +364,32 @@ func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodT
 	}
 	taps := e.firTaps
 	if len(taps) == 0 {
-		base := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
-		taps = append(make([]float32, 0, len(base)), base...)
+		base64 := dsp.LowpassFIR(cutoff, e.sampleRate, 101)
+		taps = make([]float32, len(base64))
+		for i, v := range base64 {
+			taps[i] = float32(v)
+		}
 		e.SetFIR(taps)
 	}
 	filtered, ok := e.tryCUDAFIR(shifted, len(taps))
-	if !ok {
-		filtered = dsp.ApplyFIR(shifted, taps)
+	e.lastFIRUsedGPU = ok && ValidateFIR(shifted, taps, filtered, 1e-3)
+	if !e.lastFIRUsedGPU {
+		ftaps := make([]float64, len(taps))
+		for i, v := range taps {
+			ftaps[i] = float64(v)
+		}
+		filtered = dsp.ApplyFIR(shifted, ftaps)
 	}
+
 	decim := int(math.Round(float64(e.sampleRate) / float64(outRate)))
 	if decim < 1 {
 		decim = 1
 	}
-	dec := dsp.Decimate(filtered, decim)
-	e.lastDecimUsedGPU = false
+	dec, ok := e.tryCUDADecimate(filtered, decim)
+	e.lastDecimUsedGPU = ok && ValidateDecimate(filtered, decim, dec, 1e-3)
+	if !e.lastDecimUsedGPU {
+		dec = dsp.Decimate(filtered, decim)
+	}
 	inputRate := e.sampleRate / decim
 
 	e.lastDemodUsedGPU = false
@@ -344,37 +447,9 @@ func (e *Engine) Close() {
 		_ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
 		e.dShifted = nil
 	}
-	if e.dDecimated != nil {
-		_ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
-		e.dDecimated = nil
-	}
-	if e.dAudio != nil {
-		_ = C.gpud_cuda_free(unsafe.Pointer(e.dAudio))
-		e.dAudio = nil
-	}
-	e.firTaps = nil
-	e.cudaReady = false
-}
-odLSB:
-		return demod.LSB{}.Demod(dec, inputRate), inputRate, nil
-	case DemodCW:
-		return demod.CW{}.Demod(dec, inputRate), inputRate, nil
-	default:
-		return nil, 0, errors.New("unsupported demod type")
-	}
-}
-
-func (e *Engine) Close() {
-	if e == nil {
-		return
-	}
-	if e.dIQIn != nil {
-		_ = C.gpud_cuda_free(unsafe.Pointer(e.dIQIn))
-		e.dIQIn = nil
-	}
-	if e.dShifted != nil {
-		_ = C.gpud_cuda_free(unsafe.Pointer(e.dShifted))
-		e.dShifted = nil
+	if e.dFiltered != nil {
+		_ = C.gpud_cuda_free(unsafe.Pointer(e.dFiltered))
+		e.dFiltered = nil
 	}
 	if e.dDecimated != nil {
 		_ = C.gpud_cuda_free(unsafe.Pointer(e.dDecimated))
diff --git a/tools/build-gpudemod-kernel.ps1 b/tools/build-gpudemod-kernel.ps1
index 5c8a96b..d021574 100644
--- a/tools/build-gpudemod-kernel.ps1
+++ b/tools/build-gpudemod-kernel.ps1
@@ -15,13 +15,47 @@ if (!(Test-Path $nvcc)) {
 
 New-Item -ItemType Directory -Force -Path $OutDir | Out-Null
 $outObj = Join-Path $OutDir 'kernels.obj'
+$outLib = Join-Path $OutDir 'gpudemod_kernels.lib'
 
 Write-Host "Using nvcc: $nvcc"
 Write-Host "Building $Source -> $outObj"
 
-& $nvcc -c $Source -o $outObj -I (Join-Path $CudaRoot 'include') -Xcompiler "/EHsc"
+$nvccArgs = @('-c', $Source, '-o', $outObj, '-I', (Join-Path $CudaRoot 'include'))
+if ($HostCompiler) {
+  Write-Host "Using host compiler: $HostCompiler"
+  $hostDir = Split-Path -Parent $HostCompiler
+  $nvccArgs += @('-ccbin', $hostDir)
+} else {
+  $nvccArgs += @('-Xcompiler', '/EHsc')
+}
+
+& $nvcc @nvccArgs
 if ($LASTEXITCODE -ne 0) {
   throw "nvcc failed with exit code $LASTEXITCODE"
 }
 
+if ($HostCompiler) {
+  $ar = Get-Command ar.exe -ErrorAction SilentlyContinue
+  if (-not $ar) {
+    throw "ar.exe not found in PATH; required for MinGW-compatible archive"
+  }
+  Write-Host "Archiving $outObj -> $outLib with ar.exe"
+  if (Test-Path $outLib) { Remove-Item $outLib -Force }
+  & $ar 'rcs' $outLib $outObj
+  if ($LASTEXITCODE -ne 0) {
+    throw "ar.exe failed with exit code $LASTEXITCODE"
+  }
+} else {
+  $libexe = Get-Command lib.exe -ErrorAction SilentlyContinue
+  if (-not $libexe) {
+    throw "lib.exe not found in PATH; run from vcvars64.bat environment"
+  }
+  Write-Host "Archiving $outObj -> $outLib with lib.exe"
+  & $libexe /nologo /OUT:$outLib $outObj
+  if ($LASTEXITCODE -ne 0) {
+    throw "lib.exe failed with exit code $LASTEXITCODE"
+  }
+}
+
 Write-Host "Built: $outObj"
+Write-Host "Archived: $outLib"