diff --git a/internal/demod/gpudemod/doc.go b/internal/demod/gpudemod/doc.go
new file mode 100644
index 0000000..25b4d4e
--- /dev/null
+++ b/internal/demod/gpudemod/doc.go
@@ -0,0 +1,11 @@
+// Package gpudemod contains the CUDA-tagged demodulation pipeline scaffolding.
+//
+// Current state:
+//   - Standard builds (no cufft build tag) use the !cufft stub.
+//   - cufft builds allocate GPU buffers and cross the CGO/CUDA launch boundary.
+//   - When a CUDA freq-shift launch succeeds, the shifted IQ is copied back and fed
+//     to the CPU-side FIR/decimate/NFM pipeline; otherwise dsp.FreqShift runs on the CPU.
+//
+// This keeps Phase 1 incremental and verifiable while later phases replace the
+// placeholder launch wrappers with real kernels.
+package gpudemod
diff --git a/internal/demod/gpudemod/gpudemod.go b/internal/demod/gpudemod/gpudemod.go
index 8e63f5a..2b2cd5c 100644
--- a/internal/demod/gpudemod/gpudemod.go
+++ b/internal/demod/gpudemod/gpudemod.go
@@ -175,9 +175,10 @@ func (e *Engine) Demod(iq []complex64, offsetHz float64, bw float64, mode DemodT
 	// Real CUDA boundary is now present. If the launch wrappers are not yet backed
 	// by actual kernels, we fall back to the existing CPU DSP path below.
 	_ = fmt.Sprintf("%s:%0.3f", phaseStatus(), offsetHz)
-	_ = e.tryCUDAFreqShift(iq, offsetHz)
-
-	shifted := dsp.FreqShift(iq, e.sampleRate, offsetHz)
+	shifted, ok := e.tryCUDAFreqShift(iq, offsetHz)
+	if !ok {
+		shifted = dsp.FreqShift(iq, e.sampleRate, offsetHz)
+	}
 	cutoff := bw / 2
 	if cutoff < 200 {
 		cutoff = 200