diff --git a/cmd/fmrtx/main.go b/cmd/fmrtx/main.go
index 35ef2aa..048c10b 100644
--- a/cmd/fmrtx/main.go
+++ b/cmd/fmrtx/main.go
@@ -186,7 +186,8 @@ func runTXMode(cfg cfgpkg.Config, configPath string, driver platform.SoapyDriver
 		log.Printf("license: no valid key — evaluation jingle every %d minutes", license.JingleIntervalMinutes)
 	}
 	engine.SetLicenseState(licState, licenseKey)
-	log.Printf("watermark: embedding key fingerprint — Level=%.3f ChipRate=%d", watermark.Level, watermark.ChipRate)
+	log.Printf("watermark: STFT-domain embedding — WMRate=%d FFTSize=%d Level=%.1fdB",
+		watermark.WMRate, watermark.FFTSize, watermark.WMLevelDB)
 	cfg = applyLegacyAudioFlags(cfg, audioStdin, audioRate, audioHTTP)
 
 	var streamSrc *audio.StreamSource
diff --git a/cmd/wmdecode/main.go b/cmd/wmdecode/main.go
index 263f68a..0d7cce2 100644
--- a/cmd/wmdecode/main.go
+++ b/cmd/wmdecode/main.go
@@ -1,16 +1,23 @@
-// cmd/wmdecode — fm-rds-tx spread-spectrum watermark recovery tool.
+// cmd/wmdecode — STFT-domain spread-spectrum watermark decoder.
 //
-// Approach: downsample to chip rate (12 kHz), correlate at 1 sample/chip.
-// No fractional stepping, no clock drift issues. FFT-free phase search.
+// Decodes watermark from FM broadcast recordings following
+// Kirovski & Malvar (IEEE TSP 2003) architecture.
+//
+// Usage:
+//
+//	wmdecode <file.wav> [key ...]
 package main
 
 import (
 	"encoding/binary"
 	"fmt"
 	"math"
+	"math/cmplx"
 	"os"
 	"sort"
+	"time"
 
+	"github.com/jan/fm-rds-tx/internal/dsp"
 	"github.com/jan/fm-rds-tx/internal/watermark"
 )
 
@@ -20,6 +27,8 @@ func main() {
 		os.Exit(1)
 	}
 
+	t0 := time.Now()
+
 	samples, recRate, err := readMonoWAV(os.Args[1])
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "read WAV: %v\n", err)
@@ -29,295 +38,235 @@ func main() {
 	fmt.Printf("WAV:   %d samples @ %.0f Hz = %.2fs, RMS %.1f dBFS\n",
 		len(samples), recRate, float64(len(samples))/recRate, 20*math.Log10(rms+1e-9))
 
-	chipRate := float64(watermark.ChipRate) // 12000
-	pnChips := watermark.PnChips            // 2048
-
-	// Step 1: LPF at ChipRate/2 then downsample to ChipRate.
-	// At chip rate: 1 sample = 1 chip. No fractional stepping.
-	decimFactor := int(recRate / chipRate) // 192000/12000 = 16
+	// Step 1: Decimate to WMRate (12 kHz)
+	wmRate := float64(watermark.WMRate)
+	decimFactor := int(recRate / wmRate)
 	if decimFactor < 1 {
 		decimFactor = 1
 	}
-	actualChipRate := recRate / float64(decimFactor) // should be exactly chipRate
-
-	fmt.Printf("Downsample: %d:1 (%.0f Hz → %.0f Hz)\n", decimFactor, recRate, actualChipRate)
+	actualRate := recRate / float64(decimFactor)
+	fmt.Printf("Downsample: %d:1 (%.0f Hz → %.0f Hz)\n", decimFactor, recRate, actualRate)
 
-	// Anti-alias LPF (8th-order IIR at 5.5 kHz)
 	lpfCoeffs := designLPF8(5500, recRate)
 	filtered := applyIIR(samples, lpfCoeffs)
 
-	// Decimate
 	nDown := len(filtered) / decimFactor
 	down := make([]float64, nDown)
 	for i := 0; i < nDown; i++ {
 		down[i] = filtered[i*decimFactor]
 	}
-	rmsDown := rmsLevel(down)
-	fmt.Printf("Downsampled: %d samples @ %.0f Hz, RMS %.1f dBFS\n",
-		nDown, actualChipRate, 20*math.Log10(rmsDown+1e-9))
-
-	// Step 2: Phase search — slide 1-bit PN template across [0, pnChips).
-	// At chip rate this is a simple 2048-element dot product per offset.
-	// Test all 2048 phases, accumulate energy over many bits.
-	fmt.Printf("Phase search: %d candidates\n", pnChips)
-
-	nSearchBits := nDown / pnChips
-	if nSearchBits > 500 {
-		nSearchBits = 500
+	fmt.Printf("Downsampled: %d samples, %.1fs\n", nDown, float64(nDown)/wmRate)
+
+	// Step 2: Compute ALL STFT frames with cepstrum filtering
+	fftSize := watermark.FFTSize
+	hop := watermark.FFTHop
+	nFrames := (nDown - fftSize) / hop
+	if nFrames <= 0 {
+		fmt.Fprintln(os.Stderr, "Recording too short")
+		os.Exit(1)
 	}
-	bestPhase := 0
-	bestEnergy := 0.0
-	for phase := 0; phase < pnChips; phase++ {
-		var energy float64
-		for b := 0; b < nSearchBits; b++ {
-			start := phase + b*pnChips
-			if start+pnChips > nDown {
-				break
-			}
-			c := corrChipRate(down, start, pnChips)
-			energy += c * c
+
+	var window [watermark.FFTSize]float64
+	dsp.HannWindow(window[:])
+	fmt.Printf("STFT:  %d frames (%d-point, hop=%d)\n", nFrames, fftSize, hop)
+
+	type stftMag [watermark.FFTSize / 2]float64
+	frameMags := make([]stftMag, nFrames)
+	for f := 0; f < nFrames; f++ {
+		offset := f * hop
+		var buf [watermark.FFTSize]complex128
+		for i := 0; i < fftSize; i++ {
+			buf[i] = complex(down[offset+i]*window[i], 0)
 		}
-		if energy > bestEnergy {
-			bestEnergy = energy
-			bestPhase = phase
+		dsp.FFT(buf[:])
+		for bin := 0; bin < fftSize/2; bin++ {
+			mag := cmplx.Abs(buf[bin])
+			if mag < 1e-12 {
+				mag = 1e-12
+			}
+			frameMags[f][bin] = 20 * math.Log10(mag)
 		}
+		cepstrumFilter(frameMags[f][:], 8)
 	}
-	fmt.Printf("Phase: offset=%d (%.2fms), energy=%.0f\n",
-		bestPhase, float64(bestPhase)/actualChipRate*1000, bestEnergy)
-
-	// Step 3: Per-bit correlation with ±4 sample sliding (handles residual drift).
-	// At chip rate, ±4 samples = ±0.33ms — covers ~±40 ppm over 22s frame.
-	nCompleteBits := (nDown - bestPhase) / pnChips
-	nFrames := nCompleteBits / watermark.PayloadBits
-	if nFrames < 1 {
-		nFrames = 1
+
+	// Step 3: For each key, search cycle offset + rep offset
+	keys := os.Args[2:]
+	if len(keys) == 0 {
+		fmt.Println("No keys supplied.")
+		os.Exit(1)
 	}
-	fmt.Printf("Sync:  %d complete bits, %d frames\n", nCompleteBits, nFrames)
-
-	const slideWindow = 200 // ±200 chips — handles phase errors + drift
-
-	corrs := make([]float64, watermark.PayloadBits)
-	for i := 0; i < watermark.PayloadBits; i++ {
-		// For each offset, sum correlation across ALL frames first.
-		// Signal adds coherently (×nFrames), noise adds as √nFrames.
-		// Then pick the offset with maximum |sum|.
-		bestAbs := 0.0
-		bestVal := 0.0
-		for off := -slideWindow; off <= slideWindow; off++ {
-			var sum float64
-			for f := 0; f < nFrames; f++ {
-				nominal := bestPhase + (f*watermark.PayloadBits+i)*pnChips + off
-				if nominal < 0 || nominal+pnChips > nDown {
-					continue
+
+	for _, key := range keys {
+		fmt.Printf("\nKey: %q\n", key)
+		det := watermark.NewSTFTDetector(key)
+
+		totalGroups := watermark.TotalGroups
+		timeRep := watermark.TimeRep
+		framesPerWM := watermark.FramesPerWM
+		numBins := watermark.NumBins
+		binLow := watermark.BinLow
+		centerRep := timeRep / 2
+
+		bestMetric := -1.0
+		var bestCorrs [watermark.PayloadBits]float64
+		bestCycleOff := 0
+		bestRepOff := 0
+
+		nCandidates := 0
+		for cycleOff := 0; cycleOff < framesPerWM; cycleOff += timeRep {
+			for repOff := 0; repOff < timeRep; repOff++ {
+				var testCorrs [watermark.PayloadBits]float64
+
+				for f := 0; f < nFrames; f++ {
+					wmFrame := ((f - cycleOff - repOff) % framesPerWM + framesPerWM) % framesPerWM
+					if wmFrame%timeRep != centerRep {
+						continue
+					}
+					g := wmFrame / timeRep
+					if g >= totalGroups {
+						continue
+					}
+
+					var corr float64
+					for b := 0; b < numBins; b++ {
+						corr += frameMags[f][binLow+b] * float64(det.PNChipAt(g, b))
+					}
+					testCorrs[det.GroupBit(g)] += corr
 				}
-				sum += corrChipRate(down, nominal, pnChips)
-			}
-			if math.Abs(sum) > bestAbs {
-				bestAbs = math.Abs(sum)
-				bestVal = sum
+
+				var metric float64
+				for _, c := range testCorrs {
+					metric += c * c
+				}
+
+				if metric > bestMetric {
+					bestMetric = metric
+					bestCorrs = testCorrs
+					bestCycleOff = cycleOff
+					bestRepOff = repOff
+				}
+				nCandidates++
 			}
 		}
-		corrs[i] = bestVal
-	}
 
-	// Diagnostics
-	var corrMin, corrMax, sumAbs float64
-	var nStrong, nDead int
-	for i, c := range corrs {
-		ac := math.Abs(c)
-		sumAbs += ac
-		if i == 0 || ac < corrMin {
-			corrMin = ac
-		}
-		if ac > corrMax {
-			corrMax = ac
-		}
-		if ac > sumAbs/float64(i+1)*2 {
-			nStrong++
-		}
-		if ac < 3 {
-			nDead++
-		}
-	}
-	avgCorr := sumAbs / 128
-	nStrong = 0
-	for _, c := range corrs {
-		if math.Abs(c) > avgCorr*0.5 {
-			nStrong++
-		}
-	}
-	fmt.Printf("Corrs: min|c|=%.1f, max|c|=%.1f, avg|c|=%.1f (strong=%d, dead=%d)\n",
-		corrMin, corrMax, avgCorr, nStrong, nDead)
+		fmt.Printf("Searched %d candidates in %v\n", nCandidates, time.Since(t0).Round(time.Millisecond))
+		fmt.Printf("Best: cycleOff=%d, repOff=%d, metric=%.0f\n", bestCycleOff, bestRepOff, bestMetric)
 
-	// Step 4: Frame sync — 128 rotations × byte-level erasure + bit-flipping.
+		var sumAbs float64
+		for _, c := range bestCorrs {
+			sumAbs += math.Abs(c)
+		}
+		fmt.Printf("Corrs: avg|c|=%.1f\n", sumAbs/128)
 
-	// Verbose: compute BER at each rotation against the known key (if supplied)
-	knownPayload := [watermark.RsDataBytes]byte{}
-	hasKnown := false
-	if len(os.Args) >= 3 {
-		hasKnown = true
-		knownPayload = watermark.KeyToPayload(os.Args[2])
+		// BER diagnostic against known key
+		knownPayload := watermark.KeyToPayload(key)
 		knownCW := watermark.RSEncode(knownPayload)
 		var knownBits [watermark.PayloadBits]int
 		for i := 0; i < watermark.PayloadBits; i++ {
 			knownBits[i] = int((knownCW[i/8] >> uint(7-(i%8))) & 1)
 		}
-		fmt.Println("\nRotation sweep (top 10 by BER):")
-		type rotBER struct{ rot, ber int }
-		var results []rotBER
-		for rot := 0; rot < watermark.PayloadBits; rot++ {
-			nerr := 0
-			for i := 0; i < watermark.PayloadBits; i++ {
-				srcBit := (i + rot) % watermark.PayloadBits
-				hard := 0
-				if corrs[srcBit] < 0 {
-					hard = 1
-				}
-				if hard != knownBits[i] {
-					nerr++
-				}
+		nerr := 0
+		for i := 0; i < watermark.PayloadBits; i++ {
+			hard := 0
+			if bestCorrs[i] < 0 {
+				hard = 1
 			}
-			results = append(results, rotBER{rot, nerr})
-		}
-		sort.Slice(results, func(a, b int) bool { return results[a].ber < results[b].ber })
-		for j := 0; j < 10 && j < len(results); j++ {
-			r := results[j]
-			fmt.Printf("  rot=%3d: BER=%d/128 (%4.1f%%)\n", r.rot, r.ber, 100*float64(r.ber)/128)
-		}
-
-		// Show byte error pattern at best rotation
-		bestRot := results[0].rot
-		fmt.Printf("\nByte errors at rot=%d:\n  ", bestRot)
-		for b := 0; b < watermark.RsTotalBytes; b++ {
-			nerr := 0
-			for bit := 0; bit < 8; bit++ {
-				srcBit := (b*8 + bit + bestRot) % watermark.PayloadBits
-				hard := 0
-				if corrs[srcBit] < 0 {
-					hard = 1
-				}
-				if hard != knownBits[b*8+bit] {
-					nerr++
-				}
+			if hard != knownBits[i] {
+				nerr++
 			}
-			fmt.Printf("B%d:%d ", b, nerr)
 		}
-		fmt.Println()
+		fmt.Printf("BER:   %d/128 (%.1f%%)\n", nerr, 100*float64(nerr)/128)
 
-		// Show received vs expected codeword at best rotation
+		// Show recv vs expected
 		var recv [watermark.RsTotalBytes]byte
+		confs := make([]float64, watermark.PayloadBits)
 		for i := 0; i < watermark.PayloadBits; i++ {
-			srcBit := (i + bestRot) % watermark.PayloadBits
-			if corrs[srcBit] < 0 {
+			confs[i] = math.Abs(bestCorrs[i])
+			if bestCorrs[i] < 0 {
 				recv[i/8] |= 1 << uint(7-(i%8))
 			}
 		}
-		fmt.Printf("  recv: %x\n", recv)
-		fmt.Printf("  want: %x\n", knownCW)
-	}
-	_ = hasKnown
-	_ = knownPayload
-	type decodeResult struct {
-		rotation int
-		payload  [watermark.RsDataBytes]byte
-		flips    int
-	}
-	var best *decodeResult
+		fmt.Printf("recv:  %x\nwant:  %x\n", recv, knownCW)
 
-	for rot := 0; rot < watermark.PayloadBits; rot++ {
-		var recv [watermark.RsTotalBytes]byte
-		confs := make([]float64, watermark.PayloadBits)
-		for i := 0; i < watermark.PayloadBits; i++ {
-			srcBit := (i + rot) % watermark.PayloadBits
-			c := corrs[srcBit]
-			confs[i] = math.Abs(c)
-			if c < 0 {
-				recv[i/8] |= 1 << uint(7-(i%8))
+		// Confidence-based erasure (MIN bit confidence per byte)
+		type bc struct{ idx int; conf float64 }
+		byteConfs := make([]bc, watermark.RsTotalBytes)
+		for b := 0; b < watermark.RsTotalBytes; b++ {
+			minC := confs[b*8]
+			for bit := 1; bit < 8; bit++ {
+				if confs[b*8+bit] < minC {
+					minC = confs[b*8+bit]
+				}
 			}
+			byteConfs[b] = bc{b, minC}
 		}
+		sort.Slice(byteConfs, func(a, b int) bool { return byteConfs[a].conf < byteConfs[b].conf })
 
-		// Brute-force RS decode: try ALL possible erasure subsets of size 1..8.
-		// With sliding correlation, confidence values are unreliable for erasure
-		// selection (all bits look "strong"). Instead, let RS tell us which
-		// subsets produce a valid codeword. This is fast: sum(C(16,k), k=1..8)
-		// = ~39k RS decodes per rotation, ~5M total. Each takes <1µs.
 		decoded := false
-		for nErase := 1; nErase <= watermark.RsCheckBytes; nErase++ {
-			if decoded { break }
-			indices := make([]int, nErase)
-			for i := range indices { indices[i] = i }
-			for {
-				erasePos := make([]int, nErase)
-				copy(erasePos, indices)
-				payload, ok := watermark.RSDecode(recv, erasePos)
-				if ok {
-					if best == nil {
-						best = &decodeResult{rot, payload, nErase}
-					}
+		for nErase := 0; nErase <= watermark.RsCheckBytes; nErase++ {
+			if nErase == 0 {
+				p, ok := watermark.RSDecode(recv, nil)
+				if ok && watermark.KeyMatchesPayload(key, p) {
+					fmt.Printf("  ✓ MATCH (0 erasures), payload=%x\n", p)
 					decoded = true
 					break
 				}
-				// Next combination
-				i := nErase - 1
-				for i >= 0 && indices[i] == watermark.RsTotalBytes-nErase+i {
-					i--
-				}
-				if i < 0 { break }
-				indices[i]++
-				for j := i + 1; j < nErase; j++ {
-					indices[j] = indices[j-1] + 1
-				}
+				continue
+			}
+			erasePos := make([]int, nErase)
+			for i := 0; i < nErase; i++ {
+				erasePos[i] = byteConfs[i].idx
+			}
+			sort.Ints(erasePos)
+			p, ok := watermark.RSDecode(recv, erasePos)
+			if ok && watermark.KeyMatchesPayload(key, p) {
+				fmt.Printf("  ✓ MATCH (%d erasures), payload=%x\n", nErase, p)
+				decoded = true
+				break
 			}
 		}
-		if decoded && best != nil && best.flips <= 4 {
-			break // clean decode with few erasures — stop early
-		}
-	}
 
-	if best == nil {
-		fmt.Println("RS decode: FAILED — no valid frame alignment found.")
-		fmt.Println("Watermark may not be present, or recording is too noisy/short.")
-		os.Exit(1)
+		if !decoded {
+			fmt.Println("  ✗ NOT FOUND")
+		}
 	}
 
-	fmt.Printf("\nFrame sync: rotation=%d, %d byte erasures\n", best.rotation, best.flips)
-	fmt.Printf("Payload:    %x\n\n", best.payload)
+	fmt.Printf("\nDone in %v\n", time.Since(t0).Round(time.Millisecond))
+}
 
-	keys := os.Args[2:]
-	if len(keys) == 0 {
-		fmt.Println("No keys supplied — payload shown above.")
+func cepstrumFilter(magDB []float64, nCeps int) {
+	n := len(magDB)
+	if n < nCeps*2 {
 		return
 	}
-	fmt.Println("Key check:")
-	matched := false
-	for _, key := range keys {
-		if watermark.KeyMatchesPayload(key, best.payload) {
-			fmt.Printf("  ✓ MATCH: %q\n", key)
-			matched = true
-		} else {
-			fmt.Printf("  ✗      : %q\n", key)
+	ceps := make([]float64, n)
+	for k := 0; k < n; k++ {
+		var sum float64
+		for i := 0; i < n; i++ {
+			sum += magDB[i] * math.Cos(math.Pi*float64(k)*(float64(i)+0.5)/float64(n))
 		}
+		ceps[k] = sum
 	}
-	if !matched {
-		fmt.Println("\nNo key matched.")
+	for k := 0; k < nCeps; k++ {
+		ceps[k] = 0
 	}
-}
-
-// corrChipRate correlates at chip rate (1 sample = 1 chip).
-func corrChipRate(down []float64, start, pnChips int) float64 {
-	var acc float64
-	for i := 0; i < pnChips; i++ {
-		acc += down[start+i] * float64(watermark.PNSequence[i])
+	for i := 0; i < n; i++ {
+		var sum float64
+		for k := 0; k < n; k++ {
+			w := 1.0
+			if k == 0 {
+				w = 0.5
+			}
+			sum += w * ceps[k] * math.Cos(math.Pi*float64(k)*(float64(i)+0.5)/float64(n))
+		}
+		magDB[i] = sum * 2.0 / float64(n)
 	}
-	return acc
 }
 
-// --- 8th-order Butterworth LPF (4 cascaded biquads) ---
 type biquad struct{ b0, b1, b2, a1, a2 float64 }
 type iirCoeffs []biquad
 
 func designLPF8(cutoffHz, sampleRate float64) iirCoeffs {
-	// 8th-order Butterworth = 4 biquad sections
 	angles := []float64{math.Pi / 16, 3 * math.Pi / 16, 5 * math.Pi / 16, 7 * math.Pi / 16}
 	coeffs := make(iirCoeffs, 4)
 	for i, angle := range angles {
@@ -328,11 +277,8 @@ func designLPF8(cutoffHz, sampleRate float64) iirCoeffs {
 		alpha := sinW / (2 * q)
 		a0 := 1 + alpha
 		coeffs[i] = biquad{
-			b0: (1 - cosW) / 2 / a0,
-			b1: (1 - cosW) / a0,
-			b2: (1 - cosW) / 2 / a0,
-			a1: (-2 * cosW) / a0,
-			a2: (1 - alpha) / a0,
+			b0: (1 - cosW) / 2 / a0, b1: (1 - cosW) / a0, b2: (1 - cosW) / 2 / a0,
+			a1: (-2 * cosW) / a0, a2: (1 - alpha) / a0,
 		}
 	}
 	return coeffs
@@ -396,7 +342,7 @@ func readMonoWAV(path string) ([]float64, float64, error) {
 		}
 	}
 	if dataStart == 0 || bitsPerSample != 16 || channels == 0 {
-		return nil, 0, fmt.Errorf("unsupported WAV (need 16-bit PCM, got bits=%d ch=%d)", bitsPerSample, channels)
+		return nil, 0, fmt.Errorf("unsupported WAV")
 	}
 	if dataStart+dataLen > len(data) {
 		dataLen = len(data) - dataStart
diff --git a/internal/dsp/fft.go b/internal/dsp/fft.go
new file mode 100644
index 0000000..97e0ef8
--- /dev/null
+++ b/internal/dsp/fft.go
@@ -0,0 +1,65 @@
+package dsp
+
+import (
+	"math"
+	"math/cmplx"
+)
+
+// FFT computes the discrete Fourier transform of x (in-place, radix-2).
+// len(x) must be a power of 2.
+func FFT(x []complex128) {
+	n := len(x)
+	if n <= 1 {
+		return
+	}
+	// Bit-reversal permutation
+	j := 0
+	for i := 1; i < n; i++ {
+		bit := n >> 1
+		for j&bit != 0 {
+			j ^= bit
+			bit >>= 1
+		}
+		j ^= bit
+		if i < j {
+			x[i], x[j] = x[j], x[i]
+		}
+	}
+	// Cooley-Tukey butterfly
+	for size := 2; size <= n; size <<= 1 {
+		half := size >> 1
+		wn := cmplx.Exp(complex(0, -2*math.Pi/float64(size)))
+		for start := 0; start < n; start += size {
+			w := complex(1, 0)
+			for k := 0; k < half; k++ {
+				u := x[start+k]
+				v := x[start+k+half] * w
+				x[start+k] = u + v
+				x[start+k+half] = u - v
+				w *= wn
+			}
+		}
+	}
+}
+
+// IFFT computes the inverse DFT of x (in-place).
+func IFFT(x []complex128) {
+	n := len(x)
+	// Conjugate, FFT, conjugate, scale
+	for i := range x {
+		x[i] = cmplx.Conj(x[i])
+	}
+	FFT(x)
+	scale := 1.0 / float64(n)
+	for i := range x {
+		x[i] = cmplx.Conj(x[i]) * complex(scale, 0)
+	}
+}
+
+// HannWindow fills w with a Hann window of length n.
+func HannWindow(w []float64) {
+	n := len(w)
+	for i := range w {
+		w[i] = 0.5 * (1.0 - math.Cos(2*math.Pi*float64(i)/float64(n)))
+	}
+}
diff --git a/internal/offline/generator.go b/internal/offline/generator.go
index f2a8a13..ccce1d0 100644
--- a/internal/offline/generator.go
+++ b/internal/offline/generator.go
@@ -5,7 +5,6 @@ import (
 	"encoding/binary"
 	"fmt"
 	"log"
-	"math"
 	"path/filepath"
 	"sync/atomic"
 	"time"
@@ -133,24 +132,20 @@ type Generator struct {
 	licenseState *license.State
 	jingleFrames []license.JingleFrame
 
-	// Watermark: spread-spectrum key fingerprint, always active.
-	watermark   *watermark.Embedder
-	wmShapeLPF  *dsp.FilterChain // pulse-shaping: confines PN energy to 0-6kHz
+	// Watermark: STFT-domain spread-spectrum (Kirovski & Malvar 2003).
+	stftEmbedder *watermark.STFTEmbedder
+	wmDecimLPF   *dsp.FilterChain // anti-alias LPF for 228k→12k decimation
 }
 
 func NewGenerator(cfg cfgpkg.Config) *Generator {
 	return &Generator{cfg: cfg}
 }
 
-// SetLicense configures license state (jingle) and creates the watermark
+// SetLicense configures license state (jingle) and creates the STFT watermark
 // embedder. Must be called before the first GenerateFrame.
 func (g *Generator) SetLicense(state *license.State, key string) {
 	g.licenseState = state
-	g.watermark = watermark.NewEmbedder(key)
-	// Gate threshold: -40 dBFS ≈ 0.01 linear amplitude.
-	// Watermark is muted during silence to prevent audibility.
-	// Composite rate will be set in init(); use 228000 as default.
-	g.watermark.EnableGate(0.001, 228000)
+	g.stftEmbedder = watermark.NewSTFTEmbedder(key)
 }
 
 // SetExternalSource sets a live audio source (e.g. StreamResampler) that
@@ -285,17 +280,10 @@ func (g *Generator) init() {
 		}
 	}
 
-	// Update watermark gate ramp rate with actual composite rate (may differ
-	// from the 228000 default used in SetLicense).
-	if g.watermark != nil {
-		g.watermark.EnableGate(0.001, g.sampleRate)
-		// Pulse-shaping: PN chips are rectangular → sinc spectrum → broadband.
-		// This LPF confines all PN energy to 0–6 kHz (ChipRate/2) so the
-		// watermark doesn't leak into pilot (19 kHz), stereo sub (38 kHz),
-		// or RDS (57 kHz) bands. Also prevents audible wideband noise.
-		// 4th-order Butterworth: -24 dB/octave rolloff. At 12 kHz: -24 dB,
-		// at 19 kHz: -38 dB, at 38 kHz: -62 dB. Clean enough.
-		g.wmShapeLPF = dsp.NewLPF4(float64(watermark.ChipRate)/2, g.sampleRate)
+	// STFT watermark: anti-alias LPF for decimation to WMRate (12 kHz).
+	// Nyquist at 12 kHz = 6 kHz. Cut at 5.5 kHz with margin.
+	if g.stftEmbedder != nil {
+		g.wmDecimLPF = dsp.NewLPF4(5500, g.sampleRate)
 	}
 
 	g.initialized = true
@@ -338,6 +326,10 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 	g.frameSeq++
 	frame.Sequence = g.frameSeq
 
+	// L/R buffers for two-pass processing (STFT watermark between stages 3 and 4)
+	lBuf := make([]float64, samples)
+	rBuf := make([]float64, samples)
+
 	// Load live params once per chunk — single atomic read, zero per-sample cost
 	lp := g.liveParams.Load()
 	if lp == nil {
@@ -404,15 +396,6 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		r := g.audioLPF_R.Process(float64(in.R))
 		r = g.pilotNotchR.Process(r)
 
-		// Watermark gate level measurement — done BEFORE drive/clip/cleanup.
-		// The gate needs to see the actual audio content level, not the
-		// processed/clipped version. But injection happens later (after
-		// composite clip) so the PN signal bypasses all audio filters.
-		if g.watermark != nil {
-			audioLevel := (math.Abs(l) + math.Abs(r)) / 2.0
-			g.watermark.SetAudioLevel(audioLevel)
-		}
-
 		// --- Stage 2: Drive + Compress + Clip₁ ---
 		l *= lp.OutputDrive
 		r *= lp.OutputDrive
@@ -428,6 +411,59 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		l = dsp.HardClip(l, ceiling)
 		r = dsp.HardClip(r, ceiling)
 
+		lBuf[i] = l
+		rBuf[i] = r
+	}
+
+	// --- STFT Watermark: decimate → embed → upsample → add to L/R ---
+	if g.stftEmbedder != nil {
+		decimFactor := int(g.sampleRate) / watermark.WMRate // 228000/12000 = 19
+		if decimFactor < 1 {
+			decimFactor = 1
+		}
+		nDown := samples / decimFactor
+
+		// Anti-alias: LPF ALL composite-rate samples, THEN decimate.
+		// The LPF must see every sample for correct IIR state update.
+		mono12k := make([]float64, nDown)
+		lpfState := 0.0
+		decimCount := 0
+		outIdx := 0
+		for i := 0; i < samples && outIdx < nDown; i++ {
+			mono := (lBuf[i] + rBuf[i]) / 2
+			if g.wmDecimLPF != nil {
+				lpfState = g.wmDecimLPF.Process(mono)
+			} else {
+				lpfState = mono
+			}
+			decimCount++
+			if decimCount >= decimFactor {
+				decimCount = 0
+				mono12k[outIdx] = lpfState
+				outIdx++
+			}
+		}
+
+		// STFT embed at 12 kHz
+		embedded := g.stftEmbedder.ProcessBlock(mono12k)
+
+		// Extract watermark signal (difference) and upsample via ZOH
+		for i := 0; i < samples; i++ {
+			wmIdx := i / decimFactor
+			if wmIdx >= nDown {
+				wmIdx = nDown - 1
+			}
+			wmSig := embedded[wmIdx] - mono12k[wmIdx]
+			lBuf[i] += wmSig
+			rBuf[i] += wmSig
+		}
+	}
+
+	// --- Pass 2: Stereo encode + composite processing ---
+	for i := 0; i < samples; i++ {
+		l := lBuf[i]
+		r := rBuf[i]
+
 		// --- Stage 4: Stereo encode ---
 		limited := audio.NewFrame(audio.Sample(l), audio.Sample(r))
 		comps := g.stereoEncoder.Encode(limited)
@@ -453,22 +489,6 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		}
 		bs412PowerAccum += audioMPX * audioMPX
 
-		// --- Watermark injection: into audio composite AFTER all processing ---
-		// Injected after the entire clip-filter-clip chain, notch filters, and
-		// BS.412 power measurement. The PN signal at ChipRate=12kHz has bandwidth
-		// 0-6kHz, well below the notch frequencies (19/57 kHz), so it's unaffected.
-		// At -48 dBFS the watermark causes <0.05 dB of over-modulation, negligible.
-		// Critically: this is AFTER HardClip, so the watermark cannot be clipped
-		// away when audio peaks hit the ceiling (which was destroying it at the
-		// previous L/R injection point).
-		if g.watermark != nil {
-			wm := g.watermark.NextSample()
-			if g.wmShapeLPF != nil {
-				wm = g.wmShapeLPF.Process(wm)
-			}
-			audioMPX += wm
-		}
-
 		// --- Stage 6: Add protected components ---
 		composite := audioMPX
 		if lp.StereoEnabled {
@@ -503,15 +523,9 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		g.bs412.ProcessChunk(bs412PowerAccum / float64(samples))
 	}
 
-	// Watermark diagnostic: log state every 100 chunks (~5s) so we can verify
-	// the embedder is actually running and producing non-zero output.
-	if g.watermark != nil && g.frameSeq%100 == 1 {
-		wm := g.watermark.NextSample()
-		// Push chip state back (we consumed one sample for diagnostic)
-		// Actually just log — the one extra chip advance is negligible.
-		stats := g.watermark.DiagnosticState()
-		log.Printf("watermark diag: frame=%d gateGain=%.4f chipIdx=%d bitIdx=%d symbol=%d lastSample=%.6f enabled=%t",
-			g.frameSeq, stats.GateGain, stats.ChipIdx, stats.BitIdx, stats.Symbol, wm, stats.GateEnabled)
+	// STFT watermark diagnostic
+	if g.stftEmbedder != nil && g.frameSeq%100 == 1 {
+		log.Printf("watermark stft: frame=%d, active", g.frameSeq)
 	}
 
 	return frame
diff --git a/internal/watermark/stft_roundtrip_test.go b/internal/watermark/stft_roundtrip_test.go
new file mode 100644
index 0000000..c91b081
--- /dev/null
+++ b/internal/watermark/stft_roundtrip_test.go
@@ -0,0 +1,163 @@
+package watermark
+
+import (
+	"math"
+	"testing"
+)
+
+func TestSTFTRoundTrip(t *testing.T) {
+	const key = "test-stft-key"
+	const duration = 150.0 // seconds — need > 136.5s for one full WM cycle
+
+	nSamples := int(duration * WMRate)
+	t.Logf("Generating %d samples @ %d Hz (%.1fs)", nSamples, WMRate, duration)
+	t.Logf("WM cycle: %d STFT frames, %.1fs", FramesPerWM, float64(SamplesPerWM)/WMRate)
+
+	// Generate test signal: broadband noise (the multiplicative watermark
+	// needs energy in all frequency bins to work — a pure tone only has
+	// energy in one bin and the watermark has no effect on silent bins)
+	audio := make([]float64, nSamples)
+	// Simple LCG pseudo-random for reproducibility
+	var lcg uint64 = 12345
+	for i := range audio {
+		lcg = lcg*6364136223846793005 + 1442695040888963407
+		audio[i] = 0.3 * (float64(int32(lcg>>33))/float64(1<<31))
+	}
+
+	rmsIn := rmsF64(audio)
+	t.Logf("Input RMS: %.1f dBFS", 20*math.Log10(rmsIn+1e-12))
+
+	// Embed watermark
+	embedder := NewSTFTEmbedder(key)
+	watermarked := embedder.ProcessBlock(audio)
+
+	rmsOut := rmsF64(watermarked)
+	t.Logf("Output RMS: %.1f dBFS", 20*math.Log10(rmsOut+1e-12))
+	t.Logf("RMS change: %.2f dB", 20*math.Log10(rmsOut/rmsIn))
+
+	// Detect watermark
+	detector := NewSTFTDetector(key)
+	corrs, offset := detector.Detect(watermarked)
+
+	t.Logf("Detection offset: %d", offset)
+
+	// Check correlations
+	var nPositive, nNegative int
+	var sumAbs float64
+	for _, c := range corrs {
+		sumAbs += math.Abs(c)
+		if c > 0 {
+			nPositive++
+		} else {
+			nNegative++
+		}
+	}
+	avgAbs := sumAbs / float64(payloadBits)
+	t.Logf("Correlations: avg|c|=%.1f, positive=%d, negative=%d", avgAbs, nPositive, nNegative)
+
+	if avgAbs < 1.0 {
+		t.Errorf("avg|c| too low: %.1f (expected >> 1.0)", avgAbs)
+	}
+
+	// Check against known payload
+	payload := KeyToPayload(key)
+	codeword := RSEncode(payload)
+	var expectedBits [payloadBits]int
+	for i := 0; i < payloadBits; i++ {
+		expectedBits[i] = int((codeword[i/8] >> uint(7-(i%8))) & 1)
+	}
+
+	nerr := 0
+	for i := 0; i < payloadBits; i++ {
+		hard := 0
+		if corrs[i] < 0 {
+			hard = 1
+		}
+		if hard != expectedBits[i] {
+			nerr++
+		}
+	}
+	t.Logf("BER: %d/%d (%.1f%%)", nerr, payloadBits, 100*float64(nerr)/float64(payloadBits))
+
+	if nerr > 20 {
+		t.Errorf("BER too high: %d/%d", nerr, payloadBits)
+	}
+
+	// Try RS decode
+	var recv [rsTotalBytes]byte
+	for i := 0; i < payloadBits; i++ {
+		if corrs[i] < 0 {
+			recv[i/8] |= 1 << uint(7-(i%8))
+		}
+	}
+
+	// Try with erasures if needed
+	decoded := false
+	for nErase := 0; nErase <= rsCheckBytes; nErase++ {
+		if nErase == 0 {
+			// Try zero erasures (valid if BER=0)
+			p, ok := RSDecode(recv, nil)
+			if ok {
+				if KeyMatchesPayload(key, p) {
+					t.Logf("Decoded with 0 erasures: MATCH ✓")
+					decoded = true
+					break
+				}
+			}
+			continue
+		}
+		// Erase weakest bytes by |correlation|
+		type bc struct{ idx int; conf float64 }
+		byteConfs := make([]bc, rsTotalBytes)
+		for b := 0; b < rsTotalBytes; b++ {
+			minC := math.Abs(corrs[b*8])
+			for bit := 1; bit < 8; bit++ {
+				c := math.Abs(corrs[b*8+bit])
+				if c < minC {
+					minC = c
+				}
+			}
+			byteConfs[b] = bc{b, minC}
+		}
+		// Sort by confidence (weakest first)
+		for i := 0; i < len(byteConfs); i++ {
+			for j := i + 1; j < len(byteConfs); j++ {
+				if byteConfs[j].conf < byteConfs[i].conf {
+					byteConfs[i], byteConfs[j] = byteConfs[j], byteConfs[i]
+				}
+			}
+		}
+		erasePos := make([]int, nErase)
+		for i := 0; i < nErase; i++ {
+			erasePos[i] = byteConfs[i].idx
+		}
+		// Sort positions
+		for i := 0; i < len(erasePos); i++ {
+			for j := i + 1; j < len(erasePos); j++ {
+				if erasePos[j] < erasePos[i] {
+					erasePos[i], erasePos[j] = erasePos[j], erasePos[i]
+				}
+			}
+		}
+		p, ok := RSDecode(recv, erasePos)
+		if ok {
+			if KeyMatchesPayload(key, p) {
+				t.Logf("Decoded with %d erasures: MATCH ✓", nErase)
+				decoded = true
+				break
+			}
+		}
+	}
+
+	if !decoded {
+		t.Errorf("RS decode FAILED")
+	}
+}
+
+func rmsF64(s []float64) float64 {
+	var acc float64
+	for _, v := range s {
+		acc += v * v
+	}
+	return math.Sqrt(acc / float64(len(s)))
+}
diff --git a/internal/watermark/stft_watermark.go b/internal/watermark/stft_watermark.go
new file mode 100644
index 0000000..9548d15
--- /dev/null
+++ b/internal/watermark/stft_watermark.go
@@ -0,0 +1,413 @@
+// Package watermark implements STFT-domain spread-spectrum audio watermarking
+// based on Kirovski & Malvar (IEEE TSP 2003).
+//
+// Architecture:
+//   - Embedding in STFT magnitude (dB scale) — multiplicative, natural masking
+//   - Block repetition coding (R=5 time frames) — automatic drift tolerance
+//   - Cepstrum filtering at detection — 6 dB carrier noise reduction
+//   - PCC covert channel — PN partitioned into M=128 subsets for 128-bit payload
+//   - Multi-test sync — scan R frame offsets to find alignment
+//
+// Both encoder and decoder operate at 12 kHz (WMRate). The encoder decimates
+// from composite rate (÷19), processes STFT, and upsamples back. The decoder
+// decimates from recording rate (÷16 from 192kHz, ÷4 from 48kHz, etc.).
+// Same STFT parameters → bins align perfectly → no rate mismatch.
+package watermark
+
+import (
+	"crypto/sha256"
+	"math"
+	"math/cmplx"
+
+	"github.com/jan/fm-rds-tx/internal/dsp"
+)
+
+// STFT watermark constants.
+const (
+	WMRate    = 12000 // watermark processing sample rate (Hz)
+	FFTSize   = 512   // STFT frame size (samples at WMRate)
+	FFTHop    = 256   // 50% overlap
+	BinLow    = 9     // ~211 Hz at WMRate/FFTSize
+	BinHigh   = 213   // ~4992 Hz at WMRate/FFTSize
+	NumBins   = BinHigh - BinLow // 204 frequency chips per STFT frame
+	TimeRep   = 5     // block repetition factor (±2 frame drift tolerance)
+	GroupsPerBit = 10 // time groups per data bit
+	WMLevelDB = 1.5   // embedding level (dB)
+
+	TotalGroups   = GroupsPerBit * payloadBits // 10 × 128 = 1280
+	FramesPerWM   = TotalGroups * TimeRep      // 1280 × 5 = 6400
+	SamplesPerWM  = FramesPerWM * FFTHop       // 6400 × 256 = 1638400
+	// Duration at WMRate: 1638400 / 12000 = 136.5 seconds
+)
+
+// STFTEmbedder processes audio blocks and adds the STFT-domain watermark.
+// It works at WMRate (12 kHz). The caller must decimate input to WMRate
+// and upsample output back to the desired rate.
+type STFTEmbedder struct {
+	// PN chip matrix: pnChips[group][bin] ∈ {-1, +1}
+	// group ∈ [0, TotalGroups), bin ∈ [0, NumBins)
+	pnChips [TotalGroups][NumBins]int8
+
+	// Bit assignment: which data bit owns each group (PCC permutation)
+	groupToBit [TotalGroups]int
+
+	// RS-encoded codeword: 128 bits → symbol[bit] = +1 or -1
+	symbols [payloadBits]int8
+
+	// STFT state
+	window   [FFTSize]float64
+	inBuf    [FFTSize]float64 // analysis window buffer
+	outBuf   [FFTSize + FFTHop]float64 // overlap-add output buffer
+	inPos    int // samples written to inBuf
+	outPos   int // samples read from outBuf
+	frameIdx int // STFT frame counter (wraps at FramesPerWM)
+	primed   bool // true after first full frame
+
+	// Level in linear scale: 10^(WMLevelDB/20) - 1 ≈ 0.189 for 1.5 dB
+	levelLinear float64
+}
+
+// NewSTFTEmbedder creates an embedder for the given license key.
+func NewSTFTEmbedder(key string) *STFTEmbedder {
+	e := &STFTEmbedder{}
+
+	// Compute RS-encoded payload
+	var data [rsDataBytes]byte
+	if key != "" {
+		h := sha256.Sum256([]byte(key))
+		copy(data[:], h[:rsDataBytes])
+	}
+	codeword := rsEncode(data)
+
+	// BPSK symbols: bit 0 → +1, bit 1 → -1
+	for i := 0; i < payloadBits; i++ {
+		if (codeword[i/8]>>uint(7-(i%8)))&1 == 1 {
+			e.symbols[i] = -1
+		} else {
+			e.symbols[i] = 1
+		}
+	}
+
+	// Generate PN chips from key-seeded PRNG
+	seed := sha256.Sum256(append([]byte("stft-pn-"), key...))
+	prng := newPRNG(seed[:])
+	for g := 0; g < TotalGroups; g++ {
+		for b := 0; b < NumBins; b++ {
+			if prng.next()&1 == 0 {
+				e.pnChips[g][b] = 1
+			} else {
+				e.pnChips[g][b] = -1
+			}
+		}
+	}
+
+	// PCC permutation: assign groups to bits (interleaved + permuted)
+	// Simple interleaving first, then Fisher-Yates shuffle
+	for g := 0; g < TotalGroups; g++ {
+		e.groupToBit[g] = g % payloadBits
+	}
+	// Permute within each bit's groups using key-seeded PRNG
+	permSeed := sha256.Sum256(append([]byte("stft-perm-"), key...))
+	permRNG := newPRNG(permSeed[:])
+	for i := TotalGroups - 1; i > 0; i-- {
+		j := permRNG.next() % uint32(i+1)
+		e.groupToBit[i], e.groupToBit[j] = e.groupToBit[j], e.groupToBit[i]
+	}
+
+	// Hann window
+	dsp.HannWindow(e.window[:])
+
+	// Embedding level
+	e.levelLinear = math.Pow(10, WMLevelDB/20) - 1 // fractional magnitude change
+
+	return e
+}
+
+// ProcessBlock takes mono audio at WMRate and returns watermarked audio.
+// The input and output lengths are the same. Internally buffers for STFT
+// overlap-add processing. Call with chunks of any size.
+func (e *STFTEmbedder) ProcessBlock(in []float64) []float64 {
+	out := make([]float64, len(in))
+	for i, s := range in {
+		// Feed sample into STFT input buffer
+		e.inBuf[e.inPos] = s
+		e.inPos++
+
+		if e.inPos == FFTSize {
+			// Full frame: process STFT
+			e.processFrame()
+			e.inPos = FFTHop // shift: keep last hop samples for next frame overlap
+			copy(e.inBuf[:FFTHop], e.inBuf[FFTHop:FFTSize])
+		}
+
+		// Read from overlap-add output buffer
+		if e.primed {
+			out[i] = e.outBuf[e.outPos]
+			e.outPos++
+			if e.outPos >= FFTHop {
+				e.outPos = 0
+				// Shift output buffer: move overlap region to start
+				copy(e.outBuf[:FFTSize], e.outBuf[FFTHop:FFTSize+FFTHop])
+				// Zero the new region
+				for j := FFTSize - FFTHop; j < FFTSize+FFTHop; j++ {
+					if j < len(e.outBuf) {
+						e.outBuf[j] = 0
+					}
+				}
+			}
+		} else {
+			out[i] = s // pass-through until first frame is processed
+		}
+	}
+	return out
+}
+
+// processFrame computes one STFT frame: window → FFT → modify magnitudes → IFFT → overlap-add.
+func (e *STFTEmbedder) processFrame() {
+	// Determine which group this frame belongs to
+	wmFrame := e.frameIdx % FramesPerWM
+	groupIdx := wmFrame / TimeRep
+	repIdx := wmFrame % TimeRep
+	centerRep := TimeRep / 2 // only center repetition carries the watermark for detection
+
+	// Apply window and convert to complex
+	var buf [FFTSize]complex128
+	for i := 0; i < FFTSize; i++ {
+		buf[i] = complex(e.inBuf[i]*e.window[i], 0)
+	}
+
+	// Forward FFT
+	dsp.FFT(buf[:])
+
+	// Modify magnitudes in the watermark sub-band
+	// Only modify if this is within a valid group AND at the center repetition
+	// (we embed in ALL repetitions so the watermark energy is present everywhere,
+	// but the PN pattern is the same for all R frames in a group)
+	if groupIdx < TotalGroups {
+		bitIdx := e.groupToBit[groupIdx]
+		dataSign := float64(e.symbols[bitIdx])
+		_ = repIdx
+		_ = centerRep
+
+		for b := 0; b < NumBins; b++ {
+			bin := BinLow + b
+			chip := float64(e.pnChips[groupIdx][b])
+
+			// Modify magnitude: |Y| = |X| × (1 + level × chip × data)
+			// Phase preserved
+			mag := cmplx.Abs(buf[bin])
+			if mag < 1e-10 {
+				continue // skip near-silence bins to avoid division by zero
+			}
+			phase := cmplx.Phase(buf[bin])
+			newMag := mag * (1.0 + e.levelLinear*chip*dataSign)
+			buf[bin] = cmplx.Rect(newMag, phase)
+
+			// Mirror for negative frequencies (conjugate symmetry)
+			if bin > 0 && bin < FFTSize/2 {
+				buf[FFTSize-bin] = cmplx.Conj(buf[bin])
+			}
+		}
+	}
+
+	// Inverse FFT
+	dsp.IFFT(buf[:])
+
+	// Overlap-add to output buffer
+	for i := 0; i < FFTSize; i++ {
+		e.outBuf[i] += real(buf[i])
+	}
+
+	if !e.primed {
+		e.primed = true
+		e.outPos = 0
+	}
+
+	e.frameIdx++
+}
+
+// STFTDetector extracts watermark bits from an audio recording.
+type STFTDetector struct {
+	pnChips    [TotalGroups][NumBins]int8
+	groupToBit [TotalGroups]int
+}
+
+// NewSTFTDetector creates a detector matching the given key's PN sequence.
+func NewSTFTDetector(key string) *STFTDetector {
+	d := &STFTDetector{}
+
+	// Same PN generation as embedder
+	seed := sha256.Sum256(append([]byte("stft-pn-"), key...))
+	prng := newPRNG(seed[:])
+	for g := 0; g < TotalGroups; g++ {
+		for b := 0; b < NumBins; b++ {
+			if prng.next()&1 == 0 {
+				d.pnChips[g][b] = 1
+			} else {
+				d.pnChips[g][b] = -1
+			}
+		}
+	}
+
+	// Same permutation
+	for g := 0; g < TotalGroups; g++ {
+		d.groupToBit[g] = g % payloadBits
+	}
+	permSeed := sha256.Sum256(append([]byte("stft-perm-"), key...))
+	permRNG := newPRNG(permSeed[:])
+	for i := TotalGroups - 1; i > 0; i-- {
+		j := permRNG.next() % uint32(i+1)
+		d.groupToBit[i], d.groupToBit[j] = d.groupToBit[j], d.groupToBit[i]
+	}
+
+	return d
+}
+
+// Detect processes audio at WMRate and returns soft bit decisions.
+// The audio should already be decimated/resampled to WMRate and LPF'd.
+//
+// Multi-test: tries TimeRep frame offsets (the block repetition candidates).
+// Cepstrum filtering is applied to reduce carrier noise.
+//
+// Returns: 128 soft correlation values (sign = bit decision, magnitude = confidence),
+// and the frame offset that gave the best detection metric.
+func (d *STFTDetector) Detect(audio []float64) (corrs [payloadBits]float64, bestOffset int) {
+	// Compute all STFT frames
+	var window [FFTSize]float64
+	dsp.HannWindow(window[:])
+
+	nFrames := (len(audio) - FFTSize) / FFTHop
+	if nFrames < FramesPerWM {
+		// Not enough data for a full watermark cycle — use what we have
+	}
+
+	// Compute STFT magnitudes (dB) for all frames
+	type stftFrame struct {
+		magDB [FFTSize / 2]float64
+	}
+	frames := make([]stftFrame, nFrames)
+
+	for f := 0; f < nFrames; f++ {
+		offset := f * FFTHop
+		var buf [FFTSize]complex128
+		for i := 0; i < FFTSize; i++ {
+			if offset+i < len(audio) {
+				buf[i] = complex(audio[offset+i]*window[i], 0)
+			}
+		}
+		dsp.FFT(buf[:])
+
+		for bin := 0; bin < FFTSize/2; bin++ {
+			mag := cmplx.Abs(buf[bin])
+			if mag < 1e-12 {
+				mag = 1e-12
+			}
+			frames[f].magDB[bin] = 20 * math.Log10(mag)
+		}
+
+		// Cepstrum filtering: remove spectral envelope
+		// DCT of dB magnitudes, zero first N_ceps coefficients, IDCT
+		cepstrumFilter(frames[f].magDB[:], 8)
+	}
+
+	// Multi-test: try each of TimeRep frame offsets within the repetition block
+	bestMetric := -1.0
+	bestOffset = 0
+
+	for startOffset := 0; startOffset < TimeRep; startOffset++ {
+		var testCorrs [payloadBits]float64
+
+		// For each group, use the CENTER frame of the repetition block
+		for g := 0; g < TotalGroups; g++ {
+			bitIdx := d.groupToBit[g]
+			frameInWM := g*TimeRep + startOffset + TimeRep/2
+			if frameInWM >= nFrames {
+				continue
+			}
+
+			// Correlate this frame's magnitudes with the PN chips
+			var corr float64
+			for b := 0; b < NumBins; b++ {
+				bin := BinLow + b
+				corr += frames[frameInWM].magDB[bin] * float64(d.pnChips[g][b])
+			}
+			testCorrs[bitIdx] += corr
+		}
+
+		// Detection metric: sum of squared partial correlations (chi-squared)
+		// From paper equation (10): Q = Σ (corr_m)²
+		var metric float64
+		for _, c := range testCorrs {
+			metric += c * c
+		}
+
+		if metric > bestMetric {
+			bestMetric = metric
+			bestOffset = startOffset
+			corrs = testCorrs
+		}
+	}
+
+	return corrs, bestOffset
+}
+
+// cepstrumFilter removes the spectral envelope from dB magnitudes.
+// It zeros the first nCeps DCT coefficients (the smooth spectral shape).
+// This is Kirovski's "CF" technique: reduces carrier noise by ~6 dB.
+func cepstrumFilter(magDB []float64, nCeps int) {
+	n := len(magDB)
+	if n < nCeps*2 {
+		return
+	}
+
+	// DCT-II (simplified, not optimized)
+	ceps := make([]float64, n)
+	for k := 0; k < n; k++ {
+		var sum float64
+		for i := 0; i < n; i++ {
+			sum += magDB[i] * math.Cos(math.Pi*float64(k)*(float64(i)+0.5)/float64(n))
+		}
+		ceps[k] = sum
+	}
+
+	// Zero low-order cepstral coefficients (spectral envelope)
+	for k := 0; k < nCeps; k++ {
+		ceps[k] = 0
+	}
+
+	// IDCT (inverse DCT-II)
+	for i := 0; i < n; i++ {
+		var sum float64
+		for k := 0; k < n; k++ {
+			w := 1.0
+			if k == 0 {
+				w = 0.5
+			}
+			sum += w * ceps[k] * math.Cos(math.Pi*float64(k)*(float64(i)+0.5)/float64(n))
+		}
+		magDB[i] = sum * 2.0 / float64(n)
+	}
+}
+
+// Simple xorshift32 PRNG for deterministic chip generation.
+type simplePRNG struct {
+	state uint32
+}
+
+func newPRNG(seed []byte) *simplePRNG {
+	var s uint32
+	for i, b := range seed {
+		s ^= uint32(b) << (uint(i%4) * 8)
+	}
+	if s == 0 {
+		s = 1
+	}
+	return &simplePRNG{state: s}
+}
+
+func (p *simplePRNG) next() uint32 {
+	p.state ^= p.state << 13
+	p.state ^= p.state >> 17
+	p.state ^= p.state << 5
+	return p.state
+}
diff --git a/internal/watermark/watermark.go b/internal/watermark/watermark.go
index cbf0df7..571997c 100644
--- a/internal/watermark/watermark.go
+++ b/internal/watermark/watermark.go
@@ -395,7 +395,17 @@ func RSEncode(data [rsDataBytes]byte) [rsTotalBytes]byte {
 	return rsEncode(data)
 }
 
-// Constants exported for the recovery tool.
+// PNChipAt returns the PN chip value at group g, bin b.
+func (d *STFTDetector) PNChipAt(g, b int) int8 {
+	return d.pnChips[g][b]
+}
+
+// GroupBit returns the data bit index for group g.
+func (d *STFTDetector) GroupBit(g int) int {
+	return d.groupToBit[g]
+}
+
+// Constants exported for the recovery tool and legacy tools.
 const (
 	PnChips       = pnChips
 	PayloadBits   = payloadBits