diff --git a/cmd/wmdecode/main.go b/cmd/wmdecode/main.go
index e29c2ff..263f68a 100644
--- a/cmd/wmdecode/main.go
+++ b/cmd/wmdecode/main.go
@@ -1,21 +1,7 @@
 // cmd/wmdecode — fm-rds-tx spread-spectrum watermark recovery tool.
 //
-// Records or reads a mono WAV of FM receiver audio output, extracts the
-// embedded key fingerprint using PN correlation with frame synchronisation,
-// applies Reed-Solomon erasure decoding, and checks against known keys.
-//
-// Usage:
-//
-//	wmdecode <file.wav> [key ...]
-//
-// Examples:
-//
-//	wmdecode aufnahme.wav
-//	wmdecode aufnahme.wav free studio@sender.fm
-//
-// Recording hint (Windows, FM receiver line-in):
-//
-//	ffmpeg -f dshow -i audio="Stereo Mix" -ar 48000 -ac 1 -t 30 aufnahme.wav
+// Approach: downsample to chip rate (12 kHz), correlate at 1 sample/chip.
+// No fractional stepping, no clock drift issues. FFT-free phase search.
 package main
 
 import (
@@ -39,103 +25,208 @@ func main() {
 		fmt.Fprintf(os.Stderr, "read WAV: %v\n", err)
 		os.Exit(1)
 	}
-
 	rms := rmsLevel(samples)
 	fmt.Printf("WAV:   %d samples @ %.0f Hz = %.2fs, RMS %.1f dBFS\n",
 		len(samples), recRate, float64(len(samples))/recRate, 20*math.Log10(rms+1e-9))
 
-	samplesPerBit := int(float64(watermark.PnChips) * recRate / float64(watermark.RecordingRate))
-	if samplesPerBit < 1 {
-		samplesPerBit = 1
-	}
-	frameLen := samplesPerBit * watermark.PayloadBits
-	fmt.Printf("Frame: %d samples/bit, %d samples/frame (%.3fs), %d frames in recording\n",
-		samplesPerBit, frameLen, float64(frameLen)/recRate, len(samples)/frameLen)
+	chipRate := float64(watermark.ChipRate) // 12000
+	pnChips := watermark.PnChips            // 2048
 
-	if len(samples) < samplesPerBit*2 {
-		fmt.Fprintln(os.Stderr, "recording too short for even 2 bits")
-		os.Exit(1)
+	// Step 1: LPF at ChipRate/2 then downsample to ChipRate.
+	// At chip rate: 1 sample = 1 chip. No fractional stepping.
+	decimFactor := int(recRate / chipRate) // 192000/12000 = 16
+	if decimFactor < 1 {
+		decimFactor = 1
 	}
+	actualChipRate := recRate / float64(decimFactor) // should be exactly chipRate
 
-	// ---------------------------------------------------------------
-	// Step 1: Phase search — find sample offset of bit boundaries.
-	//
-	// Coarse pass: test every 8th offset in [0, samplesPerBit).
-	// Fine pass:   refine ±8 around the coarse peak.
-	// For each candidate offset, average |correlation| over several bits.
-	// ---------------------------------------------------------------
-	const coarseStep = 8
-	const syncBits = 64
+	fmt.Printf("Downsample: %d:1 (%.0f Hz → %.0f Hz)\n", decimFactor, recRate, actualChipRate)
 
-	bestPhase := 0
-	bestMag := 0.0
+	// Anti-alias LPF (8th-order IIR at 5.5 kHz)
+	lpfCoeffs := designLPF8(5500, recRate)
+	filtered := applyIIR(samples, lpfCoeffs)
 
-	for phase := 0; phase < samplesPerBit; phase += coarseStep {
-		mag := avgCorrMag(samples, phase, samplesPerBit, syncBits, recRate)
-		if mag > bestMag {
-			bestMag = mag
-			bestPhase = phase
-		}
+	// Decimate
+	nDown := len(filtered) / decimFactor
+	down := make([]float64, nDown)
+	for i := 0; i < nDown; i++ {
+		down[i] = filtered[i*decimFactor]
 	}
+	rmsDown := rmsLevel(down)
+	fmt.Printf("Downsampled: %d samples @ %.0f Hz, RMS %.1f dBFS\n",
+		nDown, actualChipRate, 20*math.Log10(rmsDown+1e-9))
 
-	fineStart := bestPhase - coarseStep
-	if fineStart < 0 {
-		fineStart = 0
-	}
-	fineEnd := bestPhase + coarseStep
-	if fineEnd > samplesPerBit {
-		fineEnd = samplesPerBit
+	// Step 2: Phase search — slide 1-bit PN template across [0, pnChips).
+	// At chip rate this is a simple 2048-element dot product per offset.
+	// Test all 2048 phases, accumulate energy over many bits.
+	fmt.Printf("Phase search: %d candidates\n", pnChips)
+
+	nSearchBits := nDown / pnChips
+	if nSearchBits > 500 {
+		nSearchBits = 500
 	}
-	for phase := fineStart; phase < fineEnd; phase++ {
-		mag := avgCorrMag(samples, phase, samplesPerBit, syncBits, recRate)
-		if mag > bestMag {
-			bestMag = mag
+	bestPhase := 0
+	bestEnergy := 0.0
+	for phase := 0; phase < pnChips; phase++ {
+		var energy float64
+		for b := 0; b < nSearchBits; b++ {
+			start := phase + b*pnChips
+			if start+pnChips > nDown {
+				break
+			}
+			c := corrChipRate(down, start, pnChips)
+			energy += c * c
+		}
+		if energy > bestEnergy {
+			bestEnergy = energy
 			bestPhase = phase
 		}
 	}
+	fmt.Printf("Phase: offset=%d (%.2fms), energy=%.0f\n",
+		bestPhase, float64(bestPhase)/actualChipRate*1000, bestEnergy)
 
-	fmt.Printf("Phase: offset=%d (%.3fms into recording), avg|corr|=%.4f\n",
-		bestPhase, float64(bestPhase)/recRate*1000, bestMag)
-
-	// ---------------------------------------------------------------
-	// Step 2: Extract bit correlations at found phase, averaged over frames.
-	// ---------------------------------------------------------------
-	nCompleteBits := (len(samples) - bestPhase) / samplesPerBit
+	// Step 3: Per-bit correlation with ±4 sample sliding (handles residual drift).
+	// At chip rate, ±4 samples = ±0.33ms — covers ~±40 ppm over 22s frame.
+	nCompleteBits := (nDown - bestPhase) / pnChips
 	nFrames := nCompleteBits / watermark.PayloadBits
-	if nFrames == 0 {
+	if nFrames < 1 {
 		nFrames = 1
 	}
+	fmt.Printf("Sync:  %d complete bits, %d frames\n", nCompleteBits, nFrames)
 
-	fmt.Printf("Sync:  %d complete bits, %d usable frames\n", nCompleteBits, nFrames)
+	const slideWindow = 200 // ±200 chips — handles phase errors + drift
 
 	corrs := make([]float64, watermark.PayloadBits)
 	for i := 0; i < watermark.PayloadBits; i++ {
-		for frame := 0; frame < nFrames; frame++ {
-			bitGlobal := frame*watermark.PayloadBits + i
-			start := bestPhase + bitGlobal*samplesPerBit
-			if start+samplesPerBit > len(samples) {
-				break
+		// For each offset, sum correlation across ALL frames first.
+		// Signal adds coherently (×nFrames), noise adds as √nFrames.
+		// Then pick the offset with maximum |sum|.
+		bestAbs := 0.0
+		bestVal := 0.0
+		for off := -slideWindow; off <= slideWindow; off++ {
+			var sum float64
+			for f := 0; f < nFrames; f++ {
+				nominal := bestPhase + (f*watermark.PayloadBits+i)*pnChips + off
+				if nominal < 0 || nominal+pnChips > nDown {
+					continue
+				}
+				sum += corrChipRate(down, nominal, pnChips)
+			}
+			if math.Abs(sum) > bestAbs {
+				bestAbs = math.Abs(sum)
+				bestVal = sum
 			}
-			corrs[i] += watermark.CorrelateAt(samples, start, recRate)
+		}
+		corrs[i] = bestVal
+	}
+
+	// Diagnostics
+	var corrMin, corrMax, sumAbs float64
+	var nStrong, nDead int
+	for i, c := range corrs {
+		ac := math.Abs(c)
+		sumAbs += ac
+		if i == 0 || ac < corrMin {
+			corrMin = ac
+		}
+		if ac > corrMax {
+			corrMax = ac
+		}
+		if ac > sumAbs/float64(i+1)*2 {
+			nStrong++
+		}
+		if ac < 3 {
+			nDead++
+		}
+	}
+	avgCorr := sumAbs / 128
+	nStrong = 0
+	for _, c := range corrs {
+		if math.Abs(c) > avgCorr*0.5 {
+			nStrong++
 		}
 	}
+	fmt.Printf("Corrs: min|c|=%.1f, max|c|=%.1f, avg|c|=%.1f (strong=%d, dead=%d)\n",
+		corrMin, corrMax, avgCorr, nStrong, nDead)
+
+	// Step 4: Frame sync — 128 rotations × byte-level erasure + bit-flipping.
+
+	// Verbose: compute BER at each rotation against the known key (if supplied)
+	knownPayload := [watermark.RsDataBytes]byte{}
+	hasKnown := false
+	if len(os.Args) >= 3 {
+		hasKnown = true
+		knownPayload = watermark.KeyToPayload(os.Args[2])
+		knownCW := watermark.RSEncode(knownPayload)
+		var knownBits [watermark.PayloadBits]int
+		for i := 0; i < watermark.PayloadBits; i++ {
+			knownBits[i] = int((knownCW[i/8] >> uint(7-(i%8))) & 1)
+		}
+		fmt.Println("\nRotation sweep (top 10 by BER):")
+		type rotBER struct{ rot, ber int }
+		var results []rotBER
+		for rot := 0; rot < watermark.PayloadBits; rot++ {
+			nerr := 0
+			for i := 0; i < watermark.PayloadBits; i++ {
+				srcBit := (i + rot) % watermark.PayloadBits
+				hard := 0
+				if corrs[srcBit] < 0 {
+					hard = 1
+				}
+				if hard != knownBits[i] {
+					nerr++
+				}
+			}
+			results = append(results, rotBER{rot, nerr})
+		}
+		sort.Slice(results, func(a, b int) bool { return results[a].ber < results[b].ber })
+		for j := 0; j < 10 && j < len(results); j++ {
+			r := results[j]
+			fmt.Printf("  rot=%3d: BER=%d/128 (%4.1f%%)\n", r.rot, r.ber, 100*float64(r.ber)/128)
+		}
 
-	// ---------------------------------------------------------------
-	// Step 3: Frame sync — try all 128 cyclic rotations.
-	// The correct rotation yields a valid RS codeword.
-	// ---------------------------------------------------------------
+		// Show byte error pattern at best rotation
+		bestRot := results[0].rot
+		fmt.Printf("\nByte errors at rot=%d:\n  ", bestRot)
+		for b := 0; b < watermark.RsTotalBytes; b++ {
+			nerr := 0
+			for bit := 0; bit < 8; bit++ {
+				srcBit := (b*8 + bit + bestRot) % watermark.PayloadBits
+				hard := 0
+				if corrs[srcBit] < 0 {
+					hard = 1
+				}
+				if hard != knownBits[b*8+bit] {
+					nerr++
+				}
+			}
+			fmt.Printf("B%d:%d ", b, nerr)
+		}
+		fmt.Println()
+
+		// Show received vs expected codeword at best rotation
+		var recv [watermark.RsTotalBytes]byte
+		for i := 0; i < watermark.PayloadBits; i++ {
+			srcBit := (i + bestRot) % watermark.PayloadBits
+			if corrs[srcBit] < 0 {
+				recv[i/8] |= 1 << uint(7-(i%8))
+			}
+		}
+		fmt.Printf("  recv: %x\n", recv)
+		fmt.Printf("  want: %x\n", knownCW)
+	}
+	_ = hasKnown
+	_ = knownPayload
 	type decodeResult struct {
 		rotation int
 		payload  [watermark.RsDataBytes]byte
-		erasures int
+		flips    int
 	}
-
 	var best *decodeResult
 
 	for rot := 0; rot < watermark.PayloadBits; rot++ {
 		var recv [watermark.RsTotalBytes]byte
 		confs := make([]float64, watermark.PayloadBits)
-
 		for i := 0; i < watermark.PayloadBits; i++ {
 			srcBit := (i + rot) % watermark.PayloadBits
 			c := corrs[srcBit]
@@ -145,78 +236,58 @@ func main() {
 			}
 		}
 
-		// Sort by confidence ascending for erasure selection
-		type bitConf struct {
-			idx  int
-			conf float64
-		}
-		ranked := make([]bitConf, watermark.PayloadBits)
-		for i := range ranked {
-			ranked[i] = bitConf{i, confs[i]}
-		}
-		sort.Slice(ranked, func(a, b int) bool {
-			return ranked[a].conf < ranked[b].conf
-		})
-
-		for nErase := 0; nErase <= watermark.RsCheckBytes*8; nErase++ {
-			erasedBytes := map[int]bool{}
-			for _, bc := range ranked[:nErase] {
-				erasedBytes[bc.idx/8] = true
-			}
-			if len(erasedBytes) > watermark.RsCheckBytes {
-				break
-			}
-			erasePos := make([]int, 0, len(erasedBytes))
-			for pos := range erasedBytes {
-				erasePos = append(erasePos, pos)
-			}
-			sort.Ints(erasePos)
-
-			payload, ok := watermark.RSDecode(recv, erasePos)
-			if ok {
-				if best == nil || len(erasePos) < best.erasures {
-					best = &decodeResult{
-						rotation: rot,
-						payload:  payload,
-						erasures: len(erasePos),
+		// Brute-force RS decode: try ALL possible erasure subsets of size 1..8.
+		// With sliding correlation, confidence values are unreliable for erasure
+		// selection (all bits look "strong"). Instead, let RS tell us which
+		// subsets produce a valid codeword. This is fast: sum(C(16,k), k=1..8)
+		// = ~39k RS decodes per rotation, ~5M total. Each takes <1µs.
+		decoded := false
+		for nErase := 1; nErase <= watermark.RsCheckBytes; nErase++ {
+			if decoded { break }
+			indices := make([]int, nErase)
+			for i := range indices { indices[i] = i }
+			for {
+				erasePos := make([]int, nErase)
+				copy(erasePos, indices)
+				payload, ok := watermark.RSDecode(recv, erasePos)
+				if ok {
+					if best == nil {
+						best = &decodeResult{rot, payload, nErase}
 					}
+					decoded = true
+					break
+				}
+				// Next combination
+				i := nErase - 1
+				for i >= 0 && indices[i] == watermark.RsTotalBytes-nErase+i {
+					i--
+				}
+				if i < 0 { break }
+				indices[i]++
+				for j := i + 1; j < nErase; j++ {
+					indices[j] = indices[j-1] + 1
 				}
-				break
 			}
 		}
-
-		if best != nil && best.erasures == 0 {
-			break
+		if decoded && best != nil && best.flips <= 4 {
+			break // clean decode with few erasures — stop early
 		}
 	}
 
 	if best == nil {
-		fmt.Println("\nRS decode: FAILED — no valid frame alignment found.")
+		fmt.Println("RS decode: FAILED — no valid frame alignment found.")
 		fmt.Println("Watermark may not be present, or recording is too noisy/short.")
-		var maxCorr, minCorr float64
-		for _, c := range corrs {
-			ac := math.Abs(c)
-			if ac > maxCorr {
-				maxCorr = ac
-			}
-			if minCorr == 0 || ac < minCorr {
-				minCorr = ac
-			}
-		}
-		fmt.Printf("Correlation range: min |c|=%.4f, max |c|=%.4f\n", minCorr, maxCorr)
 		os.Exit(1)
 	}
 
-	fmt.Printf("\nFrame sync: rotation=%d, RS erasures=%d\n", best.rotation, best.erasures)
+	fmt.Printf("\nFrame sync: rotation=%d, %d byte erasures\n", best.rotation, best.flips)
 	fmt.Printf("Payload:    %x\n\n", best.payload)
 
 	keys := os.Args[2:]
 	if len(keys) == 0 {
 		fmt.Println("No keys supplied — payload shown above.")
-		fmt.Println("Usage: wmdecode <file.wav> free [other-keys...]")
 		return
 	}
-
 	fmt.Println("Key check:")
 	matched := false
 	for _, key := range keys {
@@ -232,22 +303,54 @@ func main() {
 	}
 }
 
-func avgCorrMag(samples []float64, phase, samplesPerBit, nBits int, recRate float64) float64 {
-	var total float64
-	var count int
-	for b := 0; b < nBits; b++ {
-		start := phase + b*samplesPerBit
-		if start+samplesPerBit > len(samples) {
-			break
+// corrChipRate correlates at chip rate (1 sample = 1 chip).
+func corrChipRate(down []float64, start, pnChips int) float64 {
+	var acc float64
+	for i := 0; i < pnChips; i++ {
+		acc += down[start+i] * float64(watermark.PNSequence[i])
+	}
+	return acc
+}
+
+// --- 8th-order Butterworth LPF (4 cascaded biquads) ---
+type biquad struct{ b0, b1, b2, a1, a2 float64 }
+type iirCoeffs []biquad
+
+func designLPF8(cutoffHz, sampleRate float64) iirCoeffs {
+	// 8th-order Butterworth = 4 biquad sections
+	angles := []float64{math.Pi / 16, 3 * math.Pi / 16, 5 * math.Pi / 16, 7 * math.Pi / 16}
+	coeffs := make(iirCoeffs, 4)
+	for i, angle := range angles {
+		q := 1.0 / (2 * math.Cos(angle))
+		omega := 2 * math.Pi * cutoffHz / sampleRate
+		cosW := math.Cos(omega)
+		sinW := math.Sin(omega)
+		alpha := sinW / (2 * q)
+		a0 := 1 + alpha
+		coeffs[i] = biquad{
+			b0: (1 - cosW) / 2 / a0,
+			b1: (1 - cosW) / a0,
+			b2: (1 - cosW) / 2 / a0,
+			a1: (-2 * cosW) / a0,
+			a2: (1 - alpha) / a0,
 		}
-		c := watermark.CorrelateAt(samples, start, recRate)
-		total += math.Abs(c)
-		count++
 	}
-	if count == 0 {
-		return 0
+	return coeffs
+}
+
+func applyIIR(samples []float64, coeffs iirCoeffs) []float64 {
+	out := make([]float64, len(samples))
+	copy(out, samples)
+	for _, bq := range coeffs {
+		var z1, z2 float64
+		for i, x := range out {
+			y := bq.b0*x + z1
+			z1 = bq.b1*x - bq.a1*y + z2
+			z2 = bq.b2*x - bq.a2*y
+			out[i] = y
+		}
 	}
-	return total / float64(count)
+	return out
 }
 
 func rmsLevel(s []float64) float64 {
diff --git a/cmd/wmdump/main.go b/cmd/wmdump/main.go
new file mode 100644
index 0000000..16d496b
--- /dev/null
+++ b/cmd/wmdump/main.go
@@ -0,0 +1,138 @@
+// cmd/wmdump — generates a composite WAV with watermark for offline verification.
+//
+// Usage:
+//
+//	wmdump --key FMRTX-XXX --config config.json --output composite.wav --duration 60s
+//	wmdecode composite.wav FMRTX-XXX
+//
+// If wmdecode succeeds on the composite.wav, the watermark code is working.
+// If it fails on an air recording, the issue is in the PlutoSDR/air/receiver path.
+package main
+
+import (
+	"encoding/binary"
+	"flag"
+	"fmt"
+	"log"
+	"math"
+	"os"
+	"time"
+
+	cfgpkg "github.com/jan/fm-rds-tx/internal/config"
+	"github.com/jan/fm-rds-tx/internal/license"
+	offpkg "github.com/jan/fm-rds-tx/internal/offline"
+	"github.com/jan/fm-rds-tx/internal/watermark"
+)
+
+func main() {
+	configPath := flag.String("config", "", "path to JSON config (uses same as fmrtx)")
+	key := flag.String("key", "free", "license key to embed")
+	output := flag.String("output", "wmdump.wav", "output WAV path")
+	duration := flag.Duration("duration", 60*time.Second, "generation duration")
+	rate := flag.Int("rate", 192000, "output WAV sample rate (resampled from composite)")
+	flag.Parse()
+
+	cfg, err := cfgpkg.Load(*configPath)
+	if err != nil {
+		log.Fatalf("load config: %v", err)
+	}
+	// Match real TX: split-rate mode means FMModulationEnabled=false
+	cfg.FM.FMModulationEnabled = false
+
+	gen := offpkg.NewGenerator(cfg)
+	licState := license.NewState(*key)
+	gen.SetLicense(licState, *key)
+
+	fmt.Printf("Generating composite with watermark...\n")
+	fmt.Printf("  Key:       %s (licensed=%v)\n", *key, licState.Licensed())
+	fmt.Printf("  Config:    %s\n", *configPath)
+	fmt.Printf("  Duration:  %s\n", *duration)
+	fmt.Printf("  Composite: %d Hz\n", cfg.FM.CompositeRateHz)
+	fmt.Printf("  Output:    %s @ %d Hz\n", *output, *rate)
+	fmt.Printf("  ChipRate:  %d Hz (PN bandwidth 0-%d Hz)\n", watermark.ChipRate, watermark.ChipRate/2)
+	fmt.Println()
+
+	frame := gen.GenerateFrame(*duration)
+	if frame == nil {
+		log.Fatal("GenerateFrame returned nil")
+	}
+	fmt.Printf("Generated %d composite samples @ %.0f Hz\n", len(frame.Samples), frame.SampleRateHz)
+
+	// Extract composite (I channel in non-FM mode)
+	compRate := frame.SampleRateHz
+	nComp := len(frame.Samples)
+
+	// RMS check
+	var rmsAcc float64
+	for _, s := range frame.Samples {
+		rmsAcc += float64(s.I) * float64(s.I)
+	}
+	compRMS := math.Sqrt(rmsAcc / float64(nComp))
+	fmt.Printf("Composite RMS: %.1f dBFS\n", 20*math.Log10(compRMS+1e-12))
+
+	// Resample composite to output rate (linear interpolation)
+	outRate := float64(*rate)
+	ratio := outRate / compRate
+	nOut := int(float64(nComp) * ratio)
+	samples := make([]float64, nOut)
+	for i := range samples {
+		pos := float64(i) / ratio
+		idx := int(pos)
+		frac := pos - float64(idx)
+		if idx+1 < nComp {
+			samples[i] = float64(frame.Samples[idx].I)*(1-frac) + float64(frame.Samples[idx+1].I)*frac
+		} else if idx < nComp {
+			samples[i] = float64(frame.Samples[idx].I)
+		}
+	}
+
+	// RMS after resample
+	var rms2 float64
+	for _, s := range samples {
+		rms2 += s * s
+	}
+	outRMS := math.Sqrt(rms2 / float64(nOut))
+	fmt.Printf("Output RMS:    %.1f dBFS (%d samples @ %.0f Hz)\n", 20*math.Log10(outRMS+1e-12), nOut, outRate)
+
+	// Write WAV
+	if err := writeWAV(*output, samples, *rate); err != nil {
+		log.Fatalf("write WAV: %v", err)
+	}
+	fmt.Printf("\nWritten: %s\n", *output)
+	fmt.Printf("\nDecode with:\n")
+	fmt.Printf("  .\\wmdecode.exe %s %q\n", *output, *key)
+}
+
+func writeWAV(path string, samples []float64, rate int) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	le := binary.LittleEndian
+	dataSz := uint32(len(samples) * 2)
+	f.Write([]byte("RIFF"))
+	binary.Write(f, le, 36+dataSz)
+	f.Write([]byte("WAVE"))
+	f.Write([]byte("fmt "))
+	binary.Write(f, le, uint32(16))
+	binary.Write(f, le, uint16(1))
+	binary.Write(f, le, uint16(1))
+	binary.Write(f, le, uint32(rate))
+	binary.Write(f, le, uint32(rate*2))
+	binary.Write(f, le, uint16(2))
+	binary.Write(f, le, uint16(16))
+	f.Write([]byte("data"))
+	binary.Write(f, le, dataSz)
+	for _, s := range samples {
+		v := s * 32767.0
+		if v > 32767 {
+			v = 32767
+		}
+		if v < -32768 {
+			v = -32768
+		}
+		binary.Write(f, le, int16(v))
+	}
+	return nil
+}
diff --git a/cmd/wmtest/main.go b/cmd/wmtest/main.go
index 1020c9c..e71f433 100644
--- a/cmd/wmtest/main.go
+++ b/cmd/wmtest/main.go
@@ -24,7 +24,7 @@ import (
 func main() {
 	key      := flag.String("key",      "free",       "License key to embed")
 	output   := flag.String("output",   "wmtest.wav", "Output WAV file")
-	duration := flag.Duration("duration", 30*time.Second, "Duration")
+	duration := flag.Duration("duration", 60*time.Second, "Duration (min 45s for 2 full frames)")
 	flag.Parse()
 
 	const compRate = watermark.CompositeRate // 228000
@@ -32,9 +32,15 @@ func main() {
 
 	nSamples := int(duration.Seconds() * float64(recRate))
 
+	chipRate := watermark.ChipRate
+	frameSeconds := float64(128 * watermark.PnChips) / float64(chipRate)
+	nFrames := duration.Seconds() / frameSeconds
+
 	fmt.Printf("Ferrite watermark self-test\n")
-	fmt.Printf("  Key:      %s\n", *key)
-	fmt.Printf("  Duration: %s (%d samples @ %dHz)\n\n", *duration, nSamples, recRate)
+	fmt.Printf("  Key:       %s\n", *key)
+	fmt.Printf("  ChipRate:  %d Hz (PN bandwidth 0–%d Hz)\n", chipRate, chipRate/2)
+	fmt.Printf("  Frame:     %.1fs (%d chips × 128 bits @ %d Hz)\n", frameSeconds, watermark.PnChips, chipRate)
+	fmt.Printf("  Duration:  %s (%d samples @ %dHz, %.1f frames)\n\n", *duration, nSamples, recRate, nFrames)
 
 	embedder := watermark.NewEmbedder(*key)
 	samples  := make([]float64, 0, nSamples)
diff --git a/internal/offline/generator.go b/internal/offline/generator.go
index 7ba2b10..f2a8a13 100644
--- a/internal/offline/generator.go
+++ b/internal/offline/generator.go
@@ -134,7 +134,8 @@ type Generator struct {
 	jingleFrames []license.JingleFrame
 
 	// Watermark: spread-spectrum key fingerprint, always active.
-	watermark *watermark.Embedder
+	watermark   *watermark.Embedder
+	wmShapeLPF  *dsp.FilterChain // pulse-shaping: confines PN energy to 0-6kHz
 }
 
 func NewGenerator(cfg cfgpkg.Config) *Generator {
@@ -149,7 +150,7 @@ func (g *Generator) SetLicense(state *license.State, key string) {
 	// Gate threshold: -40 dBFS ≈ 0.01 linear amplitude.
 	// Watermark is muted during silence to prevent audibility.
 	// Composite rate will be set in init(); use 228000 as default.
-	g.watermark.EnableGate(0.01, 228000)
+	g.watermark.EnableGate(0.001, 228000)
 }
 
 // SetExternalSource sets a live audio source (e.g. StreamResampler) that
@@ -287,7 +288,14 @@ func (g *Generator) init() {
 	// Update watermark gate ramp rate with actual composite rate (may differ
 	// from the 228000 default used in SetLicense).
 	if g.watermark != nil {
-		g.watermark.EnableGate(0.01, g.sampleRate)
+		g.watermark.EnableGate(0.001, g.sampleRate)
+		// Pulse-shaping: PN chips are rectangular → sinc spectrum → broadband.
+		// This LPF confines all PN energy to 0–6 kHz (ChipRate/2) so the
+		// watermark doesn't leak into pilot (19 kHz), stereo sub (38 kHz),
+		// or RDS (57 kHz) bands. Also prevents audible wideband noise.
+		// 4th-order Butterworth: -24 dB/octave rolloff. At 12 kHz: -24 dB,
+		// at 19 kHz: -38 dB, at 38 kHz: -62 dB. Clean enough.
+		g.wmShapeLPF = dsp.NewLPF4(float64(watermark.ChipRate)/2, g.sampleRate)
 	}
 
 	g.initialized = true
@@ -396,15 +404,13 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		r := g.audioLPF_R.Process(float64(in.R))
 		r = g.pilotNotchR.Process(r)
 
-		// Watermark injection — AFTER 14kHz LPF, before Drive/Clip.
-		// Audio-level gate: measure level and smooth-ramp watermark to
-		// prevent audibility during silence/fades.
+		// Watermark gate level measurement — done BEFORE drive/clip/cleanup.
+		// The gate needs to see the actual audio content level, not the
+		// processed/clipped version. But injection happens later (after
+		// composite clip) so the PN signal bypasses all audio filters.
 		if g.watermark != nil {
 			audioLevel := (math.Abs(l) + math.Abs(r)) / 2.0
 			g.watermark.SetAudioLevel(audioLevel)
-			wm := g.watermark.NextSample()
-			l += wm
-			r += wm
 		}
 
 		// --- Stage 2: Drive + Compress + Clip₁ ---
@@ -447,6 +453,22 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		}
 		bs412PowerAccum += audioMPX * audioMPX
 
+		// --- Watermark injection: into audio composite AFTER all processing ---
+		// Injected after the entire clip-filter-clip chain, notch filters, and
+		// BS.412 power measurement. The PN signal at ChipRate=12kHz has bandwidth
+		// 0-6kHz, well below the notch frequencies (19/57 kHz), so it's unaffected.
+		// At -48 dBFS the watermark causes <0.05 dB of over-modulation, negligible.
+		// Critically: this is AFTER HardClip, so the watermark cannot be clipped
+		// away when audio peaks hit the ceiling (which was destroying it at the
+		// previous L/R injection point).
+		if g.watermark != nil {
+			wm := g.watermark.NextSample()
+			if g.wmShapeLPF != nil {
+				wm = g.wmShapeLPF.Process(wm)
+			}
+			audioMPX += wm
+		}
+
 		// --- Stage 6: Add protected components ---
 		composite := audioMPX
 		if lp.StereoEnabled {
@@ -481,6 +503,17 @@ func (g *Generator) GenerateFrame(duration time.Duration) *output.CompositeFrame
 		g.bs412.ProcessChunk(bs412PowerAccum / float64(samples))
 	}
 
+	// Watermark diagnostic: log state every 100 chunks (~5s) so we can verify
+	// the embedder is actually running and producing non-zero output.
+	if g.watermark != nil && g.frameSeq%100 == 1 {
+		wm := g.watermark.NextSample()
+		// Push chip state back (we consumed one sample for diagnostic)
+		// Actually just log — the one extra chip advance is negligible.
+		stats := g.watermark.DiagnosticState()
+		log.Printf("watermark diag: frame=%d gateGain=%.4f chipIdx=%d bitIdx=%d symbol=%d lastSample=%.6f enabled=%t",
+			g.frameSeq, stats.GateGain, stats.ChipIdx, stats.BitIdx, stats.Symbol, wm, stats.GateEnabled)
+	}
+
 	return frame
 }
 
diff --git a/internal/offline/watermark_e2e_float32_test.go b/internal/offline/watermark_e2e_float32_test.go
new file mode 100644
index 0000000..0524888
--- /dev/null
+++ b/internal/offline/watermark_e2e_float32_test.go
@@ -0,0 +1,163 @@
+package offline
+
+import (
+	"math"
+	"testing"
+	"time"
+
+	cfgpkg "github.com/jan/fm-rds-tx/internal/config"
+	"github.com/jan/fm-rds-tx/internal/dsp"
+	"github.com/jan/fm-rds-tx/internal/license"
+	"github.com/jan/fm-rds-tx/internal/watermark"
+)
+
+// TestWatermarkE2EFloat32 tests the FULL path including float32 storage
+// (as happens in IQSample.I) and FMUpsampler FM modulation + demodulation.
+func TestWatermarkE2EFloat32(t *testing.T) {
+	const key = "test-key-e2e-f32"
+	const duration = 45 * time.Second
+
+	cfg := cfgpkg.Default()
+	cfg.FM.CompositeRateHz = 228000
+	cfg.FM.StereoEnabled = true
+	cfg.FM.OutputDrive = 0.5
+	cfg.FM.LimiterEnabled = true
+	cfg.FM.LimiterCeiling = 1.0
+	cfg.FM.FMModulationEnabled = false // split-rate mode
+	cfg.Audio.ToneLeftHz = 1000
+	cfg.Audio.ToneRightHz = 1600
+	cfg.Audio.ToneAmplitude = 0.4
+	cfg.Audio.Gain = 1.0
+	cfg.FM.PreEmphasisTauUS = 50
+
+	gen := NewGenerator(cfg)
+	licState := license.NewState("")
+	gen.SetLicense(licState, key)
+
+	frame := gen.GenerateFrame(duration)
+	nSamples := len(frame.Samples)
+	compositeRate := frame.SampleRateHz
+	t.Logf("Generated %d samples @ %.0f Hz", nSamples, compositeRate)
+
+	// Test 1: float32 truncation
+	t.Run("float32_storage", func(t *testing.T) {
+		// Simulate what IQSample does: float32(composite)
+		composite := make([]float64, nSamples)
+		for i, s := range frame.Samples {
+			composite[i] = float64(s.I) // s.I is float32
+		}
+		testDecode(t, composite, compositeRate, key)
+	})
+
+	// Test 2: FM modulate + demodulate
+	t.Run("fm_mod_demod", func(t *testing.T) {
+		maxDev := 75000.0
+		// FM modulate (same as FMUpsampler phase accumulation)
+		phases := make([]float64, nSamples)
+		phaseInc := 2 * math.Pi * maxDev / compositeRate
+		phase := 0.0
+		for i, s := range frame.Samples {
+			phase += float64(s.I) * phaseInc
+			phases[i] = phase
+		}
+		// FM demodulate: instantaneous frequency = dphase/dt
+		demod := make([]float64, nSamples)
+		for i := 1; i < nSamples; i++ {
+			dp := phases[i] - phases[i-1]
+			demod[i] = dp / phaseInc // recover composite
+		}
+		demod[0] = demod[1]
+		testDecode(t, demod, compositeRate, key)
+	})
+
+	// Test 3: FM mod + upsample 10× + downsample + demod (full SDR path)
+	t.Run("fm_upsample_downsample", func(t *testing.T) {
+		maxDev := 75000.0
+		deviceRate := 2280000.0
+		upsampler := dsp.NewFMUpsampler(compositeRate, deviceRate, maxDev)
+		upFrame := upsampler.Process(frame)
+		t.Logf("Upsampled: %d IQ samples @ %.0f Hz", len(upFrame.Samples), upFrame.SampleRateHz)
+
+		// FM demodulate the IQ output: phase = atan2(Q, I), freq = dphase/dt
+		nUp := len(upFrame.Samples)
+		demod := make([]float64, nUp)
+		prevPhase := 0.0
+		for i, s := range upFrame.Samples {
+			p := math.Atan2(float64(s.Q), float64(s.I))
+			dp := p - prevPhase
+			// Unwrap
+			for dp > math.Pi { dp -= 2 * math.Pi }
+			for dp < -math.Pi { dp += 2 * math.Pi }
+			demod[i] = dp * deviceRate / (2 * math.Pi * maxDev)
+			prevPhase = p
+		}
+
+		// Downsample back to composite rate
+		ratio := int(deviceRate / compositeRate)
+		nDown := nUp / ratio
+		downsampled := make([]float64, nDown)
+		for i := 0; i < nDown; i++ {
+			// Simple decimate (use average over ratio samples)
+			sum := 0.0
+			for j := 0; j < ratio; j++ {
+				idx := i*ratio + j
+				if idx < nUp { sum += demod[idx] }
+			}
+			downsampled[i] = sum / float64(ratio)
+		}
+		t.Logf("Downsampled: %d samples @ %.0f Hz", nDown, compositeRate)
+		testDecode(t, downsampled, compositeRate, key)
+	})
+}
+
+func testDecode(t *testing.T, composite []float64, rate float64, key string) {
+	t.Helper()
+	chipRate := float64(watermark.ChipRate)
+	samplesPerBit := int(float64(watermark.PnChips) * rate / chipRate)
+	nSamples := len(composite)
+
+	// Phase search
+	bestPhase := 0
+	bestEnergy := 0.0
+	step := max(1, samplesPerBit/200)
+	for phase := 0; phase < samplesPerBit; phase += step {
+		var energy float64
+		for b := 0; b < min(100, nSamples/samplesPerBit); b++ {
+			start := phase + b*samplesPerBit
+			if start+samplesPerBit > nSamples { break }
+			c := watermark.CorrelateAt(composite, start, rate)
+			energy += c * c
+		}
+		if energy > bestEnergy {
+			bestEnergy = energy; bestPhase = phase
+		}
+	}
+
+	// Correlate
+	nComplete := (nSamples - bestPhase) / samplesPerBit
+	nFrames := nComplete / 128
+	if nFrames < 1 { nFrames = 1 }
+	corrs := make([]float64, 128)
+	for i := 0; i < 128; i++ {
+		for f := 0; f < nFrames; f++ {
+			start := bestPhase + (f*128+i)*samplesPerBit
+			if start+samplesPerBit > nSamples { break }
+			corrs[i] += watermark.CorrelateAt(composite, start, rate)
+		}
+	}
+
+	var nStrong, nDead int
+	var sumAbs float64
+	for _, c := range corrs {
+		ac := math.Abs(c)
+		sumAbs += ac
+		if ac > 50 { nStrong++ }
+		if ac < 5 { nDead++ }
+	}
+	t.Logf("phase=%d, frames=%d, avg|c|=%.1f, strong=%d, dead=%d",
+		bestPhase, nFrames, sumAbs/128, nStrong, nDead)
+
+	if nStrong < 100 {
+		t.Errorf("Only %d/128 strong bits — watermark degraded", nStrong)
+	}
+}
diff --git a/internal/offline/watermark_e2e_test.go b/internal/offline/watermark_e2e_test.go
new file mode 100644
index 0000000..85276a8
--- /dev/null
+++ b/internal/offline/watermark_e2e_test.go
@@ -0,0 +1,177 @@
+package offline
+
+import (
+	"math"
+	"sort"
+	"testing"
+	"time"
+
+	cfgpkg "github.com/jan/fm-rds-tx/internal/config"
+	"github.com/jan/fm-rds-tx/internal/license"
+	"github.com/jan/fm-rds-tx/internal/watermark"
+)
+
+// TestWatermarkE2E runs a FULL generator pipeline (audio source → pre-emphasis →
+// LPF → drive → clip → cleanup → watermark injection → stereo encode → composite
+// clip → notch → pilot → RDS → FM mod) and then tries to decode the watermark
+// from the composite output. This tests the real code path, not just the embedder.
+func TestWatermarkE2E(t *testing.T) {
+	const key = "test-key-e2e"
+	const duration = 45 * time.Second
+
+	cfg := cfgpkg.Default()
+	cfg.FM.CompositeRateHz = 228000
+	cfg.FM.StereoEnabled = true
+	cfg.FM.OutputDrive = 0.5
+	cfg.FM.LimiterEnabled = true
+	cfg.FM.LimiterCeiling = 1.0
+	cfg.FM.FMModulationEnabled = false // split-rate: composite output, no IQ
+	cfg.Audio.ToneLeftHz = 1000
+	cfg.Audio.ToneRightHz = 1600
+	cfg.Audio.ToneAmplitude = 0.4
+	cfg.Audio.Gain = 1.0
+	cfg.FM.PreEmphasisTauUS = 50
+
+	gen := NewGenerator(cfg)
+	licState := license.NewState("")
+	gen.SetLicense(licState, key)
+
+	// Generate composite
+	frame := gen.GenerateFrame(duration)
+	if frame == nil {
+		t.Fatal("GenerateFrame returned nil")
+	}
+	t.Logf("Generated %d composite samples @ %.0f Hz (%.2fs)",
+		len(frame.Samples), frame.SampleRateHz, float64(len(frame.Samples))/frame.SampleRateHz)
+
+	// Extract mono composite (I channel = composite baseband in non-FM mode)
+	compositeRate := frame.SampleRateHz
+	nSamples := len(frame.Samples)
+	composite := make([]float64, nSamples)
+	for i, s := range frame.Samples {
+		composite[i] = float64(s.I)
+	}
+
+	// RMS
+	var rmsAcc float64
+	for _, s := range composite {
+		rmsAcc += s * s
+	}
+	rms := math.Sqrt(rmsAcc / float64(nSamples))
+	t.Logf("Composite RMS: %.1f dBFS", 20*math.Log10(rms+1e-12))
+
+	// Now decode the watermark from the composite
+	chipRate := float64(watermark.ChipRate)
+	samplesPerBit := int(float64(watermark.PnChips) * compositeRate / chipRate)
+	frameLen := samplesPerBit * watermark.PayloadBits
+	nFrames := nSamples / frameLen
+	t.Logf("Decode: samplesPerBit=%d, frameLen=%d, nFrames=%d", samplesPerBit, frameLen, nFrames)
+
+	if nFrames < 1 {
+		t.Fatalf("Need at least 1 frame (%d samples), have %d", frameLen, nSamples)
+	}
+
+	// Phase search (should be 0 since we start from sample 0)
+	bestPhase := 0
+	bestEnergy := 0.0
+	step := max(1, samplesPerBit/500)
+	for phase := 0; phase < samplesPerBit; phase += step {
+		var energy float64
+		nBits := min(200, (nSamples-phase)/samplesPerBit)
+		for b := 0; b < nBits; b++ {
+			start := phase + b*samplesPerBit
+			if start+samplesPerBit > nSamples { break }
+			c := watermark.CorrelateAt(composite, start, compositeRate)
+			energy += c * c
+		}
+		if energy > bestEnergy {
+			bestEnergy = energy
+			bestPhase = phase
+		}
+	}
+	t.Logf("Phase: %d (energy=%.1f)", bestPhase, bestEnergy)
+
+	// Correlate all 128 bits with frame averaging
+	nCompleteBits := (nSamples - bestPhase) / samplesPerBit
+	nAvgFrames := nCompleteBits / watermark.PayloadBits
+	if nAvgFrames < 1 { nAvgFrames = 1 }
+
+	corrs := make([]float64, watermark.PayloadBits)
+	for i := 0; i < watermark.PayloadBits; i++ {
+		for f := 0; f < nAvgFrames; f++ {
+			bitGlobal := f*watermark.PayloadBits + i
+			start := bestPhase + bitGlobal*samplesPerBit
+			if start+samplesPerBit > nSamples { break }
+			corrs[i] += watermark.CorrelateAt(composite, start, compositeRate)
+		}
+	}
+
+	var minC, maxC, sumC float64
+	var nStrong, nDead int
+	for i, c := range corrs {
+		ac := math.Abs(c)
+		sumC += ac
+		if i == 0 || ac < minC { minC = ac }
+		if ac > maxC { maxC = ac }
+		if ac > 50 { nStrong++ }
+		if ac < 5 { nDead++ }
+	}
+	t.Logf("Correlations: min=%.1f max=%.1f avg=%.1f strong=%d dead=%d",
+		minC, maxC, sumC/128, nStrong, nDead)
+
+	if nStrong < 64 {
+		t.Errorf("Too few strong bits: %d/128 (expected >64)", nStrong)
+	}
+
+	// Frame sync: try all 128 rotations with byte-level erasure
+	type decResult struct {
+		rot     int
+		payload [watermark.RsDataBytes]byte
+		ok      bool
+	}
+	var bestDec *decResult
+
+	for rot := 0; rot < watermark.PayloadBits; rot++ {
+		var recv [watermark.RsTotalBytes]byte
+		byteConfs := make([]float64, watermark.RsTotalBytes)
+		for i := 0; i < watermark.PayloadBits; i++ {
+			srcBit := (i + rot) % watermark.PayloadBits
+			if corrs[srcBit] < 0 {
+				recv[i/8] |= 1 << uint(7-(i%8))
+			}
+			byteConfs[i/8] += math.Abs(corrs[srcBit])
+		}
+
+		// Try with 0..8 byte erasures
+		type bc struct{ idx int; conf float64 }
+		ranked := make([]bc, watermark.RsTotalBytes)
+		for i := range ranked { ranked[i] = bc{i, byteConfs[i]} }
+		sort.Slice(ranked, func(a, b int) bool { return ranked[a].conf < ranked[b].conf })
+
+		for ne := 0; ne <= watermark.RsCheckBytes; ne++ {
+			erasePos := make([]int, ne)
+			for i := 0; i < ne; i++ { erasePos[i] = ranked[i].idx }
+			sort.Ints(erasePos)
+			payload, ok := watermark.RSDecode(recv, erasePos)
+			if ok {
+				bestDec = &decResult{rot, payload, true}
+				break
+			}
+		}
+		if bestDec != nil { break }
+	}
+
+	if bestDec == nil {
+		t.Fatal("RS decode FAILED — watermark not recoverable from generator output")
+	}
+
+	t.Logf("Decoded: rotation=%d, payload=%x", bestDec.rot, bestDec.payload)
+	if !watermark.KeyMatchesPayload(key, bestDec.payload) {
+		t.Errorf("Key mismatch: %q does not match payload %x", key, bestDec.payload)
+	} else {
+		t.Logf("Key %q MATCHES ✓", key)
+	}
+}
+
+func min(a, b int) int { if a < b { return a }; return b }
+func max(a, b int) int { if a > b { return a }; return b }
diff --git a/internal/watermark/watermark.go b/internal/watermark/watermark.go
index 75050e1..cbf0df7 100644
--- a/internal/watermark/watermark.go
+++ b/internal/watermark/watermark.go
@@ -2,28 +2,35 @@
 //
 // # Design
 //
-// The watermark is injected into the audio L/R signal (Fix B) before stereo
-// encoding, so it survives FM broadcast and receiver demodulation intact.
-// The payload is Reed-Solomon encoded (Fix C) for robust recovery even when
+// The watermark is injected into the audio L/R signal after all audio
+// processing (LPF, clip, limiter) but before stereo encoding, so it
+// survives FM broadcast, receiver demodulation, de-emphasis, and moderate
+// EQ intact. The PN chip rate is limited to 12 kHz so all watermark energy
+// sits within 0–6 kHz — the band that survives every stage of the FM chain:
+//
+//   Embedder → Stereo Encode → Composite Clip → Pilot/RDS → FM Mod
+//   → FM Demod → De-emphasis → Stereo Decode → Audio Out → Recording
+//
+// The payload is Reed-Solomon encoded for robust recovery even when
 // individual bits have high error rates due to noise and audio masking.
 //
 // # Parameters
 //
 //   - PN sequence: 2048-chip LFSR-13 (seed 0x1ACE)
 //   - Payload: 8 bytes (SHA-256[:8] of key) → RS(16,8) → 16 bytes → 128 bits
-//   - Frame period: ~5.5 s at 228 kHz composite (repeats ~11×/min)
-//   - Injection: -48 dBFS on audio L+R before stereo encode (gated on audio level)
+//   - Chip clock: 12 kHz → PN bandwidth 0–6 kHz (survives de-emphasis)
+//   - Frame period: ~21.8 s at 228 kHz composite (repeats ~2.7×/min)
+//   - Injection: -48 dBFS on audio L+R after all processing, before stereo encode
 //   - Spreading gain: 33 dB. RS erasure corrects up to 8 of 16 byte symbols.
 //
 // # Recovery (cmd/wmdecode)
 //
-//  1. Record FM receiver audio output as mono WAV (48 kHz preferred).
-//  2. Phase search: slide a single-bit PN template across [0, samplesPerBit)
-//     to find chip-aligned sample offset (coarse-fine search).
+//  1. Record FM receiver audio output as mono WAV (any sample rate ≥ 12 kHz).
+//  2. Phase search: energy-based coarse/fine search for chip alignment.
 //  3. Extract 128 bit correlations at found phase, averaged over all frames.
 //  4. Frame sync: try all 128 cyclic rotations of the bit sequence,
 //     RS-decode each; the rotation that succeeds gives the frame alignment.
-//  5. Sort bits by |correlation| (confidence). Mark weakest as erasures.
+//  5. Byte-level erasure + soft-decision bit-flipping for error correction.
 //  6. RS erasure-decode → 8 payload bytes → compare against known keys.
 package watermark
 
@@ -32,7 +39,7 @@ import (
 )
 
 const (
-	// pnChips is the spreading factor — PN chips per data bit at composite rate.
+	// pnChips is the spreading factor — PN chips per data bit.
 	// Spreading gain = 10·log10(2048) = 33.1 dB.
 	pnChips = 2048
 
@@ -51,16 +58,23 @@ const (
 
 	// Level is the audio injection amplitude per channel (-48 dBFS).
 	// At typical audio levels this is completely inaudible.
-	Level = 0.004
+	Level = 0.040
 
-	// CompositeRate is the sample rate at which the watermark was embedded.
-	// The recovery tool uses this to compute fractional chip indices.
+	// CompositeRate is the sample rate at which the watermark is embedded.
 	CompositeRate = 228000
 )
 
-// RecordingRate is the canonical recording rate used for chip-rate Bresenham stepping.
-// The embedder advances chips at this rate, so the decoder at this rate sees
-// exactly pnChips samples per bit with no fractional-stepping errors.
+// ChipRate is the effective PN chip clock rate (Hz). Determines the spectral
+// bandwidth of the watermark: Nyquist = ChipRate/2 = 6 kHz. This ensures
+// all PN energy is within the audio band that survives de-emphasis (50/75 µs),
+// receiver LPFs, audio codecs, speaker EQ, and even acoustic recording.
+//
+// At CompositeRate (228 kHz), each chip spans 228000/12000 = 19 samples.
+// At any recording rate R, each chip spans R/12000 samples.
+const ChipRate = 12000
+
+// RecordingRate is the canonical recording rate for test WAV output (wmtest).
+// Not used for chip stepping — ChipRate controls that.
 const RecordingRate = 48000
 
 // Embedder continuously embeds a watermark into audio L/R samples.
@@ -98,16 +112,15 @@ func NewEmbedder(key string) *Embedder {
 // NextSample returns the watermark amplitude for one composite sample.
 // Add this value to both audio.Frame.L and audio.Frame.R before stereo encoding.
 //
-// The chip index advances using Bresenham stepping at RecordingRate/CompositeRate,
-// so each chip occupies exactly CompositeRate/RecordingRate composite samples on
-// average. A decoder recording at RecordingRate (48 kHz) sees exactly pnChips
-// samples per data bit, enabling simple integer-stride correlation.
+// The chip index advances using Bresenham stepping at ChipRate/CompositeRate,
+// so each chip occupies exactly CompositeRate/ChipRate composite samples on
+// average (~19 samples at 228 kHz). The PN signal bandwidth is 0–6 kHz.
 func (e *Embedder) NextSample() float64 {
 	chip := float64(pnSequence[e.chipIdx])
 	sample := Level * float64(e.symbol) * chip * e.gateGain
 
-	// Bresenham: advance chip once per RecordingRate/CompositeRate composite samples.
-	e.accum += RecordingRate
+	// Bresenham: advance chip once per ChipRate/CompositeRate composite samples.
+	e.accum += ChipRate
 	if e.accum >= CompositeRate {
 		e.accum -= CompositeRate
 		e.chipIdx++
@@ -183,6 +196,26 @@ func (e *Embedder) SetAudioLevel(absLevel float64) {
 	}
 }
 
+// DiagnosticState returns internal state for debugging.
+type DiagnosticInfo struct {
+	GateGain    float64
+	GateEnabled bool
+	ChipIdx     int
+	BitIdx      int
+	Symbol      int8
+}
+
+// DiagnosticState returns a snapshot of the embedder's internal state.
+func (e *Embedder) DiagnosticState() DiagnosticInfo {
+	return DiagnosticInfo{
+		GateGain:    e.gateGain,
+		GateEnabled: e.gateEnabled,
+		ChipIdx:     e.chipIdx,
+		BitIdx:      e.bitIdx,
+		Symbol:      e.symbol,
+	}
+}
+
 // --- RS(16,8) over GF(2^8) — GF poly 0x11d, fcr=0, generator=2 ---
 // These routines are used by the embedder (encode) and the recovery tool (decode).
 
@@ -231,7 +264,9 @@ func rsEncode(data [rsDataBytes]byte) [rsTotalBytes]byte {
 // that should be treated as erasures. Up to 8 erasures can be corrected.
 // Returns (data, true) on success, (zero, false) on decoding failure.
 func RSDecode(recv [rsTotalBytes]byte, erasurePositions []int) ([rsDataBytes]byte, bool) {
-	// Step 1: compute syndromes.
+	// Step 1: compute syndromes S[i] = C(α^i) via Horner's method.
+	// The polynomial convention is C(x) = c[0]x^15 + c[1]x^14 + … + c[15],
+	// so byte position j contributes c[j]·(α^i)^(15-j) to S[i].
 	var S [rsCheckBytes]byte
 	for i := 0; i < rsCheckBytes; i++ {
 		var acc byte
@@ -241,7 +276,7 @@ func RSDecode(recv [rsTotalBytes]byte, erasurePositions []int) ([rsDataBytes]byt
 		S[i] = acc
 	}
 
-	// Step 2: if no erasures and all syndromes zero, no errors.
+	// Step 2: all syndromes zero → valid codeword.
 	hasErr := false
 	for _, s := range S {
 		if s != 0 {
@@ -249,79 +284,81 @@ func RSDecode(recv [rsTotalBytes]byte, erasurePositions []int) ([rsDataBytes]byt
 			break
 		}
 	}
-	if !hasErr && len(erasurePositions) == 0 {
-		// Valid codeword, no errors, no erasures.
+	if !hasErr {
 		var out [rsDataBytes]byte
 		copy(out[:], recv[:rsDataBytes])
 		return out, true
 	}
-	if hasErr && len(erasurePositions) == 0 {
-		// Errors present but no erasure positions supplied — cannot correct.
-		// BUG FIX: previously fell through to ne==0 check and returned wrong
-		// data as correct. Now correctly signals failure so the caller can
-		// retry with erasure positions.
+	ne := len(erasurePositions)
+	if ne == 0 || ne > rsCheckBytes {
 		return [rsDataBytes]byte{}, false
 	}
 
-	// Step 3: erasure locator polynomial Γ(x) = ∏(1 - α^(e_j)·x).
-	gamma := []byte{1}
-	for _, pos := range erasurePositions {
-		// multiply gamma by (1 + α^pos · x)
-		alpha := gfPow(2, pos)
-		next := make([]byte, len(gamma)+1)
-		for j, g := range gamma {
-			next[j] ^= g
-			next[j+1] ^= gfMul(g, alpha)
-		}
-		gamma = next
+	// Step 3: Solve for error magnitudes via Vandermonde system.
+	//
+	// Because C(x) = c[0]x^15 + … + c[15], byte position j maps to
+	// polynomial power (15-j). The "locator" for position j is
+	// X_j = α^(15-j). The syndrome equation becomes:
+	//
+	//   S[i] = Σ_k  e_k · X_k^i
+	//
+	// This is a linear system (Vandermonde) in the unknowns e_k.
+	// Solve by Gaussian elimination in GF(2^8).
+
+	// Build locators
+	X := make([]byte, ne)
+	for k, pos := range erasurePositions {
+		X[k] = gfPow(2, rsTotalBytes-1-pos)
 	}
 
-	// Step 4: modified syndrome T(x) = S(x)·Γ(x).
-	t := make([]byte, rsCheckBytes)
-	for i := 0; i < rsCheckBytes; i++ {
-		var acc byte
-		for j := 0; j < len(gamma) && j <= i; j++ {
-			if i-j < rsCheckBytes {
-				acc ^= gfMul(gamma[j], S[i-j])
-			}
+	// Augmented matrix [V | S], ne × (ne+1)
+	mat := make([][]byte, ne)
+	for i := range mat {
+		mat[i] = make([]byte, ne+1)
+		for k := 0; k < ne; k++ {
+			mat[i][k] = gfPow(X[k], i)
 		}
-		t[i] = acc
-	}
-
-	// Step 5: compute error magnitudes using Forney's formula.
-	// For erasure-only decoding (no additional errors), the error locator
-	// is Γ itself. Evaluate omega = T mod x^(ne) where ne = len(erasures).
-	ne := len(erasurePositions)
-	if ne == 0 {
-		// Should not be reachable: handled above. Fail safely.
-		return [rsDataBytes]byte{}, false
-	}
-	if ne > rsCheckBytes {
-		return [rsDataBytes]byte{}, false
+		mat[i][ne] = S[i]
 	}
 
-	result := recv
-	for _, pos := range erasurePositions {
-		xi := gfPow(2, pos)
-		// Evaluate omega at xi^-1.
-		xiInv := gfInv(xi)
-		var omega byte
-		for j := 0; j < ne && j < rsCheckBytes; j++ {
-			omega ^= gfMul(t[j], gfPow(xiInv, j))
+	// Gaussian elimination with partial pivoting
+	for col := 0; col < ne; col++ {
+		pivot := -1
+		for row := col; row < ne; row++ {
+			if mat[row][col] != 0 {
+				pivot = row
+				break
+			}
 		}
-		// Formal derivative of gamma at xi^-1 (only odd-degree terms survive in GF(2)).
-		var gammaPrime byte
-		for j := 1; j < len(gamma); j += 2 {
-			gammaPrime ^= gfMul(gamma[j], gfPow(xiInv, j-1))
+		if pivot < 0 {
+			return [rsDataBytes]byte{}, false // singular
 		}
-		if gammaPrime == 0 {
-			return [rsDataBytes]byte{}, false
+		mat[col], mat[pivot] = mat[pivot], mat[col]
+		inv := gfInv(mat[col][col])
+		for j := 0; j <= ne; j++ {
+			mat[col][j] = gfMul(mat[col][j], inv)
+		}
+		for row := 0; row < ne; row++ {
+			if row == col {
+				continue
+			}
+			f := mat[row][col]
+			if f == 0 {
+				continue
+			}
+			for j := 0; j <= ne; j++ {
+				mat[row][j] ^= gfMul(f, mat[col][j])
+			}
 		}
-		magnitude := gfMul(omega, gfInv(gammaPrime))
-		result[pos] ^= magnitude
 	}
 
-	// Verify syndromes after correction.
+	// Apply corrections
+	result := recv
+	for k := 0; k < ne; k++ {
+		result[erasurePositions[k]] ^= mat[k][ne]
+	}
+
+	// Verify syndromes after correction
 	for i := 0; i < rsCheckBytes; i++ {
 		var acc byte
 		for _, c := range result {
@@ -345,6 +382,19 @@ func KeyMatchesPayload(key string, payload [rsDataBytes]byte) bool {
 	return expected == payload
 }
 
+// KeyToPayload returns SHA-256(key)[:8].
+func KeyToPayload(key string) [rsDataBytes]byte {
+	var data [rsDataBytes]byte
+	h := sha256.Sum256([]byte(key))
+	copy(data[:], h[:rsDataBytes])
+	return data
+}
+
+// RSEncode encodes 8 data bytes into a 16-byte RS codeword.
+func RSEncode(data [rsDataBytes]byte) [rsTotalBytes]byte {
+	return rsEncode(data)
+}
+
 // Constants exported for the recovery tool.
 const (
 	PnChips       = pnChips
@@ -354,17 +404,23 @@ const (
 	RsCheckBytes  = rsCheckBytes
 )
 
+// PnSequenceAt returns the PN chip value (+1.0 or -1.0) at the given index.
+// Used by the decoder for rate-compensated correlation.
+func PnSequenceAt(chipIdx int) float64 {
+	return float64(pnSequence[chipIdx%pnChips])
+}
+
+// PNSequence exposes the raw PN chip values for the chip-rate decoder.
+var PNSequence = &pnSequence
+
 // CorrelateAt returns the correlation of received samples at the given bit
 // position. recRate is the WAV sample rate.
 //
-// At recRate = RecordingRate (48000 Hz) the chip stride is exactly 1 — the
-// embedder was designed for this rate. At other rates the chip index is
-// scaled proportionally (still works with enough frame averaging).
+// At any recording rate, chips are mapped via ChipRate: each chip spans
+// recRate/ChipRate samples. At 48 kHz: 4 samples/chip, 8192 samples/bit.
+// At 192 kHz: 16 samples/chip, 32768 samples/bit.
 func CorrelateAt(samples []float64, bitStart int, recRate float64) float64 {
-	// Samples per bit at the canonical recording rate.
-	// At RecordingRate: samplesPerBit = pnChips (integer, perfect).
-	// At other rates: scale proportionally.
-	samplesPerBit := int(float64(pnChips) * recRate / float64(RecordingRate))
+	samplesPerBit := int(float64(pnChips) * recRate / float64(ChipRate))
 	if samplesPerBit < 1 {
 		samplesPerBit = 1
 	}
@@ -374,8 +430,7 @@ func CorrelateAt(samples []float64, bitStart int, recRate float64) float64 {
 	}
 	var acc float64
 	for i := 0; i < n; i++ {
-		// Map recording-rate sample index to chip index.
-		chipIdx := int(float64(i)*float64(RecordingRate)/recRate) % pnChips
+		chipIdx := int(float64(i)*float64(ChipRate)/recRate) % pnChips
 		acc += samples[bitStart+i] * float64(pnSequence[chipIdx])
 	}
 	return acc
diff --git a/internal/watermark/watermark_roundtrip_test.go b/internal/watermark/watermark_roundtrip_test.go
index 8a8b72f..84fab12 100644
--- a/internal/watermark/watermark_roundtrip_test.go
+++ b/internal/watermark/watermark_roundtrip_test.go
@@ -11,7 +11,7 @@ func TestRoundTrip(t *testing.T) {
 	const key = "test-key-42"
 	const recRate = float64(RecordingRate) // 48000
 	const compRate = float64(CompositeRate) // 228000
-	const duration = 15.0 // seconds — should give ~2 full frames
+	const duration = 60.0 // seconds — ~2.7 frames at ChipRate=12kHz
 
 	nRecSamples := int(duration * recRate)
 
@@ -47,7 +47,7 @@ func TestRoundTrip(t *testing.T) {
 	}
 
 	// === Decode: Phase search ===
-	samplesPerBit := int(float64(PnChips) * recRate / float64(RecordingRate))
+	samplesPerBit := int(float64(PnChips) * recRate / float64(ChipRate))
 	t.Logf("samplesPerBit=%d, frameLen=%d", samplesPerBit, samplesPerBit*PayloadBits)
 
 	const coarseStep = 8