// Package watermark implements STFT-domain spread-spectrum audio watermarking
// based on Kirovski & Malvar (IEEE TSP 2003).
//
// Architecture:
//   - Embedding in STFT magnitude (dB scale) — multiplicative, natural masking
//   - Block repetition coding (R=5 time frames) — automatic drift tolerance
//   - Cepstrum filtering at detection — 6 dB carrier noise reduction
//   - PCC covert channel — PN partitioned into M=128 subsets for 128-bit payload
//   - Multi-test sync — scan R frame offsets to find alignment
//
// Both encoder and decoder operate at 12 kHz (WMRate). The encoder decimates
// from composite rate (÷19), processes STFT, and upsamples back. The decoder
// decimates from recording rate (÷16 from 192kHz, ÷4 from 48kHz, etc.).
// Same STFT parameters → bins align perfectly → no rate mismatch.
package watermark

import (
	"crypto/sha256"
	"math"
	"math/cmplx"

	"github.com/jan/fm-rds-tx/internal/dsp"
)

// STFT watermark constants.
//
// Bin frequencies below follow from the bin spacing WMRate/FFTSize
// (12000/512 ≈ 23.44 Hz). payloadBits is declared elsewhere in the package;
// the worked figures in the comments assume it is 128.
const (
	WMRate       = 12000             // watermark processing sample rate (Hz)
	FFTSize      = 512               // STFT frame size (samples at WMRate)
	FFTHop       = 256               // hop between frames: FFTSize/2, i.e. 50% overlap
	BinLow       = 9                 // first watermarked bin (inclusive), ~211 Hz
	BinHigh      = 213               // end of the watermarked band (exclusive), ~4992 Hz
	NumBins      = BinHigh - BinLow  // 204 frequency chips per STFT frame
	TimeRep      = 5                 // block repetition factor (±2 frame drift tolerance)
	GroupsPerBit = 10                // time groups per data bit
	WMLevelDB    = 0.5               // embedding level (dB) — inaudible, 20 dB margin for decode
	TotalGroups  = GroupsPerBit * payloadBits // 10 × 128 = 1280
	FramesPerWM  = TotalGroups * TimeRep      // 1280 × 5 = 6400 frames per watermark cycle
	SamplesPerWM = FramesPerWM * FFTHop       // 6400 × 256 = 1638400 samples per cycle
	// Duration at WMRate: 1638400 / 12000 = 136.5 seconds
)

// STFTEmbedder processes audio blocks and adds the STFT-domain watermark.
// It works at WMRate (12 kHz). The caller must decimate input to WMRate
// and upsample output back to the desired rate.
type STFTEmbedder struct { // PN chip matrix: pnChips[group][bin] ∈ {-1, +1} // group ∈ [0, TotalGroups), bin ∈ [0, NumBins) pnChips [TotalGroups][NumBins]int8 // Bit assignment: which data bit owns each group (PCC permutation) groupToBit [TotalGroups]int // RS-encoded codeword: 128 bits → symbol[bit] = +1 or -1 symbols [payloadBits]int8 // STFT state window [FFTSize]float64 inBuf [FFTSize]float64 // analysis window buffer outBuf [FFTSize + FFTHop]float64 // overlap-add output buffer inPos int // samples written to inBuf outPos int // samples read from outBuf frameIdx int // STFT frame counter (wraps at FramesPerWM) primed bool // true after first full frame // Level in linear scale: 10^(WMLevelDB/20) - 1 ≈ 0.059 for 0.5 dB levelLinear float64 } // NewSTFTEmbedder creates an embedder for the given license key. func NewSTFTEmbedder(key string) *STFTEmbedder { e := &STFTEmbedder{} // Compute RS-encoded payload var data [rsDataBytes]byte if key != "" { h := sha256.Sum256([]byte(key)) copy(data[:], h[:rsDataBytes]) } codeword := rsEncode(data) // BPSK symbols: bit 0 → +1, bit 1 → -1 for i := 0; i < payloadBits; i++ { if (codeword[i/8]>>uint(7-(i%8)))&1 == 1 { e.symbols[i] = -1 } else { e.symbols[i] = 1 } } // Generate PN chips from key-seeded PRNG seed := sha256.Sum256([]byte("fmrtx-stft-pn-v1")) prng := newPRNG(seed[:]) for g := 0; g < TotalGroups; g++ { for b := 0; b < NumBins; b++ { if prng.next()&1 == 0 { e.pnChips[g][b] = 1 } else { e.pnChips[g][b] = -1 } } } // PCC permutation: assign groups to bits (interleaved + permuted) // Simple interleaving first, then Fisher-Yates shuffle for g := 0; g < TotalGroups; g++ { e.groupToBit[g] = g % payloadBits } // Permute within each bit's groups using key-seeded PRNG permSeed := sha256.Sum256([]byte("fmrtx-stft-perm-v1")) permRNG := newPRNG(permSeed[:]) for i := TotalGroups - 1; i > 0; i-- { j := permRNG.next() % uint32(i+1) e.groupToBit[i], e.groupToBit[j] = e.groupToBit[j], e.groupToBit[i] } // Hann window 
dsp.HannWindow(e.window[:]) // Embedding level e.levelLinear = math.Pow(10, WMLevelDB/20) - 1 // fractional magnitude change return e } // ProcessBlock takes mono audio at WMRate and returns watermarked audio. // The input and output lengths are the same. Internally buffers for STFT // overlap-add processing. Call with chunks of any size. func (e *STFTEmbedder) ProcessBlock(in []float64) []float64 { out := make([]float64, len(in)) for i, s := range in { // Feed sample into STFT input buffer e.inBuf[e.inPos] = s e.inPos++ if e.inPos == FFTSize { // Full frame: process STFT e.processFrame() e.inPos = FFTHop // shift: keep last hop samples for next frame overlap copy(e.inBuf[:FFTHop], e.inBuf[FFTHop:FFTSize]) } // Read from overlap-add output buffer if e.primed { out[i] = e.outBuf[e.outPos] e.outPos++ if e.outPos >= FFTHop { e.outPos = 0 // Shift output buffer: move overlap region to start copy(e.outBuf[:FFTSize], e.outBuf[FFTHop:FFTSize+FFTHop]) // Zero the new region for j := FFTSize - FFTHop; j < FFTSize+FFTHop; j++ { if j < len(e.outBuf) { e.outBuf[j] = 0 } } } } else { out[i] = s // pass-through until first frame is processed } } return out } // processFrame computes one STFT frame: window → FFT → modify magnitudes → IFFT → overlap-add. 
func (e *STFTEmbedder) processFrame() { // Determine which group this frame belongs to wmFrame := e.frameIdx % FramesPerWM groupIdx := wmFrame / TimeRep repIdx := wmFrame % TimeRep centerRep := TimeRep / 2 // only center repetition carries the watermark for detection // Apply window and convert to complex var buf [FFTSize]complex128 for i := 0; i < FFTSize; i++ { buf[i] = complex(e.inBuf[i]*e.window[i], 0) } // Forward FFT dsp.FFT(buf[:]) // Modify magnitudes in the watermark sub-band // Only modify if this is within a valid group AND at the center repetition // (we embed in ALL repetitions so the watermark energy is present everywhere, // but the PN pattern is the same for all R frames in a group) if groupIdx < TotalGroups { bitIdx := e.groupToBit[groupIdx] dataSign := float64(e.symbols[bitIdx]) _ = repIdx _ = centerRep for b := 0; b < NumBins; b++ { bin := BinLow + b chip := float64(e.pnChips[groupIdx][b]) // Modify magnitude: |Y| = |X| × (1 + level × chip × data) // Phase preserved mag := cmplx.Abs(buf[bin]) if mag < 1e-10 { continue // skip near-silence bins to avoid division by zero } phase := cmplx.Phase(buf[bin]) newMag := mag * (1.0 + e.levelLinear*chip*dataSign) buf[bin] = cmplx.Rect(newMag, phase) // Mirror for negative frequencies (conjugate symmetry) if bin > 0 && bin < FFTSize/2 { buf[FFTSize-bin] = cmplx.Conj(buf[bin]) } } } // Inverse FFT dsp.IFFT(buf[:]) // Overlap-add to output buffer for i := 0; i < FFTSize; i++ { e.outBuf[i] += real(buf[i]) } if !e.primed { e.primed = true e.outPos = 0 } e.frameIdx++ } // STFTDetector extracts watermark bits from an audio recording. type STFTDetector struct { pnChips [TotalGroups][NumBins]int8 groupToBit [TotalGroups]int } // NewSTFTDetector creates a detector. No key needed — the PN sequence is // public (fixed). The detector extracts the payload blindly. 
func NewSTFTDetector() *STFTDetector { d := &STFTDetector{} // Same PN generation as embedder seed := sha256.Sum256([]byte("fmrtx-stft-pn-v1")) prng := newPRNG(seed[:]) for g := 0; g < TotalGroups; g++ { for b := 0; b < NumBins; b++ { if prng.next()&1 == 0 { d.pnChips[g][b] = 1 } else { d.pnChips[g][b] = -1 } } } // Same permutation for g := 0; g < TotalGroups; g++ { d.groupToBit[g] = g % payloadBits } permSeed := sha256.Sum256([]byte("fmrtx-stft-perm-v1")) permRNG := newPRNG(permSeed[:]) for i := TotalGroups - 1; i > 0; i-- { j := permRNG.next() % uint32(i+1) d.groupToBit[i], d.groupToBit[j] = d.groupToBit[j], d.groupToBit[i] } return d } // Detect processes audio at WMRate and returns soft bit decisions. // The audio should already be decimated/resampled to WMRate and LPF'd. // // Multi-test: tries TimeRep frame offsets (the block repetition candidates). // Cepstrum filtering is applied to reduce carrier noise. // // Returns: 128 soft correlation values (sign = bit decision, magnitude = confidence), // and the frame offset that gave the best detection metric. 
func (d *STFTDetector) Detect(audio []float64) (corrs [payloadBits]float64, bestOffset int) { // Compute all STFT frames var window [FFTSize]float64 dsp.HannWindow(window[:]) nFrames := (len(audio) - FFTSize) / FFTHop if nFrames < FramesPerWM { // Not enough data for a full watermark cycle — use what we have } // Compute STFT magnitudes (dB) for all frames type stftFrame struct { magDB [FFTSize / 2]float64 } frames := make([]stftFrame, nFrames) for f := 0; f < nFrames; f++ { offset := f * FFTHop var buf [FFTSize]complex128 for i := 0; i < FFTSize; i++ { if offset+i < len(audio) { buf[i] = complex(audio[offset+i]*window[i], 0) } } dsp.FFT(buf[:]) for bin := 0; bin < FFTSize/2; bin++ { mag := cmplx.Abs(buf[bin]) if mag < 1e-12 { mag = 1e-12 } frames[f].magDB[bin] = 20 * math.Log10(mag) } // Cepstrum filtering: remove spectral envelope // DCT of dB magnitudes, zero first N_ceps coefficients, IDCT cepstrumFilter(frames[f].magDB[:], 8) } // Multi-test: try each of TimeRep frame offsets within the repetition block bestMetric := -1.0 bestOffset = 0 for startOffset := 0; startOffset < TimeRep; startOffset++ { var testCorrs [payloadBits]float64 // Iterate over ALL recording frames — multiple WM cycles accumulate // automatically via modular wrapping. This gives √N_cycles SNR gain. 
for f := 0; f < nFrames; f++ { wmFrame := ((f - startOffset) % FramesPerWM + FramesPerWM) % FramesPerWM if wmFrame%TimeRep != TimeRep/2 { continue // not center of repetition block } g := wmFrame / TimeRep if g >= TotalGroups { continue } var corr float64 for b := 0; b < NumBins; b++ { bin := BinLow + b corr += frames[f].magDB[bin] * float64(d.pnChips[g][b]) } testCorrs[d.groupToBit[g]] += corr } // Detection metric: sum of squared partial correlations (chi-squared) // From paper equation (10): Q = Σ (corr_m)² var metric float64 for _, c := range testCorrs { metric += c * c } if metric > bestMetric { bestMetric = metric bestOffset = startOffset corrs = testCorrs } } return corrs, bestOffset } // cepstrumFilter removes the spectral envelope from dB magnitudes. // It zeros the first nCeps DCT coefficients (the smooth spectral shape). // This is Kirovski's "CF" technique: reduces carrier noise by ~6 dB. // // Uses precomputed cosine table for O(N²) DCT without math.Cos calls. func cepstrumFilter(magDB []float64, nCeps int) { n := len(magDB) if n < nCeps*2 { return } cosTable := getCosTable(n) // DCT-II ceps := make([]float64, n) for k := 0; k < n; k++ { var sum float64 row := cosTable[k] for i := 0; i < n; i++ { sum += magDB[i] * row[i] } ceps[k] = sum } // Zero low-order coefficients for k := 0; k < nCeps; k++ { ceps[k] = 0 } // IDCT scale := 2.0 / float64(n) for i := 0; i < n; i++ { var sum float64 for k := 0; k < n; k++ { w := 1.0 if k == 0 { w = 0.5 } sum += w * ceps[k] * cosTable[k][i] } magDB[i] = sum * scale } } // Cached cosine table for DCT. cosTable[k][i] = cos(π·k·(i+0.5)/N). 
// Single-entry cache: the most recently built table and the size it was
// built for. Access is not synchronised.
var (
	cachedCosTable [][]float64
	cachedCosN     int
)

// getCosTable returns the n×n table with entry [k][i] = cos(π·k·(i+0.5)/n).
// The table for the most recently requested n is kept in package-level
// variables and returned again when the same n is requested; callers share
// that storage. A request for n == 0 before any table has been built
// returns nil.
func getCosTable(n int) [][]float64 {
	if n == cachedCosN {
		return cachedCosTable
	}

	size := float64(n)
	rows := make([][]float64, n)
	for k := range rows {
		kPi := math.Pi * float64(k)
		row := make([]float64, n)
		for i := range row {
			row[i] = math.Cos(kPi * (float64(i) + 0.5) / size)
		}
		rows[k] = row
	}

	cachedCosTable, cachedCosN = rows, n
	return rows
}

// simplePRNG is a 32-bit xorshift generator (shifts 13, 17, 5) used for
// deterministic chip and permutation generation.
type simplePRNG struct {
	state uint32
}

// newPRNG folds seed into 32 bits by XOR-ing byte i into byte lane i%4
// (lane 0 is the least significant byte). A folded value of zero is replaced
// by 1, since xorshift never leaves the all-zero state.
func newPRNG(seed []byte) *simplePRNG {
	var folded uint32
	for idx, octet := range seed {
		folded ^= uint32(octet) << (8 * uint(idx%4))
	}
	if folded == 0 {
		folded = 1
	}
	return &simplePRNG{state: folded}
}

// next advances the generator one step and returns the new state.
func (p *simplePRNG) next() uint32 {
	x := p.state
	x ^= x << 13
	x ^= x >> 17
	x ^= x << 5
	p.state = x
	return x
}