|
- // Package watermark implements STFT-domain spread-spectrum audio watermarking
- // based on Kirovski & Malvar (IEEE TSP 2003).
- //
- // Architecture:
- // - Embedding in STFT magnitude (dB scale) — multiplicative, natural masking
- // - Block repetition coding (R=5 time frames) — automatic drift tolerance
- // - Cepstrum filtering at detection — 6 dB carrier noise reduction
- // - PCC covert channel — PN partitioned into M=128 subsets for 128-bit payload
- // - Multi-test sync — scan R frame offsets to find alignment
- //
- // Both encoder and decoder operate at 12 kHz (WMRate). The encoder decimates
- // from composite rate (÷19), processes STFT, and upsamples back. The decoder
- // decimates from recording rate (÷16 from 192kHz, ÷4 from 48kHz, etc.).
- // Same STFT parameters → bins align perfectly → no rate mismatch.
- package watermark
-
- import (
- "crypto/sha256"
- "math"
- "math/cmplx"
-
- "github.com/jan/fm-rds-tx/internal/dsp"
- )
-
- // STFT watermark constants.
- const (
- WMRate = 12000 // watermark processing sample rate (Hz)
- FFTSize = 512 // STFT frame size (samples at WMRate)
- FFTHop = 256 // 50% overlap
- BinLow = 9 // ~211 Hz at WMRate/FFTSize
- BinHigh = 213 // ~4992 Hz at WMRate/FFTSize
- NumBins = BinHigh - BinLow // 204 frequency chips per STFT frame
- TimeRep = 5 // block repetition factor (±2 frame drift tolerance)
- GroupsPerBit = 10 // time groups per data bit
- WMLevelDB = 1.5 // embedding level (dB)
-
- TotalGroups = GroupsPerBit * payloadBits // 10 × 128 = 1280
- FramesPerWM = TotalGroups * TimeRep // 1280 × 5 = 6400
- SamplesPerWM = FramesPerWM * FFTHop // 6400 × 256 = 1638400
- // Duration at WMRate: 1638400 / 12000 = 136.5 seconds
- )
-
- // STFTEmbedder processes audio blocks and adds the STFT-domain watermark.
- // It works at WMRate (12 kHz). The caller must decimate input to WMRate
- // and upsample output back to the desired rate.
- type STFTEmbedder struct {
- // PN chip matrix: pnChips[group][bin] ∈ {-1, +1}
- // group ∈ [0, TotalGroups), bin ∈ [0, NumBins)
- pnChips [TotalGroups][NumBins]int8
-
- // Bit assignment: which data bit owns each group (PCC permutation)
- groupToBit [TotalGroups]int
-
- // RS-encoded codeword: 128 bits → symbol[bit] = +1 or -1
- symbols [payloadBits]int8
-
- // STFT state
- window [FFTSize]float64
- inBuf [FFTSize]float64 // analysis window buffer
- outBuf [FFTSize + FFTHop]float64 // overlap-add output buffer
- inPos int // samples written to inBuf
- outPos int // samples read from outBuf
- frameIdx int // STFT frame counter (wraps at FramesPerWM)
- primed bool // true after first full frame
-
- // Level in linear scale: 10^(WMLevelDB/20) - 1 ≈ 0.189 for 1.5 dB
- levelLinear float64
- }
-
- // NewSTFTEmbedder creates an embedder for the given license key.
- func NewSTFTEmbedder(key string) *STFTEmbedder {
- e := &STFTEmbedder{}
-
- // Compute RS-encoded payload
- var data [rsDataBytes]byte
- if key != "" {
- h := sha256.Sum256([]byte(key))
- copy(data[:], h[:rsDataBytes])
- }
- codeword := rsEncode(data)
-
- // BPSK symbols: bit 0 → +1, bit 1 → -1
- for i := 0; i < payloadBits; i++ {
- if (codeword[i/8]>>uint(7-(i%8)))&1 == 1 {
- e.symbols[i] = -1
- } else {
- e.symbols[i] = 1
- }
- }
-
- // Generate PN chips from key-seeded PRNG
- seed := sha256.Sum256(append([]byte("stft-pn-"), key...))
- prng := newPRNG(seed[:])
- for g := 0; g < TotalGroups; g++ {
- for b := 0; b < NumBins; b++ {
- if prng.next()&1 == 0 {
- e.pnChips[g][b] = 1
- } else {
- e.pnChips[g][b] = -1
- }
- }
- }
-
- // PCC permutation: assign groups to bits (interleaved + permuted)
- // Simple interleaving first, then Fisher-Yates shuffle
- for g := 0; g < TotalGroups; g++ {
- e.groupToBit[g] = g % payloadBits
- }
- // Permute within each bit's groups using key-seeded PRNG
- permSeed := sha256.Sum256(append([]byte("stft-perm-"), key...))
- permRNG := newPRNG(permSeed[:])
- for i := TotalGroups - 1; i > 0; i-- {
- j := permRNG.next() % uint32(i+1)
- e.groupToBit[i], e.groupToBit[j] = e.groupToBit[j], e.groupToBit[i]
- }
-
- // Hann window
- dsp.HannWindow(e.window[:])
-
- // Embedding level
- e.levelLinear = math.Pow(10, WMLevelDB/20) - 1 // fractional magnitude change
-
- return e
- }
-
- // ProcessBlock takes mono audio at WMRate and returns watermarked audio.
- // The input and output lengths are the same. Internally buffers for STFT
- // overlap-add processing. Call with chunks of any size.
- func (e *STFTEmbedder) ProcessBlock(in []float64) []float64 {
- out := make([]float64, len(in))
- for i, s := range in {
- // Feed sample into STFT input buffer
- e.inBuf[e.inPos] = s
- e.inPos++
-
- if e.inPos == FFTSize {
- // Full frame: process STFT
- e.processFrame()
- e.inPos = FFTHop // shift: keep last hop samples for next frame overlap
- copy(e.inBuf[:FFTHop], e.inBuf[FFTHop:FFTSize])
- }
-
- // Read from overlap-add output buffer
- if e.primed {
- out[i] = e.outBuf[e.outPos]
- e.outPos++
- if e.outPos >= FFTHop {
- e.outPos = 0
- // Shift output buffer: move overlap region to start
- copy(e.outBuf[:FFTSize], e.outBuf[FFTHop:FFTSize+FFTHop])
- // Zero the new region
- for j := FFTSize - FFTHop; j < FFTSize+FFTHop; j++ {
- if j < len(e.outBuf) {
- e.outBuf[j] = 0
- }
- }
- }
- } else {
- out[i] = s // pass-through until first frame is processed
- }
- }
- return out
- }
-
- // processFrame computes one STFT frame: window → FFT → modify magnitudes → IFFT → overlap-add.
- func (e *STFTEmbedder) processFrame() {
- // Determine which group this frame belongs to
- wmFrame := e.frameIdx % FramesPerWM
- groupIdx := wmFrame / TimeRep
- repIdx := wmFrame % TimeRep
- centerRep := TimeRep / 2 // only center repetition carries the watermark for detection
-
- // Apply window and convert to complex
- var buf [FFTSize]complex128
- for i := 0; i < FFTSize; i++ {
- buf[i] = complex(e.inBuf[i]*e.window[i], 0)
- }
-
- // Forward FFT
- dsp.FFT(buf[:])
-
- // Modify magnitudes in the watermark sub-band
- // Only modify if this is within a valid group AND at the center repetition
- // (we embed in ALL repetitions so the watermark energy is present everywhere,
- // but the PN pattern is the same for all R frames in a group)
- if groupIdx < TotalGroups {
- bitIdx := e.groupToBit[groupIdx]
- dataSign := float64(e.symbols[bitIdx])
- _ = repIdx
- _ = centerRep
-
- for b := 0; b < NumBins; b++ {
- bin := BinLow + b
- chip := float64(e.pnChips[groupIdx][b])
-
- // Modify magnitude: |Y| = |X| × (1 + level × chip × data)
- // Phase preserved
- mag := cmplx.Abs(buf[bin])
- if mag < 1e-10 {
- continue // skip near-silence bins to avoid division by zero
- }
- phase := cmplx.Phase(buf[bin])
- newMag := mag * (1.0 + e.levelLinear*chip*dataSign)
- buf[bin] = cmplx.Rect(newMag, phase)
-
- // Mirror for negative frequencies (conjugate symmetry)
- if bin > 0 && bin < FFTSize/2 {
- buf[FFTSize-bin] = cmplx.Conj(buf[bin])
- }
- }
- }
-
- // Inverse FFT
- dsp.IFFT(buf[:])
-
- // Overlap-add to output buffer
- for i := 0; i < FFTSize; i++ {
- e.outBuf[i] += real(buf[i])
- }
-
- if !e.primed {
- e.primed = true
- e.outPos = 0
- }
-
- e.frameIdx++
- }
-
- // STFTDetector extracts watermark bits from an audio recording.
- type STFTDetector struct {
- pnChips [TotalGroups][NumBins]int8
- groupToBit [TotalGroups]int
- }
-
- // NewSTFTDetector creates a detector matching the given key's PN sequence.
- func NewSTFTDetector(key string) *STFTDetector {
- d := &STFTDetector{}
-
- // Same PN generation as embedder
- seed := sha256.Sum256(append([]byte("stft-pn-"), key...))
- prng := newPRNG(seed[:])
- for g := 0; g < TotalGroups; g++ {
- for b := 0; b < NumBins; b++ {
- if prng.next()&1 == 0 {
- d.pnChips[g][b] = 1
- } else {
- d.pnChips[g][b] = -1
- }
- }
- }
-
- // Same permutation
- for g := 0; g < TotalGroups; g++ {
- d.groupToBit[g] = g % payloadBits
- }
- permSeed := sha256.Sum256(append([]byte("stft-perm-"), key...))
- permRNG := newPRNG(permSeed[:])
- for i := TotalGroups - 1; i > 0; i-- {
- j := permRNG.next() % uint32(i+1)
- d.groupToBit[i], d.groupToBit[j] = d.groupToBit[j], d.groupToBit[i]
- }
-
- return d
- }
-
- // Detect processes audio at WMRate and returns soft bit decisions.
- // The audio should already be decimated/resampled to WMRate and LPF'd.
- //
- // Multi-test: tries TimeRep frame offsets (the block repetition candidates).
- // Cepstrum filtering is applied to reduce carrier noise.
- //
- // Returns: 128 soft correlation values (sign = bit decision, magnitude = confidence),
- // and the frame offset that gave the best detection metric.
- func (d *STFTDetector) Detect(audio []float64) (corrs [payloadBits]float64, bestOffset int) {
- // Compute all STFT frames
- var window [FFTSize]float64
- dsp.HannWindow(window[:])
-
- nFrames := (len(audio) - FFTSize) / FFTHop
- if nFrames < FramesPerWM {
- // Not enough data for a full watermark cycle — use what we have
- }
-
- // Compute STFT magnitudes (dB) for all frames
- type stftFrame struct {
- magDB [FFTSize / 2]float64
- }
- frames := make([]stftFrame, nFrames)
-
- for f := 0; f < nFrames; f++ {
- offset := f * FFTHop
- var buf [FFTSize]complex128
- for i := 0; i < FFTSize; i++ {
- if offset+i < len(audio) {
- buf[i] = complex(audio[offset+i]*window[i], 0)
- }
- }
- dsp.FFT(buf[:])
-
- for bin := 0; bin < FFTSize/2; bin++ {
- mag := cmplx.Abs(buf[bin])
- if mag < 1e-12 {
- mag = 1e-12
- }
- frames[f].magDB[bin] = 20 * math.Log10(mag)
- }
-
- // Cepstrum filtering: remove spectral envelope
- // DCT of dB magnitudes, zero first N_ceps coefficients, IDCT
- cepstrumFilter(frames[f].magDB[:], 8)
- }
-
- // Multi-test: try each of TimeRep frame offsets within the repetition block
- bestMetric := -1.0
- bestOffset = 0
-
- for startOffset := 0; startOffset < TimeRep; startOffset++ {
- var testCorrs [payloadBits]float64
-
- // For each group, use the CENTER frame of the repetition block
- for g := 0; g < TotalGroups; g++ {
- bitIdx := d.groupToBit[g]
- frameInWM := g*TimeRep + startOffset + TimeRep/2
- if frameInWM >= nFrames {
- continue
- }
-
- // Correlate this frame's magnitudes with the PN chips
- var corr float64
- for b := 0; b < NumBins; b++ {
- bin := BinLow + b
- corr += frames[frameInWM].magDB[bin] * float64(d.pnChips[g][b])
- }
- testCorrs[bitIdx] += corr
- }
-
- // Detection metric: sum of squared partial correlations (chi-squared)
- // From paper equation (10): Q = Σ (corr_m)²
- var metric float64
- for _, c := range testCorrs {
- metric += c * c
- }
-
- if metric > bestMetric {
- bestMetric = metric
- bestOffset = startOffset
- corrs = testCorrs
- }
- }
-
- return corrs, bestOffset
- }
-
// cepstrumFilter removes the smooth spectral envelope from a vector of dB
// magnitudes, in place. The envelope is the part of magDB represented by its
// first nCeps DCT-II coefficients; the result is what a full DCT-II, zeroing
// of coefficients 0..nCeps-1 and inverse DCT would produce.
//
// Because the DCT-II/inverse pair reconstructs its input exactly, zeroing the
// low coefficients is the same as subtracting their reconstruction from the
// input. Only those nCeps coefficients are therefore computed, which costs
// O(len(magDB)·nCeps) instead of O(len(magDB)²).
//
// magDB is left unchanged when nCeps <= 0 or when len(magDB) < 2*nCeps.
func cepstrumFilter(magDB []float64, nCeps int) {
	n := len(magDB)
	if nCeps <= 0 || n < nCeps*2 {
		return
	}

	row := make([]float64, n) // DCT basis vector for the current coefficient
	env := make([]float64, n) // unscaled envelope reconstruction
	for k := 0; k < nCeps; k++ {
		// Forward DCT-II coefficient k.
		var coef float64
		for i, v := range magDB {
			row[i] = math.Cos(math.Pi * float64(k) * (float64(i) + 0.5) / float64(n))
			coef += v * row[i]
		}
		if k == 0 {
			coef *= 0.5 // the inverse transform weights the DC term by 1/2
		}
		for i := range env {
			env[i] += coef * row[i]
		}
	}

	// Inverse-transform scaling, then remove the envelope.
	scale := 2.0 / float64(n)
	for i := range magDB {
		magDB[i] -= scale * env[i]
	}
}
-
// simplePRNG is a 32-bit xorshift generator. It is fully deterministic, so
// the same seed always reproduces the same sequence.
type simplePRNG struct {
	state uint32
}

// newPRNG builds a generator from an arbitrary-length seed. The seed bytes
// are XOR-folded into a 32-bit word, byte i landing in byte lane i mod 4
// (lane 0 is the least significant). A folded value of zero is replaced by 1,
// since next maps a zero state to zero forever.
func newPRNG(seed []byte) *simplePRNG {
	folded := uint32(0)
	for idx := range seed {
		lane := uint(idx&3) << 3 // bit offset of the byte lane: 0, 8, 16 or 24
		folded ^= uint32(seed[idx]) << lane
	}
	if folded == 0 {
		folded = 1
	}
	return &simplePRNG{state: folded}
}

// next advances the generator by one xorshift step (shifts 13, 17, 5) and
// returns the new state.
func (p *simplePRNG) next() uint32 {
	x := p.state
	x ^= x << 13
	x ^= x >> 17
	x ^= x << 5
	p.state = x
	return x
}
|