// Package watermark implements STFT-domain spread-spectrum audio watermarking
// based on Kirovski & Malvar (IEEE TSP 2003).
//
// Architecture:
//   - Embedding in STFT magnitude (dB scale) — multiplicative, natural masking
//   - Block repetition coding (R=5 time frames) — automatic drift tolerance
//   - Cepstrum filtering at detection — 6 dB carrier noise reduction
//   - PCC covert channel — PN partitioned into M=128 subsets for 128-bit payload
//   - Multi-test sync — scan R frame offsets to find alignment
//
// Both encoder and decoder operate at 12 kHz (WMRate). The encoder decimates
// from composite rate (÷19), processes STFT, and upsamples back. The decoder
// decimates from recording rate (÷16 from 192kHz, ÷4 from 48kHz, etc.).
// Same STFT parameters → bins align perfectly → no rate mismatch.
package watermark

import (
	"crypto/sha256"
	"math"
	"math/cmplx"

	"github.com/jan/fm-rds-tx/internal/dsp"
)

// Base STFT watermark parameters, used by both the embedder and the detector.
const (
	WMRate       = 12000            // sample rate of all watermark processing (Hz)
	FFTSize      = 512              // STFT frame length in samples at WMRate
	FFTHop       = 256              // frame advance in samples (50% overlap)
	BinLow       = 9                // first watermarked bin, ~211 Hz (bin spacing WMRate/FFTSize)
	BinHigh      = 213              // exclusive end of the watermarked band, ~4992 Hz
	NumBins      = BinHigh - BinLow // 204 frequency chips per STFT frame
	TimeRep      = 5                // block repetition factor (±2 frame drift tolerance)
	GroupsPerBit = 10               // time groups per data bit
	WMLevelDB    = 0.5              // embedding level in dB
)

// Sizes of one complete watermark cycle, derived from the parameters above.
// The figures in the comments assume payloadBits is 128.
const (
	TotalGroups  = GroupsPerBit * payloadBits // 10 × 128 = 1280
	FramesPerWM  = TotalGroups * TimeRep      // 1280 × 5 = 6400
	SamplesPerWM = FramesPerWM * FFTHop       // 6400 × 256 = 1638400, i.e. ~136.5 s at WMRate
)

// STFTEmbedder adds the STFT-domain watermark to a stream of audio blocks.
// It operates on samples at WMRate (12 kHz): the caller decimates its input
// to WMRate beforehand and upsamples the result to the desired rate afterwards.
type STFTEmbedder struct {
	// pnChips[group][bin] is the PN chip, -1 or +1, for time group
	// 0 ≤ group < TotalGroups and sub-band bin 0 ≤ bin < NumBins.
	pnChips [TotalGroups][NumBins]int8

	// groupToBit[group] is the index of the payload bit carried by that
	// time group (the PCC permutation).
	groupToBit [TotalGroups]int

	// symbols[bit] is the BPSK symbol, +1 or -1, of each RS codeword bit.
	symbols [payloadBits]int8

	// Streaming STFT state.
	window   [FFTSize]float64          // analysis window
	inBuf    [FFTSize]float64          // samples of the frame being collected
	outBuf   [FFTSize + FFTHop]float64 // overlap-add accumulator
	inPos    int                       // next write index in inBuf
	outPos   int                       // next read index in outBuf
	frameIdx int                       // STFT frame counter; processFrame reduces it modulo FramesPerWM
	primed   bool                      // set once the first frame has been processed

	// levelLinear is the fractional magnitude change, 10^(WMLevelDB/20) - 1
	// (about 0.059 for 0.5 dB).
	levelLinear float64
}

// NewSTFTEmbedder builds an embedder whose payload is derived from key.
//
// The payload is the first rsDataBytes bytes of SHA-256(key), or all zero
// bytes when key is empty. It is passed through rsEncode and mapped to BPSK
// symbols. The PN chips and the group-to-bit permutation are generated from
// fixed seed strings, not from key, so they are identical to the tables
// NewSTFTDetector builds.
func NewSTFTEmbedder(key string) *STFTEmbedder {
	emb := new(STFTEmbedder)

	// Payload bytes → RS codeword.
	var payload [rsDataBytes]byte
	if key != "" {
		digest := sha256.Sum256([]byte(key))
		copy(payload[:], digest[:rsDataBytes])
	}
	cw := rsEncode(payload)

	// Codeword bits are taken MSB-first; a 0 bit maps to +1 and a 1 bit to -1.
	for bit := 0; bit < payloadBits; bit++ {
		shift := uint(7 - (bit % 8))
		sym := int8(1)
		if (cw[bit/8]>>shift)&1 == 1 {
			sym = -1
		}
		emb.symbols[bit] = sym
	}

	// PN chips: one PRNG draw per (group, bin), group-major order.
	// An even draw gives +1, an odd draw gives -1.
	chipSeed := sha256.Sum256([]byte("fmrtx-stft-pn-v1"))
	chipRNG := newPRNG(chipSeed[:])
	for g := range emb.pnChips {
		row := &emb.pnChips[g]
		for b := range row {
			if chipRNG.next()&1 == 0 {
				row[b] = 1
			} else {
				row[b] = -1
			}
		}
	}

	// Group → bit assignment: start from round-robin interleaving, then apply
	// a Fisher-Yates shuffle over all groups driven by a fixed-seed PRNG.
	for g := range emb.groupToBit {
		emb.groupToBit[g] = g % payloadBits
	}
	shuffleSeed := sha256.Sum256([]byte("fmrtx-stft-perm-v1"))
	shuffleRNG := newPRNG(shuffleSeed[:])
	for hi := TotalGroups - 1; hi > 0; hi-- {
		pick := shuffleRNG.next() % uint32(hi+1)
		emb.groupToBit[hi], emb.groupToBit[pick] = emb.groupToBit[pick], emb.groupToBit[hi]
	}

	// Analysis window.
	dsp.HannWindow(emb.window[:])

	// WMLevelDB expressed as a fractional change of the linear magnitude.
	emb.levelLinear = math.Pow(10, WMLevelDB/20) - 1

	return emb
}

// ProcessBlock watermarks mono audio sampled at WMRate and returns a new
// slice of the same length as in. It may be called with chunks of any size;
// framing state is carried between calls.
//
// Samples are collected into FFTSize-sample frames that advance by FFTHop.
// Until the first frame has been processed the input is copied to the output
// unchanged; from then on the output is read from the overlap-add buffer.
func (e *STFTEmbedder) ProcessBlock(in []float64) []float64 {
	out := make([]float64, len(in))
	for n := range in {
		sample := in[n]

		// Collect the sample; a full frame triggers one STFT pass.
		e.inBuf[e.inPos] = sample
		e.inPos++
		if e.inPos == FFTSize {
			e.processFrame()
			// Keep the newest hop of samples as the start of the next frame.
			copy(e.inBuf[:FFTHop], e.inBuf[FFTHop:FFTSize])
			e.inPos = FFTHop
		}

		if !e.primed {
			out[n] = sample // no processed frame yet: pass through
			continue
		}

		out[n] = e.outBuf[e.outPos]
		e.outPos++
		if e.outPos < FFTHop {
			continue
		}

		// One hop has been consumed: slide the accumulator down by FFTHop
		// and clear everything behind the surviving overlap region.
		e.outPos = 0
		copy(e.outBuf[:FFTSize], e.outBuf[FFTHop:FFTSize+FFTHop])
		stale := e.outBuf[FFTSize-FFTHop:]
		for j := range stale {
			stale[j] = 0
		}
	}
	return out
}

// processFrame runs one STFT frame over the samples in inBuf:
// window → FFT → scale the watermark-band magnitudes → IFFT → overlap-add
// into outBuf[0:FFTSize].
//
// Every frame is watermarked. All TimeRep frames of a time group use the
// same PN row and the same data symbol, so the pattern is repeated across
// the repetition block.
//
// On the first call it also marks the embedder as primed and resets outPos.
func (e *STFTEmbedder) processFrame() {
	// wmFrame is in [0, FramesPerWM), so groupIdx is always a valid row of
	// pnChips and groupToBit.
	wmFrame := e.frameIdx % FramesPerWM
	groupIdx := wmFrame / TimeRep

	// Apply the analysis window and convert to complex.
	var buf [FFTSize]complex128
	for i := range buf {
		buf[i] = complex(e.inBuf[i]*e.window[i], 0)
	}

	dsp.FFT(buf[:])

	// Signed embedding depth for this group: level × data symbol (±1).
	depth := e.levelLinear * float64(e.symbols[e.groupToBit[groupIdx]])
	chips := &e.pnChips[groupIdx]

	for b, chip := range chips {
		bin := BinLow + b

		// Near-silent bins are left untouched.
		if cmplx.Abs(buf[bin]) < 1e-10 {
			continue
		}

		// |Y| = |X| × (1 + level × chip × data). Scaling both components by
		// the same positive real factor changes the magnitude and leaves the
		// phase as it was.
		gain := 1.0 + depth*float64(chip)
		buf[bin] = complex(real(buf[bin])*gain, imag(buf[bin])*gain)

		// Keep the spectrum conjugate-symmetric so the IFFT output stays real.
		if bin > 0 && bin < FFTSize/2 {
			buf[FFTSize-bin] = cmplx.Conj(buf[bin])
		}
	}

	dsp.IFFT(buf[:])

	// Overlap-add into the output accumulator.
	for i := range buf {
		e.outBuf[i] += real(buf[i])
	}

	if !e.primed {
		e.primed = true
		e.outPos = 0
	}

	// Wrap the counter at the watermark period so it stays bounded and can
	// never overflow into a negative index, even where int is 32 bits wide.
	e.frameIdx = (wmFrame + 1) % FramesPerWM
}

// STFTDetector recovers watermark bits from an audio recording.
type STFTDetector struct {
	// pnChips[group][bin] is the PN chip, -1 or +1, for each time group and
	// sub-band bin.
	pnChips [TotalGroups][NumBins]int8

	// groupToBit[group] is the index of the payload bit that group contributes to.
	groupToBit [TotalGroups]int
}

// NewSTFTDetector builds a detector. It takes no key: the PN chips and the
// group-to-bit permutation are generated from fixed seed strings, the same
// ones NewSTFTEmbedder uses, so the payload can be extracted blindly.
func NewSTFTDetector() *STFTDetector {
	det := new(STFTDetector)

	// PN chips: one PRNG draw per (group, bin), group-major order.
	// An even draw gives +1, an odd draw gives -1.
	chipSeed := sha256.Sum256([]byte("fmrtx-stft-pn-v1"))
	chipRNG := newPRNG(chipSeed[:])
	for g := range det.pnChips {
		for b := range det.pnChips[g] {
			chip := int8(1)
			if chipRNG.next()&1 != 0 {
				chip = -1
			}
			det.pnChips[g][b] = chip
		}
	}

	// Group → bit assignment: round-robin interleaving followed by a
	// Fisher-Yates shuffle driven by a fixed-seed PRNG.
	for g := range det.groupToBit {
		det.groupToBit[g] = g % payloadBits
	}
	shuffleSeed := sha256.Sum256([]byte("fmrtx-stft-perm-v1"))
	shuffleRNG := newPRNG(shuffleSeed[:])
	for hi := TotalGroups - 1; hi > 0; hi-- {
		pick := shuffleRNG.next() % uint32(hi+1)
		det.groupToBit[hi], det.groupToBit[pick] = det.groupToBit[pick], det.groupToBit[hi]
	}

	return det
}

// Detect correlates a recording against the PN chips and returns one soft
// value per payload bit. The audio should already be decimated/resampled to
// WMRate and low-pass filtered.
//
// Every complete FFTSize-sample frame (hop FFTHop) is converted to dB
// magnitudes and passed through cepstrumFilter to remove the spectral
// envelope. The frames are then tested against each of the TimeRep possible
// alignments of the repetition block; only the centre frame of each block is
// correlated. The alignment with the largest sum of squared per-bit
// correlations wins.
//
// A recording longer than one watermark cycle needs no special handling: the
// frame index is taken modulo FramesPerWM, so later cycles accumulate into
// the same per-bit sums.
//
// Returns corrs, the per-bit correlations of the winning alignment (sign =
// bit decision, magnitude = confidence), and bestOffset, that alignment's
// frame offset in [0, TimeRep). When audio holds less than one complete
// frame, corrs is all zeros and bestOffset is 0.
func (d *STFTDetector) Detect(audio []float64) (corrs [payloadBits]float64, bestOffset int) {
	// Count complete frames only. The explicit length guard keeps the count
	// from going negative (which would make the allocation below panic), and
	// the +1 includes the last frame that still fits entirely in audio.
	nFrames := 0
	if len(audio) >= FFTSize {
		nFrames = (len(audio)-FFTSize)/FFTHop + 1
	}

	var window [FFTSize]float64
	dsp.HannWindow(window[:])

	// Envelope-free dB magnitudes for every frame.
	type stftFrame struct {
		magDB [FFTSize / 2]float64
	}
	frames := make([]stftFrame, nFrames)

	var buf [FFTSize]complex128 // fully overwritten for each frame
	for f := range frames {
		seg := audio[f*FFTHop : f*FFTHop+FFTSize]
		for i, s := range seg {
			buf[i] = complex(s*window[i], 0)
		}
		dsp.FFT(buf[:])

		magDB := &frames[f].magDB
		for bin := range magDB {
			mag := cmplx.Abs(buf[bin])
			if mag < 1e-12 {
				mag = 1e-12 // floor so the logarithm stays finite
			}
			magDB[bin] = 20 * math.Log10(mag)
		}

		// Cepstrum filtering: drop the first 8 DCT coefficients of the dB
		// spectrum, i.e. its smooth envelope.
		cepstrumFilter(magDB[:], 8)
	}

	// Multi-test: try each of the TimeRep frame offsets within the
	// repetition block.
	const centerRep = TimeRep / 2
	bestMetric := -1.0

	for startOffset := 0; startOffset < TimeRep; startOffset++ {
		var testCorrs [payloadBits]float64

		for f := range frames {
			// Position of this frame within the watermark cycle, kept
			// non-negative for f < startOffset.
			wmFrame := ((f-startOffset)%FramesPerWM + FramesPerWM) % FramesPerWM
			if wmFrame%TimeRep != centerRep {
				continue // not the centre of a repetition block
			}
			g := wmFrame / TimeRep // always < TotalGroups since wmFrame < FramesPerWM

			chips := &d.pnChips[g]
			magDB := &frames[f].magDB
			var corr float64
			for b, chip := range chips {
				corr += magDB[BinLow+b] * float64(chip)
			}
			testCorrs[d.groupToBit[g]] += corr
		}

		// Detection metric: sum of squared partial correlations
		// (chi-squared), from paper equation (10): Q = Σ (corr_m)².
		var metric float64
		for _, c := range testCorrs {
			metric += c * c
		}

		if metric > bestMetric {
			bestMetric = metric
			bestOffset = startOffset
			corrs = testCorrs
		}
	}

	return corrs, bestOffset
}

// cepstrumFilter removes the smooth spectral envelope from magDB in place.
// This is the cepstrum-filtering ("CF") step of the detection scheme.
//
// The result is what you get by taking the DCT-II of magDB, zeroing the first
// nCeps coefficients and transforming back. Because the transform pair is an
// exact inverse, that equals magDB minus the reconstruction of those nCeps
// low-order coefficients. Only those are computed, so the cost is
// O(len(magDB)·nCeps) rather than two full O(N²) transforms.
//
// magDB is left untouched when len(magDB) < 2*nCeps or when nCeps ≤ 0.
func cepstrumFilter(magDB []float64, nCeps int) {
	n := len(magDB)
	if n < nCeps*2 || nCeps <= 0 {
		return
	}

	cosTable := getCosTable(n)

	// Forward DCT-II, low-order coefficients only. All of them are computed
	// from the unmodified input before anything is subtracted.
	low := make([]float64, nCeps)
	for k := range low {
		row := cosTable[k]
		var sum float64
		for i, v := range magDB {
			sum += v * row[i]
		}
		low[k] = sum
	}

	// Subtract the inverse transform of those coefficients. The inverse is
	// x[i] = (2/N)·(X[0]/2 + Σ_{k≥1} X[k]·cos(π·k·(i+0.5)/N)), hence the half
	// weight on k = 0.
	scale := 2.0 / float64(n)
	for k, c := range low {
		weight := c * scale
		if k == 0 {
			weight *= 0.5
		}
		row := cosTable[k]
		for i := range magDB {
			magDB[i] -= weight * row[i]
		}
	}
}

// Single-entry cache for the DCT cosine table:
// cachedCosTable[k][i] = cos(π·k·(i+0.5)/cachedCosN).
var (
	cachedCosTable [][]float64
	cachedCosN     int
)

// getCosTable returns the n×n DCT-II cosine table, table[k][i] =
// cos(π·k·(i+0.5)/n). The most recently requested size is cached; asking for
// a different n rebuilds the table and replaces the cache.
//
// The returned slices are the cached ones, so callers must not modify them.
// The cache is held in unsynchronised package variables, so this function is
// not safe to call from multiple goroutines at once.
func getCosTable(n int) [][]float64 {
	if cachedCosN == n {
		return cachedCosTable
	}
	table := make([][]float64, n)
	for k := range table {
		row := make([]float64, n)
		for i := range row {
			row[i] = math.Cos(math.Pi * float64(k) * (float64(i) + 0.5) / float64(n))
		}
		table[k] = row
	}
	cachedCosTable = table
	cachedCosN = n
	return table
}

// simplePRNG is a xorshift32 generator. It is deterministic for a given
// seed, which is what chip and permutation generation rely on.
type simplePRNG struct {
	state uint32
}

// newPRNG folds seed into a 32-bit state by XOR-ing byte i into byte lane
// i mod 4 (lane 0 is the least significant byte). A folded value of zero,
// including the empty-seed case, is replaced by 1 because xorshift would
// otherwise stay at zero forever.
func newPRNG(seed []byte) *simplePRNG {
	var folded uint32
	for idx := range seed {
		lane := uint(idx & 3)
		folded ^= uint32(seed[idx]) << (8 * lane)
	}
	if folded == 0 {
		folded = 1
	}
	return &simplePRNG{state: folded}
}

// next advances the generator by one xorshift32 step (shifts 13, 17, 5) and
// returns the new state.
func (p *simplePRNG) next() uint32 {
	x := p.state
	x ^= x << 13
	x ^= x >> 17
	x ^= x << 5
	p.state = x
	return x
}