| @@ -113,6 +113,7 @@ func runDSP(ctx context.Context, srcMgr *sourceManager, cfg config.Config, det * | |||||
| for k := range rt.streamPhaseState { | for k := range rt.streamPhaseState { | ||||
| rt.streamPhaseState[k].phase = 0 | rt.streamPhaseState[k].phase = 0 | ||||
| } | } | ||||
| resetStreamingOracleRunner() | |||||
| rec.ResetStreams() | rec.ResetStreams() | ||||
| logging.Warn("gap", "iq_dropped", "msg", "buffer bloat caused extraction drop; overlap reset") | logging.Warn("gap", "iq_dropped", "msg", "buffer bloat caused extraction drop; overlap reset") | ||||
| if coll != nil { | if coll != nil { | ||||
| @@ -231,7 +231,7 @@ type extractionConfig struct { | |||||
| const streamOverlapLen = 512 // must be >= FIR tap count with margin | const streamOverlapLen = 512 // must be >= FIR tap count with margin | ||||
| const ( | const ( | ||||
| wfmStreamOutRate = 500000 | |||||
| wfmStreamOutRate = 512000 | |||||
| wfmStreamMinBW = 250000 | wfmStreamMinBW = 250000 | ||||
| ) | ) | ||||
| @@ -252,6 +252,9 @@ var forceCPUStreamExtract = func() bool { | |||||
| // - IQ overlap prepended to allIQ so FIR kernel has real data in halo | // - IQ overlap prepended to allIQ so FIR kernel has real data in halo | ||||
| // | // | ||||
| // Returns extracted snippets with overlap trimmed, and updates phase state. | // Returns extracted snippets with overlap trimmed, and updates phase state. | ||||
| // extractForStreaming is the current legacy production path. | |||||
| // It still relies on overlap-prepend + trim semantics and is intentionally | |||||
| // kept separate from the new streaming refactor/oracle path under development. | |||||
| func extractForStreaming( | func extractForStreaming( | ||||
| extractMgr *extractionManager, | extractMgr *extractionManager, | ||||
| allIQ []complex64, | allIQ []complex64, | ||||
| @@ -263,6 +266,16 @@ func extractForStreaming( | |||||
| aqCfg extractionConfig, | aqCfg extractionConfig, | ||||
| coll *telemetry.Collector, | coll *telemetry.Collector, | ||||
| ) ([][]complex64, []int) { | ) ([][]complex64, []int) { | ||||
| if useStreamingProductionPath { | |||||
| if out, rates, err := extractForStreamingProduction(extractMgr, allIQ, sampleRate, centerHz, signals, aqCfg, coll); err == nil { | |||||
| return out, rates | |||||
| } | |||||
| } | |||||
| if useStreamingOraclePath { | |||||
| if out, rates, err := extractForStreamingOracle(allIQ, sampleRate, centerHz, signals, aqCfg, coll); err == nil { | |||||
| return out, rates | |||||
| } | |||||
| } | |||||
| out := make([][]complex64, len(signals)) | out := make([][]complex64, len(signals)) | ||||
| rates := make([]int, len(signals)) | rates := make([]int, len(signals)) | ||||
| if len(allIQ) == 0 || sampleRate <= 0 || len(signals) == 0 { | if len(allIQ) == 0 || sampleRate <= 0 || len(signals) == 0 { | ||||
| @@ -0,0 +1,6 @@ | |||||
| package main | |||||
| // NOTE: Legacy extractor logic still lives in helpers.go for now. | |||||
| // This file is intentionally reserved for the later explicit move once the | |||||
| // production-path rewrite is far enough along that the split can be done in one | |||||
| // safe pass instead of a risky mechanical half-step. | |||||
| @@ -13,7 +13,7 @@ func TestNewDSPRuntime(t *testing.T) { | |||||
| cfg := config.Default() | cfg := config.Default() | ||||
| det := detector.New(cfg.Detector, cfg.SampleRate, cfg.FFTSize) | det := detector.New(cfg.Detector, cfg.SampleRate, cfg.FFTSize) | ||||
| window := fftutil.Hann(cfg.FFTSize) | window := fftutil.Hann(cfg.FFTSize) | ||||
| rt := newDSPRuntime(cfg, det, window, &gpuStatus{}) | |||||
| rt := newDSPRuntime(cfg, det, window, &gpuStatus{}, nil) | |||||
| if rt == nil { | if rt == nil { | ||||
| t.Fatalf("runtime is nil") | t.Fatalf("runtime is nil") | ||||
| } | } | ||||
| @@ -47,7 +47,7 @@ func TestSurveillanceLevelsRespectStrategy(t *testing.T) { | |||||
| cfg := config.Default() | cfg := config.Default() | ||||
| det := detector.New(cfg.Detector, cfg.SampleRate, cfg.FFTSize) | det := detector.New(cfg.Detector, cfg.SampleRate, cfg.FFTSize) | ||||
| window := fftutil.Hann(cfg.FFTSize) | window := fftutil.Hann(cfg.FFTSize) | ||||
| rt := newDSPRuntime(cfg, det, window, &gpuStatus{}) | |||||
| rt := newDSPRuntime(cfg, det, window, &gpuStatus{}, nil) | |||||
| policy := pipeline.Policy{SurveillanceStrategy: "single-resolution"} | policy := pipeline.Policy{SurveillanceStrategy: "single-resolution"} | ||||
| plan := rt.buildSurveillancePlan(policy) | plan := rt.buildSurveillancePlan(policy) | ||||
| if len(plan.Levels) != 1 { | if len(plan.Levels) != 1 { | ||||
| @@ -0,0 +1,45 @@ | |||||
| package main | |||||
| import ( | |||||
| "fmt" | |||||
| "sdr-wideband-suite/internal/demod/gpudemod" | |||||
| "sdr-wideband-suite/internal/telemetry" | |||||
| ) | |||||
| func observeStreamingComparison(coll *telemetry.Collector, oracle gpudemod.StreamingExtractResult, prod gpudemod.StreamingExtractResult) { | |||||
| if coll == nil { | |||||
| return | |||||
| } | |||||
| metrics, stats := gpudemod.CompareOracleAndGPUHostOracle(oracle, prod) | |||||
| tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", oracle.SignalID), "path", "streaming_compare") | |||||
| coll.SetGauge("streaming.compare.n_out", float64(metrics.NOut), tags) | |||||
| coll.SetGauge("streaming.compare.phase_count", float64(metrics.PhaseCount), tags) | |||||
| coll.SetGauge("streaming.compare.history_len", float64(metrics.HistoryLen), tags) | |||||
| coll.Observe("streaming.compare.ref_max_abs_err", metrics.RefMaxAbsErr, tags) | |||||
| coll.Observe("streaming.compare.ref_rms_err", metrics.RefRMSErr, tags) | |||||
| coll.SetGauge("streaming.compare.compare_count", float64(stats.Count), tags) | |||||
| coll.SetGauge("streaming.compare.oracle_rate", float64(oracle.Rate), tags) | |||||
| coll.SetGauge("streaming.compare.production_rate", float64(prod.Rate), tags) | |||||
| coll.SetGauge("streaming.compare.oracle_output_len", float64(len(oracle.IQ)), tags) | |||||
| coll.SetGauge("streaming.compare.production_output_len", float64(len(prod.IQ)), tags) | |||||
| if len(oracle.IQ) > 0 { | |||||
| oracleStats := computeIQHeadStats(oracle.IQ, 64) | |||||
| coll.Observe("streaming.compare.oracle_head_mean_mag", oracleStats.meanMag, tags) | |||||
| coll.Observe("streaming.compare.oracle_head_max_step", oracleStats.maxStep, tags) | |||||
| } | |||||
| if len(prod.IQ) > 0 { | |||||
| prodStats := computeIQHeadStats(prod.IQ, 64) | |||||
| coll.Observe("streaming.compare.production_head_mean_mag", prodStats.meanMag, tags) | |||||
| coll.Observe("streaming.compare.production_head_max_step", prodStats.maxStep, tags) | |||||
| } | |||||
| coll.Event("streaming_compare_snapshot", "info", "streaming comparison snapshot", tags, map[string]any{ | |||||
| "oracle_rate": oracle.Rate, | |||||
| "production_rate": prod.Rate, | |||||
| "oracle_output_len": len(oracle.IQ), | |||||
| "production_output_len": len(prod.IQ), | |||||
| "ref_max_abs_err": metrics.RefMaxAbsErr, | |||||
| "ref_rms_err": metrics.RefRMSErr, | |||||
| "compare_count": stats.Count, | |||||
| }) | |||||
| } | |||||
| @@ -0,0 +1,27 @@ | |||||
| package main | |||||
| import ( | |||||
| "fmt" | |||||
| "sdr-wideband-suite/internal/demod/gpudemod" | |||||
| "sdr-wideband-suite/internal/telemetry" | |||||
| ) | |||||
| func observeStreamingResult(coll *telemetry.Collector, prefix string, res gpudemod.StreamingExtractResult) { | |||||
| if coll == nil { | |||||
| return | |||||
| } | |||||
| tags := telemetry.TagsFromPairs("signal_id", fmt.Sprintf("%d", res.SignalID), "path", prefix) | |||||
| coll.SetGauge(prefix+".n_out", float64(res.NOut), tags) | |||||
| coll.SetGauge(prefix+".phase_count", float64(res.PhaseCount), tags) | |||||
| coll.SetGauge(prefix+".history_len", float64(res.HistoryLen), tags) | |||||
| coll.SetGauge(prefix+".rate", float64(res.Rate), tags) | |||||
| coll.SetGauge(prefix+".output_len", float64(len(res.IQ)), tags) | |||||
| if len(res.IQ) > 0 { | |||||
| stats := computeIQHeadStats(res.IQ, 64) | |||||
| coll.Observe(prefix+".head_mean_mag", stats.meanMag, tags) | |||||
| coll.Observe(prefix+".head_max_step", stats.maxStep, tags) | |||||
| coll.Observe(prefix+".head_p95_step", stats.p95Step, tags) | |||||
| coll.SetGauge(prefix+".head_low_magnitude_count", float64(stats.lowMag), tags) | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,50 @@ | |||||
| package main | |||||
| import ( | |||||
| "fmt" | |||||
| "sdr-wideband-suite/internal/demod/gpudemod" | |||||
| "sdr-wideband-suite/internal/detector" | |||||
| "sdr-wideband-suite/internal/telemetry" | |||||
| ) | |||||
| func extractForStreamingProduction( | |||||
| extractMgr *extractionManager, | |||||
| allIQ []complex64, | |||||
| sampleRate int, | |||||
| centerHz float64, | |||||
| signals []detector.Signal, | |||||
| aqCfg extractionConfig, | |||||
| coll *telemetry.Collector, | |||||
| ) ([][]complex64, []int, error) { | |||||
| out := make([][]complex64, len(signals)) | |||||
| rates := make([]int, len(signals)) | |||||
| jobs, err := buildStreamingJobs(sampleRate, centerHz, signals, aqCfg) | |||||
| if err != nil { | |||||
| return nil, nil, err | |||||
| } | |||||
| runner := extractMgr.get(len(allIQ), sampleRate) | |||||
| if runner == nil { | |||||
| return nil, nil, fmt.Errorf("streaming production path unavailable: no batch runner") | |||||
| } | |||||
| results, err := runner.StreamingExtractGPU(allIQ, jobs) | |||||
| if err != nil { | |||||
| return nil, nil, err | |||||
| } | |||||
| var oracleResults []gpudemod.StreamingExtractResult | |||||
| if useStreamingOraclePath { | |||||
| if streamingOracleRunner == nil || streamingOracleRunner.SampleRate != sampleRate { | |||||
| streamingOracleRunner = gpudemod.NewCPUOracleRunner(sampleRate) | |||||
| } | |||||
| oracleResults, _ = streamingOracleRunner.StreamingExtract(allIQ, jobs) | |||||
| } | |||||
| for i, res := range results { | |||||
| out[i] = res.IQ | |||||
| rates[i] = res.Rate | |||||
| observeStreamingResult(coll, "streaming.production", res) | |||||
| if i < len(oracleResults) { | |||||
| observeStreamingComparison(coll, oracleResults[i], res) | |||||
| } | |||||
| } | |||||
| return out, rates, nil | |||||
| } | |||||
| @@ -0,0 +1,94 @@ | |||||
| package main | |||||
| import ( | |||||
| "math" | |||||
| "sdr-wideband-suite/internal/demod/gpudemod" | |||||
| "sdr-wideband-suite/internal/detector" | |||||
| "sdr-wideband-suite/internal/telemetry" | |||||
| ) | |||||
// useStreamingOraclePath enables the CPU oracle runner. When set,
// extractForStreamingProduction also runs the oracle on the same input and
// reports oracle-vs-production comparison telemetry.
// NOTE(review): extractForStreaming also checks this flag and returns the
// oracle output directly when the oracle call succeeds. While
// useStreamingProductionPath is false, a true value here therefore routes live
// extraction through the CPU oracle instead of the legacy path — confirm this
// is intended.
const useStreamingOraclePath = true // keep true during C2-C so the real native path is continuously compared against the corrected oracle

// useStreamingProductionPath makes extractForStreaming try
// extractForStreamingProduction first; on error it falls through to the next
// path.
const useStreamingProductionPath = false // keep false until the new production path is explicitly activated in runtime bring-up

// streamingOracleRunner is the package-level CPU oracle runner. It is created
// lazily by the streaming extract functions when it is nil or its SampleRate
// differs from the current sample rate, and its state is cleared by
// resetStreamingOracleRunner.
// NOTE(review): no synchronization is visible here; presumably all access
// happens on a single goroutine — confirm against callers.
var streamingOracleRunner *gpudemod.CPUOracleRunner
| func buildStreamingJobs(sampleRate int, centerHz float64, signals []detector.Signal, aqCfg extractionConfig) ([]gpudemod.StreamingExtractJob, error) { | |||||
| jobs := make([]gpudemod.StreamingExtractJob, len(signals)) | |||||
| decimTarget := 200000 | |||||
| bwMult := aqCfg.bwMult | |||||
| if bwMult <= 0 { | |||||
| bwMult = 1.0 | |||||
| } | |||||
| firTaps := aqCfg.firTaps | |||||
| if firTaps <= 0 { | |||||
| firTaps = 101 | |||||
| } | |||||
| for i, sig := range signals { | |||||
| bw := sig.BWHz * bwMult | |||||
| sigMHz := sig.CenterHz / 1e6 | |||||
| isWFM := (sigMHz >= 87.5 && sigMHz <= 108.0) || | |||||
| (sig.Class != nil && (sig.Class.ModType == "WFM" || sig.Class.ModType == "WFM_STEREO")) | |||||
| outRate := decimTarget | |||||
| if isWFM { | |||||
| outRate = wfmStreamOutRate | |||||
| if bw < wfmStreamMinBW { | |||||
| bw = wfmStreamMinBW | |||||
| } | |||||
| } else if bw < 20000 { | |||||
| bw = 20000 | |||||
| } | |||||
| if _, err := gpudemod.ExactIntegerDecimation(sampleRate, outRate); err != nil { | |||||
| return nil, err | |||||
| } | |||||
| offset := sig.CenterHz - centerHz | |||||
| jobs[i] = gpudemod.StreamingExtractJob{ | |||||
| SignalID: sig.ID, | |||||
| OffsetHz: offset, | |||||
| Bandwidth: bw, | |||||
| OutRate: outRate, | |||||
| NumTaps: firTaps, | |||||
| ConfigHash: gpudemod.StreamingConfigHash(sig.ID, offset, bw, outRate, firTaps, sampleRate), | |||||
| } | |||||
| } | |||||
| return jobs, nil | |||||
| } | |||||
| func resetStreamingOracleRunner() { | |||||
| if streamingOracleRunner != nil { | |||||
| streamingOracleRunner.ResetAllStates() | |||||
| } | |||||
| } | |||||
| func extractForStreamingOracle( | |||||
| allIQ []complex64, | |||||
| sampleRate int, | |||||
| centerHz float64, | |||||
| signals []detector.Signal, | |||||
| aqCfg extractionConfig, | |||||
| coll *telemetry.Collector, | |||||
| ) ([][]complex64, []int, error) { | |||||
| out := make([][]complex64, len(signals)) | |||||
| rates := make([]int, len(signals)) | |||||
| jobs, err := buildStreamingJobs(sampleRate, centerHz, signals, aqCfg) | |||||
| if err != nil { | |||||
| return nil, nil, err | |||||
| } | |||||
| if streamingOracleRunner == nil || streamingOracleRunner.SampleRate != sampleRate { | |||||
| streamingOracleRunner = gpudemod.NewCPUOracleRunner(sampleRate) | |||||
| } | |||||
| results, err := streamingOracleRunner.StreamingExtract(allIQ, jobs) | |||||
| if err != nil { | |||||
| return nil, nil, err | |||||
| } | |||||
| for i, res := range results { | |||||
| out[i] = res.IQ | |||||
| rates[i] = res.Rate | |||||
| observeStreamingResult(coll, "streaming.oracle", res) | |||||
| } | |||||
| return out, rates, nil | |||||
| } | |||||
// phaseIncForOffset returns the per-sample phase increment, in radians, for a
// frequency offset of offsetHz at the given sample rate: -2π·offsetHz/sampleRate.
//
// A non-positive sampleRate has no meaningful increment and would otherwise
// produce ±Inf or NaN from the division, so 0 is returned in that case.
func phaseIncForOffset(sampleRate int, offsetHz float64) float64 {
	if sampleRate <= 0 {
		return 0
	}
	return -2.0 * math.Pi * offsetHz / float64(sampleRate)
}
| @@ -808,6 +808,176 @@ This now points away from a simple "shared global input head is already zero" th | |||||
| - `config.autosave.yaml` must be kept in sync with `config.yaml` or telemetry defaults can silently revert after restart. | - `config.autosave.yaml` must be kept in sync with `config.yaml` or telemetry defaults can silently revert after restart. | ||||
| - The most promising root-cause area is now the shared upstream/extractor-start boundary path, not downstream playback. | - The most promising root-cause area is now the shared upstream/extractor-start boundary path, not downstream playback. | ||||
| ### 2026-03-25 refactor work status (post-reviewer instruction) | |||||
| After the reviewer guidance, work pivoted away from symptomatic patching and onto the required two-track architecture change: | |||||
| #### Track 1 — CPU/oracle path repair (in progress) | |||||
| The following was added to start building a trustworthy streaming oracle: | |||||
| - `internal/demod/gpudemod/streaming_types.go` | |||||
| - `internal/demod/gpudemod/cpu_oracle.go` | |||||
| - `internal/demod/gpudemod/cpu_oracle_test.go` | |||||
| - `internal/demod/gpudemod/streaming_oracle_extract.go` | |||||
| - `internal/demod/gpudemod/polyphase.go` | |||||
| - `internal/demod/gpudemod/polyphase_test.go` | |||||
| What exists now: | |||||
| - explicit `StreamingExtractJob` / `StreamingExtractResult` | |||||
| - explicit `CPUOracleState` | |||||
| - exact integer decimation enforcement (`ExactIntegerDecimation`) | |||||
| - monolithic-vs-chunked CPU oracle test | |||||
| - explicit polyphase tap layout (`phase-major`) | |||||
| - CPU oracle direct-vs-polyphase equivalence test | |||||
| - persistent CPU oracle runner state keyed by signal ID | |||||
| - config-hash reset behavior | |||||
| - cleanup of disappeared signals from oracle state | |||||
| Important limitation: | |||||
| - this is **not finished production validation yet** | |||||
| - the CPU oracle path is being built toward the reviewer’s required semantics, but it is not yet the final signed-off oracle for GPU validation | |||||
| #### Track 2 — GPU path architecture refactor (in progress) | |||||
| The following was added to begin the new stateful GPU architecture: | |||||
| - `internal/demod/gpudemod/stream_state.go` | |||||
| - `internal/demod/gpudemod/streaming_gpu_stub.go` | |||||
| - `docs/gpu-streaming-refactor-plan-2026-03-25.md` | |||||
| - `cmd/sdrd/streaming_refactor.go` | |||||
| What exists now: | |||||
| - explicit `ExtractStreamState` | |||||
| - batch-runner-owned per-signal state map | |||||
| - config-hash reset behavior for GPU-side stream state | |||||
| - exact integer decimation enforcement in relevant batch path | |||||
| - base taps and polyphase taps initialized into GPU-side stream state | |||||
| - explicit future production entry point: `StreamingExtractGPU(...)` | |||||
| - explicit separation between current legacy extractor path and the new streaming/oracle path | |||||
| - persistent oracle-runner lifecycle hooks, including reset on stream-drop events | |||||
| Important limitation: | |||||
| - the new GPU production path is **not implemented yet** | |||||
| - the legacy overlap+trim production path still exists and is still the current active path | |||||
| - the new GPU entry point currently exists as an explicit architectural boundary and state owner, not as the finished stateful polyphase kernel path | |||||
| #### Tests currently passing during refactor | |||||
| Repeatedly verified during the refactor work: | |||||
| - `go test ./internal/demod/gpudemod/...` | |||||
| - `go test ./cmd/sdrd/...` | |||||
| #### Incremental progress reached so far inside the refactor | |||||
| Additional progress after the initial refactor scaffolding: | |||||
| - the CPU oracle runner now uses the explicit polyphase oracle path (`CPUOracleExtractPolyphase`) instead of only carrying polyphase tap data passively | |||||
| - the CPU oracle now has a direct-vs-polyphase equivalence test | |||||
| - the GPU-side stream state now initializes both `BaseTaps` and `PolyphaseTaps` | |||||
| - the GPU side now has an explicit future production entry point `StreamingExtractGPU(...)` | |||||
| - the GPU streaming stub now advances `NCOPhase` over NEW samples only | |||||
| - the GPU streaming stub now advances `PhaseCount` modulo exact integer decimation | |||||
| - the GPU streaming stub now builds and persists `ShiftedHistory` from already frequency-shifted NEW samples | |||||
| - the new streaming/oracle path is explicitly separated from the current legacy overlap+trim production path | |||||
| Important current limitation: | |||||
| - `StreamingExtractGPU(...)` still intentionally returns a not-implemented error rather than pretending to be the finished production path | |||||
| - this is deliberate, to avoid hidden quick-fix semantics or silent goalpost shifts | |||||
| Additional note on the latest step: | |||||
| - the GPU streaming stub now also reports an estimated output-count schedule (`NOut`) derived from NEW sample consumption plus carried `PhaseCount` | |||||
| - this still does **not** make it a production path; it only means the stub now models output cadence semantics more honestly | |||||
| - the new CPU/oracle path is also now exposing additional runtime telemetry such as `streaming.oracle.rate` and `streaming.oracle.output_len`, so the reference path becomes easier to inspect as it matures | |||||
| - a reusable complex-slice comparison helper now exists (`CompareComplexSlices`) to support later oracle-vs-GPU equivalence work without improvising comparison logic at the last minute | |||||
| - a dedicated `TestCPUOracleMonolithicVsChunkedPolyphase` now verifies chunked-vs-monolithic self-consistency for the polyphase oracle path specifically | |||||
| - explicit reset tests now exist for both CPU oracle state and GPU streaming state, so config-change reset semantics are no longer only implicit in code review | |||||
| - a dedicated `ExtractDebugMetrics` structure now exists as a future comparison/telemetry contract for reviewer-required state/error/boundary metrics | |||||
| - the first mapper from oracle results into that debug-metric structure now exists, so the comparison contract is beginning to attach to real refactor code rather than staying purely conceptual | |||||
| - the same minimal debug-metric mapping now also exists for GPU-stub results, so both sides of the future GPU-vs-oracle comparison now have an initial common reporting shape | |||||
| - a first comparison-pipeline helper now exists to turn oracle-vs-GPU-stub results into shared `CompareStats` / `ExtractDebugMetrics` output, even though the GPU path is still intentionally incomplete | |||||
| - that comparison helper is now also covered by a dedicated unit test, so even the scaffolding around future GPU-vs-oracle validation is being locked down incrementally | |||||
| - GPU-side stream-state initialization is now also unit-tested (`Decim`, `BaseTaps`, `PolyphaseTaps`, `ShiftedHistory` capacity), so the new state ownership layer is no longer just trusted by inspection | |||||
| - the GPU streaming stub now also has a dedicated test proving that it advances persistent state while still explicitly failing as a not-yet-implemented production path | |||||
| - at this point, enough scaffolding exists that the next sensible step is to build the broader validation/test harness in one larger pass before continuing the actual production-path rewrite | |||||
| - that harness pass has now happened: deterministic IQ/tone fixtures, harness config/state builders, chunked polyphase oracle runners, and additional validation tests now exist, so the next step is back to the actual production-path rewrite | |||||
| - the first non-stub NEW-samples-only production-like path now exists as `StreamingExtractGPUHostOracle(...)`: it is still host-side, but it executes the new streaming/stateful semantics and therefore serves as a concrete bridge between pure test infrastructure and the eventual real GPU production path | |||||
| - that host-side production-like path is now directly compared against the CPU oracle in tests and currently matches within tight tolerance, which is an important confidence step before any real CUDA-path replacement | |||||
| - the canonical new production entry point `StreamingExtractGPU(...)` is now structurally wired so that the host-side production-like implementation can sit behind the same API later, without forcing a premature switch today | |||||
| - a top-level `cmd/sdrd` production path hook now exists as well (`extractForStreamingProduction` plus `useStreamingProductionPath=false`), so the new architecture is no longer isolated to internal packages only | |||||
| - the new production path now also emits first-class output/head telemetry (`rate`, `output_len`, `head_mean_mag`, `head_max_step`) in addition to pure state counters, which will make activation/debugging easier later | |||||
| - a top-level comparison observation hook now also exists in `cmd/sdrd`, so oracle-vs-production metrics no longer have to remain buried inside internal package helpers | |||||
| - after the broader monitoring/comparison consolidation pass, the next agreed work mode is to continue in larger clusters rather than micro-steps: (1) wire the new production semantics more deeply, (2) isolate the legacy path more sharply, (3) keep preparing the eventual real GPU production path behind the same architecture | |||||
| - after the first larger cluster, the next explicit target is to complete Cluster B: make the host-oracle bridge sit more naturally behind the new production execution architecture, rather than leaving production-path semantics spread across loosely connected files | |||||
| - after Cluster B, the remaining GPU rewrite work is now best split into two explicit parts: `C1 = prepare` and `C2 = definitive implementation`, so the project can keep momentum without pretending that the final CUDA/stateful production path is already done | |||||
| - Cluster B is now effectively complete: CPU oracle runner, host-oracle production-like path, and top-level production comparison all share the same host streaming core, and that common core is directly tested against the polyphase oracle | |||||
| - Cluster C1 is now also complete: the new GPU production layer has an explicit invocation contract, execution-result contract, state handoff/build/apply stages, and a host-side execution strategy already running behind the same model | |||||
| ### Current refactor status before C2 | |||||
| At this point the project has: | |||||
| - a corrected streaming/oracle architecture direction | |||||
| - a shared host-side streaming core used by both the CPU oracle runner and the host-side production-like bridge | |||||
| - explicit production-path hooks in `cmd/sdrd` | |||||
| - comparison and monitoring scaffolding above and below the execution layer | |||||
| - a prepared GPU execution contract (`StreamingGPUInvocation` / `StreamingGPUExecutionResult`) | |||||
| What it does **not** have yet: | |||||
| - a real native CUDA streaming/polyphase execution entry point with history-in/history-out and phase-count in/out semantics | |||||
| - a real CUDA-backed implementation behind `StreamingExtractGPUExec(...)` | |||||
| - completed GPU-vs-oracle validation on the final native execution path | |||||
| ### C2 plan | |||||
| #### C2-A — native CUDA / bridge entry preparation | |||||
| Goal: | |||||
| - introduce the real native entry shape for stateful streaming/polyphase execution | |||||
| Status note before starting C2-A: | |||||
| - C2 is **not** honestly complete yet because the native CUDA side still only exposes the old separate freq-shift/FIR/decimate pieces. | |||||
| - Therefore C2-A must begin by creating the real native entry shape rather than continuing to stack more Go-only abstractions on top of the old kernels. | |||||
| Required outcomes: | |||||
| - explicit native/CUDA function signature for streaming execution | |||||
| - bridge bindings for history in/out, phase count in/out, new samples in, outputs out | |||||
| - Go-side wrapper ready to call the new native path through the prepared invocation/result model | |||||
| #### C2-B — definitive execution implementation hookup | |||||
| Goal: | |||||
| - put a real native CUDA-backed execution strategy behind `StreamingExtractGPUExec(...)` | |||||
| Status note after C2-A: | |||||
| - the native entry shape now exists in CUDA, the Windows bridge can resolve it, and the Go execution layer can route into a native-prepared strategy. | |||||
| - what is still missing for C2-B is the actual stateful execution body behind that new native entrypoint. | |||||
| - therefore C2-B now means exactly one serious thing: replace the current placeholder body of the new native entrypoint with real stateful streaming/polyphase execution semantics, rather than adding more scaffolding around it. | |||||
| - C2-B is now materially done: the new native entrypoint no longer returns only placeholder state, and the Go native execution path now uploads inputs/history/taps, runs the new native function, and reads back outputs plus updated state. | |||||
| - when the new exact-integer streaming decimation rules were turned on, an immediate runtime integration issue appeared: previous WFM extraction defaults expected `outRate=500000`, but the live sample rate was `4096000`, which is not an integer multiple of that rate (`4096000 / 500000 = 8.192`). The correct fix is to align streaming defaults with the new integer-decimation model instead of trying to preserve the old rounded ratio behavior. | |||||
| - the concrete immediate adjustment made for this was: `wfmStreamOutRate = 512000` (instead of `500000`), because `4096000 / 512000 = 8` is exactly divisible and therefore consistent with the new streaming architecture’s no-rounding rule. | |||||
| Required outcomes: | |||||
| - `StreamingExtractGPUExec(...)` can execute a real native stateful path | |||||
| - host-oracle bridge remains available only as a comparison/support path, not as the disguised production implementation | |||||
| - state apply/backflow goes through the already prepared invocation/result contract | |||||
| #### C2-C — final validation and serious completion gate | |||||
| Goal: | |||||
| - validate the real CUDA-backed path against the corrected oracle and make the completion criterion explicit | |||||
| Required outcomes: | |||||
| - GPU-vs-oracle comparison active on the real native path | |||||
| - test coverage and runtime comparison hooks in place | |||||
| - after C2-C, the CUDA story must be treated as complete, correct, and serious — not half-switched or pseudo-finished | |||||
| #### Why the refactor is intentionally incremental | |||||
| The reviewer explicitly required: | |||||
| - no start-index-only production patch | |||||
| - no continued reliance on overlap+trim as final continuity model | |||||
| - no silent decimation rounding | |||||
| - no GPU sign-off without a corrected CPU oracle | |||||
| Because of that, the work is being done in ordered layers: | |||||
| 1. define streaming types and state | |||||
| 2. build the CPU oracle with exact streaming semantics | |||||
| 3. establish shared polyphase/tap semantics | |||||
| 4. prepare GPU-side persistent state ownership | |||||
| 5. only then replace the actual production GPU execution path | |||||
| This means the repo now contains partially completed new architecture pieces that are deliberate stepping stones, not abandoned half-fixes. | |||||
| ### Reviewer package artifacts created for second-opinion review | ### Reviewer package artifacts created for second-opinion review | ||||
| To support external/secondary review of the GPU extractor path, a focused reviewer package was created in the project root: | To support external/secondary review of the GPU extractor path, a focused reviewer package was created in the project root: | ||||
| @@ -0,0 +1,48 @@ | |||||
| # GPU Streaming Refactor Plan (2026-03-25) | |||||
| ## Goal | |||||
| Replace the current overlap+trim GPU extractor model with a true stateful per-signal streaming architecture, and build a corrected CPU oracle/reference path for validation. | |||||
| ## Non-negotiables | |||||
| - No production start-index-only patch. | |||||
| - No production overlap-prepend + trim continuity model. | |||||
| - Exact integer decimation only in the new streaming production path. | |||||
| - Persistent per-signal state must include NCO phase, FIR history, and decimator phase/residue. | |||||
| - GPU validation must compare against a corrected CPU oracle, not the legacy CPU fallback. | |||||
| ## Work order | |||||
| 1. Introduce explicit stateful streaming types in `gpudemod`. | |||||
| 2. Add a clean CPU oracle implementation and monolithic-vs-chunked tests. | |||||
| 3. Add per-signal state ownership in batch runner. | |||||
| 4. Implement new streaming extractor semantics in Go using NEW IQ samples only. | |||||
| 5. Replace legacy GPU-path assumptions (rounding decimation, overlap-prepend, trim-defined validity) in the new path. | |||||
| 6. Add production telemetry that proves state continuity (`phase_count`, `history_len`, `n_out`, reference error). | |||||
| 7. Keep legacy path isolated only for temporary comparison if needed. | |||||
| ## Initial files in scope | |||||
| - `internal/demod/gpudemod/batch.go` | |||||
| - `internal/demod/gpudemod/batch_runner.go` | |||||
| - `internal/demod/gpudemod/batch_runner_windows.go` | |||||
| - `internal/demod/gpudemod/kernels.cu` | |||||
| - `internal/demod/gpudemod/native/exports.cu` | |||||
| - `cmd/sdrd/helpers.go` | |||||
| ## Immediate implementation strategy | |||||
| ### Phase 1 | |||||
| - Create explicit streaming state structs in Go. | |||||
| - Add CPU oracle/reference path with exact semantics and tests. | |||||
| - Introduce exact integer-decimation checks. | |||||
| ### Phase 2 | |||||
| - Rework batch runner to own persistent per-signal state. | |||||
| - Add config-hash-based resets. | |||||
| - Stop modeling continuity via overlap tail in the new path. | |||||
| ### Phase 3 | |||||
| - Introduce a real streaming GPU entry path that consumes NEW shifted samples plus carried state. | |||||
| - Move to a stateful polyphase decimator model. | |||||
| ## Validation expectations | |||||
| - CPU oracle monolithic == CPU oracle chunked within tolerance. | |||||
| - GPU streaming output == CPU oracle chunked within tolerance. | |||||
| - Former periodic block-boundary clicks gone in real-world testing. | |||||
| @@ -6,7 +6,7 @@ type ExtractJob struct { | |||||
| OffsetHz float64 | OffsetHz float64 | ||||
| BW float64 | BW float64 | ||||
| OutRate int | OutRate int | ||||
| PhaseStart float64 // FreqShift starting phase (0 for stateless, carry over for streaming) | |||||
| PhaseStart float64 // legacy batch phase field; retained only while migrating to streaming extractor semantics | |||||
| } | } | ||||
| // ExtractResult holds the output of a batch extraction including the ending | // ExtractResult holds the output of a batch extraction including the ending | ||||
| @@ -10,10 +10,11 @@ type batchSlot struct { | |||||
| } | } | ||||
| type BatchRunner struct { | type BatchRunner struct { | ||||
| eng *Engine | |||||
| slots []batchSlot | |||||
| slotBufs []slotBuffers | |||||
| eng *Engine | |||||
| slots []batchSlot | |||||
| slotBufs []slotBuffers | |||||
| slotBufSize int // number of IQ samples the slot buffers were allocated for | slotBufSize int // number of IQ samples the slot buffers were allocated for | ||||
| streamState map[int64]*ExtractStreamState | |||||
| } | } | ||||
| func NewBatchRunner(maxSamples int, sampleRate int) (*BatchRunner, error) { | func NewBatchRunner(maxSamples int, sampleRate int) (*BatchRunner, error) { | ||||
| @@ -21,7 +22,7 @@ func NewBatchRunner(maxSamples int, sampleRate int) (*BatchRunner, error) { | |||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| return &BatchRunner{eng: eng}, nil | |||||
| return &BatchRunner{eng: eng, streamState: make(map[int64]*ExtractStreamState)}, nil | |||||
| } | } | ||||
| func (r *BatchRunner) Close() { | func (r *BatchRunner) Close() { | ||||
| @@ -32,6 +33,7 @@ func (r *BatchRunner) Close() { | |||||
| r.eng.Close() | r.eng.Close() | ||||
| r.eng = nil | r.eng = nil | ||||
| r.slots = nil | r.slots = nil | ||||
| r.streamState = nil | |||||
| } | } | ||||
| func (r *BatchRunner) prepare(jobs []ExtractJob) { | func (r *BatchRunner) prepare(jobs []ExtractJob) { | ||||
| @@ -160,9 +160,9 @@ func (r *BatchRunner) shiftFilterDecimateSlotParallel(iq []complex64, job Extrac | |||||
| if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 { | if bridgeMemcpyH2D(buf.dTaps, unsafe.Pointer(&taps[0]), tapsBytes) != 0 { | ||||
| return 0, 0, errors.New("taps H2D failed") | return 0, 0, errors.New("taps H2D failed") | ||||
| } | } | ||||
| decim := int(math.Round(float64(e.sampleRate) / float64(job.OutRate))) | |||||
| if decim < 1 { | |||||
| decim = 1 | |||||
| decim, err := ExactIntegerDecimation(e.sampleRate, job.OutRate) | |||||
| if err != nil { | |||||
| return 0, 0, err | |||||
| } | } | ||||
| nOut := n / decim | nOut := n / decim | ||||
| if nOut <= 0 { | if nOut <= 0 { | ||||
| @@ -0,0 +1,47 @@ | |||||
| package gpudemod | |||||
import (
	"math"
	"math/cmplx"
)
// CompareStats summarizes the sample-by-sample difference between two complex
// slices, as produced by CompareComplexSlices.
type CompareStats struct {
	// MaxAbsErr is the largest error magnitude seen; RMSErr is the
	// root-mean-square of the error magnitudes.
	MaxAbsErr, RMSErr float64
	// Count is the number of sample pairs that were compared.
	Count int
}
| func CompareComplexSlices(a []complex64, b []complex64) CompareStats { | |||||
| n := len(a) | |||||
| if len(b) < n { | |||||
| n = len(b) | |||||
| } | |||||
| if n == 0 { | |||||
| return CompareStats{} | |||||
| } | |||||
| var sumSq float64 | |||||
| var maxAbs float64 | |||||
| for i := 0; i < n; i++ { | |||||
| err := cmplx.Abs(complex128(a[i] - b[i])) | |||||
| if err > maxAbs { | |||||
| maxAbs = err | |||||
| } | |||||
| sumSq += err * err | |||||
| } | |||||
| return CompareStats{ | |||||
| MaxAbsErr: maxAbs, | |||||
| RMSErr: mathSqrt(sumSq / float64(n)), | |||||
| Count: n, | |||||
| } | |||||
| } | |||||
// mathSqrt returns the square root of v, mapping every v <= 0 to 0.
//
// It delegates to math.Sqrt. A fixed number of Newton steps seeded with v
// itself only converges for inputs close to 1: for the very small mean-square
// errors this helper is used on (e.g. 1e-12) the iterate merely halves each
// step and ends up orders of magnitude away from the true root.
func mathSqrt(v float64) float64 {
	if v <= 0 {
		return 0
	}
	return math.Sqrt(v)
}
| @@ -0,0 +1,19 @@ | |||||
| package gpudemod | |||||
| func BuildGPUStubDebugMetrics(res StreamingExtractResult) ExtractDebugMetrics { | |||||
| return ExtractDebugMetrics{ | |||||
| SignalID: res.SignalID, | |||||
| PhaseCount: res.PhaseCount, | |||||
| HistoryLen: res.HistoryLen, | |||||
| NOut: res.NOut, | |||||
| } | |||||
| } | |||||
| func BuildGPUHostOracleDebugMetrics(res StreamingExtractResult) ExtractDebugMetrics { | |||||
| return ExtractDebugMetrics{ | |||||
| SignalID: res.SignalID, | |||||
| PhaseCount: res.PhaseCount, | |||||
| HistoryLen: res.HistoryLen, | |||||
| NOut: res.NOut, | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,10 @@ | |||||
| package gpudemod | |||||
| func BuildOracleDebugMetrics(res StreamingExtractResult) ExtractDebugMetrics { | |||||
| return ExtractDebugMetrics{ | |||||
| SignalID: res.SignalID, | |||||
| PhaseCount: res.PhaseCount, | |||||
| HistoryLen: res.HistoryLen, | |||||
| NOut: res.NOut, | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,27 @@ | |||||
| package gpudemod | |||||
| func CompareOracleAndGPUStub(oracle StreamingExtractResult, gpu StreamingExtractResult) (ExtractDebugMetrics, CompareStats) { | |||||
| stats := CompareComplexSlices(oracle.IQ, gpu.IQ) | |||||
| metrics := ExtractDebugMetrics{ | |||||
| SignalID: oracle.SignalID, | |||||
| PhaseCount: gpu.PhaseCount, | |||||
| HistoryLen: gpu.HistoryLen, | |||||
| NOut: gpu.NOut, | |||||
| RefMaxAbsErr: stats.MaxAbsErr, | |||||
| RefRMSErr: stats.RMSErr, | |||||
| } | |||||
| return metrics, stats | |||||
| } | |||||
| func CompareOracleAndGPUHostOracle(oracle StreamingExtractResult, gpu StreamingExtractResult) (ExtractDebugMetrics, CompareStats) { | |||||
| stats := CompareComplexSlices(oracle.IQ, gpu.IQ) | |||||
| metrics := ExtractDebugMetrics{ | |||||
| SignalID: oracle.SignalID, | |||||
| PhaseCount: gpu.PhaseCount, | |||||
| HistoryLen: gpu.HistoryLen, | |||||
| NOut: gpu.NOut, | |||||
| RefMaxAbsErr: stats.MaxAbsErr, | |||||
| RefRMSErr: stats.RMSErr, | |||||
| } | |||||
| return metrics, stats | |||||
| } | |||||
| @@ -0,0 +1,32 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestCompareOracleAndGPUStub(t *testing.T) { | |||||
| oracle := StreamingExtractResult{ | |||||
| SignalID: 1, | |||||
| IQ: []complex64{1 + 1i, 2 + 2i}, | |||||
| Rate: 200000, | |||||
| NOut: 2, | |||||
| PhaseCount: 0, | |||||
| HistoryLen: 64, | |||||
| } | |||||
| gpu := StreamingExtractResult{ | |||||
| SignalID: 1, | |||||
| IQ: []complex64{1 + 1i, 2.1 + 2i}, | |||||
| Rate: 200000, | |||||
| NOut: 2, | |||||
| PhaseCount: 3, | |||||
| HistoryLen: 64, | |||||
| } | |||||
| metrics, stats := CompareOracleAndGPUStub(oracle, gpu) | |||||
| if metrics.SignalID != 1 { | |||||
| t.Fatalf("unexpected signal id: %d", metrics.SignalID) | |||||
| } | |||||
| if stats.Count != 2 { | |||||
| t.Fatalf("unexpected compare count: %d", stats.Count) | |||||
| } | |||||
| if metrics.RefMaxAbsErr <= 0 { | |||||
| t.Fatalf("expected positive max abs error") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,12 @@ | |||||
| package gpudemod | |||||
// ExtractDebugMetrics is a flat record of per-signal extractor diagnostics.
type ExtractDebugMetrics struct {
	// SignalID identifies the signal the metrics belong to.
	SignalID int64
	// State counters copied from a streaming extract result.
	PhaseCount, HistoryLen, NOut int
	// Maximum and RMS error magnitude against a reference output.
	RefMaxAbsErr, RefRMSErr float64
	// NOTE(review): nothing visible here fills these two fields; presumably
	// they describe discontinuity at chunk boundaries -- confirm with the producer.
	BoundaryDelta, BoundaryD2 float64
}
| @@ -0,0 +1,18 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestCompareComplexSlices(t *testing.T) { | |||||
| a := []complex64{1 + 1i, 2 + 2i, 3 + 3i} | |||||
| b := []complex64{1 + 1i, 2.1 + 2i, 2.9 + 3.2i} | |||||
| stats := CompareComplexSlices(a, b) | |||||
| if stats.Count != 3 { | |||||
| t.Fatalf("unexpected count: %d", stats.Count) | |||||
| } | |||||
| if stats.MaxAbsErr <= 0 { | |||||
| t.Fatalf("expected positive max abs error") | |||||
| } | |||||
| if stats.RMSErr <= 0 { | |||||
| t.Fatalf("expected positive rms error") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,170 @@ | |||||
| package gpudemod | |||||
| import ( | |||||
| "fmt" | |||||
| "math" | |||||
| ) | |||||
// CPUOracleState is the per-signal state carried from one CPUOracleExtract /
// CPUOracleExtractPolyphase call to the next.
type CPUOracleState struct {
	SignalID int64
	// ConfigHash is compared by ResetCPUOracleStateIfConfigChanged to decide
	// whether the streaming state must be cleared.
	ConfigHash uint64
	// NCOPhase is the rotator phase (radians) applied to the next input sample.
	NCOPhase float64
	// Decim is the number of input samples per output sample, PhaseCount the
	// number of inputs consumed since the last output, NumTaps the FIR length.
	Decim, PhaseCount, NumTaps int
	// ShiftedHistory holds the most recent rotated input samples; the
	// extractors leave at most NumTaps-1 of them behind after each call.
	ShiftedHistory []complex64
	// BaseTaps are the FIR coefficients; PolyphaseTaps is their phase-major
	// layout as built by BuildPolyphaseTapsPhaseMajor.
	BaseTaps, PolyphaseTaps []float32
}
| func ResetCPUOracleStateIfConfigChanged(state *CPUOracleState, newHash uint64) { | |||||
| if state == nil { | |||||
| return | |||||
| } | |||||
| if state.ConfigHash != newHash { | |||||
| state.ConfigHash = newHash | |||||
| state.NCOPhase = 0 | |||||
| state.PhaseCount = 0 | |||||
| state.ShiftedHistory = state.ShiftedHistory[:0] | |||||
| } | |||||
| } | |||||
| func CPUOracleExtract(iqNew []complex64, state *CPUOracleState, phaseInc float64) []complex64 { | |||||
| if state == nil || state.NumTaps <= 0 || state.Decim <= 0 || len(state.BaseTaps) < state.NumTaps { | |||||
| return nil | |||||
| } | |||||
| out := make([]complex64, 0, len(iqNew)/maxInt(1, state.Decim)+2) | |||||
| phase := state.NCOPhase | |||||
| hist := append([]complex64(nil), state.ShiftedHistory...) | |||||
| for _, x := range iqNew { | |||||
| rot := complex64(complex(math.Cos(phase), math.Sin(phase))) | |||||
| s := x * rot | |||||
| hist = append(hist, s) | |||||
| state.PhaseCount++ | |||||
| if state.PhaseCount == state.Decim { | |||||
| var y complex64 | |||||
| for k := 0; k < state.NumTaps; k++ { | |||||
| idx := len(hist) - 1 - k | |||||
| var sample complex64 | |||||
| if idx >= 0 { | |||||
| sample = hist[idx] | |||||
| } | |||||
| y += complex(state.BaseTaps[k], 0) * sample | |||||
| } | |||||
| out = append(out, y) | |||||
| state.PhaseCount = 0 | |||||
| } | |||||
| if len(hist) > state.NumTaps-1 { | |||||
| hist = hist[len(hist)-(state.NumTaps-1):] | |||||
| } | |||||
| phase += phaseInc | |||||
| if phase >= math.Pi { | |||||
| phase -= 2 * math.Pi | |||||
| } else if phase < -math.Pi { | |||||
| phase += 2 * math.Pi | |||||
| } | |||||
| } | |||||
| state.NCOPhase = phase | |||||
| state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) | |||||
| return out | |||||
| } | |||||
| // CPUOracleExtractPolyphase keeps the same streaming state semantics as CPUOracleExtract, | |||||
| // but computes outputs using the explicit phase-major polyphase tap layout. | |||||
| func CPUOracleExtractPolyphase(iqNew []complex64, state *CPUOracleState, phaseInc float64) []complex64 { | |||||
| if state == nil || state.NumTaps <= 0 || state.Decim <= 0 || len(state.BaseTaps) < state.NumTaps { | |||||
| return nil | |||||
| } | |||||
| if len(state.PolyphaseTaps) == 0 { | |||||
| state.PolyphaseTaps = BuildPolyphaseTapsPhaseMajor(state.BaseTaps, state.Decim) | |||||
| } | |||||
| phaseLen := PolyphasePhaseLen(len(state.BaseTaps), state.Decim) | |||||
| out := make([]complex64, 0, len(iqNew)/maxInt(1, state.Decim)+2) | |||||
| phase := state.NCOPhase | |||||
| hist := append([]complex64(nil), state.ShiftedHistory...) | |||||
| for _, x := range iqNew { | |||||
| rot := complex64(complex(math.Cos(phase), math.Sin(phase))) | |||||
| s := x * rot | |||||
| hist = append(hist, s) | |||||
| state.PhaseCount++ | |||||
| if state.PhaseCount == state.Decim { | |||||
| var y complex64 | |||||
| for p := 0; p < state.Decim; p++ { | |||||
| for k := 0; k < phaseLen; k++ { | |||||
| tap := state.PolyphaseTaps[p*phaseLen+k] | |||||
| if tap == 0 { | |||||
| continue | |||||
| } | |||||
| srcBack := p + k*state.Decim | |||||
| idx := len(hist) - 1 - srcBack | |||||
| if idx < 0 { | |||||
| continue | |||||
| } | |||||
| y += complex(tap, 0) * hist[idx] | |||||
| } | |||||
| } | |||||
| out = append(out, y) | |||||
| state.PhaseCount = 0 | |||||
| } | |||||
| if len(hist) > state.NumTaps-1 { | |||||
| hist = hist[len(hist)-(state.NumTaps-1):] | |||||
| } | |||||
| phase += phaseInc | |||||
| if phase >= math.Pi { | |||||
| phase -= 2 * math.Pi | |||||
| } else if phase < -math.Pi { | |||||
| phase += 2 * math.Pi | |||||
| } | |||||
| } | |||||
| state.NCOPhase = phase | |||||
| state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) | |||||
| return out | |||||
| } | |||||
| func RunChunkedCPUOracle(all []complex64, chunkSizes []int, mkState func() *CPUOracleState, phaseInc float64) []complex64 { | |||||
| state := mkState() | |||||
| out := make([]complex64, 0) | |||||
| pos := 0 | |||||
| for _, n := range chunkSizes { | |||||
| if pos >= len(all) { | |||||
| break | |||||
| } | |||||
| end := pos + n | |||||
| if end > len(all) { | |||||
| end = len(all) | |||||
| } | |||||
| out = append(out, CPUOracleExtract(all[pos:end], state, phaseInc)...) | |||||
| pos = end | |||||
| } | |||||
| if pos < len(all) { | |||||
| out = append(out, CPUOracleExtract(all[pos:], state, phaseInc)...) | |||||
| } | |||||
| return out | |||||
| } | |||||
// ExactIntegerDecimation returns sampleRate/outRate when that ratio is a whole
// number. It returns an error when either rate is not positive or when
// sampleRate is not an exact multiple of outRate.
func ExactIntegerDecimation(sampleRate int, outRate int) (int, error) {
	switch {
	case sampleRate <= 0 || outRate <= 0:
		return 0, fmt.Errorf("invalid sampleRate/outRate: %d/%d", sampleRate, outRate)
	case sampleRate%outRate != 0:
		return 0, fmt.Errorf("streaming polyphase extractor requires integer decimation: sampleRate=%d outRate=%d", sampleRate, outRate)
	}
	return sampleRate / outRate, nil
}
// maxInt returns the larger of a and b.
func maxInt(a int, b int) int {
	if b >= a {
		return b
	}
	return a
}
| @@ -0,0 +1,89 @@ | |||||
| package gpudemod | |||||
| import ( | |||||
| "math" | |||||
| "math/cmplx" | |||||
| "testing" | |||||
| ) | |||||
// makeDeterministicIQ returns n reproducible complex test samples: sample i is
// a unit tone at 0.017*i rad plus a weaker component at 0.031*i rad
// (amplitude 0.2 on the real part, 0.15 on the imaginary part).
func makeDeterministicIQ(n int) []complex64 {
	samples := make([]complex64, n)
	for i := range samples {
		a := 0.017 * float64(i)
		b := 0.031 * float64(i)
		re := math.Cos(a) + 0.2*math.Cos(b)
		im := math.Sin(a) + 0.15*math.Sin(b)
		samples[i] = complex64(complex(re, im))
	}
	return samples
}
// makeLowpassTaps returns n equal coefficients of value 1/n, i.e. a moving
// average used as a simple lowpass stand-in for the tests.
func makeLowpassTaps(n int) []float32 {
	taps := make([]float32, n)
	weight := 1.0 / float32(n)
	for i := 0; i < len(taps); i++ {
		taps[i] = weight
	}
	return taps
}
// requireComplexSlicesClose fails the test immediately when a and b differ in
// length or when any pair of samples is further apart than tol.
func requireComplexSlicesClose(t *testing.T, a []complex64, b []complex64, tol float64) {
	t.Helper()
	if len(a) != len(b) {
		t.Fatalf("length mismatch: %d vs %d", len(a), len(b))
	}
	for i := 0; i < len(a); i++ {
		dist := cmplx.Abs(complex128(a[i] - b[i]))
		if dist <= tol {
			continue
		}
		t.Fatalf("slice mismatch at %d: %v vs %v (tol=%f)", i, a[i], b[i], tol)
	}
}
| func TestCPUOracleMonolithicVsChunked(t *testing.T) { | |||||
| iq := makeDeterministicIQ(200000) | |||||
| mk := func() *CPUOracleState { | |||||
| return &CPUOracleState{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 123, | |||||
| NCOPhase: 0, | |||||
| Decim: 20, | |||||
| PhaseCount: 0, | |||||
| NumTaps: 65, | |||||
| ShiftedHistory: make([]complex64, 0, 64), | |||||
| BaseTaps: makeLowpassTaps(65), | |||||
| } | |||||
| } | |||||
| phaseInc := 0.017 | |||||
| monoState := mk() | |||||
| mono := CPUOracleExtract(iq, monoState, phaseInc) | |||||
| chunked := RunChunkedCPUOracle(iq, []int{4096, 5000, 8192, 27307}, mk, phaseInc) | |||||
| requireComplexSlicesClose(t, mono, chunked, 1e-5) | |||||
| } | |||||
| func TestExactIntegerDecimation(t *testing.T) { | |||||
| if d, err := ExactIntegerDecimation(4000000, 200000); err != nil || d != 20 { | |||||
| t.Fatalf("unexpected exact decim result: d=%d err=%v", d, err) | |||||
| } | |||||
| if _, err := ExactIntegerDecimation(4000000, 192000); err == nil { | |||||
| t.Fatalf("expected non-integer decimation error") | |||||
| } | |||||
| } | |||||
| func TestCPUOracleDirectVsPolyphase(t *testing.T) { | |||||
| iq := makeDeterministicIQ(50000) | |||||
| mk := func() *CPUOracleState { | |||||
| taps := makeLowpassTaps(65) | |||||
| return &CPUOracleState{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 123, | |||||
| NCOPhase: 0, | |||||
| Decim: 20, | |||||
| PhaseCount: 0, | |||||
| NumTaps: 65, | |||||
| ShiftedHistory: make([]complex64, 0, 64), | |||||
| BaseTaps: taps, | |||||
| PolyphaseTaps: BuildPolyphaseTapsPhaseMajor(taps, 20), | |||||
| } | |||||
| } | |||||
| phaseInc := 0.017 | |||||
| direct := CPUOracleExtract(iq, mk(), phaseInc) | |||||
| poly := CPUOracleExtractPolyphase(iq, mk(), phaseInc) | |||||
| requireComplexSlicesClose(t, direct, poly, 1e-5) | |||||
| } | |||||
| @@ -320,3 +320,132 @@ GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda( | |||||
| gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start); | gpud_ssb_product_kernel<<<grid, block>>>(in, out, n, phase_inc, phase_start); | ||||
| return (int)cudaGetLastError(); | return (int)cudaGetLastError(); | ||||
| } | } | ||||
| GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_prepare_cuda( | |||||
| const float2* in_new, | |||||
| int n_new, | |||||
| const float2* history_in, | |||||
| int history_len, | |||||
| const float* polyphase_taps, | |||||
| int polyphase_len, | |||||
| int decim, | |||||
| int num_taps, | |||||
| int phase_count_in, | |||||
| double phase_start, | |||||
| double phase_inc, | |||||
| float2* out, | |||||
| int* n_out, | |||||
| int* phase_count_out, | |||||
| double* phase_end_out, | |||||
| float2* history_out | |||||
| ) { | |||||
| if (!in_new || n_new < 0 || !polyphase_taps || polyphase_len <= 0 || decim <= 0 || num_taps <= 0) return -1; | |||||
| const int phase_len = (num_taps + decim - 1) / decim; | |||||
| if (polyphase_len < decim * phase_len) return -2; | |||||
| const int combined_len = history_len + n_new; | |||||
| float2* shifted = NULL; | |||||
| float2* combined = NULL; | |||||
| cudaError_t err = cudaMalloc((void**)&shifted, (size_t)max(1, n_new) * sizeof(float2)); | |||||
| if (err != cudaSuccess) return (int)err; | |||||
| err = cudaMalloc((void**)&combined, (size_t)max(1, combined_len) * sizeof(float2)); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| return (int)err; | |||||
| } | |||||
| const int block = 256; | |||||
| const int grid_shift = (n_new + block - 1) / block; | |||||
| if (n_new > 0) { | |||||
| gpud_freq_shift_kernel<<<grid_shift, block>>>(in_new, shifted, n_new, phase_inc, phase_start); | |||||
| err = cudaGetLastError(); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| } | |||||
| if (history_len > 0 && history_in) { | |||||
| err = cudaMemcpy(combined, history_in, (size_t)history_len * sizeof(float2), cudaMemcpyDeviceToDevice); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| } | |||||
| if (n_new > 0) { | |||||
| err = cudaMemcpy(combined + history_len, shifted, (size_t)n_new * sizeof(float2), cudaMemcpyDeviceToDevice); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| } | |||||
| int out_count = 0; | |||||
| int phase_count = phase_count_in; | |||||
| for (int i = 0; i < n_new; ++i) { | |||||
| phase_count++; | |||||
| if (phase_count == decim) { | |||||
| float2 acc = make_float2(0.0f, 0.0f); | |||||
| int newest = history_len + i; | |||||
| for (int p = 0; p < decim; ++p) { | |||||
| for (int k = 0; k < phase_len; ++k) { | |||||
| int tap_idx = p * phase_len + k; | |||||
| if (tap_idx >= polyphase_len) continue; | |||||
| float tap; | |||||
| err = cudaMemcpy(&tap, polyphase_taps + tap_idx, sizeof(float), cudaMemcpyDeviceToHost); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| if (tap == 0.0f) continue; | |||||
| int src_back = p + k * decim; | |||||
| int src_idx = newest - src_back; | |||||
| if (src_idx < 0) continue; | |||||
| float2 sample; | |||||
| err = cudaMemcpy(&sample, combined + src_idx, sizeof(float2), cudaMemcpyDeviceToHost); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| acc.x += sample.x * tap; | |||||
| acc.y += sample.y * tap; | |||||
| } | |||||
| } | |||||
| err = cudaMemcpy(out + out_count, &acc, sizeof(float2), cudaMemcpyHostToDevice); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| out_count++; | |||||
| phase_count = 0; | |||||
| } | |||||
| } | |||||
| const int keep = num_taps > 1 ? num_taps - 1 : 0; | |||||
| if (history_out && keep > 0) { | |||||
| int copy = keep; | |||||
| if (combined_len < copy) copy = combined_len; | |||||
| if (copy > 0) { | |||||
| err = cudaMemcpy(history_out, combined + (combined_len - copy), (size_t)copy * sizeof(float2), cudaMemcpyDeviceToDevice); | |||||
| if (err != cudaSuccess) { | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return (int)err; | |||||
| } | |||||
| } | |||||
| } | |||||
| if (n_out) *n_out = out_count; | |||||
| if (phase_count_out) *phase_count_out = phase_count; | |||||
| if (phase_end_out) *phase_end_out = phase_start + phase_inc * (double)n_new; | |||||
| cudaFree(shifted); | |||||
| cudaFree(combined); | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,31 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestCPUOracleRunnerCleansUpDisappearedSignals(t *testing.T) { | |||||
| r := NewCPUOracleRunner(4000000) | |||||
| jobs1 := []StreamingExtractJob{ | |||||
| {SignalID: 1, OffsetHz: 1000, Bandwidth: 20000, OutRate: 200000, NumTaps: 65, ConfigHash: 101}, | |||||
| {SignalID: 2, OffsetHz: 2000, Bandwidth: 20000, OutRate: 200000, NumTaps: 65, ConfigHash: 102}, | |||||
| } | |||||
| _, err := r.StreamingExtract(makeDeterministicIQ(4096), jobs1) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected error on first extract: %v", err) | |||||
| } | |||||
| if len(r.States) != 2 { | |||||
| t.Fatalf("expected 2 states, got %d", len(r.States)) | |||||
| } | |||||
| jobs2 := []StreamingExtractJob{ | |||||
| {SignalID: 2, OffsetHz: 2000, Bandwidth: 20000, OutRate: 200000, NumTaps: 65, ConfigHash: 102}, | |||||
| } | |||||
| _, err = r.StreamingExtract(makeDeterministicIQ(2048), jobs2) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected error on second extract: %v", err) | |||||
| } | |||||
| if len(r.States) != 1 { | |||||
| t.Fatalf("expected 1 state after cleanup, got %d", len(r.States)) | |||||
| } | |||||
| if _, ok := r.States[1]; ok { | |||||
| t.Fatalf("expected signal 1 state to be cleaned up") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,45 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestCPUOracleMonolithicVsChunkedPolyphase(t *testing.T) { | |||||
| iq := makeDeterministicIQ(120000) | |||||
| mk := func() *CPUOracleState { | |||||
| taps := makeLowpassTaps(65) | |||||
| return &CPUOracleState{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 999, | |||||
| NCOPhase: 0, | |||||
| Decim: 20, | |||||
| PhaseCount: 0, | |||||
| NumTaps: 65, | |||||
| ShiftedHistory: make([]complex64, 0, 64), | |||||
| BaseTaps: taps, | |||||
| PolyphaseTaps: BuildPolyphaseTapsPhaseMajor(taps, 20), | |||||
| } | |||||
| } | |||||
| phaseInc := 0.013 | |||||
| mono := CPUOracleExtractPolyphase(iq, mk(), phaseInc) | |||||
| chunked := func() []complex64 { | |||||
| state := mk() | |||||
| out := make([]complex64, 0) | |||||
| chunks := []int{4096, 3000, 8192, 7777, 12000} | |||||
| pos := 0 | |||||
| for _, n := range chunks { | |||||
| if pos >= len(iq) { | |||||
| break | |||||
| } | |||||
| end := pos + n | |||||
| if end > len(iq) { | |||||
| end = len(iq) | |||||
| } | |||||
| out = append(out, CPUOracleExtractPolyphase(iq[pos:end], state, phaseInc)...) | |||||
| pos = end | |||||
| } | |||||
| if pos < len(iq) { | |||||
| out = append(out, CPUOracleExtractPolyphase(iq[pos:], state, phaseInc)...) | |||||
| } | |||||
| return out | |||||
| }() | |||||
| requireComplexSlicesClose(t, mono, chunked, 1e-5) | |||||
| } | |||||
| @@ -0,0 +1,28 @@ | |||||
| package gpudemod | |||||
// BuildPolyphaseTapsPhaseMajor rearranges the FIR taps in base into decim
// polyphase branches stored one after another:
//
//	out[p*phaseLen + k] = base[p + k*decim]
//
// with phaseLen = ceil(len(base)/decim). Slots whose source index falls past
// the end of base are left at zero. It returns nil when base is empty or decim
// is not positive.
func BuildPolyphaseTapsPhaseMajor(base []float32, decim int) []float32 {
	if len(base) == 0 || decim <= 0 {
		return nil
	}
	phaseLen := (len(base) + decim - 1) / decim
	out := make([]float32, decim*phaseLen)
	// Scatter each base tap to its (phase, position) slot; untouched slots
	// keep the zero padding from make.
	for src, h := range base {
		out[(src%decim)*phaseLen+src/decim] = h
	}
	return out
}
// PolyphasePhaseLen returns the number of taps per polyphase branch,
// ceil(baseLen/decim), or 0 when either argument is not positive.
func PolyphasePhaseLen(baseLen int, decim int) int {
	if baseLen > 0 && decim > 0 {
		return (baseLen + decim - 1) / decim
	}
	return 0
}
| @@ -0,0 +1,22 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestBuildPolyphaseTapsPhaseMajor(t *testing.T) { | |||||
| base := []float32{1, 2, 3, 4, 5, 6, 7} | |||||
| got := BuildPolyphaseTapsPhaseMajor(base, 3) | |||||
| // phase-major with phase len ceil(7/3)=3 | |||||
| want := []float32{ | |||||
| 1, 4, 7, | |||||
| 2, 5, 0, | |||||
| 3, 6, 0, | |||||
| } | |||||
| if len(got) != len(want) { | |||||
| t.Fatalf("len mismatch: got %d want %d", len(got), len(want)) | |||||
| } | |||||
| for i := range want { | |||||
| if got[i] != want[i] { | |||||
| t.Fatalf("mismatch at %d: got %v want %v", i, got[i], want[i]) | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,57 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestResetCPUOracleStateIfConfigChanged(t *testing.T) { | |||||
| state := &CPUOracleState{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 111, | |||||
| NCOPhase: 1.23, | |||||
| Decim: 20, | |||||
| PhaseCount: 7, | |||||
| NumTaps: 65, | |||||
| ShiftedHistory: []complex64{1 + 1i, 2 + 2i}, | |||||
| } | |||||
| ResetCPUOracleStateIfConfigChanged(state, 222) | |||||
| if state.ConfigHash != 222 { | |||||
| t.Fatalf("config hash not updated") | |||||
| } | |||||
| if state.NCOPhase != 0 { | |||||
| t.Fatalf("expected phase reset") | |||||
| } | |||||
| if state.PhaseCount != 0 { | |||||
| t.Fatalf("expected phase count reset") | |||||
| } | |||||
| if len(state.ShiftedHistory) != 0 { | |||||
| t.Fatalf("expected shifted history reset") | |||||
| } | |||||
| } | |||||
| func TestResetExtractStreamState(t *testing.T) { | |||||
| state := &ExtractStreamState{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 111, | |||||
| NCOPhase: 2.34, | |||||
| Decim: 20, | |||||
| PhaseCount: 9, | |||||
| NumTaps: 65, | |||||
| ShiftedHistory: []complex64{3 + 3i, 4 + 4i}, | |||||
| Initialized: true, | |||||
| } | |||||
| ResetExtractStreamState(state, 333) | |||||
| if state.ConfigHash != 333 { | |||||
| t.Fatalf("config hash not updated") | |||||
| } | |||||
| if state.NCOPhase != 0 { | |||||
| t.Fatalf("expected phase reset") | |||||
| } | |||||
| if state.PhaseCount != 0 { | |||||
| t.Fatalf("expected phase count reset") | |||||
| } | |||||
| if len(state.ShiftedHistory) != 0 { | |||||
| t.Fatalf("expected shifted history reset") | |||||
| } | |||||
| if state.Initialized { | |||||
| t.Fatalf("expected initialized=false after reset") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,60 @@ | |||||
| package gpudemod | |||||
| import "sdr-wideband-suite/internal/dsp" | |||||
| func (r *BatchRunner) ResetSignalState(signalID int64) { | |||||
| if r == nil || r.streamState == nil { | |||||
| return | |||||
| } | |||||
| delete(r.streamState, signalID) | |||||
| } | |||||
| func (r *BatchRunner) ResetAllSignalStates() { | |||||
| if r == nil { | |||||
| return | |||||
| } | |||||
| r.streamState = make(map[int64]*ExtractStreamState) | |||||
| } | |||||
| func (r *BatchRunner) getOrInitExtractState(job StreamingExtractJob, sampleRate int) (*ExtractStreamState, error) { | |||||
| if r == nil { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| if r.streamState == nil { | |||||
| r.streamState = make(map[int64]*ExtractStreamState) | |||||
| } | |||||
| decim, err := ExactIntegerDecimation(sampleRate, job.OutRate) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| state := r.streamState[job.SignalID] | |||||
| if state == nil { | |||||
| state = &ExtractStreamState{SignalID: job.SignalID} | |||||
| r.streamState[job.SignalID] = state | |||||
| } | |||||
| if state.ConfigHash != job.ConfigHash { | |||||
| ResetExtractStreamState(state, job.ConfigHash) | |||||
| } | |||||
| state.Decim = decim | |||||
| state.NumTaps = job.NumTaps | |||||
| if state.NumTaps <= 0 { | |||||
| state.NumTaps = 101 | |||||
| } | |||||
| cutoff := job.Bandwidth / 2 | |||||
| if cutoff < 200 { | |||||
| cutoff = 200 | |||||
| } | |||||
| base := dsp.LowpassFIR(cutoff, sampleRate, state.NumTaps) | |||||
| state.BaseTaps = make([]float32, len(base)) | |||||
| for i, v := range base { | |||||
| state.BaseTaps[i] = float32(v) | |||||
| } | |||||
| state.PolyphaseTaps = BuildPolyphaseTapsPhaseMajor(state.BaseTaps, state.Decim) | |||||
| if cap(state.ShiftedHistory) < maxInt(0, state.NumTaps-1) { | |||||
| state.ShiftedHistory = make([]complex64, 0, maxInt(0, state.NumTaps-1)) | |||||
| } else if state.ShiftedHistory == nil { | |||||
| state.ShiftedHistory = make([]complex64, 0, maxInt(0, state.NumTaps-1)) | |||||
| } | |||||
| state.Initialized = true | |||||
| return state, nil | |||||
| } | |||||
| @@ -0,0 +1,31 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestGetOrInitExtractStateInitializesPolyphaseAndHistory(t *testing.T) { | |||||
| r := &BatchRunner{streamState: make(map[int64]*ExtractStreamState)} | |||||
| job := StreamingExtractJob{ | |||||
| SignalID: 7, | |||||
| OffsetHz: 12500, | |||||
| Bandwidth: 20000, | |||||
| OutRate: 200000, | |||||
| NumTaps: 65, | |||||
| ConfigHash: 555, | |||||
| } | |||||
| state, err := r.getOrInitExtractState(job, 4000000) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected error: %v", err) | |||||
| } | |||||
| if state.Decim != 20 { | |||||
| t.Fatalf("unexpected decim: %d", state.Decim) | |||||
| } | |||||
| if len(state.BaseTaps) != 65 { | |||||
| t.Fatalf("unexpected base taps len: %d", len(state.BaseTaps)) | |||||
| } | |||||
| if len(state.PolyphaseTaps) == 0 { | |||||
| t.Fatalf("expected polyphase taps") | |||||
| } | |||||
| if cap(state.ShiftedHistory) < 64 { | |||||
| t.Fatalf("expected shifted history capacity >= 64, got %d", cap(state.ShiftedHistory)) | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,38 @@ | |||||
| package gpudemod | |||||
// StreamingGPUExecutionMode names the way a streaming GPU extraction was
// carried out.
type StreamingGPUExecutionMode string

// Known execution modes.
const (
	// StreamingGPUExecUnavailable marks that no execution path was available.
	StreamingGPUExecUnavailable StreamingGPUExecutionMode = "unavailable"
	// StreamingGPUExecHostOracle marks execution by the host-side oracle.
	StreamingGPUExecHostOracle StreamingGPUExecutionMode = "host_oracle"
	// StreamingGPUExecCUDA marks execution by the CUDA path.
	StreamingGPUExecCUDA StreamingGPUExecutionMode = "cuda"
)
// StreamingGPUInvocation bundles everything one streaming extraction call
// needs for a single signal: the job parameters, the carried-in state and the
// new input samples.
type StreamingGPUInvocation struct {
	SignalID  int64
	OffsetHz  float64
	OutRate   int
	Bandwidth float64
	// Rates and filter geometry, followed by the carried-in decimator phase.
	SampleRate, NumTaps, Decim, PhaseCountIn int
	// NCOPhaseIn is the carried-in rotator phase.
	NCOPhaseIn float64
	HistoryLen int
	// Filter coefficients: plain and phase-major polyphase layout.
	BaseTaps, PolyphaseTaps []float32
	// ShiftedHistory is the carried-in sample history; IQNew the new input.
	ShiftedHistory, IQNew []complex64
}
| type StreamingGPUExecutionResult struct { | |||||
| SignalID int64 | |||||
| Mode StreamingGPUExecutionMode | |||||
| IQ []complex64 | |||||
| Rate int | |||||
| NOut int | |||||
| PhaseCountOut int | |||||
| NCOPhaseOut float64 | |||||
| HistoryOut []complex64 | |||||
| HistoryLenOut int | |||||
| } | |||||
| @@ -0,0 +1,27 @@ | |||||
| package gpudemod | |||||
| // StreamingExtractGPUExec is the internal execution selector for the new | |||||
| // production-path semantics. It intentionally keeps the public API stable while | |||||
| // allowing the implementation to evolve from host-side oracle execution toward | |||||
| // a real GPU polyphase path. | |||||
| func (r *BatchRunner) StreamingExtractGPUExec(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { | |||||
| invocations, err := r.buildStreamingGPUInvocations(iqNew, jobs) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| if useGPUHostOracleExecution { | |||||
| execResults, err := r.executeStreamingGPUHostOraclePrepared(invocations) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| return r.applyStreamingGPUExecutionResults(execResults), nil | |||||
| } | |||||
| if useGPUNativePreparedExecution { | |||||
| execResults, err := r.executeStreamingGPUNativePrepared(invocations) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| return r.applyStreamingGPUExecutionResults(execResults), nil | |||||
| } | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| @@ -0,0 +1,19 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestStreamingExtractGPUExecUnavailableByDefault(t *testing.T) { | |||||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||||
| job := StreamingExtractJob{ | |||||
| SignalID: 1, | |||||
| OffsetHz: 12500, | |||||
| Bandwidth: 20000, | |||||
| OutRate: 200000, | |||||
| NumTaps: 65, | |||||
| ConfigHash: 777, | |||||
| } | |||||
| _, err := r.StreamingExtractGPUExec(makeDeterministicIQ(2048), []StreamingExtractJob{job}) | |||||
| if err == nil { | |||||
| t.Fatalf("expected unavailable/disabled execution path by default") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,30 @@ | |||||
| package gpudemod | |||||
| func (r *BatchRunner) executeStreamingGPUHostOraclePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { | |||||
| results := make([]StreamingGPUExecutionResult, len(invocations)) | |||||
| for i, inv := range invocations { | |||||
| out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( | |||||
| inv.IQNew, | |||||
| inv.SampleRate, | |||||
| inv.OffsetHz, | |||||
| inv.NCOPhaseIn, | |||||
| inv.PhaseCountIn, | |||||
| inv.NumTaps, | |||||
| inv.Decim, | |||||
| inv.ShiftedHistory, | |||||
| inv.PolyphaseTaps, | |||||
| ) | |||||
| results[i] = StreamingGPUExecutionResult{ | |||||
| SignalID: inv.SignalID, | |||||
| Mode: StreamingGPUExecHostOracle, | |||||
| IQ: out, | |||||
| Rate: inv.OutRate, | |||||
| NOut: len(out), | |||||
| PhaseCountOut: phaseCount, | |||||
| NCOPhaseOut: phase, | |||||
| HistoryOut: hist, | |||||
| HistoryLenOut: len(hist), | |||||
| } | |||||
| } | |||||
| return results, nil | |||||
| } | |||||
| @@ -0,0 +1,49 @@ | |||||
| package gpudemod | |||||
| // StreamingExtractGPUHostOracle is a temporary host-side execution of the intended | |||||
| // streaming semantics using GPU-owned stream state. It is not the final GPU | |||||
| // production implementation, but it allows the new production entrypoint to move | |||||
| // from pure stub semantics toward real NEW-samples-only streaming behavior | |||||
| // without reintroducing overlap+trim. | |||||
| func (r *BatchRunner) StreamingExtractGPUHostOracle(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { | |||||
| if r == nil || r.eng == nil { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| results := make([]StreamingExtractResult, len(jobs)) | |||||
| active := make(map[int64]struct{}, len(jobs)) | |||||
| for i, job := range jobs { | |||||
| active[job.SignalID] = struct{}{} | |||||
| state, err := r.getOrInitExtractState(job, r.eng.sampleRate) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( | |||||
| iqNew, | |||||
| r.eng.sampleRate, | |||||
| job.OffsetHz, | |||||
| state.NCOPhase, | |||||
| state.PhaseCount, | |||||
| state.NumTaps, | |||||
| state.Decim, | |||||
| state.ShiftedHistory, | |||||
| state.PolyphaseTaps, | |||||
| ) | |||||
| state.NCOPhase = phase | |||||
| state.PhaseCount = phaseCount | |||||
| state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) | |||||
| results[i] = StreamingExtractResult{ | |||||
| SignalID: job.SignalID, | |||||
| IQ: out, | |||||
| Rate: job.OutRate, | |||||
| NOut: len(out), | |||||
| PhaseCount: state.PhaseCount, | |||||
| HistoryLen: len(state.ShiftedHistory), | |||||
| } | |||||
| } | |||||
| for signalID := range r.streamState { | |||||
| if _, ok := active[signalID]; !ok { | |||||
| delete(r.streamState, signalID) | |||||
| } | |||||
| } | |||||
| return results, nil | |||||
| } | |||||
| @@ -0,0 +1,35 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestStreamingGPUHostOracleComparableToCPUOracle(t *testing.T) { | |||||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||||
| job := StreamingExtractJob{ | |||||
| SignalID: 1, | |||||
| OffsetHz: 12500, | |||||
| Bandwidth: 20000, | |||||
| OutRate: 200000, | |||||
| NumTaps: 65, | |||||
| ConfigHash: 777, | |||||
| } | |||||
| iq := makeDeterministicIQ(16000) | |||||
| gpuLike, err := r.StreamingExtractGPUHostOracle(iq, []StreamingExtractJob{job}) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected host-oracle error: %v", err) | |||||
| } | |||||
| oracleRunner := NewCPUOracleRunner(4000000) | |||||
| oracle, err := oracleRunner.StreamingExtract(iq, []StreamingExtractJob{job}) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected oracle error: %v", err) | |||||
| } | |||||
| if len(gpuLike) != 1 || len(oracle) != 1 { | |||||
| t.Fatalf("unexpected result lengths: gpuLike=%d oracle=%d", len(gpuLike), len(oracle)) | |||||
| } | |||||
| metrics, stats := CompareOracleAndGPUHostOracle(oracle[0], gpuLike[0]) | |||||
| if stats.Count == 0 { | |||||
| t.Fatalf("expected compare count > 0") | |||||
| } | |||||
| if metrics.RefMaxAbsErr > 1e-5 { | |||||
| t.Fatalf("expected host-oracle path to match cpu oracle closely, got max abs err %f", metrics.RefMaxAbsErr) | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,4 @@ | |||||
| package gpudemod | |||||
// Compile-time selectors for the backend used by StreamingExtractGPUExec.
// The host oracle is consulted first, so it wins when both are true.
const (
	useGPUHostOracleExecution     = false
	useGPUNativePreparedExecution = true
)
| @@ -0,0 +1,115 @@ | |||||
| //go:build cufft && windows | |||||
| package gpudemod | |||||
| /* | |||||
| #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" | |||||
| #include <cuda_runtime.h> | |||||
| typedef struct { float x; float y; } gpud_float2; | |||||
| */ | |||||
| import "C" | |||||
| import ( | |||||
| "math" | |||||
| "unsafe" | |||||
| ) | |||||
| func (r *BatchRunner) executeStreamingGPUNativePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { | |||||
| results := make([]StreamingGPUExecutionResult, len(invocations)) | |||||
| for i, inv := range invocations { | |||||
| phaseInc := -2.0 * math.Pi * inv.OffsetHz / float64(inv.SampleRate) | |||||
| outCap := len(inv.IQNew)/maxInt(1, inv.Decim) + 2 | |||||
| outHost := make([]complex64, outCap) | |||||
| histCap := maxInt(0, inv.NumTaps-1) | |||||
| histHost := make([]complex64, histCap) | |||||
| var nOut C.int | |||||
| var phaseCountOut C.int | |||||
| var phaseEndOut C.double | |||||
| var dInNew, dHistIn, dOut, dHistOut unsafe.Pointer | |||||
| var dTaps unsafe.Pointer | |||||
| if len(inv.IQNew) > 0 { | |||||
| if bridgeCudaMalloc(&dInNew, uintptr(len(inv.IQNew))*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| defer bridgeCudaFree(dInNew) | |||||
| if bridgeMemcpyH2D(dInNew, unsafe.Pointer(&inv.IQNew[0]), uintptr(len(inv.IQNew))*unsafe.Sizeof(complex64(0))) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| } | |||||
| if len(inv.ShiftedHistory) > 0 { | |||||
| if bridgeCudaMalloc(&dHistIn, uintptr(len(inv.ShiftedHistory))*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| defer bridgeCudaFree(dHistIn) | |||||
| if bridgeMemcpyH2D(dHistIn, unsafe.Pointer(&inv.ShiftedHistory[0]), uintptr(len(inv.ShiftedHistory))*unsafe.Sizeof(complex64(0))) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| } | |||||
| if len(inv.PolyphaseTaps) > 0 { | |||||
| if bridgeCudaMalloc(&dTaps, uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(C.float(0))) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| defer bridgeCudaFree(dTaps) | |||||
| if bridgeMemcpyH2D(dTaps, unsafe.Pointer(&inv.PolyphaseTaps[0]), uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(float32(0))) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| } | |||||
| if outCap > 0 { | |||||
| if bridgeCudaMalloc(&dOut, uintptr(outCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| defer bridgeCudaFree(dOut) | |||||
| } | |||||
| if histCap > 0 { | |||||
| if bridgeCudaMalloc(&dHistOut, uintptr(histCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| defer bridgeCudaFree(dHistOut) | |||||
| } | |||||
| res := bridgeLaunchStreamingPolyphasePrepare( | |||||
| (*C.gpud_float2)(dInNew), | |||||
| len(inv.IQNew), | |||||
| (*C.gpud_float2)(dHistIn), | |||||
| len(inv.ShiftedHistory), | |||||
| (*C.float)(dTaps), | |||||
| len(inv.PolyphaseTaps), | |||||
| inv.Decim, | |||||
| inv.NumTaps, | |||||
| inv.PhaseCountIn, | |||||
| inv.NCOPhaseIn, | |||||
| phaseInc, | |||||
| (*C.gpud_float2)(dOut), | |||||
| &nOut, | |||||
| &phaseCountOut, | |||||
| &phaseEndOut, | |||||
| (*C.gpud_float2)(dHistOut), | |||||
| ) | |||||
| if res != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| if int(nOut) > 0 { | |||||
| if bridgeMemcpyD2H(unsafe.Pointer(&outHost[0]), dOut, uintptr(int(nOut))*unsafe.Sizeof(complex64(0))) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| } | |||||
| if histCap > 0 { | |||||
| if bridgeMemcpyD2H(unsafe.Pointer(&histHost[0]), dHistOut, uintptr(histCap)*unsafe.Sizeof(complex64(0))) != 0 { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| } | |||||
| results[i] = StreamingGPUExecutionResult{ | |||||
| SignalID: inv.SignalID, | |||||
| Mode: StreamingGPUExecCUDA, | |||||
| IQ: append([]complex64(nil), outHost[:int(nOut)]...), | |||||
| Rate: inv.OutRate, | |||||
| NOut: int(nOut), | |||||
| PhaseCountOut: int(phaseCountOut), | |||||
| NCOPhaseOut: float64(phaseEndOut), | |||||
| HistoryOut: append([]complex64(nil), histHost...), | |||||
| HistoryLenOut: histCap, | |||||
| } | |||||
| } | |||||
| return results, nil | |||||
| } | |||||
| @@ -0,0 +1,8 @@ | |||||
| //go:build !cufft || !windows | |||||
| package gpudemod | |||||
| func (r *BatchRunner) executeStreamingGPUNativePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { | |||||
| _ = invocations | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| @@ -0,0 +1,37 @@ | |||||
| //go:build cufft && windows | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestStreamingGPUNativePreparedComparableToCPUOracle(t *testing.T) { | |||||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||||
| job := StreamingExtractJob{ | |||||
| SignalID: 1, | |||||
| OffsetHz: 12500, | |||||
| Bandwidth: 20000, | |||||
| OutRate: 200000, | |||||
| NumTaps: 65, | |||||
| ConfigHash: 777, | |||||
| } | |||||
| iq := makeDeterministicIQ(16000) | |||||
| gpuRes, err := r.StreamingExtractGPU(iq, []StreamingExtractJob{job}) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected native prepared GPU error: %v", err) | |||||
| } | |||||
| oracleRunner := NewCPUOracleRunner(4000000) | |||||
| oracleRes, err := oracleRunner.StreamingExtract(iq, []StreamingExtractJob{job}) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected oracle error: %v", err) | |||||
| } | |||||
| if len(gpuRes) != 1 || len(oracleRes) != 1 { | |||||
| t.Fatalf("unexpected result sizes: gpu=%d oracle=%d", len(gpuRes), len(oracleRes)) | |||||
| } | |||||
| metrics, stats := CompareOracleAndGPUHostOracle(oracleRes[0], gpuRes[0]) | |||||
| if stats.Count == 0 { | |||||
| t.Fatalf("expected compare count > 0") | |||||
| } | |||||
| if metrics.RefMaxAbsErr > 1e-4 { | |||||
| t.Fatalf("native prepared path diverges too much from oracle: max abs err=%f", metrics.RefMaxAbsErr) | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,59 @@ | |||||
| package gpudemod | |||||
| func (r *BatchRunner) buildStreamingGPUInvocations(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingGPUInvocation, error) { | |||||
| if r == nil || r.eng == nil { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| invocations := make([]StreamingGPUInvocation, len(jobs)) | |||||
| active := make(map[int64]struct{}, len(jobs)) | |||||
| for i, job := range jobs { | |||||
| active[job.SignalID] = struct{}{} | |||||
| state, err := r.getOrInitExtractState(job, r.eng.sampleRate) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| invocations[i] = StreamingGPUInvocation{ | |||||
| SignalID: job.SignalID, | |||||
| OffsetHz: job.OffsetHz, | |||||
| OutRate: job.OutRate, | |||||
| Bandwidth: job.Bandwidth, | |||||
| SampleRate: r.eng.sampleRate, | |||||
| NumTaps: state.NumTaps, | |||||
| Decim: state.Decim, | |||||
| PhaseCountIn: state.PhaseCount, | |||||
| NCOPhaseIn: state.NCOPhase, | |||||
| HistoryLen: len(state.ShiftedHistory), | |||||
| BaseTaps: append([]float32(nil), state.BaseTaps...), | |||||
| PolyphaseTaps: append([]float32(nil), state.PolyphaseTaps...), | |||||
| ShiftedHistory: append([]complex64(nil), state.ShiftedHistory...), | |||||
| IQNew: iqNew, | |||||
| } | |||||
| } | |||||
| for signalID := range r.streamState { | |||||
| if _, ok := active[signalID]; !ok { | |||||
| delete(r.streamState, signalID) | |||||
| } | |||||
| } | |||||
| return invocations, nil | |||||
| } | |||||
| func (r *BatchRunner) applyStreamingGPUExecutionResults(results []StreamingGPUExecutionResult) []StreamingExtractResult { | |||||
| out := make([]StreamingExtractResult, len(results)) | |||||
| for i, res := range results { | |||||
| state := r.streamState[res.SignalID] | |||||
| if state != nil { | |||||
| state.NCOPhase = res.NCOPhaseOut | |||||
| state.PhaseCount = res.PhaseCountOut | |||||
| state.ShiftedHistory = append(state.ShiftedHistory[:0], res.HistoryOut...) | |||||
| } | |||||
| out[i] = StreamingExtractResult{ | |||||
| SignalID: res.SignalID, | |||||
| IQ: res.IQ, | |||||
| Rate: res.Rate, | |||||
| NOut: res.NOut, | |||||
| PhaseCount: res.PhaseCountOut, | |||||
| HistoryLen: res.HistoryLenOut, | |||||
| } | |||||
| } | |||||
| return out | |||||
| } | |||||
| @@ -0,0 +1,39 @@ | |||||
| package gpudemod | |||||
| import "fmt" | |||||
// updateShiftedHistory returns the filter history to carry into the next
// chunk: the last numTaps-1 samples of prev followed by shiftedNew, or all of
// them when fewer are available.
//
// The result is always a freshly allocated slice that does not alias either
// input. It is nil when numTaps <= 1 (no history is needed) and empty but
// non-nil when both inputs are empty.
//
// Only the retained tail is copied, so the cost is O(numTaps) regardless of
// how large the new chunk is.
func updateShiftedHistory(prev []complex64, shiftedNew []complex64, numTaps int) []complex64 {
	need := numTaps - 1
	if need <= 0 {
		return nil
	}

	keep := len(prev) + len(shiftedNew)
	if keep > need {
		keep = need
	}

	// The newest samples come from shiftedNew; only what is still missing
	// after that is taken from the end of prev.
	fromNew := len(shiftedNew)
	if fromNew > keep {
		fromNew = keep
	}
	fromPrev := keep - fromNew

	out := make([]complex64, keep)
	copy(out, prev[len(prev)-fromPrev:])
	copy(out[fromPrev:], shiftedNew[len(shiftedNew)-fromNew:])
	return out
}
| // StreamingExtractGPU is the planned production entry point for the stateful | |||||
| // GPU extractor path. It intentionally exists early as an explicit boundary so | |||||
| // callers can migrate away from legacy overlap+trim semantics. | |||||
| // | |||||
| // Current status: | |||||
| // - validates jobs against persistent per-signal state ownership | |||||
| // - enforces exact integer decimation | |||||
| // - initializes per-signal state (config hash, taps, history capacity) | |||||
| // - does not yet execute the final stateful polyphase GPU kernel path | |||||
| func (r *BatchRunner) StreamingExtractGPU(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { | |||||
| if r == nil || r.eng == nil { | |||||
| return nil, ErrUnavailable | |||||
| } | |||||
| if results, err := r.StreamingExtractGPUExec(iqNew, jobs); err == nil { | |||||
| return results, nil | |||||
| } | |||||
| _, _ = iqNew, jobs | |||||
| return nil, fmt.Errorf("StreamingExtractGPU not implemented yet: stateful polyphase GPU path pending") | |||||
| } | |||||
| @@ -0,0 +1,53 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestStreamingGPUStubRemainsExplicitlyUnimplemented(t *testing.T) { | |||||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||||
| job := StreamingExtractJob{ | |||||
| SignalID: 1, | |||||
| OffsetHz: 12500, | |||||
| Bandwidth: 20000, | |||||
| OutRate: 200000, | |||||
| NumTaps: 65, | |||||
| ConfigHash: 777, | |||||
| } | |||||
| iq := makeDeterministicIQ(1000) | |||||
| _, err := r.StreamingExtractGPU(iq, []StreamingExtractJob{job}) | |||||
| if err == nil { | |||||
| t.Fatalf("expected not-implemented error from GPU stub") | |||||
| } | |||||
| } | |||||
| func TestStreamingGPUHostOracleAdvancesState(t *testing.T) { | |||||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||||
| job := StreamingExtractJob{ | |||||
| SignalID: 1, | |||||
| OffsetHz: 12500, | |||||
| Bandwidth: 20000, | |||||
| OutRate: 200000, | |||||
| NumTaps: 65, | |||||
| ConfigHash: 777, | |||||
| } | |||||
| iq := makeDeterministicIQ(1000) | |||||
| results, err := r.StreamingExtractGPUHostOracle(iq, []StreamingExtractJob{job}) | |||||
| if err != nil { | |||||
| t.Fatalf("unexpected host-oracle error: %v", err) | |||||
| } | |||||
| if len(results) != 1 { | |||||
| t.Fatalf("expected 1 result, got %d", len(results)) | |||||
| } | |||||
| state := r.streamState[1] | |||||
| if state == nil { | |||||
| t.Fatalf("expected state to be initialized") | |||||
| } | |||||
| if state.NCOPhase == 0 { | |||||
| t.Fatalf("expected phase to advance") | |||||
| } | |||||
| if len(state.ShiftedHistory) == 0 { | |||||
| t.Fatalf("expected shifted history to be updated") | |||||
| } | |||||
| if results[0].NOut == 0 { | |||||
| t.Fatalf("expected non-zero output count from host oracle path") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,64 @@ | |||||
| package gpudemod | |||||
| import "math" | |||||
| func runStreamingPolyphaseHostCore( | |||||
| iqNew []complex64, | |||||
| sampleRate int, | |||||
| offsetHz float64, | |||||
| stateNCOPhase float64, | |||||
| statePhaseCount int, | |||||
| stateNumTaps int, | |||||
| stateDecim int, | |||||
| stateHistory []complex64, | |||||
| polyphaseTaps []float32, | |||||
| ) ([]complex64, float64, int, []complex64) { | |||||
| out := make([]complex64, 0, len(iqNew)/maxInt(1, stateDecim)+2) | |||||
| phase := stateNCOPhase | |||||
| phaseCount := statePhaseCount | |||||
| hist := append([]complex64(nil), stateHistory...) | |||||
| phaseLen := PolyphasePhaseLen(len(polyphaseTaps)/maxInt(1, stateDecim)*maxInt(1, stateDecim), stateDecim) | |||||
| if phaseLen == 0 { | |||||
| phaseLen = PolyphasePhaseLen(len(polyphaseTaps), stateDecim) | |||||
| } | |||||
| phaseInc := -2.0 * math.Pi * offsetHz / float64(sampleRate) | |||||
| for _, x := range iqNew { | |||||
| rot := complex64(complex(math.Cos(phase), math.Sin(phase))) | |||||
| s := x * rot | |||||
| hist = append(hist, s) | |||||
| phaseCount++ | |||||
| if phaseCount == stateDecim { | |||||
| var y complex64 | |||||
| for p := 0; p < stateDecim; p++ { | |||||
| for k := 0; k < phaseLen; k++ { | |||||
| idxTap := p*phaseLen + k | |||||
| if idxTap >= len(polyphaseTaps) { | |||||
| continue | |||||
| } | |||||
| tap := polyphaseTaps[idxTap] | |||||
| if tap == 0 { | |||||
| continue | |||||
| } | |||||
| srcBack := p + k*stateDecim | |||||
| idx := len(hist) - 1 - srcBack | |||||
| if idx < 0 { | |||||
| continue | |||||
| } | |||||
| y += complex(tap, 0) * hist[idx] | |||||
| } | |||||
| } | |||||
| out = append(out, y) | |||||
| phaseCount = 0 | |||||
| } | |||||
| if len(hist) > stateNumTaps-1 { | |||||
| hist = hist[len(hist)-(stateNumTaps-1):] | |||||
| } | |||||
| phase += phaseInc | |||||
| if phase >= math.Pi { | |||||
| phase -= 2 * math.Pi | |||||
| } else if phase < -math.Pi { | |||||
| phase += 2 * math.Pi | |||||
| } | |||||
| } | |||||
| return out, phase, phaseCount, append([]complex64(nil), hist...) | |||||
| } | |||||
| @@ -0,0 +1,40 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func TestRunStreamingPolyphaseHostCoreMatchesCPUOraclePolyphase(t *testing.T) { | |||||
| cfg := OracleHarnessConfig{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 123, | |||||
| NCOPhase: 0, | |||||
| Decim: 20, | |||||
| NumTaps: 65, | |||||
| PhaseInc: 0.017, | |||||
| } | |||||
| state := MakeCPUOracleState(cfg) | |||||
| iq := MakeDeterministicIQ(12000) | |||||
| oracle := CPUOracleExtractPolyphase(iq, state, cfg.PhaseInc) | |||||
| state2 := MakeCPUOracleState(cfg) | |||||
| out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( | |||||
| iq, | |||||
| 4000000, | |||||
| -cfg.PhaseInc*4000000/(2*3.141592653589793), | |||||
| state2.NCOPhase, | |||||
| state2.PhaseCount, | |||||
| state2.NumTaps, | |||||
| state2.Decim, | |||||
| state2.ShiftedHistory, | |||||
| state2.PolyphaseTaps, | |||||
| ) | |||||
| requireComplexSlicesClose(t, oracle, out, 1e-5) | |||||
| if phase == 0 && len(iq) > 0 { | |||||
| t.Fatalf("expected phase to advance") | |||||
| } | |||||
| if phaseCount < 0 || phaseCount >= state2.Decim { | |||||
| t.Fatalf("unexpected phaseCount: %d", phaseCount) | |||||
| } | |||||
| if len(hist) == 0 { | |||||
| t.Fatalf("expected history to be retained") | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,111 @@ | |||||
| package gpudemod | |||||
| import ( | |||||
| "fmt" | |||||
| "sdr-wideband-suite/internal/dsp" | |||||
| ) | |||||
| type CPUOracleRunner struct { | |||||
| SampleRate int | |||||
| States map[int64]*CPUOracleState | |||||
| } | |||||
| func (r *CPUOracleRunner) ResetAllStates() { | |||||
| if r == nil { | |||||
| return | |||||
| } | |||||
| r.States = make(map[int64]*CPUOracleState) | |||||
| } | |||||
| func NewCPUOracleRunner(sampleRate int) *CPUOracleRunner { | |||||
| return &CPUOracleRunner{ | |||||
| SampleRate: sampleRate, | |||||
| States: make(map[int64]*CPUOracleState), | |||||
| } | |||||
| } | |||||
| func (r *CPUOracleRunner) ResetSignalState(signalID int64) { | |||||
| if r == nil || r.States == nil { | |||||
| return | |||||
| } | |||||
| delete(r.States, signalID) | |||||
| } | |||||
| func (r *CPUOracleRunner) getOrInitState(job StreamingExtractJob) (*CPUOracleState, error) { | |||||
| if r == nil { | |||||
| return nil, fmt.Errorf("nil CPUOracleRunner") | |||||
| } | |||||
| if r.States == nil { | |||||
| r.States = make(map[int64]*CPUOracleState) | |||||
| } | |||||
| decim, err := ExactIntegerDecimation(r.SampleRate, job.OutRate) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| state := r.States[job.SignalID] | |||||
| if state == nil { | |||||
| state = &CPUOracleState{SignalID: job.SignalID} | |||||
| r.States[job.SignalID] = state | |||||
| } | |||||
| ResetCPUOracleStateIfConfigChanged(state, job.ConfigHash) | |||||
| state.Decim = decim | |||||
| state.NumTaps = job.NumTaps | |||||
| if state.NumTaps <= 0 { | |||||
| state.NumTaps = 101 | |||||
| } | |||||
| cutoff := job.Bandwidth / 2 | |||||
| if cutoff < 200 { | |||||
| cutoff = 200 | |||||
| } | |||||
| base := dsp.LowpassFIR(cutoff, r.SampleRate, state.NumTaps) | |||||
| state.BaseTaps = make([]float32, len(base)) | |||||
| for i, v := range base { | |||||
| state.BaseTaps[i] = float32(v) | |||||
| } | |||||
| state.PolyphaseTaps = BuildPolyphaseTapsPhaseMajor(state.BaseTaps, state.Decim) | |||||
| if state.ShiftedHistory == nil { | |||||
| state.ShiftedHistory = make([]complex64, 0, maxInt(0, state.NumTaps-1)) | |||||
| } | |||||
| return state, nil | |||||
| } | |||||
| func (r *CPUOracleRunner) StreamingExtract(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { | |||||
| results := make([]StreamingExtractResult, len(jobs)) | |||||
| active := make(map[int64]struct{}, len(jobs)) | |||||
| for i, job := range jobs { | |||||
| active[job.SignalID] = struct{}{} | |||||
| state, err := r.getOrInitState(job) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| out, phase, phaseCount, hist := runStreamingPolyphaseHostCore( | |||||
| iqNew, | |||||
| r.SampleRate, | |||||
| job.OffsetHz, | |||||
| state.NCOPhase, | |||||
| state.PhaseCount, | |||||
| state.NumTaps, | |||||
| state.Decim, | |||||
| state.ShiftedHistory, | |||||
| state.PolyphaseTaps, | |||||
| ) | |||||
| state.NCOPhase = phase | |||||
| state.PhaseCount = phaseCount | |||||
| state.ShiftedHistory = append(state.ShiftedHistory[:0], hist...) | |||||
| results[i] = StreamingExtractResult{ | |||||
| SignalID: job.SignalID, | |||||
| IQ: out, | |||||
| Rate: job.OutRate, | |||||
| NOut: len(out), | |||||
| PhaseCount: state.PhaseCount, | |||||
| HistoryLen: len(state.ShiftedHistory), | |||||
| } | |||||
| } | |||||
| for signalID := range r.States { | |||||
| if _, ok := active[signalID]; !ok { | |||||
| delete(r.States, signalID) | |||||
| } | |||||
| } | |||||
| return results, nil | |||||
| } | |||||
| @@ -0,0 +1,54 @@ | |||||
| package gpudemod | |||||
| import ( | |||||
| "fmt" | |||||
| "hash/fnv" | |||||
| ) | |||||
// StreamingExtractJob describes one signal to extract from a block of new
// input samples.
type StreamingExtractJob struct {
	// SignalID keys the persistent per-signal stream state.
	SignalID int64
	// OffsetHz is the frequency offset used to derive the NCO phase increment.
	OffsetHz float64
	// Bandwidth is the signal bandwidth; half of it is used as filter cutoff.
	Bandwidth float64
	// OutRate is the requested output sample rate.
	OutRate int
	// NumTaps is the requested FIR length.
	NumTaps int
	// ConfigHash identifies the configuration this job was built from.
	ConfigHash uint64
}
// StreamingExtractResult is the output of one streaming extraction job.
type StreamingExtractResult struct {
	// SignalID echoes the job's signal ID.
	SignalID int64
	// IQ holds the extracted output samples.
	IQ []complex64
	// Rate is the output sample rate.
	Rate int
	// NOut is the number of output samples.
	NOut int
	// PhaseCount is the decimation phase counter after the call.
	PhaseCount int
	// HistoryLen is the length of the retained shifted history after the call.
	HistoryLen int
}
// ExtractStreamState is the persistent per-signal state carried between
// streaming extraction calls.
type ExtractStreamState struct {
	// Identity and the configuration the state was built for.
	SignalID   int64
	ConfigHash uint64

	// Running NCO phase and decimation bookkeeping.
	NCOPhase   float64
	Decim      int
	PhaseCount int
	NumTaps    int

	// Frequency-shifted samples retained for the next call.
	ShiftedHistory []complex64

	// Prototype filter taps and their polyphase arrangement.
	BaseTaps      []float32
	PolyphaseTaps []float32

	// Initialized is cleared by ResetExtractStreamState.
	Initialized bool
}
| func ResetExtractStreamState(state *ExtractStreamState, cfgHash uint64) { | |||||
| if state == nil { | |||||
| return | |||||
| } | |||||
| state.ConfigHash = cfgHash | |||||
| state.NCOPhase = 0 | |||||
| state.PhaseCount = 0 | |||||
| state.ShiftedHistory = state.ShiftedHistory[:0] | |||||
| state.Initialized = false | |||||
| } | |||||
// StreamingConfigHash returns the 64-bit FNV-1a hash of a canonical textual
// encoding of the given streaming parameters. Floating-point values are
// encoded with nine decimal places.
func StreamingConfigHash(signalID int64, offsetHz float64, bandwidth float64, outRate int, numTaps int, sampleRate int) uint64 {
	hasher := fnv.New64a()
	// Writing to the hash cannot fail, so the result is deliberately ignored.
	_, _ = fmt.Fprintf(hasher, "sig=%d|off=%.9f|bw=%.9f|out=%d|taps=%d|sr=%d", signalID, offsetHz, bandwidth, outRate, numTaps, sampleRate)
	return hasher.Sum64()
}
| @@ -0,0 +1,78 @@ | |||||
| package gpudemod | |||||
| import ( | |||||
| "math" | |||||
| ) | |||||
// OracleHarnessConfig bundles the parameters used to build CPU oracle states
// for tests and comparisons.
type OracleHarnessConfig struct {
	// SignalID and ConfigHash are copied into the created state.
	SignalID   int64
	ConfigHash uint64
	// NCOPhase is the initial NCO phase of the created state.
	NCOPhase float64
	// Decim and NumTaps set the decimation factor and filter length.
	Decim   int
	NumTaps int
	// PhaseInc is the per-sample phase increment supplied to the oracle.
	PhaseInc float64
}
// MakeDeterministicIQ returns n reproducible test samples: sample i is the
// sum of a unit tone at 0.017*i radians and a weaker second component at
// 0.031*i radians (weight 0.2 on the real part, 0.15 on the imaginary part).
func MakeDeterministicIQ(n int) []complex64 {
	samples := make([]complex64, n)
	for idx := range samples {
		a := 0.017 * float64(idx)
		b := 0.031 * float64(idx)
		re := math.Cos(a) + 0.2*math.Cos(b)
		im := math.Sin(a) + 0.15*math.Sin(b)
		samples[idx] = complex64(complex(re, im))
	}
	return samples
}
// MakeToneIQ returns n samples of a unit-amplitude complex tone starting at
// phase zero, whose phase grows by phaseInc radians per sample. The phase is
// accumulated by repeated addition.
func MakeToneIQ(n int, phaseInc float64) []complex64 {
	tone := make([]complex64, n)
	var phase float64
	for i := range tone {
		tone[i] = complex64(complex(math.Cos(phase), math.Sin(phase)))
		phase += phaseInc
	}
	return tone
}
// MakeLowpassTaps returns n equal taps of value 1/n, i.e. a length-n moving
// average.
func MakeLowpassTaps(n int) []float32 {
	taps := make([]float32, n)
	for idx := 0; idx < len(taps); idx++ {
		taps[idx] = 1.0 / float32(n)
	}
	return taps
}
| func MakeCPUOracleState(cfg OracleHarnessConfig) *CPUOracleState { | |||||
| taps := MakeLowpassTaps(cfg.NumTaps) | |||||
| return &CPUOracleState{ | |||||
| SignalID: cfg.SignalID, | |||||
| ConfigHash: cfg.ConfigHash, | |||||
| NCOPhase: cfg.NCOPhase, | |||||
| Decim: cfg.Decim, | |||||
| PhaseCount: 0, | |||||
| NumTaps: cfg.NumTaps, | |||||
| ShiftedHistory: make([]complex64, 0, maxInt(0, cfg.NumTaps-1)), | |||||
| BaseTaps: taps, | |||||
| PolyphaseTaps: BuildPolyphaseTapsPhaseMajor(taps, cfg.Decim), | |||||
| } | |||||
| } | |||||
| func RunChunkedCPUOraclePolyphase(all []complex64, chunkSizes []int, mkState func() *CPUOracleState, phaseInc float64) []complex64 { | |||||
| state := mkState() | |||||
| out := make([]complex64, 0) | |||||
| pos := 0 | |||||
| for _, n := range chunkSizes { | |||||
| if pos >= len(all) { | |||||
| break | |||||
| } | |||||
| end := pos + n | |||||
| if end > len(all) { | |||||
| end = len(all) | |||||
| } | |||||
| out = append(out, CPUOracleExtractPolyphase(all[pos:end], state, phaseInc)...) | |||||
| pos = end | |||||
| } | |||||
| if pos < len(all) { | |||||
| out = append(out, CPUOracleExtractPolyphase(all[pos:], state, phaseInc)...) | |||||
| } | |||||
| return out | |||||
| } | |||||
| @@ -0,0 +1,39 @@ | |||||
| package gpudemod | |||||
| import "testing" | |||||
| func requireComplexSlicesCloseHarness(t *testing.T, a []complex64, b []complex64, tol float64) { | |||||
| t.Helper() | |||||
| if len(a) != len(b) { | |||||
| t.Fatalf("length mismatch: %d vs %d", len(a), len(b)) | |||||
| } | |||||
| for i := range a { | |||||
| d := CompareComplexSlices([]complex64{a[i]}, []complex64{b[i]}) | |||||
| if d.MaxAbsErr > tol { | |||||
| t.Fatalf("slice mismatch at %d: %v vs %v (tol=%f)", i, a[i], b[i], tol) | |||||
| } | |||||
| } | |||||
| } | |||||
| func TestHarnessChunkedCPUOraclePolyphase(t *testing.T) { | |||||
| cfg := OracleHarnessConfig{ | |||||
| SignalID: 1, | |||||
| ConfigHash: 123, | |||||
| NCOPhase: 0, | |||||
| Decim: 20, | |||||
| NumTaps: 65, | |||||
| PhaseInc: 0.017, | |||||
| } | |||||
| iq := MakeDeterministicIQ(150000) | |||||
| mk := func() *CPUOracleState { return MakeCPUOracleState(cfg) } | |||||
| mono := CPUOracleExtractPolyphase(iq, mk(), cfg.PhaseInc) | |||||
| chunked := RunChunkedCPUOraclePolyphase(iq, []int{4096, 5000, 8192, 27307}, mk, cfg.PhaseInc) | |||||
| requireComplexSlicesCloseHarness(t, mono, chunked, 1e-5) | |||||
| } | |||||
| func TestHarnessToneIQ(t *testing.T) { | |||||
| iq := MakeToneIQ(1024, 0.05) | |||||
| if len(iq) != 1024 { | |||||
| t.Fatalf("unexpected tone iq length: %d", len(iq)) | |||||
| } | |||||
| } | |||||
| @@ -26,6 +26,7 @@ typedef int (__stdcall *gpud_launch_decimate_stream_fn)(const gpud_float2* in, g | |||||
| typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); | typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_float2* out, int n_out, int factor); | ||||
| typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n); | typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n); | ||||
| typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start); | typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start); | ||||
| typedef int (__stdcall *gpud_launch_streaming_polyphase_prepare_fn)(const gpud_float2* in_new, int n_new, const gpud_float2* history_in, int history_len, const float* polyphase_taps, int polyphase_len, int decim, int num_taps, int phase_count_in, double phase_start, double phase_inc, gpud_float2* out, int* n_out, int* phase_count_out, double* phase_end_out, gpud_float2* history_out); | |||||
| static HMODULE gpud_mod = NULL; | static HMODULE gpud_mod = NULL; | ||||
| static gpud_stream_create_fn gpud_p_stream_create = NULL; | static gpud_stream_create_fn gpud_p_stream_create = NULL; | ||||
| @@ -42,6 +43,7 @@ static gpud_launch_decimate_stream_fn gpud_p_launch_decimate_stream = NULL; | |||||
| static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; | static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; | ||||
| static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL; | static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL; | ||||
| static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL; | static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL; | ||||
| static gpud_launch_streaming_polyphase_prepare_fn gpud_p_launch_streaming_polyphase_prepare = NULL; | |||||
// gpud_cuda_malloc calls cudaMalloc(ptr, bytes) and returns its status cast to int.
static int gpud_cuda_malloc(void **ptr, size_t bytes) {
    int status = (int)cudaMalloc(ptr, bytes);
    return status;
}
// gpud_cuda_free calls cudaFree(ptr) and returns its status cast to int.
static int gpud_cuda_free(void *ptr) {
    int status = (int)cudaFree(ptr);
    return status;
}
| @@ -67,6 +69,7 @@ static int gpud_load_library(const char* path) { | |||||
| gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); | gpud_p_launch_decimate = (gpud_launch_decimate_fn)GetProcAddress(gpud_mod, "gpud_launch_decimate_cuda"); | ||||
| gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda"); | gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda"); | ||||
| gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda"); | gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda"); | ||||
| gpud_p_launch_streaming_polyphase_prepare = (gpud_launch_streaming_polyphase_prepare_fn)GetProcAddress(gpud_mod, "gpud_launch_streaming_polyphase_prepare_cuda"); | |||||
| if (!gpud_p_stream_create || !gpud_p_stream_destroy || !gpud_p_stream_sync || !gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift_stream || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir_stream || !gpud_p_launch_fir || !gpud_p_launch_decimate_stream || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) { | if (!gpud_p_stream_create || !gpud_p_stream_destroy || !gpud_p_stream_sync || !gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift_stream || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir_stream || !gpud_p_launch_fir || !gpud_p_launch_decimate_stream || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) { | ||||
| FreeLibrary(gpud_mod); | FreeLibrary(gpud_mod); | ||||
| gpud_mod = NULL; | gpud_mod = NULL; | ||||
| @@ -89,6 +92,7 @@ static int gpud_launch_decimate_stream(gpud_float2 *in, gpud_float2 *out, int n_ | |||||
| static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { if (!gpud_p_launch_decimate) return -1; return gpud_p_launch_decimate(in, out, n_out, factor); } | static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, int factor) { if (!gpud_p_launch_decimate) return -1; return gpud_p_launch_decimate(in, out, n_out, factor); } | ||||
| static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_am_envelope) return -1; return gpud_p_launch_am_envelope(in, out, n); } | static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_am_envelope) return -1; return gpud_p_launch_am_envelope(in, out, n); } | ||||
| static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) { if (!gpud_p_launch_ssb_product) return -1; return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start); } | static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) { if (!gpud_p_launch_ssb_product) return -1; return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start); } | ||||
| static int gpud_launch_streaming_polyphase_prepare(gpud_float2 *in_new, int n_new, gpud_float2 *history_in, int history_len, float *polyphase_taps, int polyphase_len, int decim, int num_taps, int phase_count_in, double phase_start, double phase_inc, gpud_float2 *out, int *n_out, int *phase_count_out, double *phase_end_out, gpud_float2 *history_out) { if (!gpud_p_launch_streaming_polyphase_prepare) return -1; return gpud_p_launch_streaming_polyphase_prepare(in_new, n_new, history_in, history_len, polyphase_taps, polyphase_len, decim, num_taps, phase_count_in, phase_start, phase_inc, out, n_out, phase_count_out, phase_end_out, history_out); } | |||||
| */ | */ | ||||
| import "C" | import "C" | ||||
| @@ -131,6 +135,9 @@ func bridgeLaunchAMEnvelope(in *C.gpud_float2, out *C.float, n int) int { return | |||||
| func bridgeLaunchSSBProduct(in *C.gpud_float2, out *C.float, n int, phaseInc float64, phaseStart float64) int { | func bridgeLaunchSSBProduct(in *C.gpud_float2, out *C.float, n int, phaseInc float64, phaseStart float64) int { | ||||
| return int(C.gpud_launch_ssb_product(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart))) | return int(C.gpud_launch_ssb_product(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart))) | ||||
| } | } | ||||
| func bridgeLaunchStreamingPolyphasePrepare(inNew *C.gpud_float2, nNew int, historyIn *C.gpud_float2, historyLen int, polyphaseTaps *C.float, polyphaseLen int, decim int, numTaps int, phaseCountIn int, phaseStart float64, phaseInc float64, out *C.gpud_float2, nOut *C.int, phaseCountOut *C.int, phaseEndOut *C.double, historyOut *C.gpud_float2) int { | |||||
| return int(C.gpud_launch_streaming_polyphase_prepare(inNew, C.int(nNew), historyIn, C.int(historyLen), polyphaseTaps, C.int(polyphaseLen), C.int(decim), C.int(numTaps), C.int(phaseCountIn), C.double(phaseStart), C.double(phaseInc), out, nOut, phaseCountOut, phaseEndOut, historyOut)) | |||||
| } | |||||
| func bridgeStreamCreate() (streamHandle, int) { | func bridgeStreamCreate() (streamHandle, int) { | ||||
| var s C.gpud_stream_handle | var s C.gpud_stream_handle | ||||
| res := int(C.gpud_stream_create(&s)) | res := int(C.gpud_stream_create(&s)) | ||||