| @@ -16,12 +16,25 @@ if (!(Test-Path $outDir)) { New-Item -ItemType Directory -Path $outDir | Out-Nul | |||
| Remove-Item $dll,$lib,$exp -Force -ErrorAction SilentlyContinue | |||
| $cmd = @" | |||
| call "$vcvars" && "$nvcc" -shared "$src" -o "$dll" -cudart=hybrid -Xcompiler "/MD" -arch=sm_75 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 | |||
| $bat = Join-Path $env:TEMP 'build-gpudemod-dll.bat' | |||
| $batContent = @" | |||
| @echo off | |||
| call "$vcvars" | |||
| if errorlevel 1 exit /b %errorlevel% | |||
| "$nvcc" -shared "$src" -o "$dll" -cudart=hybrid -Xcompiler "/MD" -arch=sm_75 ^ | |||
| -gencode arch=compute_75,code=sm_75 ^ | |||
| -gencode arch=compute_80,code=sm_80 ^ | |||
| -gencode arch=compute_86,code=sm_86 ^ | |||
| -gencode arch=compute_89,code=sm_89 ^ | |||
| -gencode arch=compute_90,code=sm_90 | |||
| exit /b %errorlevel% | |||
| "@ | |||
| Set-Content -Path $bat -Value $batContent -Encoding ASCII | |||
| Write-Host 'Building gpudemod CUDA DLL...' -ForegroundColor Cyan | |||
| cmd.exe /c $cmd | |||
| if ($LASTEXITCODE -ne 0) { throw 'gpudemod DLL build failed' } | |||
| cmd.exe /c ""$bat"" | |||
| $exitCode = $LASTEXITCODE | |||
| Remove-Item $bat -Force -ErrorAction SilentlyContinue | |||
| if ($exitCode -ne 0) { throw 'gpudemod DLL build failed' } | |||
| Write-Host "Built: $dll" -ForegroundColor Green | |||
| @@ -21,10 +21,13 @@ if (Test-Path $sdrplayBin) { $env:PATH = "$sdrplayBin;" + $env:PATH } | |||
| # CUDA runtime / cuFFT | |||
| $cudaInc = 'C:\CUDA\include' | |||
| $cudaBin = 'C:\CUDA\bin' | |||
| $cudaBinX64 = 'C:\CUDA\bin\x64' | |||
| if (-not (Test-Path $cudaInc)) { $cudaInc = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\include' } | |||
| if (-not (Test-Path $cudaBin)) { $cudaBin = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\bin' } | |||
| if (-not (Test-Path $cudaBinX64)) { $cudaBinX64 = 'C:\PROGRA~1\NVIDIA~2\CUDA\v13.2\bin\x64' } | |||
| $cudaMingw = Join-Path $PSScriptRoot 'cuda-mingw' | |||
| if (Test-Path $cudaInc) { $env:CGO_CFLAGS = "$env:CGO_CFLAGS -I$cudaInc" } | |||
| if (Test-Path $cudaBinX64) { $env:PATH = "$cudaBinX64;" + $env:PATH } | |||
| if (Test-Path $cudaBin) { $env:PATH = "$cudaBin;" + $env:PATH } | |||
| if (Test-Path $cudaMingw) { $env:CGO_LDFLAGS = "$env:CGO_LDFLAGS -L$cudaMingw -lcudart64_13 -lcufft64_12 -lkernel32" } | |||
| @@ -68,8 +71,11 @@ if ($dllSrc) { | |||
| } | |||
| $cudartCandidates = @( | |||
| (Join-Path $cudaBinX64 'cudart64_13.dll'), | |||
| (Join-Path $cudaBin 'cudart64_13.dll'), | |||
| 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin\x64\cudart64_13.dll', | |||
| 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\bin\cudart64_13.dll', | |||
| 'C:\CUDA\bin\x64\cudart64_13.dll', | |||
| 'C:\CUDA\bin\cudart64_13.dll' | |||
| ) | |||
| $cudartSrc = $cudartCandidates | Where-Object { $_ -and (Test-Path $_) } | Select-Object -First 1 | |||
| @@ -15,6 +15,7 @@ type BatchRunner struct { | |||
| slotBufs []slotBuffers | |||
| slotBufSize int // number of IQ samples the slot buffers were allocated for | |||
| streamState map[int64]*ExtractStreamState | |||
| nativeState map[int64]*nativeStreamingSignalState | |||
| } | |||
| func NewBatchRunner(maxSamples int, sampleRate int) (*BatchRunner, error) { | |||
| @@ -22,7 +23,11 @@ func NewBatchRunner(maxSamples int, sampleRate int) (*BatchRunner, error) { | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return &BatchRunner{eng: eng, streamState: make(map[int64]*ExtractStreamState)}, nil | |||
| return &BatchRunner{ | |||
| eng: eng, | |||
| streamState: make(map[int64]*ExtractStreamState), | |||
| nativeState: make(map[int64]*nativeStreamingSignalState), | |||
| }, nil | |||
| } | |||
| func (r *BatchRunner) Close() { | |||
| @@ -30,10 +35,12 @@ func (r *BatchRunner) Close() { | |||
| return | |||
| } | |||
| r.freeSlotBuffers() | |||
| r.freeAllNativeStreamingStates() | |||
| r.eng.Close() | |||
| r.eng = nil | |||
| r.slots = nil | |||
| r.streamState = nil | |||
| r.nativeState = nil | |||
| } | |||
| func (r *BatchRunner) prepare(jobs []ExtractJob) { | |||
| @@ -11,6 +11,10 @@ | |||
| typedef void* gpud_stream_handle; | |||
| static __forceinline__ int gpud_max_i(int a, int b) { | |||
| return a > b ? a : b; | |||
| } | |||
| GPUD_API int GPUD_CALL gpud_stream_create(gpud_stream_handle* out) { | |||
| if (!out) return -1; | |||
| cudaStream_t stream; | |||
| @@ -321,6 +325,34 @@ GPUD_API int GPUD_CALL gpud_launch_ssb_product_cuda( | |||
| return (int)cudaGetLastError(); | |||
| } | |||
| __global__ void gpud_streaming_polyphase_accum_kernel( | |||
| const float2* __restrict__ history_state, | |||
| int history_len, | |||
| const float2* __restrict__ shifted_new, | |||
| int n_new, | |||
| const float* __restrict__ polyphase_taps, | |||
| int polyphase_len, | |||
| int decim, | |||
| int phase_len, | |||
| int start_idx, | |||
| int n_out, | |||
| float2* __restrict__ out | |||
| ); | |||
| __global__ void gpud_streaming_history_tail_kernel( | |||
| const float2* __restrict__ history_state, | |||
| int history_len, | |||
| const float2* __restrict__ shifted_new, | |||
| int n_new, | |||
| int keep, | |||
| float2* __restrict__ history_out | |||
| ); | |||
| static __forceinline__ double gpud_reduce_phase(double phase); | |||
| // Transitional legacy entrypoint retained for bring-up and comparison. | |||
| // The production-native streaming path is gpud_launch_streaming_polyphase_stateful_cuda, | |||
| // which preserves per-signal carry state across NEW-samples-only chunks. | |||
| GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_prepare_cuda( | |||
| const float2* in_new, | |||
| int n_new, | |||
| @@ -339,113 +371,261 @@ GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_prepare_cuda( | |||
| double* phase_end_out, | |||
| float2* history_out | |||
| ) { | |||
| if (!in_new || n_new < 0 || !polyphase_taps || polyphase_len <= 0 || decim <= 0 || num_taps <= 0) return -1; | |||
| if (n_new < 0 || !polyphase_taps || polyphase_len <= 0 || decim <= 0 || num_taps <= 0) return -1; | |||
| const int phase_len = (num_taps + decim - 1) / decim; | |||
| if (polyphase_len < decim * phase_len) return -2; | |||
| const int combined_len = history_len + n_new; | |||
| float2* shifted = NULL; | |||
| float2* combined = NULL; | |||
| cudaError_t err = cudaMalloc((void**)&shifted, (size_t)max(1, n_new) * sizeof(float2)); | |||
| if (err != cudaSuccess) return (int)err; | |||
| err = cudaMalloc((void**)&combined, (size_t)max(1, combined_len) * sizeof(float2)); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| return (int)err; | |||
| } | |||
| const int keep = num_taps > 1 ? num_taps - 1 : 0; | |||
| int clamped_history_len = history_len; | |||
| if (clamped_history_len < 0) clamped_history_len = 0; | |||
| if (clamped_history_len > keep) clamped_history_len = keep; | |||
| if (clamped_history_len > 0 && !history_in) return -5; | |||
| const int block = 256; | |||
| const int grid_shift = (n_new + block - 1) / block; | |||
| float2* shifted = NULL; | |||
| cudaError_t err = cudaSuccess; | |||
| if (n_new > 0) { | |||
| if (!in_new) return -3; | |||
| err = cudaMalloc((void**)&shifted, (size_t)gpud_max_i(1, n_new) * sizeof(float2)); | |||
| if (err != cudaSuccess) return (int)err; | |||
| const int block = 256; | |||
| const int grid_shift = (n_new + block - 1) / block; | |||
| gpud_freq_shift_kernel<<<grid_shift, block>>>(in_new, shifted, n_new, phase_inc, phase_start); | |||
| err = cudaGetLastError(); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| } | |||
| } | |||
| if (history_len > 0 && history_in) { | |||
| err = cudaMemcpy(combined, history_in, (size_t)history_len * sizeof(float2), cudaMemcpyDeviceToDevice); | |||
| if (err != cudaSuccess) { | |||
| int phase_count = phase_count_in; | |||
| if (phase_count < 0) phase_count = 0; | |||
| if (phase_count >= decim) phase_count %= decim; | |||
| const int total_phase = phase_count + n_new; | |||
| const int out_count = total_phase / decim; | |||
| if (out_count > 0) { | |||
| if (!out) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| return -4; | |||
| } | |||
| } | |||
| if (n_new > 0) { | |||
| err = cudaMemcpy(combined + history_len, shifted, (size_t)n_new * sizeof(float2), cudaMemcpyDeviceToDevice); | |||
| const int block = 256; | |||
| const int grid = (out_count + block - 1) / block; | |||
| const int start_idx = decim - phase_count - 1; | |||
| gpud_streaming_polyphase_accum_kernel<<<grid, block>>>( | |||
| history_in, | |||
| clamped_history_len, | |||
| shifted, | |||
| n_new, | |||
| polyphase_taps, | |||
| polyphase_len, | |||
| decim, | |||
| phase_len, | |||
| start_idx, | |||
| out_count, | |||
| out | |||
| ); | |||
| err = cudaGetLastError(); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| } | |||
| } | |||
| int out_count = 0; | |||
| int phase_count = phase_count_in; | |||
| for (int i = 0; i < n_new; ++i) { | |||
| phase_count++; | |||
| if (phase_count == decim) { | |||
| float2 acc = make_float2(0.0f, 0.0f); | |||
| int newest = history_len + i; | |||
| for (int p = 0; p < decim; ++p) { | |||
| for (int k = 0; k < phase_len; ++k) { | |||
| int tap_idx = p * phase_len + k; | |||
| if (tap_idx >= polyphase_len) continue; | |||
| float tap; | |||
| err = cudaMemcpy(&tap, polyphase_taps + tap_idx, sizeof(float), cudaMemcpyDeviceToHost); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| } | |||
| if (tap == 0.0f) continue; | |||
| int src_back = p + k * decim; | |||
| int src_idx = newest - src_back; | |||
| if (src_idx < 0) continue; | |||
| float2 sample; | |||
| err = cudaMemcpy(&sample, combined + src_idx, sizeof(float2), cudaMemcpyDeviceToHost); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| } | |||
| acc.x += sample.x * tap; | |||
| acc.y += sample.y * tap; | |||
| } | |||
| } | |||
| err = cudaMemcpy(out + out_count, &acc, sizeof(float2), cudaMemcpyHostToDevice); | |||
| if (history_out && keep > 0) { | |||
| const int new_history_len = clamped_history_len + n_new < keep ? clamped_history_len + n_new : keep; | |||
| if (new_history_len > 0) { | |||
| const int block = 256; | |||
| const int grid = (new_history_len + block - 1) / block; | |||
| gpud_streaming_history_tail_kernel<<<grid, block>>>( | |||
| history_in, | |||
| clamped_history_len, | |||
| shifted, | |||
| n_new, | |||
| new_history_len, | |||
| history_out | |||
| ); | |||
| err = cudaGetLastError(); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| } | |||
| out_count++; | |||
| phase_count = 0; | |||
| } | |||
| } | |||
| const int keep = num_taps > 1 ? num_taps - 1 : 0; | |||
| if (history_out && keep > 0) { | |||
| int copy = keep; | |||
| if (combined_len < copy) copy = combined_len; | |||
| if (copy > 0) { | |||
| err = cudaMemcpy(history_out, combined + (combined_len - copy), (size_t)copy * sizeof(float2), cudaMemcpyDeviceToDevice); | |||
| if (err != cudaSuccess) { | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| return (int)err; | |||
| } | |||
| if (n_out) *n_out = out_count; | |||
| if (phase_count_out) *phase_count_out = total_phase % decim; | |||
| if (phase_end_out) *phase_end_out = gpud_reduce_phase(phase_start + phase_inc * (double)n_new); | |||
| if (shifted) cudaFree(shifted); | |||
| return 0; | |||
| } | |||
| static __device__ __forceinline__ float2 gpud_stream_sample_at( | |||
| const float2* __restrict__ history_state, | |||
| int history_len, | |||
| const float2* __restrict__ shifted_new, | |||
| int n_new, | |||
| int idx | |||
| ) { | |||
| if (idx < 0) return make_float2(0.0f, 0.0f); | |||
| if (idx < history_len) return history_state[idx]; | |||
| int shifted_idx = idx - history_len; | |||
| if (shifted_idx < 0 || shifted_idx >= n_new) return make_float2(0.0f, 0.0f); | |||
| return shifted_new[shifted_idx]; | |||
| } | |||
| __global__ void gpud_streaming_polyphase_accum_kernel( | |||
| const float2* __restrict__ history_state, | |||
| int history_len, | |||
| const float2* __restrict__ shifted_new, | |||
| int n_new, | |||
| const float* __restrict__ polyphase_taps, | |||
| int polyphase_len, | |||
| int decim, | |||
| int phase_len, | |||
| int start_idx, | |||
| int n_out, | |||
| float2* __restrict__ out | |||
| ) { | |||
| int out_idx = blockIdx.x * blockDim.x + threadIdx.x; | |||
| if (out_idx >= n_out) return; | |||
| int newest = history_len + start_idx + out_idx * decim; | |||
| float acc_r = 0.0f; | |||
| float acc_i = 0.0f; | |||
| for (int p = 0; p < decim; ++p) { | |||
| for (int k = 0; k < phase_len; ++k) { | |||
| int tap_idx = p * phase_len + k; | |||
| if (tap_idx >= polyphase_len) continue; | |||
| float tap = polyphase_taps[tap_idx]; | |||
| if (tap == 0.0f) continue; | |||
| int src_back = p + k * decim; | |||
| int src_idx = newest - src_back; | |||
| float2 sample = gpud_stream_sample_at(history_state, history_len, shifted_new, n_new, src_idx); | |||
| acc_r += sample.x * tap; | |||
| acc_i += sample.y * tap; | |||
| } | |||
| } | |||
| out[out_idx] = make_float2(acc_r, acc_i); | |||
| } | |||
| if (n_out) *n_out = out_count; | |||
| if (phase_count_out) *phase_count_out = phase_count; | |||
| if (phase_end_out) *phase_end_out = phase_start + phase_inc * (double)n_new; | |||
| __global__ void gpud_streaming_history_tail_kernel( | |||
| const float2* __restrict__ history_state, | |||
| int history_len, | |||
| const float2* __restrict__ shifted_new, | |||
| int n_new, | |||
| int keep, | |||
| float2* __restrict__ history_out | |||
| ) { | |||
| int idx = blockIdx.x * blockDim.x + threadIdx.x; | |||
| if (idx >= keep) return; | |||
| int combined_len = history_len + n_new; | |||
| int src_idx = combined_len - keep + idx; | |||
| history_out[idx] = gpud_stream_sample_at(history_state, history_len, shifted_new, n_new, src_idx); | |||
| } | |||
| static __forceinline__ double gpud_reduce_phase(double phase) { | |||
| const double TWO_PI = 6.283185307179586; | |||
| return phase - rint(phase / TWO_PI) * TWO_PI; | |||
| } | |||
| // Production-native candidate entrypoint for the stateful streaming extractor. | |||
| // Callers provide only NEW samples; overlap+trim is intentionally not part of this path. | |||
| GPUD_API int GPUD_CALL gpud_launch_streaming_polyphase_stateful_cuda( | |||
| const float2* in_new, | |||
| int n_new, | |||
| float2* shifted_new_tmp, | |||
| const float* polyphase_taps, | |||
| int polyphase_len, | |||
| int decim, | |||
| int num_taps, | |||
| float2* history_state, | |||
| float2* history_scratch, | |||
| int history_cap, | |||
| int* history_len_io, | |||
| int* phase_count_state, | |||
| double* phase_state, | |||
| double phase_inc, | |||
| float2* out, | |||
| int out_cap, | |||
| int* n_out | |||
| ) { | |||
| if (!polyphase_taps || decim <= 0 || num_taps <= 0 || !history_len_io || !phase_count_state || !phase_state || !n_out) return -10; | |||
| if (n_new < 0 || out_cap < 0 || history_cap < 0) return -11; | |||
| const int phase_len = (num_taps + decim - 1) / decim; | |||
| if (polyphase_len < decim * phase_len) return -12; | |||
| int history_len = *history_len_io; | |||
| if (history_len < 0) history_len = 0; | |||
| if (history_len > history_cap) history_len = history_cap; | |||
| int phase_count = *phase_count_state; | |||
| if (phase_count < 0) phase_count = 0; | |||
| if (phase_count >= decim) phase_count %= decim; | |||
| double phase_start = *phase_state; | |||
| if (n_new > 0) { | |||
| if (!in_new || !shifted_new_tmp) return -13; | |||
| const int block = 256; | |||
| const int grid = (n_new + block - 1) / block; | |||
| gpud_freq_shift_kernel<<<grid, block>>>(in_new, shifted_new_tmp, n_new, phase_inc, phase_start); | |||
| cudaError_t err = cudaGetLastError(); | |||
| if (err != cudaSuccess) return (int)err; | |||
| } | |||
| const int total_phase = phase_count + n_new; | |||
| const int out_count = total_phase / decim; | |||
| if (out_count > out_cap) return -14; | |||
| if (out_count > 0) { | |||
| if (!out) return -15; | |||
| const int block = 256; | |||
| const int grid = (out_count + block - 1) / block; | |||
| const int start_idx = decim - phase_count - 1; | |||
| gpud_streaming_polyphase_accum_kernel<<<grid, block>>>( | |||
| history_state, | |||
| history_len, | |||
| shifted_new_tmp, | |||
| n_new, | |||
| polyphase_taps, | |||
| polyphase_len, | |||
| decim, | |||
| phase_len, | |||
| start_idx, | |||
| out_count, | |||
| out | |||
| ); | |||
| cudaError_t err = cudaGetLastError(); | |||
| if (err != cudaSuccess) return (int)err; | |||
| } | |||
| int new_history_len = history_len; | |||
| if (history_cap > 0) { | |||
| new_history_len = history_len + n_new; | |||
| if (new_history_len > history_cap) new_history_len = history_cap; | |||
| if (new_history_len > 0) { | |||
| if (!history_state || !history_scratch) return -16; | |||
| const int block = 256; | |||
| const int grid = (new_history_len + block - 1) / block; | |||
| gpud_streaming_history_tail_kernel<<<grid, block>>>( | |||
| history_state, | |||
| history_len, | |||
| shifted_new_tmp, | |||
| n_new, | |||
| new_history_len, | |||
| history_scratch | |||
| ); | |||
| cudaError_t err = cudaGetLastError(); | |||
| if (err != cudaSuccess) return (int)err; | |||
| err = cudaMemcpy(history_state, history_scratch, (size_t)new_history_len * sizeof(float2), cudaMemcpyDeviceToDevice); | |||
| if (err != cudaSuccess) return (int)err; | |||
| } | |||
| } else { | |||
| new_history_len = 0; | |||
| } | |||
| cudaFree(shifted); | |||
| cudaFree(combined); | |||
| *history_len_io = new_history_len; | |||
| *phase_count_state = total_phase % decim; | |||
| *phase_state = gpud_reduce_phase(phase_start + phase_inc * (double)n_new); | |||
| *n_out = out_count; | |||
| return 0; | |||
| } | |||
| @@ -7,6 +7,7 @@ func (r *BatchRunner) ResetSignalState(signalID int64) { | |||
| return | |||
| } | |||
| delete(r.streamState, signalID) | |||
| r.resetNativeStreamingState(signalID) | |||
| } | |||
| func (r *BatchRunner) ResetAllSignalStates() { | |||
| @@ -14,6 +15,7 @@ func (r *BatchRunner) ResetAllSignalStates() { | |||
| return | |||
| } | |||
| r.streamState = make(map[int64]*ExtractStreamState) | |||
| r.resetAllNativeStreamingStates() | |||
| } | |||
| func (r *BatchRunner) getOrInitExtractState(job StreamingExtractJob, sampleRate int) (*ExtractStreamState, error) { | |||
| @@ -10,6 +10,7 @@ const ( | |||
| type StreamingGPUInvocation struct { | |||
| SignalID int64 | |||
| ConfigHash uint64 | |||
| OffsetHz float64 | |||
| OutRate int | |||
| Bandwidth float64 | |||
| @@ -9,15 +9,17 @@ func (r *BatchRunner) StreamingExtractGPUExec(iqNew []complex64, jobs []Streamin | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if useGPUHostOracleExecution { | |||
| execResults, err := r.executeStreamingGPUHostOraclePrepared(invocations) | |||
| if err != nil { | |||
| if useGPUNativePreparedExecution { | |||
| execResults, err := r.executeStreamingGPUNativePrepared(invocations) | |||
| if err == nil { | |||
| return r.applyStreamingGPUExecutionResults(execResults), nil | |||
| } | |||
| if !useGPUHostOracleExecution { | |||
| return nil, err | |||
| } | |||
| return r.applyStreamingGPUExecutionResults(execResults), nil | |||
| } | |||
| if useGPUNativePreparedExecution { | |||
| execResults, err := r.executeStreamingGPUNativePrepared(invocations) | |||
| if useGPUHostOracleExecution { | |||
| execResults, err := r.executeStreamingGPUHostOraclePrepared(invocations) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| @@ -2,7 +2,7 @@ package gpudemod | |||
| import "testing" | |||
| func TestStreamingExtractGPUExecUnavailableByDefault(t *testing.T) { | |||
| func TestStreamingExtractGPUExecUsesSafeDefaultMode(t *testing.T) { | |||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||
| job := StreamingExtractJob{ | |||
| SignalID: 1, | |||
| @@ -12,8 +12,101 @@ func TestStreamingExtractGPUExecUnavailableByDefault(t *testing.T) { | |||
| NumTaps: 65, | |||
| ConfigHash: 777, | |||
| } | |||
| _, err := r.StreamingExtractGPUExec(makeDeterministicIQ(2048), []StreamingExtractJob{job}) | |||
| if err == nil { | |||
| t.Fatalf("expected unavailable/disabled execution path by default") | |||
| res, err := r.StreamingExtractGPUExec(makeDeterministicIQ(2048), []StreamingExtractJob{job}) | |||
| if err != nil { | |||
| t.Fatalf("expected safe default execution path, got error: %v", err) | |||
| } | |||
| if len(res) != 1 { | |||
| t.Fatalf("expected 1 result, got %d", len(res)) | |||
| } | |||
| if res[0].Rate != job.OutRate { | |||
| t.Fatalf("expected output rate %d, got %d", job.OutRate, res[0].Rate) | |||
| } | |||
| if res[0].NOut <= 0 { | |||
| t.Fatalf("expected streaming output samples") | |||
| } | |||
| } | |||
| func TestStreamingGPUExecMatchesCPUOracleAcrossChunkPatterns(t *testing.T) { | |||
| job := StreamingExtractJob{ | |||
| SignalID: 1, | |||
| OffsetHz: 12500, | |||
| Bandwidth: 20000, | |||
| OutRate: 200000, | |||
| NumTaps: 65, | |||
| ConfigHash: 777, | |||
| } | |||
| t.Run("DeterministicIQ", func(t *testing.T) { | |||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||
| steps := makeStreamingValidationSteps( | |||
| makeDeterministicIQ(1500), | |||
| []int{0, 1, 2, 17, 63, 64, 65, 129, 511}, | |||
| []StreamingExtractJob{job}, | |||
| ) | |||
| runStreamingExecSequenceAgainstOracle(t, r, steps, 1e-5, 1e-9) | |||
| }) | |||
| t.Run("ToneNoiseIQ", func(t *testing.T) { | |||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||
| steps := makeStreamingValidationSteps( | |||
| makeToneNoiseIQ(4096, 0.023), | |||
| []int{7, 20, 3, 63, 64, 65, 777}, | |||
| []StreamingExtractJob{job}, | |||
| ) | |||
| runStreamingExecSequenceAgainstOracle(t, r, steps, 1e-5, 1e-9) | |||
| }) | |||
| } | |||
| func TestStreamingGPUExecLifecycleMatchesCPUOracle(t *testing.T) { | |||
| r := &BatchRunner{ | |||
| eng: &Engine{sampleRate: 4000000}, | |||
| streamState: make(map[int64]*ExtractStreamState), | |||
| nativeState: make(map[int64]*nativeStreamingSignalState), | |||
| } | |||
| baseA := StreamingExtractJob{ | |||
| SignalID: 11, | |||
| OffsetHz: 12500, | |||
| Bandwidth: 20000, | |||
| OutRate: 200000, | |||
| NumTaps: 65, | |||
| ConfigHash: 1001, | |||
| } | |||
| baseB := StreamingExtractJob{ | |||
| SignalID: 22, | |||
| OffsetHz: -18750, | |||
| Bandwidth: 16000, | |||
| OutRate: 100000, | |||
| NumTaps: 33, | |||
| ConfigHash: 2002, | |||
| } | |||
| steps := []streamingValidationStep{ | |||
| { | |||
| name: "prime_both_signals", | |||
| iq: makeDeterministicIQ(512), | |||
| jobs: []StreamingExtractJob{baseA, baseB}, | |||
| }, | |||
| { | |||
| name: "config_reset_with_zero_new", | |||
| iq: nil, | |||
| jobs: []StreamingExtractJob{{SignalID: baseA.SignalID, OffsetHz: baseA.OffsetHz, Bandwidth: baseA.Bandwidth, OutRate: baseA.OutRate, NumTaps: baseA.NumTaps, ConfigHash: baseA.ConfigHash + 1}, baseB}, | |||
| }, | |||
| { | |||
| name: "signal_b_disappears", | |||
| iq: makeToneNoiseIQ(96, 0.041), | |||
| jobs: []StreamingExtractJob{baseA}, | |||
| }, | |||
| { | |||
| name: "signal_b_reappears_fresh", | |||
| iq: makeDeterministicIQ(160), | |||
| jobs: []StreamingExtractJob{baseA, baseB}, | |||
| }, | |||
| { | |||
| name: "small_history_boundary_chunk", | |||
| iq: makeToneNoiseIQ(65, 0.017), | |||
| jobs: []StreamingExtractJob{baseA, baseB}, | |||
| }, | |||
| } | |||
| runStreamingExecSequenceAgainstOracle(t, r, steps, 1e-5, 1e-9) | |||
| if _, ok := r.nativeState[baseB.SignalID]; ok { | |||
| t.Fatalf("expected safe host-oracle path to keep native state inactive while gate is off") | |||
| } | |||
| } | |||
| @@ -15,101 +15,270 @@ import ( | |||
| ) | |||
| func (r *BatchRunner) executeStreamingGPUNativePrepared(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { | |||
| if r == nil || r.eng == nil { | |||
| return nil, ErrUnavailable | |||
| } | |||
| if r.nativeState == nil { | |||
| r.nativeState = make(map[int64]*nativeStreamingSignalState) | |||
| } | |||
| results := make([]StreamingGPUExecutionResult, len(invocations)) | |||
| for i, inv := range invocations { | |||
| phaseInc := -2.0 * math.Pi * inv.OffsetHz / float64(inv.SampleRate) | |||
| outCap := len(inv.IQNew)/maxInt(1, inv.Decim) + 2 | |||
| outHost := make([]complex64, outCap) | |||
| histCap := maxInt(0, inv.NumTaps-1) | |||
| histHost := make([]complex64, histCap) | |||
| var nOut C.int | |||
| var phaseCountOut C.int | |||
| var phaseEndOut C.double | |||
| var dInNew, dHistIn, dOut, dHistOut unsafe.Pointer | |||
| var dTaps unsafe.Pointer | |||
| if len(inv.IQNew) > 0 { | |||
| if bridgeCudaMalloc(&dInNew, uintptr(len(inv.IQNew))*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| defer bridgeCudaFree(dInNew) | |||
| if bridgeMemcpyH2D(dInNew, unsafe.Pointer(&inv.IQNew[0]), uintptr(len(inv.IQNew))*unsafe.Sizeof(complex64(0))) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| state, err := r.getOrInitNativeStreamingState(inv) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| if len(inv.ShiftedHistory) > 0 { | |||
| if bridgeCudaMalloc(&dHistIn, uintptr(len(inv.ShiftedHistory))*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| defer bridgeCudaFree(dHistIn) | |||
| if bridgeMemcpyH2D(dHistIn, unsafe.Pointer(&inv.ShiftedHistory[0]), uintptr(len(inv.ShiftedHistory))*unsafe.Sizeof(complex64(0))) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| } | |||
| if len(inv.PolyphaseTaps) > 0 { | |||
| if bridgeCudaMalloc(&dTaps, uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(C.float(0))) != 0 { | |||
| return nil, ErrUnavailable | |||
| if len(inv.IQNew) > 0 { | |||
| if err := ensureNativeBuffer(&state.dInNew, &state.inNewCap, len(inv.IQNew), unsafe.Sizeof(C.gpud_float2{})); err != nil { | |||
| return nil, err | |||
| } | |||
| defer bridgeCudaFree(dTaps) | |||
| if bridgeMemcpyH2D(dTaps, unsafe.Pointer(&inv.PolyphaseTaps[0]), uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(float32(0))) != 0 { | |||
| if bridgeMemcpyH2D(state.dInNew, unsafe.Pointer(&inv.IQNew[0]), uintptr(len(inv.IQNew))*unsafe.Sizeof(complex64(0))) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| } | |||
| outCap := len(inv.IQNew)/maxInt(1, inv.Decim) + 2 | |||
| if outCap > 0 { | |||
| if bridgeCudaMalloc(&dOut, uintptr(outCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| defer bridgeCudaFree(dOut) | |||
| } | |||
| if histCap > 0 { | |||
| if bridgeCudaMalloc(&dHistOut, uintptr(histCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 { | |||
| return nil, ErrUnavailable | |||
| if err := ensureNativeBuffer(&state.dOut, &state.outCap, outCap, unsafe.Sizeof(C.gpud_float2{})); err != nil { | |||
| return nil, err | |||
| } | |||
| defer bridgeCudaFree(dHistOut) | |||
| } | |||
| res := bridgeLaunchStreamingPolyphasePrepare( | |||
| (*C.gpud_float2)(dInNew), | |||
| phaseInc := -2.0 * math.Pi * inv.OffsetHz / float64(inv.SampleRate) | |||
| // The native export consumes phase carry as host scalars while sample/history | |||
| // buffers remain device-resident, so keep these counters in nativeState. | |||
| var nOut C.int | |||
| historyLen := C.int(state.historyLen) | |||
| phaseCount := C.int(state.phaseCount) | |||
| phaseNCO := C.double(state.phaseNCO) | |||
| res := bridgeLaunchStreamingPolyphaseStateful( | |||
| (*C.gpud_float2)(state.dInNew), | |||
| len(inv.IQNew), | |||
| (*C.gpud_float2)(dHistIn), | |||
| len(inv.ShiftedHistory), | |||
| (*C.float)(dTaps), | |||
| len(inv.PolyphaseTaps), | |||
| inv.Decim, | |||
| inv.NumTaps, | |||
| inv.PhaseCountIn, | |||
| inv.NCOPhaseIn, | |||
| (*C.gpud_float2)(state.dShifted), | |||
| (*C.float)(state.dTaps), | |||
| state.tapsLen, | |||
| state.decim, | |||
| state.numTaps, | |||
| (*C.gpud_float2)(state.dHistory), | |||
| (*C.gpud_float2)(state.dHistoryScratch), | |||
| state.historyCap, | |||
| &historyLen, | |||
| &phaseCount, | |||
| &phaseNCO, | |||
| phaseInc, | |||
| (*C.gpud_float2)(dOut), | |||
| (*C.gpud_float2)(state.dOut), | |||
| outCap, | |||
| &nOut, | |||
| &phaseCountOut, | |||
| &phaseEndOut, | |||
| (*C.gpud_float2)(dHistOut), | |||
| ) | |||
| if res != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| if int(nOut) > 0 { | |||
| if bridgeMemcpyD2H(unsafe.Pointer(&outHost[0]), dOut, uintptr(int(nOut))*unsafe.Sizeof(complex64(0))) != 0 { | |||
| state.historyLen = int(historyLen) | |||
| state.phaseCount = int(phaseCount) | |||
| state.phaseNCO = float64(phaseNCO) | |||
| outHost := make([]complex64, int(nOut)) | |||
| if len(outHost) > 0 { | |||
| if bridgeMemcpyD2H(unsafe.Pointer(&outHost[0]), state.dOut, uintptr(len(outHost))*unsafe.Sizeof(complex64(0))) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| } | |||
| if histCap > 0 { | |||
| if bridgeMemcpyD2H(unsafe.Pointer(&histHost[0]), dHistOut, uintptr(histCap)*unsafe.Sizeof(complex64(0))) != 0 { | |||
| histHost := make([]complex64, state.historyLen) | |||
| if state.historyLen > 0 { | |||
| if bridgeMemcpyD2H(unsafe.Pointer(&histHost[0]), state.dHistory, uintptr(state.historyLen)*unsafe.Sizeof(complex64(0))) != 0 { | |||
| return nil, ErrUnavailable | |||
| } | |||
| } | |||
| results[i] = StreamingGPUExecutionResult{ | |||
| SignalID: inv.SignalID, | |||
| Mode: StreamingGPUExecCUDA, | |||
| IQ: append([]complex64(nil), outHost[:int(nOut)]...), | |||
| IQ: outHost, | |||
| Rate: inv.OutRate, | |||
| NOut: int(nOut), | |||
| PhaseCountOut: int(phaseCountOut), | |||
| NCOPhaseOut: float64(phaseEndOut), | |||
| HistoryOut: append([]complex64(nil), histHost...), | |||
| HistoryLenOut: histCap, | |||
| NOut: len(outHost), | |||
| PhaseCountOut: state.phaseCount, | |||
| NCOPhaseOut: state.phaseNCO, | |||
| HistoryOut: histHost, | |||
| HistoryLenOut: len(histHost), | |||
| } | |||
| } | |||
| return results, nil | |||
| } | |||
// getOrInitNativeStreamingState returns the cached native (device-side) state
// for inv.SignalID, creating it on first use and resetting it whenever the
// invocation no longer matches what the cached buffers were built for
// (config hash, decimation, tap count/length, or history capacity).
// On reset all device buffers are released and lazily reallocated below, and
// the phase/history bookkeeping is re-seeded from the invocation.
// Returns ErrUnavailable on any device allocation/copy failure or when the
// invocation carries no polyphase taps.
func (r *BatchRunner) getOrInitNativeStreamingState(inv StreamingGPUInvocation) (*nativeStreamingSignalState, error) {
	state := r.nativeState[inv.SignalID]
	needReset := false
	historyCap := maxInt(0, inv.NumTaps-1) // FIR history carries numTaps-1 samples
	if state == nil {
		state = &nativeStreamingSignalState{signalID: inv.SignalID}
		r.nativeState[inv.SignalID] = state
		needReset = true
	}
	if state.configHash != inv.ConfigHash {
		needReset = true
	}
	if state.decim != inv.Decim || state.numTaps != inv.NumTaps || state.tapsLen != len(inv.PolyphaseTaps) {
		needReset = true
	}
	if state.historyCap != historyCap {
		needReset = true
	}
	if needReset {
		// Frees every device buffer and zeroes all bookkeeping fields.
		releaseNativeStreamingSignalState(state)
	}
	if len(inv.PolyphaseTaps) == 0 {
		return nil, ErrUnavailable
	}
	// Upload the polyphase taps once per (re)initialization.
	if state.dTaps == nil && len(inv.PolyphaseTaps) > 0 {
		if bridgeCudaMalloc(&state.dTaps, uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(C.float(0))) != 0 {
			return nil, ErrUnavailable
		}
		if bridgeMemcpyH2D(state.dTaps, unsafe.Pointer(&inv.PolyphaseTaps[0]), uintptr(len(inv.PolyphaseTaps))*unsafe.Sizeof(float32(0))) != 0 {
			return nil, ErrUnavailable
		}
		state.tapsLen = len(inv.PolyphaseTaps)
	}
	// Shifted-sample buffer: allocate lazily, then grow (free + realloc) when
	// the incoming chunk exceeds the current capacity.
	if state.dShifted == nil {
		minCap := maxInt(1, len(inv.IQNew))
		if bridgeCudaMalloc(&state.dShifted, uintptr(minCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 {
			return nil, ErrUnavailable
		}
		state.shiftedCap = minCap
	}
	if state.shiftedCap < len(inv.IQNew) {
		if bridgeCudaFree(state.dShifted) != 0 {
			return nil, ErrUnavailable
		}
		state.dShifted = nil
		state.shiftedCap = 0
		if bridgeCudaMalloc(&state.dShifted, uintptr(len(inv.IQNew))*unsafe.Sizeof(C.gpud_float2{})) != 0 {
			return nil, ErrUnavailable
		}
		state.shiftedCap = len(inv.IQNew)
	}
	// History buffers are fixed-size (historyCap elements) for this config;
	// a capacity change forces a reset above, so no grow path is needed here.
	if state.dHistory == nil && historyCap > 0 {
		if bridgeCudaMalloc(&state.dHistory, uintptr(historyCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 {
			return nil, ErrUnavailable
		}
	}
	if state.dHistoryScratch == nil && historyCap > 0 {
		if bridgeCudaMalloc(&state.dHistoryScratch, uintptr(historyCap)*unsafe.Sizeof(C.gpud_float2{})) != 0 {
			return nil, ErrUnavailable
		}
		state.historyScratchCap = historyCap
	}
	if needReset {
		// Seed the streaming bookkeeping from the invocation's carried-in
		// values, keeping only the most recent historyCap history samples.
		state.phaseCount = inv.PhaseCountIn
		state.phaseNCO = inv.NCOPhaseIn
		state.historyLen = minInt(len(inv.ShiftedHistory), historyCap)
		if state.historyLen > 0 {
			if bridgeMemcpyH2D(state.dHistory, unsafe.Pointer(&inv.ShiftedHistory[len(inv.ShiftedHistory)-state.historyLen]), uintptr(state.historyLen)*unsafe.Sizeof(complex64(0))) != 0 {
				return nil, ErrUnavailable
			}
		}
	}
	state.decim = inv.Decim
	state.numTaps = inv.NumTaps
	state.historyCap = historyCap
	state.historyScratchCap = historyCap
	state.configHash = inv.ConfigHash
	return state, nil
}
| func ensureNativeBuffer(ptr *unsafe.Pointer, capRef *int, need int, elemSize uintptr) error { | |||
| if need <= 0 { | |||
| return nil | |||
| } | |||
| if *ptr != nil && *capRef >= need { | |||
| return nil | |||
| } | |||
| if *ptr != nil { | |||
| if bridgeCudaFree(*ptr) != 0 { | |||
| return ErrUnavailable | |||
| } | |||
| *ptr = nil | |||
| *capRef = 0 | |||
| } | |||
| if bridgeCudaMalloc(ptr, uintptr(need)*elemSize) != 0 { | |||
| return ErrUnavailable | |||
| } | |||
| *capRef = need | |||
| return nil | |||
| } | |||
| func (r *BatchRunner) syncNativeStreamingStates(active map[int64]struct{}) { | |||
| if r == nil || r.nativeState == nil { | |||
| return | |||
| } | |||
| for id, state := range r.nativeState { | |||
| if _, ok := active[id]; ok { | |||
| continue | |||
| } | |||
| releaseNativeStreamingSignalState(state) | |||
| delete(r.nativeState, id) | |||
| } | |||
| } | |||
| func (r *BatchRunner) resetNativeStreamingState(signalID int64) { | |||
| if r == nil || r.nativeState == nil { | |||
| return | |||
| } | |||
| if state := r.nativeState[signalID]; state != nil { | |||
| releaseNativeStreamingSignalState(state) | |||
| } | |||
| delete(r.nativeState, signalID) | |||
| } | |||
| func (r *BatchRunner) resetAllNativeStreamingStates() { | |||
| if r == nil { | |||
| return | |||
| } | |||
| r.freeAllNativeStreamingStates() | |||
| r.nativeState = make(map[int64]*nativeStreamingSignalState) | |||
| } | |||
| func (r *BatchRunner) freeAllNativeStreamingStates() { | |||
| if r == nil || r.nativeState == nil { | |||
| return | |||
| } | |||
| for id, state := range r.nativeState { | |||
| releaseNativeStreamingSignalState(state) | |||
| delete(r.nativeState, id) | |||
| } | |||
| } | |||
| func releaseNativeStreamingSignalState(state *nativeStreamingSignalState) { | |||
| if state == nil { | |||
| return | |||
| } | |||
| for _, ptr := range []*unsafe.Pointer{ | |||
| &state.dInNew, | |||
| &state.dShifted, | |||
| &state.dOut, | |||
| &state.dTaps, | |||
| &state.dHistory, | |||
| &state.dHistoryScratch, | |||
| } { | |||
| if *ptr != nil { | |||
| _ = bridgeCudaFree(*ptr) | |||
| *ptr = nil | |||
| } | |||
| } | |||
| state.inNewCap = 0 | |||
| state.shiftedCap = 0 | |||
| state.outCap = 0 | |||
| state.tapsLen = 0 | |||
| state.historyCap = 0 | |||
| state.historyLen = 0 | |||
| state.historyScratchCap = 0 | |||
| state.phaseCount = 0 | |||
| state.phaseNCO = 0 | |||
| state.decim = 0 | |||
| state.numTaps = 0 | |||
| state.configHash = 0 | |||
| } | |||
// minInt returns the smaller of a and b.
func minInt(a int, b int) int {
	if b < a {
		return b
	}
	return a
}
| @@ -6,3 +6,39 @@ func (r *BatchRunner) executeStreamingGPUNativePrepared(invocations []StreamingG | |||
| _ = invocations | |||
| return nil, ErrUnavailable | |||
| } | |||
| func (r *BatchRunner) syncNativeStreamingStates(active map[int64]struct{}) { | |||
| _ = active | |||
| if r == nil { | |||
| return | |||
| } | |||
| if r.nativeState == nil { | |||
| r.nativeState = make(map[int64]*nativeStreamingSignalState) | |||
| } | |||
| for id := range r.nativeState { | |||
| if _, ok := active[id]; !ok { | |||
| delete(r.nativeState, id) | |||
| } | |||
| } | |||
| } | |||
| func (r *BatchRunner) resetNativeStreamingState(signalID int64) { | |||
| if r == nil || r.nativeState == nil { | |||
| return | |||
| } | |||
| delete(r.nativeState, signalID) | |||
| } | |||
| func (r *BatchRunner) resetAllNativeStreamingStates() { | |||
| if r == nil { | |||
| return | |||
| } | |||
| r.nativeState = make(map[int64]*nativeStreamingSignalState) | |||
| } | |||
| func (r *BatchRunner) freeAllNativeStreamingStates() { | |||
| if r == nil { | |||
| return | |||
| } | |||
| r.nativeState = nil | |||
| } | |||
| @@ -2,10 +2,49 @@ | |||
| package gpudemod | |||
| import "testing" | |||
| import ( | |||
| "os" | |||
| "path/filepath" | |||
| "testing" | |||
| ) | |||
| func TestStreamingGPUNativePreparedComparableToCPUOracle(t *testing.T) { | |||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||
| func configureNativePreparedDLLPath(t *testing.T) { | |||
| t.Helper() | |||
| candidates := []string{ | |||
| filepath.Join("build", "gpudemod_kernels.dll"), | |||
| filepath.Join("internal", "demod", "gpudemod", "build", "gpudemod_kernels.dll"), | |||
| "gpudemod_kernels.dll", | |||
| } | |||
| for _, candidate := range candidates { | |||
| if _, err := os.Stat(candidate); err == nil { | |||
| abs, err := filepath.Abs(candidate) | |||
| if err != nil { | |||
| t.Fatalf("resolve native prepared DLL path: %v", err) | |||
| } | |||
| t.Setenv("GPUMOD_DLL", abs) | |||
| return | |||
| } | |||
| } | |||
| } | |||
// requireNativePreparedTestRunner returns a BatchRunner backed by the native
// CUDA DLL. The test is skipped (not failed) when the DLL cannot be loaded,
// when no CUDA device is available, or when runner construction fails, so
// these tests are no-ops on machines without the GPU toolchain.
// The runner is closed automatically via t.Cleanup.
func requireNativePreparedTestRunner(t *testing.T) *BatchRunner {
	t.Helper()
	configureNativePreparedDLLPath(t)
	if err := ensureDLLLoaded(); err != nil {
		t.Skipf("native prepared path unavailable: %v", err)
	}
	if !Available() {
		t.Skip("native prepared path unavailable: cuda device not available")
	}
	// 32768 / 4000000 mirror the block size and 4 MHz sample rate used by the
	// streaming jobs in these tests.
	r, err := NewBatchRunner(32768, 4000000)
	if err != nil {
		t.Skipf("native prepared path unavailable: %v", err)
	}
	t.Cleanup(r.Close)
	return r
}
| func TestStreamingGPUNativePreparedMatchesCPUOracleAcrossChunkPatterns(t *testing.T) { | |||
| job := StreamingExtractJob{ | |||
| SignalID: 1, | |||
| OffsetHz: 12500, | |||
| @@ -14,24 +53,154 @@ func TestStreamingGPUNativePreparedComparableToCPUOracle(t *testing.T) { | |||
| NumTaps: 65, | |||
| ConfigHash: 777, | |||
| } | |||
| iq := makeDeterministicIQ(16000) | |||
| gpuRes, err := r.StreamingExtractGPU(iq, []StreamingExtractJob{job}) | |||
| if err != nil { | |||
| t.Fatalf("unexpected native prepared GPU error: %v", err) | |||
| exec := func(r *BatchRunner, invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { | |||
| return r.executeStreamingGPUNativePrepared(invocations) | |||
| } | |||
| oracleRunner := NewCPUOracleRunner(4000000) | |||
| oracleRes, err := oracleRunner.StreamingExtract(iq, []StreamingExtractJob{job}) | |||
| if err != nil { | |||
| t.Fatalf("unexpected oracle error: %v", err) | |||
| t.Run("DeterministicIQ", func(t *testing.T) { | |||
| r := requireNativePreparedTestRunner(t) | |||
| steps := makeStreamingValidationSteps( | |||
| makeDeterministicIQ(8192), | |||
| []int{0, 1, 2, 17, 63, 64, 65, 129, 511, 2048}, | |||
| []StreamingExtractJob{job}, | |||
| ) | |||
| runPreparedSequenceAgainstOracle(t, r, exec, steps, 1e-4, 1e-8) | |||
| }) | |||
| t.Run("ToneNoiseIQ", func(t *testing.T) { | |||
| r := requireNativePreparedTestRunner(t) | |||
| steps := makeStreamingValidationSteps( | |||
| makeToneNoiseIQ(12288, 0.023), | |||
| []int{7, 20, 3, 63, 64, 65, 777, 2048, 4096}, | |||
| []StreamingExtractJob{job}, | |||
| ) | |||
| runPreparedSequenceAgainstOracle(t, r, exec, steps, 1e-4, 1e-8) | |||
| }) | |||
| } | |||
| func TestStreamingGPUNativePreparedLifecycleResetAndCapacity(t *testing.T) { | |||
| r := requireNativePreparedTestRunner(t) | |||
| exec := func(invocations []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) { | |||
| return r.executeStreamingGPUNativePrepared(invocations) | |||
| } | |||
| if len(gpuRes) != 1 || len(oracleRes) != 1 { | |||
| t.Fatalf("unexpected result sizes: gpu=%d oracle=%d", len(gpuRes), len(oracleRes)) | |||
| jobA := StreamingExtractJob{ | |||
| SignalID: 11, | |||
| OffsetHz: 12500, | |||
| Bandwidth: 20000, | |||
| OutRate: 200000, | |||
| NumTaps: 65, | |||
| ConfigHash: 3001, | |||
| } | |||
| jobB := StreamingExtractJob{ | |||
| SignalID: 22, | |||
| OffsetHz: -18750, | |||
| Bandwidth: 16000, | |||
| OutRate: 100000, | |||
| NumTaps: 33, | |||
| ConfigHash: 4002, | |||
| } | |||
| metrics, stats := CompareOracleAndGPUHostOracle(oracleRes[0], gpuRes[0]) | |||
| if stats.Count == 0 { | |||
| t.Fatalf("expected compare count > 0") | |||
| steps := []streamingValidationStep{ | |||
| { | |||
| name: "prime_both_signals", | |||
| iq: makeDeterministicIQ(256), | |||
| jobs: []StreamingExtractJob{jobA, jobB}, | |||
| }, | |||
| { | |||
| name: "grow_capacity", | |||
| iq: makeToneNoiseIQ(4096, 0.037), | |||
| jobs: []StreamingExtractJob{jobA, jobB}, | |||
| }, | |||
| { | |||
| name: "config_reset_zero_new", | |||
| iq: nil, | |||
| jobs: []StreamingExtractJob{{SignalID: jobA.SignalID, OffsetHz: jobA.OffsetHz, Bandwidth: jobA.Bandwidth, OutRate: jobA.OutRate, NumTaps: jobA.NumTaps, ConfigHash: jobA.ConfigHash + 1}, jobB}, | |||
| }, | |||
| { | |||
| name: "signal_b_disappears", | |||
| iq: makeDeterministicIQ(64), | |||
| jobs: []StreamingExtractJob{jobA}, | |||
| }, | |||
| { | |||
| name: "signal_b_reappears", | |||
| iq: makeToneNoiseIQ(96, 0.017), | |||
| jobs: []StreamingExtractJob{jobA, jobB}, | |||
| }, | |||
| { | |||
| name: "history_boundary", | |||
| iq: makeDeterministicIQ(65), | |||
| jobs: []StreamingExtractJob{jobA, jobB}, | |||
| }, | |||
| } | |||
| if metrics.RefMaxAbsErr > 1e-4 { | |||
| t.Fatalf("native prepared path diverges too much from oracle: max abs err=%f", metrics.RefMaxAbsErr) | |||
| oracle := NewCPUOracleRunner(r.eng.sampleRate) | |||
| var grownCap int | |||
| for idx, step := range steps { | |||
| invocations, err := r.buildStreamingGPUInvocations(step.iq, step.jobs) | |||
| if err != nil { | |||
| t.Fatalf("step %d (%s): build invocations failed: %v", idx, step.name, err) | |||
| } | |||
| got, err := exec(invocations) | |||
| if err != nil { | |||
| t.Fatalf("step %d (%s): native prepared exec failed: %v", idx, step.name, err) | |||
| } | |||
| want, err := oracle.StreamingExtract(step.iq, step.jobs) | |||
| if err != nil { | |||
| t.Fatalf("step %d (%s): oracle failed: %v", idx, step.name, err) | |||
| } | |||
| if len(got) != len(want) { | |||
| t.Fatalf("step %d (%s): result count mismatch: got=%d want=%d", idx, step.name, len(got), len(want)) | |||
| } | |||
| applied := r.applyStreamingGPUExecutionResults(got) | |||
| for i, job := range step.jobs { | |||
| oracleState := oracle.States[job.SignalID] | |||
| requirePreparedExecutionResultMatchesOracle(t, got[i], want[i], oracleState, 1e-4, 1e-8) | |||
| requireStreamingExtractResultMatchesOracle(t, applied[i], want[i]) | |||
| requireExtractStateMatchesOracle(t, r.streamState[job.SignalID], oracleState, 1e-8, 1e-4) | |||
| state := r.nativeState[job.SignalID] | |||
| if state == nil { | |||
| t.Fatalf("step %d (%s): missing native state for signal %d", idx, step.name, job.SignalID) | |||
| } | |||
| if state.configHash != job.ConfigHash { | |||
| t.Fatalf("step %d (%s): native config hash mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.configHash, job.ConfigHash) | |||
| } | |||
| if state.decim != oracleState.Decim { | |||
| t.Fatalf("step %d (%s): native decim mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.decim, oracleState.Decim) | |||
| } | |||
| if state.numTaps != oracleState.NumTaps { | |||
| t.Fatalf("step %d (%s): native num taps mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.numTaps, oracleState.NumTaps) | |||
| } | |||
| if state.historyCap != maxInt(0, oracleState.NumTaps-1) { | |||
| t.Fatalf("step %d (%s): native history cap mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.historyCap, maxInt(0, oracleState.NumTaps-1)) | |||
| } | |||
| if state.historyLen != len(oracleState.ShiftedHistory) { | |||
| t.Fatalf("step %d (%s): native history len mismatch for signal %d: got=%d want=%d", idx, step.name, job.SignalID, state.historyLen, len(oracleState.ShiftedHistory)) | |||
| } | |||
| if len(step.iq) > 0 && state.shiftedCap < len(step.iq) { | |||
| t.Fatalf("step %d (%s): native shifted capacity too small for signal %d: got=%d need>=%d", idx, step.name, job.SignalID, state.shiftedCap, len(step.iq)) | |||
| } | |||
| if state.outCap < got[i].NOut { | |||
| t.Fatalf("step %d (%s): native out capacity too small for signal %d: got=%d need>=%d", idx, step.name, job.SignalID, state.outCap, got[i].NOut) | |||
| } | |||
| if job.SignalID == jobA.SignalID && state.shiftedCap > grownCap { | |||
| grownCap = state.shiftedCap | |||
| } | |||
| } | |||
| if step.name == "grow_capacity" && grownCap < len(step.iq) { | |||
| t.Fatalf("expected capacity growth for signal %d, got=%d want>=%d", jobA.SignalID, grownCap, len(step.iq)) | |||
| } | |||
| if step.name == "config_reset_zero_new" { | |||
| state := r.nativeState[jobA.SignalID] | |||
| if state == nil { | |||
| t.Fatalf("missing native state for signal %d after config reset", jobA.SignalID) | |||
| } | |||
| if state.historyLen != 0 { | |||
| t.Fatalf("expected cleared native history after config reset, got=%d", state.historyLen) | |||
| } | |||
| } | |||
| if step.name == "signal_b_disappears" { | |||
| if _, ok := r.nativeState[jobB.SignalID]; ok { | |||
| t.Fatalf("expected native state for signal %d to be removed on disappearance", jobB.SignalID) | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,28 @@ | |||
| package gpudemod | |||
| import "unsafe" | |||
// nativeStreamingSignalState caches per-signal device resources and streaming
// bookkeeping for the native (CUDA) prepared execution path. Entries are
// stored in BatchRunner.nativeState keyed by signal id and are torn down via
// releaseNativeStreamingSignalState.
type nativeStreamingSignalState struct {
	signalID   int64  // owning signal id (duplicates the map key)
	configHash uint64 // config hash of the invocation this state was built for
	decim      int    // decimation factor the buffers were configured for
	numTaps    int    // FIR tap count; history capacity is numTaps-1
	// Device allocations made with bridgeCudaMalloc; nil when unallocated.
	dInNew          unsafe.Pointer // incoming samples (presumably; sized by inNewCap)
	dShifted        unsafe.Pointer // shifted-sample buffer, sized to the incoming chunk
	dOut            unsafe.Pointer // output buffer, sized by outCap
	dTaps           unsafe.Pointer // uploaded polyphase taps (tapsLen floats)
	dHistory        unsafe.Pointer // persistent shifted-sample history (historyCap)
	dHistoryScratch unsafe.Pointer // scratch history buffer (historyScratchCap)
	// Element-count capacities/lengths tracking the allocations above.
	inNewCap          int
	shiftedCap        int
	outCap            int
	tapsLen           int
	historyCap        int
	historyLen        int
	historyScratchCap int
	phaseCount int     // running polyphase decimator phase counter
	phaseNCO   float64 // running NCO phase carried across chunks
}
| @@ -14,6 +14,7 @@ func (r *BatchRunner) buildStreamingGPUInvocations(iqNew []complex64, jobs []Str | |||
| } | |||
| invocations[i] = StreamingGPUInvocation{ | |||
| SignalID: job.SignalID, | |||
| ConfigHash: state.ConfigHash, | |||
| OffsetHz: job.OffsetHz, | |||
| OutRate: job.OutRate, | |||
| Bandwidth: job.Bandwidth, | |||
| @@ -34,6 +35,7 @@ func (r *BatchRunner) buildStreamingGPUInvocations(iqNew []complex64, jobs []Str | |||
| delete(r.streamState, signalID) | |||
| } | |||
| } | |||
| r.syncNativeStreamingStates(active) | |||
| return invocations, nil | |||
| } | |||
| @@ -1,7 +1,5 @@ | |||
| package gpudemod | |||
| import "fmt" | |||
| func updateShiftedHistory(prev []complex64, shiftedNew []complex64, numTaps int) []complex64 { | |||
| need := numTaps - 1 | |||
| if need <= 0 { | |||
| @@ -18,22 +16,11 @@ func updateShiftedHistory(prev []complex64, shiftedNew []complex64, numTaps int) | |||
| return out | |||
| } | |||
| // StreamingExtractGPU is the planned production entry point for the stateful | |||
| // GPU extractor path. It intentionally exists early as an explicit boundary so | |||
| // callers can migrate away from legacy overlap+trim semantics. | |||
| // | |||
| // Current status: | |||
| // - validates jobs against persistent per-signal state ownership | |||
| // - enforces exact integer decimation | |||
| // - initializes per-signal state (config hash, taps, history capacity) | |||
| // - does not yet execute the final stateful polyphase GPU kernel path | |||
| // StreamingExtractGPU is the production entry point for the stateful streaming | |||
| // extractor path. Execution strategy is selected by StreamingExtractGPUExec. | |||
| func (r *BatchRunner) StreamingExtractGPU(iqNew []complex64, jobs []StreamingExtractJob) ([]StreamingExtractResult, error) { | |||
| if r == nil || r.eng == nil { | |||
| return nil, ErrUnavailable | |||
| } | |||
| if results, err := r.StreamingExtractGPUExec(iqNew, jobs); err == nil { | |||
| return results, nil | |||
| } | |||
| _, _ = iqNew, jobs | |||
| return nil, fmt.Errorf("StreamingExtractGPU not implemented yet: stateful polyphase GPU path pending") | |||
| return r.StreamingExtractGPUExec(iqNew, jobs) | |||
| } | |||
| @@ -2,7 +2,7 @@ package gpudemod | |||
| import "testing" | |||
| func TestStreamingGPUStubRemainsExplicitlyUnimplemented(t *testing.T) { | |||
| func TestStreamingGPUUsesSafeProductionDefault(t *testing.T) { | |||
| r := &BatchRunner{eng: &Engine{sampleRate: 4000000}, streamState: make(map[int64]*ExtractStreamState)} | |||
| job := StreamingExtractJob{ | |||
| SignalID: 1, | |||
| @@ -13,9 +13,15 @@ func TestStreamingGPUStubRemainsExplicitlyUnimplemented(t *testing.T) { | |||
| ConfigHash: 777, | |||
| } | |||
| iq := makeDeterministicIQ(1000) | |||
| _, err := r.StreamingExtractGPU(iq, []StreamingExtractJob{job}) | |||
| if err == nil { | |||
| t.Fatalf("expected not-implemented error from GPU stub") | |||
| results, err := r.StreamingExtractGPU(iq, []StreamingExtractJob{job}) | |||
| if err != nil { | |||
| t.Fatalf("expected safe production default path, got error: %v", err) | |||
| } | |||
| if len(results) != 1 { | |||
| t.Fatalf("expected 1 result, got %d", len(results)) | |||
| } | |||
| if results[0].NOut == 0 { | |||
| t.Fatalf("expected non-zero output count from safe production path") | |||
| } | |||
| } | |||
| @@ -0,0 +1,213 @@ | |||
| package gpudemod | |||
| import ( | |||
| "math" | |||
| "testing" | |||
| ) | |||
// streamingValidationStep is one chunk of a multi-step streaming validation
// sequence: the IQ samples to feed in and the jobs to run against them.
type streamingValidationStep struct {
	name string                // label used in failure messages
	iq   []complex64           // input samples for this step (may be empty/nil)
	jobs []StreamingExtractJob // jobs submitted with this chunk
}
| type streamingPreparedExecutor func(*BatchRunner, []StreamingGPUInvocation) ([]StreamingGPUExecutionResult, error) | |||
// makeToneNoiseIQ synthesizes n deterministic IQ samples: a unit tone whose
// phase accumulates by phaseInc per sample, mixed 85/15 with a fixed
// pseudo-noise component derived from the sample index.
func makeToneNoiseIQ(n int, phaseInc float64) []complex64 {
	samples := make([]complex64, n)
	acc := 0.0 // accumulated (not recomputed) so rounding matches streaming use
	for idx := range samples {
		k := float64(idx)
		carrier := complex(math.Cos(acc), math.Sin(acc))
		re := 0.17*math.Cos(0.113*k+0.31) + 0.07*math.Sin(0.071*k)
		im := 0.13*math.Sin(0.097*k+0.11) - 0.05*math.Cos(0.043*k)
		samples[idx] = complex64(0.85*carrier + 0.15*complex(re, im))
		acc += phaseInc
	}
	return samples
}
| func makeStreamingValidationSteps(iq []complex64, chunkSizes []int, jobs []StreamingExtractJob) []streamingValidationStep { | |||
| steps := make([]streamingValidationStep, 0, len(chunkSizes)+1) | |||
| pos := 0 | |||
| for idx, n := range chunkSizes { | |||
| if n < 0 { | |||
| n = 0 | |||
| } | |||
| end := pos + n | |||
| if end > len(iq) { | |||
| end = len(iq) | |||
| } | |||
| steps = append(steps, streamingValidationStep{ | |||
| name: "chunk", | |||
| iq: append([]complex64(nil), iq[pos:end]...), | |||
| jobs: append([]StreamingExtractJob(nil), jobs...), | |||
| }) | |||
| _ = idx | |||
| pos = end | |||
| } | |||
| if pos < len(iq) { | |||
| steps = append(steps, streamingValidationStep{ | |||
| name: "remainder", | |||
| iq: append([]complex64(nil), iq[pos:]...), | |||
| jobs: append([]StreamingExtractJob(nil), jobs...), | |||
| }) | |||
| } | |||
| return steps | |||
| } | |||
| func requirePhaseClose(t *testing.T, got float64, want float64, tol float64) { | |||
| t.Helper() | |||
| diff := got - want | |||
| for diff > math.Pi { | |||
| diff -= 2 * math.Pi | |||
| } | |||
| for diff < -math.Pi { | |||
| diff += 2 * math.Pi | |||
| } | |||
| if math.Abs(diff) > tol { | |||
| t.Fatalf("phase mismatch: got=%0.12f want=%0.12f diff=%0.12f tol=%0.12f", got, want, diff, tol) | |||
| } | |||
| } | |||
// requireStreamingExtractResultMatchesOracle fails the test unless the scalar
// fields of got (signal id, rate, output count, phase count, history length)
// exactly equal want's. IQ sample contents are compared separately by callers
// with a tolerance.
func requireStreamingExtractResultMatchesOracle(t *testing.T, got StreamingExtractResult, want StreamingExtractResult) {
	t.Helper()
	if got.SignalID != want.SignalID {
		t.Fatalf("signal id mismatch: got=%d want=%d", got.SignalID, want.SignalID)
	}
	if got.Rate != want.Rate {
		t.Fatalf("rate mismatch for signal %d: got=%d want=%d", got.SignalID, got.Rate, want.Rate)
	}
	if got.NOut != want.NOut {
		t.Fatalf("n_out mismatch for signal %d: got=%d want=%d", got.SignalID, got.NOut, want.NOut)
	}
	if got.PhaseCount != want.PhaseCount {
		t.Fatalf("phase count mismatch for signal %d: got=%d want=%d", got.SignalID, got.PhaseCount, want.PhaseCount)
	}
	if got.HistoryLen != want.HistoryLen {
		t.Fatalf("history len mismatch for signal %d: got=%d want=%d", got.SignalID, got.HistoryLen, want.HistoryLen)
	}
}
// requirePreparedExecutionResultMatchesOracle checks a raw prepared-execution
// result against both the oracle's result (scalar fields) and the oracle's
// post-step state (phase counter, wrapped NCO phase within phaseTol, and the
// carried shifted history within sampleTol).
func requirePreparedExecutionResultMatchesOracle(t *testing.T, got StreamingGPUExecutionResult, want StreamingExtractResult, oracleState *CPUOracleState, sampleTol float64, phaseTol float64) {
	t.Helper()
	if oracleState == nil {
		t.Fatalf("missing oracle state for signal %d", got.SignalID)
	}
	if got.SignalID != want.SignalID {
		t.Fatalf("signal id mismatch: got=%d want=%d", got.SignalID, want.SignalID)
	}
	if got.Rate != want.Rate {
		t.Fatalf("rate mismatch for signal %d: got=%d want=%d", got.SignalID, got.Rate, want.Rate)
	}
	if got.NOut != want.NOut {
		t.Fatalf("n_out mismatch for signal %d: got=%d want=%d", got.SignalID, got.NOut, want.NOut)
	}
	if got.PhaseCountOut != oracleState.PhaseCount {
		t.Fatalf("phase count mismatch for signal %d: got=%d want=%d", got.SignalID, got.PhaseCountOut, oracleState.PhaseCount)
	}
	requirePhaseClose(t, got.NCOPhaseOut, oracleState.NCOPhase, phaseTol)
	if got.HistoryLenOut != len(oracleState.ShiftedHistory) {
		t.Fatalf("history len mismatch for signal %d: got=%d want=%d", got.SignalID, got.HistoryLenOut, len(oracleState.ShiftedHistory))
	}
	requireComplexSlicesClose(t, got.IQ, want.IQ, sampleTol)
	requireComplexSlicesClose(t, got.HistoryOut, oracleState.ShiftedHistory, sampleTol)
}
// requireExtractStateMatchesOracle compares the runner's persistent per-signal
// stream state with the oracle's: exact equality on ids, config hash, decim,
// tap and phase counts; NCO phase within phaseTol (wrapped); shifted history
// within sampleTol per sample.
func requireExtractStateMatchesOracle(t *testing.T, got *ExtractStreamState, want *CPUOracleState, phaseTol float64, sampleTol float64) {
	t.Helper()
	if got == nil || want == nil {
		t.Fatalf("state mismatch: got nil=%t want nil=%t", got == nil, want == nil)
	}
	if got.SignalID != want.SignalID {
		t.Fatalf("signal id mismatch: got=%d want=%d", got.SignalID, want.SignalID)
	}
	if got.ConfigHash != want.ConfigHash {
		t.Fatalf("config hash mismatch for signal %d: got=%d want=%d", got.SignalID, got.ConfigHash, want.ConfigHash)
	}
	if got.Decim != want.Decim {
		t.Fatalf("decim mismatch for signal %d: got=%d want=%d", got.SignalID, got.Decim, want.Decim)
	}
	if got.NumTaps != want.NumTaps {
		t.Fatalf("num taps mismatch for signal %d: got=%d want=%d", got.SignalID, got.NumTaps, want.NumTaps)
	}
	if got.PhaseCount != want.PhaseCount {
		t.Fatalf("phase count mismatch for signal %d: got=%d want=%d", got.SignalID, got.PhaseCount, want.PhaseCount)
	}
	requirePhaseClose(t, got.NCOPhase, want.NCOPhase, phaseTol)
	requireComplexSlicesClose(t, got.ShiftedHistory, want.ShiftedHistory, sampleTol)
}
| func requireStateKeysMatchOracle(t *testing.T, got map[int64]*ExtractStreamState, want map[int64]*CPUOracleState) { | |||
| t.Helper() | |||
| if len(got) != len(want) { | |||
| t.Fatalf("active state count mismatch: got=%d want=%d", len(got), len(want)) | |||
| } | |||
| for signalID := range want { | |||
| if got[signalID] == nil { | |||
| t.Fatalf("missing active state for signal %d", signalID) | |||
| } | |||
| } | |||
| for signalID := range got { | |||
| if want[signalID] == nil { | |||
| t.Fatalf("unexpected active state for signal %d", signalID) | |||
| } | |||
| } | |||
| } | |||
// runStreamingExecSequenceAgainstOracle feeds each validation step through the
// runner's StreamingExtractGPUExec entry point and the CPU oracle in lockstep,
// failing on any divergence in results, IQ samples (within sampleTol),
// per-signal stream state (within phaseTol/sampleTol), or active-signal set.
func runStreamingExecSequenceAgainstOracle(t *testing.T, runner *BatchRunner, steps []streamingValidationStep, sampleTol float64, phaseTol float64) {
	t.Helper()
	oracle := NewCPUOracleRunner(runner.eng.sampleRate)
	for idx, step := range steps {
		got, err := runner.StreamingExtractGPUExec(step.iq, step.jobs)
		if err != nil {
			t.Fatalf("step %d (%s): exec failed: %v", idx, step.name, err)
		}
		want, err := oracle.StreamingExtract(step.iq, step.jobs)
		if err != nil {
			t.Fatalf("step %d (%s): oracle failed: %v", idx, step.name, err)
		}
		if len(got) != len(want) {
			t.Fatalf("step %d (%s): result count mismatch: got=%d want=%d", idx, step.name, len(got), len(want))
		}
		for i, job := range step.jobs {
			requireStreamingExtractResultMatchesOracle(t, got[i], want[i])
			requireComplexSlicesClose(t, got[i].IQ, want[i].IQ, sampleTol)
			requireExtractStateMatchesOracle(t, runner.streamState[job.SignalID], oracle.States[job.SignalID], phaseTol, sampleTol)
		}
		requireStateKeysMatchOracle(t, runner.streamState, oracle.States)
	}
}
// runPreparedSequenceAgainstOracle is the lower-level counterpart of
// runStreamingExecSequenceAgainstOracle: it builds invocations explicitly,
// runs them through the supplied prepared executor, applies the raw execution
// results back onto the runner, and compares both the raw and applied results
// plus the resulting stream state against the CPU oracle after every step.
func runPreparedSequenceAgainstOracle(t *testing.T, runner *BatchRunner, exec streamingPreparedExecutor, steps []streamingValidationStep, sampleTol float64, phaseTol float64) {
	t.Helper()
	oracle := NewCPUOracleRunner(runner.eng.sampleRate)
	for idx, step := range steps {
		invocations, err := runner.buildStreamingGPUInvocations(step.iq, step.jobs)
		if err != nil {
			t.Fatalf("step %d (%s): build invocations failed: %v", idx, step.name, err)
		}
		got, err := exec(runner, invocations)
		if err != nil {
			t.Fatalf("step %d (%s): prepared exec failed: %v", idx, step.name, err)
		}
		want, err := oracle.StreamingExtract(step.iq, step.jobs)
		if err != nil {
			t.Fatalf("step %d (%s): oracle failed: %v", idx, step.name, err)
		}
		if len(got) != len(want) {
			t.Fatalf("step %d (%s): result count mismatch: got=%d want=%d", idx, step.name, len(got), len(want))
		}
		applied := runner.applyStreamingGPUExecutionResults(got)
		if len(applied) != len(want) {
			t.Fatalf("step %d (%s): applied result count mismatch: got=%d want=%d", idx, step.name, len(applied), len(want))
		}
		for i, job := range step.jobs {
			oracleState := oracle.States[job.SignalID]
			requirePreparedExecutionResultMatchesOracle(t, got[i], want[i], oracleState, sampleTol, phaseTol)
			requireStreamingExtractResultMatchesOracle(t, applied[i], want[i])
			requireComplexSlicesClose(t, applied[i].IQ, want[i].IQ, sampleTol)
			requireExtractStateMatchesOracle(t, runner.streamState[job.SignalID], oracleState, phaseTol, sampleTol)
		}
		requireStateKeysMatchOracle(t, runner.streamState, oracle.States)
	}
}
| @@ -4,7 +4,7 @@ package gpudemod | |||
| /* | |||
| #cgo windows CFLAGS: -I"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include" | |||
| #cgo windows LDFLAGS: -lcudart64_13 -lkernel32 | |||
| #cgo windows LDFLAGS: -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/bin/x64" -l:cudart64_13.dll -lkernel32 | |||
| #include <windows.h> | |||
| #include <stdlib.h> | |||
| #include <cuda_runtime.h> | |||
| @@ -27,6 +27,7 @@ typedef int (__stdcall *gpud_launch_decimate_fn)(const gpud_float2* in, gpud_flo | |||
| typedef int (__stdcall *gpud_launch_am_envelope_fn)(const gpud_float2* in, float* out, int n); | |||
| typedef int (__stdcall *gpud_launch_ssb_product_fn)(const gpud_float2* in, float* out, int n, double phase_inc, double phase_start); | |||
| typedef int (__stdcall *gpud_launch_streaming_polyphase_prepare_fn)(const gpud_float2* in_new, int n_new, const gpud_float2* history_in, int history_len, const float* polyphase_taps, int polyphase_len, int decim, int num_taps, int phase_count_in, double phase_start, double phase_inc, gpud_float2* out, int* n_out, int* phase_count_out, double* phase_end_out, gpud_float2* history_out); | |||
| typedef int (__stdcall *gpud_launch_streaming_polyphase_stateful_fn)(const gpud_float2* in_new, int n_new, gpud_float2* shifted_new_tmp, const float* polyphase_taps, int polyphase_len, int decim, int num_taps, gpud_float2* history_state, gpud_float2* history_scratch, int history_cap, int* history_len_io, int* phase_count_state, double* phase_state, double phase_inc, gpud_float2* out, int out_cap, int* n_out); | |||
| static HMODULE gpud_mod = NULL; | |||
| static gpud_stream_create_fn gpud_p_stream_create = NULL; | |||
| @@ -44,6 +45,7 @@ static gpud_launch_decimate_fn gpud_p_launch_decimate = NULL; | |||
| static gpud_launch_am_envelope_fn gpud_p_launch_am_envelope = NULL; | |||
| static gpud_launch_ssb_product_fn gpud_p_launch_ssb_product = NULL; | |||
| static gpud_launch_streaming_polyphase_prepare_fn gpud_p_launch_streaming_polyphase_prepare = NULL; | |||
| static gpud_launch_streaming_polyphase_stateful_fn gpud_p_launch_streaming_polyphase_stateful = NULL; | |||
| static int gpud_cuda_malloc(void **ptr, size_t bytes) { return (int)cudaMalloc(ptr, bytes); } | |||
| static int gpud_cuda_free(void *ptr) { return (int)cudaFree(ptr); } | |||
| @@ -70,6 +72,7 @@ static int gpud_load_library(const char* path) { | |||
| gpud_p_launch_am_envelope = (gpud_launch_am_envelope_fn)GetProcAddress(gpud_mod, "gpud_launch_am_envelope_cuda"); | |||
| gpud_p_launch_ssb_product = (gpud_launch_ssb_product_fn)GetProcAddress(gpud_mod, "gpud_launch_ssb_product_cuda"); | |||
| gpud_p_launch_streaming_polyphase_prepare = (gpud_launch_streaming_polyphase_prepare_fn)GetProcAddress(gpud_mod, "gpud_launch_streaming_polyphase_prepare_cuda"); | |||
| gpud_p_launch_streaming_polyphase_stateful = (gpud_launch_streaming_polyphase_stateful_fn)GetProcAddress(gpud_mod, "gpud_launch_streaming_polyphase_stateful_cuda"); | |||
| if (!gpud_p_stream_create || !gpud_p_stream_destroy || !gpud_p_stream_sync || !gpud_p_upload_fir_taps || !gpud_p_launch_freq_shift_stream || !gpud_p_launch_freq_shift || !gpud_p_launch_fm_discrim || !gpud_p_launch_fir_stream || !gpud_p_launch_fir || !gpud_p_launch_decimate_stream || !gpud_p_launch_decimate || !gpud_p_launch_am_envelope || !gpud_p_launch_ssb_product) { | |||
| FreeLibrary(gpud_mod); | |||
| gpud_mod = NULL; | |||
| @@ -93,6 +96,7 @@ static int gpud_launch_decimate(gpud_float2 *in, gpud_float2 *out, int n_out, in | |||
| static int gpud_launch_am_envelope(gpud_float2 *in, float *out, int n) { if (!gpud_p_launch_am_envelope) return -1; return gpud_p_launch_am_envelope(in, out, n); } | |||
| static int gpud_launch_ssb_product(gpud_float2 *in, float *out, int n, double phase_inc, double phase_start) { if (!gpud_p_launch_ssb_product) return -1; return gpud_p_launch_ssb_product(in, out, n, phase_inc, phase_start); } | |||
| static int gpud_launch_streaming_polyphase_prepare(gpud_float2 *in_new, int n_new, gpud_float2 *history_in, int history_len, float *polyphase_taps, int polyphase_len, int decim, int num_taps, int phase_count_in, double phase_start, double phase_inc, gpud_float2 *out, int *n_out, int *phase_count_out, double *phase_end_out, gpud_float2 *history_out) { if (!gpud_p_launch_streaming_polyphase_prepare) return -1; return gpud_p_launch_streaming_polyphase_prepare(in_new, n_new, history_in, history_len, polyphase_taps, polyphase_len, decim, num_taps, phase_count_in, phase_start, phase_inc, out, n_out, phase_count_out, phase_end_out, history_out); } | |||
| static int gpud_launch_streaming_polyphase_stateful(gpud_float2 *in_new, int n_new, gpud_float2 *shifted_new_tmp, float *polyphase_taps, int polyphase_len, int decim, int num_taps, gpud_float2 *history_state, gpud_float2 *history_scratch, int history_cap, int *history_len_io, int *phase_count_state, double *phase_state, double phase_inc, gpud_float2 *out, int out_cap, int *n_out) { if (!gpud_p_launch_streaming_polyphase_stateful) return -1; return gpud_p_launch_streaming_polyphase_stateful(in_new, n_new, shifted_new_tmp, polyphase_taps, polyphase_len, decim, num_taps, history_state, history_scratch, history_cap, history_len_io, phase_count_state, phase_state, phase_inc, out, out_cap, n_out); } | |||
| */ | |||
| import "C" | |||
| @@ -107,41 +111,68 @@ func bridgeLoadLibrary(path string) int { | |||
| defer C.free(unsafe.Pointer(cp)) | |||
| return int(C.gpud_load_library(cp)) | |||
| } | |||
| func bridgeCudaMalloc(ptr *unsafe.Pointer, bytes uintptr) int { return int(C.gpud_cuda_malloc(ptr, C.size_t(bytes))) } | |||
| func bridgeCudaMalloc(ptr *unsafe.Pointer, bytes uintptr) int { | |||
| return int(C.gpud_cuda_malloc(ptr, C.size_t(bytes))) | |||
| } | |||
| func bridgeCudaFree(ptr unsafe.Pointer) int { return int(C.gpud_cuda_free(ptr)) } | |||
| func bridgeMemcpyH2D(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { return int(C.gpud_memcpy_h2d(dst, src, C.size_t(bytes))) } | |||
| func bridgeMemcpyD2H(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { return int(C.gpud_memcpy_d2h(dst, src, C.size_t(bytes))) } | |||
| func bridgeMemcpyH2D(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { | |||
| return int(C.gpud_memcpy_h2d(dst, src, C.size_t(bytes))) | |||
| } | |||
| func bridgeMemcpyD2H(dst unsafe.Pointer, src unsafe.Pointer, bytes uintptr) int { | |||
| return int(C.gpud_memcpy_d2h(dst, src, C.size_t(bytes))) | |||
| } | |||
| func bridgeDeviceSync() int { return int(C.gpud_device_sync()) } | |||
| func bridgeUploadFIRTaps(taps *C.float, n int) int { return int(C.gpud_upload_fir_taps(taps, C.int(n))) } | |||
| func bridgeUploadFIRTaps(taps *C.float, n int) int { | |||
| return int(C.gpud_upload_fir_taps(taps, C.int(n))) | |||
| } | |||
| func bridgeLaunchFreqShift(in *C.gpud_float2, out *C.gpud_float2, n int, phaseInc float64, phaseStart float64) int { | |||
| return int(C.gpud_launch_freq_shift(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart))) | |||
| } | |||
| func bridgeLaunchFreqShiftStream(in *C.gpud_float2, out *C.gpud_float2, n int, phaseInc float64, phaseStart float64, stream streamHandle) int { | |||
| return int(C.gpud_launch_freq_shift_stream(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart), C.gpud_stream_handle(stream))) | |||
| } | |||
| func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int) int { return int(C.gpud_launch_fir(in, out, C.int(n), C.int(numTaps))) } | |||
| func bridgeLaunchFIR(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int) int { | |||
| return int(C.gpud_launch_fir(in, out, C.int(n), C.int(numTaps))) | |||
| } | |||
| func bridgeLaunchFIRStream(in *C.gpud_float2, out *C.gpud_float2, n int, numTaps int, stream streamHandle) int { | |||
| return int(C.gpud_launch_fir_stream(in, out, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) | |||
| } | |||
| func bridgeLaunchFIRv2Stream(in *C.gpud_float2, out *C.gpud_float2, taps *C.float, n int, numTaps int, stream streamHandle) int { | |||
| return int(C.gpud_launch_fir_v2_stream(in, out, taps, C.int(n), C.int(numTaps), C.gpud_stream_handle(stream))) | |||
| } | |||
| func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) } | |||
| func bridgeLaunchDecimate(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int) int { | |||
| return int(C.gpud_launch_decimate(in, out, C.int(nOut), C.int(factor))) | |||
| } | |||
| func bridgeLaunchDecimateStream(in *C.gpud_float2, out *C.gpud_float2, nOut int, factor int, stream streamHandle) int { | |||
| return int(C.gpud_launch_decimate_stream(in, out, C.int(nOut), C.int(factor), C.gpud_stream_handle(stream))) | |||
| } | |||
| func bridgeLaunchFMDiscrim(in *C.gpud_float2, out *C.float, n int) int { return int(C.gpud_launch_fm_discrim(in, out, C.int(n))) } | |||
| func bridgeLaunchAMEnvelope(in *C.gpud_float2, out *C.float, n int) int { return int(C.gpud_launch_am_envelope(in, out, C.int(n))) } | |||
| func bridgeLaunchFMDiscrim(in *C.gpud_float2, out *C.float, n int) int { | |||
| return int(C.gpud_launch_fm_discrim(in, out, C.int(n))) | |||
| } | |||
| func bridgeLaunchAMEnvelope(in *C.gpud_float2, out *C.float, n int) int { | |||
| return int(C.gpud_launch_am_envelope(in, out, C.int(n))) | |||
| } | |||
| func bridgeLaunchSSBProduct(in *C.gpud_float2, out *C.float, n int, phaseInc float64, phaseStart float64) int { | |||
| return int(C.gpud_launch_ssb_product(in, out, C.int(n), C.double(phaseInc), C.double(phaseStart))) | |||
| } | |||
| // bridgeLaunchStreamingPolyphasePrepare is a transitional bridge for the | |||
| // legacy single-call prepare path. The stateful native path uses | |||
| // bridgeLaunchStreamingPolyphaseStateful. | |||
| func bridgeLaunchStreamingPolyphasePrepare(inNew *C.gpud_float2, nNew int, historyIn *C.gpud_float2, historyLen int, polyphaseTaps *C.float, polyphaseLen int, decim int, numTaps int, phaseCountIn int, phaseStart float64, phaseInc float64, out *C.gpud_float2, nOut *C.int, phaseCountOut *C.int, phaseEndOut *C.double, historyOut *C.gpud_float2) int { | |||
| return int(C.gpud_launch_streaming_polyphase_prepare(inNew, C.int(nNew), historyIn, C.int(historyLen), polyphaseTaps, C.int(polyphaseLen), C.int(decim), C.int(numTaps), C.int(phaseCountIn), C.double(phaseStart), C.double(phaseInc), out, nOut, phaseCountOut, phaseEndOut, historyOut)) | |||
| } | |||
| func bridgeLaunchStreamingPolyphaseStateful(inNew *C.gpud_float2, nNew int, shiftedNewTmp *C.gpud_float2, polyphaseTaps *C.float, polyphaseLen int, decim int, numTaps int, historyState *C.gpud_float2, historyScratch *C.gpud_float2, historyCap int, historyLenIO *C.int, phaseCountState *C.int, phaseState *C.double, phaseInc float64, out *C.gpud_float2, outCap int, nOut *C.int) int { | |||
| return int(C.gpud_launch_streaming_polyphase_stateful(inNew, C.int(nNew), shiftedNewTmp, polyphaseTaps, C.int(polyphaseLen), C.int(decim), C.int(numTaps), historyState, historyScratch, C.int(historyCap), historyLenIO, phaseCountState, phaseState, C.double(phaseInc), out, C.int(outCap), nOut)) | |||
| } | |||
| func bridgeStreamCreate() (streamHandle, int) { | |||
| var s C.gpud_stream_handle | |||
| res := int(C.gpud_stream_create(&s)) | |||
| return streamHandle(s), res | |||
| } | |||
| func bridgeStreamDestroy(stream streamHandle) int { return int(C.gpud_stream_destroy(C.gpud_stream_handle(stream))) } | |||
| func bridgeStreamSync(stream streamHandle) int { return int(C.gpud_stream_sync(C.gpud_stream_handle(stream))) } | |||
| func bridgeStreamDestroy(stream streamHandle) int { | |||
| return int(C.gpud_stream_destroy(C.gpud_stream_handle(stream))) | |||
| } | |||
| func bridgeStreamSync(stream streamHandle) int { | |||
| return int(C.gpud_stream_sync(C.gpud_stream_handle(stream))) | |||
| } | |||