GPU profiling + staggered BLAS builds + RT disable during animation

- Add comprehensive GPU timestamp queries for all major operations
  (mesh, smooth mesh, BLAS extract, BLAS build, draw, RT shadows)
- Add full-frame profiling: Wicked Render, GPU Wait/Sync, true FPS
- Stagger BLAS builds during animation: alternate blocky/smooth per
  frame, skip toping BLAS entirely (~130ms savings per frame)
- Auto-disable RT shadows on F3 animation start (prevents stale
  shadow artifacts), auto-restore on F3 stop with full BLAS rebuild
- Add selective build flags to buildAccelerationStructures()
  (RT_BUILD_BLOCKY / RT_BUILD_SMOOTH / RT_BUILD_TOPING; defaults to RT_BUILD_ALL)
- Result: animation ~24 FPS (CPU-bound on Regenerate 27ms)
  vs previous 2 FPS (GPU-bound on BLAS Build 1368ms)
This commit is contained in:
Samuel Bouchet 2026-03-31 02:21:11 +02:00
parent 0d3f8200b4
commit 0d93cef8f1
2 changed files with 237 additions and 74 deletions

View file

@ -818,18 +818,16 @@ void VoxelRenderer::dispatchTopingBLASExtract(CommandList cmd) const {
topingBLASDirty_ = false;
}
void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
void VoxelRenderer::buildAccelerationStructures(CommandList cmd,
uint32_t buildFlags) const {
if (!rtAvailable_) return;
auto* dev = device_;
// ── Blocky BLAS ──────────────────────────────────────────────
uint32_t blockyVertCount = rtBlockyVertexCount_;
if (blockyVertCount < 3) blockyVertCount = 0; // Need at least 1 triangle
if (blockyVertCount > 0 && blasPositionBuffer_.IsValid()) {
// Only (re)create BLAS when vertex count exceeds allocated capacity.
// CreateRaytracingAccelerationStructure allocates GPU memory — calling it per-frame leaks VRAM.
// We allocate with headroom and update desc.vertex_count before each Build.
if (blockyVertCount < 3) blockyVertCount = 0;
if ((buildFlags & RT_BUILD_BLOCKY) && blockyVertCount > 0 && blasPositionBuffer_.IsValid()) {
if (!blockyBLAS_.IsValid() || blockyVertCount > blockyBLASCapacity_) {
blockyBLASCapacity_ = blockyVertCount + blockyVertCount / 4; // 25% headroom
@ -843,7 +841,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE;
geom.triangles.vertex_buffer = blasPositionBuffer_;
geom.triangles.vertex_byte_offset = 0;
geom.triangles.vertex_count = blockyBLASCapacity_; // allocate for capacity
geom.triangles.vertex_count = blockyBLASCapacity_;
geom.triangles.vertex_stride = sizeof(float) * 3;
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
geom.triangles.index_buffer = blasIndexBuffer_;
@ -851,8 +849,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
geom.triangles.index_format = IndexBufferFormat::UINT32;
geom.triangles.index_offset = 0;
bool ok = dev->CreateRaytracingAccelerationStructure(&desc,
&blockyBLAS_);
bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &blockyBLAS_);
if (ok) {
dev->SetName(&blockyBLAS_, "VoxelRenderer::blockyBLAS");
wi::backlog::post("VoxelRenderer: blocky BLAS created (capacity "
@ -864,7 +861,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
}
}
// Update actual vertex count in descriptor before Build
// Update actual vertex count, then Build
blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = blockyVertCount;
blockyBLAS_.desc.bottom_level.geometries[0].triangles.index_count = blockyVertCount;
dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr);
@ -877,10 +874,9 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid();
const GPUBuffer& smoothVB = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_;
if (smoothVertCount > 0 && smoothVB.IsValid()) {
// Capacity-based: only recreate when exceeding allocated capacity
if ((buildFlags & RT_BUILD_SMOOTH) && smoothVertCount > 0 && smoothVB.IsValid()) {
if (!smoothBLAS_.IsValid() || smoothVertCount > smoothBLASCapacity_) {
smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4; // 25% headroom
smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4;
RaytracingAccelerationStructureDesc desc;
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
@ -893,15 +889,14 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
geom.triangles.vertex_buffer = smoothVB;
geom.triangles.vertex_byte_offset = 0;
geom.triangles.vertex_count = smoothBLASCapacity_;
geom.triangles.vertex_stride = 32; // SmoothVtx struct = 32 bytes, position at offset 0
geom.triangles.vertex_stride = 32;
geom.triangles.index_buffer = blasIndexBuffer_;
geom.triangles.index_count = smoothBLASCapacity_;
geom.triangles.index_format = IndexBufferFormat::UINT32;
geom.triangles.index_offset = 0;
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
bool ok = dev->CreateRaytracingAccelerationStructure(&desc,
&smoothBLAS_);
bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &smoothBLAS_);
if (ok) {
dev->SetName(&smoothBLAS_, "VoxelRenderer::smoothBLAS");
wi::backlog::post("VoxelRenderer: smooth BLAS created (capacity "
@ -912,7 +907,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
}
if (smoothBLAS_.IsValid()) {
// Update actual vertex count before Build
smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = smoothVertCount;
smoothBLAS_.desc.bottom_level.geometries[0].triangles.index_count = smoothVertCount;
dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr);
@ -923,14 +917,13 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
// ── Toping BLAS ──────────────────────────────────────────────
uint32_t topingVertCount = rtTopingVertexCount_;
if (topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) {
// Capacity-based: only recreate when exceeding allocated capacity
if ((buildFlags & RT_BUILD_TOPING) && topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) {
if (!topingBLAS_.IsValid() || topingVertCount > topingBLASASCapacity_) {
topingBLASASCapacity_ = topingVertCount + topingVertCount / 4; // 25% headroom
topingBLASASCapacity_ = topingVertCount + topingVertCount / 4;
RaytracingAccelerationStructureDesc desc;
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; // fast rebuild for animation
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD;
desc.bottom_level.geometries.resize(1);
auto& geom = desc.bottom_level.geometries[0];
@ -957,7 +950,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
}
if (topingBLAS_.IsValid()) {
// Update actual vertex count before Build
topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = topingVertCount;
topingBLAS_.desc.bottom_level.geometries[0].triangles.index_count = topingVertCount;
dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr);
@ -1056,7 +1048,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
wi::backlog::post("VoxelRenderer: TLAS created (" + std::to_string(instanceCount) + " instances)");
}
// Rebuild TLAS (picks up rebuilt BLASes with new vertex data)
dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr);
// Memory barrier: sync TLAS build before ray queries can use it
@ -1998,7 +1989,9 @@ void VoxelRenderPath::createRenderTargets() {
// ── WASD camera input ───────────────────────────────────────────
static constexpr wi::input::BUTTON KEY_W = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('W' - 'A'));
static constexpr wi::input::BUTTON KEY_Z = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('Z' - 'A'));
static constexpr wi::input::BUTTON KEY_A = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('A' - 'A'));
static constexpr wi::input::BUTTON KEY_Q = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('Q' - 'A'));
static constexpr wi::input::BUTTON KEY_S = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('S' - 'A'));
static constexpr wi::input::BUTTON KEY_D = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('D' - 'A'));
@ -2010,6 +2003,16 @@ void VoxelRenderPath::handleInput(float dt) {
// F3: toggle animated terrain
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) {
animatedTerrain_ = !animatedTerrain_;
if (animatedTerrain_) {
// Save RT state and disable shadows during animation (stale BLAS = wrong shadows)
rtWasEnabled_ = renderer.rtShadowsEnabled_;
renderer.rtShadowsEnabled_ = false;
} else {
// Force full RT rebuild (including topings) when animation stops
renderer.rtDirty_ = true;
renderer.topingBLASDirty_ = true;
renderer.rtShadowsEnabled_ = rtWasEnabled_;
}
wi::backlog::post(animatedTerrain_ ? "Animation: ON (30 Hz)" : "Animation: OFF");
}
// F4: toggle blend debug visualization
@ -2058,9 +2061,9 @@ void VoxelRenderPath::handleInput(float dt) {
float speed = cameraSpeed * dt;
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f;
if (wi::input::Down(KEY_W)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; }
if (wi::input::Down(KEY_Z)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; }
if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; }
if (wi::input::Down(KEY_A)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; }
if (wi::input::Down(KEY_Q)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; }
if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; }
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed;
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed;
@ -2073,6 +2076,13 @@ void VoxelRenderPath::handleInput(float dt) {
void VoxelRenderPath::Update(float dt) {
auto frameStart = std::chrono::high_resolution_clock::now();
frameStartTime_ = frameStart;
// Measure GPU wait: time from last Compose() end to this Update() start
// This captures Present() GPU sync + OS scheduling
if (lastComposeEndValid_) {
float gpuWaitMs = std::chrono::duration<float, std::milli>(frameStart - lastComposeEnd_).count();
if (gpuWaitMs > 0.01f) profGpuWait_.add(gpuWaitMs);
}
lastDt_ = dt;
float instantFps = (dt > 0.0f) ? (1.0f / dt) : 0.0f;
smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f;
@ -2165,7 +2175,10 @@ void VoxelRenderPath::Update(float dt) {
}
void VoxelRenderPath::Render() const {
auto tWicked0 = std::chrono::high_resolution_clock::now();
RenderPath3D::Render();
auto tWicked1 = std::chrono::high_resolution_clock::now();
profWickedRender_.add(std::chrono::duration<float, std::milli>(tWicked1 - tWicked0).count());
if (renderer.isInitialized() && camera && rtCreated_) {
auto* device = wi::graphics::GetDevice();
@ -2179,10 +2192,34 @@ void VoxelRenderPath::Render() const {
renderer.gpuMeshQuadCount_ = *countData;
renderer.totalQuads_ = renderer.gpuMeshQuadCount_;
}
// ── GPU Timestamp readback (previous frame's results) ──
{
uint64_t* tsData = (uint64_t*)renderer.timestampReadback_.mapped_data;
if (tsData) {
double freq = (double)device->GetTimestampFrequency();
auto readTs = [&](uint32_t begin, uint32_t end) -> float {
if (freq > 0.0 && tsData[end] > tsData[begin])
return (float)((double)(tsData[end] - tsData[begin]) / freq * 1000.0);
return 0.0f;
};
renderer.gpuMeshTimeMs_ = readTs(VoxelRenderer::TS_GPU_MESH_BEGIN, VoxelRenderer::TS_GPU_MESH_END);
renderer.gpuSmoothMeshTimeMs_ = readTs(VoxelRenderer::TS_GPU_SMOOTH_BEGIN, VoxelRenderer::TS_GPU_SMOOTH_END);
renderer.gpuBLASExtractTimeMs_ = readTs(VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, VoxelRenderer::TS_BLAS_EXTRACT_END);
renderer.gpuBLASBuildTimeMs_ = readTs(VoxelRenderer::TS_BLAS_BUILD_BEGIN, VoxelRenderer::TS_BLAS_BUILD_END);
renderer.gpuDrawTimeMs_ = readTs(VoxelRenderer::TS_DRAW_BEGIN, VoxelRenderer::TS_DRAW_END);
renderer.gpuRTShadowsTimeMs_ = readTs(VoxelRenderer::TS_RT_SHADOWS_BEGIN, VoxelRenderer::TS_RT_SHADOWS_END);
}
}
// Only re-dispatch compute mesher when data changed
if (renderer.gpuMeshDirty_) {
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_BEGIN, cmd);
auto t0 = std::chrono::high_resolution_clock::now();
renderer.dispatchGpuMesh(cmd, world,
&profVoxelPack_, &profGpuUpload_, &profGpuDispatch_);
auto t1 = std::chrono::high_resolution_clock::now();
profGpuMeshDispatch_.add(std::chrono::duration<float, std::milli>(t1 - t0).count());
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_END, cmd);
}
// GPU smooth mesh: readback previous frame's vertex count
@ -2192,7 +2229,9 @@ void VoxelRenderPath::Render() const {
}
// GPU smooth mesh dispatch (uses same voxelDataBuffer_ already uploaded)
if (renderer.gpuSmoothMeshDirty_ && renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_SMOOTH_BEGIN, cmd);
renderer.dispatchGpuSmoothMesh(cmd, world);
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_SMOOTH_END, cmd);
}
// Re-dispatch next frame if readback not yet available (1-frame delay)
if (renderer.gpuSmoothVertexCount_ == 0 &&
@ -2200,19 +2239,41 @@ void VoxelRenderPath::Render() const {
renderer.gpuSmoothMeshDirty_ = true;
}
// ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
// Must happen BEFORE BLAS build. Fills topingBLASPositionBuffer_ via CS,
// sets rtTopingVertexCount_ and rtDirty_ to trigger BLAS/TLAS rebuild.
if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid()) {
// ── GPU compute toping BLAS extraction ──
// Skip during animation (toping BLAS is skipped to save ~130ms GPU)
if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid() && !animatedTerrain_) {
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, cmd);
renderer.dispatchTopingBLASExtract(cmd);
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_END, cmd);
}
// Phase 6.1: BLAS extraction + acceleration structure build
if (renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() &&
renderer.gpuMeshQuadCount_ > 0 &&
(renderer.rtDirty_ || renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6)) {
renderer.dispatchBLASExtract(cmd);
renderer.buildAccelerationStructures(cmd);
// During animation, stagger builds to avoid 200ms+ GPU spikes:
// - Skip toping BLAS entirely (7.7M tris = ~130ms, decorative only)
// - Alternate blocky/smooth BLAS builds across animation frames
// When not animating, rebuild all immediately.
{
bool needsBuild = renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() &&
renderer.gpuMeshQuadCount_ > 0 &&
(renderer.rtDirty_ || renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6);
if (needsBuild) {
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_BEGIN, cmd);
renderer.dispatchBLASExtract(cmd);
if (animatedTerrain_) {
// Stagger: alternate blocky/smooth each animation frame, skip topings
uint32_t flags = (rtBuildSkipCounter_ & 1)
? VoxelRenderer::RT_BUILD_BLOCKY
: VoxelRenderer::RT_BUILD_SMOOTH;
rtBuildSkipCounter_++;
renderer.buildAccelerationStructures(cmd, flags);
} else {
renderer.buildAccelerationStructures(cmd, VoxelRenderer::RT_BUILD_ALL);
}
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_END, cmd);
}
}
}
@ -2239,52 +2300,96 @@ void VoxelRenderPath::Render() const {
renderer.smoothVertexDirty_ = false;
}
auto tRender0 = std::chrono::high_resolution_clock::now();
// ── Draw passes ──
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_DRAW_BEGIN, cmd);
renderer.render(cmd, *camera, voxelDepth_, voxelRT_, voxelNormalRT_);
// Phase 4: render topings (separate render pass, preserves voxel output)
renderer.renderTopings(cmd, topingSystem, voxelDepth_, voxelRT_, voxelNormalRT_);
// Phase 5: render smooth surfaces (separate render pass, preserves all prior output)
renderer.renderSmooth(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_DRAW_END, cmd);
// Phase 6.2: RT Shadows (modulates voxelRT_ in-place after all geometry is rendered)
// Phase 6.2: RT Shadows + AO
if (renderer.isRTShadowsEnabled() && renderer.isRTReady()) {
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_BEGIN, cmd);
renderer.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_END, cmd);
}
auto tRender1 = std::chrono::high_resolution_clock::now();
profRender_.add(std::chrono::duration<float, std::milli>(tRender1 - tRender0).count());
// Resolve GPU timestamps for readback next frame
device->QueryResolve(&renderer.timestampHeap_, 0, VoxelRenderer::TS_COUNT,
&renderer.timestampReadback_, 0, cmd);
}
}
void VoxelRenderPath::logProfilingAverages() const {
char msg[1024];
char msg[2048];
snprintf(msg, sizeof(msg),
"=== PERF PROFILE (avg over %.0fs) ===\n"
" Regenerate: %7.2f ms (%u calls)\n"
" UpdateMeshes: %7.2f ms (%u calls)\n"
" VoxelPack: %7.2f ms (%u calls)\n"
" GPU Upload: %7.2f ms (%u calls)\n"
" GPU Dispatch: %7.2f ms (%u calls)\n"
" SmoothMesh: %7.2f ms (%u calls)\n"
" SmoothUpload: %7.2f ms (%u calls)\n"
" TopingCollect: %7.2f ms (%u calls)\n"
" TopingUpload: %7.2f ms (%u calls)\n"
" Render: %7.2f ms (%u calls)\n"
" Frame (Upd): %7.2f ms (%u calls, %.1f FPS)",
" ── Update() ──\n"
" Regenerate: %7.2f ms (%u calls)\n"
" UpdateMeshes: %7.2f ms (%u calls)\n"
" TopingCollect: %7.2f ms (%u calls)\n"
" TopingUpload: %7.2f ms (%u calls)\n"
" SmoothMesh(CPU): %7.2f ms (%u calls)\n"
" SmoothUpload: %7.2f ms (%u calls)\n"
" Update total: %7.2f ms (%u calls)\n"
" ── Render() ──\n"
" GPU Mesh (pack): %7.2f ms (%u calls)\n"
" GPU Mesh (up): %7.2f ms (%u calls)\n"
" GPU Mesh (disp): %7.2f ms (%u calls)\n"
" GPU MeshTotal: %7.2f ms (%u calls)\n"
" GPU SmoothMesh: %7.2f ms (%u calls)\n"
" BLAS Extract: %7.2f ms (%u calls)\n"
" BLAS/TLAS Build: %7.2f ms (%u calls)\n"
" Deferred Upload: %7.2f ms (%u calls)\n"
" Draw (3 passes): %7.2f ms (%u calls)\n"
" RT Shadows+AO: %7.2f ms (%u calls)\n"
" ── GPU Timings (hardware timestamps) ──\n"
" GPU Mesh: %7.2f ms\n"
" GPU SmoothMesh: %7.2f ms\n"
" GPU BLAS Extract: %7.2f ms\n"
" GPU BLAS Build: %7.2f ms\n"
" GPU Draw: %7.2f ms\n"
" GPU RT Shad+AO: %7.2f ms\n"
" GPU Total: %7.2f ms\n"
" ── Totals ──\n"
" Wicked Render: %7.2f ms (%u calls)\n"
" GPU Wait/Sync: %7.2f ms (%u calls)\n"
" CPU Frame: %7.2f ms (Update→Compose start)\n"
" True Frame: %7.2f ms (Update→Compose end)\n"
" Wall FPS: %7.1f (%u frames in %.0fs)",
PROF_INTERVAL,
profRegenerate_.avg(), profRegenerate_.count,
profUpdateMeshes_.avg(), profUpdateMeshes_.count,
profTopingCollect_.avg(), profTopingCollect_.count,
profTopingUpload_.avg(), profTopingUpload_.count,
profSmoothMesh_.avg(), profSmoothMesh_.count,
profSmoothUpload_.avg(), profSmoothUpload_.count,
profFrame_.avg(), profFrame_.count,
profVoxelPack_.avg(), profVoxelPack_.count,
profGpuUpload_.avg(), profGpuUpload_.count,
profGpuDispatch_.avg(), profGpuDispatch_.count,
profSmoothMesh_.avg(), profSmoothMesh_.count,
profSmoothUpload_.avg(), profSmoothUpload_.count,
profTopingCollect_.avg(), profTopingCollect_.count,
profTopingUpload_.avg(), profTopingUpload_.count,
profGpuMeshDispatch_.avg(), profGpuMeshDispatch_.count,
profGpuSmoothDispatch_.avg(), profGpuSmoothDispatch_.count,
profBLASExtract_.avg(), profBLASExtract_.count,
profBLASBuild_.avg(), profBLASBuild_.count,
profDeferredUpload_.avg(), profDeferredUpload_.count,
profRender_.avg(), profRender_.count,
profFrame_.avg(), profFrame_.count,
profFrame_.count > 0 ? (1000.0f / profFrame_.avg()) : 0.0f);
profRTShadows_.avg(), profRTShadows_.count,
renderer.gpuMeshTimeMs_,
renderer.gpuSmoothMeshTimeMs_,
renderer.gpuBLASExtractTimeMs_,
renderer.gpuBLASBuildTimeMs_,
renderer.gpuDrawTimeMs_,
renderer.gpuRTShadowsTimeMs_,
renderer.gpuMeshTimeMs_ + renderer.gpuSmoothMeshTimeMs_ +
renderer.gpuBLASExtractTimeMs_ + renderer.gpuBLASBuildTimeMs_ +
renderer.gpuDrawTimeMs_ + renderer.gpuRTShadowsTimeMs_,
profWickedRender_.avg(), profWickedRender_.count,
profGpuWait_.avg(), profGpuWait_.count,
profFullFrame_.avg(),
profTrueFrame_.avg(),
profTrueFrame_.count > 0 ? (1000.0f / profTrueFrame_.avg()) : 0.0f,
profTrueFrame_.count, PROF_INTERVAL);
wi::backlog::post(msg);
profRegenerate_.reset();
@ -2292,17 +2397,32 @@ void VoxelRenderPath::logProfilingAverages() const {
profVoxelPack_.reset();
profGpuUpload_.reset();
profGpuDispatch_.reset();
profGpuMeshDispatch_.reset();
profGpuSmoothDispatch_.reset();
profSmoothMesh_.reset();
profSmoothUpload_.reset();
profTopingCollect_.reset();
profTopingUpload_.reset();
profBLASExtract_.reset();
profBLASBuild_.reset();
profDeferredUpload_.reset();
profRender_.reset();
profRTShadows_.reset();
profFrame_.reset();
profFullFrame_.reset();
profGpuWait_.reset();
profWickedRender_.reset();
profTrueFrame_.reset();
}
void VoxelRenderPath::Compose(CommandList cmd) const {
frameCount_++;
// Measure full frame time (Update + Render + Compose start)
auto composeStart = std::chrono::high_resolution_clock::now();
float fullFrameMs = std::chrono::duration<float, std::milli>(composeStart - frameStartTime_).count();
if (fullFrameMs > 0.1f) profFullFrame_.add(fullFrameMs);
RenderPath3D::Compose(cmd);
if (rtCreated_ && voxelRT_.IsValid()) {
@ -2365,6 +2485,13 @@ void VoxelRenderPath::Compose(CommandList cmd) const {
+ "] | F5: shd+ao [" + std::string(renderer.rtShadowDebug_ == 1 ? "SHD" : (renderer.rtShadowDebug_ == 2 ? "AO" : (renderer.isRTShadowsEnabled() ? "ON" : "OFF"))) + "]";
wi::font::Draw(stats, fp, cmd);
// Save compose end time for GPU wait measurement
lastComposeEnd_ = std::chrono::high_resolution_clock::now();
lastComposeEndValid_ = true;
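// Frame time from Update() start to Compose() end (does not include the
// Compose-end -> next Update() gap, which is accumulated in profGpuWait_)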
float trueFrameMs = std::chrono::duration<float, std::milli>(lastComposeEnd_ - frameStartTime_).count();
if (trueFrameMs > 0.1f) profTrueFrame_.add(trueFrameMs);
}
void VoxelRenderPath::setCamera(float x, float y, float z, float pitch, float yaw) {

View file

@ -228,7 +228,13 @@ private:
mutable uint32_t tlasInstanceCount_ = 0; // track TLAS instance count to avoid per-frame recreation
void dispatchBLASExtract(wi::graphics::CommandList cmd) const;
void buildAccelerationStructures(wi::graphics::CommandList cmd) const;
// Flags for selective BLAS rebuild
static constexpr uint32_t RT_BUILD_BLOCKY = 1 << 0;
static constexpr uint32_t RT_BUILD_SMOOTH = 1 << 1;
static constexpr uint32_t RT_BUILD_TOPING = 1 << 2;
static constexpr uint32_t RT_BUILD_ALL = RT_BUILD_BLOCKY | RT_BUILD_SMOOTH | RT_BUILD_TOPING;
void buildAccelerationStructures(wi::graphics::CommandList cmd,
uint32_t buildFlags = RT_BUILD_ALL) const;
// ── RT Shadows + AO (Phase 6.2 + 6.3) ──────────────────────────
wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader
@ -254,19 +260,29 @@ private:
void dispatchGpuSmoothMesh(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
void rebuildChunkInfoOnly(VoxelWorld& world);
// ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
// ── GPU Timestamp Queries (comprehensive GPU profiling) ────────
wi::graphics::GPUQueryHeap timestampHeap_;
wi::graphics::GPUBuffer timestampReadback_;
static constexpr uint32_t TS_CULL_BEGIN = 0;
static constexpr uint32_t TS_CULL_END = 1;
static constexpr uint32_t TS_DRAW_BEGIN = 2;
static constexpr uint32_t TS_DRAW_END = 3;
static constexpr uint32_t TS_MESH_BEGIN = 4;
static constexpr uint32_t TS_MESH_END = 5;
static constexpr uint32_t TS_COUNT = 6;
mutable float gpuCullTimeMs_ = 0.0f;
mutable float gpuDrawTimeMs_ = 0.0f;
// Timestamp slots: pairs of (BEGIN, END) for each GPU phase
static constexpr uint32_t TS_GPU_MESH_BEGIN = 0;
static constexpr uint32_t TS_GPU_MESH_END = 1;
static constexpr uint32_t TS_GPU_SMOOTH_BEGIN = 2;
static constexpr uint32_t TS_GPU_SMOOTH_END = 3;
static constexpr uint32_t TS_BLAS_EXTRACT_BEGIN = 4;
static constexpr uint32_t TS_BLAS_EXTRACT_END = 5;
static constexpr uint32_t TS_BLAS_BUILD_BEGIN = 6;
static constexpr uint32_t TS_BLAS_BUILD_END = 7;
static constexpr uint32_t TS_DRAW_BEGIN = 8;
static constexpr uint32_t TS_DRAW_END = 9;
static constexpr uint32_t TS_RT_SHADOWS_BEGIN = 10;
static constexpr uint32_t TS_RT_SHADOWS_END = 11;
static constexpr uint32_t TS_COUNT = 12;
mutable float gpuMeshTimeMs_ = 0.0f;
mutable float gpuSmoothMeshTimeMs_ = 0.0f;
mutable float gpuBLASExtractTimeMs_ = 0.0f;
mutable float gpuBLASBuildTimeMs_ = 0.0f;
mutable float gpuDrawTimeMs_ = 0.0f;
mutable float gpuRTShadowsTimeMs_ = 0.0f;
// Stats (mutable: updated during const Render() call)
mutable uint32_t totalQuads_ = 0;
@ -276,8 +292,13 @@ private:
bool initialized_ = false;
public:
float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
float getGpuMeshTimeMs() const { return gpuMeshTimeMs_; }
float getGpuSmoothMeshTimeMs() const { return gpuSmoothMeshTimeMs_; }
float getGpuBLASExtractTimeMs() const { return gpuBLASExtractTimeMs_; }
float getGpuBLASBuildTimeMs() const { return gpuBLASBuildTimeMs_; }
float getGpuRTShadowsTimeMs() const { return gpuRTShadowsTimeMs_; }
void toggleRTShadows() { rtShadowsEnabled_ = !rtShadowsEnabled_; }
bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
@ -369,12 +390,27 @@ private:
mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh
mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh
mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh
mutable ProfileAccum profRender_; // render() total
mutable ProfileAccum profFrame_; // full frame (Update + Render + Compose)
mutable ProfileAccum profRender_; // render() draw calls
mutable ProfileAccum profFrame_; // full frame (Update only - legacy)
mutable ProfileAccum profFullFrame_; // true full frame (Update + Render + Compose)
mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks)
mutable ProfileAccum profSmoothUpload_; // uploadSmoothData
mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances
mutable ProfileAccum profTopingUpload_; // uploadTopingData
mutable ProfileAccum profGpuMeshDispatch_; // GPU mesh compute dispatch (in Render)
mutable ProfileAccum profGpuSmoothDispatch_; // GPU smooth mesh dispatch (in Render)
mutable ProfileAccum profBLASExtract_; // BLAS position extraction compute
mutable ProfileAccum profBLASBuild_; // BLAS/TLAS build
mutable ProfileAccum profDeferredUpload_; // deferred GPU buffer uploads
mutable ProfileAccum profRTShadows_; // RT shadows + AO dispatch
mutable ProfileAccum profGpuWait_; // GPU sync: time between Compose end and next Update start
mutable ProfileAccum profWickedRender_; // RenderPath3D::Render() (Wicked internal)
mutable ProfileAccum profTrueFrame_; // wall-clock frame-to-frame time
mutable std::chrono::high_resolution_clock::time_point frameStartTime_; // for full frame timing
mutable std::chrono::high_resolution_clock::time_point lastComposeEnd_; // for GPU wait measurement
mutable bool lastComposeEndValid_ = false;
mutable uint32_t rtBuildSkipCounter_ = 0; // stagger BLAS builds during animation
mutable bool rtWasEnabled_ = false; // saved RT state before animation
mutable float profTimer_ = 0.0f;
static constexpr float PROF_INTERVAL = 5.0f;
void logProfilingAverages() const;