From 0d93cef8f1d61fa1451d9bcd5644c2fd87e6f3f5 Mon Sep 17 00:00:00 2001 From: Samuel Bouchet Date: Tue, 31 Mar 2026 02:21:11 +0200 Subject: [PATCH] GPU profiling + staggered BLAS builds + RT disable during animation - Add comprehensive GPU timestamp queries for all major operations (mesh, smooth mesh, BLAS extract, BLAS build, draw, RT shadows) - Add full-frame profiling: Wicked Render, GPU Wait/Sync, true FPS - Stagger BLAS builds during animation: alternate blocky/smooth per frame, skip toping BLAS entirely (~130ms savings per frame) - Auto-disable RT shadows on F3 animation start (prevents stale shadow artifacts), auto-restore on F3 stop with full BLAS rebuild - Split buildAccelerationStructures() with selective build flags - Result: animation ~24 FPS (CPU-bound on Regenerate 27ms) vs previous 2 FPS (GPU-bound on BLAS Build 1368ms) --- src/voxel/VoxelRenderer.cpp | 247 +++++++++++++++++++++++++++--------- src/voxel/VoxelRenderer.h | 64 ++++++++-- 2 files changed, 237 insertions(+), 74 deletions(-) diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index eed1f8b..8c0e18f 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -818,18 +818,16 @@ void VoxelRenderer::dispatchTopingBLASExtract(CommandList cmd) const { topingBLASDirty_ = false; } -void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { +void VoxelRenderer::buildAccelerationStructures(CommandList cmd, + uint32_t buildFlags) const { if (!rtAvailable_) return; auto* dev = device_; // ── Blocky BLAS ────────────────────────────────────────────── uint32_t blockyVertCount = rtBlockyVertexCount_; - if (blockyVertCount < 3) blockyVertCount = 0; // Need at least 1 triangle - if (blockyVertCount > 0 && blasPositionBuffer_.IsValid()) { - // Only (re)create BLAS when vertex count exceeds allocated capacity. - // CreateRaytracingAccelerationStructure allocates GPU memory — calling it per-frame leaks VRAM. 
- // We allocate with headroom and update desc.vertex_count before each Build. + if (blockyVertCount < 3) blockyVertCount = 0; + if ((buildFlags & RT_BUILD_BLOCKY) && blockyVertCount > 0 && blasPositionBuffer_.IsValid()) { if (!blockyBLAS_.IsValid() || blockyVertCount > blockyBLASCapacity_) { blockyBLASCapacity_ = blockyVertCount + blockyVertCount / 4; // 25% headroom @@ -843,7 +841,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; geom.triangles.vertex_buffer = blasPositionBuffer_; geom.triangles.vertex_byte_offset = 0; - geom.triangles.vertex_count = blockyBLASCapacity_; // allocate for capacity + geom.triangles.vertex_count = blockyBLASCapacity_; geom.triangles.vertex_stride = sizeof(float) * 3; geom.triangles.vertex_format = Format::R32G32B32_FLOAT; geom.triangles.index_buffer = blasIndexBuffer_; @@ -851,8 +849,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { geom.triangles.index_format = IndexBufferFormat::UINT32; geom.triangles.index_offset = 0; - bool ok = dev->CreateRaytracingAccelerationStructure(&desc, - &blockyBLAS_); + bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &blockyBLAS_); if (ok) { dev->SetName(&blockyBLAS_, "VoxelRenderer::blockyBLAS"); wi::backlog::post("VoxelRenderer: blocky BLAS created (capacity " @@ -864,7 +861,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { } } - // Update actual vertex count in descriptor before Build + // Update actual vertex count, then Build blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = blockyVertCount; blockyBLAS_.desc.bottom_level.geometries[0].triangles.index_count = blockyVertCount; dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr); @@ -877,10 +874,9 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { bool useGpuSmooth = smoothCentroidShader_.IsValid() && 
smoothMeshShader_.IsValid(); const GPUBuffer& smoothVB = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_; - if (smoothVertCount > 0 && smoothVB.IsValid()) { - // Capacity-based: only recreate when exceeding allocated capacity + if ((buildFlags & RT_BUILD_SMOOTH) && smoothVertCount > 0 && smoothVB.IsValid()) { if (!smoothBLAS_.IsValid() || smoothVertCount > smoothBLASCapacity_) { - smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4; // 25% headroom + smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4; RaytracingAccelerationStructureDesc desc; desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; @@ -893,15 +889,14 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { geom.triangles.vertex_buffer = smoothVB; geom.triangles.vertex_byte_offset = 0; geom.triangles.vertex_count = smoothBLASCapacity_; - geom.triangles.vertex_stride = 32; // SmoothVtx struct = 32 bytes, position at offset 0 + geom.triangles.vertex_stride = 32; geom.triangles.index_buffer = blasIndexBuffer_; geom.triangles.index_count = smoothBLASCapacity_; geom.triangles.index_format = IndexBufferFormat::UINT32; geom.triangles.index_offset = 0; geom.triangles.vertex_format = Format::R32G32B32_FLOAT; - bool ok = dev->CreateRaytracingAccelerationStructure(&desc, - &smoothBLAS_); + bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &smoothBLAS_); if (ok) { dev->SetName(&smoothBLAS_, "VoxelRenderer::smoothBLAS"); wi::backlog::post("VoxelRenderer: smooth BLAS created (capacity " @@ -912,7 +907,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { } if (smoothBLAS_.IsValid()) { - // Update actual vertex count before Build smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = smoothVertCount; smoothBLAS_.desc.bottom_level.geometries[0].triangles.index_count = smoothVertCount; dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr); @@ -923,14 +917,13 @@ void 
VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { // ── Toping BLAS ────────────────────────────────────────────── uint32_t topingVertCount = rtTopingVertexCount_; - if (topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) { - // Capacity-based: only recreate when exceeding allocated capacity + if ((buildFlags & RT_BUILD_TOPING) && topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) { if (!topingBLAS_.IsValid() || topingVertCount > topingBLASASCapacity_) { - topingBLASASCapacity_ = topingVertCount + topingVertCount / 4; // 25% headroom + topingBLASASCapacity_ = topingVertCount + topingVertCount / 4; RaytracingAccelerationStructureDesc desc; desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; - desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; // fast rebuild for animation + desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; desc.bottom_level.geometries.resize(1); auto& geom = desc.bottom_level.geometries[0]; @@ -957,7 +950,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { } if (topingBLAS_.IsValid()) { - // Update actual vertex count before Build topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = topingVertCount; topingBLAS_.desc.bottom_level.geometries[0].triangles.index_count = topingVertCount; dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr); @@ -1056,7 +1048,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { wi::backlog::post("VoxelRenderer: TLAS created (" + std::to_string(instanceCount) + " instances)"); } - // Rebuild TLAS (picks up rebuilt BLASes with new vertex data) dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr); // Memory barrier: sync TLAS build before ray queries can use it @@ -1998,7 +1989,9 @@ void VoxelRenderPath::createRenderTargets() { // ── WASD camera input ─────────────────────────────────────────── static constexpr wi::input::BUTTON KEY_W 
= (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('W' - 'A')); +static constexpr wi::input::BUTTON KEY_Z = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('Z' - 'A')); static constexpr wi::input::BUTTON KEY_A = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('A' - 'A')); +static constexpr wi::input::BUTTON KEY_Q = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('Q' - 'A')); static constexpr wi::input::BUTTON KEY_S = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('S' - 'A')); static constexpr wi::input::BUTTON KEY_D = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('D' - 'A')); @@ -2010,6 +2003,16 @@ void VoxelRenderPath::handleInput(float dt) { // F3: toggle animated terrain if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) { animatedTerrain_ = !animatedTerrain_; + if (animatedTerrain_) { + // Save RT state and disable shadows during animation (stale BLAS = wrong shadows) + rtWasEnabled_ = renderer.rtShadowsEnabled_; + renderer.rtShadowsEnabled_ = false; + } else { + // Force full RT rebuild (including topings) when animation stops + renderer.rtDirty_ = true; + renderer.topingBLASDirty_ = true; + renderer.rtShadowsEnabled_ = rtWasEnabled_; + } wi::backlog::post(animatedTerrain_ ? 
"Animation: ON (30 Hz)" : "Animation: OFF"); } // F4: toggle blend debug visualization @@ -2058,9 +2061,9 @@ void VoxelRenderPath::handleInput(float dt) { float speed = cameraSpeed * dt; if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f; - if (wi::input::Down(KEY_W)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; } + if (wi::input::Down(KEY_Z)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; } if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; } - if (wi::input::Down(KEY_A)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; } + if (wi::input::Down(KEY_Q)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; } if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; } if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed; if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed; @@ -2073,6 +2076,13 @@ void VoxelRenderPath::handleInput(float dt) { void VoxelRenderPath::Update(float dt) { auto frameStart = std::chrono::high_resolution_clock::now(); + frameStartTime_ = frameStart; + // Measure GPU wait: time from last Compose() end to this Update() start + // This captures Present() GPU sync + OS scheduling + if (lastComposeEndValid_) { + float gpuWaitMs = std::chrono::duration<float, std::milli>(frameStart - lastComposeEnd_).count(); + if (gpuWaitMs > 0.01f) profGpuWait_.add(gpuWaitMs); + } lastDt_ = dt; float instantFps = (dt > 0.0f) ? 
(1.0f / dt) : 0.0f; smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f; @@ -2165,7 +2175,10 @@ void VoxelRenderPath::Update(float dt) { } void VoxelRenderPath::Render() const { + auto tWicked0 = std::chrono::high_resolution_clock::now(); RenderPath3D::Render(); + auto tWicked1 = std::chrono::high_resolution_clock::now(); + profWickedRender_.add(std::chrono::duration<float, std::milli>(tWicked1 - tWicked0).count()); if (renderer.isInitialized() && camera && rtCreated_) { auto* device = wi::graphics::GetDevice(); @@ -2179,10 +2192,34 @@ void VoxelRenderPath::Render() const { renderer.gpuMeshQuadCount_ = *countData; renderer.totalQuads_ = renderer.gpuMeshQuadCount_; } + // ── GPU Timestamp readback (previous frame's results) ── + { + uint64_t* tsData = (uint64_t*)renderer.timestampReadback_.mapped_data; + if (tsData) { + double freq = (double)device->GetTimestampFrequency(); + auto readTs = [&](uint32_t begin, uint32_t end) -> float { + if (freq > 0.0 && tsData[end] > tsData[begin]) + return (float)((double)(tsData[end] - tsData[begin]) / freq * 1000.0); + return 0.0f; + }; + renderer.gpuMeshTimeMs_ = readTs(VoxelRenderer::TS_GPU_MESH_BEGIN, VoxelRenderer::TS_GPU_MESH_END); + renderer.gpuSmoothMeshTimeMs_ = readTs(VoxelRenderer::TS_GPU_SMOOTH_BEGIN, VoxelRenderer::TS_GPU_SMOOTH_END); + renderer.gpuBLASExtractTimeMs_ = readTs(VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, VoxelRenderer::TS_BLAS_EXTRACT_END); + renderer.gpuBLASBuildTimeMs_ = readTs(VoxelRenderer::TS_BLAS_BUILD_BEGIN, VoxelRenderer::TS_BLAS_BUILD_END); + renderer.gpuDrawTimeMs_ = readTs(VoxelRenderer::TS_DRAW_BEGIN, VoxelRenderer::TS_DRAW_END); + renderer.gpuRTShadowsTimeMs_ = readTs(VoxelRenderer::TS_RT_SHADOWS_BEGIN, VoxelRenderer::TS_RT_SHADOWS_END); + } + } + // Only re-dispatch compute mesher when data changed if (renderer.gpuMeshDirty_) { + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_BEGIN, cmd); + auto t0 = std::chrono::high_resolution_clock::now(); renderer.dispatchGpuMesh(cmd, world, 
&profVoxelPack_, &profGpuUpload_, &profGpuDispatch_); + auto t1 = std::chrono::high_resolution_clock::now(); + profGpuMeshDispatch_.add(std::chrono::duration<float, std::milli>(t1 - t0).count()); + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_END, cmd); } // GPU smooth mesh: readback previous frame's vertex count @@ -2192,7 +2229,9 @@ void VoxelRenderPath::Render() const { } // GPU smooth mesh dispatch (uses same voxelDataBuffer_ already uploaded) if (renderer.gpuSmoothMeshDirty_ && renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) { + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_SMOOTH_BEGIN, cmd); renderer.dispatchGpuSmoothMesh(cmd, world); + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_SMOOTH_END, cmd); } // Re-dispatch next frame if readback not yet available (1-frame delay) if (renderer.gpuSmoothVertexCount_ == 0 && @@ -2200,19 +2239,41 @@ void VoxelRenderPath::Render() const { renderer.gpuSmoothMeshDirty_ = true; } - // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ── - // Must happen BEFORE BLAS build. Fills topingBLASPositionBuffer_ via CS, - // sets rtTopingVertexCount_ and rtDirty_ to trigger BLAS/TLAS rebuild. 
- if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid()) { + // ── GPU compute toping BLAS extraction ── + // Skip during animation (toping BLAS is skipped to save ~130ms GPU) + if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid() && !animatedTerrain_) { + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, cmd); renderer.dispatchTopingBLASExtract(cmd); + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_END, cmd); } // Phase 6.1: BLAS extraction + acceleration structure build - if (renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() && - renderer.gpuMeshQuadCount_ > 0 && - (renderer.rtDirty_ || renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6)) { - renderer.dispatchBLASExtract(cmd); - renderer.buildAccelerationStructures(cmd); + // During animation, stagger builds to avoid 200ms+ GPU spikes: + // - Skip toping BLAS entirely (7.7M tris = ~130ms, decorative only) + // - Alternate blocky/smooth BLAS builds across animation frames + // When not animating, rebuild all immediately. + { + bool needsBuild = renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() && + renderer.gpuMeshQuadCount_ > 0 && + (renderer.rtDirty_ || renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6); + + if (needsBuild) { + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_BEGIN, cmd); + renderer.dispatchBLASExtract(cmd); + + if (animatedTerrain_) { + // Stagger: alternate blocky/smooth each animation frame, skip topings + uint32_t flags = (rtBuildSkipCounter_ & 1) + ? 
VoxelRenderer::RT_BUILD_BLOCKY + : VoxelRenderer::RT_BUILD_SMOOTH; + rtBuildSkipCounter_++; + renderer.buildAccelerationStructures(cmd, flags); + } else { + renderer.buildAccelerationStructures(cmd, VoxelRenderer::RT_BUILD_ALL); + } + + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_END, cmd); + } } } @@ -2239,52 +2300,96 @@ void VoxelRenderPath::Render() const { renderer.smoothVertexDirty_ = false; } - auto tRender0 = std::chrono::high_resolution_clock::now(); + // ── Draw passes ── + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_DRAW_BEGIN, cmd); renderer.render(cmd, *camera, voxelDepth_, voxelRT_, voxelNormalRT_); - - // Phase 4: render topings (separate render pass, preserves voxel output) renderer.renderTopings(cmd, topingSystem, voxelDepth_, voxelRT_, voxelNormalRT_); - - // Phase 5: render smooth surfaces (separate render pass, preserves all prior output) renderer.renderSmooth(cmd, voxelDepth_, voxelRT_, voxelNormalRT_); + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_DRAW_END, cmd); - // Phase 6.2: RT Shadows (modulates voxelRT_ in-place after all geometry is rendered) + // Phase 6.2: RT Shadows + AO if (renderer.isRTShadowsEnabled() && renderer.isRTReady()) { + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_BEGIN, cmd); renderer.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_); + device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_END, cmd); } - auto tRender1 = std::chrono::high_resolution_clock::now(); - profRender_.add(std::chrono::duration<float, std::milli>(tRender1 - tRender0).count()); + + // Resolve GPU timestamps for readback next frame + device->QueryResolve(&renderer.timestampHeap_, 0, VoxelRenderer::TS_COUNT, + &renderer.timestampReadback_, 0, cmd); } } void VoxelRenderPath::logProfilingAverages() const { - char msg[1024]; + char msg[2048]; snprintf(msg, sizeof(msg), "=== PERF PROFILE (avg over %.0fs) ===\n" - " Regenerate: %7.2f ms (%u calls)\n" - " 
UpdateMeshes: %7.2f ms (%u calls)\n" - " VoxelPack: %7.2f ms (%u calls)\n" - " GPU Upload: %7.2f ms (%u calls)\n" - " GPU Dispatch: %7.2f ms (%u calls)\n" - " SmoothMesh: %7.2f ms (%u calls)\n" - " SmoothUpload: %7.2f ms (%u calls)\n" - " TopingCollect: %7.2f ms (%u calls)\n" - " TopingUpload: %7.2f ms (%u calls)\n" - " Render: %7.2f ms (%u calls)\n" - " Frame (Upd): %7.2f ms (%u calls, %.1f FPS)", + " ── Update() ──\n" + " Regenerate: %7.2f ms (%u calls)\n" + " UpdateMeshes: %7.2f ms (%u calls)\n" + " TopingCollect: %7.2f ms (%u calls)\n" + " TopingUpload: %7.2f ms (%u calls)\n" + " SmoothMesh(CPU): %7.2f ms (%u calls)\n" + " SmoothUpload: %7.2f ms (%u calls)\n" + " Update total: %7.2f ms (%u calls)\n" + " ── Render() ──\n" + " GPU Mesh (pack): %7.2f ms (%u calls)\n" + " GPU Mesh (up): %7.2f ms (%u calls)\n" + " GPU Mesh (disp): %7.2f ms (%u calls)\n" + " GPU MeshTotal: %7.2f ms (%u calls)\n" + " GPU SmoothMesh: %7.2f ms (%u calls)\n" + " BLAS Extract: %7.2f ms (%u calls)\n" + " BLAS/TLAS Build: %7.2f ms (%u calls)\n" + " Deferred Upload: %7.2f ms (%u calls)\n" + " Draw (3 passes): %7.2f ms (%u calls)\n" + " RT Shadows+AO: %7.2f ms (%u calls)\n" + " ── GPU Timings (hardware timestamps) ──\n" + " GPU Mesh: %7.2f ms\n" + " GPU SmoothMesh: %7.2f ms\n" + " GPU BLAS Extract: %7.2f ms\n" + " GPU BLAS Build: %7.2f ms\n" + " GPU Draw: %7.2f ms\n" + " GPU RT Shad+AO: %7.2f ms\n" + " GPU Total: %7.2f ms\n" + " ── Totals ──\n" + " Wicked Render: %7.2f ms (%u calls)\n" + " GPU Wait/Sync: %7.2f ms (%u calls)\n" + " CPU Frame: %7.2f ms (Update→Compose start)\n" + " True Frame: %7.2f ms (Update→Compose end)\n" + " Wall FPS: %7.1f (%u frames in %.0fs)", PROF_INTERVAL, profRegenerate_.avg(), profRegenerate_.count, profUpdateMeshes_.avg(), profUpdateMeshes_.count, + profTopingCollect_.avg(), profTopingCollect_.count, + profTopingUpload_.avg(), profTopingUpload_.count, + profSmoothMesh_.avg(), profSmoothMesh_.count, + profSmoothUpload_.avg(), profSmoothUpload_.count, + 
profFrame_.avg(), profFrame_.count, profVoxelPack_.avg(), profVoxelPack_.count, profGpuUpload_.avg(), profGpuUpload_.count, profGpuDispatch_.avg(), profGpuDispatch_.count, - profSmoothMesh_.avg(), profSmoothMesh_.count, - profSmoothUpload_.avg(), profSmoothUpload_.count, - profTopingCollect_.avg(), profTopingCollect_.count, - profTopingUpload_.avg(), profTopingUpload_.count, + profGpuMeshDispatch_.avg(), profGpuMeshDispatch_.count, + profGpuSmoothDispatch_.avg(), profGpuSmoothDispatch_.count, + profBLASExtract_.avg(), profBLASExtract_.count, + profBLASBuild_.avg(), profBLASBuild_.count, + profDeferredUpload_.avg(), profDeferredUpload_.count, profRender_.avg(), profRender_.count, - profFrame_.avg(), profFrame_.count, - profFrame_.count > 0 ? (1000.0f / profFrame_.avg()) : 0.0f); + profRTShadows_.avg(), profRTShadows_.count, + renderer.gpuMeshTimeMs_, + renderer.gpuSmoothMeshTimeMs_, + renderer.gpuBLASExtractTimeMs_, + renderer.gpuBLASBuildTimeMs_, + renderer.gpuDrawTimeMs_, + renderer.gpuRTShadowsTimeMs_, + renderer.gpuMeshTimeMs_ + renderer.gpuSmoothMeshTimeMs_ + + renderer.gpuBLASExtractTimeMs_ + renderer.gpuBLASBuildTimeMs_ + + renderer.gpuDrawTimeMs_ + renderer.gpuRTShadowsTimeMs_, + profWickedRender_.avg(), profWickedRender_.count, + profGpuWait_.avg(), profGpuWait_.count, + profFullFrame_.avg(), + profTrueFrame_.avg(), + profTrueFrame_.count > 0 ? 
(1000.0f / profTrueFrame_.avg()) : 0.0f, + profTrueFrame_.count, PROF_INTERVAL); wi::backlog::post(msg); profRegenerate_.reset(); @@ -2292,17 +2397,32 @@ void VoxelRenderPath::logProfilingAverages() const { profVoxelPack_.reset(); profGpuUpload_.reset(); profGpuDispatch_.reset(); + profGpuMeshDispatch_.reset(); + profGpuSmoothDispatch_.reset(); profSmoothMesh_.reset(); profSmoothUpload_.reset(); profTopingCollect_.reset(); profTopingUpload_.reset(); + profBLASExtract_.reset(); + profBLASBuild_.reset(); + profDeferredUpload_.reset(); profRender_.reset(); + profRTShadows_.reset(); profFrame_.reset(); + profFullFrame_.reset(); + profGpuWait_.reset(); + profWickedRender_.reset(); + profTrueFrame_.reset(); } void VoxelRenderPath::Compose(CommandList cmd) const { frameCount_++; + // Measure full frame time (Update + Render + Compose start) + auto composeStart = std::chrono::high_resolution_clock::now(); + float fullFrameMs = std::chrono::duration<float, std::milli>(composeStart - frameStartTime_).count(); + if (fullFrameMs > 0.1f) profFullFrame_.add(fullFrameMs); + RenderPath3D::Compose(cmd); if (rtCreated_ && voxelRT_.IsValid()) { @@ -2365,6 +2485,13 @@ void VoxelRenderPath::Compose(CommandList cmd) const { + "] | F5: shd+ao [" + std::string(renderer.rtShadowDebug_ == 1 ? "SHD" : (renderer.rtShadowDebug_ == 2 ? "AO" : (renderer.isRTShadowsEnabled() ? 
"ON" : "OFF"))) + "]"; wi::font::Draw(stats, fp, cmd); + + // Save compose end time for GPU wait measurement + lastComposeEnd_ = std::chrono::high_resolution_clock::now(); + lastComposeEndValid_ = true; + // True frame-to-frame time + float trueFrameMs = std::chrono::duration<float, std::milli>(lastComposeEnd_ - frameStartTime_).count(); + if (trueFrameMs > 0.1f) profTrueFrame_.add(trueFrameMs); } void VoxelRenderPath::setCamera(float x, float y, float z, float pitch, float yaw) { diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h index 172a186..5a6f300 100644 --- a/src/voxel/VoxelRenderer.h +++ b/src/voxel/VoxelRenderer.h @@ -228,7 +228,13 @@ private: mutable uint32_t tlasInstanceCount_ = 0; // track TLAS instance count to avoid per-frame recreation void dispatchBLASExtract(wi::graphics::CommandList cmd) const; - void buildAccelerationStructures(wi::graphics::CommandList cmd) const; + // Flags for selective BLAS rebuild + static constexpr uint32_t RT_BUILD_BLOCKY = 1 << 0; + static constexpr uint32_t RT_BUILD_SMOOTH = 1 << 1; + static constexpr uint32_t RT_BUILD_TOPING = 1 << 2; + static constexpr uint32_t RT_BUILD_ALL = RT_BUILD_BLOCKY | RT_BUILD_SMOOTH | RT_BUILD_TOPING; + void buildAccelerationStructures(wi::graphics::CommandList cmd, + uint32_t buildFlags = RT_BUILD_ALL) const; // ── RT Shadows + AO (Phase 6.2 + 6.3) ────────────────────────── wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader @@ -254,19 +260,29 @@ private: void dispatchGpuSmoothMesh(wi::graphics::CommandList cmd, const VoxelWorld& world) const; void rebuildChunkInfoOnly(VoxelWorld& world); - // ── GPU Timestamp Queries (Phase 2 benchmark) ──────────────── + // ── GPU Timestamp Queries (comprehensive GPU profiling) ──────── wi::graphics::GPUQueryHeap timestampHeap_; wi::graphics::GPUBuffer timestampReadback_; - static constexpr uint32_t TS_CULL_BEGIN = 0; - static constexpr uint32_t TS_CULL_END = 1; - static constexpr uint32_t TS_DRAW_BEGIN = 2; - static constexpr uint32_t 
TS_DRAW_END = 3; - static constexpr uint32_t TS_MESH_BEGIN = 4; - static constexpr uint32_t TS_MESH_END = 5; - static constexpr uint32_t TS_COUNT = 6; - mutable float gpuCullTimeMs_ = 0.0f; - mutable float gpuDrawTimeMs_ = 0.0f; + // Timestamp slots: pairs of (BEGIN, END) for each GPU phase + static constexpr uint32_t TS_GPU_MESH_BEGIN = 0; + static constexpr uint32_t TS_GPU_MESH_END = 1; + static constexpr uint32_t TS_GPU_SMOOTH_BEGIN = 2; + static constexpr uint32_t TS_GPU_SMOOTH_END = 3; + static constexpr uint32_t TS_BLAS_EXTRACT_BEGIN = 4; + static constexpr uint32_t TS_BLAS_EXTRACT_END = 5; + static constexpr uint32_t TS_BLAS_BUILD_BEGIN = 6; + static constexpr uint32_t TS_BLAS_BUILD_END = 7; + static constexpr uint32_t TS_DRAW_BEGIN = 8; + static constexpr uint32_t TS_DRAW_END = 9; + static constexpr uint32_t TS_RT_SHADOWS_BEGIN = 10; + static constexpr uint32_t TS_RT_SHADOWS_END = 11; + static constexpr uint32_t TS_COUNT = 12; mutable float gpuMeshTimeMs_ = 0.0f; + mutable float gpuSmoothMeshTimeMs_ = 0.0f; + mutable float gpuBLASExtractTimeMs_ = 0.0f; + mutable float gpuBLASBuildTimeMs_ = 0.0f; + mutable float gpuDrawTimeMs_ = 0.0f; + mutable float gpuRTShadowsTimeMs_ = 0.0f; // Stats (mutable: updated during const Render() call) mutable uint32_t totalQuads_ = 0; @@ -276,8 +292,13 @@ private: bool initialized_ = false; public: - float getGpuCullTimeMs() const { return gpuCullTimeMs_; } float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; } + float getGpuMeshTimeMs() const { return gpuMeshTimeMs_; } + float getGpuSmoothMeshTimeMs() const { return gpuSmoothMeshTimeMs_; } + float getGpuBLASExtractTimeMs() const { return gpuBLASExtractTimeMs_; } + float getGpuBLASBuildTimeMs() const { return gpuBLASBuildTimeMs_; } + float getGpuRTShadowsTimeMs() const { return gpuRTShadowsTimeMs_; } + void toggleRTShadows() { rtShadowsEnabled_ = !rtShadowsEnabled_; } bool isGpuMeshEnabled() const { return gpuMesherAvailable_; } uint32_t getGpuMeshQuadCount() const { return 
gpuMeshQuadCount_; } @@ -369,12 +390,27 @@ private: mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh - mutable ProfileAccum profRender_; // render() total - mutable ProfileAccum profFrame_; // full frame (Update + Render + Compose) + mutable ProfileAccum profRender_; // render() draw calls + mutable ProfileAccum profFrame_; // full frame (Update only - legacy) + mutable ProfileAccum profFullFrame_; // true full frame (Update + Render + Compose) mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks) mutable ProfileAccum profSmoothUpload_; // uploadSmoothData mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances mutable ProfileAccum profTopingUpload_; // uploadTopingData + mutable ProfileAccum profGpuMeshDispatch_; // GPU mesh compute dispatch (in Render) + mutable ProfileAccum profGpuSmoothDispatch_; // GPU smooth mesh dispatch (in Render) + mutable ProfileAccum profBLASExtract_; // BLAS position extraction compute + mutable ProfileAccum profBLASBuild_; // BLAS/TLAS build + mutable ProfileAccum profDeferredUpload_; // deferred GPU buffer uploads + mutable ProfileAccum profRTShadows_; // RT shadows + AO dispatch + mutable ProfileAccum profGpuWait_; // GPU sync: time between Compose end and next Update start + mutable ProfileAccum profWickedRender_; // RenderPath3D::Render() (Wicked internal) + mutable ProfileAccum profTrueFrame_; // wall-clock frame-to-frame time + mutable std::chrono::high_resolution_clock::time_point frameStartTime_; // for full frame timing + mutable std::chrono::high_resolution_clock::time_point lastComposeEnd_; // for GPU wait measurement + mutable bool lastComposeEndValid_ = false; + mutable uint32_t rtBuildSkipCounter_ = 0; // stagger BLAS builds during animation + mutable bool rtWasEnabled_ = false; // saved RT state before 
animation mutable float profTimer_ = 0.0f; static constexpr float PROF_INTERVAL = 5.0f; void logProfilingAverages() const;