From 1bfadc2f7ccd23239fa0fbafe52b01d8e84202b8 Mon Sep 17 00:00:00 2001 From: Samuel Bouchet Date: Wed, 25 Mar 2026 22:30:50 +0100 Subject: [PATCH] Phase 2.3: GPU compute culling with frustum + backface cull MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compute shader fills indirect args buffer, replacing CPU cull loop. Single DrawInstancedIndirectCount renders all visible face groups. Key fixes: - Compute shader: pack chunkIndex|(faceIndex<<16) in push constant, startVertexLocation=0 (aligned with Phase 2.2 SV_VertexID fix) - PushConstants must be called AFTER BindPipelineState, not before. Wicked Engine dispatches to SetGraphicsRoot32BitConstants only when active_pso is set; after BindComputeShader it targets compute instead. - Barriers: UNDEFINED(COMMON)→UAV before compute, UAV→INDIRECT_ARGUMENT after - Buffer decay: DX12 buffers always return to COMMON between frames, no cross-frame state tracking needed --- CLAUDE.md | 31 ++++++++++++++++++++++++++----- shaders/voxelCullCS.hlsl | 13 +++++++------ src/voxel/VoxelRenderer.cpp | 36 +++++++++++++++++++++--------------- src/voxel/VoxelRenderer.h | 3 +-- 4 files changed, 55 insertions(+), 28 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index eb3a1c9..791efb4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -173,7 +173,25 @@ Les shaders custom doivent respecter le **binding model de Wicked Engine** : Les buffers `Usage::DEFAULT` démarrent en COMMON et décayent vers COMMON après chaque exécution de command list. La promotion implicite COMMON → COPY_DST (via UpdateBuffer) et COMMON → INDIRECT_ARGUMENT (via DrawInstancedIndirectCount) fonctionne sans barriers explicites. C'est le même pattern que les SRV buffers (megaQuadBuffer_, chunkInfoBuffer_) qui passent de COPY_DST à SRV usage sans barrier en Phase 2.1. - **⚠️ Pour la Phase 2.3 (compute cull)**, des barriers explicites SERONT nécessaires : COMMON → UAV n'est PAS supporté par la promotion implicite. Il faudra alors un barrier UAV → INDIRECT_ARGUMENT après le compute dispatch. + **⚠️ Pour la Phase 2.3 (compute cull)**, des barriers explicites SONT nécessaires : + - `drawCountBuffer_` : COPY_DST → UAV (après UpdateBuffer zero) puis UAV → INDIRECT_ARGUMENT (après dispatch) + - `indirectArgsBuffer_` : UNDEFINED → UAV (COMMON après decay, `ResourceState::UNDEFINED = 0` = COMMON en Wicked) puis UAV → INDIRECT_ARGUMENT + - Wicked Engine appelle `DiscardResource()` quand `state_before == UNDEFINED`, ce qui est OK (le compute écrase les données) + +10. **PushConstants après BindComputeShader — PIÈGE MAJEUR** : + + `PushConstants()` dispatche vers `SetGraphicsRoot32BitConstants` ou `SetComputeRoot32BitConstants` selon l'état actif : + - Si `active_pso != nullptr` → **GRAPHICS** push constants + - Sinon si `active_cs != nullptr` → **COMPUTE** push constants + + Après `BindComputeShader` + `Dispatch`, `active_cs` reste actif. Appeler `PushConstants` à ce moment écrit dans les push constants **compute**, pas **graphics**. Le vertex shader ne voit jamais la valeur ! + + **Règle** : toujours appeler `PushConstants` **APRÈS** `BindPipelineState` (qui set `active_pso`) pour cibler les push constants graphics. L'ordre correct : + ```cpp + BindPipelineState(&pso_); // ← active_pso = &pso_ + PushConstants(&data, ...); // ← SetGraphicsRoot32BitConstants ✓ + Draw*(...); + ``` ### Diagnostics et debugging @@ -306,12 +324,15 @@ Découpée en sous-phases pour isoler les sources de bugs potentiels : - `SV_VertexID` n'inclut pas `startVertexLocation` avec ExecuteIndirect — voir point 8 - Pas de barriers explicites nécessaires — voir point 9 -#### Phase 2.3 - GPU compute culling [A FAIRE] +#### Phase 2.3 - GPU compute culling [FAIT] - Le compute shader `voxelCullCS.hlsl` remplace le CPU pour remplir les indirect args -- Barriers DX12 : COPY_DST → UAV (pre-compute) → INDIRECT_ARGUMENT (post-compute) -- GPU timestamp queries pour mesurer le coût du culling -- **Prérequis** : 2.2 doit fonctionner d'abord (garantit que le MDI rendering marche) +- Barriers DX12 : UNDEFINED → UAV (pre-compute) → INDIRECT_ARGUMENT (post-compute) +- GPU timestamp queries actifs (GPU Cull ~0.006 ms pour 168 chunks) +- **Pièges résolus** : + - `PushConstants` DOIT être appelé APRÈS `BindPipelineState` — voir point 10 + - Compute shader corrigé : push constant packing + startVertexLocation=0 — voir points 7-8 + - `ResourceState::UNDEFINED` = COMMON en Wicked (valeur 0), déclenche `DiscardResource()` — OK pour les buffers réécrits #### Phase 2.4 - GPU compute mesher (benchmark) [A FAIRE] diff --git a/shaders/voxelCullCS.hlsl b/shaders/voxelCullCS.hlsl index 138b4ad..8ada552 100644 --- a/shaders/voxelCullCS.hlsl +++ b/shaders/voxelCullCS.hlsl @@ -80,15 +80,16 @@ void main(uint3 DTid : SV_DispatchThreadID) uint drawIdx; drawCount.InterlockedAdd(0, 1, drawIdx); - // The face group's quads start at (chunk's mega-buffer offset + face offset within chunk) - uint faceQuadOffset = info.quadOffset + getFaceOffset(info, f); - + // Pack chunkIndex (low 16 bits) + faceIndex (high 16 bits) into push constant. + // The VS unpacks this to look up quadOffset from GPUChunkInfo. + // startVertexLocation = 0 because SV_VertexID does not reliably include it + // with ExecuteIndirect (see CLAUDE.md point 8). IndirectDrawArgsInstanced args; - args.pushConstant = 0; // written to b999[0] by ExecuteIndirect (unused in MDI VS path) + args.pushConstant = chunkIdx | (f << 16); args.vertexCountPerInstance = fCnt * 6; args.instanceCount = 1; - args.startVertexLocation = faceQuadOffset * 6; - args.startInstanceLocation = chunkIdx; + args.startVertexLocation = 0; + args.startInstanceLocation = 0; indirectArgs[drawIdx] = args; } } diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index 4378a69..6d483fa 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -150,13 +150,12 @@ void VoxelRenderer::createPipeline() { wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error); return; } - // GPU cull shader loads but MDI path is disabled pending barrier debugging. - // CPU fallback with per-face-group DrawInstanced + backface culling is used instead. - gpuCullingEnabled_ = false; if (cullShader_.IsValid()) { - wi::backlog::post("VoxelRenderer: cull compute shader compiled (GPU cull path disabled, using CPU fallback)"); + gpuCullingEnabled_ = true; + wi::backlog::post("VoxelRenderer: GPU cull compute shader enabled"); } else { - wi::backlog::post("VoxelRenderer: cull compute shader not available", wi::backlog::LogLevel::Warning); + gpuCullingEnabled_ = false; + wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU fallback", wi::backlog::LogLevel::Warning); } // Pipeline: backface cull, depth test, opaque blend, triangle list @@ -385,16 +384,20 @@ void VoxelRenderer::render( // ── GPU Cull + MDI path ──────────────────────────────────────── if (gpuCullingEnabled_) { - // Zero the draw count buffer (sets state to COPY_DST) + // DX12 buffer decay: all buffers return to COMMON after ExecuteCommandLists. + // So every frame starts clean — no cross-frame state tracking needed. + + // Zero the draw count via UpdateBuffer (COMMON → COPY_DST implicit promotion) uint32_t zero = 0; dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t)); - // Touch indirect args buffer to establish COPY_DST state - dev->UpdateBuffer(&indirectArgsBuffer_, &zero, cmd, sizeof(uint32_t)); - // Barriers: COPY_DST → UAV for compute shader writes + // Barriers to UAV for compute shader writes: + // - drawCountBuffer_: COPY_DST → UAV (was promoted to COPY_DST by UpdateBuffer) + // - indirectArgsBuffer_: COMMON → UAV (explicit, required because COMMON can't + // be implicitly promoted to UAV) GPUBarrier preBarriers[] = { GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), }; dev->Barrier(preBarriers, 2, cmd); @@ -419,11 +422,6 @@ void VoxelRenderer::render( }; dev->Barrier(postBarriers, 2, cmd); - // Set MDI flag in push constants (VS uses binary search for chunk index) - VoxelPush pushData = {}; - pushData.flags = 1; // MDI mode - dev->PushConstants(&pushData, sizeof(pushData), cmd); - // ── Render pass ──────────────────────────────────────────── RenderPassImage rp[] = { RenderPassImage::RenderTarget( @@ -461,6 +459,14 @@ void VoxelRenderer::render( dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindSampler(&sampler_, 0, cmd); + // IMPORTANT: PushConstants must be called AFTER BindPipelineState. + // Wicked Engine's PushConstants uses SetGraphicsRoot32BitConstants only + // when active_pso is set. If called before (with active_cs from compute), + // it would set COMPUTE push constants instead of GRAPHICS ones. + VoxelPush pushData = {}; + pushData.flags = 1; // MDI mode + dev->PushConstants(&pushData, sizeof(pushData), cmd); + // Timestamp: draw begin dev->QueryEnd(×tampHeap_, TS_DRAW_BEGIN, cmd); diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h index d9bff54..d8322f3 100644 --- a/src/voxel/VoxelRenderer.h +++ b/src/voxel/VoxelRenderer.h @@ -99,9 +99,8 @@ private: wi::graphics::GPUBuffer indirectArgsBuffer_; // IndirectDrawArgs[MAX_DRAWS] wi::graphics::GPUBuffer drawCountBuffer_; // uint32_t[1] mutable std::vector cpuIndirectArgs_; - bool gpuCullingEnabled_ = false; // GPU compute cull vs CPU fallback + bool gpuCullingEnabled_ = true; // Phase 2.3: GPU compute cull (true) vs CPU fallback (false) bool mdiEnabled_ = true; // Phase 2.2: MDI rendering with CPU-filled indirect args - mutable bool indirectBuffersInArgState_ = false; // DX12 resource state tracking // Constants buffer (must match HLSL VoxelCB) struct VoxelConstants {