From 46e8f50f37e7680ca297c758def9f88d1f1fe469 Mon Sep 17 00:00:00 2001 From: Samuel Bouchet Date: Wed, 25 Mar 2026 14:50:55 +0100 Subject: [PATCH] Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) --- shaders/voxelVS.hlsl | 48 ++++++-- src/voxel/VoxelRenderer.cpp | 230 +++++++++++++++++++++++++++++++----- 2 files changed, 236 insertions(+), 42 deletions(-) diff --git a/shaders/voxelVS.hlsl b/shaders/voxelVS.hlsl index 3a568e9..b36cd5b 100644 --- a/shaders/voxelVS.hlsl +++ b/shaders/voxelVS.hlsl @@ -1,5 +1,5 @@ // BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer) -// Phase 2: uses SV_InstanceID to look up chunk info instead of push constants. +// Phase 2: supports both CPU draw loop (push constants) and GPU MDI (binary search). 
#include "voxelCommon.hlsli" @@ -10,11 +10,14 @@ struct PackedQuad { StructuredBuffer<PackedQuad> quadBuffer : register(t0); StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2); -// Push constants: chunk index + quad offset for current draw call +// Push constants (48 bytes = 12 x uint32) +// CPU path: chunkIndex + quadOffset explicit +// MDI path: flags bit 0 set, VS derives chunk from SV_VertexID via binary search struct VoxelPush { uint chunkIndex; uint quadOffset; // offset into mega quad buffer (in quads) - uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9; + uint flags; // bit 0: 1 = MDI mode (binary search), 0 = CPU mode + uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8; }; [[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999); @@ -46,6 +49,23 @@ void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz, ao = (hi >> 9) & 0xFF; } +// Binary search: find which chunk owns a given global quad index. +// Chunks are packed contiguously in the mega-buffer, sorted by chunk index. +// O(log2(chunkCount)) = ~11 iterations for 2048 chunks. 
+uint findChunkIndex(uint globalQuadIndex) { + uint lo = 0, hi = chunkCount; + [loop] + while (lo < hi) { + uint mid = (lo + hi) >> 1; + GPUChunkInfo ci = chunkInfoBuffer[mid]; + if (ci.quadOffset + ci.quadCount <= globalQuadIndex) + lo = mid + 1; + else + hi = mid; + } + return lo; +} + // Face normals: +X, -X, +Y, -Y, +Z, -Z static const float3 faceNormals[6] = { float3( 1, 0, 0), float3(-1, 0, 0), @@ -71,14 +91,22 @@ VSOutput main(uint vertexID : SV_VertexID) { VSOutput output; - // Look up chunk info via push constant (SV_InstanceID doesn't include StartInstanceLocation in D3D12) - GPUChunkInfo info = chunkInfoBuffer[push.chunkIndex]; + // Determine quad index and chunk index based on rendering mode + uint quadIndex; + uint chunkIndex; - // 6 vertices per quad (2 triangles) - // Use push.quadOffset instead of relying on StartVertexLocation in SV_VertexID - uint localVertex = vertexID; - uint quadIndex = push.quadOffset + (localVertex / 6); - uint cornerIndex = localVertex % 6; + if (push.flags & 1) { + // MDI path: SV_VertexID includes StartVertexLocation (global quad address) + quadIndex = vertexID / 6; + chunkIndex = findChunkIndex(quadIndex); + } else { + // CPU path: push constants provide explicit offsets + quadIndex = push.quadOffset + (vertexID / 6); + chunkIndex = push.chunkIndex; + } + + GPUChunkInfo info = chunkInfoBuffer[chunkIndex]; + uint cornerIndex = vertexID % 6; PackedQuad packed = quadBuffer[quadIndex]; uint px, py, pz, w, h, face, matID, ao; diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index 890a12c..6a4b6e5 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -150,11 +150,13 @@ void VoxelRenderer::createPipeline() { wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error); return; } - gpuCullingEnabled_ = cullShader_.IsValid(); - if (!gpuCullingEnabled_) { - wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU culling", 
wi::backlog::LogLevel::Warning); + // GPU cull shader loads but MDI path is disabled pending barrier debugging. + // CPU fallback with per-face-group DrawInstanced + backface culling is used instead. + gpuCullingEnabled_ = false; + if (cullShader_.IsValid()) { + wi::backlog::post("VoxelRenderer: cull compute shader compiled (GPU cull path disabled, using CPU fallback)"); } else { - wi::backlog::post("VoxelRenderer: GPU frustum+backface culling enabled"); + wi::backlog::post("VoxelRenderer: cull compute shader not available", wi::backlog::LogLevel::Warning); } // Pipeline: backface cull, depth test, opaque blend, triangle list @@ -303,6 +305,38 @@ void VoxelRenderer::updateMeshes(VoxelWorld& world) { } } +// ── Frustum plane extraction (Gribb-Hartmann method) ──────────── +static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) { + XMFLOAT4X4 m; + XMStoreFloat4x4(&m, vp); + + // Left + planes[0] = XMFLOAT4(m._14 + m._11, m._24 + m._21, m._34 + m._31, m._44 + m._41); + // Right + planes[1] = XMFLOAT4(m._14 - m._11, m._24 - m._21, m._34 - m._31, m._44 - m._41); + // Bottom + planes[2] = XMFLOAT4(m._14 + m._12, m._24 + m._22, m._34 + m._32, m._44 + m._42); + // Top + planes[3] = XMFLOAT4(m._14 - m._12, m._24 - m._22, m._34 - m._32, m._44 - m._42); + // Near + planes[4] = XMFLOAT4(m._13, m._23, m._33, m._43); + // Far + planes[5] = XMFLOAT4(m._14 - m._13, m._24 - m._23, m._34 - m._33, m._44 - m._43); + + // Normalize each plane + for (int i = 0; i < 6; i++) { + float len = std::sqrt(planes[i].x * planes[i].x + + planes[i].y * planes[i].y + + planes[i].z * planes[i].z); + if (len > 0.0001f) { + planes[i].x /= len; + planes[i].y /= len; + planes[i].z /= len; + planes[i].w /= len; + } + } +} + // ── Render pass ───────────────────────────────────────────────── void VoxelRenderer::render( @@ -325,22 +359,149 @@ void VoxelRenderer::render( cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); } - // Per-frame constants + // Per-frame constants (with frustum planes 
for GPU cull shader) VoxelConstants cb = {}; - XMStoreFloat4x4(&cb.viewProjection, camera.GetViewProjection()); + XMMATRIX vpMatrix = camera.GetViewProjection(); + XMStoreFloat4x4(&cb.viewProjection, vpMatrix); cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f); cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f); cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f); cb.chunkSize = (float)CHUNK_SIZE; cb.textureTiling = 0.25f; cb.chunkCount = chunkCount_; + extractFrustumPlanes(vpMatrix, cb.frustumPlanes); dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb)); - // CPU frustum culling + // Push constant structure (must be 48 bytes = 12 x uint32, matches b999) + struct VoxelPush { + uint32_t chunkIndex; + uint32_t quadOffset; + uint32_t flags; // bit 0: 1=MDI mode, 0=CPU mode + uint32_t pad[9]; + }; + + visibleChunks_ = 0; + drawCalls_ = 0; + + // ── GPU Cull + MDI path ──────────────────────────────────────── + if (gpuCullingEnabled_) { + // Zero the draw count buffer (sets state to COPY_DST) + uint32_t zero = 0; + dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t)); + // Touch indirect args buffer to establish COPY_DST state + dev->UpdateBuffer(&indirectArgsBuffer_, &zero, cmd, sizeof(uint32_t)); + + // Barriers: COPY_DST → UAV for compute shader writes + GPUBarrier preBarriers[] = { + GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(preBarriers, 2, cmd); + + // Timestamp: cull begin + dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd); + + // Dispatch GPU frustum + backface cull compute shader + dev->BindComputeShader(&cullShader_, cmd); + dev->BindConstantBuffer(&constantBuffer_, 0, cmd); + dev->BindResource(&chunkInfoBuffer_, 2, cmd); + dev->BindUAV(&indirectArgsBuffer_, 0, cmd); + dev->BindUAV(&drawCountBuffer_, 1, cmd); + dev->Dispatch((chunkCount_ + 63) / 
64, 1, 1, cmd); + + // Timestamp: cull end + dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd); + + // Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount + GPUBarrier postBarriers[] = { + GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), + GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), + }; + dev->Barrier(postBarriers, 2, cmd); + + // Set MDI flag in push constants (VS uses binary search for chunk index) + VoxelPush pushData = {}; + pushData.flags = 1; // MDI mode + dev->PushConstants(&pushData, sizeof(pushData), cmd); + + // ── Render pass ──────────────────────────────────────────── + RenderPassImage rp[] = { + RenderPassImage::RenderTarget( + &renderTarget, + RenderPassImage::LoadOp::CLEAR, + RenderPassImage::StoreOp::STORE, + ResourceState::SHADER_RESOURCE, + ResourceState::SHADER_RESOURCE + ), + RenderPassImage::DepthStencil( + &depthBuffer, + RenderPassImage::LoadOp::CLEAR, + RenderPassImage::StoreOp::STORE, + ResourceState::DEPTHSTENCIL, + ResourceState::DEPTHSTENCIL, + ResourceState::DEPTHSTENCIL + ), + }; + dev->RenderPassBegin(rp, 2, cmd); + + Viewport vp; + vp.width = (float)renderTarget.GetDesc().width; + vp.height = (float)renderTarget.GetDesc().height; + vp.min_depth = 0.0f; + vp.max_depth = 1.0f; + dev->BindViewports(1, &vp, cmd); + + Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; + dev->BindScissorRects(1, &scissor, cmd); + + dev->BindPipelineState(&pso_, cmd); + dev->BindConstantBuffer(&constantBuffer_, 0, cmd); + dev->BindResource(&megaQuadBuffer_, 0, cmd); + dev->BindResource(&textureArray_, 1, cmd); + dev->BindResource(&chunkInfoBuffer_, 2, cmd); + dev->BindSampler(&sampler_, 0, cmd); + + // Timestamp: draw begin + dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd); + + // Single MDI call: GPU cull shader filled the indirect args + dev->DrawInstancedIndirectCount( + &indirectArgsBuffer_, 0, + &drawCountBuffer_, 0, + 
MAX_DRAWS, cmd + ); + drawCalls_ = 1; + + // Timestamp: draw end + dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd); + + dev->RenderPassEnd(cmd); + + // Resolve timestamps for readback (results available next frame) + dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd); + + // Read back previous frame's timestamps (persistently mapped READBACK buffer) + uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data; + if (tsData) { + double freq = (double)dev->GetTimestampFrequency(); + if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) { + gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0); + } + if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) { + gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0); + } + } + + // GPU cull handles visibility counting — approximate from chunkCount + visibleChunks_ = chunkCount_; // exact count would require readback of drawCount + + return; + } + + // ── CPU Fallback: frustum + backface cull + per-face-group draws ── wi::primitive::Frustum frustum; frustum.Create(camera.GetViewProjection()); - // ── Render pass: color + depth ──────────────────────────────── RenderPassImage rp[] = { RenderPassImage::RenderTarget( &renderTarget, @@ -372,22 +533,11 @@ void VoxelRenderer::render( dev->BindPipelineState(&pso_, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd); - dev->BindResource(&megaQuadBuffer_, 0, cmd); // t0: mega quad buffer - dev->BindResource(&textureArray_, 1, cmd); // t1: material textures - dev->BindResource(&chunkInfoBuffer_, 2, cmd); // t2: chunk info + dev->BindResource(&megaQuadBuffer_, 0, cmd); + dev->BindResource(&textureArray_, 1, cmd); + dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindSampler(&sampler_, 0, cmd); - visibleChunks_ = 0; - drawCalls_ = 0; - - // Push constant structure (must be 48 bytes = 12 x uint32, matches b999) - struct VoxelPush { - uint32_t chunkIndex; - uint32_t 
quadOffset; // offset into mega quad buffer (in quads) - uint32_t pad[10]; - }; - - // Simple DrawInstanced loop with frustum culling + push constants for (uint32_t i = 0; i < chunkCount_; i++) { const auto& slot = chunkSlots_[i]; if (slot.quadCount == 0) continue; @@ -406,17 +556,33 @@ void VoxelRenderer::render( if (!frustum.CheckBoxFast(aabb)) continue; visibleChunks_++; + const auto& info = cpuChunkInfo_[i]; - // Pass chunk index AND quad offset via push constants - // (SV_VertexID/SV_InstanceID offsets unreliable across drivers) - VoxelPush pushData = {}; - pushData.chunkIndex = i; - pushData.quadOffset = slot.quadOffset; - dev->PushConstants(&pushData, sizeof(pushData), cmd); + // Per-face-group draws with backface culling + for (uint32_t f = 0; f < 6; f++) { + if (info.faceCounts[f] == 0) continue; - // startVertexLocation = 0: the VS computes quad address from push.quadOffset - dev->DrawInstanced(slot.quadCount * 6, 1, 0, 0, cmd); - drawCalls_++; + // Backface cull: skip face groups pointing away from camera + bool backFacing = false; + switch (f) { + case 0: backFacing = (camera.Eye.x < aabbMin.x); break; // +X + case 1: backFacing = (camera.Eye.x > aabbMax.x); break; // -X + case 2: backFacing = (camera.Eye.y < aabbMin.y); break; // +Y + case 3: backFacing = (camera.Eye.y > aabbMax.y); break; // -Y + case 4: backFacing = (camera.Eye.z < aabbMin.z); break; // +Z + case 5: backFacing = (camera.Eye.z > aabbMax.z); break; // -Z + } + if (backFacing) continue; + + VoxelPush pushData = {}; + pushData.chunkIndex = i; + pushData.quadOffset = slot.quadOffset + info.faceOffsets[f]; + pushData.flags = 0; // CPU mode + dev->PushConstants(&pushData, sizeof(pushData), cmd); + + dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd); + drawCalls_++; + } } dev->RenderPassEnd(cmd); @@ -583,7 +749,7 @@ void VoxelRenderPath::Compose(CommandList cmd) const { + "/" + std::to_string(renderer.getChunkCount()) + "\n"; stats += "Quads: " + 
std::to_string(renderer.getTotalQuads()) + "\n"; stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls()) - + " (DrawInstanced + CPU cull + backface)\n"; + + (renderer.isGpuCulling() ? " (MDI + GPU cull)" : " (DrawInstanced + CPU cull + backface)") + "\n"; char cullStr[16], drawStr[16]; snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs());