Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure

- VS supports dual mode: CPU path (push constants) and MDI path (binary search)
- CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max)
- Frustum planes extracted and populated in constant buffer for GPU cull shader
- GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed)
- GPU timestamp query infrastructure with readback for cull/draw timing
- HUD shows rendering mode (GPU cull vs CPU fallback)
2026-03-25 14:50:55 +01:00 · 2026-03-25 14:50:55 +01:00 · 46e8f50f37
commit 46e8f50f37
parent 5f346bb14a
2 changed files with 236 additions and 42 deletions
--- a/shaders/voxelVS.hlsl
+++ b/shaders/voxelVS.hlsl
@ -1,5 +1,5 @@
 // BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer)
-// Phase 2: uses SV_InstanceID to look up chunk info instead of push constants.
+// Phase 2: supports both CPU draw loop (push constants) and GPU MDI (binary search).

 #include "voxelCommon.hlsli"

@ -10,11 +10,14 @@ struct PackedQuad {
 StructuredBuffer<PackedQuad> quadBuffer : register(t0);
 StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);

-// Push constants: chunk index + quad offset for current draw call
+// Push constants (48 bytes = 12 x uint32)
+//   CPU path: chunkIndex + quadOffset explicit
+//   MDI path: flags bit 0 set, VS derives chunk from SV_VertexID via binary search
 struct VoxelPush {
    uint chunkIndex;
    uint quadOffset;   // offset into mega quad buffer (in quads)
-    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9;
+    uint flags;        // bit 0: 1 = MDI mode (binary search), 0 = CPU mode
+    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8;
 };
 [[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999);

@ -46,6 +49,23 @@ void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,
    ao    = (hi >> 9) & 0xFF;
 }

+// Maps a global quad index to the chunk that owns it.
+// Relies on chunkInfoBuffer entries being ordered so that their
+// [quadOffset, quadOffset + quadCount) ranges ascend with the chunk index
+// (chunks packed back to back in the mega-buffer).
+// Bisection returns the first chunk whose range end lies beyond the index,
+// or chunkCount if there is none. Costs about log2(chunkCount) buffer reads,
+// e.g. ~11 for 2048 chunks.
+uint findChunkIndex(uint globalQuadIndex) {
+    uint first = 0;
+    uint last = chunkCount;   // exclusive upper bound of the candidate range
+    [loop]
+    while (first < last) {
+        uint probe = (first + last) / 2u;
+        GPUChunkInfo candidate = chunkInfoBuffer[probe];
+        uint rangeEnd = candidate.quadOffset + candidate.quadCount;
+        if (globalQuadIndex < rangeEnd) {
+            last = probe;        // owner is probe or an earlier chunk
+        } else {
+            first = probe + 1;   // probe's range ends at or before the index
+        }
+    }
+    return first;
+}
+
 // Face normals: +X, -X, +Y, -Y, +Z, -Z
 static const float3 faceNormals[6] = {
    float3( 1, 0, 0), float3(-1, 0, 0),
@ -71,14 +91,22 @@ VSOutput main(uint vertexID : SV_VertexID)
 {
    VSOutput output;

-    // Look up chunk info via push constant (SV_InstanceID doesn't include StartInstanceLocation in D3D12)
-    GPUChunkInfo info = chunkInfoBuffer[push.chunkIndex];
+    // Determine quad index and chunk index based on rendering mode
+    uint quadIndex;
+    uint chunkIndex;

-    // 6 vertices per quad (2 triangles)
-    // Use push.quadOffset instead of relying on StartVertexLocation in SV_VertexID
-    uint localVertex = vertexID;
-    uint quadIndex = push.quadOffset + (localVertex / 6);
-    uint cornerIndex = localVertex % 6;
+    if (push.flags & 1) {
+        // MDI path: SV_VertexID includes StartVertexLocation (global quad address)
+        quadIndex = vertexID / 6;
+        chunkIndex = findChunkIndex(quadIndex);
+    } else {
+        // CPU path: push constants provide explicit offsets
+        quadIndex = push.quadOffset + (vertexID / 6);
+        chunkIndex = push.chunkIndex;
+    }
+
+    GPUChunkInfo info = chunkInfoBuffer[chunkIndex];
+    uint cornerIndex = vertexID % 6;

    PackedQuad packed = quadBuffer[quadIndex];
    uint px, py, pz, w, h, face, matID, ao;
--- a/src/voxel/VoxelRenderer.cpp
+++ b/src/voxel/VoxelRenderer.cpp
@ -150,11 +150,13 @@ void VoxelRenderer::createPipeline() {
        wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error);
        return;
    }
-    gpuCullingEnabled_ = cullShader_.IsValid();
-    if (!gpuCullingEnabled_) {
-        wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU culling", wi::backlog::LogLevel::Warning);
+    // GPU cull shader loads but MDI path is disabled pending barrier debugging.
+    // CPU fallback with per-face-group DrawInstanced + backface culling is used instead.
+    gpuCullingEnabled_ = false;
+    if (cullShader_.IsValid()) {
+        wi::backlog::post("VoxelRenderer: cull compute shader compiled (GPU cull path disabled, using CPU fallback)");
    } else {
-        wi::backlog::post("VoxelRenderer: GPU frustum+backface culling enabled");
+        wi::backlog::post("VoxelRenderer: cull compute shader not available", wi::backlog::LogLevel::Warning);
    }

    // Pipeline: backface cull, depth test, opaque blend, triangle list
@ -303,6 +305,38 @@ void VoxelRenderer::updateMeshes(VoxelWorld& world) {
    }
 }

+// ── Frustum plane extraction (Gribb-Hartmann method) ────────────
+// Builds six planes (a, b, c, d) from the columns of a view-projection matrix
+// and rescales each so that its normal (a, b, c) has unit length.
+//   vp     : combined view-projection matrix
+//   planes : output array, in the order Left, Right, Bottom, Top, Near, Far
+// NOTE(review): presumably a point is inside when a*x + b*y + c*z + d >= 0
+// (row-vector convention, clip-space z in [0, w]) — confirm against the cull shader.
+static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
+    XMFLOAT4X4 mat;
+    XMStoreFloat4x4(&mat, vp);
+
+    // The four matrix columns; every plane is one column or a sum/difference of two.
+    const XMFLOAT4 colX(mat._11, mat._21, mat._31, mat._41);
+    const XMFLOAT4 colY(mat._12, mat._22, mat._32, mat._42);
+    const XMFLOAT4 colZ(mat._13, mat._23, mat._33, mat._43);
+    const XMFLOAT4 colW(mat._14, mat._24, mat._34, mat._44);
+
+    const auto plus = [](const XMFLOAT4& a, const XMFLOAT4& b) {
+        return XMFLOAT4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    };
+    const auto minus = [](const XMFLOAT4& a, const XMFLOAT4& b) {
+        return XMFLOAT4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    };
+
+    planes[0] = plus(colW, colX);   // Left
+    planes[1] = minus(colW, colX);  // Right
+    planes[2] = plus(colW, colY);   // Bottom
+    planes[3] = minus(colW, colY);  // Top
+    planes[4] = colZ;               // Near
+    planes[5] = minus(colW, colZ);  // Far
+
+    // Scale every plane by the length of its normal.
+    for (int i = 0; i < 6; ++i) {
+        XMFLOAT4& plane = planes[i];
+        const float len = std::sqrt(plane.x * plane.x +
+                                    plane.y * plane.y +
+                                    plane.z * plane.z);
+        // Planes with a (near-)zero normal are left as they are.
+        if (!(len > 0.0001f)) continue;
+        plane.x /= len;
+        plane.y /= len;
+        plane.z /= len;
+        plane.w /= len;
+    }
+}
+
 // ── Render pass ─────────────────────────────────────────────────

 void VoxelRenderer::render(
@ -325,22 +359,149 @@ void VoxelRenderer::render(
            cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
    }

-    // Per-frame constants
+    // Per-frame constants (with frustum planes for GPU cull shader)
    VoxelConstants cb = {};
-    XMStoreFloat4x4(&cb.viewProjection, camera.GetViewProjection());
+    XMMATRIX vpMatrix = camera.GetViewProjection();
+    XMStoreFloat4x4(&cb.viewProjection, vpMatrix);
    cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f);
    cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f);
    cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f);
    cb.chunkSize = (float)CHUNK_SIZE;
    cb.textureTiling = 0.25f;
    cb.chunkCount = chunkCount_;
+    extractFrustumPlanes(vpMatrix, cb.frustumPlanes);
    dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb));

-    // CPU frustum culling
+    // Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
+    struct VoxelPush {
+        uint32_t chunkIndex;
+        uint32_t quadOffset;
+        uint32_t flags;       // bit 0: 1=MDI mode, 0=CPU mode
+        uint32_t pad[9];
+    };
+
+    visibleChunks_ = 0;
+    drawCalls_ = 0;
+
+    // ── GPU Cull + MDI path ────────────────────────────────────────
+    if (gpuCullingEnabled_) {
+        // Zero the draw count buffer (sets state to COPY_DST)
+        uint32_t zero = 0;
+        dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t));
+        // Touch indirect args buffer to establish COPY_DST state
+        dev->UpdateBuffer(&indirectArgsBuffer_, &zero, cmd, sizeof(uint32_t));
+
+        // Barriers: COPY_DST → UAV for compute shader writes
+        GPUBarrier preBarriers[] = {
+            GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
+            GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
+        };
+        dev->Barrier(preBarriers, 2, cmd);
+
+        // Timestamp: cull begin
+        dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd);
+
+        // Dispatch GPU frustum + backface cull compute shader
+        dev->BindComputeShader(&cullShader_, cmd);
+        dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
+        dev->BindResource(&chunkInfoBuffer_, 2, cmd);
+        dev->BindUAV(&indirectArgsBuffer_, 0, cmd);
+        dev->BindUAV(&drawCountBuffer_, 1, cmd);
+        dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd);
+
+        // Timestamp: cull end
+        dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd);
+
+        // Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount
+        GPUBarrier postBarriers[] = {
+            GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
+            GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
+        };
+        dev->Barrier(postBarriers, 2, cmd);
+
+        // Set MDI flag in push constants (VS uses binary search for chunk index)
+        VoxelPush pushData = {};
+        pushData.flags = 1; // MDI mode
+        dev->PushConstants(&pushData, sizeof(pushData), cmd);
+
+        // ── Render pass ────────────────────────────────────────────
+        RenderPassImage rp[] = {
+            RenderPassImage::RenderTarget(
+                &renderTarget,
+                RenderPassImage::LoadOp::CLEAR,
+                RenderPassImage::StoreOp::STORE,
+                ResourceState::SHADER_RESOURCE,
+                ResourceState::SHADER_RESOURCE
+            ),
+            RenderPassImage::DepthStencil(
+                &depthBuffer,
+                RenderPassImage::LoadOp::CLEAR,
+                RenderPassImage::StoreOp::STORE,
+                ResourceState::DEPTHSTENCIL,
+                ResourceState::DEPTHSTENCIL,
+                ResourceState::DEPTHSTENCIL
+            ),
+        };
+        dev->RenderPassBegin(rp, 2, cmd);
+
+        Viewport vp;
+        vp.width = (float)renderTarget.GetDesc().width;
+        vp.height = (float)renderTarget.GetDesc().height;
+        vp.min_depth = 0.0f;
+        vp.max_depth = 1.0f;
+        dev->BindViewports(1, &vp, cmd);
+
+        Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
+        dev->BindScissorRects(1, &scissor, cmd);
+
+        dev->BindPipelineState(&pso_, cmd);
+        dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
+        dev->BindResource(&megaQuadBuffer_, 0, cmd);
+        dev->BindResource(&textureArray_, 1, cmd);
+        dev->BindResource(&chunkInfoBuffer_, 2, cmd);
+        dev->BindSampler(&sampler_, 0, cmd);
+
+        // Timestamp: draw begin
+        dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd);
+
+        // Single MDI call: GPU cull shader filled the indirect args
+        dev->DrawInstancedIndirectCount(
+            &indirectArgsBuffer_, 0,
+            &drawCountBuffer_, 0,
+            MAX_DRAWS, cmd
+        );
+        drawCalls_ = 1;
+
+        // Timestamp: draw end
+        dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd);
+
+        dev->RenderPassEnd(cmd);
+
+        // Resolve timestamps for readback (results available next frame)
+        dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd);
+
+        // Read back previous frame's timestamps (persistently mapped READBACK buffer)
+        uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
+        if (tsData) {
+            double freq = (double)dev->GetTimestampFrequency();
+            if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) {
+                gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0);
+            }
+            if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) {
+                gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0);
+            }
+        }
+
+        // GPU cull handles visibility counting — approximate from chunkCount
+        visibleChunks_ = chunkCount_; // exact count would require readback of drawCount
+
+        return;
+    }
+
+    // ── CPU Fallback: frustum + backface cull + per-face-group draws ──
    wi::primitive::Frustum frustum;
    frustum.Create(camera.GetViewProjection());

-    // ── Render pass: color + depth ────────────────────────────────
    RenderPassImage rp[] = {
        RenderPassImage::RenderTarget(
            &renderTarget,
@ -372,22 +533,11 @@ void VoxelRenderer::render(

    dev->BindPipelineState(&pso_, cmd);
    dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
-    dev->BindResource(&megaQuadBuffer_, 0, cmd);   // t0: mega quad buffer
-    dev->BindResource(&textureArray_, 1, cmd);      // t1: material textures
-    dev->BindResource(&chunkInfoBuffer_, 2, cmd);   // t2: chunk info
+    dev->BindResource(&megaQuadBuffer_, 0, cmd);
+    dev->BindResource(&textureArray_, 1, cmd);
+    dev->BindResource(&chunkInfoBuffer_, 2, cmd);
    dev->BindSampler(&sampler_, 0, cmd);

-    visibleChunks_ = 0;
-    drawCalls_ = 0;
-
-    // Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
-    struct VoxelPush {
-        uint32_t chunkIndex;
-        uint32_t quadOffset;  // offset into mega quad buffer (in quads)
-        uint32_t pad[10];
-    };
-
-    // Simple DrawInstanced loop with frustum culling + push constants
    for (uint32_t i = 0; i < chunkCount_; i++) {
        const auto& slot = chunkSlots_[i];
        if (slot.quadCount == 0) continue;
@ -406,17 +556,33 @@ void VoxelRenderer::render(
        if (!frustum.CheckBoxFast(aabb)) continue;

        visibleChunks_++;
+        const auto& info = cpuChunkInfo_[i];

-        // Pass chunk index AND quad offset via push constants
-        // (SV_VertexID/SV_InstanceID offsets unreliable across drivers)
-        VoxelPush pushData = {};
-        pushData.chunkIndex = i;
-        pushData.quadOffset = slot.quadOffset;
-        dev->PushConstants(&pushData, sizeof(pushData), cmd);
+        // Per-face-group draws with backface culling
+        for (uint32_t f = 0; f < 6; f++) {
+            if (info.faceCounts[f] == 0) continue;

-        // startVertexLocation = 0: the VS computes quad address from push.quadOffset
-        dev->DrawInstanced(slot.quadCount * 6, 1, 0, 0, cmd);
-        drawCalls_++;
+            // Backface cull: skip face groups pointing away from camera
+            bool backFacing = false;
+            switch (f) {
+            case 0: backFacing = (camera.Eye.x < aabbMin.x); break; // +X
+            case 1: backFacing = (camera.Eye.x > aabbMax.x); break; // -X
+            case 2: backFacing = (camera.Eye.y < aabbMin.y); break; // +Y
+            case 3: backFacing = (camera.Eye.y > aabbMax.y); break; // -Y
+            case 4: backFacing = (camera.Eye.z < aabbMin.z); break; // +Z
+            case 5: backFacing = (camera.Eye.z > aabbMax.z); break; // -Z
+            }
+            if (backFacing) continue;
+
+            VoxelPush pushData = {};
+            pushData.chunkIndex = i;
+            pushData.quadOffset = slot.quadOffset + info.faceOffsets[f];
+            pushData.flags = 0; // CPU mode
+            dev->PushConstants(&pushData, sizeof(pushData), cmd);
+
+            dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd);
+            drawCalls_++;
+        }
    }

    dev->RenderPassEnd(cmd);
@ -583,7 +749,7 @@ void VoxelRenderPath::Compose(CommandList cmd) const {
           + "/" + std::to_string(renderer.getChunkCount()) + "\n";
    stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n";
    stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls())
-           + " (DrawInstanced + CPU cull + backface)\n";
+           + (renderer.isGpuCulling() ? " (MDI + GPU cull)" : " (DrawInstanced + CPU cull + backface)") + "\n";

    char cullStr[16], drawStr[16];
    snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs());