Refactor: remove dead CPU/MDI paths, GPU BLAS compute, 30Hz animation

- Remove ~430 lines of dead CPU mesh, MDI, and GPU cull render paths (rebuildMegaBuffer, IndirectDrawArgs, drawCountBuffer, cullShader, etc.)
- Add voxelTopingBLASCS.hlsl compute shader, replacing the 196ms CPU loop for toping BLAS position extraction (<1ms on GPU)
- Reduce animation rate from 60Hz to 30Hz (halves CPU regen cost)
- Simplify render() to the GPU mesh path only (no conditional branches)
- Remove benchmark state machine and stale mode strings
2026-03-31 01:43:53 +02:00 · 2026-03-31 01:43:53 +02:00 · 0d3f8200b4
commit 0d3f8200b4
parent f134a5786d
3 changed files with 281 additions and 833 deletions
--- a/shaders/voxelTopingBLASCS.hlsl
+++ b/shaders/voxelTopingBLASCS.hlsl
@ -0,0 +1,80 @@
+// BVLE Voxels - Toping BLAS Position Extraction Compute Shader
+// Replaces the 196ms CPU loop that computed world-space toping positions.
+// Reads vertex templates (t4) + instance positions (t5) + group table (t7),
+// writes flat float3 positions (u0) for DXR BLAS construction.
+//
+// One thread per output vertex. Group table maps global vertex index to
+// the correct (instance, local vertex) pair via prefix-sum offsets.
+
+#include "voxelCommon.hlsli"
+
+// Toping mesh vertex template, element type of topingVertices (t4); must match C++ TopingVertex (2 x float3 = 24 bytes)
+struct TopingVtx {
+    float3 position; // local to voxel [0,1]^3; main() adds it to the instance worldPos
+    float3 normal;   // unused here, but struct must match so the element stride stays 24 bytes
+};
+
+// Toping instance, element type of topingInstances (t5): just the world position, 12 bytes
+struct TopingInst {
+    float3 worldPos; // world-space offset added to each template vertex position in main()
+};
+
+// Draw group descriptor for BLAS extraction, element type of topingGroups (t7); must match C++ TopingBLASGroupGPU (5 x uint = 20 bytes)
+struct TopingBLASGroup {
+    uint globalVertexOffset;    // prefix sum: first global (output) vertex index for this group
+    uint vertexTemplateOffset;  // index of the group's first template vertex in topingVertices (t4)
+    uint vertexCount;           // vertices per instance (mesh slice count); used as divisor/modulus in main()
+    uint instanceOffset;        // index of the group's first instance in topingInstances (t5)
+    uint instanceCount;         // number of instances in this group
+};
+
+StructuredBuffer<TopingVtx>       topingVertices  : register(t4);
+StructuredBuffer<TopingInst>      topingInstances : register(t5);
+StructuredBuffer<TopingBLASGroup> topingGroups    : register(t7);
+
+// Output: raw float3 positions (12 bytes each)
+RWByteAddressBuffer blasPositions : register(u0);
+
+// Push constants (b999): 12 uints = 48 bytes in total
+struct TopingBLASPush {
+    uint totalVertices; // number of output vertices; threads whose index is >= this return without writing
+    uint groupCount;    // number of entries of topingGroups (t7) scanned by main()
+    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9; // unused padding; presumably sized to the push-constant range of VOXEL_ROOTSIG - TODO confirm
+};
+[[vk::push_constant]] ConstantBuffer<TopingBLASPush> push : register(b999); // exposed to main() as `push`
+
+// Writes one float3 into blasPositions (u0) as three consecutive 32-bit words.
+//   byteOffset: destination byte address; must be 4-byte aligned (main() passes globalIdx * 12).
+//   v:          value to store; components are bit-cast with asuint, not converted.
+// A single Store3 replaces three scalar Stores: same bytes written, one buffer op.
+void storeFloat3(uint byteOffset, float3 v) {
+    blasPositions.Store3(byteOffset, asuint(v));
+}
+
+// Entry point: one thread per output vertex, 64 threads per thread group along X.
+// Thread i maps global vertex i to a (group, instance, template vertex) triple through
+// the group table and writes instance.worldPos + template.position to blasPositions
+// at byte offset i * 12.
+[RootSignature(VOXEL_ROOTSIG)]
+[numthreads(64, 1, 1)]
+void main(uint3 DTid : SV_DispatchThreadID) {
+    uint globalIdx = DTid.x;
+    // Threads past the last vertex (tail of the final 64-wide group) do nothing.
+    if (globalIdx >= push.totalVertices) return;
+
+    // Find the owning group: the last entry whose globalVertexOffset is <= globalIdx.
+    // Offsets are a prefix sum, so the scan can stop at the first larger offset; a group
+    // that contributes no vertices shares its successor's offset and is stepped over.
+    // A linear scan is fine for this small table (the C++ header declares
+    // MAX_TOPING_BLAS_GROUPS = 64).
+    uint groupIdx = 0;
+    for (uint g = 1; g < push.groupCount; g++) {
+        if (globalIdx >= topingGroups[g].globalVertexOffset)
+            groupIdx = g;
+        else
+            break;
+    }
+
+    TopingBLASGroup grp = topingGroups[groupIdx];
+
+    // Defensive guard for an empty or inconsistent group table: without it the code
+    // below would divide by zero (vertexCount == 0) or underflow localIdx (first offset
+    // above globalIdx). Still write a zero position so the output slot is initialized.
+    // A table that is consistent with totalVertices never takes this branch.
+    if (push.groupCount == 0 || grp.vertexCount == 0 || globalIdx < grp.globalVertexOffset) {
+        storeFloat3(globalIdx * 12, float3(0.0f, 0.0f, 0.0f));
+        return;
+    }
+
+    // Map global vertex to (instance, local vertex) within this group:
+    // vertices are laid out instance-major, vertexCount vertices per instance.
+    uint localIdx    = globalIdx - grp.globalVertexOffset;
+    uint instanceIdx = grp.instanceOffset + localIdx / grp.vertexCount;
+    uint vertexIdx   = grp.vertexTemplateOffset + localIdx % grp.vertexCount;
+
+    TopingVtx vtx   = topingVertices[vertexIdx];
+    TopingInst inst  = topingInstances[instanceIdx];
+
+    // Template positions are voxel-local, so translation by the instance origin is enough.
+    float3 worldPos = inst.worldPos + vtx.position;
+
+    // 12 bytes per float3 in the tightly packed output buffer.
+    storeFloat3(globalIdx * 12, worldPos);
+}
--- a/src/voxel/VoxelRenderer.cpp
+++ b/src/voxel/VoxelRenderer.cpp
--- a/src/voxel/VoxelRenderer.h
+++ b/src/voxel/VoxelRenderer.h
@ -27,7 +27,7 @@ struct GPUChunkInfo {
    uint32_t pad2[2];          // pad to 112 bytes (7 × float4)
 };

-// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
+// ── Voxel Renderer (GPU mesh pipeline) ──────────────────────────
 class VoxelRenderer {
    friend class VoxelRenderPath;
 public:
@ -58,8 +58,6 @@ public:
    uint32_t getDrawCalls() const { return drawCalls_; }
    uint32_t getChunkCount() const { return chunkCount_; }
    bool isInitialized() const { return initialized_; }
-    bool isGpuCulling() const { return gpuCullingEnabled_; }
-    bool isMdiEnabled() const { return mdiEnabled_; }

    bool debugFaceColors_ = false;
    bool debugBlend_ = false;
@ -67,7 +65,6 @@ public:

 private:
    void createPipeline();
-    void rebuildMegaBuffer(VoxelWorld& world);

    wi::graphics::GraphicsDevice* device_ = nullptr;

@ -75,8 +72,6 @@ private:
    wi::graphics::Shader vertexShader_;
    wi::graphics::Shader pixelShader_;
    wi::graphics::PipelineState pso_;
-    wi::graphics::Shader cullShader_; // Frustum cull compute shader
-
    // Shaders & Pipeline (topings, Phase 4)
    wi::graphics::Shader topingVS_;
    wi::graphics::Shader topingPS_;
@ -93,6 +88,29 @@ private:
    std::vector<TopingGPUInst> topingGpuInsts_;
    mutable uint32_t topingDrawCalls_ = 0;

+    // ── Toping draw groups (shared between render + BLAS CS) ─────
+    struct TopingDrawGroup {
+        // Toping type / variant this group belongs to (presumably selects an entry
+        // of TopingDef::variants[] - TODO confirm)
+        uint16_t type;
+        uint16_t variant;
+        // Instance range of this group: first instance and number of instances
+        uint32_t instanceOffset;
+        uint32_t instanceCount;
+        // Vertex template range of this group, from TopingDef::variants[]
+        uint32_t vertexTemplateOffset;
+        uint32_t vertexCount;
+    };
+    std::vector<TopingDrawGroup> topingDrawGroups_; // built in uploadTopingData, reused in renderTopings
+
+    // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
+    wi::graphics::Shader topingBLASShader_;         // voxelTopingBLASCS compute shader
+    // CPU-side mirror of the HLSL TopingBLASGroup struct in voxelTopingBLASCS.hlsl.
+    // Member order and total size (5 x uint32_t = 20 bytes) must stay identical to the
+    // shader's declaration, because the shader indexes this table as a structured buffer.
+    struct TopingBLASGroupGPU {
+        uint32_t globalVertexOffset;    // prefix sum of total vertices before this group
+        uint32_t vertexTemplateOffset;  // offset into topingVertices (t4)
+        uint32_t vertexCount;           // vertices per instance
+        uint32_t instanceOffset;        // offset into topingInstances (t5)
+        uint32_t instanceCount;         // instances in this group
+    };
+    // Fail the build if a member is added, removed or resized without updating the shader.
+    static_assert(sizeof(TopingBLASGroupGPU) == 5 * sizeof(uint32_t),
+                  "TopingBLASGroupGPU must stay 20 bytes to match HLSL TopingBLASGroup");
+    wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer<TopingBLASGroupGPU>, SRV t7
+    std::vector<TopingBLASGroupGPU> topingBLASGroupsGPU_; // CPU staging for group table
+    mutable uint32_t topingBLASTotalVertices_ = 0;
+    static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64;
+    void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const;
+
    // Shaders & Pipeline (smooth surfaces, Phase 5)
    wi::graphics::Shader smoothVS_;
    wi::graphics::Shader smoothPS_;
@ -114,9 +132,7 @@ private:
    // ── Mega-buffer architecture (Phase 2) ──────────────────────
    static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
    static constexpr uint32_t MAX_CHUNKS = 2048;
-    static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk

-    wi::graphics::GPUBuffer megaQuadBuffer_;    // StructuredBuffer<PackedQuad>, SRV t0
    wi::graphics::GPUBuffer chunkInfoBuffer_;   // StructuredBuffer<GPUChunkInfo>, SRV t2

    // CPU-side tracking
@ -127,27 +143,9 @@ private:
    };
    std::vector<ChunkSlot> chunkSlots_;
    std::vector<GPUChunkInfo> cpuChunkInfo_;
-    std::vector<PackedQuad> cpuMegaQuads_;       // CPU staging for mega-buffer
    uint32_t chunkCount_ = 0;
    bool megaBufferDirty_ = true;

-    // ── Indirect draw (Phase 2 MDI) ─────────────────────────────
-    // Wicked Engine's DrawInstancedIndirectCount command signature includes a
-    // push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
-    // Total stride = 4 + 16 = 20 bytes per draw entry.
-    struct IndirectDrawArgs {
-        uint32_t pushConstant;              // written to b999[0] by ExecuteIndirect
-        uint32_t vertexCountPerInstance;
-        uint32_t instanceCount;
-        uint32_t startVertexLocation;
-        uint32_t startInstanceLocation;
-    };
-    wi::graphics::GPUBuffer indirectArgsBuffer_;   // IndirectDrawArgs[MAX_DRAWS]
-    wi::graphics::GPUBuffer drawCountBuffer_;      // uint32_t[1]
-    mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
-    bool gpuCullingEnabled_ = true;                // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
-    bool mdiEnabled_ = true;                       // Phase 2.2: MDI rendering with CPU-filled indirect args
-
    // Constants buffer (must match HLSL VoxelCB)
    struct VoxelConstants {
        XMFLOAT4X4 viewProjection;
@ -184,7 +182,6 @@ private:
    wi::graphics::GPUBuffer gpuQuadCounter_;  // atomic counter for GPU mesh output
    wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
    bool gpuMesherAvailable_ = false;
-    bool gpuMeshEnabled_ = true;              // Use GPU meshing instead of CPU greedy
    mutable uint32_t gpuMeshQuadCount_ = 0;   // Readback from previous frame (1-frame delay)
    mutable uint32_t voxelDataCapacity_ = 0;  // Current capacity of voxelDataBuffer_ (in uint32s)
    mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
@ -216,9 +213,8 @@ private:
    mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_;    // sequential indices for toping BLAS
    mutable uint32_t topingBLASPositionCapacity_ = 0;          // pre-allocated capacity (vertices)
    mutable uint32_t topingBLASIndexCount_ = 0;                // size of toping index buffer
-    mutable bool topingBLASDirty_ = false;                     // deferred BLAS position upload + rebuild
+    mutable bool topingBLASDirty_ = false;                     // GPU compute BLAS extract + rebuild needed
    mutable uint32_t topingBLASVertexCount_ = 0;               // actual vertex count for current frame
-    std::vector<float> topingBLASPositionStaging_;             // CPU staging for deferred upload
    static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
    mutable bool rtAvailable_ = false;                    // GPU supports RT
    mutable bool rtDirty_ = true;                         // BLAS/TLAS need rebuild
@ -252,14 +248,6 @@ private:
                         const wi::graphics::Texture& renderTarget,
                         const wi::graphics::Texture& normalTarget) const;

-    // Benchmark state machine: runs once after world gen
-    enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
-    mutable BenchState benchState_ = BenchState::IDLE;
-    mutable float cpuMeshTimeMs_ = 0.0f;
-    mutable uint32_t gpuBaselineQuads_ = 0;
-
-    void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
-    void readbackGpuMeshBenchmark() const;
    void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
        ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
        ProfileAccum* profDispatch = nullptr) const;
@ -290,7 +278,7 @@ private:
 public:
    float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
    float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
-    bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
+    bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
    uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }

    // Phase 4: Toping rendering
@ -364,11 +352,11 @@ private:
    // Wind animation (continuous, always running)
    float windTime_ = 0.0f;

-    // Animated terrain (wave effect at 60 Hz, toggled with F3)
+    // Animated terrain (wave effect at 30 Hz, toggled with F3)
    bool animatedTerrain_ = false;
    float animTime_ = 0.0f;
    float animAccum_ = 0.0f;
-    static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz
+    static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz

    wi::graphics::Texture voxelRT_;
    wi::graphics::Texture voxelNormalRT_;  // Phase 6: world-space normals for RT shadows/AO