Refactor: remove dead CPU/MDI paths, GPU BLAS compute, 30Hz animation

- Remove ~430 lines of dead CPU mesh, MDI, and GPU cull render paths
  (rebuildMegaBuffer, IndirectDrawArgs, drawCountBuffer, cullShader, etc.)
- Add the voxelTopingBLASCS.hlsl compute shader, replacing the 196ms CPU loop
  for toping BLAS position extraction (<1ms on GPU)
- Reduce the animation rate from 60Hz to 30Hz (halves CPU regen cost)
- Simplify render() to the GPU mesh path only (no conditional branches)
- Remove the benchmark state machine and stale mode strings
2026-03-31 01:43:53 +02:00 · 2026-03-31 01:43:53 +02:00 · 0d3f8200b4
commit 0d3f8200b4
parent f134a5786d
3 changed files with 281 additions and 833 deletions
--- a/shaders/voxelTopingBLASCS.hlsl
+++ b/shaders/voxelTopingBLASCS.hlsl
@ -0,0 +1,80 @@
 // BVLE Voxels - Toping BLAS Position Extraction Compute Shader
 // Replaces the 196ms CPU loop that computed world-space toping positions.
 // Reads vertex templates (t4) + instance positions (t5) + group table (t7),
 // writes flat float3 positions (u0) for DXR BLAS construction.
 //
 // One thread per output vertex. Group table maps global vertex index to
 // the correct (instance, local vertex) pair via prefix-sum offsets.
 #include "voxelCommon.hlsli"
 // Toping mesh vertex template, read from topingVertices (t4).
 // Layout must match C++ TopingVertex: two float3 fields = 24 bytes.
 struct TopingVtx {
    float3 position; // local to voxel [0,1]^3; main() adds it to the instance worldPos
    float3 normal;   // unused here, but struct must match
 };
 // Toping instance, read from topingInstances (t5).
 // Holds just the world position: one float3 = 12 bytes.
 struct TopingInst {
    float3 worldPos; // added to each template vertex position in main()
 };
 // Draw group descriptor for BLAS extraction (must match C++ TopingBLASGroupGPU, 20 bytes).
 // Five uints, read from topingGroups (t7). main() scans the table front to back and
 // stops at the first entry whose globalVertexOffset exceeds the thread's vertex index,
 // so entries must be ordered by ascending globalVertexOffset.
 struct TopingBLASGroup {
    uint globalVertexOffset;    // prefix sum: first global vertex index for this group
    uint vertexTemplateOffset;  // offset into topingVertices (t4)
    uint vertexCount;           // vertices per instance (mesh slice count)
    uint instanceOffset;        // offset into topingInstances (t5)
    uint instanceCount;         // number of instances in this group
 };
 // Read-only inputs: vertex templates (t4), instance positions (t5), group table (t7).
 StructuredBuffer<TopingVtx>       topingVertices  : register(t4);
 StructuredBuffer<TopingInst>      topingInstances : register(t5);
 StructuredBuffer<TopingBLASGroup> topingGroups    : register(t7);
 // Output: raw float3 positions (12 bytes each)
 // main() writes vertex i at byte offset i * 12.
 RWByteAddressBuffer blasPositions : register(u0);
 // Push constants (b999)
 // Two live fields plus ten pad uints: 12 uints = 48 bytes in total.
 // NOTE(review): the padding presumably matches a fixed push-constant size declared
 // by VOXEL_ROOTSIG in voxelCommon.hlsli - confirm before resizing.
 struct TopingBLASPush {
    uint totalVertices; // number of output vertices; threads at or beyond this index exit
    uint groupCount;    // number of valid entries in topingGroups
    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9;
 };
 [[vk::push_constant]] ConstantBuffer<TopingBLASPush> push : register(b999);
 // Writes the three components of v as consecutive 32-bit words into blasPositions,
 // starting at byteOffset (x at +0, y at +4, z at +8).
 void storeFloat3(uint byteOffset, float3 v) {
    const uint3 packedBits = asuint(v);
    blasPositions.Store3(byteOffset, packedBits);
 }
 // Entry point: one thread per output vertex, 64 threads per thread group.
 // Resolves the thread's global vertex index to a (group, instance, template vertex)
 // triple through the group table, then writes instance.worldPos + template.position
 // to blasPositions at byte offset globalIdx * 12.
 // Threads that cannot be mapped to valid inputs return without writing.
 [RootSignature(VOXEL_ROOTSIG)]
 [numthreads(64, 1, 1)]
 void main(uint3 DTid : SV_DispatchThreadID) {
    uint globalIdx = DTid.x;
    // Drop the surplus threads of the last thread group. An empty group table leaves
    // nothing to map to, and reading topingGroups[0] would be out of bounds.
    if (globalIdx >= push.totalVertices || push.groupCount == 0) return;
    // Find which group this vertex belongs to: the last entry whose globalVertexOffset
    // is <= globalIdx. Linear scan over a table ordered by ascending offset
    // (VoxelRenderer.h declares MAX_TOPING_BLAS_GROUPS = 64).
    uint groupIdx = 0;
    for (uint g = 1; g < push.groupCount; g++) {
        if (globalIdx >= topingGroups[g].globalVertexOffset)
            groupIdx = g;
        else
            break;
    }
    TopingBLASGroup grp = topingGroups[groupIdx];
    // A group without vertices would make the division and modulo below divide by zero.
    if (grp.vertexCount == 0) return;
    // Map global vertex to (instance, local vertex) within this group
    uint localIdx        = globalIdx - grp.globalVertexOffset;
    uint instanceInGroup = localIdx / grp.vertexCount;
    // If totalVertices or the offsets disagree with the table (including an index
    // below the first group's offset, which wraps localIdx), do not read past the
    // instances that belong to this group.
    if (instanceInGroup >= grp.instanceCount) return;
    uint instanceIdx = grp.instanceOffset + instanceInGroup;
    uint vertexIdx   = grp.vertexTemplateOffset + localIdx % grp.vertexCount;
    TopingVtx vtx   = topingVertices[vertexIdx];
    TopingInst inst  = topingInstances[instanceIdx];
    float3 worldPos = inst.worldPos + vtx.position;
    storeFloat3(globalIdx * 12, worldPos);
 }
--- a/src/voxel/VoxelRenderer.cpp
+++ b/src/voxel/VoxelRenderer.cpp
--- a/src/voxel/VoxelRenderer.h
+++ b/src/voxel/VoxelRenderer.h
@ -27,7 +27,7 @@ struct GPUChunkInfo {
    uint32_t pad2[2];          // pad to 112 bytes (7 × float4)
 };
-// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
+// ── Voxel Renderer (GPU mesh pipeline) ──────────────────────────
 class VoxelRenderer {
    friend class VoxelRenderPath;
 public:
@ -58,8 +58,6 @@ public:
    uint32_t getDrawCalls() const { return drawCalls_; }
    uint32_t getChunkCount() const { return chunkCount_; }
    bool isInitialized() const { return initialized_; }
    bool isGpuCulling() const { return gpuCullingEnabled_; }
    bool isMdiEnabled() const { return mdiEnabled_; }
    bool debugFaceColors_ = false;
    bool debugBlend_ = false;
@ -67,7 +65,6 @@ public:
 private:
    void createPipeline();
    void rebuildMegaBuffer(VoxelWorld& world);
    wi::graphics::GraphicsDevice* device_ = nullptr;
@ -75,8 +72,6 @@ private:
    wi::graphics::Shader vertexShader_;
    wi::graphics::Shader pixelShader_;
    wi::graphics::PipelineState pso_;
    wi::graphics::Shader cullShader_; // Frustum cull compute shader
    // Shaders & Pipeline (topings, Phase 4)
    wi::graphics::Shader topingVS_;
    wi::graphics::Shader topingPS_;
@ -93,6 +88,29 @@ private:
    std::vector<TopingGPUInst> topingGpuInsts_;
    mutable uint32_t topingDrawCalls_ = 0;
    // ── Toping draw groups (shared between render + BLAS CS) ─────
    // Draw group record: a (type, variant) pair together with its instance range
    // and its vertex template range.
    struct TopingDrawGroup {
        uint16_t type;
        uint16_t variant;
        uint32_t instanceOffset;
        uint32_t instanceCount;
        uint32_t vertexTemplateOffset; // from TopingDef::variants[]
        uint32_t vertexCount;          // from TopingDef::variants[]
    };
    std::vector<TopingDrawGroup> topingDrawGroups_; // built in uploadTopingData, reused in renderTopings
    // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
    wi::graphics::Shader topingBLASShader_;         // voxelTopingBLASCS compute shader
    // Group table entry uploaded for the toping BLAS compute shader (SRV t7).
    // Must stay layout-identical to the HLSL TopingBLASGroup struct:
    // five uint32_t fields in this exact order, 20 bytes, no padding.
    struct TopingBLASGroupGPU {
        uint32_t globalVertexOffset;    // prefix sum of total vertices before this group
        uint32_t vertexTemplateOffset;  // offset into topingVertices (t4)
        uint32_t vertexCount;           // vertices per instance
        uint32_t instanceOffset;        // offset into topingInstances (t5)
        uint32_t instanceCount;         // instances in this group
    };
    // Fail the build if a field is added or resized without updating the shader struct.
    static_assert(sizeof(TopingBLASGroupGPU) == 20,
                  "TopingBLASGroupGPU must match HLSL TopingBLASGroup (20 bytes)");
    wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer<TopingBLASGroupGPU>, SRV t7
    std::vector<TopingBLASGroupGPU> topingBLASGroupsGPU_; // CPU staging for group table
    mutable uint32_t topingBLASTotalVertices_ = 0;
    static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64;
    void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const;
    // Shaders & Pipeline (smooth surfaces, Phase 5)
    wi::graphics::Shader smoothVS_;
    wi::graphics::Shader smoothPS_;
@ -114,9 +132,7 @@ private:
    // ── Mega-buffer architecture (Phase 2) ──────────────────────
    static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
    static constexpr uint32_t MAX_CHUNKS = 2048;
    static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk
    wi::graphics::GPUBuffer megaQuadBuffer_;    // StructuredBuffer<PackedQuad>, SRV t0
    wi::graphics::GPUBuffer chunkInfoBuffer_;   // StructuredBuffer<GPUChunkInfo>, SRV t2
    // CPU-side tracking
@ -127,27 +143,9 @@ private:
    };
    std::vector<ChunkSlot> chunkSlots_;
    std::vector<GPUChunkInfo> cpuChunkInfo_;
    std::vector<PackedQuad> cpuMegaQuads_;       // CPU staging for mega-buffer
    uint32_t chunkCount_ = 0;
    bool megaBufferDirty_ = true;
    // ── Indirect draw (Phase 2 MDI) ─────────────────────────────
    // Wicked Engine's DrawInstancedIndirectCount command signature includes a
    // push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
    // Total stride = 4 + 16 = 20 bytes per draw entry.
    // One indirect draw record: 5 x uint32_t = 20 bytes.
    // The leading word is the per-draw push constant; the four words after it
    // follow the field order of D3D12_DRAW_ARGUMENTS.
    struct IndirectDrawArgs {
        uint32_t pushConstant;              // written to b999[0] by ExecuteIndirect
        uint32_t vertexCountPerInstance;
        uint32_t instanceCount;
        uint32_t startVertexLocation;
        uint32_t startInstanceLocation;
    };
    wi::graphics::GPUBuffer indirectArgsBuffer_;   // IndirectDrawArgs[MAX_DRAWS]
    wi::graphics::GPUBuffer drawCountBuffer_;      // uint32_t[1]
    mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
    bool gpuCullingEnabled_ = true;                // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
    bool mdiEnabled_ = true;                       // Phase 2.2: MDI rendering with CPU-filled indirect args
    // Constants buffer (must match HLSL VoxelCB)
    struct VoxelConstants {
        XMFLOAT4X4 viewProjection;
@ -184,7 +182,6 @@ private:
    wi::graphics::GPUBuffer gpuQuadCounter_;  // atomic counter for GPU mesh output
    wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
    bool gpuMesherAvailable_ = false;
    bool gpuMeshEnabled_ = true;              // Use GPU meshing instead of CPU greedy
    mutable uint32_t gpuMeshQuadCount_ = 0;   // Readback from previous frame (1-frame delay)
    mutable uint32_t voxelDataCapacity_ = 0;  // Current capacity of voxelDataBuffer_ (in uint32s)
    mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
@ -216,9 +213,8 @@ private:
    mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_;    // sequential indices for toping BLAS
    mutable uint32_t topingBLASPositionCapacity_ = 0;          // pre-allocated capacity (vertices)
    mutable uint32_t topingBLASIndexCount_ = 0;                // size of toping index buffer
-    mutable bool topingBLASDirty_ = false;                     // deferred BLAS position upload + rebuild
+    mutable bool topingBLASDirty_ = false;                     // GPU compute BLAS extract + rebuild needed
    mutable uint32_t topingBLASVertexCount_ = 0;               // actual vertex count for current frame
    std::vector<float> topingBLASPositionStaging_;             // CPU staging for deferred upload
    static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
    mutable bool rtAvailable_ = false;                    // GPU supports RT
    mutable bool rtDirty_ = true;                         // BLAS/TLAS need rebuild
@ -252,14 +248,6 @@ private:
                         const wi::graphics::Texture& renderTarget,
                         const wi::graphics::Texture& normalTarget) const;
    // Benchmark state machine: runs once after world gen
    enum class BenchState {
        IDLE,
        DISPATCH,
        READBACK,
        DONE
    };
    mutable BenchState benchState_ = BenchState::IDLE;
    mutable float cpuMeshTimeMs_ = 0.0f;
    mutable uint32_t gpuBaselineQuads_ = 0;
    void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
    void readbackGpuMeshBenchmark() const;
    void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
        ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
        ProfileAccum* profDispatch = nullptr) const;
@ -290,7 +278,7 @@ private:
 public:
    float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
    float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
-    bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
+    bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
    uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
    // Phase 4: Toping rendering
@ -364,11 +352,11 @@ private:
    // Wind animation (continuous, always running)
    float windTime_ = 0.0f;
-    // Animated terrain (wave effect at 60 Hz, toggled with F3)
+    // Animated terrain (wave effect at 30 Hz, toggled with F3)
    bool animatedTerrain_ = false;
    float animTime_ = 0.0f;
    float animAccum_ = 0.0f;
-    static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz
+    static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz
    wi::graphics::Texture voxelRT_;
    wi::graphics::Texture voxelNormalRT_;  // Phase 6: world-space normals for RT shadows/AO