Refactor: remove dead CPU/MDI paths, GPU BLAS compute, 30Hz animation

- Remove ~430 lines of dead CPU mesh, MDI, and GPU cull render paths
  (rebuildMegaBuffer, IndirectDrawArgs, drawCountBuffer, cullShader, etc.)
- Add voxelTopingBLASCS.hlsl compute shader replacing 196ms CPU loop
  for toping BLAS position extraction (<1ms on GPU)
- Reduce animation rate from 60Hz to 30Hz (halves CPU regen cost)
- Simplify render() to GPU mesh path only (no conditional branches)
- Remove benchmark state machine and stale mode strings
This commit is contained in:
Samuel Bouchet 2026-03-31 01:43:53 +02:00
parent f134a5786d
commit 0d3f8200b4
3 changed files with 281 additions and 833 deletions

View file

@ -0,0 +1,80 @@
// BVLE Voxels - Toping BLAS Position Extraction Compute Shader
// Replaces the 196ms CPU loop that computed world-space toping positions.
// Reads vertex templates (t4) + instance positions (t5) + group table (t7),
// writes flat float3 positions (u0) for DXR BLAS construction.
//
// One thread per output vertex. Group table maps global vertex index to
// the correct (instance, local vertex) pair via prefix-sum offsets.
#include "voxelCommon.hlsli"
// Toping mesh vertex template, read from topingVertices (t4).
// Layout must match the C++ TopingVertex struct: two float3 fields = 24 bytes.
// Do not reorder or remove fields; the buffer stride depends on them.
struct TopingVtx {
float3 position; // vertex position local to the voxel, in [0,1]^3
float3 normal; // not used for BLAS extraction; kept so the stride matches C++
};
// Toping instance, read from topingInstances (t5): a single float3 (12 bytes)
// holding the world-space position that main() adds to each template vertex.
// NOTE(review): the stride must equal the C++ element type bound to t5 - confirm
// against the upload code, which is not visible here.
struct TopingInst {
float3 worldPos;
};
// One entry of the BLAS extraction group table, read from topingGroups (t7).
// Must match C++ TopingBLASGroupGPU field-for-field: five uints = 20 bytes.
// Each group contributes vertexCount output vertices per instance; main() uses
// globalVertexOffset to map a global output vertex index back to its group.
struct TopingBLASGroup {
uint globalVertexOffset; // prefix sum: first global vertex index for this group
uint vertexTemplateOffset; // offset into topingVertices (t4)
uint vertexCount; // vertices per instance (mesh slice count)
uint instanceOffset; // offset into topingInstances (t5)
uint instanceCount; // number of instances in this group
};
// Inputs (read-only structured buffers).
StructuredBuffer<TopingVtx> topingVertices : register(t4); // vertex templates, indexed by vertexTemplateOffset + local vertex
StructuredBuffer<TopingInst> topingInstances : register(t5); // instance positions, indexed by instanceOffset + local instance
StructuredBuffer<TopingBLASGroup> topingGroups : register(t7); // group table; main() scans up to push.groupCount entries
// Output: raw float3 positions (12 bytes each), tightly packed.
// main() writes vertex i at byte offset i * 12 through storeFloat3().
RWByteAddressBuffer blasPositions : register(u0);
// Push constants (b999).
// Only totalVertices and groupCount are read by this shader; pad0..pad9 extend
// the struct to 12 uints (48 bytes).
// NOTE(review): presumably the padding matches a fixed push-constant size
// declared by VOXEL_ROOTSIG - confirm before changing the member count.
struct TopingBLASPush {
uint totalVertices; // number of output vertices; threads at or beyond this index exit early
uint groupCount; // number of valid entries in topingGroups (t7)
uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9;
};
[[vk::push_constant]] ConstantBuffer<TopingBLASPush> push : register(b999);
// Writes one float3 into blasPositions as three consecutive 32-bit words
// starting at byteOffset (x at +0, y at +4, z at +8).
// byteOffset is expected to be 4-byte aligned; main() passes globalIdx * 12.
void storeFloat3(uint byteOffset, float3 v) {
    // Bit-cast all three components at once and emit a single 3-word store.
    const uint3 bits = asuint(v);
    blasPositions.Store3(byteOffset, bits);
}
// Entry point: one thread per output BLAS vertex, 64 threads per thread group.
//
// For output vertex `globalIdx` the shader:
//   1. finds the owning group via the prefix-sum field globalVertexOffset,
//   2. splits the group-local index into (instance, template vertex),
//   3. writes instance world position + template local position to
//      blasPositions at byte offset globalIdx * 12.
//
// Robustness: if the group table is empty or inconsistent with
// push.totalVertices (zero vertexCount, index before the group's first vertex,
// or instance index past instanceCount), the vertex is written as (0,0,0)
// instead of dividing by zero or reading another group's data. With a
// consistent table the output is identical to the unguarded computation.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(64, 1, 1)]
void main(uint3 DTid : SV_DispatchThreadID) {
    const uint globalIdx = DTid.x;
    // The dispatch is rounded up to a multiple of 64; drop the overhang threads.
    if (globalIdx >= push.totalVertices) return;

    // Fallback position used whenever globalIdx cannot be mapped to a vertex.
    float3 worldPos = float3(0.0f, 0.0f, 0.0f);

    if (push.groupCount > 0) {
        // Find the last group whose first vertex is <= globalIdx.
        // Linear scan: the table is small (the C++ side declares
        // MAX_TOPING_BLAS_GROUPS = 64) and offsets are ascending prefix sums,
        // so the scan can stop at the first group that starts past globalIdx.
        uint groupIdx = 0;
        for (uint g = 1; g < push.groupCount; g++) {
            if (globalIdx >= topingGroups[g].globalVertexOffset)
                groupIdx = g;
            else
                break;
        }
        const TopingBLASGroup grp = topingGroups[groupIdx];

        // vertexCount == 0 would make the division/modulo below undefined;
        // globalIdx < globalVertexOffset would underflow the subtraction.
        if (grp.vertexCount > 0 && globalIdx >= grp.globalVertexOffset) {
            // Map global vertex to (instance, local vertex) within this group.
            const uint localIdx = globalIdx - grp.globalVertexOffset;
            const uint localInstance = localIdx / grp.vertexCount;
            const uint localVertex = localIdx % grp.vertexCount;

            // Never read past this group's own instance range.
            if (localInstance < grp.instanceCount) {
                const TopingVtx vtx = topingVertices[grp.vertexTemplateOffset + localVertex];
                const TopingInst inst = topingInstances[grp.instanceOffset + localInstance];
                worldPos = inst.worldPos + vtx.position;
            }
        }
    }

    storeFloat3(globalIdx * 12, worldPos);
}

File diff suppressed because it is too large Load diff

View file

@ -27,7 +27,7 @@ struct GPUChunkInfo {
uint32_t pad2[2]; // pad to 112 bytes (7 × float4) uint32_t pad2[2]; // pad to 112 bytes (7 × float4)
}; };
// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ──────── // ── Voxel Renderer (GPU mesh pipeline) ──────────────────────────
class VoxelRenderer { class VoxelRenderer {
friend class VoxelRenderPath; friend class VoxelRenderPath;
public: public:
@ -58,8 +58,6 @@ public:
uint32_t getDrawCalls() const { return drawCalls_; } uint32_t getDrawCalls() const { return drawCalls_; }
uint32_t getChunkCount() const { return chunkCount_; } uint32_t getChunkCount() const { return chunkCount_; }
bool isInitialized() const { return initialized_; } bool isInitialized() const { return initialized_; }
bool isGpuCulling() const { return gpuCullingEnabled_; }
bool isMdiEnabled() const { return mdiEnabled_; }
bool debugFaceColors_ = false; bool debugFaceColors_ = false;
bool debugBlend_ = false; bool debugBlend_ = false;
@ -67,7 +65,6 @@ public:
private: private:
void createPipeline(); void createPipeline();
void rebuildMegaBuffer(VoxelWorld& world);
wi::graphics::GraphicsDevice* device_ = nullptr; wi::graphics::GraphicsDevice* device_ = nullptr;
@ -75,8 +72,6 @@ private:
wi::graphics::Shader vertexShader_; wi::graphics::Shader vertexShader_;
wi::graphics::Shader pixelShader_; wi::graphics::Shader pixelShader_;
wi::graphics::PipelineState pso_; wi::graphics::PipelineState pso_;
wi::graphics::Shader cullShader_; // Frustum cull compute shader
// Shaders & Pipeline (topings, Phase 4) // Shaders & Pipeline (topings, Phase 4)
wi::graphics::Shader topingVS_; wi::graphics::Shader topingVS_;
wi::graphics::Shader topingPS_; wi::graphics::Shader topingPS_;
@ -93,6 +88,29 @@ private:
std::vector<TopingGPUInst> topingGpuInsts_; std::vector<TopingGPUInst> topingGpuInsts_;
mutable uint32_t topingDrawCalls_ = 0; mutable uint32_t topingDrawCalls_ = 0;
// ── Toping draw groups (shared between render + BLAS CS) ─────
// Toping draw group: a (type, variant) key together with the instance range
// and the vertex-template range it draws.
struct TopingDrawGroup {
    uint16_t type;
    uint16_t variant;
    uint32_t instanceOffset;
    uint32_t instanceCount;
    uint32_t vertexTemplateOffset;  // from TopingDef::variants[]
    uint32_t vertexCount;           // from TopingDef::variants[]
};
std::vector<TopingDrawGroup> topingDrawGroups_; // built in uploadTopingData, reused in renderTopings
// ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
wi::graphics::Shader topingBLASShader_; // voxelTopingBLASCS compute shader
// CPU-side mirror of the HLSL TopingBLASGroup struct consumed by the
// voxelTopingBLASCS compute shader through the group table (SRV t7).
// Member order and size must stay identical to the shader declaration.
struct TopingBLASGroupGPU {
    uint32_t globalVertexOffset;    // prefix sum of total vertices before this group
    uint32_t vertexTemplateOffset;  // offset into topingVertices (t4)
    uint32_t vertexCount;           // vertices per instance
    uint32_t instanceOffset;        // offset into topingInstances (t5)
    uint32_t instanceCount;         // instances in this group
};
// The shader declares this layout as five uints (20 bytes); fail the build
// instead of silently feeding the GPU a mismatched stride.
static_assert(sizeof(TopingBLASGroupGPU) == 5 * sizeof(uint32_t),
              "TopingBLASGroupGPU must stay 20 bytes to match HLSL TopingBLASGroup");
wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer<TopingBLASGroupGPU>, SRV t7
std::vector<TopingBLASGroupGPU> topingBLASGroupsGPU_; // CPU staging for group table
mutable uint32_t topingBLASTotalVertices_ = 0;
static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64;
void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const;
// Shaders & Pipeline (smooth surfaces, Phase 5) // Shaders & Pipeline (smooth surfaces, Phase 5)
wi::graphics::Shader smoothVS_; wi::graphics::Shader smoothVS_;
wi::graphics::Shader smoothPS_; wi::graphics::Shader smoothPS_;
@ -114,9 +132,7 @@ private:
// ── Mega-buffer architecture (Phase 2) ────────────────────── // ── Mega-buffer architecture (Phase 2) ──────────────────────
static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB) static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
static constexpr uint32_t MAX_CHUNKS = 2048; static constexpr uint32_t MAX_CHUNKS = 2048;
static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk
wi::graphics::GPUBuffer megaQuadBuffer_; // StructuredBuffer<PackedQuad>, SRV t0
wi::graphics::GPUBuffer chunkInfoBuffer_; // StructuredBuffer<GPUChunkInfo>, SRV t2 wi::graphics::GPUBuffer chunkInfoBuffer_; // StructuredBuffer<GPUChunkInfo>, SRV t2
// CPU-side tracking // CPU-side tracking
@ -127,27 +143,9 @@ private:
}; };
std::vector<ChunkSlot> chunkSlots_; std::vector<ChunkSlot> chunkSlots_;
std::vector<GPUChunkInfo> cpuChunkInfo_; std::vector<GPUChunkInfo> cpuChunkInfo_;
std::vector<PackedQuad> cpuMegaQuads_; // CPU staging for mega-buffer
uint32_t chunkCount_ = 0; uint32_t chunkCount_ = 0;
bool megaBufferDirty_ = true; bool megaBufferDirty_ = true;
// ── Indirect draw (Phase 2 MDI) ─────────────────────────────
// Wicked Engine's DrawInstancedIndirectCount command signature includes a
// push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
// Total stride = 4 + 16 = 20 bytes per draw entry.
struct IndirectDrawArgs {
uint32_t pushConstant; // written to b999[0] by ExecuteIndirect
uint32_t vertexCountPerInstance;
uint32_t instanceCount;
uint32_t startVertexLocation;
uint32_t startInstanceLocation;
};
wi::graphics::GPUBuffer indirectArgsBuffer_; // IndirectDrawArgs[MAX_DRAWS]
wi::graphics::GPUBuffer drawCountBuffer_; // uint32_t[1]
mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
bool gpuCullingEnabled_ = true; // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
bool mdiEnabled_ = true; // Phase 2.2: MDI rendering with CPU-filled indirect args
// Constants buffer (must match HLSL VoxelCB) // Constants buffer (must match HLSL VoxelCB)
struct VoxelConstants { struct VoxelConstants {
XMFLOAT4X4 viewProjection; XMFLOAT4X4 viewProjection;
@ -184,7 +182,6 @@ private:
wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
bool gpuMesherAvailable_ = false; bool gpuMesherAvailable_ = false;
bool gpuMeshEnabled_ = true; // Use GPU meshing instead of CPU greedy
mutable uint32_t gpuMeshQuadCount_ = 0; // Readback from previous frame (1-frame delay) mutable uint32_t gpuMeshQuadCount_ = 0; // Readback from previous frame (1-frame delay)
mutable uint32_t voxelDataCapacity_ = 0; // Current capacity of voxelDataBuffer_ (in uint32s) mutable uint32_t voxelDataCapacity_ = 0; // Current capacity of voxelDataBuffer_ (in uint32s)
mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
@ -216,9 +213,8 @@ private:
mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_; // sequential indices for toping BLAS mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_; // sequential indices for toping BLAS
mutable uint32_t topingBLASPositionCapacity_ = 0; // pre-allocated capacity (vertices) mutable uint32_t topingBLASPositionCapacity_ = 0; // pre-allocated capacity (vertices)
mutable uint32_t topingBLASIndexCount_ = 0; // size of toping index buffer mutable uint32_t topingBLASIndexCount_ = 0; // size of toping index buffer
mutable bool topingBLASDirty_ = false; // deferred BLAS position upload + rebuild mutable bool topingBLASDirty_ = false; // GPU compute BLAS extract + rebuild needed
mutable uint32_t topingBLASVertexCount_ = 0; // actual vertex count for current frame mutable uint32_t topingBLASVertexCount_ = 0; // actual vertex count for current frame
std::vector<float> topingBLASPositionStaging_; // CPU staging for deferred upload
static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
mutable bool rtAvailable_ = false; // GPU supports RT mutable bool rtAvailable_ = false; // GPU supports RT
mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild
@ -252,14 +248,6 @@ private:
const wi::graphics::Texture& renderTarget, const wi::graphics::Texture& renderTarget,
const wi::graphics::Texture& normalTarget) const; const wi::graphics::Texture& normalTarget) const;
// Benchmark state machine: runs once after world gen
enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
mutable BenchState benchState_ = BenchState::IDLE;
mutable float cpuMeshTimeMs_ = 0.0f;
mutable uint32_t gpuBaselineQuads_ = 0;
void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
void readbackGpuMeshBenchmark() const;
void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world, void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr, ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
ProfileAccum* profDispatch = nullptr) const; ProfileAccum* profDispatch = nullptr) const;
@ -290,7 +278,7 @@ private:
public: public:
float getGpuCullTimeMs() const { return gpuCullTimeMs_; } float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; } float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; } bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; } uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
// Phase 4: Toping rendering // Phase 4: Toping rendering
@ -364,11 +352,11 @@ private:
// Wind animation (continuous, always running) // Wind animation (continuous, always running)
float windTime_ = 0.0f; float windTime_ = 0.0f;
// Animated terrain (wave effect at 60 Hz, toggled with F3) // Animated terrain (wave effect at 30 Hz, toggled with F3)
bool animatedTerrain_ = false; bool animatedTerrain_ = false;
float animTime_ = 0.0f; float animTime_ = 0.0f;
float animAccum_ = 0.0f; float animAccum_ = 0.0f;
static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz
wi::graphics::Texture voxelRT_; wi::graphics::Texture voxelRT_;
wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO