Refactor: remove dead CPU/MDI paths, GPU BLAS compute, 30Hz animation
- Remove ~430 lines of dead CPU mesh, MDI, and GPU cull render paths (rebuildMegaBuffer, IndirectDrawArgs, drawCountBuffer, cullShader, etc.)
- Add voxelTopingBLASCS.hlsl compute shader replacing 196ms CPU loop for toping BLAS position extraction (<1ms on GPU)
- Reduce animation rate from 60Hz to 30Hz (halves CPU regen cost)
- Simplify render() to GPU mesh path only (no conditional branches)
- Remove benchmark state machine and stale mode strings
This commit is contained in:
parent
f134a5786d
commit
0d3f8200b4
3 changed files with 281 additions and 833 deletions
80
shaders/voxelTopingBLASCS.hlsl
Normal file
80
shaders/voxelTopingBLASCS.hlsl
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
// BVLE Voxels - Toping BLAS Position Extraction Compute Shader
|
||||
// Replaces the 196ms CPU loop that computed world-space toping positions.
|
||||
// Reads vertex templates (t4) + instance positions (t5) + group table (t7),
|
||||
// writes flat float3 positions (u0) for DXR BLAS construction.
|
||||
//
|
||||
// One thread per output vertex. Group table maps global vertex index to
|
||||
// the correct (instance, local vertex) pair via prefix-sum offsets.
|
||||
|
||||
#include "voxelCommon.hlsli"
|
||||
|
||||
// Vertex template of a toping mesh.
// Layout contract: must match the C++ TopingVertex struct (24 bytes).
struct TopingVtx
{
    // Position local to the voxel, in [0,1]^3.
    float3 position;

    // Not read by this shader; present only so the struct layout matches.
    float3 normal;
};
|
||||
|
||||
// One placed toping instance: only its world position (12 bytes).
struct TopingInst
{
    float3 worldPos;
};
|
||||
|
||||
// Per-draw-group record used for BLAS position extraction.
// Layout contract: must match the C++ TopingBLASGroupGPU struct (20 bytes).
struct TopingBLASGroup
{
    // Prefix sum: global index of the first output vertex of this group.
    uint globalVertexOffset;

    // Start of this group's vertex template inside topingVertices (t4).
    uint vertexTemplateOffset;

    // Vertices emitted per instance (size of the mesh slice).
    uint vertexCount;

    // Start of this group's instances inside topingInstances (t5).
    uint instanceOffset;

    // How many instances belong to this group.
    uint instanceCount;
};
|
||||
|
||||
StructuredBuffer<TopingVtx> topingVertices : register(t4);
|
||||
StructuredBuffer<TopingInst> topingInstances : register(t5);
|
||||
StructuredBuffer<TopingBLASGroup> topingGroups : register(t7);
|
||||
|
||||
// Output: raw float3 positions (12 bytes each)
|
||||
RWByteAddressBuffer blasPositions : register(u0);
|
||||
|
||||
// Push-constant block (bound at b999).
struct TopingBLASPush
{
    // Number of output vertices; threads at or beyond this index exit early.
    uint totalVertices;

    // Number of valid entries in the group table.
    uint groupCount;

    // Ten padding dwords: together with the two fields above the block is
    // 12 uints in total.
    uint pad0;
    uint pad1;
    uint pad2;
    uint pad3;
    uint pad4;
    uint pad5;
    uint pad6;
    uint pad7;
    uint pad8;
    uint pad9;
};
|
||||
[[vk::push_constant]] ConstantBuffer<TopingBLASPush> push : register(b999);
|
||||
|
||||
// Writes a float3 into blasPositions as three consecutive 32-bit words
// starting at byteOffset (x at +0, y at +4, z at +8).
//
// byteOffset: destination offset in bytes; callers pass a multiple of 12.
// v:          value to store; its bit pattern is written unchanged.
void storeFloat3(uint byteOffset, float3 v) {
    // A single 3-component store replaces three scalar stores; the bytes
    // written are the same.
    blasPositions.Store3(byteOffset, asuint(v));
}
|
||||
|
||||
// Compute entry point: one thread per output BLAS vertex.
//
// Each thread maps its global vertex index to a (group, instance, template
// vertex) triple through the prefix-sum group table, adds the instance world
// position to the template's local position, and writes the resulting float3
// at byte offset globalIdx * 12 in blasPositions.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(64, 1, 1)]
void main(uint3 DTid : SV_DispatchThreadID) {
    uint globalIdx = DTid.x;

    // Threads beyond the requested vertex count have nothing to write.
    if (globalIdx >= push.totalVertices) return;

    // Locate the owning group: the last entry whose prefix-sum start is
    // <= globalIdx. A linear scan is used because the table is small (the
    // C++ side declares MAX_TOPING_BLAS_GROUPS = 64).
    uint groupIdx = 0;
    for (uint g = 1; g < push.groupCount; g++) {
        if (globalIdx >= topingGroups[g].globalVertexOffset)
            groupIdx = g;
        else
            break;
    }

    TopingBLASGroup grp = topingGroups[groupIdx];

    // Guard the divide/modulo below: a zero vertexCount (malformed or empty
    // group table) would otherwise produce undefined indices.
    if (grp.vertexCount == 0) return;

    // Map the global vertex to (instance, local vertex) within this group.
    uint localIdx    = globalIdx - grp.globalVertexOffset;
    uint instanceIdx = grp.instanceOffset + localIdx / grp.vertexCount;
    uint vertexIdx   = grp.vertexTemplateOffset + localIdx % grp.vertexCount;

    TopingVtx  vtx  = topingVertices[vertexIdx];
    TopingInst inst = topingInstances[instanceIdx];

    // Template positions are voxel-local, so world position is a plain offset.
    float3 worldPos = inst.worldPos + vtx.position;

    // Output is tightly packed float3: 12 bytes per vertex.
    storeFloat3(globalIdx * 12, worldPos);
}
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -27,7 +27,7 @@ struct GPUChunkInfo {
|
|||
uint32_t pad2[2]; // pad to 112 bytes (7 × float4)
|
||||
};
|
||||
|
||||
// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
|
||||
// ── Voxel Renderer (GPU mesh pipeline) ──────────────────────────
|
||||
class VoxelRenderer {
|
||||
friend class VoxelRenderPath;
|
||||
public:
|
||||
|
|
@ -58,8 +58,6 @@ public:
|
|||
uint32_t getDrawCalls() const { return drawCalls_; }
|
||||
uint32_t getChunkCount() const { return chunkCount_; }
|
||||
bool isInitialized() const { return initialized_; }
|
||||
bool isGpuCulling() const { return gpuCullingEnabled_; }
|
||||
bool isMdiEnabled() const { return mdiEnabled_; }
|
||||
|
||||
bool debugFaceColors_ = false;
|
||||
bool debugBlend_ = false;
|
||||
|
|
@ -67,7 +65,6 @@ public:
|
|||
|
||||
private:
|
||||
void createPipeline();
|
||||
void rebuildMegaBuffer(VoxelWorld& world);
|
||||
|
||||
wi::graphics::GraphicsDevice* device_ = nullptr;
|
||||
|
||||
|
|
@ -75,8 +72,6 @@ private:
|
|||
wi::graphics::Shader vertexShader_;
|
||||
wi::graphics::Shader pixelShader_;
|
||||
wi::graphics::PipelineState pso_;
|
||||
wi::graphics::Shader cullShader_; // Frustum cull compute shader
|
||||
|
||||
// Shaders & Pipeline (topings, Phase 4)
|
||||
wi::graphics::Shader topingVS_;
|
||||
wi::graphics::Shader topingPS_;
|
||||
|
|
@ -93,6 +88,29 @@ private:
|
|||
std::vector<TopingGPUInst> topingGpuInsts_;
|
||||
mutable uint32_t topingDrawCalls_ = 0;
|
||||
|
||||
// ── Toping draw groups (shared between render + BLAS CS) ─────
|
||||
// One batch of toping instances that share the same (type, variant) mesh.
struct TopingDrawGroup {
    uint16_t type;
    uint16_t variant;
    // Range of instances belonging to this group.
    uint32_t instanceOffset;
    uint32_t instanceCount;
    // Vertex template slice, taken from TopingDef::variants[].
    uint32_t vertexTemplateOffset;
    uint32_t vertexCount;
};
|
||||
std::vector<TopingDrawGroup> topingDrawGroups_; // built in uploadTopingData, reused in renderTopings
|
||||
|
||||
// ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
|
||||
wi::graphics::Shader topingBLASShader_; // voxelTopingBLASCS compute shader
|
||||
// CPU mirror of the shader-side TopingBLASGroup record read by
// voxelTopingBLASCS (the shader documents it as 20 bytes). Field order
// and sizes must stay in sync with the HLSL struct.
struct TopingBLASGroupGPU {
    uint32_t globalVertexOffset;   // prefix sum of total vertices before this group
    uint32_t vertexTemplateOffset; // offset into topingVertices (t4)
    uint32_t vertexCount;          // vertices per instance
    uint32_t instanceOffset;       // offset into topingInstances (t5)
    uint32_t instanceCount;        // instances in this group
};
// Enforce the layout contract at compile time instead of by comment only.
static_assert(sizeof(TopingBLASGroupGPU) == 20,
              "TopingBLASGroupGPU must be 20 bytes to match the HLSL TopingBLASGroup");
static_assert(alignof(TopingBLASGroupGPU) == 4,
              "TopingBLASGroupGPU must be tightly packed 32-bit fields");
|
||||
wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer<TopingBLASGroupGPU>, SRV t7
|
||||
std::vector<TopingBLASGroupGPU> topingBLASGroupsGPU_; // CPU staging for group table
|
||||
mutable uint32_t topingBLASTotalVertices_ = 0;
|
||||
static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64;
|
||||
void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const;
|
||||
|
||||
// Shaders & Pipeline (smooth surfaces, Phase 5)
|
||||
wi::graphics::Shader smoothVS_;
|
||||
wi::graphics::Shader smoothPS_;
|
||||
|
|
@ -114,9 +132,7 @@ private:
|
|||
// ── Mega-buffer architecture (Phase 2) ──────────────────────
|
||||
static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
|
||||
static constexpr uint32_t MAX_CHUNKS = 2048;
|
||||
static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk
|
||||
|
||||
wi::graphics::GPUBuffer megaQuadBuffer_; // StructuredBuffer<PackedQuad>, SRV t0
|
||||
wi::graphics::GPUBuffer chunkInfoBuffer_; // StructuredBuffer<GPUChunkInfo>, SRV t2
|
||||
|
||||
// CPU-side tracking
|
||||
|
|
@ -127,27 +143,9 @@ private:
|
|||
};
|
||||
std::vector<ChunkSlot> chunkSlots_;
|
||||
std::vector<GPUChunkInfo> cpuChunkInfo_;
|
||||
std::vector<PackedQuad> cpuMegaQuads_; // CPU staging for mega-buffer
|
||||
uint32_t chunkCount_ = 0;
|
||||
bool megaBufferDirty_ = true;
|
||||
|
||||
// ── Indirect draw (Phase 2 MDI) ─────────────────────────────
|
||||
// Wicked Engine's DrawInstancedIndirectCount command signature includes a
|
||||
// push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
|
||||
// Total stride = 4 + 16 = 20 bytes per draw entry.
|
||||
struct IndirectDrawArgs {
|
||||
uint32_t pushConstant; // written to b999[0] by ExecuteIndirect
|
||||
uint32_t vertexCountPerInstance;
|
||||
uint32_t instanceCount;
|
||||
uint32_t startVertexLocation;
|
||||
uint32_t startInstanceLocation;
|
||||
};
|
||||
wi::graphics::GPUBuffer indirectArgsBuffer_; // IndirectDrawArgs[MAX_DRAWS]
|
||||
wi::graphics::GPUBuffer drawCountBuffer_; // uint32_t[1]
|
||||
mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
|
||||
bool gpuCullingEnabled_ = true; // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
|
||||
bool mdiEnabled_ = true; // Phase 2.2: MDI rendering with CPU-filled indirect args
|
||||
|
||||
// Constants buffer (must match HLSL VoxelCB)
|
||||
struct VoxelConstants {
|
||||
XMFLOAT4X4 viewProjection;
|
||||
|
|
@ -184,7 +182,6 @@ private:
|
|||
wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
|
||||
wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
|
||||
bool gpuMesherAvailable_ = false;
|
||||
bool gpuMeshEnabled_ = true; // Use GPU meshing instead of CPU greedy
|
||||
mutable uint32_t gpuMeshQuadCount_ = 0; // Readback from previous frame (1-frame delay)
|
||||
mutable uint32_t voxelDataCapacity_ = 0; // Current capacity of voxelDataBuffer_ (in uint32s)
|
||||
mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
|
||||
|
|
@ -216,9 +213,8 @@ private:
|
|||
mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_; // sequential indices for toping BLAS
|
||||
mutable uint32_t topingBLASPositionCapacity_ = 0; // pre-allocated capacity (vertices)
|
||||
mutable uint32_t topingBLASIndexCount_ = 0; // size of toping index buffer
|
||||
mutable bool topingBLASDirty_ = false; // deferred BLAS position upload + rebuild
|
||||
mutable bool topingBLASDirty_ = false; // GPU compute BLAS extract + rebuild needed
|
||||
mutable uint32_t topingBLASVertexCount_ = 0; // actual vertex count for current frame
|
||||
std::vector<float> topingBLASPositionStaging_; // CPU staging for deferred upload
|
||||
static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
|
||||
mutable bool rtAvailable_ = false; // GPU supports RT
|
||||
mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild
|
||||
|
|
@ -252,14 +248,6 @@ private:
|
|||
const wi::graphics::Texture& renderTarget,
|
||||
const wi::graphics::Texture& normalTarget) const;
|
||||
|
||||
// Benchmark state machine: runs once after world gen
|
||||
enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
|
||||
mutable BenchState benchState_ = BenchState::IDLE;
|
||||
mutable float cpuMeshTimeMs_ = 0.0f;
|
||||
mutable uint32_t gpuBaselineQuads_ = 0;
|
||||
|
||||
void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
|
||||
void readbackGpuMeshBenchmark() const;
|
||||
void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
|
||||
ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
|
||||
ProfileAccum* profDispatch = nullptr) const;
|
||||
|
|
@ -290,7 +278,7 @@ private:
|
|||
public:
|
||||
float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
|
||||
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
|
||||
bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
|
||||
bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
|
||||
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
|
||||
|
||||
// Phase 4: Toping rendering
|
||||
|
|
@ -364,11 +352,11 @@ private:
|
|||
// Wind animation (continuous, always running)
|
||||
float windTime_ = 0.0f;
|
||||
|
||||
// Animated terrain (wave effect at 60 Hz, toggled with F3)
|
||||
// Animated terrain (wave effect at 30 Hz, toggled with F3)
|
||||
bool animatedTerrain_ = false;
|
||||
float animTime_ = 0.0f;
|
||||
float animAccum_ = 0.0f;
|
||||
static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz
|
||||
static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz
|
||||
|
||||
wi::graphics::Texture voxelRT_;
|
||||
wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue