diff --git a/shaders/voxelTopingBLASCS.hlsl b/shaders/voxelTopingBLASCS.hlsl new file mode 100644 index 0000000..18269df --- /dev/null +++ b/shaders/voxelTopingBLASCS.hlsl @@ -0,0 +1,80 @@ +// BVLE Voxels - Toping BLAS Position Extraction Compute Shader +// Replaces the 196ms CPU loop that computed world-space toping positions. +// Reads vertex templates (t4) + instance positions (t5) + group table (t7), +// writes flat float3 positions (u0) for DXR BLAS construction. +// +// One thread per output vertex. Group table maps global vertex index to +// the correct (instance, local vertex) pair via prefix-sum offsets. + +#include "voxelCommon.hlsli" + +// Toping mesh vertex (must match C++ TopingVertex, 24 bytes) +struct TopingVtx { + float3 position; // local to voxel [0,1]^3 + float3 normal; // unused here, but struct must match +}; + +// Toping instance (just the world position, 12 bytes) +struct TopingInst { + float3 worldPos; +}; + +// Draw group descriptor for BLAS extraction (must match C++ TopingBLASGroupGPU, 20 bytes) +struct TopingBLASGroup { + uint globalVertexOffset; // prefix sum: first global vertex index for this group + uint vertexTemplateOffset; // offset into topingVertices (t4) + uint vertexCount; // vertices per instance (mesh slice count) + uint instanceOffset; // offset into topingInstances (t5) + uint instanceCount; // number of instances in this group +}; + +StructuredBuffer<TopingVtx> topingVertices : register(t4); +StructuredBuffer<TopingInst> topingInstances : register(t5); +StructuredBuffer<TopingBLASGroup> topingGroups : register(t7); + +// Output: raw float3 positions (12 bytes each) +RWByteAddressBuffer blasPositions : register(u0); + +// Push constants (b999) +struct TopingBLASPush { + uint totalVertices; + uint groupCount; + uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9; +}; +[[vk::push_constant]] ConstantBuffer<TopingBLASPush> push : register(b999); + +void storeFloat3(uint byteOffset, float3 v) { + blasPositions.Store(byteOffset, asuint(v.x)); +
blasPositions.Store(byteOffset + 4, asuint(v.y)); + blasPositions.Store(byteOffset + 8, asuint(v.z)); +} + +[RootSignature(VOXEL_ROOTSIG)] +[numthreads(64, 1, 1)] +void main(uint3 DTid : SV_DispatchThreadID) { + uint globalIdx = DTid.x; + if (globalIdx >= push.totalVertices) return; + + // Find which group this vertex belongs to: the last group whose globalVertexOffset is <= globalIdx (linear scan that stops at the first larger offset, so it relies on ascending prefix-sum offsets; max ~32 groups) + uint groupIdx = 0; + for (uint g = 1; g < push.groupCount; g++) { + if (globalIdx >= topingGroups[g].globalVertexOffset) + groupIdx = g; + else + break; + } + + TopingBLASGroup grp = topingGroups[groupIdx]; + + // Map global vertex to (instance, local vertex) within this group (instance-major: each instance emits vertexCount consecutive outputs; vertexCount must be non-zero because it is the divisor below) + uint localIdx = globalIdx - grp.globalVertexOffset; + uint instanceIdx = grp.instanceOffset + localIdx / grp.vertexCount; + uint vertexIdx = grp.vertexTemplateOffset + localIdx % grp.vertexCount; + + TopingVtx vtx = topingVertices[vertexIdx]; + TopingInst inst = topingInstances[instanceIdx]; + + float3 worldPos = inst.worldPos + vtx.position; + + storeFloat3(globalIdx * 12, worldPos); +} diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index 42e267f..eed1f8b 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -28,15 +28,6 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) { } generateTextures(); - // Create mega quad buffer (SRV for vertex pulling) - GPUBufferDesc megaDesc; - megaDesc.size = MEGA_BUFFER_CAPACITY * sizeof(PackedQuad); - megaDesc.bind_flags = BindFlag::SHADER_RESOURCE; - megaDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; - megaDesc.stride = sizeof(PackedQuad); - megaDesc.usage = Usage::DEFAULT; - device_->CreateBuffer(&megaDesc, nullptr, &megaQuadBuffer_); - // Create chunk info buffer (SRV for VS chunk lookup) GPUBufferDesc infoDesc; infoDesc.size = MAX_CHUNKS * sizeof(GPUChunkInfo); @@ -46,25 +37,6 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) { infoDesc.usage = Usage::DEFAULT; device_->CreateBuffer(&infoDesc, nullptr, &chunkInfoBuffer_); - // Create
indirect args buffer (for DrawInstancedIndirectCount, up to 6 draws per chunk) - // UAV bind flag needed for GPU cull compute shader to write args - GPUBufferDesc argsDesc; - argsDesc.size = MAX_DRAWS * sizeof(IndirectDrawArgs); - argsDesc.bind_flags = BindFlag::UNORDERED_ACCESS; - argsDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED | ResourceMiscFlag::INDIRECT_ARGS; - argsDesc.stride = sizeof(IndirectDrawArgs); - argsDesc.usage = Usage::DEFAULT; - device_->CreateBuffer(&argsDesc, nullptr, &indirectArgsBuffer_); - - // Create draw count buffer (single uint32, raw for RWByteAddressBuffer) - // UAV bind flag needed for GPU cull compute shader atomic counter - GPUBufferDesc countDesc; - countDesc.size = sizeof(uint32_t); - countDesc.bind_flags = BindFlag::UNORDERED_ACCESS; - countDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW | ResourceMiscFlag::INDIRECT_ARGS; - countDesc.usage = Usage::DEFAULT; - device_->CreateBuffer(&countDesc, nullptr, &drawCountBuffer_); - // ── GPU Timestamp Queries ────────────────────────────────────── GPUQueryHeapDesc queryDesc; queryDesc.type = GpuQueryType::TIMESTAMP; @@ -197,6 +169,22 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) { rtAvailable_ = false; wi::backlog::post("VoxelRenderer: RT available but BLAS extraction shader failed", wi::backlog::LogLevel::Warning); } + // ── Toping BLAS CS (replaces 196ms CPU loop) ───────────────── + wi::renderer::LoadShader(ShaderStage::CS, topingBLASShader_, "voxel/voxelTopingBLASCS.cso"); + if (topingBLASShader_.IsValid()) { + // Pre-allocate small group table buffer (max 64 groups × 20 bytes = 1.25 KB) + GPUBufferDesc grpDesc; + grpDesc.size = MAX_TOPING_BLAS_GROUPS * sizeof(TopingBLASGroupGPU); + grpDesc.bind_flags = BindFlag::SHADER_RESOURCE; + grpDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; + grpDesc.stride = sizeof(TopingBLASGroupGPU); + grpDesc.usage = Usage::DEFAULT; + device_->CreateBuffer(&grpDesc, nullptr, &topingBLASGroupBuffer_); + 
wi::backlog::post("VoxelRenderer: toping BLAS CS available"); + } else { + wi::backlog::post("VoxelRenderer: toping BLAS CS failed", wi::backlog::LogLevel::Warning); + } + // ── RT Shadows + AO (Phase 6.2 + 6.3) ──────────────────────── wi::renderer::LoadShader(ShaderStage::CS, shadowShader_, "voxel/voxelShadowCS.cso", wi::graphics::ShaderModel::SM_6_5); @@ -213,20 +201,16 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) { wi::backlog::post("VoxelRenderer: RT not available (GPU does not support ray tracing)"); } - cpuMegaQuads_.reserve(MEGA_BUFFER_CAPACITY); cpuChunkInfo_.reserve(MAX_CHUNKS); chunkSlots_.reserve(MAX_CHUNKS); - cpuIndirectArgs_.reserve(MAX_CHUNKS); initialized_ = true; - wi::backlog::post("VoxelRenderer: initialized (mega-buffer: " - + std::to_string(MEGA_BUFFER_CAPACITY) + " quads capacity)"); + wi::backlog::post("VoxelRenderer: initialized (GPU mesh pipeline)"); } void VoxelRenderer::shutdown() { chunkSlots_.clear(); cpuChunkInfo_.clear(); - cpuMegaQuads_.clear(); initialized_ = false; } @@ -250,19 +234,10 @@ void VoxelRenderer::createPipeline() { // Load shaders wi::renderer::LoadShader(ShaderStage::VS, vertexShader_, "voxel/voxelVS.cso"); wi::renderer::LoadShader(ShaderStage::PS, pixelShader_, "voxel/voxelPS.cso"); - wi::renderer::LoadShader(ShaderStage::CS, cullShader_, "voxel/voxelCullCS.cso"); - if (!vertexShader_.IsValid() || !pixelShader_.IsValid()) { wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error); return; } - if (cullShader_.IsValid()) { - gpuCullingEnabled_ = true; - wi::backlog::post("VoxelRenderer: GPU cull compute shader enabled"); - } else { - gpuCullingEnabled_ = false; - wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU fallback", wi::backlog::LogLevel::Warning); - } // Pipeline: backface cull, depth test, opaque blend, triangle list PipelineStateDesc psoDesc; @@ -406,74 +381,7 @@ void VoxelRenderer::generateTextures() { // ── Mega-buffer rebuild 
───────────────────────────────────────── // Packs all chunk quads contiguously into a single buffer. -// Simple strategy: full rebuild whenever any chunk is dirty. - -void VoxelRenderer::rebuildMegaBuffer(VoxelWorld& world) { - cpuMegaQuads_.clear(); - chunkSlots_.clear(); - cpuChunkInfo_.clear(); - - // Position → index map for neighbor lookup - std::unordered_map posToIdx; - auto posKey = [](const ChunkPos& p) -> uint64_t { - return ((uint64_t)(uint16_t)p.x) | ((uint64_t)(uint16_t)p.y << 16) | ((uint64_t)(uint16_t)p.z << 32); - }; - - uint32_t offset = 0; - float debugFlag = debugFaceColors_ ? 1.0f : 0.0f; - - world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { - if (chunk.quadCount == 0) return; - if (offset + chunk.quadCount > MEGA_BUFFER_CAPACITY) return; - - uint32_t curIdx = (uint32_t)chunkSlots_.size(); - ChunkSlot slot; - slot.pos = pos; - slot.quadOffset = offset; - slot.quadCount = chunk.quadCount; - chunkSlots_.push_back(slot); - - GPUChunkInfo info = {}; - info.worldPos = XMFLOAT4( - (float)(pos.x * CHUNK_SIZE), - (float)(pos.y * CHUNK_SIZE), - (float)(pos.z * CHUNK_SIZE), - debugFlag - ); - info.quadOffset = offset; - info.quadCount = chunk.quadCount; - for (int f = 0; f < 6; f++) { - info.faceOffsets[f] = chunk.faceOffsets[f]; - info.faceCounts[f] = chunk.faceCounts[f]; - info.neighbors[f] = 0xFFFFFFFF; - } - cpuChunkInfo_.push_back(info); - posToIdx[posKey(pos)] = curIdx; - - cpuMegaQuads_.insert(cpuMegaQuads_.end(), chunk.quads.begin(), chunk.quads.end()); - offset += chunk.quadCount; - }); - - // Fill neighbor indices - static const int offsets[6][3] = { - {1,0,0}, {-1,0,0}, {0,1,0}, {0,-1,0}, {0,0,1}, {0,0,-1} - }; - for (uint32_t i = 0; i < (uint32_t)chunkSlots_.size(); i++) { - const auto& pos = chunkSlots_[i].pos; - for (int f = 0; f < 6; f++) { - ChunkPos npos = { pos.x + offsets[f][0], pos.y + offsets[f][1], pos.z + offsets[f][2] }; - auto it = posToIdx.find(posKey(npos)); - if (it != posToIdx.end()) { - 
cpuChunkInfo_[i].neighbors[f] = it->second; - } - } - } - - chunkCount_ = (uint32_t)chunkSlots_.size(); - totalQuads_ = offset; -} - -// Build chunkInfoBuffer without CPU meshing (for GPU mesh path) +// Build chunkInfoBuffer (chunk metadata for GPU mesh path) void VoxelRenderer::rebuildChunkInfoOnly(VoxelWorld& world) { chunkSlots_.clear(); cpuChunkInfo_.clear(); @@ -530,175 +438,24 @@ void VoxelRenderer::rebuildChunkInfoOnly(VoxelWorld& world) { void VoxelRenderer::updateMeshes(VoxelWorld& world) { if (!device_) return; - // GPU mesh path: skip CPU meshing entirely, just rebuild chunk info - if (gpuMeshEnabled_ && gpuMesherAvailable_) { - bool anyDirty = false; - world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { - if (chunk.dirty) { anyDirty = true; chunk.dirty = false; } - }); - if (anyDirty || megaBufferDirty_) { - rebuildChunkInfoOnly(world); - // If cache wasn't already filled by fused regen+pack, mark for repack - if (!gpuMeshDirty_) { - // Non-fused dirty (e.g. initial load): need both repack and GPU update - voxelCacheDirty_ = true; - gpuMeshDirty_ = true; - } - // else: fused path already set gpuMeshDirty_=true, cache is clean - chunkInfoDirty_ = true; - megaBufferDirty_ = false; - } - return; - } - - // CPU meshing path (fallback) - // Collect dirty chunks for parallel meshing - std::vector dirtyChunks; + // Rebuild chunk info when any chunks are dirty + bool anyDirty = false; world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { - if (chunk.dirty) dirtyChunks.push_back(&chunk); + if (chunk.dirty) { anyDirty = true; chunk.dirty = false; } }); - bool anyDirty = !dirtyChunks.empty(); - - // Parallel CPU greedy meshing via wi::jobsystem - auto cpuStart = std::chrono::high_resolution_clock::now(); - if (anyDirty) { - wi::jobsystem::context ctx; - wi::jobsystem::Dispatch(ctx, (uint32_t)dirtyChunks.size(), 1, - [&dirtyChunks, &world](wi::jobsystem::JobArgs args) { - VoxelMesher::meshChunk(*dirtyChunks[args.jobIndex], world); - }); - 
wi::jobsystem::Wait(ctx); - } - auto cpuEnd = std::chrono::high_resolution_clock::now(); - - if (anyDirty) { - cpuMeshTimeMs_ = std::chrono::duration<float, std::milli>(cpuEnd - cpuStart).count(); - // Trigger GPU benchmark on next render frame - if (gpuMesherAvailable_ && benchState_ == BenchState::IDLE) { - benchState_ = BenchState::DISPATCH; - } - } - if (anyDirty || megaBufferDirty_) { - rebuildMegaBuffer(world); + rebuildChunkInfoOnly(world); + // If cache wasn't already filled by fused regen+pack, mark for repack + if (!gpuMeshDirty_) { + voxelCacheDirty_ = true; + gpuMeshDirty_ = true; + } + chunkInfoDirty_ = true; megaBufferDirty_ = false; } } -// ── GPU Mesh Benchmark (Phase 2.4) ────────────────────────────── -// Dispatches the baseline 1x1 GPU mesher for ALL chunks and measures timing. -// State machine: DISPATCH (frame N) → READBACK (frame N+1) → DONE. - -void VoxelRenderer::dispatchGpuMeshBenchmark(CommandList cmd, const VoxelWorld& world) const { - auto* dev = device_; - - // Zero the quad counter - uint32_t zero = 0; - dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t)); - - // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer - GPUBarrier preBarriers[] = { - GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(preBarriers, 2, cmd); - - dev->BindComputeShader(&meshShader_, cmd); - - // GPU timestamp: mesh begin - dev->QueryEnd(&timestampHeap_, TS_MESH_BEGIN, cmd); - - // Dispatch for each chunk - uint32_t chunkIdx = 0; - world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { - // Pack voxel data: 32^3 voxels → 16384 uint32s (2 voxels per uint) - std::vector<uint32_t> packed(CHUNK_VOLUME / 2, 0); - for (int i = 0; i < CHUNK_VOLUME; i++) { - uint32_t v = chunk.voxels[i].packed; - if (i & 1) - packed[i >> 1] |= (v << 16); - else - packed[i >> 1] = v; - } - - // Upload voxel data (re-uses
the single-chunk buffer) - dev->UpdateBuffer(&voxelDataBuffer_, packed.data(), cmd, - packed.size() * sizeof(uint32_t)); - - // Bind resources (after BindComputeShader, so PushConstants targets compute) - dev->BindResource(&voxelDataBuffer_, 0, cmd); - dev->BindUAV(&gpuQuadBuffer_, 0, cmd); - dev->BindUAV(&gpuQuadCounter_, 1, cmd); - - // Push constants for this chunk - struct MeshPush { - uint32_t chunkIndex; - uint32_t voxelBufferOffset; - uint32_t quadBufferOffset; - uint32_t maxOutputQuads; - uint32_t pad[8]; - }; - MeshPush pushData = {}; - pushData.chunkIndex = chunkIdx; - pushData.voxelBufferOffset = 0; // single-chunk buffer, always at offset 0 - pushData.quadBufferOffset = 0; // all chunks share global atomic counter - pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY; - dev->PushConstants(&pushData, sizeof(pushData), cmd); - - // Dispatch: 32/8 = 4 groups per axis → 64 groups total - dev->Dispatch(4, 4, 4, cmd); - - chunkIdx++; - }); - - // GPU timestamp: mesh end - dev->QueryEnd(&timestampHeap_, TS_MESH_END, cmd); - - // Copy quad counter to readback buffer - GPUBarrier postBarrier = GPUBarrier::Buffer( - &gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC); - dev->Barrier(&postBarrier, 1, cmd); - dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd); - - // Resolve timestamps - dev->QueryResolve(&timestampHeap_, TS_MESH_BEGIN, 2, &timestampReadback_, - TS_MESH_BEGIN * sizeof(uint64_t), cmd); - - benchState_ = BenchState::READBACK; -} - -void VoxelRenderer::readbackGpuMeshBenchmark() const { - // Read quad count from readback buffer - uint32_t* countData = (uint32_t*)meshCounterReadback_.mapped_data; - if (countData) { - gpuBaselineQuads_ = *countData; - } - - // Read GPU mesh timestamps - uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data; - if (tsData) { - double freq = (double)device_->GetTimestampFrequency(); - if (freq > 0.0 && tsData[TS_MESH_END] > tsData[TS_MESH_BEGIN]) { - gpuMeshTimeMs_ =
(float)((double)(tsData[TS_MESH_END] - tsData[TS_MESH_BEGIN]) / freq * 1000.0); - } - } - - // Log benchmark results - char msg[256]; - snprintf(msg, sizeof(msg), - "=== MESH BENCHMARK ===\n" - " CPU greedy: %.2f ms, %u quads (%u chunks)\n" - " GPU baseline: %.3f ms, %u quads (1x1, no merge)\n" - " Ratio quads: %.1fx more (GPU baseline vs CPU greedy)", - cpuMeshTimeMs_, totalQuads_, chunkCount_, - gpuMeshTimeMs_, gpuBaselineQuads_, - totalQuads_ > 0 ? (float)gpuBaselineQuads_ / totalQuads_ : 0.0f); - wi::backlog::post(msg); - - benchState_ = BenchState::DONE; -} - -// ── GPU Mesh Dispatch (production path) ───────────────────────── +// ── GPU Mesh Dispatch ──────────────────────────────────────────── // Dispatches GPU mesher for ALL chunks every frame. Replaces CPU greedy meshing. // Uses the atomic quad counter for 1-frame-delayed readback of total quad count. @@ -1004,6 +761,63 @@ void VoxelRenderer::dispatchBLASExtract(CommandList cmd) const { rtBlockyVertexCount_ = quadCount * 6; } +// ── GPU compute toping BLAS position extraction ───────────────── +// Replaces the 196ms CPU nested loop. Reads vertex templates (t4) + +// instance positions (t5) + group table (t7), writes positions to u0. 
+void VoxelRenderer::dispatchTopingBLASExtract(CommandList cmd) const { + if (!topingBLASShader_.IsValid() || !topingBLASGroupBuffer_.IsValid() || + !topingBLASPositionBuffer_.IsValid() || !topingVertexBuffer_.IsValid() || + !topingInstanceBuffer_.IsValid() || topingBLASTotalVertices_ == 0 || + topingBLASGroupsGPU_.empty() || topingBLASGroupsGPU_.size() > MAX_TOPING_BLAS_GROUPS) return; + + auto* dev = device_; + + // Upload group table (tiny: ~32 × 20 bytes = 640 bytes). The early-out above rejects tables larger than MAX_TOPING_BLAS_GROUPS, the capacity topingBLASGroupBuffer_ is created with in initialize(), so this copy cannot run past the end of the GPU buffer. + size_t groupUploadSize = topingBLASGroupsGPU_.size() * sizeof(TopingBLASGroupGPU); + dev->UpdateBuffer(&topingBLASGroupBuffer_, + topingBLASGroupsGPU_.data(), cmd, groupUploadSize); + + // Pre-barriers + GPUBarrier preBarriers[] = { + GPUBarrier::Buffer(&topingBLASGroupBuffer_, + ResourceState::COPY_DST, ResourceState::SHADER_RESOURCE), + GPUBarrier::Buffer(&topingBLASPositionBuffer_, + ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(preBarriers, 2, cmd); + + // Bind compute shader + resources + dev->BindComputeShader(&topingBLASShader_, cmd); + dev->BindResource(&topingVertexBuffer_, 4, cmd); // t4 + dev->BindResource(&topingInstanceBuffer_, 5, cmd); // t5 + dev->BindResource(&topingBLASGroupBuffer_, 7, cmd); // t7 + dev->BindUAV(&topingBLASPositionBuffer_, 0, cmd); // u0 + + struct { + uint32_t totalVertices; + uint32_t groupCount; + uint32_t pad[10]; + } pushData = {}; + pushData.totalVertices = topingBLASTotalVertices_; + pushData.groupCount = (uint32_t)topingBLASGroupsGPU_.size(); + dev->PushConstants(&pushData, sizeof(pushData), cmd); + + // Dispatch: 64 threads per group + uint32_t threadGroups = (topingBLASTotalVertices_ + 63) / 64; + dev->Dispatch(threadGroups, 1, 1, cmd); + + // Post-barrier: UAV → SHADER_RESOURCE (for BLAS build) + GPUBarrier postBarriers[] = { + GPUBarrier::Buffer(&topingBLASPositionBuffer_, + ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + }; + dev->Barrier(postBarriers, 1, cmd); + + rtTopingVertexCount_ = topingBLASTotalVertices_; + rtDirty_ = true; + topingBLASDirty_ =
false; +} + void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { if (!rtAvailable_) return; @@ -1477,14 +1291,12 @@ void VoxelRenderer::render( auto* dev = device_; - // ── GPU Mesh path: quads already dispatched in Render(), just draw ── - if (gpuMeshEnabled_ && gpuMesherAvailable_) { - // Upload chunk info only when chunks changed - if (!cpuChunkInfo_.empty() && chunkInfoDirty_) { - dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd, - cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); - chunkInfoDirty_ = false; - } + // Upload chunk info only when chunks changed + if (!cpuChunkInfo_.empty() && chunkInfoDirty_) { + dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd, + cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); + chunkInfoDirty_ = false; + } // Per-frame constants VoxelConstants cb = {}; @@ -1580,402 +1392,11 @@ void VoxelRenderer::render( drawCalls_ = 1; } - dev->RenderPassEnd(cmd); - visibleChunks_ = chunkCount_; - return; - } - - // Upload mega-buffer and chunk info to GPU - if (!cpuMegaQuads_.empty()) { - dev->UpdateBuffer(&megaQuadBuffer_, cpuMegaQuads_.data(), cmd, - cpuMegaQuads_.size() * sizeof(PackedQuad)); - } - if (!cpuChunkInfo_.empty()) { - dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd, - cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); - } - - // Per-frame constants (with frustum planes for GPU cull shader) - VoxelConstants cb = {}; - XMMATRIX vpMatrix = camera.GetViewProjection(); - XMStoreFloat4x4(&cb.viewProjection, vpMatrix); - cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f); - cb.sunDirection = XMFLOAT4(-0.7f, -0.4f, -0.3f, 0.0f); // lower sun = longer cast shadows - cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f); - cb.chunkSize = (float)CHUNK_SIZE; - cb.textureTiling = 0.25f; - cb.blendEnabled = 0.0f; // Phase 3: blending disabled in CPU/MDI paths (no voxel data SRV) - cb.debugBlend = 0.0f; - cb.bleedMask = 0; - cb.resistBleedMask = 0; - cb.windTime = 
windTime_; - cb.chunkCount = chunkCount_; - extractFrustumPlanes(vpMatrix, cb.frustumPlanes); - dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb)); - - // Push constant structure (must be 48 bytes = 12 x uint32, matches b999) - struct VoxelPush { - uint32_t chunkIndex; - uint32_t quadOffset; - uint32_t flags; // bit 0: 1=MDI mode, 0=CPU mode - uint32_t pad[9]; - }; - - visibleChunks_ = 0; - drawCalls_ = 0; - - // ── GPU Cull + MDI path ──────────────────────────────────────── - if (gpuCullingEnabled_) { - // DX12 buffer decay: all buffers return to COMMON after ExecuteCommandLists. - // So every frame starts clean — no cross-frame state tracking needed. - - // Zero the draw count via UpdateBuffer (COMMON → COPY_DST implicit promotion) - uint32_t zero = 0; - dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t)); - - // Barriers to UAV for compute shader writes: - // - drawCountBuffer_: COPY_DST → UAV (was promoted to COPY_DST by UpdateBuffer) - // - indirectArgsBuffer_: COMMON → UAV (explicit, required because COMMON can't - // be implicitly promoted to UAV) - GPUBarrier preBarriers[] = { - GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(preBarriers, 2, cmd); - - // Timestamp: cull begin - dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd); - - // Dispatch GPU frustum + backface cull compute shader - dev->BindComputeShader(&cullShader_, cmd); - dev->BindConstantBuffer(&constantBuffer_, 0, cmd); - dev->BindResource(&chunkInfoBuffer_, 2, cmd); - dev->BindUAV(&indirectArgsBuffer_, 0, cmd); - dev->BindUAV(&drawCountBuffer_, 1, cmd); - dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd); - - // Timestamp: cull end - dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd); - - // Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount - GPUBarrier postBarriers[] = { -
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), - GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), - }; - dev->Barrier(postBarriers, 2, cmd); - - // ── Render pass (MRT: color + normals + depth) ────────────── - RenderPassImage rp[] = { - RenderPassImage::RenderTarget( - &renderTarget, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::SHADER_RESOURCE, - ResourceState::SHADER_RESOURCE - ), - RenderPassImage::RenderTarget( - &normalTarget, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::SHADER_RESOURCE, - ResourceState::SHADER_RESOURCE - ), - RenderPassImage::DepthStencil( - &depthBuffer, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::DEPTHSTENCIL, - ResourceState::DEPTHSTENCIL, - ResourceState::DEPTHSTENCIL - ), - }; - dev->RenderPassBegin(rp, 3, cmd); - - Viewport vp; - vp.width = (float)renderTarget.GetDesc().width; - vp.height = (float)renderTarget.GetDesc().height; - vp.min_depth = 0.0f; - vp.max_depth = 1.0f; - dev->BindViewports(1, &vp, cmd); - - Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; - dev->BindScissorRects(1, &scissor, cmd); - - dev->BindPipelineState(&pso_, cmd); - dev->BindConstantBuffer(&constantBuffer_, 0, cmd); - dev->BindResource(&megaQuadBuffer_, 0, cmd); - dev->BindResource(&textureArray_, 1, cmd); - dev->BindResource(&chunkInfoBuffer_, 2, cmd); - dev->BindSampler(&sampler_, 0, cmd); - - // IMPORTANT: PushConstants must be called AFTER BindPipelineState. - // Wicked Engine's PushConstants uses SetGraphicsRoot32BitConstants only - // when active_pso is set. If called before (with active_cs from compute), - // it would set COMPUTE push constants instead of GRAPHICS ones. 
VoxelPush pushData = {}; - pushData.flags = 1; // MDI mode - dev->PushConstants(&pushData, sizeof(pushData), cmd); - - // Timestamp: draw begin - dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd); - - // Single MDI call: GPU cull shader filled the indirect args - dev->DrawInstancedIndirectCount( - &indirectArgsBuffer_, 0, - &drawCountBuffer_, 0, - MAX_DRAWS, cmd - ); - drawCalls_ = 1; - - // Timestamp: draw end - dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd); - - dev->RenderPassEnd(cmd); - - // Resolve timestamps for readback (results available next frame) - dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd); - - // Read back previous frame's timestamps (persistently mapped READBACK buffer) - uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data; - if (tsData) { - double freq = (double)dev->GetTimestampFrequency(); - if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) { - gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0); - } - if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) { - gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0); - } - } - - // GPU cull handles visibility counting — approximate from chunkCount - visibleChunks_ = chunkCount_; // exact count would require readback of drawCount - - return; - } - - // ── CPU frustum + backface cull (shared by MDI and per-face paths) ── - wi::primitive::Frustum frustum; - frustum.Create(camera.GetViewProjection()); - - // ── Phase 2.2: CPU-filled indirect args + MDI draw ────────────── - if (mdiEnabled_) { - // CPU cull: fill indirect args with visible face groups - cpuIndirectArgs_.clear(); - uint32_t cpuDrawCount = 0; - - for (uint32_t i = 0; i < chunkCount_; i++) { - const auto& slot = chunkSlots_[i]; - if (slot.quadCount == 0) continue; - - XMFLOAT3 aabbMin( - (float)(slot.pos.x * CHUNK_SIZE), - (float)(slot.pos.y * CHUNK_SIZE), - (float)(slot.pos.z * CHUNK_SIZE) - ); - XMFLOAT3
aabbMax( - aabbMin.x + CHUNK_SIZE, - aabbMin.y + CHUNK_SIZE, - aabbMin.z + CHUNK_SIZE - ); - wi::primitive::AABB aabb(aabbMin, aabbMax); - if (!frustum.CheckBoxFast(aabb)) continue; - - visibleChunks_++; - const auto& info = cpuChunkInfo_[i]; - - for (uint32_t f = 0; f < 6; f++) { - if (info.faceCounts[f] == 0) continue; - - bool backFacing = false; - switch (f) { - case 0: backFacing = (camera.Eye.x < aabbMin.x); break; - case 1: backFacing = (camera.Eye.x > aabbMax.x); break; - case 2: backFacing = (camera.Eye.y < aabbMin.y); break; - case 3: backFacing = (camera.Eye.y > aabbMax.y); break; - case 4: backFacing = (camera.Eye.z < aabbMin.z); break; - case 5: backFacing = (camera.Eye.z > aabbMax.z); break; - } - if (backFacing) continue; - - IndirectDrawArgs args = {}; - // Pack chunkIndex (low 16 bits) + faceIndex (high 16 bits) into push constant. - // The shader unpacks this to look up quadOffset from GPUChunkInfo. - // We do NOT use startVertexLocation because SV_VertexID may not include it - // reliably in ExecuteIndirect context. - args.pushConstant = i | (f << 16); - args.vertexCountPerInstance = info.faceCounts[f] * 6; - args.instanceCount = 1; - args.startVertexLocation = 0; - args.startInstanceLocation = 0; - cpuIndirectArgs_.push_back(args); - cpuDrawCount++; - } - } - - // Upload indirect args and draw count to GPU - // Note: no explicit barriers needed here. Buffers start in COMMON each frame - // (DX12 buffer decay after command list execution). COMMON is implicitly - // promoted to COPY_DST by UpdateBuffer, then to INDIRECT_ARGUMENT by - // DrawInstancedIndirectCount. This matches Phase 2.1 pattern (no barriers - // between UpdateBuffer and SRV usage for megaQuadBuffer_/chunkInfoBuffer_). 
- if (!cpuIndirectArgs_.empty()) { - dev->UpdateBuffer(&indirectArgsBuffer_, cpuIndirectArgs_.data(), cmd, - cpuIndirectArgs_.size() * sizeof(IndirectDrawArgs)); - } - dev->UpdateBuffer(&drawCountBuffer_, &cpuDrawCount, cmd, sizeof(uint32_t)); - - // ── Render pass (MRT: color + normals + depth) ────────────── - RenderPassImage rp[] = { - RenderPassImage::RenderTarget( - &renderTarget, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::SHADER_RESOURCE, - ResourceState::SHADER_RESOURCE - ), - RenderPassImage::RenderTarget( - &normalTarget, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::SHADER_RESOURCE, - ResourceState::SHADER_RESOURCE - ), - RenderPassImage::DepthStencil( - &depthBuffer, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::DEPTHSTENCIL, - ResourceState::DEPTHSTENCIL, - ResourceState::DEPTHSTENCIL - ), - }; - dev->RenderPassBegin(rp, 3, cmd); - - Viewport vp; - vp.width = (float)renderTarget.GetDesc().width; - vp.height = (float)renderTarget.GetDesc().height; - vp.min_depth = 0.0f; - vp.max_depth = 1.0f; - dev->BindViewports(1, &vp, cmd); - - Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; - dev->BindScissorRects(1, &scissor, cmd); - - dev->BindPipelineState(&pso_, cmd); - dev->BindConstantBuffer(&constantBuffer_, 0, cmd); - dev->BindResource(&megaQuadBuffer_, 0, cmd); - dev->BindResource(&textureArray_, 1, cmd); - dev->BindResource(&chunkInfoBuffer_, 2, cmd); - dev->BindSampler(&sampler_, 0, cmd); - - // MDI mode: VS uses binary search to find chunk from SV_VertexID - VoxelPush pushData = {}; - pushData.flags = 1; // MDI mode - dev->PushConstants(&pushData, sizeof(pushData), cmd); - - dev->DrawInstancedIndirectCount( - &indirectArgsBuffer_, 0, - &drawCountBuffer_, 0, - MAX_DRAWS, cmd - ); - drawCalls_ = 1; - - dev->RenderPassEnd(cmd); - return; - } - - // ── Phase 2.1 Fallback: per-face-group DrawInstanced ──────────── - RenderPassImage 
rp[] = { - RenderPassImage::RenderTarget( - &renderTarget, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::SHADER_RESOURCE, - ResourceState::SHADER_RESOURCE - ), - RenderPassImage::RenderTarget( - &normalTarget, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::SHADER_RESOURCE, - ResourceState::SHADER_RESOURCE - ), - RenderPassImage::DepthStencil( - &depthBuffer, - RenderPassImage::LoadOp::CLEAR, - RenderPassImage::StoreOp::STORE, - ResourceState::DEPTHSTENCIL, - ResourceState::DEPTHSTENCIL, - ResourceState::DEPTHSTENCIL - ), - }; - dev->RenderPassBegin(rp, 3, cmd); - - Viewport vp; - vp.width = (float)renderTarget.GetDesc().width; - vp.height = (float)renderTarget.GetDesc().height; - vp.min_depth = 0.0f; - vp.max_depth = 1.0f; - dev->BindViewports(1, &vp, cmd); - - Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; - dev->BindScissorRects(1, &scissor, cmd); - - dev->BindPipelineState(&pso_, cmd); - dev->BindConstantBuffer(&constantBuffer_, 0, cmd); - dev->BindResource(&megaQuadBuffer_, 0, cmd); - dev->BindResource(&textureArray_, 1, cmd); - dev->BindResource(&chunkInfoBuffer_, 2, cmd); - dev->BindSampler(&sampler_, 0, cmd); - - for (uint32_t i = 0; i < chunkCount_; i++) { - const auto& slot = chunkSlots_[i]; - if (slot.quadCount == 0) continue; - - XMFLOAT3 aabbMin( - (float)(slot.pos.x * CHUNK_SIZE), - (float)(slot.pos.y * CHUNK_SIZE), - (float)(slot.pos.z * CHUNK_SIZE) - ); - XMFLOAT3 aabbMax( - aabbMin.x + CHUNK_SIZE, - aabbMin.y + CHUNK_SIZE, - aabbMin.z + CHUNK_SIZE - ); - wi::primitive::AABB aabb(aabbMin, aabbMax); - if (!frustum.CheckBoxFast(aabb)) continue; - - visibleChunks_++; - const auto& info = cpuChunkInfo_[i]; - - for (uint32_t f = 0; f < 6; f++) { - if (info.faceCounts[f] == 0) continue; - - bool backFacing = false; - switch (f) { - case 0: backFacing = (camera.Eye.x < aabbMin.x); break; - case 1: backFacing = (camera.Eye.x > aabbMax.x); break; - case 2: backFacing = 
(camera.Eye.y < aabbMin.y); break; - case 3: backFacing = (camera.Eye.y > aabbMax.y); break; - case 4: backFacing = (camera.Eye.z < aabbMin.z); break; - case 5: backFacing = (camera.Eye.z > aabbMax.z); break; - } - if (backFacing) continue; - - VoxelPush pushData = {}; - pushData.chunkIndex = i; - pushData.quadOffset = slot.quadOffset + info.faceOffsets[f]; - pushData.flags = 0; // CPU mode - dev->PushConstants(&pushData, sizeof(pushData), cmd); - - dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd); - drawCalls_++; - } - } - dev->RenderPassEnd(cmd); + visibleChunks_ = chunkCount_; } + // ── Phase 4: Toping GPU upload + rendering ───────────────────── void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { @@ -2061,48 +1482,74 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { topingInstanceDirty_ = true; // deferred upload in Render() } - // ── Build toping BLAS position data ──────────────────────── - // Compute world-space positions for all toping instances. - // Buffer is pre-allocated once (no per-frame CreateBuffer), data uploaded - // in Render() via UpdateBuffer (deferred — needs CommandList). + // ── Build draw groups + BLAS group table ─────────────────── + // Draw groups are built once here from sorted instances and reused by: + // 1. renderTopings() — for instanced draw calls + // 2. dispatchTopingBLASExtract() — GPU compute shader fills BLAS positions + // This replaces the 196ms CPU loop that computed world-space positions. 
const auto& defs = topingSystem.getDefs(); - uint32_t totalTopingVerts = 0; - for (uint32_t i = 0; i < instCount; i++) { - const auto& si = topingSorted_[i]; - if (si.type >= defs.size()) continue; - totalTopingVerts += defs[si.type].variants[si.variant].count; - } - - if (totalTopingVerts > 0 && !verts.empty()) { - // Fill staging buffer with world-space positions - topingBLASPositionStaging_.resize(totalTopingVerts * 3); - uint32_t outIdx = 0; - for (uint32_t i = 0; i < instCount; i++) { - const auto& si = topingSorted_[i]; - if (si.type >= defs.size()) continue; - if (si.variant >= 16) continue; - const auto& slice = defs[si.type].variants[si.variant]; - for (uint32_t v = 0; v < slice.count; v++) { - if (slice.offset + v >= verts.size()) break; - if (outIdx >= totalTopingVerts) break; - const auto& vtx = verts[slice.offset + v]; - topingBLASPositionStaging_[outIdx * 3 + 0] = vtx.px + si.wx; - topingBLASPositionStaging_[outIdx * 3 + 1] = vtx.py + si.wy; - topingBLASPositionStaging_[outIdx * 3 + 2] = vtx.pz + si.wz; - outIdx++; + topingDrawGroups_.clear(); + if (instCount > 0) { + TopingDrawGroup g; + g.type = topingSorted_[0].type; + g.variant = topingSorted_[0].variant; + g.instanceOffset = 0; + g.instanceCount = 1; + g.vertexTemplateOffset = 0; + g.vertexCount = 0; + if (g.type < defs.size() && g.variant < 16) { + const auto& slice = defs[g.type].variants[g.variant]; + g.vertexTemplateOffset = slice.offset; + g.vertexCount = slice.count; + } + for (uint32_t i = 1; i < instCount; i++) { + if (topingSorted_[i].type == g.type && topingSorted_[i].variant == g.variant) { + g.instanceCount++; + } else { + if (g.vertexCount > 0) topingDrawGroups_.push_back(g); + g.type = topingSorted_[i].type; + g.variant = topingSorted_[i].variant; + g.instanceOffset = i; + g.instanceCount = 1; + g.vertexTemplateOffset = 0; + g.vertexCount = 0; + if (g.type < defs.size() && g.variant < 16) { + const auto& slice = defs[g.type].variants[g.variant]; + g.vertexTemplateOffset = 
slice.offset; + g.vertexCount = slice.count; + } } } - topingBLASVertexCount_ = outIdx; + if (g.vertexCount > 0) topingDrawGroups_.push_back(g); + } - // Pre-allocate GPU buffer once; grow only when needed. - // No RAY_TRACING flag — BLAS vertex buffers work with SHADER_RESOURCE - // (same pattern as blocky blasPositionBuffer_). This allows UpdateBuffer. - if (!topingBLASPositionBuffer_.IsValid() || topingBLASPositionCapacity_ < outIdx) { - topingBLASPositionCapacity_ = outIdx + outIdx / 4; // 25% headroom + // Build GPU group table with prefix sums for the BLAS CS + topingBLASGroupsGPU_.resize(topingDrawGroups_.size()); + uint32_t totalTopingVerts = 0; + for (size_t i = 0; i < topingDrawGroups_.size(); i++) { + const auto& dg = topingDrawGroups_[i]; + topingBLASGroupsGPU_[i] = { + totalTopingVerts, // globalVertexOffset (prefix sum) + dg.vertexTemplateOffset, + dg.vertexCount, + dg.instanceOffset, + dg.instanceCount + }; + totalTopingVerts += dg.vertexCount * dg.instanceCount; + } + topingBLASTotalVertices_ = totalTopingVerts; + topingBLASVertexCount_ = totalTopingVerts; + + // Pre-allocate GPU BLAS position buffer (UAV+SRV, raw for RWByteAddressBuffer). + // Same pattern as blocky blasPositionBuffer_. 
+ if (totalTopingVerts > 0) { + if (!topingBLASPositionBuffer_.IsValid() || topingBLASPositionCapacity_ < totalTopingVerts) { + topingBLASPositionCapacity_ = totalTopingVerts + totalTopingVerts / 4; // 25% headroom GPUBufferDesc posDesc; posDesc.size = (size_t)topingBLASPositionCapacity_ * 3 * sizeof(float); - posDesc.bind_flags = BindFlag::SHADER_RESOURCE; - posDesc.misc_flags = ResourceMiscFlag::NONE; + posDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE; + posDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW; + posDesc.stride = 0; posDesc.usage = Usage::DEFAULT; device_->CreateBuffer(&posDesc, nullptr, &topingBLASPositionBuffer_); @@ -2127,7 +1574,7 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { topingBLASIndexCount_ = idxCount; } - topingBLASDirty_ = true; // deferred upload + BLAS rebuild in Render() + topingBLASDirty_ = true; // GPU compute dispatch + BLAS rebuild in Render() } } @@ -2192,38 +1639,7 @@ void VoxelRenderer::renderTopings( dev->BindResource(&topingInstanceBuffer_, 5, cmd); // t5 dev->BindSampler(&sampler_, 0, cmd); - // Build sorted draw groups (same sort order as uploadTopingData) - struct DrawGroup { - uint16_t type, variant; - uint32_t instanceOffset, instanceCount; - }; - struct SortKey { uint16_t type, variant; }; - std::vector sortedKeys(instances.size()); - for (size_t i = 0; i < instances.size(); i++) { - sortedKeys[i] = { instances[i].topingType, instances[i].variant }; - } - std::sort(sortedKeys.begin(), sortedKeys.end(), [](const SortKey& a, const SortKey& b) { - if (a.type != b.type) return a.type < b.type; - return a.variant < b.variant; - }); - - // Identify contiguous groups - std::vector groups; - uint32_t instCount = (uint32_t)std::min(sortedKeys.size(), (size_t)MAX_TOPING_INSTANCES); - if (instCount > 0) { - DrawGroup g = { sortedKeys[0].type, sortedKeys[0].variant, 0, 1 }; - for (uint32_t i = 1; i < instCount; i++) { - if (sortedKeys[i].type == g.type && sortedKeys[i].variant 
== g.variant) { - g.instanceCount++; - } else { - groups.push_back(g); - g = { sortedKeys[i].type, sortedKeys[i].variant, i, 1 }; - } - } - groups.push_back(g); - } - - // Issue one DrawInstanced per group + // Reuse draw groups built in uploadTopingData (avoids redundant sort) topingDrawCalls_ = 0; struct TopingPush { uint32_t vertexOffset; @@ -2232,19 +1648,17 @@ void VoxelRenderer::renderTopings( uint32_t pad[9]; }; - for (const auto& g : groups) { + for (const auto& g : topingDrawGroups_) { if (g.type >= defs.size()) continue; - const TopingDef& def = defs[g.type]; - const MeshSlice& slice = def.variants[g.variant]; - if (slice.count == 0) continue; // empty mesh (all neighbors present) + if (g.vertexCount == 0) continue; TopingPush pushData = {}; - pushData.vertexOffset = slice.offset; + pushData.vertexOffset = g.vertexTemplateOffset; pushData.instanceOffset = g.instanceOffset; - pushData.materialID = def.materialID; + pushData.materialID = defs[g.type].materialID; dev->PushConstants(&pushData, sizeof(pushData), cmd); - dev->DrawInstanced(slice.count, g.instanceCount, 0, 0, cmd); + dev->DrawInstanced(g.vertexCount, g.instanceCount, 0, 0, cmd); topingDrawCalls_++; } @@ -2596,7 +2010,7 @@ void VoxelRenderPath::handleInput(float dt) { // F3: toggle animated terrain if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) { animatedTerrain_ = !animatedTerrain_; - wi::backlog::post(animatedTerrain_ ? "Animation: ON (60 Hz)" : "Animation: OFF"); + wi::backlog::post(animatedTerrain_ ? 
"Animation: ON (30 Hz)" : "Animation: OFF"); } // F4: toggle blend debug visualization if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F4)) { @@ -2666,7 +2080,7 @@ void VoxelRenderPath::Update(float dt) { windTime_ += dt; renderer.windTime_ = windTime_; - // Animated terrain: regenerate at 60 Hz with time-shifted noise + // Animated terrain: regenerate at 30 Hz with time-shifted noise // Fused: regenerate + pack voxel data in the same parallel pass if (animatedTerrain_ && renderer.isInitialized()) { animAccum_ += dt; @@ -2758,7 +2172,7 @@ void VoxelRenderPath::Render() const { CommandList cmd = device->BeginCommandList(); // GPU mesh path: only re-dispatch when voxel data changed - if (renderer.gpuMeshEnabled_ && renderer.gpuMesherAvailable_) { + if (renderer.gpuMesherAvailable_) { // Always readback previous frame's quad count uint32_t* countData = (uint32_t*)renderer.meshCounterReadback_.mapped_data; if (countData) { @@ -2786,19 +2200,11 @@ void VoxelRenderPath::Render() const { renderer.gpuSmoothMeshDirty_ = true; } - // ── Deferred toping BLAS position upload (must happen BEFORE BLAS build) ── - if (renderer.topingBLASDirty_ && renderer.topingBLASPositionBuffer_.IsValid() && - renderer.topingBLASVertexCount_ > 0 && - !renderer.topingBLASPositionStaging_.empty()) { - size_t uploadSize = (size_t)renderer.topingBLASVertexCount_ * 3 * sizeof(float); - size_t bufferSize = (size_t)renderer.topingBLASPositionCapacity_ * 3 * sizeof(float); - if (uploadSize <= bufferSize) { - device->UpdateBuffer(&renderer.topingBLASPositionBuffer_, - renderer.topingBLASPositionStaging_.data(), cmd, uploadSize); - } - renderer.rtTopingVertexCount_ = renderer.topingBLASVertexCount_; - renderer.rtDirty_ = true; // trigger BLAS/TLAS rebuild - renderer.topingBLASDirty_ = false; + // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ── + // Must happen BEFORE BLAS build. 
Fills topingBLASPositionBuffer_ via CS, + // sets rtTopingVertexCount_ and rtDirty_ to trigger BLAS/TLAS rebuild. + if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid()) { + renderer.dispatchTopingBLASExtract(cmd); } // Phase 6.1: BLAS extraction + acceleration structure build @@ -2810,15 +2216,6 @@ void VoxelRenderPath::Render() const { } } - // GPU mesh benchmark state machine (runs once after world gen, CPU path only) - if (!renderer.gpuMeshEnabled_) { - if (renderer.benchState_ == VoxelRenderer::BenchState::DISPATCH) { - renderer.dispatchGpuMeshBenchmark(cmd, world); - } else if (renderer.benchState_ == VoxelRenderer::BenchState::READBACK) { - renderer.readbackGpuMeshBenchmark(); - } - } - // ── Deferred GPU uploads (dirty flags set in Update(), need CommandList) ── if (renderer.topingInstanceDirty_ && renderer.topingInstanceBuffer_.IsValid() && !renderer.topingGpuInsts_.empty()) { @@ -2938,26 +2335,9 @@ void VoxelRenderPath::Compose(CommandList cmd) const { stats += "Chunks: " + std::to_string(renderer.getVisibleChunks()) + "/" + std::to_string(renderer.getChunkCount()) + "\n"; stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n"; - std::string renderMode; - if (renderer.isGpuMeshEnabled()) - renderMode = "GPU mesh (1x1) + DrawInstanced"; - else if (renderer.isGpuCulling()) - renderMode = "CPU greedy + MDI + GPU cull"; - else if (renderer.isMdiEnabled()) - renderMode = "CPU greedy + MDI + CPU cull"; - else - renderMode = "CPU greedy + DrawInstanced + CPU cull"; stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls()) - + " (" + renderMode + ")\n"; - - if (renderer.isGpuMeshEnabled()) { - stats += "GPU Mesh Quads: " + std::to_string(renderer.getGpuMeshQuadCount()) + "\n"; - } else { - char cullStr[16], drawStr[16]; - snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs()); - snprintf(drawStr, sizeof(drawStr), "%.3f", renderer.getGpuDrawTimeMs()); - stats += "GPU Cull: " + std::string(cullStr) + " ms | 
Draw: " + std::string(drawStr) + " ms\n"; - } + + " (GPU mesh + DrawInstanced)\n"; + stats += "GPU Mesh Quads: " + std::to_string(renderer.getGpuMeshQuadCount()) + "\n"; stats += "Topings: " + std::to_string(topingSystem.getInstanceCount()) + " instances, " + std::to_string(renderer.getTopingDrawCalls()) + " draws (" + std::to_string(topingSystem.getDefCount()) + " types)\n"; diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h index 42f9281..172a186 100644 --- a/src/voxel/VoxelRenderer.h +++ b/src/voxel/VoxelRenderer.h @@ -27,7 +27,7 @@ struct GPUChunkInfo { uint32_t pad2[2]; // pad to 112 bytes (7 × float4) }; -// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ──────── +// ── Voxel Renderer (GPU mesh pipeline) ────────────────────────── class VoxelRenderer { friend class VoxelRenderPath; public: @@ -58,8 +58,6 @@ public: uint32_t getDrawCalls() const { return drawCalls_; } uint32_t getChunkCount() const { return chunkCount_; } bool isInitialized() const { return initialized_; } - bool isGpuCulling() const { return gpuCullingEnabled_; } - bool isMdiEnabled() const { return mdiEnabled_; } bool debugFaceColors_ = false; bool debugBlend_ = false; @@ -67,7 +65,6 @@ public: private: void createPipeline(); - void rebuildMegaBuffer(VoxelWorld& world); wi::graphics::GraphicsDevice* device_ = nullptr; @@ -75,8 +72,6 @@ private: wi::graphics::Shader vertexShader_; wi::graphics::Shader pixelShader_; wi::graphics::PipelineState pso_; - wi::graphics::Shader cullShader_; // Frustum cull compute shader - // Shaders & Pipeline (topings, Phase 4) wi::graphics::Shader topingVS_; wi::graphics::Shader topingPS_; @@ -93,6 +88,29 @@ private: std::vector topingGpuInsts_; mutable uint32_t topingDrawCalls_ = 0; + // ── Toping draw groups (shared between render + BLAS CS) ───── + struct TopingDrawGroup { + uint16_t type, variant; + uint32_t instanceOffset, instanceCount; + uint32_t vertexTemplateOffset, vertexCount; // from TopingDef::variants[] + }; + std::vector 
topingDrawGroups_; // built in uploadTopingData, reused in renderTopings + + // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ── + wi::graphics::Shader topingBLASShader_; // voxelTopingBLASCS compute shader + struct TopingBLASGroupGPU { + uint32_t globalVertexOffset; // prefix sum of total vertices before this group + uint32_t vertexTemplateOffset; // offset into topingVertices (t4) + uint32_t vertexCount; // vertices per instance + uint32_t instanceOffset; // offset into topingInstances (t5) + uint32_t instanceCount; // instances in this group + }; + wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer, SRV t7 + std::vector topingBLASGroupsGPU_; // CPU staging for group table + mutable uint32_t topingBLASTotalVertices_ = 0; + static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64; + void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const; + // Shaders & Pipeline (smooth surfaces, Phase 5) wi::graphics::Shader smoothVS_; wi::graphics::Shader smoothPS_; @@ -114,9 +132,7 @@ private: // ── Mega-buffer architecture (Phase 2) ────────────────────── static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB) static constexpr uint32_t MAX_CHUNKS = 2048; - static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk - wi::graphics::GPUBuffer megaQuadBuffer_; // StructuredBuffer, SRV t0 wi::graphics::GPUBuffer chunkInfoBuffer_; // StructuredBuffer, SRV t2 // CPU-side tracking @@ -127,27 +143,9 @@ private: }; std::vector chunkSlots_; std::vector cpuChunkInfo_; - std::vector cpuMegaQuads_; // CPU staging for mega-buffer uint32_t chunkCount_ = 0; bool megaBufferDirty_ = true; - // ── Indirect draw (Phase 2 MDI) ───────────────────────────── - // Wicked Engine's DrawInstancedIndirectCount command signature includes a - // push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS. - // Total stride = 4 + 16 = 20 bytes per draw entry. 
- struct IndirectDrawArgs { - uint32_t pushConstant; // written to b999[0] by ExecuteIndirect - uint32_t vertexCountPerInstance; - uint32_t instanceCount; - uint32_t startVertexLocation; - uint32_t startInstanceLocation; - }; - wi::graphics::GPUBuffer indirectArgsBuffer_; // IndirectDrawArgs[MAX_DRAWS] - wi::graphics::GPUBuffer drawCountBuffer_; // uint32_t[1] - mutable std::vector cpuIndirectArgs_; - bool gpuCullingEnabled_ = true; // Phase 2.3: GPU compute cull (true) vs CPU fallback (false) - bool mdiEnabled_ = true; // Phase 2.2: MDI rendering with CPU-filled indirect args - // Constants buffer (must match HLSL VoxelCB) struct VoxelConstants { XMFLOAT4X4 viewProjection; @@ -184,7 +182,6 @@ private: wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter bool gpuMesherAvailable_ = false; - bool gpuMeshEnabled_ = true; // Use GPU meshing instead of CPU greedy mutable uint32_t gpuMeshQuadCount_ = 0; // Readback from previous frame (1-frame delay) mutable uint32_t voxelDataCapacity_ = 0; // Current capacity of voxelDataBuffer_ (in uint32s) mutable std::vector packedVoxelCache_; // cached packed voxel data for all chunks @@ -216,9 +213,8 @@ private: mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_; // sequential indices for toping BLAS mutable uint32_t topingBLASPositionCapacity_ = 0; // pre-allocated capacity (vertices) mutable uint32_t topingBLASIndexCount_ = 0; // size of toping index buffer - mutable bool topingBLASDirty_ = false; // deferred BLAS position upload + rebuild + mutable bool topingBLASDirty_ = false; // GPU compute BLAS extract + rebuild needed mutable uint32_t topingBLASVertexCount_ = 0; // actual vertex count for current frame - std::vector topingBLASPositionStaging_; // CPU staging for deferred upload static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad mutable bool rtAvailable_ = false; // GPU 
supports RT mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild @@ -252,14 +248,6 @@ private: const wi::graphics::Texture& renderTarget, const wi::graphics::Texture& normalTarget) const; - // Benchmark state machine: runs once after world gen - enum class BenchState { IDLE, DISPATCH, READBACK, DONE }; - mutable BenchState benchState_ = BenchState::IDLE; - mutable float cpuMeshTimeMs_ = 0.0f; - mutable uint32_t gpuBaselineQuads_ = 0; - - void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const; - void readbackGpuMeshBenchmark() const; void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world, ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr, ProfileAccum* profDispatch = nullptr) const; @@ -290,7 +278,7 @@ private: public: float getGpuCullTimeMs() const { return gpuCullTimeMs_; } float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; } - bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; } + bool isGpuMeshEnabled() const { return gpuMesherAvailable_; } uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; } // Phase 4: Toping rendering @@ -364,11 +352,11 @@ private: // Wind animation (continuous, always running) float windTime_ = 0.0f; - // Animated terrain (wave effect at 60 Hz, toggled with F3) + // Animated terrain (wave effect at 30 Hz, toggled with F3) bool animatedTerrain_ = false; float animTime_ = 0.0f; float animAccum_ = 0.0f; - static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz + static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz wi::graphics::Texture voxelRT_; wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO