From 57ac08f231b08ec61862211f7f7f282a05fe0c9c Mon Sep 17 00:00:00 2001 From: Samuel Bouchet Date: Tue, 31 Mar 2026 13:46:35 +0200 Subject: [PATCH] Refactor: extract VoxelRTManager, DeferredGPUBuffer, decompose VoxelRenderPath MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract DeferredGPUBuffer utility (staging→dirty→capacity GPU buffer pattern) - Extract VoxelRTManager from VoxelRenderer (~500 lines: BLAS/TLAS, RT shadows+AO) - Decompose VoxelRenderPath into CameraController, AnimationState, VoxelProfiler - Replace toping std::sort with O(n) counting sort by (type, variant) - Update CLAUDE.md architecture docs to reflect new file structure --- CLAUDE.md | 10 +- src/voxel/DeferredGPUBuffer.h | 68 ++ src/voxel/VoxelRTManager.cpp | 610 ++++++++++++++++ src/voxel/VoxelRTManager.h | 124 ++++ src/voxel/VoxelRenderer.cpp | 1224 ++++++++------------------------- src/voxel/VoxelRenderer.h | 214 +++--- voxel_engine_spec.md | 114 ++- 7 files changed, 1294 insertions(+), 1070 deletions(-) create mode 100644 src/voxel/DeferredGPUBuffer.h create mode 100644 src/voxel/VoxelRTManager.cpp create mode 100644 src/voxel/VoxelRTManager.h diff --git a/CLAUDE.md b/CLAUDE.md index 539ebb1..226af5c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -18,7 +18,9 @@ bvle-voxels/ │ │ ├── VoxelTypes.h # Types fondamentaux (VoxelData, PackedQuad, MaterialDesc, ChunkPos) │ │ ├── VoxelWorld.h/.cpp # Monde voxel (hashmap de chunks, génération procédurale) │ │ ├── VoxelMesher.h/.cpp # Binary Greedy Mesher CPU + SmoothMesher (Naive Surface Nets) -│ │ ├── VoxelRenderer.h/.cpp# Renderer + VoxelRenderPath (sous-classe RenderPath3D) +│ │ ├── VoxelRenderer.h/.cpp# Renderer + VoxelRenderPath (CameraController, AnimationState, VoxelProfiler) +│ │ ├── VoxelRTManager.h/.cpp # Ray tracing: BLAS/TLAS lifecycle, shadows+AO dispatches +│ │ ├── DeferredGPUBuffer.h # Utilitaire staging→dirty→capacity GPU buffer upload │ │ └── TopingSystem.h/.cpp # Système de topings (biseaux 
décoratifs sur faces +Y) │ └── app/ │ └── main.cpp # Point d'entrée Win32 + crash handler SEH @@ -129,7 +131,11 @@ Perlin noise 3D, fBm 5 octaves (2 en animation), caves 3D, matériaux par altitu - **Per-chunk info** : `StructuredBuffer` (80 bytes/chunk) - **Height-based blending** (Phase 3) : PS lit `voxelDataBuffer` (t3), winner-takes-all heightmap, corner attenuation - **Render targets propres** : `voxelRT_` (R8G8B8A8) + `voxelDepth_` (D32_FLOAT) -- **CPU profiling** : `ProfileAccum` avec moyennes toutes les 5s +- **CPU profiling** : `VoxelProfiler` (21 `ProfileAccum`, moyennes toutes les 5s) +- **DeferredGPUBuffer** : utilitaire pour buffers GPU avec staging CPU, dirty flag, capacity-based growth (25% headroom) +- **VoxelRTManager** (`VoxelRTManager.h/.cpp`) : gère BLAS/TLAS, dispatches RT shadows+AO, isolé du renderer +- **VoxelRenderPath** décomposé en : `CameraController` (mouvement/souris), `AnimationState` (tick terrain), `VoxelProfiler` +- **Toping sort** : counting sort O(n) par (type, variant) au lieu de `std::sort` ## Phases de développement diff --git a/src/voxel/DeferredGPUBuffer.h b/src/voxel/DeferredGPUBuffer.h new file mode 100644 index 0000000..da52954 --- /dev/null +++ b/src/voxel/DeferredGPUBuffer.h @@ -0,0 +1,68 @@ +#pragma once +#include "WickedEngine.h" + +namespace voxel { + +// ── Deferred GPU Buffer ───────────────────────────────────────── +// Encapsulates the repeated pattern of: +// 1. CPU staging data prepared during Update() +// 2. GPU buffer with capacity-based growth (25% headroom) +// 3. Dirty flag for deferred upload in Render() +// +// Eliminates ~50 lines of boilerplate per buffer and centralizes +// the invariants (capacity >= count, CreateBuffer with nullptr, +// UpdateBuffer with actual data size). 
+ +struct DeferredGPUBuffer { + wi::graphics::GPUBuffer gpu; + mutable uint32_t capacity = 0; // in elements + mutable bool dirty = false; + uint32_t stride = 0; // bytes per element + + // Ensure GPU buffer has enough capacity for elementCount elements. + // Creates/recreates buffer only when capacity is insufficient. + // Returns true if buffer was (re)created. + bool ensureCapacity(wi::graphics::GraphicsDevice* device, + uint32_t elementCount, + uint32_t elementStride, + wi::graphics::BindFlag bindFlags, + wi::graphics::ResourceMiscFlag miscFlags = wi::graphics::ResourceMiscFlag::BUFFER_STRUCTURED) + { + stride = elementStride; + if (gpu.IsValid() && capacity >= elementCount) return false; + + capacity = elementCount + elementCount / 4; // 25% headroom + wi::graphics::GPUBufferDesc desc; + desc.size = (uint64_t)capacity * stride; + desc.bind_flags = bindFlags; + desc.misc_flags = miscFlags; + desc.stride = (miscFlags == wi::graphics::ResourceMiscFlag::BUFFER_STRUCTURED) ? stride : 0; + desc.usage = wi::graphics::Usage::DEFAULT; + device->CreateBuffer(&desc, nullptr, &gpu); + dirty = true; + return true; + } + + // Upload data to GPU. Call from Render() with a valid CommandList. + // dataCount = number of elements to upload (may be < capacity). + void upload(wi::graphics::GraphicsDevice* device, + wi::graphics::CommandList cmd, + const void* data, + uint32_t dataCount) const + { + if (!dirty || !gpu.IsValid() || dataCount == 0 || !data) return; + size_t uploadSize = (size_t)dataCount * stride; + size_t bufferSize = (size_t)capacity * stride; + if (uploadSize <= bufferSize) { + device->UpdateBuffer(&gpu, data, cmd, uploadSize); + } + dirty = false; + } + + // Mark as needing upload (call after staging data changes). 
+ void markDirty() { dirty = true; } + + bool isValid() const { return gpu.IsValid(); } +}; + +} // namespace voxel diff --git a/src/voxel/VoxelRTManager.cpp b/src/voxel/VoxelRTManager.cpp new file mode 100644 index 0000000..577bd90 --- /dev/null +++ b/src/voxel/VoxelRTManager.cpp @@ -0,0 +1,610 @@ +#include "VoxelRTManager.h" +#include + +using namespace wi::graphics; + +namespace voxel { + +void VoxelRTManager::initialize(GraphicsDevice* dev, uint32_t maxBlasVertices) { + device_ = dev; + maxBlasVertices_ = maxBlasVertices; + + available_ = dev->CheckCapability(GraphicsDeviceCapability::RAYTRACING); + if (!available_) { + wi::backlog::post("VoxelRTManager: RT not available (GPU does not support ray tracing)"); + return; + } + + wi::renderer::LoadShader(ShaderStage::CS, blasExtractShader_, "voxel/voxelBLASExtractCS.cso"); + if (blasExtractShader_.IsValid()) { + // BLAS position buffer: 6 float3 per quad (non-indexed triangles), raw buffer + GPUBufferDesc posDesc; + posDesc.size = (uint64_t)maxBlasVertices * sizeof(float) * 3; + posDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE; + posDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW; + posDesc.stride = 0; + posDesc.usage = Usage::DEFAULT; + bool ok = dev->CreateBuffer(&posDesc, nullptr, &blasPositionBuffer_); + + // Sequential index buffer for BLAS + GPUBufferDesc idxDesc; + idxDesc.size = (uint64_t)maxBlasVertices * sizeof(uint32_t); + idxDesc.bind_flags = BindFlag::SHADER_RESOURCE; + idxDesc.usage = Usage::DEFAULT; + auto fillIndices = [maxBlasVertices](void* dest) { + uint32_t* p = (uint32_t*)dest; + for (uint32_t i = 0; i < maxBlasVertices; i++) + p[i] = i; + }; + bool okIdx = dev->CreateBuffer2(&idxDesc, fillIndices, &blasIndexBuffer_); + + if (ok && blasPositionBuffer_.IsValid() && okIdx && blasIndexBuffer_.IsValid()) { + dev->SetName(&blasPositionBuffer_, "VoxelRTManager::blasPositionBuffer"); + dev->SetName(&blasIndexBuffer_, "VoxelRTManager::blasIndexBuffer"); + 
wi::backlog::post("VoxelRTManager: RT available (BLAS pos " + + std::to_string(posDesc.size / (1024*1024)) + " MB + idx " + + std::to_string(idxDesc.size / (1024*1024)) + " MB)"); + } else { + available_ = false; + wi::backlog::post("VoxelRTManager: RT buffer creation failed", wi::backlog::LogLevel::Warning); + } + } else { + available_ = false; + wi::backlog::post("VoxelRTManager: BLAS extraction shader failed", wi::backlog::LogLevel::Warning); + } + + // Toping BLAS CS + wi::renderer::LoadShader(ShaderStage::CS, topingBLASShader_, "voxel/voxelTopingBLASCS.cso"); + if (topingBLASShader_.IsValid()) { + static constexpr uint32_t MAX_GROUPS = 64; + GPUBufferDesc grpDesc; + grpDesc.size = MAX_GROUPS * 20; // 5 × uint32 per group + grpDesc.bind_flags = BindFlag::SHADER_RESOURCE; + grpDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; + grpDesc.stride = 20; + grpDesc.usage = Usage::DEFAULT; + dev->CreateBuffer(&grpDesc, nullptr, &topingBLASGroupBuffer_); + wi::backlog::post("VoxelRTManager: toping BLAS CS available"); + } else { + wi::backlog::post("VoxelRTManager: toping BLAS CS failed", wi::backlog::LogLevel::Warning); + } + + // RT Shadows + AO + wi::renderer::LoadShader(ShaderStage::CS, shadowShader_, "voxel/voxelShadowCS.cso", + ShaderModel::SM_6_5); + wi::renderer::LoadShader(ShaderStage::CS, aoBlurShader_, "voxel/voxelAOBlurCS.cso"); + wi::renderer::LoadShader(ShaderStage::CS, aoApplyShader_, "voxel/voxelAOApplyCS.cso"); + if (shadowShader_.IsValid() && aoBlurShader_.IsValid() && aoApplyShader_.IsValid()) { + shadowsEnabled_ = true; + wi::backlog::post("VoxelRTManager: RT shadows + AO blur available"); + } else { + wi::backlog::post("VoxelRTManager: RT shadow/AO shader(s) failed", + wi::backlog::LogLevel::Warning); + } +} + +// ── BLAS extraction: blocky quads → float3 positions ──────────── + +void VoxelRTManager::dispatchBLASExtract(CommandList cmd, + const GPUBuffer& quadBuffer, + const GPUBuffer& chunkInfoBuffer, + uint32_t quadCount) const +{ + if 
(!available_ || !blasExtractShader_.IsValid() || quadCount == 0) return; + + auto* dev = device_; + + GPUBarrier preBarriers[] = { + GPUBarrier::Buffer(&blasPositionBuffer_, + ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(preBarriers, 1, cmd); + + dev->BindComputeShader(&blasExtractShader_, cmd); + dev->BindResource(&quadBuffer, 0, cmd); // t0 + dev->BindResource(&chunkInfoBuffer, 2, cmd); // t2 + dev->BindUAV(&blasPositionBuffer_, 0, cmd); // u0 + + struct BLASPush { + uint32_t quadCount; + uint32_t pad[11]; + } pushData = {}; + pushData.quadCount = quadCount; + dev->PushConstants(&pushData, sizeof(pushData), cmd); + + uint32_t groupCount = (quadCount + 63) / 64; + dev->Dispatch(groupCount, 1, 1, cmd); + + GPUBarrier postBarriers[] = { + GPUBarrier::Buffer(&blasPositionBuffer_, + ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + }; + dev->Barrier(postBarriers, 1, cmd); + + blockyVertexCount_ = quadCount * 6; +} + +// ── Toping BLAS extraction (GPU compute) ──────────────────────── + +void VoxelRTManager::dispatchTopingBLASExtract(CommandList cmd, + const GPUBuffer& topingVertexBuffer, + const GPUBuffer& topingInstanceBuffer, + const void* groupsGPUData, size_t groupsGPUSize, + uint32_t groupCount, uint32_t totalVertices) const +{ + if (!topingBLASShader_.IsValid() || !topingBLASGroupBuffer_.IsValid() || + !topingBLASPositionBuf_.isValid() || !topingVertexBuffer.IsValid() || + !topingInstanceBuffer.IsValid() || totalVertices == 0 || groupCount == 0) + return; + + auto* dev = device_; + + // Upload group table + dev->UpdateBuffer(&topingBLASGroupBuffer_, groupsGPUData, cmd, groupsGPUSize); + + GPUBarrier preBarriers[] = { + GPUBarrier::Buffer(&topingBLASGroupBuffer_, + ResourceState::COPY_DST, ResourceState::SHADER_RESOURCE), + GPUBarrier::Buffer(&topingBLASPositionBuf_.gpu, + ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(preBarriers, 2, cmd); + + 
dev->BindComputeShader(&topingBLASShader_, cmd); + dev->BindResource(&topingVertexBuffer, 4, cmd); // t4 + dev->BindResource(&topingInstanceBuffer, 5, cmd); // t5 + dev->BindResource(&topingBLASGroupBuffer_, 7, cmd); // t7 + dev->BindUAV(&topingBLASPositionBuf_.gpu, 0, cmd); // u0 + + struct { + uint32_t totalVertices; + uint32_t groupCount; + uint32_t pad[10]; + } pushData = {}; + pushData.totalVertices = totalVertices; + pushData.groupCount = groupCount; + dev->PushConstants(&pushData, sizeof(pushData), cmd); + + uint32_t threadGroups = (totalVertices + 63) / 64; + dev->Dispatch(threadGroups, 1, 1, cmd); + + GPUBarrier postBarriers[] = { + GPUBarrier::Buffer(&topingBLASPositionBuf_.gpu, + ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + }; + dev->Barrier(postBarriers, 1, cmd); + + topingVertexCount_ = totalVertices; + dirty = true; + topingBLASDirty = false; +} + +// ── Ensure toping BLAS buffer capacity ────────────────────────── + +bool VoxelRTManager::ensureTopingBLASCapacity(uint32_t totalVertices) { + if (totalVertices == 0) return false; + + bool recreated = topingBLASPositionBuf_.ensureCapacity(device_, totalVertices, + 3 * sizeof(float), + BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE, + ResourceMiscFlag::BUFFER_RAW); + + if (recreated) { + char msg[256]; + snprintf(msg, sizeof(msg), "VoxelRTManager: toping BLAS pos buffer (%u capacity, %.1f MB)", + topingBLASPositionBuf_.capacity, + (size_t)topingBLASPositionBuf_.capacity * 3 * sizeof(float) / (1024.0 * 1024.0)); + wi::backlog::post(msg); + } + + // Index buffer: grow if needed + if (topingBLASIndexCount_ < topingBLASPositionBuf_.capacity) { + uint32_t idxCount = topingBLASPositionBuf_.capacity; + std::vector indices(idxCount); + for (uint32_t j = 0; j < idxCount; j++) indices[j] = j; + + GPUBufferDesc idxDesc; + idxDesc.size = (size_t)idxCount * sizeof(uint32_t); + idxDesc.bind_flags = BindFlag::SHADER_RESOURCE; + idxDesc.misc_flags = ResourceMiscFlag::NONE; + idxDesc.usage = 
Usage::DEFAULT; + device_->CreateBuffer(&idxDesc, indices.data(), &topingBLASIndexBuffer_); + topingBLASIndexCount_ = idxCount; + recreated = true; + } + + topingBLASDirty = true; + return recreated; +} + +// ── Acceleration structure build ──────────────────────────────── + +void VoxelRTManager::buildAccelerationStructures(CommandList cmd, + uint32_t buildFlags, + const GPUBuffer& smoothVB, + uint32_t smoothVertCount) const +{ + if (!available_) return; + + auto* dev = device_; + + // ── Blocky BLAS ── + uint32_t blockyVertCount = blockyVertexCount_; + if (blockyVertCount < 3) blockyVertCount = 0; + if ((buildFlags & BUILD_BLOCKY) && blockyVertCount > 0 && blasPositionBuffer_.IsValid()) { + if (!blockyBLAS_.IsValid() || blockyVertCount > blockyBLASCapacity_) { + blockyBLASCapacity_ = blockyVertCount + blockyVertCount / 4; + + RaytracingAccelerationStructureDesc desc; + desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; + desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; + + desc.bottom_level.geometries.resize(1); + auto& geom = desc.bottom_level.geometries[0]; + geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; + geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; + geom.triangles.vertex_buffer = blasPositionBuffer_; + geom.triangles.vertex_byte_offset = 0; + geom.triangles.vertex_count = blockyBLASCapacity_; + geom.triangles.vertex_stride = sizeof(float) * 3; + geom.triangles.vertex_format = Format::R32G32B32_FLOAT; + geom.triangles.index_buffer = blasIndexBuffer_; + geom.triangles.index_count = blockyBLASCapacity_; + geom.triangles.index_format = IndexBufferFormat::UINT32; + geom.triangles.index_offset = 0; + + bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &blockyBLAS_); + if (ok) { + dev->SetName(&blockyBLAS_, "VoxelRTManager::blockyBLAS"); + wi::backlog::post("VoxelRTManager: blocky BLAS created (capacity " + + 
std::to_string(blockyBLASCapacity_ / 3) + " tris)"); + } else { + wi::backlog::post("VoxelRTManager: failed to create blocky BLAS", wi::backlog::LogLevel::Error); + available_ = false; + return; + } + } + + blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = blockyVertCount; + blockyBLAS_.desc.bottom_level.geometries[0].triangles.index_count = blockyVertCount; + dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr); + } + + // ── Smooth BLAS ── + if (smoothVertCount < 3) smoothVertCount = 0; + if ((buildFlags & BUILD_SMOOTH) && smoothVertCount > 0 && smoothVB.IsValid()) { + if (!smoothBLAS_.IsValid() || smoothVertCount > smoothBLASCapacity_) { + smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4; + + RaytracingAccelerationStructureDesc desc; + desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; + desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; + + desc.bottom_level.geometries.resize(1); + auto& geom = desc.bottom_level.geometries[0]; + geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; + geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; + geom.triangles.vertex_buffer = smoothVB; + geom.triangles.vertex_byte_offset = 0; + geom.triangles.vertex_count = smoothBLASCapacity_; + geom.triangles.vertex_stride = 32; + geom.triangles.index_buffer = blasIndexBuffer_; + geom.triangles.index_count = smoothBLASCapacity_; + geom.triangles.index_format = IndexBufferFormat::UINT32; + geom.triangles.index_offset = 0; + geom.triangles.vertex_format = Format::R32G32B32_FLOAT; + + bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &smoothBLAS_); + if (ok) { + dev->SetName(&smoothBLAS_, "VoxelRTManager::smoothBLAS"); + wi::backlog::post("VoxelRTManager: smooth BLAS created (capacity " + + std::to_string(smoothBLASCapacity_ / 3) + " tris)"); + } else { + wi::backlog::post("VoxelRTManager: failed to create smooth BLAS", 
wi::backlog::LogLevel::Error); + } + } + + if (smoothBLAS_.IsValid()) { + smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = smoothVertCount; + smoothBLAS_.desc.bottom_level.geometries[0].triangles.index_count = smoothVertCount; + dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr); + } + + smoothVertexCount_ = smoothVertCount; + } + + // ── Toping BLAS ── + uint32_t topingVertCount = topingVertexCount_; + if ((buildFlags & BUILD_TOPING) && topingVertCount >= 3 && topingBLASPositionBuf_.isValid()) { + if (!topingBLAS_.IsValid() || topingVertCount > topingBLASASCapacity_) { + topingBLASASCapacity_ = topingVertCount + topingVertCount / 4; + + RaytracingAccelerationStructureDesc desc; + desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; + desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; + + desc.bottom_level.geometries.resize(1); + auto& geom = desc.bottom_level.geometries[0]; + geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; + geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; + geom.triangles.vertex_buffer = topingBLASPositionBuf_.gpu; + geom.triangles.vertex_byte_offset = 0; + geom.triangles.vertex_count = topingBLASASCapacity_; + geom.triangles.vertex_stride = sizeof(float) * 3; + geom.triangles.vertex_format = Format::R32G32B32_FLOAT; + geom.triangles.index_buffer = topingBLASIndexBuffer_; + geom.triangles.index_count = topingBLASASCapacity_; + geom.triangles.index_format = IndexBufferFormat::UINT32; + geom.triangles.index_offset = 0; + + bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &topingBLAS_); + if (ok) { + dev->SetName(&topingBLAS_, "VoxelRTManager::topingBLAS"); + wi::backlog::post("VoxelRTManager: toping BLAS created (capacity " + + std::to_string(topingBLASASCapacity_ / 3) + " tris)"); + } else { + wi::backlog::post("VoxelRTManager: failed to create toping BLAS", 
wi::backlog::LogLevel::Error); + } + } + + if (topingBLAS_.IsValid()) { + topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = topingVertCount; + topingBLAS_.desc.bottom_level.geometries[0].triangles.index_count = topingVertCount; + dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr); + } + } + + // Memory barrier: sync BLAS builds before TLAS + { + GPUBarrier barriers[] = { GPUBarrier::Memory() }; + dev->Barrier(barriers, 1, cmd); + } + + // ── TLAS ── + uint32_t instanceCount = 0; + if (blockyBLAS_.IsValid()) instanceCount++; + if (smoothBLAS_.IsValid() && smoothVertCount > 0) instanceCount++; + if (topingBLAS_.IsValid() && topingVertCount >= 3) instanceCount++; + if (instanceCount == 0) { dirty = false; return; } + + if (!tlas_.IsValid() || instanceCount != tlasInstanceCount_) { + const size_t instSize = dev->GetTopLevelAccelerationStructureInstanceSize(); + + auto setIdentity = [](float transform[3][4]) { + std::memset(transform, 0, sizeof(float) * 12); + transform[0][0] = 1.0f; + transform[1][1] = 1.0f; + transform[2][2] = 1.0f; + }; + + const RaytracingAccelerationStructure* blockyPtr = blockyBLAS_.IsValid() ? &blockyBLAS_ : nullptr; + const RaytracingAccelerationStructure* smoothPtr = (smoothBLAS_.IsValid() && smoothVertCount > 0) ? &smoothBLAS_ : nullptr; + const RaytracingAccelerationStructure* topingPtr = (topingBLAS_.IsValid() && topingVertCount >= 3) ? 
&topingBLAS_ : nullptr; + + RaytracingAccelerationStructureDesc desc; + desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; + desc.type = RaytracingAccelerationStructureDesc::Type::TOPLEVEL; + desc.top_level.count = instanceCount; + + GPUBufferDesc bufdesc; + bufdesc.misc_flags = ResourceMiscFlag::RAY_TRACING; + bufdesc.stride = (uint32_t)instSize; + bufdesc.size = bufdesc.stride * desc.top_level.count; + + auto initInstances = [&](void* dest) { + uint32_t idx = 0; + auto addInstance = [&](const RaytracingAccelerationStructure* blas, uint32_t id) { + if (!blas) return; + RaytracingAccelerationStructureDesc::TopLevel::Instance inst; + setIdentity(inst.transform); + inst.instance_id = id; inst.instance_mask = 0xFF; + inst.instance_contribution_to_hit_group_index = 0; inst.flags = 0; + inst.bottom_level = blas; + dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize); + idx++; + }; + addInstance(blockyPtr, 0); + addInstance(smoothPtr, 1); + addInstance(topingPtr, 2); + }; + + bool ok = dev->CreateBuffer2(&bufdesc, initInstances, &desc.top_level.instance_buffer); + if (!ok) { + wi::backlog::post("VoxelRTManager: failed to create TLAS instance buffer", wi::backlog::LogLevel::Error); + dirty = false; + return; + } + + ok = dev->CreateRaytracingAccelerationStructure(&desc, &tlas_); + if (!ok) { + wi::backlog::post("VoxelRTManager: failed to create TLAS", wi::backlog::LogLevel::Error); + dirty = false; + return; + } + + tlasInstanceCount_ = instanceCount; + wi::backlog::post("VoxelRTManager: TLAS created (" + std::to_string(instanceCount) + " instances)"); + } + + dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr); + + { + GPUBarrier barriers[] = { GPUBarrier::Memory(&tlas_) }; + dev->Barrier(barriers, 1, cmd); + } + + dirty = false; +} + +// ── RT Shadow + AO dispatch ───────────────────────────────────── + +void VoxelRTManager::dispatchShadows(CommandList cmd, + const Texture& depthBuffer, + const 
Texture& renderTarget, + const Texture& normalTarget, + const GPUBuffer& constantBuffer) const +{ + if (!shadowsEnabled_ || !shadowShader_.IsValid() || !tlas_.IsValid()) + return; + + auto* dev = device_; + uint32_t w = renderTarget.GetDesc().width; + uint32_t h = renderTarget.GetDesc().height; + uint32_t gx = (w + 7) / 8; + uint32_t gy = (h + 7) / 8; + + // Pass 1: Shadow + raw AO + { + GPUBarrier preBarriers[] = { + GPUBarrier::Image(&const_cast(depthBuffer), + ResourceState::DEPTHSTENCIL, ResourceState::SHADER_RESOURCE), + GPUBarrier::Image(&const_cast(renderTarget), + ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), + GPUBarrier::Image(&aoRawTexture, + ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(preBarriers, 3, cmd); + + dev->BindComputeShader(&shadowShader_, cmd); + dev->BindResource(&depthBuffer, 0, cmd); + dev->BindResource(&normalTarget, 1, cmd); + dev->BindResource(&tlas_, 2, cmd); + dev->BindResource(&aoHistoryTexture, 3, cmd); + dev->BindUAV(&renderTarget, 0, cmd); + dev->BindUAV(&aoRawTexture, 1, cmd); + dev->BindConstantBuffer(&constantBuffer, 0, cmd); + + struct ShadowPush { + uint32_t width, height; + float normalBias, shadowMaxDist; + uint32_t debugMode; + float aoRadius; + uint32_t aoRayCount; + float aoStrength; + uint32_t frameIndex; + uint32_t historyValid; + uint32_t pad[2]; + } pushData = {}; + pushData.width = w; + pushData.height = h; + pushData.normalBias = 0.15f; + pushData.shadowMaxDist = 512.0f; + pushData.debugMode = shadowDebug_; + pushData.aoRadius = 8.0f; + pushData.aoRayCount = 4; + pushData.aoStrength = 0.7f; + pushData.frameIndex = frameCounter++; + pushData.historyValid = aoHistoryValid ? 
1u : 0u; + dev->PushConstants(&pushData, sizeof(pushData), cmd); + dev->Dispatch(gx, gy, 1, cmd); + } + + // Pass 1.5: Copy raw AO → history + { + GPUBarrier copyBarriers[] = { + GPUBarrier::Image(&aoRawTexture, + ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC), + GPUBarrier::Image(&aoHistoryTexture, + ResourceState::SHADER_RESOURCE, ResourceState::COPY_DST), + }; + dev->Barrier(copyBarriers, 2, cmd); + dev->CopyResource(&aoHistoryTexture, &aoRawTexture, cmd); + + GPUBarrier postCopyBarriers[] = { + GPUBarrier::Image(&aoRawTexture, + ResourceState::COPY_SRC, ResourceState::SHADER_RESOURCE), + GPUBarrier::Image(&aoHistoryTexture, + ResourceState::COPY_DST, ResourceState::SHADER_RESOURCE), + }; + dev->Barrier(postCopyBarriers, 2, cmd); + aoHistoryValid = true; + } + + // Pass 2: Bilateral blur horizontal + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&aoBlurredTexture, + ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(barriers, 1, cmd); + + dev->BindComputeShader(&aoBlurShader_, cmd); + dev->BindResource(&aoRawTexture, 0, cmd); + dev->BindResource(&depthBuffer, 1, cmd); + dev->BindResource(&normalTarget, 2, cmd); + dev->BindUAV(&aoBlurredTexture, 0, cmd); + + struct BlurPush { + uint32_t width, height, direction, radius; + float depthThreshold, normalThreshold; + uint32_t pad[6]; + } blurPush = {}; + blurPush.width = w; blurPush.height = h; + blurPush.direction = 0; blurPush.radius = 6; + blurPush.depthThreshold = 0.001f; blurPush.normalThreshold = 0.9f; + dev->PushConstants(&blurPush, sizeof(blurPush), cmd); + dev->Dispatch(gx, gy, 1, cmd); + } + + // Pass 3: Bilateral blur vertical + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&aoBlurredTexture, + ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + GPUBarrier::Image(&aoRawTexture, + ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), + }; + dev->Barrier(barriers, 2, cmd); + + dev->BindComputeShader(&aoBlurShader_, cmd); + 
dev->BindResource(&aoBlurredTexture, 0, cmd); + dev->BindResource(&depthBuffer, 1, cmd); + dev->BindResource(&normalTarget, 2, cmd); + dev->BindUAV(&aoRawTexture, 0, cmd); + + struct BlurPush { + uint32_t width, height, direction, radius; + float depthThreshold, normalThreshold; + uint32_t pad[6]; + } blurPush = {}; + blurPush.width = w; blurPush.height = h; + blurPush.direction = 1; blurPush.radius = 6; + blurPush.depthThreshold = 0.001f; blurPush.normalThreshold = 0.9f; + dev->PushConstants(&blurPush, sizeof(blurPush), cmd); + dev->Dispatch(gx, gy, 1, cmd); + } + + // Pass 4: Apply blurred AO + { + GPUBarrier barriers[] = { + GPUBarrier::Image(&aoRawTexture, + ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + }; + dev->Barrier(barriers, 1, cmd); + + dev->BindComputeShader(&aoApplyShader_, cmd); + dev->BindResource(&aoRawTexture, 0, cmd); + dev->BindResource(&depthBuffer, 1, cmd); + dev->BindUAV(&renderTarget, 0, cmd); + + struct ApplyPush { + uint32_t width, height, debugMode; + uint32_t pad[9]; + } applyPush = {}; + applyPush.width = w; applyPush.height = h; + applyPush.debugMode = shadowDebug_; + dev->PushConstants(&applyPush, sizeof(applyPush), cmd); + dev->Dispatch(gx, gy, 1, cmd); + } + + // Restore resource states + GPUBarrier postBarriers[] = { + GPUBarrier::Image(&const_cast(depthBuffer), + ResourceState::SHADER_RESOURCE, ResourceState::DEPTHSTENCIL), + GPUBarrier::Image(&const_cast(renderTarget), + ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), + }; + dev->Barrier(postBarriers, 2, cmd); +} + +} // namespace voxel diff --git a/src/voxel/VoxelRTManager.h b/src/voxel/VoxelRTManager.h new file mode 100644 index 0000000..df0a70c --- /dev/null +++ b/src/voxel/VoxelRTManager.h @@ -0,0 +1,124 @@ +#pragma once +#include "DeferredGPUBuffer.h" +#include "WickedEngine.h" + +namespace voxel { + +// ── Ray Tracing Manager (Phase 6) ────────────────────────────── +// Groups all RT state: BLAS/TLAS management, shadow/AO dispatches. 
+// Extracted from VoxelRenderer to isolate the ~500 lines of RT code +// and its 20+ members for easier debugging and maintenance. + +class VoxelRTManager { +public: + // ── Initialization ────────────────────────────────────────── + void initialize(wi::graphics::GraphicsDevice* device, uint32_t maxBlasVertices); + + // ── BLAS extraction (compute shaders) ─────────────────────── + + // Extract blocky quad positions into BLAS vertex buffer. + void dispatchBLASExtract(wi::graphics::CommandList cmd, + const wi::graphics::GPUBuffer& quadBuffer, + const wi::graphics::GPUBuffer& chunkInfoBuffer, + uint32_t quadCount) const; + + // Extract toping instance positions via GPU compute. + // groupBuffer/groupsGPU: toping BLAS group table. + void dispatchTopingBLASExtract(wi::graphics::CommandList cmd, + const wi::graphics::GPUBuffer& topingVertexBuffer, + const wi::graphics::GPUBuffer& topingInstanceBuffer, + const void* groupsGPUData, size_t groupsGPUSize, + uint32_t groupCount, uint32_t totalVertices) const; + + // ── Acceleration structure build ──────────────────────────── + static constexpr uint32_t BUILD_BLOCKY = 1 << 0; + static constexpr uint32_t BUILD_SMOOTH = 1 << 1; + static constexpr uint32_t BUILD_TOPING = 1 << 2; + static constexpr uint32_t BUILD_ALL = BUILD_BLOCKY | BUILD_SMOOTH | BUILD_TOPING; + + void buildAccelerationStructures(wi::graphics::CommandList cmd, + uint32_t buildFlags, + const wi::graphics::GPUBuffer& smoothVB, + uint32_t smoothVertCount) const; + + // ── RT Shadows + AO dispatch ──────────────────────────────── + void dispatchShadows(wi::graphics::CommandList cmd, + const wi::graphics::Texture& depthBuffer, + const wi::graphics::Texture& renderTarget, + const wi::graphics::Texture& normalTarget, + const wi::graphics::GPUBuffer& constantBuffer) const; + + // ── Toping BLAS buffer management ─────────────────────────── + // Ensure capacity for toping BLAS position + index buffers. + // Returns true if buffers were (re)created. 
+ bool ensureTopingBLASCapacity(uint32_t totalVertices); + + // ── State queries ─────────────────────────────────────────── + bool isAvailable() const { return available_; } + bool isReady() const { return available_ && tlas_.IsValid(); } + bool isShadowsEnabled() const { return shadowsEnabled_; } + void setShadowsEnabled(bool v) { shadowsEnabled_ = v; } + uint32_t getShadowDebug() const { return shadowDebug_; } + void setShadowDebug(uint32_t v) { shadowDebug_ = v; } + + uint32_t getBlockyTriCount() const { return blockyVertexCount_ / 3; } + uint32_t getSmoothTriCount() const { return smoothVertexCount_ / 3; } + uint32_t getTopingTriCount() const { return topingVertexCount_ / 3; } + uint32_t getTopingVertexCount() const { return topingVertexCount_; } + uint32_t getTlasInstanceCount() const { return tlasInstanceCount_; } + const wi::graphics::RaytracingAccelerationStructure& getTLAS() const { return tlas_; } + + // Dirty flags (public for VoxelRenderPath orchestration) + mutable bool dirty = true; // BLAS/TLAS need rebuild + mutable bool topingBLASDirty = false; // toping BLAS extract + rebuild needed + mutable bool aoHistoryValid = false; + mutable uint32_t frameCounter = 0; + mutable XMFLOAT4X4 prevViewProjection; + + // AO textures (created by VoxelRenderPath::createRenderTargets) + mutable wi::graphics::Texture aoRawTexture; + mutable wi::graphics::Texture aoBlurredTexture; + mutable wi::graphics::Texture aoHistoryTexture; + +private: + wi::graphics::GraphicsDevice* device_ = nullptr; + mutable bool available_ = false; + mutable bool shadowsEnabled_ = false; + mutable uint32_t shadowDebug_ = 0; + + // Shaders + wi::graphics::Shader blasExtractShader_; + wi::graphics::Shader topingBLASShader_; + wi::graphics::Shader shadowShader_; + wi::graphics::Shader aoBlurShader_; + wi::graphics::Shader aoApplyShader_; + + // Blocky BLAS resources + mutable wi::graphics::GPUBuffer blasPositionBuffer_; + wi::graphics::GPUBuffer blasIndexBuffer_; + mutable 
wi::graphics::RaytracingAccelerationStructure blockyBLAS_; + mutable uint32_t blockyBLASCapacity_ = 0; + mutable uint32_t blockyVertexCount_ = 0; + + // Smooth BLAS + mutable wi::graphics::RaytracingAccelerationStructure smoothBLAS_; + mutable uint32_t smoothBLASCapacity_ = 0; + mutable uint32_t smoothVertexCount_ = 0; + + // Toping BLAS + mutable wi::graphics::RaytracingAccelerationStructure topingBLAS_; + mutable uint32_t topingBLASASCapacity_ = 0; + mutable uint32_t topingVertexCount_ = 0; + mutable DeferredGPUBuffer topingBLASPositionBuf_; + mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_; + mutable uint32_t topingBLASIndexCount_ = 0; + wi::graphics::GPUBuffer topingBLASGroupBuffer_; + + // TLAS + mutable wi::graphics::RaytracingAccelerationStructure tlas_; + mutable uint32_t tlasInstanceCount_ = 0; + + uint32_t maxBlasVertices_ = 0; +}; + +} // namespace voxel diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index b49392c..a3864ea 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -128,78 +128,8 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) { wi::backlog::post("VoxelRenderer: GPU smooth mesher available (2-pass with smooth normals)"); } - // ── Ray Tracing (Phase 6.1) ──────────────────────────────────── - rtAvailable_ = device_->CheckCapability(GraphicsDeviceCapability::RAYTRACING); - if (rtAvailable_) { - wi::renderer::LoadShader(ShaderStage::CS, blasExtractShader_, "voxel/voxelBLASExtractCS.cso"); - if (blasExtractShader_.IsValid()) { - // BLAS position buffer: 6 float3 per quad (non-indexed triangles) - // Use BUFFER_RAW (ByteAddressBuffer) — structured buffers may not work as BLAS vertex input - GPUBufferDesc posDesc; - posDesc.size = (uint64_t)MAX_BLAS_VERTICES * sizeof(float) * 3; // float3 per vertex - posDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE; - posDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW; - posDesc.stride = 0; // raw buffer, no stride - 
posDesc.usage = Usage::DEFAULT; - bool ok = device_->CreateBuffer(&posDesc, nullptr, &blasPositionBuffer_); - // Sequential index buffer for BLAS (DX12 requires valid index buffer, - // Wicked always writes IndexBuffer GPU address even for "non-indexed"). - GPUBufferDesc idxDesc; - idxDesc.size = (uint64_t)MAX_BLAS_VERTICES * sizeof(uint32_t); - idxDesc.bind_flags = BindFlag::SHADER_RESOURCE; - idxDesc.usage = Usage::DEFAULT; - auto fillIndices = [](void* dest) { - uint32_t* p = (uint32_t*)dest; - for (uint32_t i = 0; i < MAX_BLAS_VERTICES; i++) - p[i] = i; - }; - bool okIdx = device_->CreateBuffer2(&idxDesc, fillIndices, &blasIndexBuffer_); - - if (ok && blasPositionBuffer_.IsValid() && okIdx && blasIndexBuffer_.IsValid()) { - device_->SetName(&blasPositionBuffer_, "VoxelRenderer::blasPositionBuffer"); - device_->SetName(&blasIndexBuffer_, "VoxelRenderer::blasIndexBuffer"); - wi::backlog::post("VoxelRenderer: RT available (BLAS pos " - + std::to_string(posDesc.size / (1024*1024)) + " MB + idx " - + std::to_string(idxDesc.size / (1024*1024)) + " MB)"); - } else { - rtAvailable_ = false; - wi::backlog::post("VoxelRenderer: RT buffer creation failed", wi::backlog::LogLevel::Warning); - } - } else { - rtAvailable_ = false; - wi::backlog::post("VoxelRenderer: RT available but BLAS extraction shader failed", wi::backlog::LogLevel::Warning); - } - // ── Toping BLAS CS (replaces 196ms CPU loop) ───────────────── - wi::renderer::LoadShader(ShaderStage::CS, topingBLASShader_, "voxel/voxelTopingBLASCS.cso"); - if (topingBLASShader_.IsValid()) { - // Pre-allocate small group table buffer (max 64 groups × 20 bytes = 1.25 KB) - GPUBufferDesc grpDesc; - grpDesc.size = MAX_TOPING_BLAS_GROUPS * sizeof(TopingBLASGroupGPU); - grpDesc.bind_flags = BindFlag::SHADER_RESOURCE; - grpDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; - grpDesc.stride = sizeof(TopingBLASGroupGPU); - grpDesc.usage = Usage::DEFAULT; - device_->CreateBuffer(&grpDesc, nullptr, &topingBLASGroupBuffer_); - 
wi::backlog::post("VoxelRenderer: toping BLAS CS available"); - } else { - wi::backlog::post("VoxelRenderer: toping BLAS CS failed", wi::backlog::LogLevel::Warning); - } - - // ── RT Shadows + AO (Phase 6.2 + 6.3) ──────────────────────── - wi::renderer::LoadShader(ShaderStage::CS, shadowShader_, "voxel/voxelShadowCS.cso", - wi::graphics::ShaderModel::SM_6_5); - wi::renderer::LoadShader(ShaderStage::CS, aoBlurShader_, "voxel/voxelAOBlurCS.cso"); - wi::renderer::LoadShader(ShaderStage::CS, aoApplyShader_, "voxel/voxelAOApplyCS.cso"); - if (shadowShader_.IsValid() && aoBlurShader_.IsValid() && aoApplyShader_.IsValid()) { - rtShadowsEnabled_ = true; - wi::backlog::post("VoxelRenderer: RT shadows + AO blur available"); - } else { - wi::backlog::post("VoxelRenderer: RT shadow/AO shader(s) failed to compile", - wi::backlog::LogLevel::Warning); - } - } else { - wi::backlog::post("VoxelRenderer: RT not available (GPU does not support ray tracing)"); - } + // ── Ray Tracing (Phase 6) ──────────────────────────────────────── + rt_.initialize(device_, MAX_BLAS_VERTICES); cpuChunkInfo_.reserve(MAX_CHUNKS); chunkSlots_.reserve(MAX_CHUNKS); @@ -715,527 +645,8 @@ void VoxelRenderer::dispatchGpuSmoothMesh(CommandList cmd, const VoxelWorld& wor gpuSmoothMeshDirty_ = false; } -// ── Ray Tracing: BLAS extraction + AS build (Phase 6.1) ────────── +// ── Ray Tracing methods now in VoxelRTManager.cpp ──────────────── -void VoxelRenderer::dispatchBLASExtract(CommandList cmd) const { - if (!rtAvailable_ || !blasExtractShader_.IsValid()) return; - - auto* dev = device_; - uint32_t quadCount = gpuMeshQuadCount_; - if (quadCount == 0) return; - - // Pre-barriers: blasPositionBuffer_ UNDEFINED → UAV - GPUBarrier preBarriers[] = { - GPUBarrier::Buffer(&blasPositionBuffer_, - ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(preBarriers, 1, cmd); - - // Bind compute shader - dev->BindComputeShader(&blasExtractShader_, cmd); - - // Bind resources: t0 = gpuQuadBuffer 
(SRV), t2 = chunkInfoBuffer (SRV), u0 = blasPositionBuffer (UAV) - dev->BindResource(&gpuQuadBuffer_, 0, cmd); // t0 - dev->BindResource(&chunkInfoBuffer_, 2, cmd); // t2 - dev->BindUAV(&blasPositionBuffer_, 0, cmd); // u0 - - // Push constants: quadCount - struct BLASPush { - uint32_t quadCount; - uint32_t pad[11]; - } pushData = {}; - pushData.quadCount = quadCount; - dev->PushConstants(&pushData, sizeof(pushData), cmd); - - // Dispatch: 64 threads per group - uint32_t groupCount = (quadCount + 63) / 64; - dev->Dispatch(groupCount, 1, 1, cmd); - - // Post-barrier: blasPositionBuffer_ UAV → SHADER_RESOURCE (for BLAS build) - GPUBarrier postBarriers[] = { - GPUBarrier::Buffer(&blasPositionBuffer_, - ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), - }; - dev->Barrier(postBarriers, 1, cmd); - - rtBlockyVertexCount_ = quadCount * 6; -} - -// ── GPU compute toping BLAS position extraction ───────────────── -// Replaces the 196ms CPU nested loop. Reads vertex templates (t4) + -// instance positions (t5) + group table (t7), writes positions to u0. 
-void VoxelRenderer::dispatchTopingBLASExtract(CommandList cmd) const { - if (!topingBLASShader_.IsValid() || !topingBLASGroupBuffer_.IsValid() || - !topingBLASPositionBuffer_.IsValid() || !topingVertexBuffer_.IsValid() || - !topingInstanceBuffer_.IsValid() || topingBLASTotalVertices_ == 0 || - topingBLASGroupsGPU_.empty()) return; - - auto* dev = device_; - - // Upload group table (tiny: ~32 × 20 bytes = 640 bytes) - size_t groupUploadSize = topingBLASGroupsGPU_.size() * sizeof(TopingBLASGroupGPU); - dev->UpdateBuffer(&topingBLASGroupBuffer_, - topingBLASGroupsGPU_.data(), cmd, groupUploadSize); - - // Pre-barriers - GPUBarrier preBarriers[] = { - GPUBarrier::Buffer(&topingBLASGroupBuffer_, - ResourceState::COPY_DST, ResourceState::SHADER_RESOURCE), - GPUBarrier::Buffer(&topingBLASPositionBuffer_, - ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(preBarriers, 2, cmd); - - // Bind compute shader + resources - dev->BindComputeShader(&topingBLASShader_, cmd); - dev->BindResource(&topingVertexBuffer_, 4, cmd); // t4 - dev->BindResource(&topingInstanceBuffer_, 5, cmd); // t5 - dev->BindResource(&topingBLASGroupBuffer_, 7, cmd); // t7 - dev->BindUAV(&topingBLASPositionBuffer_, 0, cmd); // u0 - - struct { - uint32_t totalVertices; - uint32_t groupCount; - uint32_t pad[10]; - } pushData = {}; - pushData.totalVertices = topingBLASTotalVertices_; - pushData.groupCount = (uint32_t)topingBLASGroupsGPU_.size(); - dev->PushConstants(&pushData, sizeof(pushData), cmd); - - // Dispatch: 64 threads per group - uint32_t threadGroups = (topingBLASTotalVertices_ + 63) / 64; - dev->Dispatch(threadGroups, 1, 1, cmd); - - // Post-barrier: UAV → SHADER_RESOURCE (for BLAS build) - GPUBarrier postBarriers[] = { - GPUBarrier::Buffer(&topingBLASPositionBuffer_, - ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), - }; - dev->Barrier(postBarriers, 1, cmd); - - rtTopingVertexCount_ = topingBLASTotalVertices_; - rtDirty_ = true; - topingBLASDirty_ = 
false; -} - -void VoxelRenderer::buildAccelerationStructures(CommandList cmd, - uint32_t buildFlags) const { - if (!rtAvailable_) return; - - auto* dev = device_; - - // ── Blocky BLAS ────────────────────────────────────────────── - uint32_t blockyVertCount = rtBlockyVertexCount_; - if (blockyVertCount < 3) blockyVertCount = 0; - if ((buildFlags & RT_BUILD_BLOCKY) && blockyVertCount > 0 && blasPositionBuffer_.IsValid()) { - if (!blockyBLAS_.IsValid() || blockyVertCount > blockyBLASCapacity_) { - blockyBLASCapacity_ = blockyVertCount + blockyVertCount / 4; // 25% headroom - - RaytracingAccelerationStructureDesc desc; - desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; - desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; - - desc.bottom_level.geometries.resize(1); - auto& geom = desc.bottom_level.geometries[0]; - geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; - geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; - geom.triangles.vertex_buffer = blasPositionBuffer_; - geom.triangles.vertex_byte_offset = 0; - geom.triangles.vertex_count = blockyBLASCapacity_; - geom.triangles.vertex_stride = sizeof(float) * 3; - geom.triangles.vertex_format = Format::R32G32B32_FLOAT; - geom.triangles.index_buffer = blasIndexBuffer_; - geom.triangles.index_count = blockyBLASCapacity_; - geom.triangles.index_format = IndexBufferFormat::UINT32; - geom.triangles.index_offset = 0; - - bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &blockyBLAS_); - if (ok) { - dev->SetName(&blockyBLAS_, "VoxelRenderer::blockyBLAS"); - wi::backlog::post("VoxelRenderer: blocky BLAS created (capacity " - + std::to_string(blockyBLASCapacity_ / 3) + " tris)"); - } else { - wi::backlog::post("VoxelRenderer: failed to create blocky BLAS", wi::backlog::LogLevel::Error); - rtAvailable_ = false; - return; - } - } - - // Update actual vertex count, then Build - 
blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = blockyVertCount; - blockyBLAS_.desc.bottom_level.geometries[0].triangles.index_count = blockyVertCount; - dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr); - } - - // ── Smooth BLAS ────────────────────────────────────────────── - // Smooth vertex buffer: float3 position at offset 0, stride 32 bytes - uint32_t smoothVertCount = gpuSmoothVertexCount_; - if (smoothVertCount < 3) smoothVertCount = 0; // Need at least 1 triangle - bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid(); - const GPUBuffer& smoothVB = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_; - - if ((buildFlags & RT_BUILD_SMOOTH) && smoothVertCount > 0 && smoothVB.IsValid()) { - if (!smoothBLAS_.IsValid() || smoothVertCount > smoothBLASCapacity_) { - smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4; - - RaytracingAccelerationStructureDesc desc; - desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; - desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; - - desc.bottom_level.geometries.resize(1); - auto& geom = desc.bottom_level.geometries[0]; - geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; - geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; - geom.triangles.vertex_buffer = smoothVB; - geom.triangles.vertex_byte_offset = 0; - geom.triangles.vertex_count = smoothBLASCapacity_; - geom.triangles.vertex_stride = 32; - geom.triangles.index_buffer = blasIndexBuffer_; - geom.triangles.index_count = smoothBLASCapacity_; - geom.triangles.index_format = IndexBufferFormat::UINT32; - geom.triangles.index_offset = 0; - geom.triangles.vertex_format = Format::R32G32B32_FLOAT; - - bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &smoothBLAS_); - if (ok) { - dev->SetName(&smoothBLAS_, "VoxelRenderer::smoothBLAS"); - wi::backlog::post("VoxelRenderer: 
smooth BLAS created (capacity " - + std::to_string(smoothBLASCapacity_ / 3) + " tris)"); - } else { - wi::backlog::post("VoxelRenderer: failed to create smooth BLAS", wi::backlog::LogLevel::Error); - } - } - - if (smoothBLAS_.IsValid()) { - smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = smoothVertCount; - smoothBLAS_.desc.bottom_level.geometries[0].triangles.index_count = smoothVertCount; - dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr); - } - - rtSmoothVertexCount_ = smoothVertCount; - } - - // ── Toping BLAS ────────────────────────────────────────────── - uint32_t topingVertCount = rtTopingVertexCount_; - if ((buildFlags & RT_BUILD_TOPING) && topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) { - if (!topingBLAS_.IsValid() || topingVertCount > topingBLASASCapacity_) { - topingBLASASCapacity_ = topingVertCount + topingVertCount / 4; - - RaytracingAccelerationStructureDesc desc; - desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; - desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; - - desc.bottom_level.geometries.resize(1); - auto& geom = desc.bottom_level.geometries[0]; - geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; - geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; - geom.triangles.vertex_buffer = topingBLASPositionBuffer_; - geom.triangles.vertex_byte_offset = 0; - geom.triangles.vertex_count = topingBLASASCapacity_; - geom.triangles.vertex_stride = sizeof(float) * 3; - geom.triangles.vertex_format = Format::R32G32B32_FLOAT; - geom.triangles.index_buffer = topingBLASIndexBuffer_; - geom.triangles.index_count = topingBLASASCapacity_; - geom.triangles.index_format = IndexBufferFormat::UINT32; - geom.triangles.index_offset = 0; - - bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &topingBLAS_); - if (ok) { - dev->SetName(&topingBLAS_, "VoxelRenderer::topingBLAS"); - 
wi::backlog::post("VoxelRenderer: toping BLAS created (capacity " - + std::to_string(topingBLASASCapacity_ / 3) + " tris)"); - } else { - wi::backlog::post("VoxelRenderer: failed to create toping BLAS", wi::backlog::LogLevel::Error); - } - } - - if (topingBLAS_.IsValid()) { - topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = topingVertCount; - topingBLAS_.desc.bottom_level.geometries[0].triangles.index_count = topingVertCount; - dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr); - } - } - - // ── Memory barrier: sync BLAS builds before TLAS ────────────── - // Without this, TLAS build can execute before BLASes are complete. - // (Same pattern as wiRenderer.cpp line 5788) - { - GPUBarrier barriers[] = { GPUBarrier::Memory() }; - dev->Barrier(barriers, 1, cmd); - } - - // ── TLAS (up to 3 instances: blocky + smooth + topings) ──── - // Only recreate TLAS when instance count changes. BLASes are capacity-based - // (never recreated during animation), so BLAS pointers remain stable. - // For subsequent frames, just rebuild the TLAS to pick up rebuilt BLASes. - uint32_t instanceCount = 0; - if (blockyBLAS_.IsValid()) instanceCount++; - if (smoothBLAS_.IsValid() && smoothVertCount > 0) instanceCount++; - if (topingBLAS_.IsValid() && topingVertCount >= 3) instanceCount++; - if (instanceCount == 0) { rtDirty_ = false; return; } - - // Recreate TLAS only when instance count changes (avoids per-frame GPU allocation) - if (!tlas_.IsValid() || instanceCount != tlasInstanceCount_) { - const size_t instSize = dev->GetTopLevelAccelerationStructureInstanceSize(); - - auto setIdentity = [](float transform[3][4]) { - std::memset(transform, 0, sizeof(float) * 12); - transform[0][0] = 1.0f; - transform[1][1] = 1.0f; - transform[2][2] = 1.0f; - }; - - const RaytracingAccelerationStructure* blockyBLASPtr = blockyBLAS_.IsValid() ? 
&blockyBLAS_ : nullptr; - const RaytracingAccelerationStructure* smoothBLASPtr = (smoothBLAS_.IsValid() && smoothVertCount > 0) ? &smoothBLAS_ : nullptr; - const RaytracingAccelerationStructure* topingBLASPtr = (topingBLAS_.IsValid() && topingVertCount >= 3) ? &topingBLAS_ : nullptr; - - RaytracingAccelerationStructureDesc desc; - desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; - desc.type = RaytracingAccelerationStructureDesc::Type::TOPLEVEL; - desc.top_level.count = instanceCount; - - GPUBufferDesc bufdesc; - bufdesc.misc_flags = ResourceMiscFlag::RAY_TRACING; - bufdesc.stride = (uint32_t)instSize; - bufdesc.size = bufdesc.stride * desc.top_level.count; - - auto initInstances = [&](void* dest) { - uint32_t idx = 0; - if (blockyBLASPtr) { - RaytracingAccelerationStructureDesc::TopLevel::Instance inst; - setIdentity(inst.transform); - inst.instance_id = 0; inst.instance_mask = 0xFF; - inst.instance_contribution_to_hit_group_index = 0; inst.flags = 0; - inst.bottom_level = blockyBLASPtr; - dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize); - idx++; - } - if (smoothBLASPtr) { - RaytracingAccelerationStructureDesc::TopLevel::Instance inst; - setIdentity(inst.transform); - inst.instance_id = 1; inst.instance_mask = 0xFF; - inst.instance_contribution_to_hit_group_index = 0; inst.flags = 0; - inst.bottom_level = smoothBLASPtr; - dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize); - idx++; - } - if (topingBLASPtr) { - RaytracingAccelerationStructureDesc::TopLevel::Instance inst; - setIdentity(inst.transform); - inst.instance_id = 2; inst.instance_mask = 0xFF; - inst.instance_contribution_to_hit_group_index = 0; inst.flags = 0; - inst.bottom_level = topingBLASPtr; - dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize); - idx++; - } - }; - - bool ok = dev->CreateBuffer2(&bufdesc, initInstances, &desc.top_level.instance_buffer); - if (!ok) { 
- wi::backlog::post("VoxelRenderer: failed to create TLAS instance buffer", wi::backlog::LogLevel::Error); - rtDirty_ = false; - return; - } - - ok = dev->CreateRaytracingAccelerationStructure(&desc, &tlas_); - if (!ok) { - wi::backlog::post("VoxelRenderer: failed to create TLAS", wi::backlog::LogLevel::Error); - rtDirty_ = false; - return; - } - - tlasInstanceCount_ = instanceCount; - wi::backlog::post("VoxelRenderer: TLAS created (" + std::to_string(instanceCount) + " instances)"); - } - - dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr); - - // Memory barrier: sync TLAS build before ray queries can use it - // (Same pattern as wiRenderer.cpp line 5808) - { - GPUBarrier barriers[] = { GPUBarrier::Memory(&tlas_) }; - dev->Barrier(barriers, 1, cmd); - } - - rtDirty_ = false; -} - -// ── RT Shadow + AO dispatch (Phase 6.2 + 6.3) ────────────────── -void VoxelRenderer::dispatchShadows(CommandList cmd, - const Texture& depthBuffer, - const Texture& renderTarget, - const Texture& normalTarget) const -{ - if (!rtShadowsEnabled_ || !shadowShader_.IsValid() || !tlas_.IsValid()) - return; - - auto* dev = device_; - uint32_t w = renderTarget.GetDesc().width; - uint32_t h = renderTarget.GetDesc().height; - uint32_t gx = (w + 7) / 8; - uint32_t gy = (h + 7) / 8; - - // ── Pass 1: Shadow + raw AO (with temporal accumulation) ─────── - { - GPUBarrier preBarriers[] = { - GPUBarrier::Image(&const_cast(depthBuffer), - ResourceState::DEPTHSTENCIL, ResourceState::SHADER_RESOURCE), - GPUBarrier::Image(&const_cast(renderTarget), - ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), - GPUBarrier::Image(&aoRawTexture_, - ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(preBarriers, 3, cmd); - - dev->BindComputeShader(&shadowShader_, cmd); - dev->BindResource(&depthBuffer, 0, cmd); // t0 = depth - dev->BindResource(&normalTarget, 1, cmd); // t1 = normals - dev->BindResource(&tlas_, 2, cmd); // t2 = TLAS - 
dev->BindResource(&aoHistoryTexture_, 3, cmd); // t3 = AO history (prev frame) - dev->BindUAV(&renderTarget, 0, cmd); // u0 = color - dev->BindUAV(&aoRawTexture_, 1, cmd); // u1 = raw AO output - dev->BindConstantBuffer(&constantBuffer_, 0, cmd); - - struct ShadowPush { - uint32_t width, height; - float normalBias, shadowMaxDist; - uint32_t debugMode; - float aoRadius; - uint32_t aoRayCount; - float aoStrength; - uint32_t frameIndex; - uint32_t historyValid; - uint32_t pad[2]; - } pushData = {}; - pushData.width = w; - pushData.height = h; - pushData.normalBias = 0.15f; - pushData.shadowMaxDist = 512.0f; - pushData.debugMode = rtShadowDebug_; - pushData.aoRadius = 8.0f; - pushData.aoRayCount = 4; - pushData.aoStrength = 0.7f; - pushData.frameIndex = frameCounter_++; - pushData.historyValid = aoHistoryValid_ ? 1u : 0u; - dev->PushConstants(&pushData, sizeof(pushData), cmd); - dev->Dispatch(gx, gy, 1, cmd); - } - - // ── Pass 1.5: Copy raw AO → history (before blur, for next frame) ── - { - GPUBarrier copyBarriers[] = { - GPUBarrier::Image(&aoRawTexture_, - ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC), - GPUBarrier::Image(&aoHistoryTexture_, - ResourceState::SHADER_RESOURCE, ResourceState::COPY_DST), - }; - dev->Barrier(copyBarriers, 2, cmd); - dev->CopyResource(&aoHistoryTexture_, &aoRawTexture_, cmd); - - GPUBarrier postCopyBarriers[] = { - GPUBarrier::Image(&aoRawTexture_, - ResourceState::COPY_SRC, ResourceState::SHADER_RESOURCE), - GPUBarrier::Image(&aoHistoryTexture_, - ResourceState::COPY_DST, ResourceState::SHADER_RESOURCE), - }; - dev->Barrier(postCopyBarriers, 2, cmd); - aoHistoryValid_ = true; - } - - // ── Pass 2: Bilateral blur horizontal (aoRaw → aoBlurred) ────── - { - // aoRawTexture_ already in SHADER_RESOURCE from copy pass - GPUBarrier barriers[] = { - GPUBarrier::Image(&aoBlurredTexture_, - ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(barriers, 1, cmd); - - 
dev->BindComputeShader(&aoBlurShader_, cmd); - dev->BindResource(&aoRawTexture_, 0, cmd); // t0 = AO input - dev->BindResource(&depthBuffer, 1, cmd); // t1 = depth - dev->BindResource(&normalTarget, 2, cmd); // t2 = normals - dev->BindUAV(&aoBlurredTexture_, 0, cmd); // u0 = AO output - - struct BlurPush { - uint32_t width, height, direction, radius; - float depthThreshold, normalThreshold; - uint32_t pad[6]; - } blurPush = {}; - blurPush.width = w; - blurPush.height = h; - blurPush.direction = 0; // horizontal - blurPush.radius = 6; - blurPush.depthThreshold = 0.001f; - blurPush.normalThreshold = 0.9f; - dev->PushConstants(&blurPush, sizeof(blurPush), cmd); - dev->Dispatch(gx, gy, 1, cmd); - } - - // ── Pass 3: Bilateral blur vertical (aoBlurred → aoRaw) ──────── - { - GPUBarrier barriers[] = { - GPUBarrier::Image(&aoBlurredTexture_, - ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), - GPUBarrier::Image(&aoRawTexture_, - ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS), - }; - dev->Barrier(barriers, 2, cmd); - - dev->BindComputeShader(&aoBlurShader_, cmd); - dev->BindResource(&aoBlurredTexture_, 0, cmd); // t0 = AO input (H-blurred) - dev->BindResource(&depthBuffer, 1, cmd); - dev->BindResource(&normalTarget, 2, cmd); - dev->BindUAV(&aoRawTexture_, 0, cmd); // u0 = AO output (fully blurred) - - struct BlurPush { - uint32_t width, height, direction, radius; - float depthThreshold, normalThreshold; - uint32_t pad[6]; - } blurPush = {}; - blurPush.width = w; - blurPush.height = h; - blurPush.direction = 1; // vertical - blurPush.radius = 6; - blurPush.depthThreshold = 0.001f; - blurPush.normalThreshold = 0.9f; - dev->PushConstants(&blurPush, sizeof(blurPush), cmd); - dev->Dispatch(gx, gy, 1, cmd); - } - - // ── Pass 4: Apply blurred AO to color ────────────────────────── - { - GPUBarrier barriers[] = { - GPUBarrier::Image(&aoRawTexture_, - ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), - }; - dev->Barrier(barriers, 
1, cmd); - - dev->BindComputeShader(&aoApplyShader_, cmd); - dev->BindResource(&aoRawTexture_, 0, cmd); // t0 = blurred AO - dev->BindResource(&depthBuffer, 1, cmd); // t1 = depth (for sky detection) - dev->BindUAV(&renderTarget, 0, cmd); // u0 = color - - struct ApplyPush { - uint32_t width, height, debugMode; - uint32_t pad[9]; - } applyPush = {}; - applyPush.width = w; - applyPush.height = h; - applyPush.debugMode = rtShadowDebug_; - dev->PushConstants(&applyPush, sizeof(applyPush), cmd); - dev->Dispatch(gx, gy, 1, cmd); - } - - // ── Restore resource states ──────────────────────────────────── - GPUBarrier postBarriers[] = { - GPUBarrier::Image(&const_cast(depthBuffer), - ResourceState::SHADER_RESOURCE, ResourceState::DEPTHSTENCIL), - GPUBarrier::Image(&const_cast(renderTarget), - ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), - }; - dev->Barrier(postBarriers, 2, cmd); -} // ── Frustum plane extraction (Gribb-Hartmann method) ──────────── static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) { @@ -1295,7 +706,7 @@ void VoxelRenderer::render( XMStoreFloat4x4(&cb.viewProjection, vpMatrix); XMMATRIX invVP = XMMatrixInverse(nullptr, vpMatrix); XMStoreFloat4x4(&cb.inverseViewProjection, invVP); - cb.prevViewProjection = prevViewProjection_; // from last frame + cb.prevViewProjection = rt_.prevViewProjection; // from last frame cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f); cb.sunDirection = XMFLOAT4(-0.7f, -0.4f, -0.3f, 0.0f); // lower sun = longer cast shadows cb.sunColor = XMFLOAT4(1.35f, 1.15f, 0.75f, 1.0f); // warm golden sun @@ -1318,7 +729,7 @@ void VoxelRenderer::render( cb.toneMapParams = XMFLOAT4(1.10f, 1.8f, 0.0f, 0.0f); // natural saturation, balanced exposure dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb)); // Save current VP for next frame's temporal reprojection - XMStoreFloat4x4(&prevViewProjection_, vpMatrix); + XMStoreFloat4x4(&rt_.prevViewProjection, vpMatrix); // Render 
pass (MRT: color + normals + depth) RenderPassImage rp[] = { @@ -1433,15 +844,39 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { // GPU instances are just float3 (12 bytes), sorted by (type, variant) for batched draws. // We sort a copy and build a draw group table. // Reuse persistent vectors to avoid per-frame allocations. - topingSorted_.resize(instances.size()); - for (size_t i = 0; i < instances.size(); i++) { - topingSorted_[i] = { instances[i].wx, instances[i].wy, instances[i].wz, - instances[i].topingType, instances[i].variant }; + // Counting sort by (type, variant) — O(n) since key space is tiny + // (types < ~10, variants 0-15 → max ~160 buckets). + { + const size_t n = instances.size(); + topingSorted_.resize(n); + if (n > 0) { + // Find max composite key to size the count array + uint32_t maxKey = 0; + for (size_t i = 0; i < n; i++) { + uint32_t key = ((uint32_t)instances[i].topingType << 4) | instances[i].variant; + if (key > maxKey) maxKey = key; + } + + // Count occurrences + std::vector counts(maxKey + 2, 0); + for (size_t i = 0; i < n; i++) { + uint32_t key = ((uint32_t)instances[i].topingType << 4) | instances[i].variant; + counts[key + 1]++; + } + + // Prefix sum → write offsets + for (uint32_t k = 1; k < (uint32_t)counts.size(); k++) + counts[k] += counts[k - 1]; + + // Scatter into sorted output + for (size_t i = 0; i < n; i++) { + uint32_t key = ((uint32_t)instances[i].topingType << 4) | instances[i].variant; + uint32_t pos = counts[key]++; + topingSorted_[pos] = { instances[i].wx, instances[i].wy, instances[i].wz, + instances[i].topingType, instances[i].variant }; + } + } } - std::sort(topingSorted_.begin(), topingSorted_.end(), [](const TopingSortedInst& a, const TopingSortedInst& b) { - if (a.type != b.type) return a.type < b.type; - return a.variant < b.variant; - }); // Pack GPU instance data (just float3 positions) uint32_t instCount = (uint32_t)std::min(topingSorted_.size(), (size_t)MAX_TOPING_INSTANCES); @@ 
-1451,26 +886,15 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { } // Pre-allocate instance buffer; only recreate when capacity needs to grow. - // Data upload deferred to Render() via UpdateBuffer (needs CommandList). - if (!topingInstanceBuffer_.IsValid() || topingInstanceCapacity_ < instCount) { - topingInstanceCapacity_ = instCount + instCount / 4; // 25% headroom - GPUBufferDesc ibDesc; - ibDesc.size = topingInstanceCapacity_ * sizeof(TopingGPUInst); - ibDesc.bind_flags = BindFlag::SHADER_RESOURCE; - ibDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; - ibDesc.stride = sizeof(TopingGPUInst); - ibDesc.usage = Usage::DEFAULT; - // Create WITHOUT data — Wicked copies desc.size bytes from data ptr, - // which would overread our vector (instCount < topingInstanceCapacity_). - // Actual data upload deferred to Render() via UpdateBuffer. - device_->CreateBuffer(&ibDesc, nullptr, &topingInstanceBuffer_); - topingInstanceDirty_ = true; // upload data in Render() + // Data upload deferred to Render() via DeferredGPUBuffer::upload(). 
+ if (topingInstanceBuf_.ensureCapacity(device_, instCount, sizeof(TopingGPUInst), + BindFlag::SHADER_RESOURCE)) { char msg[128]; snprintf(msg, sizeof(msg), "Toping: allocated instance buffer (%u capacity, %.1f KB)", - topingInstanceCapacity_, topingInstanceCapacity_ * sizeof(TopingGPUInst) / 1024.0f); + topingInstanceBuf_.capacity, topingInstanceBuf_.capacity * sizeof(TopingGPUInst) / 1024.0f); wi::backlog::post(msg); } else { - topingInstanceDirty_ = true; // deferred upload in Render() + topingInstanceBuf_.markDirty(); // deferred upload in Render() } // ── Build draw groups + BLAS group table ─────────────────── @@ -1529,43 +953,10 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { totalTopingVerts += dg.vertexCount * dg.instanceCount; } topingBLASTotalVertices_ = totalTopingVerts; - topingBLASVertexCount_ = totalTopingVerts; - // Pre-allocate GPU BLAS position buffer (UAV+SRV, raw for RWByteAddressBuffer). - // Same pattern as blocky blasPositionBuffer_. + // Delegate BLAS buffer allocation to VoxelRTManager if (totalTopingVerts > 0) { - if (!topingBLASPositionBuffer_.IsValid() || topingBLASPositionCapacity_ < totalTopingVerts) { - topingBLASPositionCapacity_ = totalTopingVerts + totalTopingVerts / 4; // 25% headroom - GPUBufferDesc posDesc; - posDesc.size = (size_t)topingBLASPositionCapacity_ * 3 * sizeof(float); - posDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE; - posDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW; - posDesc.stride = 0; - posDesc.usage = Usage::DEFAULT; - device_->CreateBuffer(&posDesc, nullptr, &topingBLASPositionBuffer_); - - char msg[256]; - snprintf(msg, sizeof(msg), "Toping BLAS: allocated pos buffer (%u capacity, %.1f MB)", - topingBLASPositionCapacity_, posDesc.size / (1024.0 * 1024.0)); - wi::backlog::post(msg); - } - - // Pre-allocate index buffer (sequential [0,1,2,...]) — grow only when needed - if (topingBLASIndexCount_ < topingBLASPositionCapacity_) { - uint32_t idxCount = 
topingBLASPositionCapacity_; - std::vector indices(idxCount); - for (uint32_t j = 0; j < idxCount; j++) indices[j] = j; - - GPUBufferDesc idxDesc; - idxDesc.size = (size_t)idxCount * sizeof(uint32_t); - idxDesc.bind_flags = BindFlag::SHADER_RESOURCE; - idxDesc.misc_flags = ResourceMiscFlag::NONE; - idxDesc.usage = Usage::DEFAULT; - device_->CreateBuffer(&idxDesc, indices.data(), &topingBLASIndexBuffer_); - topingBLASIndexCount_ = idxCount; - } - - topingBLASDirty_ = true; // GPU compute dispatch + BLAS rebuild in Render() + rt_.ensureTopingBLASCapacity(totalTopingVerts); } } @@ -1577,7 +968,7 @@ void VoxelRenderer::renderTopings( const Texture& normalTarget ) const { if (!topingPso_.IsValid() || !topingVertexBuffer_.IsValid() || - !topingInstanceBuffer_.IsValid()) return; + !topingInstanceBuf_.isValid()) return; const auto& instances = topingSystem.getInstances(); const auto& defs = topingSystem.getDefs(); @@ -1627,7 +1018,7 @@ void VoxelRenderer::renderTopings( dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindResource(&textureArray_, 1, cmd); dev->BindResource(&topingVertexBuffer_, 4, cmd); // t4 - dev->BindResource(&topingInstanceBuffer_, 5, cmd); // t5 + dev->BindResource(&topingInstanceBuf_.gpu, 5, cmd); // t5 dev->BindSampler(&sampler_, 0, cmd); // Reuse draw groups built in uploadTopingData (avoids redundant sort) @@ -1689,21 +1080,12 @@ void VoxelRenderer::uploadSmoothData(VoxelWorld& world) { } // Pre-allocate smooth buffer; only recreate when capacity needs to grow. 
- if (!smoothVertexBuffer_.IsValid() || smoothVertexCapacity_ < smoothVertexCount_) { - smoothVertexCapacity_ = smoothVertexCount_ + smoothVertexCount_ / 4; // 25% headroom - GPUBufferDesc vbDesc; - vbDesc.size = smoothVertexCapacity_ * sizeof(SmoothVertex); - vbDesc.bind_flags = BindFlag::SHADER_RESOURCE; - vbDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; - vbDesc.stride = sizeof(SmoothVertex); - vbDesc.usage = Usage::DEFAULT; - // Create WITHOUT data — capacity > vertex count, Wicked would overread. - device_->CreateBuffer(&vbDesc, nullptr, &smoothVertexBuffer_); - smoothVertexDirty_ = true; // upload data in Render() - wi::backlog::post("Smooth: allocated vertex buffer (" + std::to_string(smoothVertexCapacity_) - + " capacity, " + std::to_string(smoothVertexCapacity_ * sizeof(SmoothVertex) / 1024) + " KB)"); + if (smoothVertexBuf_.ensureCapacity(device_, smoothVertexCount_, sizeof(SmoothVertex), + BindFlag::SHADER_RESOURCE)) { + wi::backlog::post("Smooth: allocated vertex buffer (" + std::to_string(smoothVertexBuf_.capacity) + + " capacity, " + std::to_string(smoothVertexBuf_.capacity * sizeof(SmoothVertex) / 1024) + " KB)"); } else { - smoothVertexDirty_ = true; // deferred upload in Render() + smoothVertexBuf_.markDirty(); // deferred upload in Render() } smoothDirty_ = false; @@ -1734,19 +1116,11 @@ void VoxelRenderer::uploadSmoothDataFast(VoxelWorld& world) { } // Pre-allocate smooth buffer; only recreate when capacity needs to grow. - if (!smoothVertexBuffer_.IsValid() || smoothVertexCapacity_ < smoothVertexCount_) { - smoothVertexCapacity_ = smoothVertexCount_ + smoothVertexCount_ / 4; - GPUBufferDesc vbDesc; - vbDesc.size = smoothVertexCapacity_ * sizeof(SmoothVertex); - vbDesc.bind_flags = BindFlag::SHADER_RESOURCE; - vbDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; - vbDesc.stride = sizeof(SmoothVertex); - vbDesc.usage = Usage::DEFAULT; - // Create WITHOUT data — capacity > vertex count, Wicked would overread. 
- device_->CreateBuffer(&vbDesc, nullptr, &smoothVertexBuffer_); - smoothVertexDirty_ = true; // upload data in Render() + if (smoothVertexBuf_.ensureCapacity(device_, smoothVertexCount_, sizeof(SmoothVertex), + BindFlag::SHADER_RESOURCE)) { + // Buffer recreated with 25% headroom } else { - smoothVertexDirty_ = true; // deferred upload in Render() + smoothVertexBuf_.markDirty(); // deferred upload in Render() } smoothDirty_ = false; @@ -1760,7 +1134,7 @@ void VoxelRenderer::renderSmooth( ) const { // Use GPU-generated smooth buffer if available, otherwise CPU buffer const bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid(); - const auto& smoothBuf = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_; + const auto& smoothBuf = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuf_.gpu; uint32_t vertCount = useGpuSmooth ? gpuSmoothVertexCount_ : smoothVertexCount_; if (!smoothPso_.IsValid() || !smoothBuf.IsValid() || vertCount == 0) return; @@ -1839,22 +1213,16 @@ void VoxelRenderPath::Start() { // Generate world if (debugSmooth) { world.generateDebugSmooth(); - cameraPos = { 15.0f, 12.0f, -5.0f }; - cameraPitch = -0.5f; - cameraYaw = 0.8f; + camera_.set(15.0f, 12.0f, -5.0f, -0.5f, 0.8f); } else if (debugMode) { world.generateDebug(); - cameraPos = { 10.0f, 10.0f, 0.0f }; - cameraPitch = -0.4f; - cameraYaw = 0.5f; + camera_.set(10.0f, 10.0f, 0.0f, -0.4f, 0.5f); } else { - world.generateAround(cameraPos.x, cameraPos.y, cameraPos.z, 4); + world.generateAround(camera_.pos.x, camera_.pos.y, camera_.pos.z, 4); } // Screenshot mode: fixed camera with good framing of terrain if (screenshotMode) { - cameraPos = { 270.0f, 50.0f, 240.0f }; // above terrain, below sky - cameraPitch = -0.25f; // slight downward look - cameraYaw = 0.6f; // angled view for depth + camera_.set(270.0f, 50.0f, 240.0f, -0.25f, 0.6f); } if (renderer.isInitialized()) { renderer.updateMeshes(world); @@ -1975,13 +1343,13 @@ void 
VoxelRenderPath::createRenderTargets() { aoDesc.mip_levels = 1; aoDesc.sample_count = 1; aoDesc.layout = wi::graphics::ResourceState::SHADER_RESOURCE; - device->CreateTexture(&aoDesc, nullptr, &renderer.aoRawTexture_); - device->CreateTexture(&aoDesc, nullptr, &renderer.aoBlurredTexture_); - device->CreateTexture(&aoDesc, nullptr, &renderer.aoHistoryTexture_); - renderer.aoHistoryValid_ = false; // no history on first frame + device->CreateTexture(&aoDesc, nullptr, &renderer.rt_.aoRawTexture); + device->CreateTexture(&aoDesc, nullptr, &renderer.rt_.aoBlurredTexture); + device->CreateTexture(&aoDesc, nullptr, &renderer.rt_.aoHistoryTexture); + renderer.rt_.aoHistoryValid = false; // no history on first frame rtCreated_ = voxelRT_.IsValid() && voxelNormalRT_.IsValid() && voxelDepth_.IsValid() - && renderer.aoRawTexture_.IsValid() && renderer.aoBlurredTexture_.IsValid(); + && renderer.rt_.aoRawTexture.IsValid() && renderer.rt_.aoBlurredTexture.IsValid(); wi::backlog::post("VoxelRenderPath: render targets " + std::string(rtCreated_ ? 
"OK" : "FAILED") + " (" + std::to_string(w) + "x" + std::to_string(h) + ")"); } @@ -1995,50 +1363,7 @@ static constexpr wi::input::BUTTON KEY_Q = (wi::input::BUTTON)(wi::input::CHARAC static constexpr wi::input::BUTTON KEY_S = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('S' - 'A')); static constexpr wi::input::BUTTON KEY_D = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('D' - 'A')); -void VoxelRenderPath::handleInput(float dt) { - // F2: toggle backlog console - if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F2)) { - wi::backlog::Toggle(); - } - // F3: toggle animated terrain - if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) { - animatedTerrain_ = !animatedTerrain_; - if (animatedTerrain_) { - // Save RT state and disable shadows during animation (stale BLAS = wrong shadows) - rtWasEnabled_ = renderer.rtShadowsEnabled_; - renderer.rtShadowsEnabled_ = false; - } else { - // Force full RT rebuild (including topings) when animation stops - renderer.rtDirty_ = true; - renderer.topingBLASDirty_ = true; - renderer.aoHistoryValid_ = false; // clear stale temporal AO from pre-animation - renderer.rtShadowsEnabled_ = rtWasEnabled_; - } - wi::backlog::post(animatedTerrain_ ? "Animation: ON (30 Hz)" : "Animation: OFF"); - } - // F4: toggle blend debug visualization - if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F4)) { - renderer.debugBlend_ = !renderer.debugBlend_; - wi::backlog::post(renderer.debugBlend_ ? 
"Blend debug: ON" : "Blend debug: OFF"); - } - if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F5)) { - // Cycle: OFF → ON → DBG_SHADOW → DBG_AO → OFF - if (!renderer.rtShadowsEnabled_) { - renderer.rtShadowsEnabled_ = true; - renderer.rtShadowDebug_ = 0; - wi::backlog::post("RT Shadows+AO: ON"); - } else if (renderer.rtShadowDebug_ == 0) { - renderer.rtShadowDebug_ = 1; - wi::backlog::post("RT Debug: SHADOWS (red=shadow, green=lit, blue=backface)"); - } else if (renderer.rtShadowDebug_ == 1) { - renderer.rtShadowDebug_ = 2; - wi::backlog::post("RT Debug: AO (white=open, black=occluded)"); - } else { - renderer.rtShadowsEnabled_ = false; - renderer.rtShadowDebug_ = 0; - wi::backlog::post("RT Shadows+AO: OFF"); - } - } +void CameraController::handleInput(float dt, wi::scene::CameraComponent* cam) { if (wi::input::Press(wi::input::MOUSE_BUTTON_RIGHT)) { mouseCaptured = !mouseCaptured; wi::input::HidePointer(mouseCaptured); @@ -2046,112 +1371,140 @@ void VoxelRenderPath::handleInput(float dt) { if (mouseCaptured) { auto mouseState = wi::input::GetMouseState(); - cameraYaw += mouseState.delta_position.x * cameraSensitivity; - cameraPitch += mouseState.delta_position.y * cameraSensitivity; - cameraPitch = std::clamp(cameraPitch, -1.5f, 1.5f); + yaw += mouseState.delta_position.x * sensitivity; + pitch += mouseState.delta_position.y * sensitivity; + pitch = std::clamp(pitch, -1.5f, 1.5f); } - float cosPitch = std::cos(cameraPitch); + float cosPitch = std::cos(pitch); XMFLOAT3 forward( - std::sin(cameraYaw) * cosPitch, - -std::sin(cameraPitch), - std::cos(cameraYaw) * cosPitch + std::sin(yaw) * cosPitch, + -std::sin(pitch), + std::cos(yaw) * cosPitch ); - XMFLOAT3 right(std::cos(cameraYaw), 0.0f, -std::sin(cameraYaw)); + XMFLOAT3 right(std::cos(yaw), 0.0f, -std::sin(yaw)); - float speed = cameraSpeed * dt; - if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f; + float spd = speed * dt; + if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) spd *= 
3.0f; - if (wi::input::Down(KEY_Z)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; } - if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; } - if (wi::input::Down(KEY_Q)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; } - if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; } - if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed; - if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed; + if (wi::input::Down(KEY_Z)) { pos.x += forward.x * spd; pos.y += forward.y * spd; pos.z += forward.z * spd; } + if (wi::input::Down(KEY_S)) { pos.x -= forward.x * spd; pos.y -= forward.y * spd; pos.z -= forward.z * spd; } + if (wi::input::Down(KEY_Q)) { pos.x -= right.x * spd; pos.z -= right.z * spd; } + if (wi::input::Down(KEY_D)) { pos.x += right.x * spd; pos.z += right.z * spd; } + if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) pos.y += spd; + if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) pos.y -= spd; - camera->Eye = cameraPos; - camera->At = forward; - camera->Up = XMFLOAT3(0, 1, 0); - camera->UpdateCamera(); + cam->Eye = pos; + cam->At = forward; + cam->Up = XMFLOAT3(0, 1, 0); + cam->UpdateCamera(); } void VoxelRenderPath::Update(float dt) { auto frameStart = std::chrono::high_resolution_clock::now(); - frameStartTime_ = frameStart; + prof_.frameStart = frameStart; // Measure GPU wait: time from last Compose() end to this Update() start - // This captures Present() GPU sync + OS scheduling - if (lastComposeEndValid_) { - float gpuWaitMs = std::chrono::duration(frameStart - lastComposeEnd_).count(); - if (gpuWaitMs > 0.01f) profGpuWait_.add(gpuWaitMs); + if (prof_.lastComposeEndValid) { + float gpuWaitMs = std::chrono::duration(frameStart - prof_.lastComposeEnd).count(); + if (gpuWaitMs > 0.01f) 
prof_.gpuWait.add(gpuWaitMs); } lastDt_ = dt; float instantFps = (dt > 0.0f) ? (1.0f / dt) : 0.0f; smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f; - if (camera) handleInput(dt); - windTime_ += dt; - renderer.windTime_ = windTime_; + + // F-key toggles (kept here: they need access to renderer + anim state) + if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F2)) { + wi::backlog::Toggle(); + } + if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) { + anim_.terrainAnimated = !anim_.terrainAnimated; + if (anim_.terrainAnimated) { + rtWasEnabled_ = renderer.rt_.isShadowsEnabled(); + renderer.rt_.setShadowsEnabled(false); + } else { + renderer.rt_.dirty = true; + renderer.rt_.topingBLASDirty = true; + renderer.rt_.aoHistoryValid = false; + renderer.rt_.setShadowsEnabled(rtWasEnabled_); + } + wi::backlog::post(anim_.terrainAnimated ? "Animation: ON (30 Hz)" : "Animation: OFF"); + } + if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F4)) { + renderer.debugBlend_ = !renderer.debugBlend_; + wi::backlog::post(renderer.debugBlend_ ? 
"Blend debug: ON" : "Blend debug: OFF"); + } + if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F5)) { + if (!renderer.rt_.isShadowsEnabled()) { + renderer.rt_.setShadowsEnabled(true); renderer.rt_.setShadowDebug(0); + wi::backlog::post("RT Shadows+AO: ON"); + } else if (renderer.rt_.getShadowDebug() == 0) { + renderer.rt_.setShadowDebug(1); + wi::backlog::post("RT Debug: SHADOWS (red=shadow, green=lit, blue=backface)"); + } else if (renderer.rt_.getShadowDebug() == 1) { + renderer.rt_.setShadowDebug(2); + wi::backlog::post("RT Debug: AO (white=open, black=occluded)"); + } else { + renderer.rt_.setShadowsEnabled(false); renderer.rt_.setShadowDebug(0); + wi::backlog::post("RT Shadows+AO: OFF"); + } + } + + if (camera) camera_.handleInput(dt, camera); + anim_.windTime += dt; + renderer.windTime_ = anim_.windTime; // Animated terrain: regenerate at 30 Hz with time-shifted noise - // Fused: regenerate + pack voxel data in the same parallel pass - if (animatedTerrain_ && renderer.isInitialized()) { - animAccum_ += dt; - if (animAccum_ >= ANIM_INTERVAL) { - animAccum_ -= ANIM_INTERVAL; - animTime_ += ANIM_INTERVAL; + if (anim_.tick(dt) && renderer.isInitialized()) { + // Prepare pack cache for fused regenerate+pack + const uint32_t wordsPerChunk = CHUNK_VOLUME / 2; + uint32_t totalWords = (uint32_t)world.chunkCount() * wordsPerChunk; + renderer.packedVoxelCache_.resize(totalWords); - // Prepare pack cache for fused regenerate+pack - const uint32_t wordsPerChunk = CHUNK_VOLUME / 2; - uint32_t totalWords = (uint32_t)world.chunkCount() * wordsPerChunk; - renderer.packedVoxelCache_.resize(totalWords); + auto t0 = std::chrono::high_resolution_clock::now(); + world.regenerateAnimated(anim_.time, + renderer.packedVoxelCache_.data(), totalWords); + auto t1 = std::chrono::high_resolution_clock::now(); + prof_.regenerate.add(std::chrono::duration(t1 - t0).count()); - auto t0 = std::chrono::high_resolution_clock::now(); - world.regenerateAnimated(animTime_, - 
renderer.packedVoxelCache_.data(), totalWords); - auto t1 = std::chrono::high_resolution_clock::now(); - profRegenerate_.add(std::chrono::duration(t1 - t0).count()); + renderer.voxelCacheDirty_ = false; + renderer.gpuMeshDirty_ = true; + renderer.rt_.aoHistoryValid = false; - renderer.voxelCacheDirty_ = false; // cache already filled by fused pack - renderer.gpuMeshDirty_ = true; // GPU still needs upload + dispatch - renderer.aoHistoryValid_ = false; // invalidate temporal AO (geometry changed) - - // Re-mesh smooth surfaces — GPU path or CPU fallback - if (renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) { - renderer.gpuSmoothMeshDirty_ = true; // will dispatch in Render() - } else { - // CPU fallback (Surface Nets) — parallelized - auto ts0 = std::chrono::high_resolution_clock::now(); - std::vector chunkPtrs; - world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { - chunkPtrs.push_back(&chunk); + // Re-mesh smooth surfaces — GPU path or CPU fallback + if (renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) { + renderer.gpuSmoothMeshDirty_ = true; + } else { + auto ts0 = std::chrono::high_resolution_clock::now(); + std::vector chunkPtrs; + world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { + chunkPtrs.push_back(&chunk); + }); + const VoxelWorld& worldRef = world; + wi::jobsystem::context ctx; + wi::jobsystem::Dispatch(ctx, (uint32_t)chunkPtrs.size(), 1, + [&chunkPtrs, &worldRef](wi::jobsystem::JobArgs args) { + uint32_t idx = args.jobIndex; + SmoothMesher::meshChunk(*chunkPtrs[idx], worldRef); + for (auto& sv : chunkPtrs[idx]->smoothVertices) + sv.chunkIndex = (uint16_t)idx; }); - const VoxelWorld& worldRef = world; - wi::jobsystem::context ctx; - wi::jobsystem::Dispatch(ctx, (uint32_t)chunkPtrs.size(), 1, - [&chunkPtrs, &worldRef](wi::jobsystem::JobArgs args) { - uint32_t idx = args.jobIndex; - SmoothMesher::meshChunk(*chunkPtrs[idx], worldRef); - // Stamp chunkIndex during parallel 
pass (avoids sequential loop in upload) - for (auto& sv : chunkPtrs[idx]->smoothVertices) - sv.chunkIndex = (uint16_t)idx; - }); - wi::jobsystem::Wait(ctx); - auto ts1 = std::chrono::high_resolution_clock::now(); - profSmoothMesh_.add(std::chrono::duration(ts1 - ts0).count()); - renderer.uploadSmoothDataFast(world); - auto ts2 = std::chrono::high_resolution_clock::now(); - profSmoothUpload_.add(std::chrono::duration(ts2 - ts1).count()); - } + wi::jobsystem::Wait(ctx); + auto ts1 = std::chrono::high_resolution_clock::now(); + prof_.smoothMesh.add(std::chrono::duration(ts1 - ts0).count()); + renderer.uploadSmoothDataFast(world); + auto ts2 = std::chrono::high_resolution_clock::now(); + prof_.smoothUpload.add(std::chrono::duration(ts2 - ts1).count()); + } - // Re-collect toping instances — parallelized - { - auto tt0 = std::chrono::high_resolution_clock::now(); - topingSystem.collectInstancesParallel(world); - auto tt1 = std::chrono::high_resolution_clock::now(); - profTopingCollect_.add(std::chrono::duration(tt1 - tt0).count()); - renderer.uploadTopingData(topingSystem); - auto tt2 = std::chrono::high_resolution_clock::now(); - profTopingUpload_.add(std::chrono::duration(tt2 - tt1).count()); - } + // Re-collect toping instances — parallelized + { + auto tt0 = std::chrono::high_resolution_clock::now(); + topingSystem.collectInstancesParallel(world); + auto tt1 = std::chrono::high_resolution_clock::now(); + prof_.topingCollect.add(std::chrono::duration(tt1 - tt0).count()); + renderer.uploadTopingData(topingSystem); + auto tt2 = std::chrono::high_resolution_clock::now(); + prof_.topingUpload.add(std::chrono::duration(tt2 - tt1).count()); } } @@ -2159,19 +1512,18 @@ void VoxelRenderPath::Update(float dt) { auto t0 = std::chrono::high_resolution_clock::now(); renderer.updateMeshes(world); auto t1 = std::chrono::high_resolution_clock::now(); - profUpdateMeshes_.add(std::chrono::duration(t1 - t0).count()); + prof_.updateMeshes.add(std::chrono::duration(t1 - t0).count()); } 
RenderPath3D::Update(dt); - // Profiling: accumulate frame time (will be completed in Compose) auto frameEnd = std::chrono::high_resolution_clock::now(); - profFrame_.add(std::chrono::duration(frameEnd - frameStart).count()); + prof_.frame.add(std::chrono::duration(frameEnd - frameStart).count()); - // Log averages every 5 seconds - profTimer_ += dt; - if (profTimer_ >= PROF_INTERVAL) { - logProfilingAverages(); - profTimer_ -= PROF_INTERVAL; + prof_.timer += dt; + if (prof_.timer >= VoxelProfiler::INTERVAL) { + prof_.log(renderer); + prof_.resetAll(); + prof_.timer -= VoxelProfiler::INTERVAL; } } @@ -2179,7 +1531,7 @@ void VoxelRenderPath::Render() const { auto tWicked0 = std::chrono::high_resolution_clock::now(); RenderPath3D::Render(); auto tWicked1 = std::chrono::high_resolution_clock::now(); - profWickedRender_.add(std::chrono::duration(tWicked1 - tWicked0).count()); + prof_.wickedRender.add(std::chrono::duration(tWicked1 - tWicked0).count()); if (renderer.isInitialized() && camera && rtCreated_) { auto* device = wi::graphics::GetDevice(); @@ -2217,9 +1569,9 @@ void VoxelRenderPath::Render() const { device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_BEGIN, cmd); auto t0 = std::chrono::high_resolution_clock::now(); renderer.dispatchGpuMesh(cmd, world, - &profVoxelPack_, &profGpuUpload_, &profGpuDispatch_); + &prof_.voxelPack, &prof_.gpuUpload, &prof_.gpuDispatch); auto t1 = std::chrono::high_resolution_clock::now(); - profGpuMeshDispatch_.add(std::chrono::duration(t1 - t0).count()); + prof_.gpuMeshDispatch.add(std::chrono::duration(t1 - t0).count()); device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_END, cmd); } @@ -2241,71 +1593,56 @@ void VoxelRenderPath::Render() const { } // ── Deferred GPU uploads BEFORE compute dispatches that read them ── - // topingInstanceBuffer_ must be filled before dispatchTopingBLASExtract reads it (t5) - if (renderer.topingInstanceDirty_ && renderer.topingInstanceBuffer_.IsValid() && - 
!renderer.topingGpuInsts_.empty()) { - size_t uploadSize = renderer.topingGpuInsts_.size() * sizeof(VoxelRenderer::TopingGPUInst); - size_t bufferSize = renderer.topingInstanceCapacity_ * sizeof(VoxelRenderer::TopingGPUInst); - if (uploadSize <= bufferSize) { - device->UpdateBuffer(&renderer.topingInstanceBuffer_, - renderer.topingGpuInsts_.data(), cmd, uploadSize); - } - renderer.topingInstanceDirty_ = false; - } - if (renderer.smoothVertexDirty_ && renderer.smoothVertexBuffer_.IsValid() && - renderer.smoothVertexCount_ > 0 && - renderer.smoothVertexCount_ <= renderer.smoothStagingVerts_.size()) { - size_t uploadSize = renderer.smoothVertexCount_ * sizeof(SmoothVertex); - size_t bufferSize = renderer.smoothVertexCapacity_ * sizeof(SmoothVertex); - if (uploadSize <= bufferSize) { - device->UpdateBuffer(&renderer.smoothVertexBuffer_, - renderer.smoothStagingVerts_.data(), cmd, uploadSize); - } - renderer.smoothVertexDirty_ = false; - } + // topingInstanceBuf_ must be filled before dispatchTopingBLASExtract reads it (t5) + renderer.topingInstanceBuf_.upload(device, cmd, + renderer.topingGpuInsts_.data(), (uint32_t)renderer.topingGpuInsts_.size()); + renderer.smoothVertexBuf_.upload(device, cmd, + renderer.smoothStagingVerts_.data(), renderer.smoothVertexCount_); // ── GPU compute toping BLAS extraction ── // Skip during animation (toping BLAS is skipped to save ~130ms GPU) - if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid() && !animatedTerrain_) { + auto& rt = renderer.rt_; + if (rt.topingBLASDirty && !anim_.terrainAnimated) { device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, cmd); - renderer.dispatchTopingBLASExtract(cmd); + rt.dispatchTopingBLASExtract(cmd, + renderer.topingVertexBuffer_, renderer.topingInstanceBuf_.gpu, + renderer.topingBLASGroupsGPU_.data(), + renderer.topingBLASGroupsGPU_.size() * sizeof(VoxelRenderer::TopingBLASGroupGPU), + (uint32_t)renderer.topingBLASGroupsGPU_.size(), + 
renderer.topingBLASTotalVertices_); device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_END, cmd); } // Phase 6.1: BLAS extraction + acceleration structure build - // During animation, stagger builds to avoid 200ms+ GPU spikes: - // - Skip toping BLAS entirely (7.7M tris = ~130ms, decorative only) - // - Alternate blocky/smooth BLAS builds across animation frames - // When not animating, rebuild all immediately. { - // Detect if new BLAS instances became available since last TLAS creation. - // Without this, the TLAS stays at 1-2 instances and never includes - // late-arriving smooth/toping BLASes (due to 1-frame readback delay). uint32_t potentialInstances = 0; if (renderer.gpuMeshQuadCount_ > 0) potentialInstances++; if (renderer.gpuSmoothVertexCount_ >= 3) potentialInstances++; - if (renderer.rtTopingVertexCount_ >= 3) potentialInstances++; - bool tlasNeedsMoreInstances = potentialInstances > renderer.tlasInstanceCount_; + if (rt.getTopingVertexCount() >= 3) potentialInstances++; + bool tlasNeedsMoreInstances = potentialInstances > rt.getTlasInstanceCount(); - bool needsBuild = renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() && + bool needsBuild = rt.isAvailable() && renderer.gpuMeshQuadCount_ > 0 && - (renderer.rtDirty_ || - renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6 || + (rt.dirty || + renderer.gpuMeshQuadCount_ != rt.getBlockyTriCount() * 3 / 6 || tlasNeedsMoreInstances); if (needsBuild) { device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_BEGIN, cmd); - renderer.dispatchBLASExtract(cmd); + rt.dispatchBLASExtract(cmd, renderer.gpuQuadBuffer_, + renderer.chunkInfoBuffer_, renderer.gpuMeshQuadCount_); - if (animatedTerrain_) { - // Stagger: alternate blocky/smooth each animation frame, skip topings + bool useGpuSmooth = renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid(); + const auto& smoothVB = useGpuSmooth ? 
renderer.gpuSmoothVertexBuffer_ : renderer.smoothVertexBuf_.gpu; + + if (anim_.terrainAnimated) { uint32_t flags = (rtBuildSkipCounter_ & 1) - ? VoxelRenderer::RT_BUILD_BLOCKY - : VoxelRenderer::RT_BUILD_SMOOTH; + ? VoxelRTManager::BUILD_BLOCKY + : VoxelRTManager::BUILD_SMOOTH; rtBuildSkipCounter_++; - renderer.buildAccelerationStructures(cmd, flags); + rt.buildAccelerationStructures(cmd, flags, smoothVB, renderer.gpuSmoothVertexCount_); } else { - renderer.buildAccelerationStructures(cmd, VoxelRenderer::RT_BUILD_ALL); + rt.buildAccelerationStructures(cmd, VoxelRTManager::BUILD_ALL, smoothVB, renderer.gpuSmoothVertexCount_); } device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_END, cmd); @@ -2323,7 +1660,8 @@ void VoxelRenderPath::Render() const { // Phase 6.2: RT Shadows + AO if (renderer.isRTShadowsEnabled() && renderer.isRTReady()) { device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_BEGIN, cmd); - renderer.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_); + renderer.rt_.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_, + renderer.constantBuffer_); device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_END, cmd); } @@ -2333,7 +1671,7 @@ void VoxelRenderPath::Render() const { } } -void VoxelRenderPath::logProfilingAverages() const { +void VoxelProfiler::log(const VoxelRenderer& r) const { char msg[2048]; snprintf(msg, sizeof(msg), "=== PERF PROFILE (avg over %.0fs) ===\n" @@ -2370,62 +1708,52 @@ void VoxelRenderPath::logProfilingAverages() const { " CPU Frame: %7.2f ms (Update→Compose start)\n" " True Frame: %7.2f ms (Update→Compose end)\n" " Wall FPS: %7.1f (%u frames in %.0fs)", - PROF_INTERVAL, - profRegenerate_.avg(), profRegenerate_.count, - profUpdateMeshes_.avg(), profUpdateMeshes_.count, - profTopingCollect_.avg(), profTopingCollect_.count, - profTopingUpload_.avg(), profTopingUpload_.count, - profSmoothMesh_.avg(), profSmoothMesh_.count, - profSmoothUpload_.avg(), 
profSmoothUpload_.count, - profFrame_.avg(), profFrame_.count, - profVoxelPack_.avg(), profVoxelPack_.count, - profGpuUpload_.avg(), profGpuUpload_.count, - profGpuDispatch_.avg(), profGpuDispatch_.count, - profGpuMeshDispatch_.avg(), profGpuMeshDispatch_.count, - profGpuSmoothDispatch_.avg(), profGpuSmoothDispatch_.count, - profBLASExtract_.avg(), profBLASExtract_.count, - profBLASBuild_.avg(), profBLASBuild_.count, - profDeferredUpload_.avg(), profDeferredUpload_.count, - profRender_.avg(), profRender_.count, - profRTShadows_.avg(), profRTShadows_.count, - renderer.gpuMeshTimeMs_, - renderer.gpuSmoothMeshTimeMs_, - renderer.gpuBLASExtractTimeMs_, - renderer.gpuBLASBuildTimeMs_, - renderer.gpuDrawTimeMs_, - renderer.gpuRTShadowsTimeMs_, - renderer.gpuMeshTimeMs_ + renderer.gpuSmoothMeshTimeMs_ + - renderer.gpuBLASExtractTimeMs_ + renderer.gpuBLASBuildTimeMs_ + - renderer.gpuDrawTimeMs_ + renderer.gpuRTShadowsTimeMs_, - profWickedRender_.avg(), profWickedRender_.count, - profGpuWait_.avg(), profGpuWait_.count, - profFullFrame_.avg(), - profTrueFrame_.avg(), - profTrueFrame_.count > 0 ? 
(1000.0f / profTrueFrame_.avg()) : 0.0f, - profTrueFrame_.count, PROF_INTERVAL); + INTERVAL, + regenerate.avg(), regenerate.count, + updateMeshes.avg(), updateMeshes.count, + topingCollect.avg(), topingCollect.count, + topingUpload.avg(), topingUpload.count, + smoothMesh.avg(), smoothMesh.count, + smoothUpload.avg(), smoothUpload.count, + frame.avg(), frame.count, + voxelPack.avg(), voxelPack.count, + gpuUpload.avg(), gpuUpload.count, + gpuDispatch.avg(), gpuDispatch.count, + gpuMeshDispatch.avg(), gpuMeshDispatch.count, + gpuSmoothDispatch.avg(), gpuSmoothDispatch.count, + blasExtract.avg(), blasExtract.count, + blasBuild.avg(), blasBuild.count, + deferredUpload.avg(), deferredUpload.count, + render.avg(), render.count, + rtShadows.avg(), rtShadows.count, + r.getGpuMeshTimeMs(), + r.getGpuSmoothMeshTimeMs(), + r.getGpuBLASExtractTimeMs(), + r.getGpuBLASBuildTimeMs(), + r.getGpuDrawTimeMs(), + r.getGpuRTShadowsTimeMs(), + r.getGpuMeshTimeMs() + r.getGpuSmoothMeshTimeMs() + + r.getGpuBLASExtractTimeMs() + r.getGpuBLASBuildTimeMs() + + r.getGpuDrawTimeMs() + r.getGpuRTShadowsTimeMs(), + wickedRender.avg(), wickedRender.count, + gpuWait.avg(), gpuWait.count, + fullFrame.avg(), + trueFrame.avg(), + trueFrame.count > 0 ? 
(1000.0f / trueFrame.avg()) : 0.0f, + trueFrame.count, INTERVAL); wi::backlog::post(msg); +} - profRegenerate_.reset(); - profUpdateMeshes_.reset(); - profVoxelPack_.reset(); - profGpuUpload_.reset(); - profGpuDispatch_.reset(); - profGpuMeshDispatch_.reset(); - profGpuSmoothDispatch_.reset(); - profSmoothMesh_.reset(); - profSmoothUpload_.reset(); - profTopingCollect_.reset(); - profTopingUpload_.reset(); - profBLASExtract_.reset(); - profBLASBuild_.reset(); - profDeferredUpload_.reset(); - profRender_.reset(); - profRTShadows_.reset(); - profFrame_.reset(); - profFullFrame_.reset(); - profGpuWait_.reset(); - profWickedRender_.reset(); - profTrueFrame_.reset(); +void VoxelProfiler::resetAll() { + regenerate.reset(); updateMeshes.reset(); + voxelPack.reset(); gpuUpload.reset(); gpuDispatch.reset(); + gpuMeshDispatch.reset(); gpuSmoothDispatch.reset(); + smoothMesh.reset(); smoothUpload.reset(); + topingCollect.reset(); topingUpload.reset(); + blasExtract.reset(); blasBuild.reset(); + deferredUpload.reset(); render.reset(); rtShadows.reset(); + frame.reset(); fullFrame.reset(); + gpuWait.reset(); wickedRender.reset(); trueFrame.reset(); } void VoxelRenderPath::Compose(CommandList cmd) const { @@ -2433,8 +1761,8 @@ void VoxelRenderPath::Compose(CommandList cmd) const { // Measure full frame time (Update + Render + Compose start) auto composeStart = std::chrono::high_resolution_clock::now(); - float fullFrameMs = std::chrono::duration(composeStart - frameStartTime_).count(); - if (fullFrameMs > 0.1f) profFullFrame_.add(fullFrameMs); + float fullFrameMs = std::chrono::duration(composeStart - prof_.frameStart).count(); + if (fullFrameMs > 0.1f) prof_.fullFrame.add(fullFrameMs); RenderPath3D::Compose(cmd); @@ -2494,7 +1822,7 @@ void VoxelRenderPath::Compose(CommandList cmd) const { + std::to_string(renderer.getRTBlockyTriCount()) + " tris | Smooth " + std::to_string(renderer.getRTSmoothTriCount()) + " tris | Topings " + std::to_string(renderer.getRTTopingTriCount()) + " 
tris" - + " | Shadows+AO " + std::string(renderer.rtShadowDebug_ == 1 ? "DBG_SHD" : (renderer.rtShadowDebug_ == 2 ? "DBG_AO" : (renderer.isRTShadowsEnabled() ? "ON" : "OFF"))) + "\n"; + + " | Shadows+AO " + std::string(renderer.rt_.getShadowDebug() == 1 ? "DBG_SHD" : (renderer.rt_.getShadowDebug() == 2 ? "DBG_AO" : (renderer.isRTShadowsEnabled() ? "ON" : "OFF"))) + "\n"; } else { stats += "RT: building...\n"; } @@ -2502,29 +1830,23 @@ void VoxelRenderPath::Compose(CommandList cmd) const { stats += "RT: not available\n"; } stats += "WASD+Space/Ctrl: move | Shift: fast | Right-click: capture mouse\n"; - stats += "F2: console | F3: anim [" + std::string(animatedTerrain_ ? "ON" : "OFF") + stats += "F2: console | F3: anim [" + std::string(anim_.terrainAnimated ? "ON" : "OFF") + "] | F4: dbg [" + std::string(renderer.debugBlend_ ? "ON" : "OFF") - + "] | F5: shd+ao [" + std::string(renderer.rtShadowDebug_ == 1 ? "SHD" : (renderer.rtShadowDebug_ == 2 ? "AO" : (renderer.isRTShadowsEnabled() ? "ON" : "OFF"))) + "]"; + + "] | F5: shd+ao [" + std::string(renderer.rt_.getShadowDebug() == 1 ? "SHD" : (renderer.rt_.getShadowDebug() == 2 ? "AO" : (renderer.isRTShadowsEnabled() ? 
"ON" : "OFF"))) + "]"; wi::font::Draw(stats, fp, cmd); // Save compose end time for GPU wait measurement - lastComposeEnd_ = std::chrono::high_resolution_clock::now(); - lastComposeEndValid_ = true; + prof_.lastComposeEnd = std::chrono::high_resolution_clock::now(); + prof_.lastComposeEndValid = true; // True frame-to-frame time - float trueFrameMs = std::chrono::duration(lastComposeEnd_ - frameStartTime_).count(); - if (trueFrameMs > 0.1f) profTrueFrame_.add(trueFrameMs); -} - -void VoxelRenderPath::setCamera(float x, float y, float z, float pitch, float yaw) { - cameraPos = { x, y, z }; - cameraPitch = pitch; - cameraYaw = yaw; + float trueFrameMs = std::chrono::duration(prof_.lastComposeEnd - prof_.frameStart).count(); + if (trueFrameMs > 0.1f) prof_.trueFrame.add(trueFrameMs); } void VoxelRenderPath::resetAOHistory() { - renderer.aoHistoryValid_ = false; - renderer.frameCounter_ = 0; + renderer.rt_.aoHistoryValid = false; + renderer.rt_.frameCounter = 0; } } // namespace voxel diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h index 5a6f300..263ca23 100644 --- a/src/voxel/VoxelRenderer.h +++ b/src/voxel/VoxelRenderer.h @@ -2,6 +2,8 @@ #include "VoxelWorld.h" #include "VoxelMesher.h" #include "TopingSystem.h" +#include "DeferredGPUBuffer.h" +#include "VoxelRTManager.h" #include "WickedEngine.h" namespace voxel { @@ -77,9 +79,7 @@ private: wi::graphics::Shader topingPS_; wi::graphics::PipelineState topingPso_; wi::graphics::GPUBuffer topingVertexBuffer_; // StructuredBuffer, SRV t4 - wi::graphics::GPUBuffer topingInstanceBuffer_; // StructuredBuffer, SRV t5 - mutable uint32_t topingInstanceCapacity_ = 0; // pre-allocated capacity (avoid per-frame CreateBuffer) - mutable bool topingInstanceDirty_ = false; // deferred upload via UpdateBuffer in Render() + DeferredGPUBuffer topingInstanceBuf_; // StructuredBuffer, SRV t5 static constexpr uint32_t MAX_TOPING_INSTANCES = 256 * 1024; // 256K instances max // Persistent staging buffers for toping upload 
(avoids per-frame allocations) struct TopingSortedInst { float wx, wy, wz; uint16_t type, variant; }; @@ -96,8 +96,7 @@ private: }; std::vector topingDrawGroups_; // built in uploadTopingData, reused in renderTopings - // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ── - wi::graphics::Shader topingBLASShader_; // voxelTopingBLASCS compute shader + // ── Toping BLAS group staging (passed to VoxelRTManager) ────── struct TopingBLASGroupGPU { uint32_t globalVertexOffset; // prefix sum of total vertices before this group uint32_t vertexTemplateOffset; // offset into topingVertices (t4) @@ -105,24 +104,19 @@ private: uint32_t instanceOffset; // offset into topingInstances (t5) uint32_t instanceCount; // instances in this group }; - wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer, SRV t7 std::vector topingBLASGroupsGPU_; // CPU staging for group table mutable uint32_t topingBLASTotalVertices_ = 0; - static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64; - void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const; // Shaders & Pipeline (smooth surfaces, Phase 5) wi::graphics::Shader smoothVS_; wi::graphics::Shader smoothPS_; wi::graphics::RasterizerState smoothRasterizer_; wi::graphics::PipelineState smoothPso_; - wi::graphics::GPUBuffer smoothVertexBuffer_; // StructuredBuffer, SRV t6 - mutable uint32_t smoothVertexCapacity_ = 0; // pre-allocated capacity (avoid per-frame CreateBuffer) + DeferredGPUBuffer smoothVertexBuf_; // StructuredBuffer, SRV t6 std::vector smoothStagingVerts_; // persistent staging buffer (avoids per-frame alloc) static constexpr uint32_t MAX_SMOOTH_VERTICES = 4 * 1024 * 1024; // 4M vertices max mutable uint32_t smoothVertexCount_ = 0; mutable uint32_t smoothDrawCalls_ = 0; - mutable bool smoothVertexDirty_ = false; // deferred upload via UpdateBuffer in Render() bool smoothDirty_ = true; // Texture array for materials (256x256, 5 layers for prototype) @@ -201,58 +195,9 @@ private: mutable uint32_t 
gpuSmoothVertexCount_ = 0; // readback from previous frame mutable bool gpuSmoothMeshDirty_ = true; - // ── Ray Tracing (Phase 6.1) ───────────────────────────────────── - wi::graphics::Shader blasExtractShader_; // voxelBLASExtractCS compute shader - mutable wi::graphics::GPUBuffer blasPositionBuffer_; // float3[] for blocky BLAS (6 verts per quad) - wi::graphics::GPUBuffer blasIndexBuffer_; // sequential uint32 indices [0,1,2,...] for BLAS - mutable wi::graphics::RaytracingAccelerationStructure blockyBLAS_; - mutable wi::graphics::RaytracingAccelerationStructure smoothBLAS_; - mutable wi::graphics::RaytracingAccelerationStructure topingBLAS_; - mutable wi::graphics::RaytracingAccelerationStructure tlas_; - mutable wi::graphics::GPUBuffer topingBLASPositionBuffer_; // float3[] world-space toping positions - mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_; // sequential indices for toping BLAS - mutable uint32_t topingBLASPositionCapacity_ = 0; // pre-allocated capacity (vertices) - mutable uint32_t topingBLASIndexCount_ = 0; // size of toping index buffer - mutable bool topingBLASDirty_ = false; // GPU compute BLAS extract + rebuild needed - mutable uint32_t topingBLASVertexCount_ = 0; // actual vertex count for current frame + // ── Ray Tracing (Phase 6) ──────────────────────────────────────── static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad - mutable bool rtAvailable_ = false; // GPU supports RT - mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild - mutable uint32_t rtBlockyVertexCount_ = 0; // current blocky BLAS vertex count - mutable uint32_t rtSmoothVertexCount_ = 0; // current smooth BLAS vertex count - mutable uint32_t rtTopingVertexCount_ = 0; // current toping BLAS vertex count - // BLAS capacity tracking: only recreate AS when vertex count exceeds capacity - mutable uint32_t blockyBLASCapacity_ = 0; // vertex count at BLAS creation - mutable uint32_t smoothBLASCapacity_ = 0; - mutable uint32_t 
topingBLASASCapacity_ = 0; // separate from topingBLASPositionCapacity_ (buffer capacity) - mutable uint32_t tlasInstanceCount_ = 0; // track TLAS instance count to avoid per-frame recreation - - void dispatchBLASExtract(wi::graphics::CommandList cmd) const; - // Flags for selective BLAS rebuild - static constexpr uint32_t RT_BUILD_BLOCKY = 1 << 0; - static constexpr uint32_t RT_BUILD_SMOOTH = 1 << 1; - static constexpr uint32_t RT_BUILD_TOPING = 1 << 2; - static constexpr uint32_t RT_BUILD_ALL = RT_BUILD_BLOCKY | RT_BUILD_SMOOTH | RT_BUILD_TOPING; - void buildAccelerationStructures(wi::graphics::CommandList cmd, - uint32_t buildFlags = RT_BUILD_ALL) const; - - // ── RT Shadows + AO (Phase 6.2 + 6.3) ────────────────────────── - wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader - wi::graphics::Shader aoBlurShader_; // voxelAOBlurCS compute shader - wi::graphics::Shader aoApplyShader_; // voxelAOApplyCS compute shader - mutable wi::graphics::Texture aoRawTexture_; // R8_UNORM: raw AO from shadow CS - mutable wi::graphics::Texture aoBlurredTexture_; // R8_UNORM: after bilateral blur - mutable wi::graphics::Texture aoHistoryTexture_; // R8_UNORM: previous frame's temporally accumulated AO - mutable XMFLOAT4X4 prevViewProjection_; // previous frame's VP matrix - mutable uint32_t frameCounter_ = 0; - mutable bool aoHistoryValid_ = false; - mutable bool rtShadowsEnabled_ = false; // true when shader + TLAS ready - mutable uint32_t rtShadowDebug_ = 0; // 0=off, 1=debug shadows, 2=debug AO - - void dispatchShadows(wi::graphics::CommandList cmd, - const wi::graphics::Texture& depthBuffer, - const wi::graphics::Texture& renderTarget, - const wi::graphics::Texture& normalTarget) const; + mutable VoxelRTManager rt_; void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world, ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr, @@ -298,9 +243,9 @@ public: float getGpuBLASExtractTimeMs() const { return gpuBLASExtractTimeMs_; } 
float getGpuBLASBuildTimeMs() const { return gpuBLASBuildTimeMs_; } float getGpuRTShadowsTimeMs() const { return gpuRTShadowsTimeMs_; } - void toggleRTShadows() { rtShadowsEnabled_ = !rtShadowsEnabled_; } bool isGpuMeshEnabled() const { return gpuMesherAvailable_; } uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; } + VoxelRTManager& rt() const { return rt_; } // Phase 4: Toping rendering void uploadTopingData(const TopingSystem& topingSystem); @@ -325,14 +270,90 @@ public: uint32_t getSmoothVertexCount() const { return (smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid()) ? gpuSmoothVertexCount_ : smoothVertexCount_; } uint32_t getSmoothDrawCalls() const { return smoothDrawCalls_; } - // Phase 6: Ray Tracing - bool isRTAvailable() const { return rtAvailable_; } - bool isRTReady() const { return rtAvailable_ && tlas_.IsValid(); } - bool isRTShadowsEnabled() const { return rtShadowsEnabled_; } - uint32_t getRTBlockyTriCount() const { return rtBlockyVertexCount_ / 3; } - uint32_t getRTSmoothTriCount() const { return rtSmoothVertexCount_ / 3; } - uint32_t getRTTopingTriCount() const { return rtTopingVertexCount_ / 3; } - const wi::graphics::RaytracingAccelerationStructure& getTLAS() const { return tlas_; } + // Phase 6: Ray Tracing (delegated to VoxelRTManager) + bool isRTAvailable() const { return rt_.isAvailable(); } + bool isRTReady() const { return rt_.isReady(); } + bool isRTShadowsEnabled() const { return rt_.isShadowsEnabled(); } + uint32_t getRTBlockyTriCount() const { return rt_.getBlockyTriCount(); } + uint32_t getRTSmoothTriCount() const { return rt_.getSmoothTriCount(); } + uint32_t getRTTopingTriCount() const { return rt_.getTopingTriCount(); } + const wi::graphics::RaytracingAccelerationStructure& getTLAS() const { return rt_.getTLAS(); } +}; + +// ── Camera Controller ──────────────────────────────────────────── +struct CameraController { + float speed = 50.0f; + float sensitivity = 0.003f; + XMFLOAT3 pos = { 256.0f, 100.0f, 
256.0f }; + float pitch = -0.3f; + float yaw = 0.0f; + bool mouseCaptured = false; + + void set(float x, float y, float z, float p, float yw) { + pos = { x, y, z }; pitch = p; yaw = yw; + } + void handleInput(float dt, wi::scene::CameraComponent* camera); +}; + +// ── Animation State ───────────────────────────────────────────── +struct AnimationState { + float windTime = 0.0f; // continuous, always running + bool terrainAnimated = false; // toggled with F3 + float time = 0.0f; // current animation time offset + float accum = 0.0f; // accumulator for 30 Hz timer + static constexpr float INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz + + // Returns true when an animation tick should fire (call every frame). + bool tick(float dt) { + windTime += dt; + if (!terrainAnimated) return false; + accum += dt; + if (accum < INTERVAL) return false; + accum -= INTERVAL; + time += INTERVAL; + return true; + } +}; + +// ── CPU Profiling (averages every INTERVAL seconds) ───────────── +struct VoxelProfiler { + static constexpr float INTERVAL = 5.0f; + + // Update() phase + ProfileAccum regenerate; // regenerateAnimated + ProfileAccum updateMeshes; // updateMeshes (rebuildChunkInfoOnly) + ProfileAccum topingCollect; // topingSystem.collectInstances + ProfileAccum topingUpload; // uploadTopingData + ProfileAccum smoothMesh; // SmoothMesher::meshChunk (all chunks) + ProfileAccum smoothUpload; // uploadSmoothData + ProfileAccum frame; // full frame (Update only - legacy) + + // Render() phase + ProfileAccum voxelPack; // voxel data packing in dispatchGpuMesh + ProfileAccum gpuUpload; // GPU upload in dispatchGpuMesh + ProfileAccum gpuDispatch; // compute dispatches in dispatchGpuMesh + ProfileAccum gpuMeshDispatch; // GPU mesh compute dispatch (in Render) + ProfileAccum gpuSmoothDispatch; // GPU smooth mesh dispatch (in Render) + ProfileAccum blasExtract; // BLAS position extraction compute + ProfileAccum blasBuild; // BLAS/TLAS build + ProfileAccum deferredUpload; // deferred GPU buffer 
uploads + ProfileAccum render; // render() draw calls + ProfileAccum rtShadows; // RT shadows + AO dispatch + + // Totals + ProfileAccum fullFrame; // true full frame (Update + Render + Compose) + ProfileAccum gpuWait; // GPU sync: time between Compose end and next Update start + ProfileAccum wickedRender; // RenderPath3D::Render() (Wicked internal) + ProfileAccum trueFrame; // wall-clock frame-to-frame time + + // Timing helpers + std::chrono::high_resolution_clock::time_point frameStart; + std::chrono::high_resolution_clock::time_point lastComposeEnd; + bool lastComposeEndValid = false; + float timer = 0.0f; + + void log(const VoxelRenderer& renderer) const; + void resetAll(); }; // ── Custom RenderPath that integrates voxel rendering ─────────── @@ -345,15 +366,14 @@ public: bool debugMode = false; bool debugSmooth = false; bool screenshotMode = false; // CLI "screenshot": auto-position camera, capture, quit - void setCamera(float x, float y, float z, float pitch, float yaw); + void setCamera(float x, float y, float z, float pitch, float yaw) { + camera_.set(x, y, z, pitch, yaw); + } void resetAOHistory(); // invalidate temporal AO after camera jump - float cameraSpeed = 50.0f; - float cameraSensitivity = 0.003f; - XMFLOAT3 cameraPos = { 256.0f, 100.0f, 256.0f }; - float cameraPitch = -0.3f; - float cameraYaw = 0.0f; - bool mouseCaptured = false; + CameraController camera_; + AnimationState anim_; + mutable VoxelProfiler prof_; const wi::graphics::Texture& getVoxelRT() const { return voxelRT_; } @@ -363,57 +383,19 @@ public: void Compose(wi::graphics::CommandList cmd) const override; private: - void handleInput(float dt); void createRenderTargets(); mutable bool worldGenerated_ = false; mutable int frameCount_ = 0; mutable float lastDt_ = 0.016f; mutable float smoothFps_ = 60.0f; - // Wind animation (continuous, always running) - float windTime_ = 0.0f; - - // Animated terrain (wave effect at 30 Hz, toggled with F3) - bool animatedTerrain_ = false; - float 
animTime_ = 0.0f; - float animAccum_ = 0.0f; - static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz - wi::graphics::Texture voxelRT_; wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO wi::graphics::Texture voxelDepth_; mutable bool rtCreated_ = false; - // ── CPU Profiling (averages every 5 seconds) ───────────────── - mutable ProfileAccum profRegenerate_; // regenerateAnimated - mutable ProfileAccum profUpdateMeshes_; // updateMeshes (rebuildChunkInfoOnly or CPU mesh) - mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh - mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh - mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh - mutable ProfileAccum profRender_; // render() draw calls - mutable ProfileAccum profFrame_; // full frame (Update only - legacy) - mutable ProfileAccum profFullFrame_; // true full frame (Update + Render + Compose) - mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks) - mutable ProfileAccum profSmoothUpload_; // uploadSmoothData - mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances - mutable ProfileAccum profTopingUpload_; // uploadTopingData - mutable ProfileAccum profGpuMeshDispatch_; // GPU mesh compute dispatch (in Render) - mutable ProfileAccum profGpuSmoothDispatch_; // GPU smooth mesh dispatch (in Render) - mutable ProfileAccum profBLASExtract_; // BLAS position extraction compute - mutable ProfileAccum profBLASBuild_; // BLAS/TLAS build - mutable ProfileAccum profDeferredUpload_; // deferred GPU buffer uploads - mutable ProfileAccum profRTShadows_; // RT shadows + AO dispatch - mutable ProfileAccum profGpuWait_; // GPU sync: time between Compose end and next Update start - mutable ProfileAccum profWickedRender_; // RenderPath3D::Render() (Wicked internal) - mutable ProfileAccum profTrueFrame_; // wall-clock frame-to-frame time - mutable 
std::chrono::high_resolution_clock::time_point frameStartTime_; // for full frame timing - mutable std::chrono::high_resolution_clock::time_point lastComposeEnd_; // for GPU wait measurement - mutable bool lastComposeEndValid_ = false; mutable uint32_t rtBuildSkipCounter_ = 0; // stagger BLAS builds during animation mutable bool rtWasEnabled_ = false; // saved RT state before animation - mutable float profTimer_ = 0.0f; - static constexpr float PROF_INTERVAL = 5.0f; - void logProfilingAverages() const; }; } // namespace voxel diff --git a/voxel_engine_spec.md b/voxel_engine_spec.md index 6a5d099..245dbb8 100644 --- a/voxel_engine_spec.md +++ b/voxel_engine_spec.md @@ -329,4 +329,116 @@ Le ciel te plaît — parfait ! On continue vers Wonderbox. Qu'est-ce que tu vou Couleurs plus saturées/profondes — le vert de l'herbe Wonderbox est plus riche et profond Fog atmosphérique — la brume chaude au loin qui fond le terrain dans le ciel Ombres plus marquées — le contraste ombre/lumière est plus prononcé dans Wonderbox -Faces latérales des blocs — plus texturées/détaillées dans Wonderbox \ No newline at end of file +Faces latérales des blocs — plus texturées/détaillées dans Wonderbox + +Remaining (deferred): + +DeferredGPUBuffer utility extraction — moderate refactor, 3 instances, lower priority +VoxelRTManager extraction — significant refactor, deferred +Toping instance sort+pack parallelization — P3 +Voici mon analyse complète et mes propositions. + +## 1. 
Gestion mémoire : état des lieux et clarification + +**Pattern actuel (répété 5× dans le code) :** +``` +uploadXxxData() { + staging.resize(count); // CPU staging vector + /* fill staging */ + if (capacity < count) { + capacity = count * 1.25; + CreateBuffer(nullptr); // GPU buffer, no data + } + dirty = true; // flag for Render() +} + +Render() { + if (dirty) { + UpdateBuffer(staging); // actual GPU upload + dirty = false; + } +} +``` + +**Problèmes :** +- Pattern dupliqué pour : `topingInstanceBuffer_`, `topingBLASPositionBuffer_`, `smoothVertexBuffer_`, et les 3 BLAS + TLAS +- Chaque duplication a engendré des bugs (le crash `memmove`, la fuite VRAM BLAS, les ombres figées) +- Les dirty flags sont dispersés (`topingInstanceDirty_`, `smoothVertexDirty_`, `topingBLASDirty_`, `rtDirty_`) avec des dépendances d'ordre non-évidentes (le BLAS upload doit précéder le BLAS build) +- 15 membres `mutable` juste pour les flags + capacités + +**Proposition :** Extraire un `DeferredGPUBuffer` encapsulant ce pattern : + +```cpp +struct DeferredGPUBuffer { + GPUBuffer gpu; + std::vector staging; + uint32_t count = 0; + uint32_t capacity = 0; + uint32_t stride = 0; + bool dirty = false; + + void prepare(uint32_t newCount, const void* data); // resize + fill + dirty=true + void upload(GraphicsDevice* dev, CommandList cmd); // UpdateBuffer + dirty=false + void ensureCapacity(GraphicsDevice* dev, uint32_t newCount, BindFlag flags); +}; +``` + +Ça élimine ~50 lignes de boilerplate par buffer et centralise les invariants (capacity ≥ count, create avec nullptr, upload avec taille réelle). + +--- + +## 2. Performance : propositions sans régression fonctionnelle + +### 2.2 — Paralléliser le tri + packing d'instances toping (~5ms → ~1ms) + +Le `std::sort` sur 30K éléments et la copie dans `topingGpuInsts_` sont single-thread. 
Utiliser `wi::jobsystem` pour partitionner par type (2 types = 2 jobs), ou un counting sort par (type, variant) (16 variants × 2 types = 32 buckets) qui est O(N) au lieu de O(N log N). + +### 2.3 — Skip le BLAS rebuild quand seul le blocky change + +Actuellement `buildAccelerationStructures()` rebuild les 3 BLAS + TLAS à chaque frame d'animation. Si seul le terrain blocky change (pas de vent/toping), le toping BLAS rebuild est inutile. Ajouter des dirty flags granulaires : + +mutable bool blockyBLASDirty_ = false; +mutable bool smoothBLASDirty_ = false; +// topingBLASDirty_ existe déjà + +## 3. Refactoring : axes de simplification + +### 3.2 — Extraire le RT dans une classe dédiée + +`VoxelRenderer` fait 2900+ lignes et mélange rendering, meshing, et ray tracing. Extraire un `VoxelRTManager` : + +```cpp +class VoxelRTManager { + // BLAS/TLAS management, capacity tracking + // dispatchBLASExtract(), buildAccelerationStructures() + // dispatchShadows() + // All RT-related state (rtAvailable_, rtDirty_, aoTextures_, etc.) +}; +``` + +Ça isole les ~500 lignes de RT et ses 20+ membres, rendant le debugging plus ciblé. + +### 3.3 — Unifier le pattern deferred upload + +Comme décrit en §1, le `DeferredGPUBuffer` centralisé évite la duplication error-prone. Chaque bug rencontré (crash memmove, VRAM leak, ombres figées) vient d'une variation mal implémentée de ce même pattern. + +### 3.4 — Simplifier `VoxelRenderPath` + +`VoxelRenderPath` fait office de "god object" : caméra, input, animation, profiling, render targets, wind. 
Extraire : +- Input/caméra → struct `CameraController` +- Profiling → struct `VoxelProfiler` (déjà un bon candidat, les `ProfileAccum` sont isolables) +- Animation → struct `AnimationState` + +## Priorisation recommandée + +| Priorité | Action | Impact perf | Effort | +|----------|--------|-------------|--------| +| **P1** | `DeferredGPUBuffer` (§3.3) | Prévention bugs | Moyen | +| **P2** | Extraire RT dans classe (§3.2) | Maintenabilité | Moyen | +| **P2** | Dirty flags granulaires BLAS (§2.3) | ~2-5ms/frame | Faible | +| **P3** | Paralléliser tri toping (§2.2) | ~4ms | Faible | +| **P3** | LOD topings en animation (§4.1) | Raster + BLAS | Moyen | + +**Le P0 seul ramènerait le frame time de 232ms à ~35ms (~28 FPS), soit 6.5× mieux.** Combiné avec P2 dirty flags, on approche les 60 FPS cibles. + +Dis-moi quelles priorités tu veux attaquer et dans quel ordre. \ No newline at end of file