diff --git a/CLAUDE.md b/CLAUDE.md index 791efb4..88a6d12 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -334,11 +334,15 @@ Découpée en sous-phases pour isoler les sources de bugs potentiels : - Compute shader corrigé : push constant packing + startVertexLocation=0 — voir points 7-8 - `ResourceState::UNDEFINED` = COMMON en Wicked (valeur 0), déclenche `DiscardResource()` — OK pour les buffers réécrits -#### Phase 2.4 - GPU compute mesher (benchmark) [A FAIRE] +#### Phase 2.4 - GPU compute mesher (benchmark) [FAIT] -- Le compute shader `voxelMeshCS.hlsl` fait le meshing sur GPU (baseline 1x1, puis greedy) -- Benchmark CPU greedy vs GPU baseline vs GPU greedy -- Intégration dans le pipeline de rendu +- Le compute shader `voxelMeshCS.hlsl` fait le meshing 1×1 sur GPU (1 thread par voxel, 8×8×8 thread groups) +- Benchmark automatique au premier frame après génération du monde +- Résultats (168 chunks, Ryzen 7 3700X + RX 5700 XT) : + - CPU greedy: 277 ms, 358K quads → greedy merge réduit les quads de 6.8× + - GPU baseline (1×1): 5.3 ms, 2.43M quads → 52× plus rapide que CPU +- GPU greedy merge non implémenté (pourrait combiner vitesse GPU + réduction de quads) +- Le benchmark est one-shot : state machine IDLE → DISPATCH → READBACK → DONE ### Phase 3 - Texture blending [A FAIRE] diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index 6d483fa..9061783 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -1,6 +1,7 @@ #include "VoxelRenderer.h" #include "wiPrimitive.h" #include +#include #include using namespace wi::graphics; @@ -102,6 +103,12 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) { cntDesc.usage = Usage::DEFAULT; device_->CreateBuffer(&cntDesc, nullptr, &gpuQuadCounter_); + // Readback buffer for quad counter (GPU → CPU) + GPUBufferDesc rbDesc; + rbDesc.size = sizeof(uint32_t); + rbDesc.usage = Usage::READBACK; + device_->CreateBuffer(&rbDesc, nullptr, &meshCounterReadback_); + wi::backlog::post("VoxelRenderer: GPU 
compute mesher available"); } else { wi::backlog::post("VoxelRenderer: GPU compute mesher not available", wi::backlog::LogLevel::Warning); @@ -289,14 +296,24 @@ void VoxelRenderer::rebuildMegaBuffer(VoxelWorld& world) { void VoxelRenderer::updateMeshes(VoxelWorld& world) { if (!device_) return; - // Re-mesh dirty chunks + // Re-mesh dirty chunks, measure CPU time for benchmark bool anyDirty = false; + auto cpuStart = std::chrono::high_resolution_clock::now(); world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { if (chunk.dirty) { VoxelMesher::meshChunk(chunk, world); anyDirty = true; } }); + auto cpuEnd = std::chrono::high_resolution_clock::now(); + + if (anyDirty) { + cpuMeshTimeMs_ = std::chrono::duration(cpuEnd - cpuStart).count(); + // Trigger GPU benchmark on next render frame + if (gpuMesherAvailable_ && benchState_ == BenchState::IDLE) { + benchState_ = BenchState::DISPATCH; + } + } if (anyDirty || megaBufferDirty_) { rebuildMegaBuffer(world); @@ -304,6 +321,119 @@ void VoxelRenderer::updateMeshes(VoxelWorld& world) { } } +// ── GPU Mesh Benchmark (Phase 2.4) ────────────────────────────── +// Dispatches the baseline 1x1 GPU mesher for ALL chunks and measures timing. +// State machine: DISPATCH (frame N) → READBACK (frame N+1) → DONE. 
+
+// Frame N: records counter reset, per-chunk upload + dispatch, timestamps and counter copy-out on `cmd`.
+void VoxelRenderer::dispatchGpuMeshBenchmark(CommandList cmd, const VoxelWorld& world) const {
+    auto* dev = device_;
+
+    // Zero the quad counter
+    uint32_t zero = 0;
+    dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t));
+
+    // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer
+    GPUBarrier preBarriers[] = {
+        GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
+        GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
+    };
+    dev->Barrier(preBarriers, 2, cmd);
+
+    dev->BindComputeShader(&meshShader_, cmd);
+
+    // GPU timestamp: mesh begin
+    dev->QueryEnd(&timestampHeap_, TS_MESH_BEGIN, cmd);
+
+    // Per-chunk staging, allocated once: 32^3 voxels → 16384 uint32s (2 voxels per uint)
+    std::vector<uint32_t> packed(CHUNK_VOLUME / 2, 0);
+
+    // Dispatch for each chunk
+    // NOTE(review): no barrier is recorded between one chunk's Dispatch and the next chunk's
+    // upload into voxelDataBuffer_ — confirm the backend orders them, otherwise add barriers.
+    uint32_t chunkIdx = 0;
+    world.forEachChunk([&](const ChunkPos& /*pos*/, Chunk& chunk) {
+        // Even voxel goes in the low half, odd voxel is shifted up by 16; every element is rewritten
+        for (int i = 0; i < CHUNK_VOLUME; i += 2) {
+            const uint32_t lo = chunk.voxels[i].packed;
+            const uint32_t hi = chunk.voxels[i + 1].packed;
+            packed[i >> 1] = lo | (hi << 16);
+        }
+
+        // Upload voxel data (re-uses the single-chunk buffer)
+        dev->UpdateBuffer(&voxelDataBuffer_, packed.data(), cmd,
+                          packed.size() * sizeof(uint32_t));
+
+        // Bind resources (after BindComputeShader, so PushConstants targets compute)
+        dev->BindResource(&voxelDataBuffer_, 0, cmd);
+        dev->BindUAV(&gpuQuadBuffer_, 0, cmd);
+        dev->BindUAV(&gpuQuadCounter_, 1, cmd);
+
+        // Push constants for this chunk
+        struct MeshPush {
+            uint32_t chunkIndex;
+            uint32_t voxelBufferOffset;
+            uint32_t quadBufferOffset;
+            uint32_t maxOutputQuads;
+            uint32_t pad[8];
+        };
+        MeshPush pushData = {};
+        pushData.chunkIndex = chunkIdx++;
+        pushData.voxelBufferOffset = 0; // single-chunk buffer, always at offset 0
+        pushData.quadBufferOffset = 0; // all chunks share global atomic counter
+        pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY;
+        dev->PushConstants(&pushData, sizeof(pushData), cmd);
+
+        // Dispatch: 32/8 = 4 groups per axis → 64 groups total
+        dev->Dispatch(4, 4, 4, cmd);
+    });
+
+    // GPU timestamp: mesh end
+    dev->QueryEnd(&timestampHeap_, TS_MESH_END, cmd);
+
+    // Copy quad counter to readback buffer
+    GPUBarrier postBarrier = GPUBarrier::Buffer(
+        &gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC);
+    dev->Barrier(&postBarrier, 1, cmd);
+    dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd);
+
+    // Resolve timestamps
+    dev->QueryResolve(&timestampHeap_, TS_MESH_BEGIN, 2, &timestampReadback_,
+                      TS_MESH_BEGIN * sizeof(uint64_t), cmd);
+
+    benchState_ = BenchState::READBACK;
+}
+
+// Frame N+1: reads the mapped readback buffers, logs the CPU-vs-GPU comparison, then enters DONE.
+void VoxelRenderer::readbackGpuMeshBenchmark() const {
+    // Read quad count from readback buffer (left unchanged when the buffer is not mapped)
+    if (const auto* countData = (const uint32_t*)meshCounterReadback_.mapped_data) {
+        gpuBaselineQuads_ = *countData;
+    }
+
+    // Read GPU mesh timestamps (previous value is kept when the pair is missing or not increasing)
+    if (const auto* tsData = (const uint64_t*)timestampReadback_.mapped_data) {
+        const double freq = (double)device_->GetTimestampFrequency();
+        if (freq > 0.0 && tsData[TS_MESH_END] > tsData[TS_MESH_BEGIN]) {
+            gpuMeshTimeMs_ = (float)((double)(tsData[TS_MESH_END] - tsData[TS_MESH_BEGIN]) / freq * 1000.0);
+        }
+    }
+
+    // Log benchmark results
+    char msg[256];
+    snprintf(msg, sizeof(msg),
+        "=== MESH BENCHMARK ===\n"
+        " CPU greedy: %.2f ms, %u quads (%u chunks)\n"
+        " GPU baseline: %.3f ms, %u quads (1x1, no merge)\n"
+        " Ratio quads: %.1fx more (GPU baseline vs CPU greedy)",
+        cpuMeshTimeMs_, totalQuads_, chunkCount_,
+        gpuMeshTimeMs_, gpuBaselineQuads_,
+        totalQuads_ > 0 ? (float)gpuBaselineQuads_ / totalQuads_ : 0.0f);
+    wi::backlog::post(msg);
+
+    benchState_ = BenchState::DONE;
+}
+
 // ── Frustum plane extraction (Gribb-Hartmann method) ────────────
 static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
     XMFLOAT4X4 m;
@@ -837,6 +967,14 @@ void VoxelRenderPath::Render() const {
     if (renderer.isInitialized() && camera && rtCreated_) {
         auto* device = wi::graphics::GetDevice();
         CommandList cmd = device->BeginCommandList();
+
+        // GPU mesh benchmark state machine (runs once after world gen)
+        if (renderer.benchState_ == VoxelRenderer::BenchState::DISPATCH) {
+            renderer.dispatchGpuMeshBenchmark(cmd, world);
+        } else if (renderer.benchState_ == VoxelRenderer::BenchState::READBACK) {
+            renderer.readbackGpuMeshBenchmark();
+        }
+
         renderer.render(cmd, *camera, voxelDepth_, voxelRT_);
     }
 }
diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h
index d8322f3..520a54e 100644
--- a/src/voxel/VoxelRenderer.h
+++ b/src/voxel/VoxelRenderer.h
@@ -17,6 +17,7 @@ struct GPUChunkInfo {
 
 // ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
 class VoxelRenderer {
+    friend class VoxelRenderPath;
 public:
     VoxelRenderer();
     ~VoxelRenderer();
@@ -119,13 +120,23 @@ private:
     };
     wi::graphics::GPUBuffer constantBuffer_;
 
-    // ── GPU Compute Mesher (Phase 2 benchmark) ─────────────────────
+    // ── GPU Compute Mesher (Phase 2.4 benchmark) ───────────────────
     wi::graphics::Shader meshShader_; // voxelMeshCS compute shader
     wi::graphics::GPUBuffer voxelDataBuffer_; // chunk voxel data (StructuredBuffer)
     wi::graphics::GPUBuffer gpuQuadBuffer_; // GPU mesh output (RWStructuredBuffer)
     wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
+    wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
     bool gpuMesherAvailable_ = false;
 
+    // Benchmark state machine: runs once after world gen
+    enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
+    mutable BenchState benchState_ = BenchState::IDLE;
+    mutable float cpuMeshTimeMs_ = 0.0f;
+    mutable uint32_t gpuBaselineQuads_ = 0;
+
+    void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
+    void readbackGpuMeshBenchmark() const;
+
     // ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
     wi::graphics::GPUQueryHeap timestampHeap_;
     wi::graphics::GPUBuffer timestampReadback_;
@@ -133,9 +144,12 @@ private:
     static constexpr uint32_t TS_CULL_END = 1;
     static constexpr uint32_t TS_DRAW_BEGIN = 2;
     static constexpr uint32_t TS_DRAW_END = 3;
-    static constexpr uint32_t TS_COUNT = 4;
+    static constexpr uint32_t TS_MESH_BEGIN = 4;
+    static constexpr uint32_t TS_MESH_END = 5;
+    static constexpr uint32_t TS_COUNT = 6;
     mutable float gpuCullTimeMs_ = 0.0f;
     mutable float gpuDrawTimeMs_ = 0.0f;
+    mutable float gpuMeshTimeMs_ = 0.0f;
 
     // Stats (mutable: updated during const Render() call)
     mutable uint32_t totalQuads_ = 0;