Phase 2.4: GPU compute mesher benchmark (CPU greedy vs GPU baseline)
One-shot benchmark runs automatically after world generation: - CPU greedy mesher: 277ms, 358K quads (binary greedy merge) - GPU baseline (1x1): 5.3ms, 2.43M quads (no merge, 52x faster) - Greedy merge reduces quad count by 6.8x Implementation: - State machine: DISPATCH (upload voxels + dispatch) → READBACK → DONE - GPU timestamps for accurate timing - Readback buffer for quad counter - Each chunk's voxel data uploaded and dispatched sequentially
This commit is contained in:
parent
1bfadc2f7c
commit
9a8f80de51
3 changed files with 163 additions and 7 deletions
12
CLAUDE.md
12
CLAUDE.md
|
|
@ -334,11 +334,15 @@ Découpée en sous-phases pour isoler les sources de bugs potentiels :
|
||||||
- Compute shader corrigé : push constant packing + startVertexLocation=0 — voir points 7-8
|
- Compute shader corrigé : push constant packing + startVertexLocation=0 — voir points 7-8
|
||||||
- `ResourceState::UNDEFINED` = COMMON en Wicked (valeur 0), déclenche `DiscardResource()` — OK pour les buffers réécrits
|
- `ResourceState::UNDEFINED` = COMMON en Wicked (valeur 0), déclenche `DiscardResource()` — OK pour les buffers réécrits
|
||||||
|
|
||||||
#### Phase 2.4 - GPU compute mesher (benchmark) [A FAIRE]
|
#### Phase 2.4 - GPU compute mesher (benchmark) [FAIT]
|
||||||
|
|
||||||
- Le compute shader `voxelMeshCS.hlsl` fait le meshing sur GPU (baseline 1x1, puis greedy)
|
- Le compute shader `voxelMeshCS.hlsl` fait le meshing 1×1 sur GPU (1 thread par voxel, 8×8×8 thread groups)
|
||||||
- Benchmark CPU greedy vs GPU baseline vs GPU greedy
|
- Benchmark automatique au premier frame après génération du monde
|
||||||
- Intégration dans le pipeline de rendu
|
- Résultats (168 chunks, Ryzen 7 3700X + RX 5700 XT) :
|
||||||
|
- CPU greedy: 277 ms, 358K quads → greedy merge réduit les quads de 6.8×
|
||||||
|
- GPU baseline (1×1): 5.3 ms, 2.43M quads → 52× plus rapide que CPU
|
||||||
|
- GPU greedy merge non implémenté (pourrait combiner vitesse GPU + réduction de quads)
|
||||||
|
- Le benchmark est one-shot : state machine IDLE → DISPATCH → READBACK → DONE
|
||||||
|
|
||||||
### Phase 3 - Texture blending [A FAIRE]
|
### Phase 3 - Texture blending [A FAIRE]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
#include "VoxelRenderer.h"
|
#include "VoxelRenderer.h"
|
||||||
#include "wiPrimitive.h"
|
#include "wiPrimitive.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <chrono>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
using namespace wi::graphics;
|
using namespace wi::graphics;
|
||||||
|
|
@ -102,6 +103,12 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) {
|
||||||
cntDesc.usage = Usage::DEFAULT;
|
cntDesc.usage = Usage::DEFAULT;
|
||||||
device_->CreateBuffer(&cntDesc, nullptr, &gpuQuadCounter_);
|
device_->CreateBuffer(&cntDesc, nullptr, &gpuQuadCounter_);
|
||||||
|
|
||||||
|
// Readback buffer for quad counter (GPU → CPU)
|
||||||
|
GPUBufferDesc rbDesc;
|
||||||
|
rbDesc.size = sizeof(uint32_t);
|
||||||
|
rbDesc.usage = Usage::READBACK;
|
||||||
|
device_->CreateBuffer(&rbDesc, nullptr, &meshCounterReadback_);
|
||||||
|
|
||||||
wi::backlog::post("VoxelRenderer: GPU compute mesher available");
|
wi::backlog::post("VoxelRenderer: GPU compute mesher available");
|
||||||
} else {
|
} else {
|
||||||
wi::backlog::post("VoxelRenderer: GPU compute mesher not available", wi::backlog::LogLevel::Warning);
|
wi::backlog::post("VoxelRenderer: GPU compute mesher not available", wi::backlog::LogLevel::Warning);
|
||||||
|
|
@ -289,14 +296,24 @@ void VoxelRenderer::rebuildMegaBuffer(VoxelWorld& world) {
|
||||||
void VoxelRenderer::updateMeshes(VoxelWorld& world) {
|
void VoxelRenderer::updateMeshes(VoxelWorld& world) {
|
||||||
if (!device_) return;
|
if (!device_) return;
|
||||||
|
|
||||||
// Re-mesh dirty chunks
|
// Re-mesh dirty chunks, measure CPU time for benchmark
|
||||||
bool anyDirty = false;
|
bool anyDirty = false;
|
||||||
|
auto cpuStart = std::chrono::high_resolution_clock::now();
|
||||||
world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
|
world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
|
||||||
if (chunk.dirty) {
|
if (chunk.dirty) {
|
||||||
VoxelMesher::meshChunk(chunk, world);
|
VoxelMesher::meshChunk(chunk, world);
|
||||||
anyDirty = true;
|
anyDirty = true;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
auto cpuEnd = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
if (anyDirty) {
|
||||||
|
cpuMeshTimeMs_ = std::chrono::duration<float, std::milli>(cpuEnd - cpuStart).count();
|
||||||
|
// Trigger GPU benchmark on next render frame
|
||||||
|
if (gpuMesherAvailable_ && benchState_ == BenchState::IDLE) {
|
||||||
|
benchState_ = BenchState::DISPATCH;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (anyDirty || megaBufferDirty_) {
|
if (anyDirty || megaBufferDirty_) {
|
||||||
rebuildMegaBuffer(world);
|
rebuildMegaBuffer(world);
|
||||||
|
|
@ -304,6 +321,119 @@ void VoxelRenderer::updateMeshes(VoxelWorld& world) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── GPU Mesh Benchmark (Phase 2.4) ──────────────────────────────
|
||||||
|
// Dispatches the baseline 1x1 GPU mesher for ALL chunks and measures timing.
|
||||||
|
// State machine: DISPATCH (frame N) → READBACK (frame N+1) → DONE.
|
||||||
|
|
||||||
|
void VoxelRenderer::dispatchGpuMeshBenchmark(CommandList cmd, const VoxelWorld& world) const {
|
||||||
|
auto* dev = device_;
|
||||||
|
|
||||||
|
// Zero the quad counter
|
||||||
|
uint32_t zero = 0;
|
||||||
|
dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t));
|
||||||
|
|
||||||
|
// Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer
|
||||||
|
GPUBarrier preBarriers[] = {
|
||||||
|
GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
|
||||||
|
GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
|
||||||
|
};
|
||||||
|
dev->Barrier(preBarriers, 2, cmd);
|
||||||
|
|
||||||
|
dev->BindComputeShader(&meshShader_, cmd);
|
||||||
|
|
||||||
|
// GPU timestamp: mesh begin
|
||||||
|
dev->QueryEnd(×tampHeap_, TS_MESH_BEGIN, cmd);
|
||||||
|
|
||||||
|
// Dispatch for each chunk
|
||||||
|
uint32_t chunkIdx = 0;
|
||||||
|
world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
|
||||||
|
// Pack voxel data: 32^3 voxels → 16384 uint32s (2 voxels per uint)
|
||||||
|
std::vector<uint32_t> packed(CHUNK_VOLUME / 2, 0);
|
||||||
|
for (int i = 0; i < CHUNK_VOLUME; i++) {
|
||||||
|
uint32_t v = chunk.voxels[i].packed;
|
||||||
|
if (i & 1)
|
||||||
|
packed[i >> 1] |= (v << 16);
|
||||||
|
else
|
||||||
|
packed[i >> 1] = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upload voxel data (re-uses the single-chunk buffer)
|
||||||
|
dev->UpdateBuffer(&voxelDataBuffer_, packed.data(), cmd,
|
||||||
|
packed.size() * sizeof(uint32_t));
|
||||||
|
|
||||||
|
// Bind resources (after BindComputeShader, so PushConstants targets compute)
|
||||||
|
dev->BindResource(&voxelDataBuffer_, 0, cmd);
|
||||||
|
dev->BindUAV(&gpuQuadBuffer_, 0, cmd);
|
||||||
|
dev->BindUAV(&gpuQuadCounter_, 1, cmd);
|
||||||
|
|
||||||
|
// Push constants for this chunk
|
||||||
|
struct MeshPush {
|
||||||
|
uint32_t chunkIndex;
|
||||||
|
uint32_t voxelBufferOffset;
|
||||||
|
uint32_t quadBufferOffset;
|
||||||
|
uint32_t maxOutputQuads;
|
||||||
|
uint32_t pad[8];
|
||||||
|
};
|
||||||
|
MeshPush pushData = {};
|
||||||
|
pushData.chunkIndex = chunkIdx;
|
||||||
|
pushData.voxelBufferOffset = 0; // single-chunk buffer, always at offset 0
|
||||||
|
pushData.quadBufferOffset = 0; // all chunks share global atomic counter
|
||||||
|
pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY;
|
||||||
|
dev->PushConstants(&pushData, sizeof(pushData), cmd);
|
||||||
|
|
||||||
|
// Dispatch: 32/8 = 4 groups per axis → 64 groups total
|
||||||
|
dev->Dispatch(4, 4, 4, cmd);
|
||||||
|
|
||||||
|
chunkIdx++;
|
||||||
|
});
|
||||||
|
|
||||||
|
// GPU timestamp: mesh end
|
||||||
|
dev->QueryEnd(×tampHeap_, TS_MESH_END, cmd);
|
||||||
|
|
||||||
|
// Copy quad counter to readback buffer
|
||||||
|
GPUBarrier postBarrier = GPUBarrier::Buffer(
|
||||||
|
&gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC);
|
||||||
|
dev->Barrier(&postBarrier, 1, cmd);
|
||||||
|
dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd);
|
||||||
|
|
||||||
|
// Resolve timestamps
|
||||||
|
dev->QueryResolve(×tampHeap_, TS_MESH_BEGIN, 2, ×tampReadback_,
|
||||||
|
TS_MESH_BEGIN * sizeof(uint64_t), cmd);
|
||||||
|
|
||||||
|
benchState_ = BenchState::READBACK;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VoxelRenderer::readbackGpuMeshBenchmark() const {
|
||||||
|
// Read quad count from readback buffer
|
||||||
|
uint32_t* countData = (uint32_t*)meshCounterReadback_.mapped_data;
|
||||||
|
if (countData) {
|
||||||
|
gpuBaselineQuads_ = *countData;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read GPU mesh timestamps
|
||||||
|
uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
|
||||||
|
if (tsData) {
|
||||||
|
double freq = (double)device_->GetTimestampFrequency();
|
||||||
|
if (freq > 0.0 && tsData[TS_MESH_END] > tsData[TS_MESH_BEGIN]) {
|
||||||
|
gpuMeshTimeMs_ = (float)((double)(tsData[TS_MESH_END] - tsData[TS_MESH_BEGIN]) / freq * 1000.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log benchmark results
|
||||||
|
char msg[256];
|
||||||
|
snprintf(msg, sizeof(msg),
|
||||||
|
"=== MESH BENCHMARK ===\n"
|
||||||
|
" CPU greedy: %.2f ms, %u quads (%u chunks)\n"
|
||||||
|
" GPU baseline: %.3f ms, %u quads (1x1, no merge)\n"
|
||||||
|
" Ratio quads: %.1fx more (GPU baseline vs CPU greedy)",
|
||||||
|
cpuMeshTimeMs_, totalQuads_, chunkCount_,
|
||||||
|
gpuMeshTimeMs_, gpuBaselineQuads_,
|
||||||
|
totalQuads_ > 0 ? (float)gpuBaselineQuads_ / totalQuads_ : 0.0f);
|
||||||
|
wi::backlog::post(msg);
|
||||||
|
|
||||||
|
benchState_ = BenchState::DONE;
|
||||||
|
}
|
||||||
|
|
||||||
// ── Frustum plane extraction (Gribb-Hartmann method) ────────────
|
// ── Frustum plane extraction (Gribb-Hartmann method) ────────────
|
||||||
static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
|
static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
|
||||||
XMFLOAT4X4 m;
|
XMFLOAT4X4 m;
|
||||||
|
|
@ -837,6 +967,14 @@ void VoxelRenderPath::Render() const {
|
||||||
if (renderer.isInitialized() && camera && rtCreated_) {
|
if (renderer.isInitialized() && camera && rtCreated_) {
|
||||||
auto* device = wi::graphics::GetDevice();
|
auto* device = wi::graphics::GetDevice();
|
||||||
CommandList cmd = device->BeginCommandList();
|
CommandList cmd = device->BeginCommandList();
|
||||||
|
|
||||||
|
// GPU mesh benchmark state machine (runs once after world gen)
|
||||||
|
if (renderer.benchState_ == VoxelRenderer::BenchState::DISPATCH) {
|
||||||
|
renderer.dispatchGpuMeshBenchmark(cmd, world);
|
||||||
|
} else if (renderer.benchState_ == VoxelRenderer::BenchState::READBACK) {
|
||||||
|
renderer.readbackGpuMeshBenchmark();
|
||||||
|
}
|
||||||
|
|
||||||
renderer.render(cmd, *camera, voxelDepth_, voxelRT_);
|
renderer.render(cmd, *camera, voxelDepth_, voxelRT_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ struct GPUChunkInfo {
|
||||||
|
|
||||||
// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
|
// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
|
||||||
class VoxelRenderer {
|
class VoxelRenderer {
|
||||||
|
friend class VoxelRenderPath;
|
||||||
public:
|
public:
|
||||||
VoxelRenderer();
|
VoxelRenderer();
|
||||||
~VoxelRenderer();
|
~VoxelRenderer();
|
||||||
|
|
@ -119,13 +120,23 @@ private:
|
||||||
};
|
};
|
||||||
wi::graphics::GPUBuffer constantBuffer_;
|
wi::graphics::GPUBuffer constantBuffer_;
|
||||||
|
|
||||||
// ── GPU Compute Mesher (Phase 2 benchmark) ─────────────────────
|
// ── GPU Compute Mesher (Phase 2.4 benchmark) ───────────────────
|
||||||
wi::graphics::Shader meshShader_; // voxelMeshCS compute shader
|
wi::graphics::Shader meshShader_; // voxelMeshCS compute shader
|
||||||
wi::graphics::GPUBuffer voxelDataBuffer_; // chunk voxel data (StructuredBuffer<uint>)
|
wi::graphics::GPUBuffer voxelDataBuffer_; // chunk voxel data (StructuredBuffer<uint>)
|
||||||
wi::graphics::GPUBuffer gpuQuadBuffer_; // GPU mesh output (RWStructuredBuffer<uint2>)
|
wi::graphics::GPUBuffer gpuQuadBuffer_; // GPU mesh output (RWStructuredBuffer<uint2>)
|
||||||
wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
|
wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
|
||||||
|
wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
|
||||||
bool gpuMesherAvailable_ = false;
|
bool gpuMesherAvailable_ = false;
|
||||||
|
|
||||||
|
// Benchmark state machine: runs once after world gen
|
||||||
|
enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
|
||||||
|
mutable BenchState benchState_ = BenchState::IDLE;
|
||||||
|
mutable float cpuMeshTimeMs_ = 0.0f;
|
||||||
|
mutable uint32_t gpuBaselineQuads_ = 0;
|
||||||
|
|
||||||
|
void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
|
||||||
|
void readbackGpuMeshBenchmark() const;
|
||||||
|
|
||||||
// ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
|
// ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
|
||||||
wi::graphics::GPUQueryHeap timestampHeap_;
|
wi::graphics::GPUQueryHeap timestampHeap_;
|
||||||
wi::graphics::GPUBuffer timestampReadback_;
|
wi::graphics::GPUBuffer timestampReadback_;
|
||||||
|
|
@ -133,9 +144,12 @@ private:
|
||||||
static constexpr uint32_t TS_CULL_END = 1;
|
static constexpr uint32_t TS_CULL_END = 1;
|
||||||
static constexpr uint32_t TS_DRAW_BEGIN = 2;
|
static constexpr uint32_t TS_DRAW_BEGIN = 2;
|
||||||
static constexpr uint32_t TS_DRAW_END = 3;
|
static constexpr uint32_t TS_DRAW_END = 3;
|
||||||
static constexpr uint32_t TS_COUNT = 4;
|
static constexpr uint32_t TS_MESH_BEGIN = 4;
|
||||||
|
static constexpr uint32_t TS_MESH_END = 5;
|
||||||
|
static constexpr uint32_t TS_COUNT = 6;
|
||||||
mutable float gpuCullTimeMs_ = 0.0f;
|
mutable float gpuCullTimeMs_ = 0.0f;
|
||||||
mutable float gpuDrawTimeMs_ = 0.0f;
|
mutable float gpuDrawTimeMs_ = 0.0f;
|
||||||
|
mutable float gpuMeshTimeMs_ = 0.0f;
|
||||||
|
|
||||||
// Stats (mutable: updated during const Render() call)
|
// Stats (mutable: updated during const Render() call)
|
||||||
mutable uint32_t totalQuads_ = 0;
|
mutable uint32_t totalQuads_ = 0;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue