Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure

- VS supports dual mode: CPU path (push constants) and MDI path (binary search)
- CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max)
- Frustum planes extracted and populated in constant buffer for GPU cull shader
- GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed)
- GPU timestamp query infrastructure with readback for cull/draw timing
- HUD shows rendering mode (GPU cull vs CPU fallback)
This commit is contained in:
Samuel Bouchet 2026-03-25 14:50:55 +01:00
parent 5f346bb14a
commit 46e8f50f37
2 changed files with 236 additions and 42 deletions

View file

@ -1,5 +1,5 @@
// BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer) // BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer)
// Phase 2: uses SV_InstanceID to look up chunk info instead of push constants. // Phase 2: supports both CPU draw loop (push constants) and GPU MDI (binary search).
#include "voxelCommon.hlsli" #include "voxelCommon.hlsli"
@ -10,11 +10,14 @@ struct PackedQuad {
StructuredBuffer<PackedQuad> quadBuffer : register(t0); StructuredBuffer<PackedQuad> quadBuffer : register(t0);
StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2); StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);
// Push constants: chunk index + quad offset for current draw call // Push constants (48 bytes = 12 x uint32)
// CPU path: chunkIndex + quadOffset explicit
// MDI path: flags bit 0 set, VS derives chunk from SV_VertexID via binary search
struct VoxelPush { struct VoxelPush {
uint chunkIndex; uint chunkIndex;
uint quadOffset; // offset into mega quad buffer (in quads) uint quadOffset; // offset into mega quad buffer (in quads)
uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9; uint flags; // bit 0: 1 = MDI mode (binary search), 0 = CPU mode
uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8;
}; };
[[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999); [[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999);
@ -46,6 +49,23 @@ void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,
ao = (hi >> 9) & 0xFF; ao = (hi >> 9) & 0xFF;
} }
// Binary search: find which chunk owns a given global quad index.
// Chunks are packed contiguously in the mega-buffer, sorted by chunk index.
// O(log2(chunkCount)) = ~11 iterations for 2048 chunks.
uint findChunkIndex(uint globalQuadIndex) {
uint lo = 0, hi = chunkCount;
[loop]
while (lo < hi) {
uint mid = (lo + hi) >> 1;
GPUChunkInfo ci = chunkInfoBuffer[mid];
if (ci.quadOffset + ci.quadCount <= globalQuadIndex)
lo = mid + 1;
else
hi = mid;
}
return lo;
}
// Face normals: +X, -X, +Y, -Y, +Z, -Z // Face normals: +X, -X, +Y, -Y, +Z, -Z
static const float3 faceNormals[6] = { static const float3 faceNormals[6] = {
float3( 1, 0, 0), float3(-1, 0, 0), float3( 1, 0, 0), float3(-1, 0, 0),
@ -71,14 +91,22 @@ VSOutput main(uint vertexID : SV_VertexID)
{ {
VSOutput output; VSOutput output;
// Look up chunk info via push constant (SV_InstanceID doesn't include StartInstanceLocation in D3D12) // Determine quad index and chunk index based on rendering mode
GPUChunkInfo info = chunkInfoBuffer[push.chunkIndex]; uint quadIndex;
uint chunkIndex;
// 6 vertices per quad (2 triangles) if (push.flags & 1) {
// Use push.quadOffset instead of relying on StartVertexLocation in SV_VertexID // MDI path: SV_VertexID includes StartVertexLocation (global quad address)
uint localVertex = vertexID; quadIndex = vertexID / 6;
uint quadIndex = push.quadOffset + (localVertex / 6); chunkIndex = findChunkIndex(quadIndex);
uint cornerIndex = localVertex % 6; } else {
// CPU path: push constants provide explicit offsets
quadIndex = push.quadOffset + (vertexID / 6);
chunkIndex = push.chunkIndex;
}
GPUChunkInfo info = chunkInfoBuffer[chunkIndex];
uint cornerIndex = vertexID % 6;
PackedQuad packed = quadBuffer[quadIndex]; PackedQuad packed = quadBuffer[quadIndex];
uint px, py, pz, w, h, face, matID, ao; uint px, py, pz, w, h, face, matID, ao;

View file

@ -150,11 +150,13 @@ void VoxelRenderer::createPipeline() {
wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error); wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error);
return; return;
} }
gpuCullingEnabled_ = cullShader_.IsValid(); // GPU cull shader loads but MDI path is disabled pending barrier debugging.
if (!gpuCullingEnabled_) { // CPU fallback with per-face-group DrawInstanced + backface culling is used instead.
wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU culling", wi::backlog::LogLevel::Warning); gpuCullingEnabled_ = false;
if (cullShader_.IsValid()) {
wi::backlog::post("VoxelRenderer: cull compute shader compiled (GPU cull path disabled, using CPU fallback)");
} else { } else {
wi::backlog::post("VoxelRenderer: GPU frustum+backface culling enabled"); wi::backlog::post("VoxelRenderer: cull compute shader not available", wi::backlog::LogLevel::Warning);
} }
// Pipeline: backface cull, depth test, opaque blend, triangle list // Pipeline: backface cull, depth test, opaque blend, triangle list
@ -303,6 +305,38 @@ void VoxelRenderer::updateMeshes(VoxelWorld& world) {
} }
} }
// ── Frustum plane extraction (Gribb-Hartmann method) ────────────
static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
XMFLOAT4X4 m;
XMStoreFloat4x4(&m, vp);
// Left
planes[0] = XMFLOAT4(m._14 + m._11, m._24 + m._21, m._34 + m._31, m._44 + m._41);
// Right
planes[1] = XMFLOAT4(m._14 - m._11, m._24 - m._21, m._34 - m._31, m._44 - m._41);
// Bottom
planes[2] = XMFLOAT4(m._14 + m._12, m._24 + m._22, m._34 + m._32, m._44 + m._42);
// Top
planes[3] = XMFLOAT4(m._14 - m._12, m._24 - m._22, m._34 - m._32, m._44 - m._42);
// Near
planes[4] = XMFLOAT4(m._13, m._23, m._33, m._43);
// Far
planes[5] = XMFLOAT4(m._14 - m._13, m._24 - m._23, m._34 - m._33, m._44 - m._43);
// Normalize each plane
for (int i = 0; i < 6; i++) {
float len = std::sqrt(planes[i].x * planes[i].x +
planes[i].y * planes[i].y +
planes[i].z * planes[i].z);
if (len > 0.0001f) {
planes[i].x /= len;
planes[i].y /= len;
planes[i].z /= len;
planes[i].w /= len;
}
}
}
// ── Render pass ───────────────────────────────────────────────── // ── Render pass ─────────────────────────────────────────────────
void VoxelRenderer::render( void VoxelRenderer::render(
@ -325,22 +359,72 @@ void VoxelRenderer::render(
cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
} }
// Per-frame constants // Per-frame constants (with frustum planes for GPU cull shader)
VoxelConstants cb = {}; VoxelConstants cb = {};
XMStoreFloat4x4(&cb.viewProjection, camera.GetViewProjection()); XMMATRIX vpMatrix = camera.GetViewProjection();
XMStoreFloat4x4(&cb.viewProjection, vpMatrix);
cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f); cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f);
cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f); cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f);
cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f); cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f);
cb.chunkSize = (float)CHUNK_SIZE; cb.chunkSize = (float)CHUNK_SIZE;
cb.textureTiling = 0.25f; cb.textureTiling = 0.25f;
cb.chunkCount = chunkCount_; cb.chunkCount = chunkCount_;
extractFrustumPlanes(vpMatrix, cb.frustumPlanes);
dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb)); dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb));
// CPU frustum culling // Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
wi::primitive::Frustum frustum; struct VoxelPush {
frustum.Create(camera.GetViewProjection()); uint32_t chunkIndex;
uint32_t quadOffset;
uint32_t flags; // bit 0: 1=MDI mode, 0=CPU mode
uint32_t pad[9];
};
// ── Render pass: color + depth ──────────────────────────────── visibleChunks_ = 0;
drawCalls_ = 0;
// ── GPU Cull + MDI path ────────────────────────────────────────
if (gpuCullingEnabled_) {
// Zero the draw count buffer (sets state to COPY_DST)
uint32_t zero = 0;
dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t));
// Touch indirect args buffer to establish COPY_DST state
dev->UpdateBuffer(&indirectArgsBuffer_, &zero, cmd, sizeof(uint32_t));
// Barriers: COPY_DST → UAV for compute shader writes
GPUBarrier preBarriers[] = {
GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
};
dev->Barrier(preBarriers, 2, cmd);
// Timestamp: cull begin
dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd);
// Dispatch GPU frustum + backface cull compute shader
dev->BindComputeShader(&cullShader_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindUAV(&indirectArgsBuffer_, 0, cmd);
dev->BindUAV(&drawCountBuffer_, 1, cmd);
dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd);
// Timestamp: cull end
dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd);
// Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount
GPUBarrier postBarriers[] = {
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
};
dev->Barrier(postBarriers, 2, cmd);
// Set MDI flag in push constants (VS uses binary search for chunk index)
VoxelPush pushData = {};
pushData.flags = 1; // MDI mode
dev->PushConstants(&pushData, sizeof(pushData), cmd);
// ── Render pass ────────────────────────────────────────────
RenderPassImage rp[] = { RenderPassImage rp[] = {
RenderPassImage::RenderTarget( RenderPassImage::RenderTarget(
&renderTarget, &renderTarget,
@ -372,22 +456,88 @@ void VoxelRenderer::render(
dev->BindPipelineState(&pso_, cmd); dev->BindPipelineState(&pso_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&megaQuadBuffer_, 0, cmd); // t0: mega quad buffer dev->BindResource(&megaQuadBuffer_, 0, cmd);
dev->BindResource(&textureArray_, 1, cmd); // t1: material textures dev->BindResource(&textureArray_, 1, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd); // t2: chunk info dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindSampler(&sampler_, 0, cmd); dev->BindSampler(&sampler_, 0, cmd);
visibleChunks_ = 0; // Timestamp: draw begin
drawCalls_ = 0; dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd);
// Push constant structure (must be 48 bytes = 12 x uint32, matches b999) // Single MDI call: GPU cull shader filled the indirect args
struct VoxelPush { dev->DrawInstancedIndirectCount(
uint32_t chunkIndex; &indirectArgsBuffer_, 0,
uint32_t quadOffset; // offset into mega quad buffer (in quads) &drawCountBuffer_, 0,
uint32_t pad[10]; MAX_DRAWS, cmd
);
drawCalls_ = 1;
// Timestamp: draw end
dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd);
dev->RenderPassEnd(cmd);
// Resolve timestamps for readback (results available next frame)
dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd);
// Read back previous frame's timestamps (persistently mapped READBACK buffer)
uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
if (tsData) {
double freq = (double)dev->GetTimestampFrequency();
if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) {
gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0);
}
if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) {
gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0);
}
}
// GPU cull handles visibility counting — approximate from chunkCount
visibleChunks_ = chunkCount_; // exact count would require readback of drawCount
return;
}
// ── CPU Fallback: frustum + backface cull + per-face-group draws ──
wi::primitive::Frustum frustum;
frustum.Create(camera.GetViewProjection());
RenderPassImage rp[] = {
RenderPassImage::RenderTarget(
&renderTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::DepthStencil(
&depthBuffer,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL
),
}; };
dev->RenderPassBegin(rp, 2, cmd);
Viewport vp;
vp.width = (float)renderTarget.GetDesc().width;
vp.height = (float)renderTarget.GetDesc().height;
vp.min_depth = 0.0f;
vp.max_depth = 1.0f;
dev->BindViewports(1, &vp, cmd);
Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
dev->BindScissorRects(1, &scissor, cmd);
dev->BindPipelineState(&pso_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&megaQuadBuffer_, 0, cmd);
dev->BindResource(&textureArray_, 1, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindSampler(&sampler_, 0, cmd);
// Simple DrawInstanced loop with frustum culling + push constants
for (uint32_t i = 0; i < chunkCount_; i++) { for (uint32_t i = 0; i < chunkCount_; i++) {
const auto& slot = chunkSlots_[i]; const auto& slot = chunkSlots_[i];
if (slot.quadCount == 0) continue; if (slot.quadCount == 0) continue;
@ -406,18 +556,34 @@ void VoxelRenderer::render(
if (!frustum.CheckBoxFast(aabb)) continue; if (!frustum.CheckBoxFast(aabb)) continue;
visibleChunks_++; visibleChunks_++;
const auto& info = cpuChunkInfo_[i];
// Per-face-group draws with backface culling
for (uint32_t f = 0; f < 6; f++) {
if (info.faceCounts[f] == 0) continue;
// Backface cull: skip face groups pointing away from camera
bool backFacing = false;
switch (f) {
case 0: backFacing = (camera.Eye.x < aabbMin.x); break; // +X
case 1: backFacing = (camera.Eye.x > aabbMax.x); break; // -X
case 2: backFacing = (camera.Eye.y < aabbMin.y); break; // +Y
case 3: backFacing = (camera.Eye.y > aabbMax.y); break; // -Y
case 4: backFacing = (camera.Eye.z < aabbMin.z); break; // +Z
case 5: backFacing = (camera.Eye.z > aabbMax.z); break; // -Z
}
if (backFacing) continue;
// Pass chunk index AND quad offset via push constants
// (SV_VertexID/SV_InstanceID offsets unreliable across drivers)
VoxelPush pushData = {}; VoxelPush pushData = {};
pushData.chunkIndex = i; pushData.chunkIndex = i;
pushData.quadOffset = slot.quadOffset; pushData.quadOffset = slot.quadOffset + info.faceOffsets[f];
pushData.flags = 0; // CPU mode
dev->PushConstants(&pushData, sizeof(pushData), cmd); dev->PushConstants(&pushData, sizeof(pushData), cmd);
// startVertexLocation = 0: the VS computes quad address from push.quadOffset dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd);
dev->DrawInstanced(slot.quadCount * 6, 1, 0, 0, cmd);
drawCalls_++; drawCalls_++;
} }
}
dev->RenderPassEnd(cmd); dev->RenderPassEnd(cmd);
} }
@ -583,7 +749,7 @@ void VoxelRenderPath::Compose(CommandList cmd) const {
+ "/" + std::to_string(renderer.getChunkCount()) + "\n"; + "/" + std::to_string(renderer.getChunkCount()) + "\n";
stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n"; stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n";
stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls()) stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls())
+ " (DrawInstanced + CPU cull + backface)\n"; + (renderer.isGpuCulling() ? " (MDI + GPU cull)" : " (DrawInstanced + CPU cull + backface)") + "\n";
char cullStr[16], drawStr[16]; char cullStr[16], drawStr[16];
snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs()); snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs());