Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure
- VS supports dual mode: CPU path (push constants) and MDI path (binary search)
- CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max)
- Frustum planes extracted and populated in constant buffer for GPU cull shader
- GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed)
- GPU timestamp query infrastructure with readback for cull/draw timing
- HUD shows rendering mode (GPU cull vs CPU fallback)
This commit is contained in:
parent
5f346bb14a
commit
46e8f50f37
2 changed files with 236 additions and 42 deletions
|
|
@ -1,5 +1,5 @@
|
|||
// BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer)
|
||||
// Phase 2: uses SV_InstanceID to look up chunk info instead of push constants.
|
||||
// Phase 2: supports both CPU draw loop (push constants) and GPU MDI (binary search).
|
||||
|
||||
#include "voxelCommon.hlsli"
|
||||
|
||||
|
|
@ -10,11 +10,14 @@ struct PackedQuad {
|
|||
StructuredBuffer<PackedQuad> quadBuffer : register(t0);
|
||||
StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);
|
||||
|
||||
// Push constants: chunk index + quad offset for current draw call
|
||||
// Push constants (48 bytes = 12 x uint32)
|
||||
// CPU path: chunkIndex + quadOffset explicit
|
||||
// MDI path: flags bit 0 set, VS derives chunk from SV_VertexID via binary search
|
||||
struct VoxelPush {
|
||||
uint chunkIndex;
|
||||
uint quadOffset; // offset into mega quad buffer (in quads)
|
||||
uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9;
|
||||
uint flags; // bit 0: 1 = MDI mode (binary search), 0 = CPU mode
|
||||
uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8;
|
||||
};
|
||||
[[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999);
|
||||
|
||||
|
|
@ -46,6 +49,23 @@ void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,
|
|||
ao = (hi >> 9) & 0xFF;
|
||||
}
|
||||
|
||||
// Maps a global quad index to the chunk whose quad range contains it.
//
// Performs a binary search over chunkInfoBuffer[0 .. chunkCount). Each entry
// covers the half-open range [quadOffset, quadOffset + quadCount); the search
// returns the first entry whose range end lies beyond globalQuadIndex.
// Relies on the chunk ranges being laid out in ascending order by chunk index
// (contiguous packing in the mega-buffer).
// If no entry ends beyond globalQuadIndex the result is chunkCount.
// Cost is logarithmic in chunkCount (about 11 probes for 2048 chunks).
uint findChunkIndex(uint globalQuadIndex) {
    uint first = 0;
    uint last = chunkCount;

    [loop]
    while (first < last) {
        uint probe = (first + last) >> 1;
        GPUChunkInfo candidate = chunkInfoBuffer[probe];
        uint rangeEnd = candidate.quadOffset + candidate.quadCount;

        if (globalQuadIndex < rangeEnd) {
            // The target is inside this chunk or an earlier one.
            last = probe;
        } else {
            // This chunk ends at or before the target: discard it and everything below.
            first = probe + 1;
        }
    }

    return first;
}
|
||||
|
||||
// Face normals: +X, -X, +Y, -Y, +Z, -Z
|
||||
static const float3 faceNormals[6] = {
|
||||
float3( 1, 0, 0), float3(-1, 0, 0),
|
||||
|
|
@ -71,14 +91,22 @@ VSOutput main(uint vertexID : SV_VertexID)
|
|||
{
|
||||
VSOutput output;
|
||||
|
||||
// Look up chunk info via push constant (SV_InstanceID doesn't include StartInstanceLocation in D3D12)
|
||||
GPUChunkInfo info = chunkInfoBuffer[push.chunkIndex];
|
||||
// Determine quad index and chunk index based on rendering mode
|
||||
uint quadIndex;
|
||||
uint chunkIndex;
|
||||
|
||||
// 6 vertices per quad (2 triangles)
|
||||
// Use push.quadOffset instead of relying on StartVertexLocation in SV_VertexID
|
||||
uint localVertex = vertexID;
|
||||
uint quadIndex = push.quadOffset + (localVertex / 6);
|
||||
uint cornerIndex = localVertex % 6;
|
||||
if (push.flags & 1) {
|
||||
// MDI path: SV_VertexID includes StartVertexLocation (global quad address)
|
||||
quadIndex = vertexID / 6;
|
||||
chunkIndex = findChunkIndex(quadIndex);
|
||||
} else {
|
||||
// CPU path: push constants provide explicit offsets
|
||||
quadIndex = push.quadOffset + (vertexID / 6);
|
||||
chunkIndex = push.chunkIndex;
|
||||
}
|
||||
|
||||
GPUChunkInfo info = chunkInfoBuffer[chunkIndex];
|
||||
uint cornerIndex = vertexID % 6;
|
||||
|
||||
PackedQuad packed = quadBuffer[quadIndex];
|
||||
uint px, py, pz, w, h, face, matID, ao;
|
||||
|
|
|
|||
|
|
@ -150,11 +150,13 @@ void VoxelRenderer::createPipeline() {
|
|||
wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error);
|
||||
return;
|
||||
}
|
||||
gpuCullingEnabled_ = cullShader_.IsValid();
|
||||
if (!gpuCullingEnabled_) {
|
||||
wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU culling", wi::backlog::LogLevel::Warning);
|
||||
// GPU cull shader loads but MDI path is disabled pending barrier debugging.
|
||||
// CPU fallback with per-face-group DrawInstanced + backface culling is used instead.
|
||||
gpuCullingEnabled_ = false;
|
||||
if (cullShader_.IsValid()) {
|
||||
wi::backlog::post("VoxelRenderer: cull compute shader compiled (GPU cull path disabled, using CPU fallback)");
|
||||
} else {
|
||||
wi::backlog::post("VoxelRenderer: GPU frustum+backface culling enabled");
|
||||
wi::backlog::post("VoxelRenderer: cull compute shader not available", wi::backlog::LogLevel::Warning);
|
||||
}
|
||||
|
||||
// Pipeline: backface cull, depth test, opaque blend, triangle list
|
||||
|
|
@ -303,6 +305,38 @@ void VoxelRenderer::updateMeshes(VoxelWorld& world) {
|
|||
}
|
||||
}
|
||||
|
||||
// ── Frustum plane extraction (Gribb-Hartmann method) ────────────
|
||||
static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
|
||||
XMFLOAT4X4 m;
|
||||
XMStoreFloat4x4(&m, vp);
|
||||
|
||||
// Left
|
||||
planes[0] = XMFLOAT4(m._14 + m._11, m._24 + m._21, m._34 + m._31, m._44 + m._41);
|
||||
// Right
|
||||
planes[1] = XMFLOAT4(m._14 - m._11, m._24 - m._21, m._34 - m._31, m._44 - m._41);
|
||||
// Bottom
|
||||
planes[2] = XMFLOAT4(m._14 + m._12, m._24 + m._22, m._34 + m._32, m._44 + m._42);
|
||||
// Top
|
||||
planes[3] = XMFLOAT4(m._14 - m._12, m._24 - m._22, m._34 - m._32, m._44 - m._42);
|
||||
// Near
|
||||
planes[4] = XMFLOAT4(m._13, m._23, m._33, m._43);
|
||||
// Far
|
||||
planes[5] = XMFLOAT4(m._14 - m._13, m._24 - m._23, m._34 - m._33, m._44 - m._43);
|
||||
|
||||
// Normalize each plane
|
||||
for (int i = 0; i < 6; i++) {
|
||||
float len = std::sqrt(planes[i].x * planes[i].x +
|
||||
planes[i].y * planes[i].y +
|
||||
planes[i].z * planes[i].z);
|
||||
if (len > 0.0001f) {
|
||||
planes[i].x /= len;
|
||||
planes[i].y /= len;
|
||||
planes[i].z /= len;
|
||||
planes[i].w /= len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Render pass ─────────────────────────────────────────────────
|
||||
|
||||
void VoxelRenderer::render(
|
||||
|
|
@ -325,22 +359,149 @@ void VoxelRenderer::render(
|
|||
cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
|
||||
}
|
||||
|
||||
// Per-frame constants
|
||||
// Per-frame constants (with frustum planes for GPU cull shader)
|
||||
VoxelConstants cb = {};
|
||||
XMStoreFloat4x4(&cb.viewProjection, camera.GetViewProjection());
|
||||
XMMATRIX vpMatrix = camera.GetViewProjection();
|
||||
XMStoreFloat4x4(&cb.viewProjection, vpMatrix);
|
||||
cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f);
|
||||
cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f);
|
||||
cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f);
|
||||
cb.chunkSize = (float)CHUNK_SIZE;
|
||||
cb.textureTiling = 0.25f;
|
||||
cb.chunkCount = chunkCount_;
|
||||
extractFrustumPlanes(vpMatrix, cb.frustumPlanes);
|
||||
dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb));
|
||||
|
||||
// CPU frustum culling
|
||||
// Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
|
||||
// CPU-side mirror of the shader's push-constant block (register b999).
// The layout is twelve 32-bit words (48 bytes) and has to stay in step with
// the HLSL VoxelPush declaration.
struct VoxelPush {
    uint32_t chunkIndex;  // CPU path: index of the chunk being drawn
    uint32_t quadOffset;  // CPU path: offset into the mega quad buffer (in quads)
    uint32_t flags;       // bit 0: 1 = MDI mode, 0 = CPU mode
    uint32_t pad[9];      // filler up to the 12-word block size
};
// Fail the build if the block ever drifts from the 48-byte layout.
static_assert(sizeof(VoxelPush) == 12 * sizeof(uint32_t),
              "VoxelPush must be exactly 48 bytes (12 x uint32)");
|
||||
|
||||
visibleChunks_ = 0;
|
||||
drawCalls_ = 0;
|
||||
|
||||
// ── GPU Cull + MDI path ────────────────────────────────────────
|
||||
if (gpuCullingEnabled_) {
|
||||
// Zero the draw count buffer (sets state to COPY_DST)
|
||||
uint32_t zero = 0;
|
||||
dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t));
|
||||
// Touch indirect args buffer to establish COPY_DST state
|
||||
dev->UpdateBuffer(&indirectArgsBuffer_, &zero, cmd, sizeof(uint32_t));
|
||||
|
||||
// Barriers: COPY_DST → UAV for compute shader writes
|
||||
GPUBarrier preBarriers[] = {
|
||||
GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
|
||||
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
|
||||
};
|
||||
dev->Barrier(preBarriers, 2, cmd);
|
||||
|
||||
// Timestamp: cull begin
|
||||
dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd);
|
||||
|
||||
// Dispatch GPU frustum + backface cull compute shader
|
||||
dev->BindComputeShader(&cullShader_, cmd);
|
||||
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
|
||||
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
|
||||
dev->BindUAV(&indirectArgsBuffer_, 0, cmd);
|
||||
dev->BindUAV(&drawCountBuffer_, 1, cmd);
|
||||
dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd);
|
||||
|
||||
// Timestamp: cull end
|
||||
dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd);
|
||||
|
||||
// Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount
|
||||
GPUBarrier postBarriers[] = {
|
||||
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
|
||||
GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
|
||||
};
|
||||
dev->Barrier(postBarriers, 2, cmd);
|
||||
|
||||
// Set MDI flag in push constants (VS uses binary search for chunk index)
|
||||
VoxelPush pushData = {};
|
||||
pushData.flags = 1; // MDI mode
|
||||
dev->PushConstants(&pushData, sizeof(pushData), cmd);
|
||||
|
||||
// ── Render pass ────────────────────────────────────────────
|
||||
RenderPassImage rp[] = {
|
||||
RenderPassImage::RenderTarget(
|
||||
&renderTarget,
|
||||
RenderPassImage::LoadOp::CLEAR,
|
||||
RenderPassImage::StoreOp::STORE,
|
||||
ResourceState::SHADER_RESOURCE,
|
||||
ResourceState::SHADER_RESOURCE
|
||||
),
|
||||
RenderPassImage::DepthStencil(
|
||||
&depthBuffer,
|
||||
RenderPassImage::LoadOp::CLEAR,
|
||||
RenderPassImage::StoreOp::STORE,
|
||||
ResourceState::DEPTHSTENCIL,
|
||||
ResourceState::DEPTHSTENCIL,
|
||||
ResourceState::DEPTHSTENCIL
|
||||
),
|
||||
};
|
||||
dev->RenderPassBegin(rp, 2, cmd);
|
||||
|
||||
Viewport vp;
|
||||
vp.width = (float)renderTarget.GetDesc().width;
|
||||
vp.height = (float)renderTarget.GetDesc().height;
|
||||
vp.min_depth = 0.0f;
|
||||
vp.max_depth = 1.0f;
|
||||
dev->BindViewports(1, &vp, cmd);
|
||||
|
||||
Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
|
||||
dev->BindScissorRects(1, &scissor, cmd);
|
||||
|
||||
dev->BindPipelineState(&pso_, cmd);
|
||||
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
|
||||
dev->BindResource(&megaQuadBuffer_, 0, cmd);
|
||||
dev->BindResource(&textureArray_, 1, cmd);
|
||||
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
|
||||
dev->BindSampler(&sampler_, 0, cmd);
|
||||
|
||||
// Timestamp: draw begin
|
||||
dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd);
|
||||
|
||||
// Single MDI call: GPU cull shader filled the indirect args
|
||||
dev->DrawInstancedIndirectCount(
|
||||
&indirectArgsBuffer_, 0,
|
||||
&drawCountBuffer_, 0,
|
||||
MAX_DRAWS, cmd
|
||||
);
|
||||
drawCalls_ = 1;
|
||||
|
||||
// Timestamp: draw end
|
||||
dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd);
|
||||
|
||||
dev->RenderPassEnd(cmd);
|
||||
|
||||
// Resolve timestamps for readback (results available next frame)
|
||||
dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd);
|
||||
|
||||
// Read back previous frame's timestamps (persistently mapped READBACK buffer)
|
||||
uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
|
||||
if (tsData) {
|
||||
double freq = (double)dev->GetTimestampFrequency();
|
||||
if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) {
|
||||
gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0);
|
||||
}
|
||||
if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) {
|
||||
gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0);
|
||||
}
|
||||
}
|
||||
|
||||
// GPU cull handles visibility counting — approximate from chunkCount
|
||||
visibleChunks_ = chunkCount_; // exact count would require readback of drawCount
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// ── CPU Fallback: frustum + backface cull + per-face-group draws ──
|
||||
wi::primitive::Frustum frustum;
|
||||
frustum.Create(camera.GetViewProjection());
|
||||
|
||||
// ── Render pass: color + depth ────────────────────────────────
|
||||
RenderPassImage rp[] = {
|
||||
RenderPassImage::RenderTarget(
|
||||
&renderTarget,
|
||||
|
|
@ -372,22 +533,11 @@ void VoxelRenderer::render(
|
|||
|
||||
dev->BindPipelineState(&pso_, cmd);
|
||||
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
|
||||
dev->BindResource(&megaQuadBuffer_, 0, cmd); // t0: mega quad buffer
|
||||
dev->BindResource(&textureArray_, 1, cmd); // t1: material textures
|
||||
dev->BindResource(&chunkInfoBuffer_, 2, cmd); // t2: chunk info
|
||||
dev->BindResource(&megaQuadBuffer_, 0, cmd);
|
||||
dev->BindResource(&textureArray_, 1, cmd);
|
||||
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
|
||||
dev->BindSampler(&sampler_, 0, cmd);
|
||||
|
||||
visibleChunks_ = 0;
|
||||
drawCalls_ = 0;
|
||||
|
||||
// Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
|
||||
struct VoxelPush {
|
||||
uint32_t chunkIndex;
|
||||
uint32_t quadOffset; // offset into mega quad buffer (in quads)
|
||||
uint32_t pad[10];
|
||||
};
|
||||
|
||||
// Simple DrawInstanced loop with frustum culling + push constants
|
||||
for (uint32_t i = 0; i < chunkCount_; i++) {
|
||||
const auto& slot = chunkSlots_[i];
|
||||
if (slot.quadCount == 0) continue;
|
||||
|
|
@ -406,17 +556,33 @@ void VoxelRenderer::render(
|
|||
if (!frustum.CheckBoxFast(aabb)) continue;
|
||||
|
||||
visibleChunks_++;
|
||||
const auto& info = cpuChunkInfo_[i];
|
||||
|
||||
// Pass chunk index AND quad offset via push constants
|
||||
// (SV_VertexID/SV_InstanceID offsets unreliable across drivers)
|
||||
VoxelPush pushData = {};
|
||||
pushData.chunkIndex = i;
|
||||
pushData.quadOffset = slot.quadOffset;
|
||||
dev->PushConstants(&pushData, sizeof(pushData), cmd);
|
||||
// Per-face-group draws with backface culling
|
||||
for (uint32_t f = 0; f < 6; f++) {
|
||||
if (info.faceCounts[f] == 0) continue;
|
||||
|
||||
// startVertexLocation = 0: the VS computes quad address from push.quadOffset
|
||||
dev->DrawInstanced(slot.quadCount * 6, 1, 0, 0, cmd);
|
||||
drawCalls_++;
|
||||
// Backface cull: skip face groups pointing away from camera
|
||||
bool backFacing = false;
|
||||
switch (f) {
|
||||
case 0: backFacing = (camera.Eye.x < aabbMin.x); break; // +X
|
||||
case 1: backFacing = (camera.Eye.x > aabbMax.x); break; // -X
|
||||
case 2: backFacing = (camera.Eye.y < aabbMin.y); break; // +Y
|
||||
case 3: backFacing = (camera.Eye.y > aabbMax.y); break; // -Y
|
||||
case 4: backFacing = (camera.Eye.z < aabbMin.z); break; // +Z
|
||||
case 5: backFacing = (camera.Eye.z > aabbMax.z); break; // -Z
|
||||
}
|
||||
if (backFacing) continue;
|
||||
|
||||
VoxelPush pushData = {};
|
||||
pushData.chunkIndex = i;
|
||||
pushData.quadOffset = slot.quadOffset + info.faceOffsets[f];
|
||||
pushData.flags = 0; // CPU mode
|
||||
dev->PushConstants(&pushData, sizeof(pushData), cmd);
|
||||
|
||||
dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd);
|
||||
drawCalls_++;
|
||||
}
|
||||
}
|
||||
|
||||
dev->RenderPassEnd(cmd);
|
||||
|
|
@ -583,7 +749,7 @@ void VoxelRenderPath::Compose(CommandList cmd) const {
|
|||
+ "/" + std::to_string(renderer.getChunkCount()) + "\n";
|
||||
stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n";
|
||||
stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls())
|
||||
+ " (DrawInstanced + CPU cull + backface)\n";
|
||||
+ (renderer.isGpuCulling() ? " (MDI + GPU cull)" : " (DrawInstanced + CPU cull + backface)") + "\n";
|
||||
|
||||
char cullStr[16], drawStr[16];
|
||||
snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs());
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue