bvle-voxels/src/voxel/VoxelRenderer.cpp
Samuel Bouchet 6b41da0932 Phase 6.2: RT shadows — inline ray queries with BLAS/TLAS fix
Add shadow compute shader (voxelShadowCS.hlsl) that traces rays toward
the sun using DXR inline ray queries (RayQuery<>, SM 6.5). Shadows
modulate voxelRT_ in-place via RWTexture2D (no extra render target).

Key fixes to Phase 6.1 BLAS/TLAS infrastructure:
- Sequential index buffer required: Wicked treats IndexCount=0 with
  non-null IndexBuffer as "0 indexed triangles" → empty BLAS
- Memory barriers between BLAS→TLAS→RT: without GPUBarrier::Memory()
  the TLAS build races with BLAS builds, causing zero ray hits
- inverseViewProjection added to VoxelCB for depth reconstruction

F5 toggles shadows OFF→ON→DEBUG (red=hit, green=miss, blue=backface).
2026-03-28 20:01:18 +01:00

2631 lines
110 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "VoxelRenderer.h"
#include "wiJobSystem.h"
#include "wiPrimitive.h"
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <unordered_map>
using namespace wi::graphics;
namespace voxel {
// ── VoxelRenderer Implementation ────────────────────────────────
// Compiler-generated constructor; device resources are created by initialize().
VoxelRenderer::VoxelRenderer() = default;
// The destructor delegates to shutdown(), which clears the CPU-side containers
// and resets the initialized flag.
VoxelRenderer::~VoxelRenderer() { shutdown(); }
// Creates every GPU resource owned by the renderer, in this order: main pipeline,
// procedural textures, draw buffers, timestamp queries, optional compute meshers and
// optional ray-tracing resources.
//
// dev: graphics device used for all resource creation; stored in device_.
//
// Returns early, without setting initialized_ to true, when dev is null or when the
// main pipeline state could not be created. Optional features (GPU mesher, smooth
// mesher, ray tracing, RT shadows) only log and disable themselves on failure.
void VoxelRenderer::initialize(GraphicsDevice* dev) {
    device_ = dev;
    if (!device_) return;
    createPipeline();
    if (!pso_.IsValid()) {
        wi::backlog::post("VoxelRenderer: pipeline creation failed", wi::backlog::LogLevel::Error);
        initialized_ = false;
        return;
    }
    generateTextures();

    // Create mega quad buffer (SRV for vertex pulling)
    GPUBufferDesc megaDesc;
    megaDesc.size = MEGA_BUFFER_CAPACITY * sizeof(PackedQuad);
    megaDesc.bind_flags = BindFlag::SHADER_RESOURCE;
    megaDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
    megaDesc.stride = sizeof(PackedQuad);
    megaDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&megaDesc, nullptr, &megaQuadBuffer_);

    // Create chunk info buffer (SRV for VS chunk lookup)
    GPUBufferDesc infoDesc;
    infoDesc.size = MAX_CHUNKS * sizeof(GPUChunkInfo);
    infoDesc.bind_flags = BindFlag::SHADER_RESOURCE;
    infoDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
    infoDesc.stride = sizeof(GPUChunkInfo);
    infoDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&infoDesc, nullptr, &chunkInfoBuffer_);

    // Create indirect args buffer (for DrawInstancedIndirectCount, up to 6 draws per chunk)
    // UAV bind flag needed for GPU cull compute shader to write args
    GPUBufferDesc argsDesc;
    argsDesc.size = MAX_DRAWS * sizeof(IndirectDrawArgs);
    argsDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
    argsDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED | ResourceMiscFlag::INDIRECT_ARGS;
    argsDesc.stride = sizeof(IndirectDrawArgs);
    argsDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&argsDesc, nullptr, &indirectArgsBuffer_);

    // Create draw count buffer (single uint32, raw for RWByteAddressBuffer)
    // UAV bind flag needed for GPU cull compute shader atomic counter
    GPUBufferDesc countDesc;
    countDesc.size = sizeof(uint32_t);
    countDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
    countDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW | ResourceMiscFlag::INDIRECT_ARGS;
    countDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&countDesc, nullptr, &drawCountBuffer_);

    // ── GPU Timestamp Queries ──────────────────────────────────────
    GPUQueryHeapDesc queryDesc;
    queryDesc.type = GpuQueryType::TIMESTAMP;
    queryDesc.query_count = TS_COUNT;
    device_->CreateQueryHeap(&queryDesc, &timestampHeap_);
    GPUBufferDesc readbackDesc;
    readbackDesc.size = TS_COUNT * sizeof(uint64_t);
    readbackDesc.usage = Usage::READBACK;
    device_->CreateBuffer(&readbackDesc, nullptr, &timestampReadback_);

    // ── GPU Compute Mesher resources ─────────────────────────────
    wi::renderer::LoadShader(ShaderStage::CS, meshShader_, "voxel/voxelMeshCS.cso");
    gpuMesherAvailable_ = meshShader_.IsValid();
    if (gpuMesherAvailable_) {
        // Voxel data buffer: 1 chunk's worth (32^3 voxels / 2 per uint = 16384 uint)
        GPUBufferDesc voxDesc;
        voxDesc.size = (CHUNK_VOLUME / 2) * sizeof(uint32_t);
        voxDesc.bind_flags = BindFlag::SHADER_RESOURCE;
        voxDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        voxDesc.stride = sizeof(uint32_t);
        voxDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&voxDesc, nullptr, &voxelDataBuffer_);

        // GPU quad output: same capacity as mega-buffer
        GPUBufferDesc gpuQDesc;
        gpuQDesc.size = MEGA_BUFFER_CAPACITY * sizeof(uint64_t); // PackedQuad = 8 bytes
        gpuQDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
        gpuQDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        gpuQDesc.stride = sizeof(uint64_t); // uint2 = 8 bytes
        gpuQDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&gpuQDesc, nullptr, &gpuQuadBuffer_);

        // Quad counter
        GPUBufferDesc cntDesc;
        cntDesc.size = sizeof(uint32_t);
        cntDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
        cntDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW;
        cntDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&cntDesc, nullptr, &gpuQuadCounter_);

        // Readback buffer for quad counter (GPU → CPU)
        GPUBufferDesc rbDesc;
        rbDesc.size = sizeof(uint32_t);
        rbDesc.usage = Usage::READBACK;
        device_->CreateBuffer(&rbDesc, nullptr, &meshCounterReadback_);
        wi::backlog::post("VoxelRenderer: GPU compute mesher available");
    } else {
        wi::backlog::post("VoxelRenderer: GPU compute mesher not available", wi::backlog::LogLevel::Warning);
    }

    // ── GPU Smooth Mesher resources (Phase 5.3) ───────────────────
    wi::renderer::LoadShader(ShaderStage::CS, smoothCentroidShader_, "voxel/voxelSmoothCentroidCS.cso");
    wi::renderer::LoadShader(ShaderStage::CS, smoothMeshShader_, "voxel/voxelSmoothCS.cso");
    if (smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid()) {
        // Centroid grid buffer (34^3 float4, reused per-chunk sequentially)
        GPUBufferDesc cgDesc;
        cgDesc.size = CENTROID_GRID_SIZE * 16; // float4 = 16 bytes
        cgDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
        cgDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        cgDesc.stride = 16;
        cgDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&cgDesc, nullptr, &centroidGridBuffer_);

        // GPU smooth vertex output buffer (GPUSmoothVertex = 32 bytes)
        GPUBufferDesc svDesc;
        svDesc.size = MAX_GPU_SMOOTH_VERTICES * 32;
        svDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
        svDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        svDesc.stride = 32;
        svDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&svDesc, nullptr, &gpuSmoothVertexBuffer_);

        // Atomic counter
        GPUBufferDesc scDesc;
        scDesc.size = sizeof(uint32_t);
        scDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
        scDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW;
        scDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&scDesc, nullptr, &gpuSmoothCounter_);

        // Readback
        GPUBufferDesc srbDesc;
        srbDesc.size = sizeof(uint32_t);
        srbDesc.usage = Usage::READBACK;
        device_->CreateBuffer(&srbDesc, nullptr, &smoothCounterReadback_);
        wi::backlog::post("VoxelRenderer: GPU smooth mesher available (2-pass with smooth normals)");
    }

    // ── Ray Tracing (Phase 6.1) ────────────────────────────────────
    rtAvailable_ = device_->CheckCapability(GraphicsDeviceCapability::RAYTRACING);
    if (rtAvailable_) {
        wi::renderer::LoadShader(ShaderStage::CS, blasExtractShader_, "voxel/voxelBLASExtractCS.cso");
        if (blasExtractShader_.IsValid()) {
            // BLAS position buffer: 6 float3 per quad (non-indexed triangles)
            // Use BUFFER_RAW (ByteAddressBuffer) — structured buffers may not work as BLAS vertex input
            GPUBufferDesc posDesc;
            posDesc.size = (uint64_t)MAX_BLAS_VERTICES * sizeof(float) * 3; // float3 per vertex
            posDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
            posDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW;
            posDesc.stride = 0; // raw buffer, no stride
            posDesc.usage = Usage::DEFAULT;
            bool ok = device_->CreateBuffer(&posDesc, nullptr, &blasPositionBuffer_);

            // Sequential index buffer for BLAS (DX12 requires valid index buffer,
            // Wicked always writes IndexBuffer GPU address even for "non-indexed").
            GPUBufferDesc idxDesc;
            idxDesc.size = (uint64_t)MAX_BLAS_VERTICES * sizeof(uint32_t);
            idxDesc.bind_flags = BindFlag::SHADER_RESOURCE;
            idxDesc.usage = Usage::DEFAULT;
            auto fillIndices = [](void* dest) {
                uint32_t* p = (uint32_t*)dest;
                for (uint32_t i = 0; i < MAX_BLAS_VERTICES; i++)
                    p[i] = i;
            };
            bool okIdx = device_->CreateBuffer2(&idxDesc, fillIndices, &blasIndexBuffer_);
            if (ok && blasPositionBuffer_.IsValid() && okIdx && blasIndexBuffer_.IsValid()) {
                device_->SetName(&blasPositionBuffer_, "VoxelRenderer::blasPositionBuffer");
                device_->SetName(&blasIndexBuffer_, "VoxelRenderer::blasIndexBuffer");
                wi::backlog::post("VoxelRenderer: RT available (BLAS pos "
                    + std::to_string(posDesc.size / (1024*1024)) + " MB + idx "
                    + std::to_string(idxDesc.size / (1024*1024)) + " MB)");
            } else {
                rtAvailable_ = false;
                wi::backlog::post("VoxelRenderer: RT buffer creation failed", wi::backlog::LogLevel::Warning);
            }
        } else {
            rtAvailable_ = false;
            wi::backlog::post("VoxelRenderer: RT available but BLAS extraction shader failed", wi::backlog::LogLevel::Warning);
        }

        // ── RT Shadows (Phase 6.2) ────────────────────────────────────
        // The BLAS setup above clears rtAvailable_ when it fails. Shadows are only
        // attempted while the flag is still set, so they can never end up enabled
        // without the BLAS extraction shader and its buffers.
        if (rtAvailable_) {
            wi::renderer::LoadShader(ShaderStage::CS, shadowShader_, "voxel/voxelShadowCS.cso",
                wi::graphics::ShaderModel::SM_6_5);
            if (shadowShader_.IsValid()) {
                rtShadowsEnabled_ = true;
                wi::backlog::post("VoxelRenderer: RT shadows available");
            } else {
                wi::backlog::post("VoxelRenderer: RT shadow shader failed to compile",
                    wi::backlog::LogLevel::Warning);
            }
        } else {
            rtShadowsEnabled_ = false;
            wi::backlog::post("VoxelRenderer: RT shadows disabled (BLAS setup failed)",
                wi::backlog::LogLevel::Warning);
        }
    } else {
        wi::backlog::post("VoxelRenderer: RT not available (GPU does not support ray tracing)");
    }

    // Pre-size the CPU-side mirrors so later rebuilds do not reallocate.
    cpuMegaQuads_.reserve(MEGA_BUFFER_CAPACITY);
    cpuChunkInfo_.reserve(MAX_CHUNKS);
    chunkSlots_.reserve(MAX_CHUNKS);
    cpuIndirectArgs_.reserve(MAX_CHUNKS);
    initialized_ = true;
    wi::backlog::post("VoxelRenderer: initialized (mega-buffer: "
        + std::to_string(MEGA_BUFFER_CAPACITY) + " quads capacity)");
}
// Marks the renderer as uninitialized and empties the CPU-side chunk/quad mirrors.
// GPU resource handles are not explicitly reset here.
void VoxelRenderer::shutdown() {
    initialized_ = false;
    cpuMegaQuads_.clear();
    cpuChunkInfo_.clear();
    chunkSlots_.clear();
}
void VoxelRenderer::createPipeline() {
// Constant buffer for per-frame data
GPUBufferDesc cbDesc;
cbDesc.size = sizeof(VoxelConstants);
cbDesc.bind_flags = BindFlag::CONSTANT_BUFFER;
cbDesc.usage = Usage::DEFAULT;
device_->CreateBuffer(&cbDesc, nullptr, &constantBuffer_);
// Anisotropic wrap sampler
SamplerDesc samplerDesc;
samplerDesc.filter = Filter::ANISOTROPIC;
samplerDesc.address_u = TextureAddressMode::WRAP;
samplerDesc.address_v = TextureAddressMode::WRAP;
samplerDesc.address_w = TextureAddressMode::WRAP;
samplerDesc.max_anisotropy = 16;
device_->CreateSampler(&samplerDesc, &sampler_);
// Load shaders
wi::renderer::LoadShader(ShaderStage::VS, vertexShader_, "voxel/voxelVS.cso");
wi::renderer::LoadShader(ShaderStage::PS, pixelShader_, "voxel/voxelPS.cso");
wi::renderer::LoadShader(ShaderStage::CS, cullShader_, "voxel/voxelCullCS.cso");
if (!vertexShader_.IsValid() || !pixelShader_.IsValid()) {
wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error);
return;
}
if (cullShader_.IsValid()) {
gpuCullingEnabled_ = true;
wi::backlog::post("VoxelRenderer: GPU cull compute shader enabled");
} else {
gpuCullingEnabled_ = false;
wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU fallback", wi::backlog::LogLevel::Warning);
}
// Pipeline: backface cull, depth test, opaque blend, triangle list
PipelineStateDesc psoDesc;
psoDesc.vs = &vertexShader_;
psoDesc.ps = &pixelShader_;
psoDesc.rs = wi::renderer::GetRasterizerState(wi::enums::RSTYPE_FRONT);
psoDesc.dss = wi::renderer::GetDepthStencilState(wi::enums::DSSTYPE_DEFAULT);
psoDesc.bs = wi::renderer::GetBlendState(wi::enums::BSTYPE_OPAQUE);
psoDesc.pt = PrimitiveTopology::TRIANGLELIST;
device_->CreatePipelineState(&psoDesc, &pso_);
// ── Toping pipeline (Phase 4) ────────────────────────────────
wi::renderer::LoadShader(ShaderStage::VS, topingVS_, "voxel/voxelTopingVS.cso");
wi::renderer::LoadShader(ShaderStage::PS, topingPS_, "voxel/voxelTopingPS.cso");
if (topingVS_.IsValid() && topingPS_.IsValid()) {
PipelineStateDesc topingPsoDesc;
topingPsoDesc.vs = &topingVS_;
topingPsoDesc.ps = &topingPS_;
topingPsoDesc.rs = wi::renderer::GetRasterizerState(wi::enums::RSTYPE_FRONT);
topingPsoDesc.dss = wi::renderer::GetDepthStencilState(wi::enums::DSSTYPE_DEFAULT);
topingPsoDesc.bs = wi::renderer::GetBlendState(wi::enums::BSTYPE_OPAQUE);
topingPsoDesc.pt = PrimitiveTopology::TRIANGLELIST;
device_->CreatePipelineState(&topingPsoDesc, &topingPso_);
wi::backlog::post("VoxelRenderer: toping pipeline created");
} else {
wi::backlog::post("VoxelRenderer: toping shader loading failed", wi::backlog::LogLevel::Warning);
}
// ── Smooth surface pipeline (Phase 5) ────────────────────────
wi::renderer::LoadShader(ShaderStage::VS, smoothVS_, "voxel/voxelSmoothVS.cso");
wi::renderer::LoadShader(ShaderStage::PS, smoothPS_, "voxel/voxelSmoothPS.cso");
if (smoothVS_.IsValid() && smoothPS_.IsValid()) {
// Custom rasterizer with depth bias to resolve z-fighting at smooth↔blocky boundaries
smoothRasterizer_ = *wi::renderer::GetRasterizerState(wi::enums::RSTYPE_FRONT);
smoothRasterizer_.depth_bias = 2; // small integer bias
smoothRasterizer_.slope_scaled_depth_bias = 1.0f; // scale with surface slope
PipelineStateDesc smoothPsoDesc;
smoothPsoDesc.vs = &smoothVS_;
smoothPsoDesc.ps = &smoothPS_;
smoothPsoDesc.rs = &smoothRasterizer_;
smoothPsoDesc.dss = wi::renderer::GetDepthStencilState(wi::enums::DSSTYPE_DEFAULT);
smoothPsoDesc.bs = wi::renderer::GetBlendState(wi::enums::BSTYPE_OPAQUE);
smoothPsoDesc.pt = PrimitiveTopology::TRIANGLELIST;
device_->CreatePipelineState(&smoothPsoDesc, &smoothPso_);
wi::backlog::post("VoxelRenderer: smooth surface pipeline created");
} else {
wi::backlog::post("VoxelRenderer: smooth shader loading failed", wi::backlog::LogLevel::Warning);
}
}
// ── Procedural texture generation ───────────────────────────────
// Fills a w×h RGBA8 image with procedural noise.
//   pixels          destination, at least w*h*4 bytes, row-major, 4 bytes per texel
//   r0,g0,b0        colour at blend factor 0
//   r1,g1,b1        colour at blend factor 1
//   seed            seed for the colour noise stream; the height stream derives from it
//   heightFreq      frequency multiplier for the alpha (heightmap) pattern
//   heightContrast  multiplier applied to the height value before clamping to [0,1]
// RGB blends between the two colours; alpha stores the height value scaled to 0..255.
static void generateNoiseTexture(uint8_t* pixels, int w, int h,
    uint8_t r0, uint8_t g0, uint8_t b0,
    uint8_t r1, uint8_t g1, uint8_t b1,
    uint32_t seed, float heightFreq = 1.0f, float heightContrast = 1.0f)
{
    // Two linear-congruential streams: one for colour, one for the heightmap.
    uint32_t colorState = seed;
    uint32_t heightState = seed * 7919u + 104729u;

    uint8_t* texel = pixels;  // advances 4 bytes per texel, row by row
    for (int row = 0; row < h; ++row) {
        const float fy = (float)row / h;
        for (int col = 0; col < w; ++col, texel += 4) {
            const float fx = (float)col / w;

            // Colour channels: mix of white noise and a sin/cos pattern.
            colorState = colorState * 1664525u + 1013904223u;
            const float noise = (float)(colorState & 0xFFFF) / 65535.0f;
            const float pattern = 0.5f + 0.5f * std::sin(fx * 20.0f + noise * 3.0f) *
                std::cos(fy * 20.0f + noise * 3.0f);
            const float t = noise * 0.6f + pattern * 0.4f;
            const auto blend = [t](uint8_t from, uint8_t to) {
                return (uint8_t)(from + (to - from) * t);
            };
            texel[0] = blend(r0, r1);
            texel[1] = blend(g0, g1);
            texel[2] = blend(b0, b1);

            // Alpha channel: independent height noise for material blending.
            heightState = heightState * 1664525u + 1013904223u;
            const float hn = (float)(heightState & 0xFFFF) / 65535.0f;
            const float hPattern = 0.5f + 0.5f * std::sin(fx * 12.0f * heightFreq + hn * 2.0f) *
                std::cos(fy * 12.0f * heightFreq + hn * 2.0f);
            float heightVal = hn * 0.5f + hPattern * 0.5f;
            heightVal = std::clamp(heightVal * heightContrast, 0.0f, 1.0f);
            texel[3] = (uint8_t)(heightVal * 255.0f);
        }
    }
}
void VoxelRenderer::generateTextures() {
const int TEX_SIZE = 256;
const int NUM_MATERIALS = 6;
std::vector<uint8_t> allPixels(TEX_SIZE * TEX_SIZE * 4 * NUM_MATERIALS);
struct MatColor {
uint8_t r0,g0,b0, r1,g1,b1;
uint32_t seed;
float heightFreq; // heightmap noise frequency
float heightContrast; // heightmap contrast (higher = more defined peaks)
};
MatColor colors[NUM_MATERIALS] = {
{ 60, 140, 40, 80, 180, 60, 101, 1.5f, 0.8f }, // 1: Grass: medium bumps
{ 100, 70, 40, 140, 100, 60, 202, 0.8f, 0.6f }, // 2: Dirt: smooth mounds
{ 80, 80, 90, 120, 120, 130, 303, 2.5f, 0.5f }, // 3: Stone (blocky): darker blue-gray
{ 220, 200, 130, 245, 230, 160, 404, 3.0f, 0.4f }, // 4: Sand: warmer yellow, fine
{ 220, 225, 230, 245, 248, 252, 505, 1.0f, 0.5f }, // 5: Snow: smooth, soft
{ 100, 100, 110, 145, 145, 155, 606, 2.0f, 0.6f }, // 6: SmoothStone: lighter blue-gray, distinct from blocky stone
};
for (int i = 0; i < NUM_MATERIALS; i++) {
auto& c = colors[i];
generateNoiseTexture(
allPixels.data() + i * TEX_SIZE * TEX_SIZE * 4,
TEX_SIZE, TEX_SIZE,
c.r0, c.g0, c.b0, c.r1, c.g1, c.b1, c.seed,
c.heightFreq, c.heightContrast
);
}
TextureDesc texDesc;
texDesc.type = TextureDesc::Type::TEXTURE_2D;
texDesc.width = TEX_SIZE;
texDesc.height = TEX_SIZE;
texDesc.array_size = NUM_MATERIALS;
texDesc.mip_levels = 1;
texDesc.format = Format::R8G8B8A8_UNORM;
texDesc.bind_flags = BindFlag::SHADER_RESOURCE;
texDesc.usage = Usage::DEFAULT;
std::vector<SubresourceData> subData(NUM_MATERIALS);
for (int i = 0; i < NUM_MATERIALS; i++) {
subData[i].data_ptr = allPixels.data() + i * TEX_SIZE * TEX_SIZE * 4;
subData[i].row_pitch = TEX_SIZE * 4;
subData[i].slice_pitch = TEX_SIZE * TEX_SIZE * 4;
}
device_->CreateTexture(&texDesc, subData.data(), &textureArray_);
}
// ── Mega-buffer rebuild ─────────────────────────────────────────
// Concatenates the quads of every non-empty chunk into cpuMegaQuads_ and records,
// per chunk, its slot, its GPU info entry and the indices of its six face neighbours.
// The rebuild is always done from scratch. Chunks whose quads would exceed
// MEGA_BUFFER_CAPACITY are skipped.
void VoxelRenderer::rebuildMegaBuffer(VoxelWorld& world) {
    chunkSlots_.clear();
    cpuChunkInfo_.clear();
    cpuMegaQuads_.clear();

    // Chunk position → slot index, keyed by the low 16 bits of each coordinate.
    const auto packPos = [](const ChunkPos& p) -> uint64_t {
        const uint64_t kx = (uint64_t)(uint16_t)p.x;
        const uint64_t ky = (uint64_t)(uint16_t)p.y << 16;
        const uint64_t kz = (uint64_t)(uint16_t)p.z << 32;
        return kx | ky | kz;
    };
    std::unordered_map<uint64_t, uint32_t> slotByPos;

    const float debugColorFlag = debugFaceColors_ ? 1.0f : 0.0f;
    uint32_t writeCursor = 0;  // running quad offset into the mega buffer
    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        // Nothing to draw, or no room left for this chunk's quads.
        if (chunk.quadCount == 0 || writeCursor + chunk.quadCount > MEGA_BUFFER_CAPACITY) return;

        const uint32_t slotIndex = (uint32_t)chunkSlots_.size();

        ChunkSlot slot;
        slot.pos = pos;
        slot.quadOffset = writeCursor;
        slot.quadCount = chunk.quadCount;
        chunkSlots_.push_back(slot);

        GPUChunkInfo info = {};
        info.worldPos = XMFLOAT4(
            (float)(pos.x * CHUNK_SIZE),
            (float)(pos.y * CHUNK_SIZE),
            (float)(pos.z * CHUNK_SIZE),
            debugColorFlag
        );
        info.quadOffset = writeCursor;
        info.quadCount = chunk.quadCount;
        for (int face = 0; face < 6; face++) {
            info.faceOffsets[face] = chunk.faceOffsets[face];
            info.faceCounts[face] = chunk.faceCounts[face];
            info.neighbors[face] = 0xFFFFFFFF;  // "no neighbour" until resolved below
        }
        cpuChunkInfo_.push_back(info);

        slotByPos[packPos(pos)] = slotIndex;
        cpuMegaQuads_.insert(cpuMegaQuads_.end(), chunk.quads.begin(), chunk.quads.end());
        writeCursor += chunk.quadCount;
    });

    // Resolve neighbour indices now that every packed chunk has a slot.
    static const int kFaceDir[6][3] = {
        {1,0,0}, {-1,0,0}, {0,1,0}, {0,-1,0}, {0,0,1}, {0,0,-1}
    };
    const uint32_t slotCount = (uint32_t)chunkSlots_.size();
    for (uint32_t slotIdx = 0; slotIdx < slotCount; slotIdx++) {
        const auto& pos = chunkSlots_[slotIdx].pos;
        for (int face = 0; face < 6; face++) {
            const int* dir = kFaceDir[face];
            ChunkPos npos = { pos.x + dir[0], pos.y + dir[1], pos.z + dir[2] };
            const auto found = slotByPos.find(packPos(npos));
            if (found == slotByPos.end()) continue;
            cpuChunkInfo_[slotIdx].neighbors[face] = found->second;
        }
    }

    chunkCount_ = slotCount;
    totalQuads_ = writeCursor;
}
// Rebuilds chunkSlots_ and cpuChunkInfo_ for the GPU mesh path: every chunk gets an
// entry with zero quad offset/count (no CPU meshing happens here) and resolved
// face-neighbour indices.
void VoxelRenderer::rebuildChunkInfoOnly(VoxelWorld& world) {
    chunkSlots_.clear();
    cpuChunkInfo_.clear();

    // Chunk position → slot index, keyed by the low 16 bits of each coordinate.
    const auto packPos = [](const ChunkPos& p) -> uint64_t {
        const uint64_t kx = (uint64_t)(uint16_t)p.x;
        const uint64_t ky = (uint64_t)(uint16_t)p.y << 16;
        const uint64_t kz = (uint64_t)(uint16_t)p.z << 32;
        return kx | ky | kz;
    };
    std::unordered_map<uint64_t, uint32_t> slotByPos;

    // Pass 1: one slot + info entry per chunk, in forEachChunk order.
    const float debugColorFlag = debugFaceColors_ ? 1.0f : 0.0f;
    uint32_t nextSlot = 0;
    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        ChunkSlot slot;
        slot.pos = pos;
        slot.quadOffset = 0;
        slot.quadCount = 0;
        chunkSlots_.push_back(slot);

        GPUChunkInfo info = {};
        info.worldPos = XMFLOAT4(
            (float)(pos.x * CHUNK_SIZE),
            (float)(pos.y * CHUNK_SIZE),
            (float)(pos.z * CHUNK_SIZE),
            debugColorFlag
        );
        info.quadOffset = 0;
        info.quadCount = 0;
        for (int face = 0; face < 6; face++) {
            info.neighbors[face] = 0xFFFFFFFF;  // "no neighbour" until pass 2
        }
        cpuChunkInfo_.push_back(info);

        slotByPos[packPos(pos)] = nextSlot;
        ++nextSlot;
    });

    // Pass 2: look up each of the six face neighbours.
    static const int kFaceDir[6][3] = {
        {1,0,0}, {-1,0,0}, {0,1,0}, {0,-1,0}, {0,0,1}, {0,0,-1}
    };
    const uint32_t slotCount = (uint32_t)chunkSlots_.size();
    for (uint32_t slotIdx = 0; slotIdx < slotCount; slotIdx++) {
        const auto& pos = chunkSlots_[slotIdx].pos;
        for (int face = 0; face < 6; face++) {
            const int* dir = kFaceDir[face];
            ChunkPos npos = { pos.x + dir[0], pos.y + dir[1], pos.z + dir[2] };
            const auto found = slotByPos.find(packPos(npos));
            if (found == slotByPos.end()) continue;
            cpuChunkInfo_[slotIdx].neighbors[face] = found->second;
        }
    }

    chunkCount_ = slotCount;
}
// Per-update mesh maintenance. Does nothing without a device.
//  - GPU mesh path (enabled and available): clears chunk dirty flags, rebuilds only
//    the chunk info table, and raises the flags that trigger a GPU re-mesh.
//  - CPU path: re-meshes dirty chunks in parallel on the job system, records the
//    elapsed time, arms the GPU benchmark, then rebuilds the mega buffer.
void VoxelRenderer::updateMeshes(VoxelWorld& world) {
    if (!device_) return;

    if (gpuMeshEnabled_ && gpuMesherAvailable_) {
        // GPU mesh path: no CPU meshing, only bookkeeping.
        bool sawDirty = false;
        world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
            if (!chunk.dirty) return;
            sawDirty = true;
            chunk.dirty = false;
        });
        if (!sawDirty && !megaBufferDirty_) return;

        rebuildChunkInfoOnly(world);
        if (!gpuMeshDirty_) {
            // Non-fused dirty (e.g. initial load): the voxel cache was not filled by the
            // fused regen+pack step, so request both a repack and a GPU update.
            voxelCacheDirty_ = true;
            gpuMeshDirty_ = true;
        }
        // Otherwise the fused path already set gpuMeshDirty_ and the cache is clean.
        chunkInfoDirty_ = true;
        megaBufferDirty_ = false;
        return;
    }

    // CPU meshing path (fallback): gather the chunks that need re-meshing.
    std::vector<Chunk*> pending;
    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        if (chunk.dirty) pending.push_back(&chunk);
    });

    const bool haveWork = !pending.empty();
    if (haveWork) {
        // One job per dirty chunk; block until all of them have finished.
        const auto meshStart = std::chrono::high_resolution_clock::now();
        wi::jobsystem::context ctx;
        wi::jobsystem::Dispatch(ctx, (uint32_t)pending.size(), 1,
            [&pending, &world](wi::jobsystem::JobArgs args) {
                VoxelMesher::meshChunk(*pending[args.jobIndex], world);
            });
        wi::jobsystem::Wait(ctx);
        const auto meshEnd = std::chrono::high_resolution_clock::now();
        cpuMeshTimeMs_ = std::chrono::duration<float, std::milli>(meshEnd - meshStart).count();

        // Trigger GPU benchmark on next render frame
        if (gpuMesherAvailable_ && benchState_ == BenchState::IDLE) {
            benchState_ = BenchState::DISPATCH;
        }
    }

    if (haveWork || megaBufferDirty_) {
        rebuildMegaBuffer(world);
        megaBufferDirty_ = false;
    }
}
// ── GPU Mesh Benchmark (Phase 2.4) ──────────────────────────────
// Dispatches the baseline 1x1 GPU mesher for ALL chunks and measures timing.
// State machine: DISPATCH (frame N) → READBACK (frame N+1) → DONE.
//
// cmd:   command list all copies, dispatches and queries are recorded into.
// world: source of the voxel data; every chunk is packed and dispatched once.
//
// On return the quad counter and the two mesh timestamps have been scheduled for
// copy into their readback buffers and benchState_ is READBACK.
void VoxelRenderer::dispatchGpuMeshBenchmark(CommandList cmd, const VoxelWorld& world) const {
    auto* dev = device_;

    // Zero the quad counter
    uint32_t zero = 0;
    dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t));

    // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer
    GPUBarrier preBarriers[] = {
        GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
        GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
    };
    dev->Barrier(preBarriers, 2, cmd);
    dev->BindComputeShader(&meshShader_, cmd);

    // GPU timestamp: mesh begin
    dev->QueryEnd(&timestampHeap_, TS_MESH_BEGIN, cmd);

    // Push constants for one chunk dispatch
    struct MeshPush {
        uint32_t chunkIndex;
        uint32_t voxelBufferOffset;
        uint32_t quadBufferOffset;
        uint32_t maxOutputQuads;
        uint32_t pad[8];
    };

    // Staging for one chunk's packed voxels: 32^3 voxels → 16384 uint32s (2 voxels per
    // uint). Allocated once and reused for every chunk; each word is fully overwritten
    // below, so no re-zeroing is needed. Reuse is safe as long as UpdateBuffer consumes
    // the source data during the call, which this function already assumes when it
    // passes the stack variable `zero` above.
    const uint32_t wordsPerChunk = CHUNK_VOLUME / 2;
    std::vector<uint32_t> packed(wordsPerChunk, 0);

    // Dispatch for each chunk
    uint32_t chunkIdx = 0;
    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        // Even voxel goes in the low 16 bits, odd voxel in the high 16 bits.
        for (uint32_t w = 0; w < wordsPerChunk; w++) {
            const uint32_t lo = chunk.voxels[2 * w].packed;
            const uint32_t hi = chunk.voxels[2 * w + 1].packed;
            packed[w] = lo | (hi << 16);
        }

        // Upload voxel data (re-uses the single-chunk buffer)
        // NOTE(review): no barrier is recorded between this copy and the previous
        // chunk's dispatch reading the same buffer — confirm the device serializes them.
        dev->UpdateBuffer(&voxelDataBuffer_, packed.data(), cmd,
            packed.size() * sizeof(uint32_t));

        // Bind resources (after BindComputeShader, so PushConstants targets compute)
        dev->BindResource(&voxelDataBuffer_, 0, cmd);
        dev->BindUAV(&gpuQuadBuffer_, 0, cmd);
        dev->BindUAV(&gpuQuadCounter_, 1, cmd);

        MeshPush pushData = {};
        pushData.chunkIndex = chunkIdx;
        pushData.voxelBufferOffset = 0; // single-chunk buffer, always at offset 0
        pushData.quadBufferOffset = 0;  // all chunks share global atomic counter
        pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY;
        dev->PushConstants(&pushData, sizeof(pushData), cmd);

        // Dispatch: 32/8 = 4 groups per axis → 64 groups total
        dev->Dispatch(4, 4, 4, cmd);
        chunkIdx++;
    });

    // GPU timestamp: mesh end
    dev->QueryEnd(&timestampHeap_, TS_MESH_END, cmd);

    // Copy quad counter to readback buffer
    GPUBarrier postBarrier = GPUBarrier::Buffer(
        &gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC);
    dev->Barrier(&postBarrier, 1, cmd);
    dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd);

    // Resolve timestamps (two consecutive queries starting at TS_MESH_BEGIN)
    dev->QueryResolve(&timestampHeap_, TS_MESH_BEGIN, 2, &timestampReadback_,
        TS_MESH_BEGIN * sizeof(uint64_t), cmd);
    benchState_ = BenchState::READBACK;
}
// Collects the results of the GPU mesh benchmark from the mapped readback buffers,
// logs a CPU-vs-GPU comparison to the backlog and moves the state machine to DONE.
// A value is left unchanged when its readback buffer is not mapped, or (for the
// timing) when the timestamp frequency or the measured interval is not positive.
void VoxelRenderer::readbackGpuMeshBenchmark() const {
    // Quad count written by the GPU mesher
    if (const auto* quadCount = static_cast<const uint32_t*>(meshCounterReadback_.mapped_data)) {
        gpuBaselineQuads_ = *quadCount;
    }

    // GPU mesh duration from the begin/end timestamp pair
    const auto* ticks = static_cast<const uint64_t*>(timestampReadback_.mapped_data);
    if (ticks != nullptr) {
        const double ticksPerSecond = (double)device_->GetTimestampFrequency();
        const uint64_t beginTick = ticks[TS_MESH_BEGIN];
        const uint64_t endTick = ticks[TS_MESH_END];
        if (ticksPerSecond > 0.0 && endTick > beginTick) {
            gpuMeshTimeMs_ = (float)((double)(endTick - beginTick) / ticksPerSecond * 1000.0);
        }
    }

    // How many more quads the unmerged GPU baseline emits than the CPU greedy mesher.
    const float quadRatio = totalQuads_ > 0 ? (float)gpuBaselineQuads_ / totalQuads_ : 0.0f;

    // Log benchmark results
    char report[256];
    snprintf(report, sizeof(report),
        "=== MESH BENCHMARK ===\n"
        " CPU greedy: %.2f ms, %u quads (%u chunks)\n"
        " GPU baseline: %.3f ms, %u quads (1x1, no merge)\n"
        " Ratio quads: %.1fx more (GPU baseline vs CPU greedy)",
        cpuMeshTimeMs_, totalQuads_, chunkCount_,
        gpuMeshTimeMs_, gpuBaselineQuads_,
        quadRatio);
    wi::backlog::post(report);
    benchState_ = BenchState::DONE;
}
// ── GPU Mesh Dispatch (production path) ─────────────────────────
// Dispatches GPU mesher for ALL chunks every frame. Replaces CPU greedy meshing.
// Uses the atomic quad counter for 1-frame-delayed readback of total quad count.
//
// cmd:          command list all copies, barriers and dispatches are recorded into.
// world:        source of the voxel data.
// profPack:     optional accumulator for CPU time spent packing voxels (may be null).
// profUpload:   optional accumulator for CPU time spent in the upload call (may be null).
// profDispatch: optional accumulator for CPU time spent recording dispatches (may be null).
//
// Sizing is driven by chunkCount_. Chunks that the world enumerates beyond
// chunkCount_ are skipped, because neither the packed cache nor the GPU voxel buffer
// has room for them.
void VoxelRenderer::dispatchGpuMesh(CommandList cmd, const VoxelWorld& world,
    ProfileAccum* profPack, ProfileAccum* profUpload, ProfileAccum* profDispatch) const {
    auto* dev = device_;

    // Zero the quad counter
    uint32_t zero = 0;
    dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t));

    // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer
    GPUBarrier preBarriers[] = {
        GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
        GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
    };
    dev->Barrier(preBarriers, 2, cmd);
    dev->BindComputeShader(&meshShader_, cmd);

    // Pack and upload all chunks' voxel data
    // Each chunk = 32^3/2 = 16384 uint32 (two voxels per uint)
    const uint32_t wordsPerChunk = CHUNK_VOLUME / 2;
    uint32_t totalWords = chunkCount_ * wordsPerChunk;

    // Resize voxel data buffer if needed
    if (totalWords > voxelDataCapacity_) {
        voxelDataCapacity_ = totalWords;
        GPUBufferDesc voxDesc;
        voxDesc.size = totalWords * sizeof(uint32_t);
        voxDesc.bind_flags = BindFlag::SHADER_RESOURCE;
        voxDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        voxDesc.stride = sizeof(uint32_t);
        voxDesc.usage = Usage::DEFAULT;
        dev->CreateBuffer(&voxDesc, nullptr, const_cast<GPUBuffer*>(&voxelDataBuffer_));
    }

    // Pack voxel data — use cached copy, only update when dirty.
    // VoxelData is exactly uint16_t, so voxels[] is a packed uint16 array.
    // Two consecutive uint16 = one uint32 → direct memcpy, no bit manipulation.
    static_assert(sizeof(VoxelData) == sizeof(uint16_t),
        "VoxelData must be 2 bytes for direct memcpy to GPU buffer");
    auto tPack0 = std::chrono::high_resolution_clock::now();
    // Also repack when the cache is smaller than what the upload below will read,
    // so the upload never reads past the end of the cache.
    if (voxelCacheDirty_ || packedVoxelCache_.size() < totalWords) {
        packedVoxelCache_.resize(totalWords);
        uint32_t chunkI = 0;
        world.forEachChunk([&](const ChunkPos& pos, const Chunk& chunk) {
            // The cache holds exactly chunkCount_ chunks; writing more would overflow it.
            if (chunkI >= chunkCount_) return;
            std::memcpy(
                packedVoxelCache_.data() + chunkI * wordsPerChunk,
                chunk.voxels,
                wordsPerChunk * sizeof(uint32_t) // = CHUNK_VOLUME * 2 bytes
            );
            chunkI++;
        });
        voxelCacheDirty_ = false;
    }
    auto tPack1 = std::chrono::high_resolution_clock::now();
    if (profPack) profPack->add(std::chrono::duration<float, std::milli>(tPack1 - tPack0).count());

    // Upload all voxel data at once
    auto tUpload0 = std::chrono::high_resolution_clock::now();
    dev->UpdateBuffer(&voxelDataBuffer_, packedVoxelCache_.data(), cmd,
        totalWords * sizeof(uint32_t));
    auto tUpload1 = std::chrono::high_resolution_clock::now();
    if (profUpload) profUpload->add(std::chrono::duration<float, std::milli>(tUpload1 - tUpload0).count());

    // Bind resources (shared across all chunk dispatches)
    dev->BindResource(&voxelDataBuffer_, 0, cmd);
    dev->BindUAV(&gpuQuadBuffer_, 0, cmd);
    dev->BindUAV(&gpuQuadCounter_, 1, cmd);

    // Dispatch for each chunk
    struct MeshPush {
        uint32_t chunkIndex;
        uint32_t voxelBufferOffset;
        uint32_t quadBufferOffset;
        uint32_t maxOutputQuads;
        uint32_t pad[8];
    };
    auto tDisp0 = std::chrono::high_resolution_clock::now();
    uint32_t chunkIdx = 0;
    world.forEachChunk([&](const ChunkPos& pos, const Chunk& chunk) {
        // No voxel data was uploaded for chunks past chunkCount_; do not dispatch them.
        if (chunkIdx >= chunkCount_) return;
        MeshPush pushData = {};
        pushData.chunkIndex = chunkIdx;
        pushData.voxelBufferOffset = chunkIdx * wordsPerChunk;
        pushData.quadBufferOffset = 0; // global atomic counter handles offsets
        pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY;
        dev->PushConstants(&pushData, sizeof(pushData), cmd);
        // Dispatch: 32/8 = 4 groups per axis → 64 groups per chunk
        dev->Dispatch(4, 4, 4, cmd);
        chunkIdx++;
    });
    auto tDisp1 = std::chrono::high_resolution_clock::now();
    if (profDispatch) profDispatch->add(std::chrono::duration<float, std::milli>(tDisp1 - tDisp0).count());

    // Barriers: UAV → COPY_SRC for counter readback, UAV → SRV for quad buffer (rendering)
    GPUBarrier postBarriers[] = {
        GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC),
        GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
    };
    dev->Barrier(postBarriers, 2, cmd);

    // Copy quad counter to readback buffer (result available next frame)
    dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd);
    totalQuads_ = gpuMeshQuadCount_; // display previous frame's count in HUD
    gpuMeshDirty_ = false;
}
// ── GPU Smooth Mesh Dispatch (Phase 5.3) ─────────────────────────
// Dispatches GPU Surface Nets compute shader for all chunks.
// Uses voxelDataBuffer_ (already uploaded by dispatchGpuMesh).
// Records the two-pass GPU smooth-mesh dispatch for every chunk that contains
// smooth voxels or has a face neighbor that does.
//   Pass 1 (smoothCentroidShader_): writes one centroid grid slot per selected chunk.
//   Pass 2 (smoothMeshShader_):     reads the centroid grid and appends vertices to
//                                   gpuSmoothVertexBuffer_, counting them in gpuSmoothCounter_.
// The vertex counter is copied to smoothCounterReadback_ at the end.
// Clears gpuSmoothMeshDirty_ once the work has been recorded (or when no chunk
// needs smooth meshing). Does nothing if either compute shader is invalid.
void VoxelRenderer::dispatchGpuSmoothMesh(CommandList cmd, const VoxelWorld& world) const {
    if (!smoothCentroidShader_.IsValid() || !smoothMeshShader_.IsValid()) return;
    auto* dev = device_;

    // ── Collect smooth chunk indices (chunks that contain smooth OR neighbor smooth) ──
    struct SmoothChunkEntry { uint32_t chunkIdx; };
    std::vector<SmoothChunkEntry> smoothChunks;
    smoothChunks.reserve(256);
    {
        // Snapshot position, enumeration index and smooth flag of every chunk.
        // The enumeration index is what the shaders receive as chunkIndex.
        // NOTE(review): presumably this order matches the one used when
        // chunkInfoBuffer_/voxelDataBuffer_ were filled — confirm against dispatchGpuMesh.
        struct ChunkRef {
            ChunkPos pos;
            uint32_t index;
            bool containsSmooth;
        };
        std::vector<ChunkRef> allChunks;
        allChunks.reserve(chunkCount_);
        uint32_t ci = 0;
        world.forEachChunk([&](const ChunkPos& pos, const Chunk& chunk) {
            allChunks.push_back(ChunkRef{pos, ci, static_cast<bool>(chunk.containsSmooth)});
            ci++;
        });

        // A chunk is needed when it is smooth itself or any of its 6 face
        // neighbors is. The neighbor lookup runs after the enumeration has
        // finished, so world.getChunk() is never called from inside the callback.
        static constexpr int kNeighborOffsets[6][3] = {
            {1,0,0},{-1,0,0},{0,1,0},{0,-1,0},{0,0,1},{0,0,-1}
        };
        for (const ChunkRef& ref : allChunks) {
            bool needed = ref.containsSmooth;
            for (int f = 0; f < 6 && !needed; f++) {
                ChunkPos np = {ref.pos.x + kNeighborOffsets[f][0],
                               ref.pos.y + kNeighborOffsets[f][1],
                               ref.pos.z + kNeighborOffsets[f][2]};
                const Chunk* nc = world.getChunk(np);
                if (nc && nc->containsSmooth) needed = true;
            }
            if (needed) smoothChunks.push_back({ref.index});
        }
    }
    if (smoothChunks.empty()) {
        gpuSmoothMeshDirty_ = false;
        return;
    }
    const uint32_t smoothCount = (uint32_t)smoothChunks.size();

    // ── Resize centroid grid buffer if needed (one slot per smooth chunk) ──
    // Computed in 64 bits: smoothCount * CENTROID_GRID_SIZE * 16 can exceed
    // UINT32_MAX for large worlds, and a wrapped size would under-allocate.
    const uint64_t requiredGridSize = (uint64_t)smoothCount * CENTROID_GRID_SIZE * 16; // bytes
    if (!centroidGridBuffer_.IsValid() || centroidGridBuffer_.desc.size < requiredGridSize) {
        GPUBufferDesc cgDesc;
        cgDesc.size = requiredGridSize;
        cgDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
        cgDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        cgDesc.stride = 16;
        cgDesc.usage = Usage::DEFAULT;
        if (!dev->CreateBuffer(&cgDesc, nullptr, const_cast<GPUBuffer*>(&centroidGridBuffer_))) {
            // Do not record dispatches against a buffer that was never created.
            // The dirty flag is left untouched so a later call can retry.
            wi::backlog::post("VoxelRenderer: failed to create centroid grid buffer",
                              wi::backlog::LogLevel::Error);
            return;
        }
        wi::backlog::post("VoxelRenderer: resized centroid grid for " + std::to_string(smoothCount)
            + " smooth chunks (" + std::to_string(requiredGridSize / 1024) + " KB)");
    }

    // Zero the smooth vertex counter
    uint32_t zero = 0;
    dev->UpdateBuffer(const_cast<GPUBuffer*>(&gpuSmoothCounter_), &zero, cmd, sizeof(uint32_t));

    // Pre-barriers: counter leaves COPY_DST, outputs become writable UAVs
    GPUBarrier preBarriers[] = {
        GPUBarrier::Buffer(const_cast<GPUBuffer*>(&gpuSmoothCounter_), ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
        GPUBarrier::Buffer(const_cast<GPUBuffer*>(&gpuSmoothVertexBuffer_), ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
        GPUBarrier::Buffer(const_cast<GPUBuffer*>(&centroidGridBuffer_), ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
    };
    dev->Barrier(preBarriers, 3, cmd);

    // Push constants shared by both passes (12 x uint32 = 48 bytes)
    struct SmoothPush {
        uint32_t chunkIndex;
        uint32_t voxelBufferOffset;
        uint32_t maxOutputVerts;
        uint32_t centroidGridOffset;
        uint32_t pad[8];
    };
    const uint32_t wordsPerChunk = CHUNK_VOLUME / 2;
    // Both passes receive identical per-chunk constants; `slot` is the position
    // in smoothChunks and selects the chunk's region of the centroid grid.
    auto makePush = [&](uint32_t slot) {
        SmoothPush push = {};
        push.chunkIndex = smoothChunks[slot].chunkIdx;
        push.voxelBufferOffset = push.chunkIndex * wordsPerChunk;
        push.maxOutputVerts = MAX_GPU_SMOOTH_VERTICES;
        push.centroidGridOffset = slot * CENTROID_GRID_SIZE;
        return push;
    };

    // ── Pass 1: Dispatch ALL centroid computations (batched, no barriers) ──
    dev->BindComputeShader(&smoothCentroidShader_, cmd);
    dev->BindResource(&voxelDataBuffer_, 0, cmd);  // t0
    dev->BindResource(&chunkInfoBuffer_, 1, cmd);  // t1
    dev->BindUAV(const_cast<GPUBuffer*>(&centroidGridBuffer_), 0, cmd);  // u0
    for (uint32_t i = 0; i < smoothCount; i++) {
        const SmoothPush pushData = makePush(i);
        dev->PushConstants(&pushData, sizeof(pushData), cmd);
        dev->Dispatch(5, 5, 5, cmd);
    }

    // ── Single barrier: centroid grid UAV → SRV ──
    GPUBarrier midBarrier = GPUBarrier::Buffer(
        const_cast<GPUBuffer*>(&centroidGridBuffer_),
        ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE);
    dev->Barrier(&midBarrier, 1, cmd);

    // ── Pass 2: Dispatch ALL emit passes (batched, no barriers) ──
    // Emit shader reads ONLY from centroid grid (no voxelData access)
    dev->BindComputeShader(&smoothMeshShader_, cmd);
    dev->BindResource(&chunkInfoBuffer_, 1, cmd);     // t1
    dev->BindResource(&centroidGridBuffer_, 2, cmd);  // t2: centroid grid (SRV)
    dev->BindUAV(const_cast<GPUBuffer*>(&gpuSmoothVertexBuffer_), 0, cmd);  // u0
    dev->BindUAV(const_cast<GPUBuffer*>(&gpuSmoothCounter_), 1, cmd);       // u1
    for (uint32_t i = 0; i < smoothCount; i++) {
        const SmoothPush pushData = makePush(i);
        dev->PushConstants(&pushData, sizeof(pushData), cmd);
        dev->Dispatch(4, 4, 4, cmd);
    }

    // Post-barriers: counter becomes a copy source, vertices become readable
    GPUBarrier postBarriers[] = {
        GPUBarrier::Buffer(const_cast<GPUBuffer*>(&gpuSmoothCounter_), ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC),
        GPUBarrier::Buffer(const_cast<GPUBuffer*>(&gpuSmoothVertexBuffer_), ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
    };
    dev->Barrier(postBarriers, 2, cmd);

    // Readback counter (result available next frame)
    dev->CopyBuffer(const_cast<GPUBuffer*>(&smoothCounterReadback_), 0,
                    const_cast<GPUBuffer*>(&gpuSmoothCounter_), 0, sizeof(uint32_t), cmd);
    gpuSmoothMeshDirty_ = false;
}
// ── Ray Tracing: BLAS extraction + AS build (Phase 6.1) ──────────
// Runs the BLAS extraction compute shader: expands the quads in gpuQuadBuffer_
// into triangle positions stored in blasPositionBuffer_ (6 vertices per quad),
// then records the resulting vertex count in rtBlockyVertexCount_.
// No-op when ray tracing is unavailable, the shader is invalid, or the last
// known quad count (gpuMeshQuadCount_) is zero.
void VoxelRenderer::dispatchBLASExtract(CommandList cmd) const {
    if (!rtAvailable_ || !blasExtractShader_.IsValid()) return;

    const uint32_t quads = gpuMeshQuadCount_;
    if (quads == 0) return;

    auto* dev = device_;

    // Position buffer: UNDEFINED → UAV so the shader can write into it
    GPUBarrier toWritable = GPUBarrier::Buffer(
        &blasPositionBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS);
    dev->Barrier(&toWritable, 1, cmd);

    dev->BindComputeShader(&blasExtractShader_, cmd);
    dev->BindResource(&gpuQuadBuffer_, 0, cmd);    // t0: packed quads (SRV)
    dev->BindResource(&chunkInfoBuffer_, 2, cmd);  // t2: per-chunk info (SRV)
    dev->BindUAV(&blasPositionBuffer_, 0, cmd);    // u0: extracted positions (UAV)

    // Push constants: only the quad count is used, the rest is padding (48 bytes total)
    struct ExtractPush {
        uint32_t quadCount;
        uint32_t pad[11];
    };
    ExtractPush push = {};
    push.quadCount = quads;
    dev->PushConstants(&push, sizeof(push), cmd);

    // One thread per quad, 64 threads per group (rounded up)
    const uint32_t kThreadsPerGroup = 64;
    dev->Dispatch((quads + (kThreadsPerGroup - 1)) / kThreadsPerGroup, 1, 1, cmd);

    // Position buffer: UAV → SHADER_RESOURCE (for BLAS build)
    GPUBarrier toReadable = GPUBarrier::Buffer(
        &blasPositionBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE);
    dev->Barrier(&toReadable, 1, cmd);

    rtBlockyVertexCount_ = quads * 6;
}
void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
if (!rtAvailable_) return;
auto* dev = device_;
// ── Blocky BLAS ──────────────────────────────────────────────
uint32_t blockyVertCount = rtBlockyVertexCount_;
if (blockyVertCount < 3) blockyVertCount = 0; // Need at least 1 triangle
if (blockyVertCount > 0 && blasPositionBuffer_.IsValid()) {
// (Re)create BLAS if needed (vertex count changed or first time)
if (!blockyBLAS_.IsValid() || blockyBLAS_.desc.bottom_level.geometries.empty() ||
blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count != blockyVertCount) {
RaytracingAccelerationStructureDesc desc;
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD;
desc.bottom_level.geometries.resize(1);
auto& geom = desc.bottom_level.geometries[0];
geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES;
geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE;
geom.triangles.vertex_buffer = blasPositionBuffer_;
geom.triangles.vertex_byte_offset = 0;
geom.triangles.vertex_count = blockyVertCount;
geom.triangles.vertex_stride = sizeof(float) * 3; // 12 bytes per float3
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
// Wicked ALWAYS accesses index_buffer via to_internal() — a default GPUBuffer
// causes null deref. And DX12 treats non-zero IndexBuffer + IndexCount=0 as
// "indexed with 0 triangles" → empty BLAS. Solution: real sequential index buffer.
geom.triangles.index_buffer = blasIndexBuffer_;
geom.triangles.index_count = blockyVertCount;
geom.triangles.index_format = IndexBufferFormat::UINT32;
geom.triangles.index_offset = 0;
bool ok = dev->CreateRaytracingAccelerationStructure(&desc,
&blockyBLAS_);
if (ok) {
dev->SetName(&blockyBLAS_, "VoxelRenderer::blockyBLAS");
wi::backlog::post("VoxelRenderer: blocky BLAS created ("
+ std::to_string(blockyVertCount / 3) + " tris)");
} else {
wi::backlog::post("VoxelRenderer: failed to create blocky BLAS", wi::backlog::LogLevel::Error);
rtAvailable_ = false;
return;
}
}
// Build BLAS
dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr);
}
// ── Smooth BLAS ──────────────────────────────────────────────
// Smooth vertex buffer: float3 position at offset 0, stride 32 bytes
uint32_t smoothVertCount = gpuSmoothVertexCount_;
if (smoothVertCount < 3) smoothVertCount = 0; // Need at least 1 triangle
bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid();
const GPUBuffer& smoothVB = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_;
if (smoothVertCount > 0 && smoothVB.IsValid()) {
if (!smoothBLAS_.IsValid() || smoothBLAS_.desc.bottom_level.geometries.empty() ||
smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count != smoothVertCount) {
RaytracingAccelerationStructureDesc desc;
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD;
desc.bottom_level.geometries.resize(1);
auto& geom = desc.bottom_level.geometries[0];
geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES;
geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE;
geom.triangles.vertex_buffer = smoothVB;
geom.triangles.vertex_byte_offset = 0;
geom.triangles.vertex_count = smoothVertCount;
geom.triangles.vertex_stride = 32; // SmoothVtx struct = 32 bytes, position at offset 0
// Wicked always accesses index_buffer — must be valid + use real indices
geom.triangles.index_buffer = blasIndexBuffer_;
geom.triangles.index_count = smoothVertCount;
geom.triangles.index_format = IndexBufferFormat::UINT32;
geom.triangles.index_offset = 0;
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
bool ok = dev->CreateRaytracingAccelerationStructure(&desc,
&smoothBLAS_);
if (ok) {
dev->SetName(&smoothBLAS_, "VoxelRenderer::smoothBLAS");
wi::backlog::post("VoxelRenderer: smooth BLAS created ("
+ std::to_string(smoothVertCount / 3) + " tris)");
} else {
wi::backlog::post("VoxelRenderer: failed to create smooth BLAS", wi::backlog::LogLevel::Error);
}
}
if (smoothBLAS_.IsValid()) {
dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr);
}
rtSmoothVertexCount_ = smoothVertCount;
}
// ── Memory barrier: sync BLAS builds before TLAS ──────────────
// Without this, TLAS build can execute before BLASes are complete.
// (Same pattern as wiRenderer.cpp line 5788)
{
GPUBarrier barriers[] = { GPUBarrier::Memory() };
dev->Barrier(barriers, 1, cmd);
}
// ── TLAS (2 instances: blocky + smooth) ──────────────────────
// Always recreate TLAS with pre-filled instance data via CreateBuffer2.
// RAY_TRACING instance buffers have special resource state requirements,
// so UpdateBuffer (CopyBufferRegion) would crash on state mismatch.
uint32_t instanceCount = 0;
if (blockyBLAS_.IsValid()) instanceCount++;
if (smoothBLAS_.IsValid() && smoothVertCount > 0) instanceCount++;
if (instanceCount == 0) { rtDirty_ = false; return; }
const size_t instSize = dev->GetTopLevelAccelerationStructureInstanceSize();
// Identity transform (3x4 row-major)
auto setIdentity = [](float transform[3][4]) {
std::memset(transform, 0, sizeof(float) * 12);
transform[0][0] = 1.0f;
transform[1][1] = 1.0f;
transform[2][2] = 1.0f;
};
// Capture BLAS pointers for the lambda (can't capture member references)
const RaytracingAccelerationStructure* blockyBLASPtr = blockyBLAS_.IsValid() ? &blockyBLAS_ : nullptr;
const RaytracingAccelerationStructure* smoothBLASPtr = (smoothBLAS_.IsValid() && smoothVertCount > 0) ? &smoothBLAS_ : nullptr;
// Create TLAS with instance data pre-filled in the creation callback.
// This avoids any UpdateBuffer on RAY_TRACING flagged buffers.
RaytracingAccelerationStructureDesc desc;
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD;
desc.type = RaytracingAccelerationStructureDesc::Type::TOPLEVEL;
desc.top_level.count = instanceCount;
GPUBufferDesc bufdesc;
bufdesc.misc_flags = ResourceMiscFlag::RAY_TRACING;
bufdesc.stride = (uint32_t)instSize;
bufdesc.size = bufdesc.stride * desc.top_level.count;
auto initInstances = [&](void* dest) {
uint32_t idx = 0;
if (blockyBLASPtr) {
RaytracingAccelerationStructureDesc::TopLevel::Instance inst;
setIdentity(inst.transform);
inst.instance_id = 0;
inst.instance_mask = 0xFF;
inst.instance_contribution_to_hit_group_index = 0;
inst.flags = 0;
inst.bottom_level = blockyBLASPtr;
dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize);
idx++;
}
if (smoothBLASPtr) {
RaytracingAccelerationStructureDesc::TopLevel::Instance inst;
setIdentity(inst.transform);
inst.instance_id = 1;
inst.instance_mask = 0xFF;
inst.instance_contribution_to_hit_group_index = 0;
inst.flags = 0;
inst.bottom_level = smoothBLASPtr;
dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize);
idx++;
}
};
bool ok = dev->CreateBuffer2(&bufdesc, initInstances, &desc.top_level.instance_buffer);
if (!ok) {
wi::backlog::post("VoxelRenderer: failed to create TLAS instance buffer", wi::backlog::LogLevel::Error);
rtDirty_ = false;
return;
}
ok = dev->CreateRaytracingAccelerationStructure(&desc,
&tlas_);
if (!ok) {
wi::backlog::post("VoxelRenderer: failed to create TLAS", wi::backlog::LogLevel::Error);
rtDirty_ = false;
return;
}
// Build TLAS
dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr);
// Memory barrier: sync TLAS build before ray queries can use it
// (Same pattern as wiRenderer.cpp line 5808)
{
GPUBarrier barriers[] = { GPUBarrier::Memory(&tlas_) };
dev->Barrier(barriers, 1, cmd);
}
rtDirty_ = false;
}
// ── RT Shadow dispatch (Phase 6.2) ──────────────────────────────
// Records the RT shadow compute pass: the shader reads depth (t0), normals (t1)
// and the TLAS (t2), and modifies the color target in place through u0.
// Depth and color are transitioned for compute access and restored afterwards.
// No-op when shadows are disabled, the shader is invalid, or no TLAS exists.
void VoxelRenderer::dispatchShadows(CommandList cmd,
                                    const Texture& depthBuffer,
                                    const Texture& renderTarget,
                                    const Texture& normalTarget) const
{
    const bool canTrace = rtShadowsEnabled_ && shadowShader_.IsValid() && tlas_.IsValid();
    if (!canTrace)
        return;

    auto* dev = device_;
    const uint32_t targetWidth = renderTarget.GetDesc().width;
    const uint32_t targetHeight = renderTarget.GetDesc().height;

    Texture* depthTex = &const_cast<Texture&>(depthBuffer);
    Texture* colorTex = &const_cast<Texture&>(renderTarget);

    // Enter compute:
    //   depth: DEPTHSTENCIL → SHADER_RESOURCE (for depth reads)
    //   color: SHADER_RESOURCE → UNORDERED_ACCESS (for in-place shadow modulation)
    // The normal target is not transitioned here; it is expected to already be
    // in SHADER_RESOURCE state from the render pass.
    GPUBarrier enterCompute[] = {
        GPUBarrier::Image(depthTex, ResourceState::DEPTHSTENCIL, ResourceState::SHADER_RESOURCE),
        GPUBarrier::Image(colorTex, ResourceState::SHADER_RESOURCE, ResourceState::UNORDERED_ACCESS),
    };
    dev->Barrier(enterCompute, 2, cmd);

    dev->BindComputeShader(&shadowShader_, cmd);
    dev->BindResource(&depthBuffer, 0, cmd);            // t0 = depth
    dev->BindResource(&normalTarget, 1, cmd);           // t1 = normals
    dev->BindResource(&tlas_, 2, cmd);                  // t2 = TLAS
    dev->BindUAV(&renderTarget, 0, cmd);                // u0 = color (read-modify-write)
    dev->BindConstantBuffer(&constantBuffer_, 0, cmd);  // b0 = VoxelCB

    // Push constants (12 x 32-bit values = 48 bytes)
    struct ShadowPush {
        uint32_t width;
        uint32_t height;
        float normalBias;
        float maxDistance;
        uint32_t debugMode;
        uint32_t pad[7];
    };
    ShadowPush push = {};
    push.width = targetWidth;
    push.height = targetHeight;
    push.normalBias = 0.15f;    // offset along normal to avoid self-intersection
    push.maxDistance = 512.0f;  // max shadow ray distance
    push.debugMode = rtShadowDebug_ ? 1 : 0;
    dev->PushConstants(&push, sizeof(push), cmd);

    // 8×8 thread groups covering the screen (rounded up)
    const uint32_t groupsX = (targetWidth + 7) / 8;
    const uint32_t groupsY = (targetHeight + 7) / 8;
    dev->Dispatch(groupsX, groupsY, 1, cmd);

    // Leave compute: restore states for Compose()
    GPUBarrier leaveCompute[] = {
        GPUBarrier::Image(depthTex, ResourceState::SHADER_RESOURCE, ResourceState::DEPTHSTENCIL),
        GPUBarrier::Image(colorTex, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE),
    };
    dev->Barrier(leaveCompute, 2, cmd);
}
// ── Frustum plane extraction (Gribb-Hartmann method) ────────────
static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) {
XMFLOAT4X4 m;
XMStoreFloat4x4(&m, vp);
// Left
planes[0] = XMFLOAT4(m._14 + m._11, m._24 + m._21, m._34 + m._31, m._44 + m._41);
// Right
planes[1] = XMFLOAT4(m._14 - m._11, m._24 - m._21, m._34 - m._31, m._44 - m._41);
// Bottom
planes[2] = XMFLOAT4(m._14 + m._12, m._24 + m._22, m._34 + m._32, m._44 + m._42);
// Top
planes[3] = XMFLOAT4(m._14 - m._12, m._24 - m._22, m._34 - m._32, m._44 - m._42);
// Near
planes[4] = XMFLOAT4(m._13, m._23, m._33, m._43);
// Far
planes[5] = XMFLOAT4(m._14 - m._13, m._24 - m._23, m._34 - m._33, m._44 - m._43);
// Normalize each plane
for (int i = 0; i < 6; i++) {
float len = std::sqrt(planes[i].x * planes[i].x +
planes[i].y * planes[i].y +
planes[i].z * planes[i].z);
if (len > 0.0001f) {
planes[i].x /= len;
planes[i].y /= len;
planes[i].z /= len;
planes[i].w /= len;
}
}
}
// ── Render pass ─────────────────────────────────────────────────
void VoxelRenderer::render(
CommandList cmd,
const wi::scene::CameraComponent& camera,
const Texture& depthBuffer,
const Texture& renderTarget,
const Texture& normalTarget
) const {
if (!initialized_ || chunkCount_ == 0 || !pso_.IsValid()) return;
auto* dev = device_;
// ── GPU Mesh path: quads already dispatched in Render(), just draw ──
if (gpuMeshEnabled_ && gpuMesherAvailable_) {
// Upload chunk info only when chunks changed
if (!cpuChunkInfo_.empty() && chunkInfoDirty_) {
dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd,
cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
chunkInfoDirty_ = false;
}
// Per-frame constants
VoxelConstants cb = {};
XMMATRIX vpMatrix = camera.GetViewProjection();
XMStoreFloat4x4(&cb.viewProjection, vpMatrix);
XMMATRIX invVP = XMMatrixInverse(nullptr, vpMatrix);
XMStoreFloat4x4(&cb.inverseViewProjection, invVP);
cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f);
cb.sunDirection = XMFLOAT4(-0.7f, -0.4f, -0.3f, 0.0f); // lower sun = longer cast shadows
cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f);
cb.chunkSize = (float)CHUNK_SIZE;
cb.textureTiling = 0.25f;
cb.blendEnabled = 1.0f; // Phase 3: PS-based blending enabled in GPU mesh path
cb.debugBlend = debugBlend_ ? 1.0f : 0.0f;
cb.chunkCount = chunkCount_;
// Per-material blend flags (bit N = material N):
// canBleed: material can overflow visually onto adjacent voxels
// resistBleed: adjacent materials cannot overflow onto this material
// Material IDs: 1=Grass, 2=Dirt, 3=Stone, 4=Sand, 5=Snow, 6=SmoothStone
cb.bleedMask = (1u << 1) | (1u << 2) | (1u << 4) | (1u << 5); // Grass, Dirt, Sand, Snow can bleed (NOT Stone/SmoothStone)
cb.resistBleedMask = (1u << 1); // Grass resists bleed (she bleeds onto others, not the reverse)
cb.windTime = windTime_;
dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb));
// Render pass (MRT: color + normals + depth)
RenderPassImage rp[] = {
RenderPassImage::RenderTarget(
&renderTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::RenderTarget(
&normalTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::DepthStencil(
&depthBuffer,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL
),
};
dev->RenderPassBegin(rp, 3, cmd);
Viewport vp;
vp.width = (float)renderTarget.GetDesc().width;
vp.height = (float)renderTarget.GetDesc().height;
vp.min_depth = 0.0f;
vp.max_depth = 1.0f;
dev->BindViewports(1, &vp, cmd);
Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
dev->BindScissorRects(1, &scissor, cmd);
dev->BindPipelineState(&pso_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&gpuQuadBuffer_, 0, cmd); // GPU quads, not mega-buffer
dev->BindResource(&textureArray_, 1, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindResource(&voxelDataBuffer_, 3, cmd); // Phase 3: voxel data for PS neighbor lookups
dev->BindSampler(&sampler_, 0, cmd);
// GPU mesh mode: flags=2, MUST be after BindPipelineState
struct VoxelPush {
uint32_t chunkIndex;
uint32_t quadOffset;
uint32_t flags;
uint32_t pad[9];
};
VoxelPush pushData = {};
pushData.flags = 2; // GPU mesh mode
pushData.quadOffset = 0;
dev->PushConstants(&pushData, sizeof(pushData), cmd);
// Draw using previous frame's quad count (1-frame delay)
if (gpuMeshQuadCount_ > 0) {
dev->DrawInstanced(gpuMeshQuadCount_ * 6, 1, 0, 0, cmd);
drawCalls_ = 1;
}
dev->RenderPassEnd(cmd);
visibleChunks_ = chunkCount_;
return;
}
// Upload mega-buffer and chunk info to GPU
if (!cpuMegaQuads_.empty()) {
dev->UpdateBuffer(&megaQuadBuffer_, cpuMegaQuads_.data(), cmd,
cpuMegaQuads_.size() * sizeof(PackedQuad));
}
if (!cpuChunkInfo_.empty()) {
dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd,
cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
}
// Per-frame constants (with frustum planes for GPU cull shader)
VoxelConstants cb = {};
XMMATRIX vpMatrix = camera.GetViewProjection();
XMStoreFloat4x4(&cb.viewProjection, vpMatrix);
cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f);
cb.sunDirection = XMFLOAT4(-0.7f, -0.4f, -0.3f, 0.0f); // lower sun = longer cast shadows
cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f);
cb.chunkSize = (float)CHUNK_SIZE;
cb.textureTiling = 0.25f;
cb.blendEnabled = 0.0f; // Phase 3: blending disabled in CPU/MDI paths (no voxel data SRV)
cb.debugBlend = 0.0f;
cb.bleedMask = 0;
cb.resistBleedMask = 0;
cb.windTime = windTime_;
cb.chunkCount = chunkCount_;
extractFrustumPlanes(vpMatrix, cb.frustumPlanes);
dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb));
// Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
struct VoxelPush {
uint32_t chunkIndex;
uint32_t quadOffset;
uint32_t flags; // bit 0: 1=MDI mode, 0=CPU mode
uint32_t pad[9];
};
visibleChunks_ = 0;
drawCalls_ = 0;
// ── GPU Cull + MDI path ────────────────────────────────────────
if (gpuCullingEnabled_) {
// DX12 buffer decay: all buffers return to COMMON after ExecuteCommandLists.
// So every frame starts clean — no cross-frame state tracking needed.
// Zero the draw count via UpdateBuffer (COMMON → COPY_DST implicit promotion)
uint32_t zero = 0;
dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t));
// Barriers to UAV for compute shader writes:
// - drawCountBuffer_: COPY_DST → UAV (was promoted to COPY_DST by UpdateBuffer)
// - indirectArgsBuffer_: COMMON → UAV (explicit, required because COMMON can't
// be implicitly promoted to UAV)
GPUBarrier preBarriers[] = {
GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
};
dev->Barrier(preBarriers, 2, cmd);
// Timestamp: cull begin
dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd);
// Dispatch GPU frustum + backface cull compute shader
dev->BindComputeShader(&cullShader_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindUAV(&indirectArgsBuffer_, 0, cmd);
dev->BindUAV(&drawCountBuffer_, 1, cmd);
dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd);
// Timestamp: cull end
dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd);
// Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount
GPUBarrier postBarriers[] = {
GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
};
dev->Barrier(postBarriers, 2, cmd);
// ── Render pass (MRT: color + normals + depth) ──────────────
RenderPassImage rp[] = {
RenderPassImage::RenderTarget(
&renderTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::RenderTarget(
&normalTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::DepthStencil(
&depthBuffer,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL
),
};
dev->RenderPassBegin(rp, 3, cmd);
Viewport vp;
vp.width = (float)renderTarget.GetDesc().width;
vp.height = (float)renderTarget.GetDesc().height;
vp.min_depth = 0.0f;
vp.max_depth = 1.0f;
dev->BindViewports(1, &vp, cmd);
Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
dev->BindScissorRects(1, &scissor, cmd);
dev->BindPipelineState(&pso_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&megaQuadBuffer_, 0, cmd);
dev->BindResource(&textureArray_, 1, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindSampler(&sampler_, 0, cmd);
// IMPORTANT: PushConstants must be called AFTER BindPipelineState.
// Wicked Engine's PushConstants uses SetGraphicsRoot32BitConstants only
// when active_pso is set. If called before (with active_cs from compute),
// it would set COMPUTE push constants instead of GRAPHICS ones.
VoxelPush pushData = {};
pushData.flags = 1; // MDI mode
dev->PushConstants(&pushData, sizeof(pushData), cmd);
// Timestamp: draw begin
dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd);
// Single MDI call: GPU cull shader filled the indirect args
dev->DrawInstancedIndirectCount(
&indirectArgsBuffer_, 0,
&drawCountBuffer_, 0,
MAX_DRAWS, cmd
);
drawCalls_ = 1;
// Timestamp: draw end
dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd);
dev->RenderPassEnd(cmd);
// Resolve timestamps for readback (results available next frame)
dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd);
// Read back previous frame's timestamps (persistently mapped READBACK buffer)
uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
if (tsData) {
double freq = (double)dev->GetTimestampFrequency();
if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) {
gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0);
}
if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) {
gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0);
}
}
// GPU cull handles visibility counting — approximate from chunkCount
visibleChunks_ = chunkCount_; // exact count would require readback of drawCount
return;
}
// ── CPU frustum + backface cull (shared by MDI and per-face paths) ──
wi::primitive::Frustum frustum;
frustum.Create(camera.GetViewProjection());
// ── Phase 2.2: CPU-filled indirect args + MDI draw ──────────────
if (mdiEnabled_) {
// CPU cull: fill indirect args with visible face groups
cpuIndirectArgs_.clear();
uint32_t cpuDrawCount = 0;
for (uint32_t i = 0; i < chunkCount_; i++) {
const auto& slot = chunkSlots_[i];
if (slot.quadCount == 0) continue;
XMFLOAT3 aabbMin(
(float)(slot.pos.x * CHUNK_SIZE),
(float)(slot.pos.y * CHUNK_SIZE),
(float)(slot.pos.z * CHUNK_SIZE)
);
XMFLOAT3 aabbMax(
aabbMin.x + CHUNK_SIZE,
aabbMin.y + CHUNK_SIZE,
aabbMin.z + CHUNK_SIZE
);
wi::primitive::AABB aabb(aabbMin, aabbMax);
if (!frustum.CheckBoxFast(aabb)) continue;
visibleChunks_++;
const auto& info = cpuChunkInfo_[i];
for (uint32_t f = 0; f < 6; f++) {
if (info.faceCounts[f] == 0) continue;
bool backFacing = false;
switch (f) {
case 0: backFacing = (camera.Eye.x < aabbMin.x); break;
case 1: backFacing = (camera.Eye.x > aabbMax.x); break;
case 2: backFacing = (camera.Eye.y < aabbMin.y); break;
case 3: backFacing = (camera.Eye.y > aabbMax.y); break;
case 4: backFacing = (camera.Eye.z < aabbMin.z); break;
case 5: backFacing = (camera.Eye.z > aabbMax.z); break;
}
if (backFacing) continue;
IndirectDrawArgs args = {};
// Pack chunkIndex (low 16 bits) + faceIndex (high 16 bits) into push constant.
// The shader unpacks this to look up quadOffset from GPUChunkInfo.
// We do NOT use startVertexLocation because SV_VertexID may not include it
// reliably in ExecuteIndirect context.
args.pushConstant = i | (f << 16);
args.vertexCountPerInstance = info.faceCounts[f] * 6;
args.instanceCount = 1;
args.startVertexLocation = 0;
args.startInstanceLocation = 0;
cpuIndirectArgs_.push_back(args);
cpuDrawCount++;
}
}
// Upload indirect args and draw count to GPU
// Note: no explicit barriers needed here. Buffers start in COMMON each frame
// (DX12 buffer decay after command list execution). COMMON is implicitly
// promoted to COPY_DST by UpdateBuffer, then to INDIRECT_ARGUMENT by
// DrawInstancedIndirectCount. This matches Phase 2.1 pattern (no barriers
// between UpdateBuffer and SRV usage for megaQuadBuffer_/chunkInfoBuffer_).
if (!cpuIndirectArgs_.empty()) {
dev->UpdateBuffer(&indirectArgsBuffer_, cpuIndirectArgs_.data(), cmd,
cpuIndirectArgs_.size() * sizeof(IndirectDrawArgs));
}
dev->UpdateBuffer(&drawCountBuffer_, &cpuDrawCount, cmd, sizeof(uint32_t));
// ── Render pass (MRT: color + normals + depth) ──────────────
RenderPassImage rp[] = {
RenderPassImage::RenderTarget(
&renderTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::RenderTarget(
&normalTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::DepthStencil(
&depthBuffer,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL
),
};
dev->RenderPassBegin(rp, 3, cmd);
Viewport vp;
vp.width = (float)renderTarget.GetDesc().width;
vp.height = (float)renderTarget.GetDesc().height;
vp.min_depth = 0.0f;
vp.max_depth = 1.0f;
dev->BindViewports(1, &vp, cmd);
Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
dev->BindScissorRects(1, &scissor, cmd);
dev->BindPipelineState(&pso_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&megaQuadBuffer_, 0, cmd);
dev->BindResource(&textureArray_, 1, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindSampler(&sampler_, 0, cmd);
// MDI mode: VS uses binary search to find chunk from SV_VertexID
VoxelPush pushData = {};
pushData.flags = 1; // MDI mode
dev->PushConstants(&pushData, sizeof(pushData), cmd);
dev->DrawInstancedIndirectCount(
&indirectArgsBuffer_, 0,
&drawCountBuffer_, 0,
MAX_DRAWS, cmd
);
drawCalls_ = 1;
dev->RenderPassEnd(cmd);
return;
}
// ── Phase 2.1 Fallback: per-face-group DrawInstanced ────────────
RenderPassImage rp[] = {
RenderPassImage::RenderTarget(
&renderTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::RenderTarget(
&normalTarget,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::SHADER_RESOURCE,
ResourceState::SHADER_RESOURCE
),
RenderPassImage::DepthStencil(
&depthBuffer,
RenderPassImage::LoadOp::CLEAR,
RenderPassImage::StoreOp::STORE,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL,
ResourceState::DEPTHSTENCIL
),
};
dev->RenderPassBegin(rp, 3, cmd);
Viewport vp;
vp.width = (float)renderTarget.GetDesc().width;
vp.height = (float)renderTarget.GetDesc().height;
vp.min_depth = 0.0f;
vp.max_depth = 1.0f;
dev->BindViewports(1, &vp, cmd);
Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
dev->BindScissorRects(1, &scissor, cmd);
dev->BindPipelineState(&pso_, cmd);
dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
dev->BindResource(&megaQuadBuffer_, 0, cmd);
dev->BindResource(&textureArray_, 1, cmd);
dev->BindResource(&chunkInfoBuffer_, 2, cmd);
dev->BindSampler(&sampler_, 0, cmd);
for (uint32_t i = 0; i < chunkCount_; i++) {
const auto& slot = chunkSlots_[i];
if (slot.quadCount == 0) continue;
XMFLOAT3 aabbMin(
(float)(slot.pos.x * CHUNK_SIZE),
(float)(slot.pos.y * CHUNK_SIZE),
(float)(slot.pos.z * CHUNK_SIZE)
);
XMFLOAT3 aabbMax(
aabbMin.x + CHUNK_SIZE,
aabbMin.y + CHUNK_SIZE,
aabbMin.z + CHUNK_SIZE
);
wi::primitive::AABB aabb(aabbMin, aabbMax);
if (!frustum.CheckBoxFast(aabb)) continue;
visibleChunks_++;
const auto& info = cpuChunkInfo_[i];
for (uint32_t f = 0; f < 6; f++) {
if (info.faceCounts[f] == 0) continue;
bool backFacing = false;
switch (f) {
case 0: backFacing = (camera.Eye.x < aabbMin.x); break;
case 1: backFacing = (camera.Eye.x > aabbMax.x); break;
case 2: backFacing = (camera.Eye.y < aabbMin.y); break;
case 3: backFacing = (camera.Eye.y > aabbMax.y); break;
case 4: backFacing = (camera.Eye.z < aabbMin.z); break;
case 5: backFacing = (camera.Eye.z > aabbMax.z); break;
}
if (backFacing) continue;
VoxelPush pushData = {};
pushData.chunkIndex = i;
pushData.quadOffset = slot.quadOffset + info.faceOffsets[f];
pushData.flags = 0; // CPU mode
dev->PushConstants(&pushData, sizeof(pushData), cmd);
dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd);
drawCalls_++;
}
}
dev->RenderPassEnd(cmd);
}
// ── Phase 4: Toping GPU upload + rendering ─────────────────────
void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) {
if (!device_ || !topingPso_.IsValid()) return;
// Upload mesh vertices (done once, meshes are static)
const auto& verts = topingSystem.getVertices();
if (!verts.empty() && !topingVertexBuffer_.IsValid()) {
GPUBufferDesc vbDesc;
vbDesc.size = verts.size() * sizeof(TopingVertex);
vbDesc.bind_flags = BindFlag::SHADER_RESOURCE;
vbDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
vbDesc.stride = sizeof(TopingVertex);
vbDesc.usage = Usage::DEFAULT;
device_->CreateBuffer(&vbDesc, verts.data(), &topingVertexBuffer_);
char msg[128];
snprintf(msg, sizeof(msg), "Toping: uploaded %zu vertices (%zu bytes)",
verts.size(), verts.size() * sizeof(TopingVertex));
wi::backlog::post(msg);
}
// Upload instance positions (re-upload when world changes)
const auto& instances = topingSystem.getInstances();
if (instances.empty()) return;
// GPU instances are just float3 (12 bytes), sorted by (type, variant) for batched draws.
// We sort a copy and build a draw group table.
// Reuse persistent vectors to avoid per-frame allocations.
topingSorted_.resize(instances.size());
for (size_t i = 0; i < instances.size(); i++) {
topingSorted_[i] = { instances[i].wx, instances[i].wy, instances[i].wz,
instances[i].topingType, instances[i].variant };
}
std::sort(topingSorted_.begin(), topingSorted_.end(), [](const TopingSortedInst& a, const TopingSortedInst& b) {
if (a.type != b.type) return a.type < b.type;
return a.variant < b.variant;
});
// Pack GPU instance data (just float3 positions)
uint32_t instCount = (uint32_t)std::min(topingSorted_.size(), (size_t)MAX_TOPING_INSTANCES);
topingGpuInsts_.resize(instCount);
for (uint32_t i = 0; i < instCount; i++) {
topingGpuInsts_[i] = { topingSorted_[i].wx, topingSorted_[i].wy, topingSorted_[i].wz };
}
// Recreate buffer each frame (UpdateBuffer requires barrier management).
// Persistent staging vectors eliminate per-frame heap allocations.
GPUBufferDesc ibDesc;
ibDesc.size = instCount * sizeof(TopingGPUInst);
ibDesc.bind_flags = BindFlag::SHADER_RESOURCE;
ibDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
ibDesc.stride = sizeof(TopingGPUInst);
ibDesc.usage = Usage::DEFAULT;
device_->CreateBuffer(&ibDesc, topingGpuInsts_.data(), &topingInstanceBuffer_);
}
/// Draws all toping instances on top of the existing voxel output.
///
/// Opens a render pass that LOADs the colour, normal and depth attachments (so the
/// previous passes are preserved), binds the toping pipeline and issues one
/// DrawInstanced per contiguous (type, variant) run of instances.
///
/// The runs are read from topingSorted_, the sorted list uploadTopingData() produced
/// when it filled topingInstanceBuffer_. Reusing it avoids re-sorting and heap-allocating
/// a key array every frame, and guarantees the instance offsets pushed to the shader
/// match the layout of the buffer that was actually uploaded.
///
/// @param cmd           Command list to record into.
/// @param topingSystem  Provides the toping definitions (mesh slices, material IDs).
/// @param depthBuffer   Depth attachment (loaded and stored).
/// @param renderTarget  Colour attachment (loaded and stored); also defines the viewport size.
/// @param normalTarget  Normal attachment (loaded and stored).
void VoxelRenderer::renderTopings(
    CommandList cmd,
    const TopingSystem& topingSystem,
    const Texture& depthBuffer,
    const Texture& renderTarget,
    const Texture& normalTarget
) const {
    if (!topingPso_.IsValid() || !topingVertexBuffer_.IsValid() ||
        !topingInstanceBuffer_.IsValid()) return;

    const auto& defs = topingSystem.getDefs();
    // Nothing to draw if the system is empty or no sorted upload has happened yet.
    if (topingSystem.getInstances().empty() || topingSorted_.empty()) return;

    auto* dev = device_;

    // Open render pass with LOAD (preserve voxel render output)
    RenderPassImage rp[] = {
        RenderPassImage::RenderTarget(
            &renderTarget,
            RenderPassImage::LoadOp::LOAD,
            RenderPassImage::StoreOp::STORE,
            ResourceState::SHADER_RESOURCE,
            ResourceState::SHADER_RESOURCE
        ),
        RenderPassImage::RenderTarget(
            &normalTarget,
            RenderPassImage::LoadOp::LOAD,
            RenderPassImage::StoreOp::STORE,
            ResourceState::SHADER_RESOURCE,
            ResourceState::SHADER_RESOURCE
        ),
        RenderPassImage::DepthStencil(
            &depthBuffer,
            RenderPassImage::LoadOp::LOAD,
            RenderPassImage::StoreOp::STORE,
            ResourceState::DEPTHSTENCIL,
            ResourceState::DEPTHSTENCIL,
            ResourceState::DEPTHSTENCIL
        ),
    };
    dev->RenderPassBegin(rp, 3, cmd);

    // Viewport & scissor cover the whole render target
    Viewport vp;
    vp.top_left_x = 0; vp.top_left_y = 0;
    vp.width = (float)renderTarget.GetDesc().width;
    vp.height = (float)renderTarget.GetDesc().height;
    vp.min_depth = 0.0f; vp.max_depth = 1.0f;
    Rect scissor = { 0, 0, (int)renderTarget.GetDesc().width, (int)renderTarget.GetDesc().height };
    dev->BindViewports(1, &vp, cmd);
    dev->BindScissorRects(1, &scissor, cmd);

    // Bind toping pipeline (MUST be before PushConstants!)
    dev->BindPipelineState(&topingPso_, cmd);
    dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
    dev->BindResource(&textureArray_, 1, cmd);
    dev->BindResource(&topingVertexBuffer_, 4, cmd);   // t4
    dev->BindResource(&topingInstanceBuffer_, 5, cmd); // t5
    dev->BindSampler(&sampler_, 0, cmd);

    // Push-constant block: 3 used dwords + 9 dwords of padding.
    struct TopingPush {
        uint32_t vertexOffset;
        uint32_t instanceOffset;
        uint32_t materialID;
        uint32_t pad[9];
    };

    // Same clamp uploadTopingData() applied when it packed the instance buffer.
    const uint32_t instCount =
        static_cast<uint32_t>(std::min(topingSorted_.size(), static_cast<size_t>(MAX_TOPING_INSTANCES)));

    // Walk the sorted list as runs of equal (type, variant); one draw per run.
    topingDrawCalls_ = 0;
    uint32_t runStart = 0;
    while (runStart < instCount) {
        const auto type = topingSorted_[runStart].type;
        const auto variant = topingSorted_[runStart].variant;

        uint32_t runEnd = runStart + 1;
        while (runEnd < instCount &&
               topingSorted_[runEnd].type == type &&
               topingSorted_[runEnd].variant == variant) {
            ++runEnd;
        }
        const uint32_t firstInstance = runStart;
        const uint32_t runLength = runEnd - runStart;
        runStart = runEnd; // advance before any `continue` below

        if (type >= defs.size()) continue;                 // unknown toping type
        const TopingDef& def = defs[type];
        if (variant >= std::size(def.variants)) continue;  // variant outside the mesh table
        const MeshSlice& slice = def.variants[variant];
        if (slice.count == 0) continue;                    // empty mesh (all neighbors present)

        TopingPush pushData = {};
        pushData.vertexOffset = slice.offset;
        pushData.instanceOffset = firstInstance;
        pushData.materialID = def.materialID;
        dev->PushConstants(&pushData, sizeof(pushData), cmd);
        dev->DrawInstanced(slice.count, runLength, 0, 0, cmd);
        topingDrawCalls_++;
    }

    dev->RenderPassEnd(cmd);
}
// ── Phase 5: Smooth Surface Nets upload + rendering ─────────────
/// Stamps every smooth vertex with its chunk's index, then uploads all smooth vertices.
///
/// The index written is the chunk's position in forEachChunk() order, which must match
/// the order used for chunkInfoBuffer_. Staging, clamping to MAX_SMOOTH_VERTICES and
/// buffer creation are shared with uploadSmoothDataFast() instead of being duplicated
/// here; the only extra work this variant does is the per-vertex stamping pass.
///
/// @param world  World whose chunks own the smooth vertices (mutated: chunkIndex is written).
void VoxelRenderer::uploadSmoothData(VoxelWorld& world) {
    // Same precondition as uploadSmoothDataFast(); checked first so vertices are not
    // touched when nothing can be uploaded.
    if (!device_ || !smoothPso_.IsValid()) return;

    uint32_t chunkIdx = 0;
    world.forEachChunk([&chunkIdx](const ChunkPos&, Chunk& chunk) {
        if (chunk.hasSmooth && chunk.smoothVertexCount > 0) {
            for (auto& sv : chunk.smoothVertices) {
                sv.chunkIndex = (uint16_t)chunkIdx;
            }
        }
        // Counted for every chunk, smooth or not, so indices line up with chunk order.
        chunkIdx++;
    });

    // Collect + upload (also clears smoothDirty_ and sets smoothVertexCount_).
    uploadSmoothDataFast(world);
}
/// Gathers the smooth vertices of every chunk and uploads them as one structured buffer.
///
/// Fast path: vertices are copied as-is, so the caller must already have written each
/// vertex's chunkIndex (done during the parallel meshing pass).
///
/// @param world  World whose chunks are scanned for smooth vertices.
void VoxelRenderer::uploadSmoothDataFast(VoxelWorld& world) {
    if (!device_ || !smoothPso_.IsValid()) return;

    // Persistent staging vector: clear() keeps capacity, reserve() is a no-op once
    // the capacity already meets the minimum.
    constexpr size_t kMinStagingCapacity = 64 * 1024;
    smoothStagingVerts_.clear();
    smoothStagingVerts_.reserve(kMinStagingCapacity);

    world.forEachChunk([&](const ChunkPos&, Chunk& chunk) {
        if (!chunk.hasSmooth || !(chunk.smoothVertexCount > 0)) return;
        smoothStagingVerts_.insert(smoothStagingVerts_.end(),
                                   chunk.smoothVertices.begin(),
                                   chunk.smoothVertices.end());
    });

    // Anything beyond the buffer limit is not uploaded.
    smoothVertexCount_ =
        (uint32_t)std::min(smoothStagingVerts_.size(), (size_t)MAX_SMOOTH_VERTICES);

    if (smoothVertexCount_ != 0) {
        GPUBufferDesc smoothDesc;
        smoothDesc.usage = Usage::DEFAULT;
        smoothDesc.bind_flags = BindFlag::SHADER_RESOURCE;
        smoothDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        smoothDesc.stride = sizeof(SmoothVertex);
        smoothDesc.size = smoothVertexCount_ * sizeof(SmoothVertex);
        device_->CreateBuffer(&smoothDesc, smoothStagingVerts_.data(), &smoothVertexBuffer_);
    }

    // Cleared on both the empty and the uploaded path.
    smoothDirty_ = false;
}
/// Draws the smooth-surface mesh in a single draw call on top of the existing output.
///
/// When both smooth compute shaders are valid the GPU-generated vertex buffer and count
/// are used; otherwise the CPU-uploaded buffer and count are used.
///
/// @param cmd           Command list to record into.
/// @param depthBuffer   Depth attachment (loaded and stored).
/// @param renderTarget  Colour attachment (loaded and stored); also defines the viewport size.
/// @param normalTarget  Normal attachment (loaded and stored).
void VoxelRenderer::renderSmooth(
    CommandList cmd,
    const Texture& depthBuffer,
    const Texture& renderTarget,
    const Texture& normalTarget
) const {
    // Pick the vertex source: GPU mesher output if available, CPU upload otherwise.
    const bool gpuMeshed = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid();
    const auto& vertexSource = gpuMeshed ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_;
    const uint32_t vertexTotal = gpuMeshed ? gpuSmoothVertexCount_ : smoothVertexCount_;

    if (!smoothPso_.IsValid() || !vertexSource.IsValid() || vertexTotal == 0) return;

    // Colour attachments are loaded so the voxel + toping output survives this pass.
    const auto loadedColor = [](const Texture& target) {
        return RenderPassImage::RenderTarget(
            &target,
            RenderPassImage::LoadOp::LOAD,
            RenderPassImage::StoreOp::STORE,
            ResourceState::SHADER_RESOURCE,
            ResourceState::SHADER_RESOURCE);
    };
    RenderPassImage attachments[] = {
        loadedColor(renderTarget),
        loadedColor(normalTarget),
        RenderPassImage::DepthStencil(
            &depthBuffer,
            RenderPassImage::LoadOp::LOAD,
            RenderPassImage::StoreOp::STORE,
            ResourceState::DEPTHSTENCIL,
            ResourceState::DEPTHSTENCIL,
            ResourceState::DEPTHSTENCIL),
    };
    device_->RenderPassBegin(attachments, 3, cmd);

    // Full-target viewport and scissor.
    const auto& targetDesc = renderTarget.GetDesc();
    Viewport fullView;
    fullView.top_left_x = 0;
    fullView.top_left_y = 0;
    fullView.width = (float)targetDesc.width;
    fullView.height = (float)targetDesc.height;
    fullView.min_depth = 0.0f;
    fullView.max_depth = 1.0f;
    Rect fullRect = { 0, 0, (int)targetDesc.width, (int)targetDesc.height };
    device_->BindViewports(1, &fullView, cmd);
    device_->BindScissorRects(1, &fullRect, cmd);

    // The pipeline state MUST be bound before PushConstants.
    device_->BindPipelineState(&smoothPso_, cmd);
    device_->BindConstantBuffer(&constantBuffer_, 0, cmd);
    device_->BindResource(&textureArray_, 1, cmd);
    device_->BindResource(&chunkInfoBuffer_, 2, cmd);  // t2: chunk info for PS voxel lookups
    device_->BindResource(&voxelDataBuffer_, 3, cmd);  // t3: voxel data for PS neighbor blending
    device_->BindResource(&vertexSource, 6, cmd);      // t6: smooth vertices (GPU or CPU buffer)
    device_->BindSampler(&sampler_, 0, cmd);

    // The smooth VS reads no push constants, but a valid 48-byte block is still pushed.
    const uint32_t zeroedPush[12] = {};
    device_->PushConstants(zeroedPush, sizeof(zeroedPush), cmd);

    // One draw covers every smooth vertex.
    device_->DrawInstanced(vertexTotal, 1, 0, 0, cmd);
    smoothDrawCalls_ = 1;

    device_->RenderPassEnd(cmd);
}
// ── VoxelRenderPath (custom RenderPath3D) ───────────────────────
void VoxelRenderPath::Start() {
RenderPath3D::Start();
auto* device = wi::graphics::GetDevice();
renderer.initialize(device);
renderer.debugFaceColors_ = debugMode;
// Generate world
if (debugSmooth) {
world.generateDebugSmooth();
cameraPos = { 15.0f, 12.0f, -5.0f };
cameraPitch = -0.5f;
cameraYaw = 0.8f;
} else if (debugMode) {
world.generateDebug();
cameraPos = { 10.0f, 10.0f, 0.0f };
cameraPitch = -0.4f;
cameraYaw = 0.5f;
} else {
world.generateAround(cameraPos.x, cameraPos.y, cameraPos.z, 4);
}
if (renderer.isInitialized()) {
renderer.updateMeshes(world);
}
// Phase 4: Initialize toping system, collect instances, upload to GPU
topingSystem.initialize();
topingSystem.collectInstances(world);
if (renderer.isInitialized()) {
renderer.uploadTopingData(topingSystem);
}
{
char msg[256];
snprintf(msg, sizeof(msg),
"TopingSystem: %zu defs, %zu vertices, %zu instances",
topingSystem.getDefCount(),
topingSystem.getVertexCount(),
topingSystem.getInstanceCount());
wi::backlog::post(msg);
}
// Phase 5: Smooth surface mesh — GPU path or CPU fallback
if (renderer.isInitialized()) {
if (renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
// GPU smooth mesher available — will dispatch in first Render()
renderer.gpuSmoothMeshDirty_ = true;
wi::backlog::post("SmoothMesher: GPU path active, dispatch deferred to Render()");
} else {
// CPU fallback: Surface Nets mesh for smooth voxels (parallelized)
std::vector<Chunk*> chunkPtrs;
world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
chunkPtrs.push_back(&chunk);
});
const VoxelWorld& worldRef = world;
wi::jobsystem::context smoothCtx;
wi::jobsystem::Dispatch(smoothCtx, (uint32_t)chunkPtrs.size(), 1,
[&chunkPtrs, &worldRef](wi::jobsystem::JobArgs args) {
SmoothMesher::meshChunk(*chunkPtrs[args.jobIndex], worldRef);
});
wi::jobsystem::Wait(smoothCtx);
uint32_t totalSmooth = 0;
uint32_t smoothChunks = 0;
for (auto* c : chunkPtrs) {
if (c->smoothVertexCount > 0) {
totalSmooth += c->smoothVertexCount;
smoothChunks++;
}
}
renderer.uploadSmoothData(world);
char msg[256];
snprintf(msg, sizeof(msg),
"SmoothMesher: %u vertices (%u tris) in %u chunks",
totalSmooth, totalSmooth / 3, smoothChunks);
wi::backlog::post(msg);
}
}
worldGenerated_ = true;
setAO(AO_DISABLED);
setFXAAEnabled(true);
setBloomEnabled(false);
createRenderTargets();
}
void VoxelRenderPath::createRenderTargets() {
auto* device = wi::graphics::GetDevice();
if (!device) return;
uint32_t w = GetPhysicalWidth();
uint32_t h = GetPhysicalHeight();
if (w == 0 || h == 0) { w = 1920; h = 1080; }
wi::graphics::TextureDesc rtDesc;
rtDesc.type = wi::graphics::TextureDesc::Type::TEXTURE_2D;
rtDesc.width = w;
rtDesc.height = h;
rtDesc.format = wi::graphics::Format::R8G8B8A8_UNORM;
rtDesc.bind_flags = wi::graphics::BindFlag::RENDER_TARGET | wi::graphics::BindFlag::SHADER_RESOURCE
| wi::graphics::BindFlag::UNORDERED_ACCESS; // RT shadows modify in-place
rtDesc.mip_levels = 1;
rtDesc.sample_count = 1;
rtDesc.layout = wi::graphics::ResourceState::SHADER_RESOURCE;
device->CreateTexture(&rtDesc, nullptr, &voxelRT_);
// Normal render target (world-space normals for RT shadows/AO)
wi::graphics::TextureDesc normalDesc;
normalDesc.type = wi::graphics::TextureDesc::Type::TEXTURE_2D;
normalDesc.width = w;
normalDesc.height = h;
normalDesc.format = wi::graphics::Format::R16G16B16A16_SNORM;
normalDesc.bind_flags = wi::graphics::BindFlag::RENDER_TARGET | wi::graphics::BindFlag::SHADER_RESOURCE;
normalDesc.mip_levels = 1;
normalDesc.sample_count = 1;
normalDesc.layout = wi::graphics::ResourceState::SHADER_RESOURCE;
device->CreateTexture(&normalDesc, nullptr, &voxelNormalRT_);
wi::graphics::TextureDesc depthDesc;
depthDesc.type = wi::graphics::TextureDesc::Type::TEXTURE_2D;
depthDesc.width = w;
depthDesc.height = h;
depthDesc.format = wi::graphics::Format::D32_FLOAT;
depthDesc.bind_flags = wi::graphics::BindFlag::DEPTH_STENCIL | wi::graphics::BindFlag::SHADER_RESOURCE;
depthDesc.mip_levels = 1;
depthDesc.sample_count = 1;
depthDesc.layout = wi::graphics::ResourceState::DEPTHSTENCIL;
device->CreateTexture(&depthDesc, nullptr, &voxelDepth_);
rtCreated_ = voxelRT_.IsValid() && voxelNormalRT_.IsValid() && voxelDepth_.IsValid();
wi::backlog::post("VoxelRenderPath: render targets " + std::string(rtCreated_ ? "OK" : "FAILED")
+ " (" + std::to_string(w) + "x" + std::to_string(h) + ")");
}
// ── WASD camera input ───────────────────────────────────────────
// Button code for an uppercase ASCII letter: offset from 'A' added to CHARACTER_RANGE_START.
static constexpr wi::input::BUTTON letterKey(char letter) {
    return (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + (letter - 'A'));
}
static constexpr wi::input::BUTTON KEY_W = letterKey('W');
static constexpr wi::input::BUTTON KEY_A = letterKey('A');
static constexpr wi::input::BUTTON KEY_S = letterKey('S');
static constexpr wi::input::BUTTON KEY_D = letterKey('D');
void VoxelRenderPath::handleInput(float dt) {
// F2: toggle backlog console
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F2)) {
wi::backlog::Toggle();
}
// F3: toggle animated terrain
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) {
animatedTerrain_ = !animatedTerrain_;
wi::backlog::post(animatedTerrain_ ? "Animation: ON (60 Hz)" : "Animation: OFF");
}
// F4: toggle blend debug visualization
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F4)) {
renderer.debugBlend_ = !renderer.debugBlend_;
wi::backlog::post(renderer.debugBlend_ ? "Blend debug: ON" : "Blend debug: OFF");
}
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F5)) {
// Cycle: OFF → ON → DEBUG → OFF
if (!renderer.rtShadowsEnabled_) {
renderer.rtShadowsEnabled_ = true;
renderer.rtShadowDebug_ = false;
wi::backlog::post("RT Shadows: ON");
} else if (!renderer.rtShadowDebug_) {
renderer.rtShadowDebug_ = true;
wi::backlog::post("RT Shadows: DEBUG (red=shadow, green=lit, blue=backface)");
} else {
renderer.rtShadowsEnabled_ = false;
renderer.rtShadowDebug_ = false;
wi::backlog::post("RT Shadows: OFF");
}
}
if (wi::input::Press(wi::input::MOUSE_BUTTON_RIGHT)) {
mouseCaptured = !mouseCaptured;
wi::input::HidePointer(mouseCaptured);
}
if (mouseCaptured) {
auto mouseState = wi::input::GetMouseState();
cameraYaw += mouseState.delta_position.x * cameraSensitivity;
cameraPitch += mouseState.delta_position.y * cameraSensitivity;
cameraPitch = std::clamp(cameraPitch, -1.5f, 1.5f);
}
float cosPitch = std::cos(cameraPitch);
XMFLOAT3 forward(
std::sin(cameraYaw) * cosPitch,
-std::sin(cameraPitch),
std::cos(cameraYaw) * cosPitch
);
XMFLOAT3 right(std::cos(cameraYaw), 0.0f, -std::sin(cameraYaw));
float speed = cameraSpeed * dt;
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f;
if (wi::input::Down(KEY_W)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; }
if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; }
if (wi::input::Down(KEY_A)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; }
if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; }
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed;
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed;
camera->Eye = cameraPos;
camera->At = forward;
camera->Up = XMFLOAT3(0, 1, 0);
camera->UpdateCamera();
}
void VoxelRenderPath::Update(float dt) {
auto frameStart = std::chrono::high_resolution_clock::now();
lastDt_ = dt;
float instantFps = (dt > 0.0f) ? (1.0f / dt) : 0.0f;
smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f;
if (camera) handleInput(dt);
windTime_ += dt;
renderer.windTime_ = windTime_;
// Animated terrain: regenerate at 60 Hz with time-shifted noise
// Fused: regenerate + pack voxel data in the same parallel pass
if (animatedTerrain_ && renderer.isInitialized()) {
animAccum_ += dt;
if (animAccum_ >= ANIM_INTERVAL) {
animAccum_ -= ANIM_INTERVAL;
animTime_ += ANIM_INTERVAL;
// Prepare pack cache for fused regenerate+pack
const uint32_t wordsPerChunk = CHUNK_VOLUME / 2;
uint32_t totalWords = (uint32_t)world.chunkCount() * wordsPerChunk;
renderer.packedVoxelCache_.resize(totalWords);
auto t0 = std::chrono::high_resolution_clock::now();
world.regenerateAnimated(animTime_,
renderer.packedVoxelCache_.data(), totalWords);
auto t1 = std::chrono::high_resolution_clock::now();
profRegenerate_.add(std::chrono::duration<float, std::milli>(t1 - t0).count());
renderer.voxelCacheDirty_ = false; // cache already filled by fused pack
renderer.gpuMeshDirty_ = true; // GPU still needs upload + dispatch
// Re-mesh smooth surfaces — GPU path or CPU fallback
if (renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
renderer.gpuSmoothMeshDirty_ = true; // will dispatch in Render()
} else {
// CPU fallback (Surface Nets) — parallelized
auto ts0 = std::chrono::high_resolution_clock::now();
std::vector<Chunk*> chunkPtrs;
world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
chunkPtrs.push_back(&chunk);
});
const VoxelWorld& worldRef = world;
wi::jobsystem::context ctx;
wi::jobsystem::Dispatch(ctx, (uint32_t)chunkPtrs.size(), 1,
[&chunkPtrs, &worldRef](wi::jobsystem::JobArgs args) {
uint32_t idx = args.jobIndex;
SmoothMesher::meshChunk(*chunkPtrs[idx], worldRef);
// Stamp chunkIndex during parallel pass (avoids sequential loop in upload)
for (auto& sv : chunkPtrs[idx]->smoothVertices)
sv.chunkIndex = (uint16_t)idx;
});
wi::jobsystem::Wait(ctx);
auto ts1 = std::chrono::high_resolution_clock::now();
profSmoothMesh_.add(std::chrono::duration<float, std::milli>(ts1 - ts0).count());
renderer.uploadSmoothDataFast(world);
auto ts2 = std::chrono::high_resolution_clock::now();
profSmoothUpload_.add(std::chrono::duration<float, std::milli>(ts2 - ts1).count());
}
// Re-collect toping instances — parallelized
{
auto tt0 = std::chrono::high_resolution_clock::now();
topingSystem.collectInstancesParallel(world);
auto tt1 = std::chrono::high_resolution_clock::now();
profTopingCollect_.add(std::chrono::duration<float, std::milli>(tt1 - tt0).count());
renderer.uploadTopingData(topingSystem);
auto tt2 = std::chrono::high_resolution_clock::now();
profTopingUpload_.add(std::chrono::duration<float, std::milli>(tt2 - tt1).count());
}
}
}
if (renderer.isInitialized()) {
auto t0 = std::chrono::high_resolution_clock::now();
renderer.updateMeshes(world);
auto t1 = std::chrono::high_resolution_clock::now();
profUpdateMeshes_.add(std::chrono::duration<float, std::milli>(t1 - t0).count());
}
RenderPath3D::Update(dt);
// Profiling: accumulate frame time (will be completed in Compose)
auto frameEnd = std::chrono::high_resolution_clock::now();
profFrame_.add(std::chrono::duration<float, std::milli>(frameEnd - frameStart).count());
// Log averages every 5 seconds
profTimer_ += dt;
if (profTimer_ >= PROF_INTERVAL) {
logProfilingAverages();
profTimer_ -= PROF_INTERVAL;
}
}
/// Records the voxel GPU work for this frame on a fresh command list: compute meshing
/// and acceleration-structure builds when needed, then the blocky, toping and smooth
/// passes, and finally the RT shadow pass.
void VoxelRenderPath::Render() const {
    RenderPath3D::Render();

    if (!renderer.isInitialized() || !camera || !rtCreated_) return;

    auto* device = wi::graphics::GetDevice();
    CommandList cmd = device->BeginCommandList();

    // ── GPU mesh path: only re-dispatch when voxel data changed ──
    if (renderer.gpuMeshEnabled_ && renderer.gpuMesherAvailable_) {
        const bool smoothGpuReady =
            renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid();

        // Always pick up the quad count written by the previous frame's dispatch.
        if (const uint32_t* quadCounter = (uint32_t*)renderer.meshCounterReadback_.mapped_data) {
            renderer.gpuMeshQuadCount_ = *quadCounter;
            renderer.totalQuads_ = renderer.gpuMeshQuadCount_;
        }

        // Only re-dispatch the compute mesher when data changed.
        if (renderer.gpuMeshDirty_) {
            renderer.dispatchGpuMesh(cmd, world,
                &profVoxelPack_, &profGpuUpload_, &profGpuDispatch_);
        }

        // Same readback for the smooth mesher's vertex count.
        if (renderer.smoothCounterReadback_.mapped_data) {
            const uint32_t* vertexCounter = (uint32_t*)renderer.smoothCounterReadback_.mapped_data;
            renderer.gpuSmoothVertexCount_ = *vertexCounter;
        }

        // GPU smooth mesh dispatch (uses same voxelDataBuffer_ already uploaded).
        if (renderer.gpuSmoothMeshDirty_ && smoothGpuReady) {
            renderer.dispatchGpuSmoothMesh(cmd, world);
        }

        // A zero count means the readback has not arrived yet (1-frame delay):
        // request another dispatch next frame.
        if (renderer.gpuSmoothVertexCount_ == 0 && smoothGpuReady) {
            renderer.gpuSmoothMeshDirty_ = true;
        }

        // Phase 6.1: BLAS extraction + acceleration structure build, when flagged
        // dirty or when the quad count no longer matches the extracted vertex count.
        const bool rtUsable = renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid();
        if (rtUsable &&
            renderer.gpuMeshQuadCount_ > 0 &&
            (renderer.rtDirty_ ||
             renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6)) {
            renderer.dispatchBLASExtract(cmd);
            renderer.buildAccelerationStructures(cmd);
        }
    }

    // ── GPU mesh benchmark state machine (CPU path only) ──
    if (!renderer.gpuMeshEnabled_) {
        switch (renderer.benchState_) {
            case VoxelRenderer::BenchState::DISPATCH:
                renderer.dispatchGpuMeshBenchmark(cmd, world);
                break;
            case VoxelRenderer::BenchState::READBACK:
                renderer.readbackGpuMeshBenchmark();
                break;
            default:
                break;
        }
    }

    using Clock = std::chrono::high_resolution_clock;
    const auto recordBegin = Clock::now();

    // Blocky voxels first (clears the targets) ...
    renderer.render(cmd, *camera, voxelDepth_, voxelRT_, voxelNormalRT_);
    // ... Phase 4: topings in their own pass, keeping the voxel output ...
    renderer.renderTopings(cmd, topingSystem, voxelDepth_, voxelRT_, voxelNormalRT_);
    // ... Phase 5: smooth surfaces in their own pass, keeping all prior output.
    renderer.renderSmooth(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);

    // Phase 6.2: RT shadows modulate voxelRT_ in place once all geometry is drawn.
    if (renderer.isRTShadowsEnabled() && renderer.isRTReady()) {
        renderer.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
    }

    const auto recordEnd = Clock::now();
    profRender_.add(std::chrono::duration<float, std::milli>(recordEnd - recordBegin).count());
}
// Posts one multi-line backlog message containing, for every profiling accumulator,
// its average in milliseconds and its sample count, then resets all accumulators so
// the next report starts from zero. Update() calls this once per PROF_INTERVAL.
void VoxelRenderPath::logProfilingAverages() const {
char msg[1024];
// The argument list below must stay in the same order as the rows of the format string.
snprintf(msg, sizeof(msg),
"=== PERF PROFILE (avg over %.0fs) ===\n"
" Regenerate: %7.2f ms (%u calls)\n"
" UpdateMeshes: %7.2f ms (%u calls)\n"
" VoxelPack: %7.2f ms (%u calls)\n"
" GPU Upload: %7.2f ms (%u calls)\n"
" GPU Dispatch: %7.2f ms (%u calls)\n"
" SmoothMesh: %7.2f ms (%u calls)\n"
" SmoothUpload: %7.2f ms (%u calls)\n"
" TopingCollect: %7.2f ms (%u calls)\n"
" TopingUpload: %7.2f ms (%u calls)\n"
" Render: %7.2f ms (%u calls)\n"
" Frame (Upd): %7.2f ms (%u calls, %.1f FPS)",
PROF_INTERVAL,
profRegenerate_.avg(), profRegenerate_.count,
profUpdateMeshes_.avg(), profUpdateMeshes_.count,
profVoxelPack_.avg(), profVoxelPack_.count,
profGpuUpload_.avg(), profGpuUpload_.count,
profGpuDispatch_.avg(), profGpuDispatch_.count,
profSmoothMesh_.avg(), profSmoothMesh_.count,
profSmoothUpload_.avg(), profSmoothUpload_.count,
profTopingCollect_.avg(), profTopingCollect_.count,
profTopingUpload_.avg(), profTopingUpload_.count,
profRender_.avg(), profRender_.count,
profFrame_.avg(), profFrame_.count,
// FPS figure is 1000 / average Update() time (profFrame_ is fed by Update());
// reported as 0 when no frame sample was recorded.
profFrame_.count > 0 ? (1000.0f / profFrame_.avg()) : 0.0f);
wi::backlog::post(msg);
// Start a fresh averaging window.
profRegenerate_.reset();
profUpdateMeshes_.reset();
profVoxelPack_.reset();
profGpuUpload_.reset();
profGpuDispatch_.reset();
profSmoothMesh_.reset();
profSmoothUpload_.reset();
profTopingCollect_.reset();
profTopingUpload_.reset();
profRender_.reset();
profFrame_.reset();
}
/// Final composition: runs the base-class compose, blits the voxel colour target over
/// the full screen, then draws the text HUD (FPS, chunk/quad/draw statistics, toping,
/// smooth and ray-tracing status, key bindings).
///
/// @param cmd  Command list to record the composition draws into.
void VoxelRenderPath::Compose(CommandList cmd) const {
    frameCount_++;
    RenderPath3D::Compose(cmd);

    // Opaque full-screen blit of the voxel colour target.
    if (rtCreated_ && voxelRT_.IsValid()) {
        wi::image::Params blit;
        blit.enableFullScreen();
        blit.blendFlag = wi::enums::BLENDMODE_OPAQUE;
        wi::image::Draw(&voxelRT_, blit, cmd);
    }

    // ── HUD overlay ──
    wi::font::Params hudFont;
    hudFont.posX = 10;
    hudFont.posY = 10;
    hudFont.size = 20;
    hudFont.color = wi::Color(255, 255, 255, 230);
    hudFont.shadowColor = wi::Color(0, 0, 0, 180);

    char fpsText[16];
    snprintf(fpsText, sizeof(fpsText), "%.1f", smoothFps_);
    char frameMsText[16];
    snprintf(frameMsText, sizeof(frameMsText), "%.2f", lastDt_ * 1000.0f);

    std::string hud = "BVLE Voxel Engine (Phase 6 — Ray Tracing)\n";

    hud += "FPS: ";
    hud += fpsText;
    hud += " (";
    hud += frameMsText;
    hud += " ms)\n";

    if (debugMode) {
        hud += "=== DEBUG FACE MODE ===\n";
        hud += "+X=Red -X=DkRed +Y=Green -Y=DkGreen +Z=Blue -Z=DkBlue\n";
    }

    hud += "Chunks: ";
    hud += std::to_string(renderer.getVisibleChunks());
    hud += "/";
    hud += std::to_string(renderer.getChunkCount());
    hud += "\n";

    hud += "Quads: ";
    hud += std::to_string(renderer.getTotalQuads());
    hud += "\n";

    // Human-readable name of the active draw strategy, most capable path first.
    const char* drawStrategy = "CPU greedy + DrawInstanced + CPU cull";
    if (renderer.isGpuMeshEnabled()) {
        drawStrategy = "GPU mesh (1x1) + DrawInstanced";
    } else if (renderer.isGpuCulling()) {
        drawStrategy = "CPU greedy + MDI + GPU cull";
    } else if (renderer.isMdiEnabled()) {
        drawStrategy = "CPU greedy + MDI + CPU cull";
    }

    hud += "Draw Calls: ";
    hud += std::to_string(renderer.getDrawCalls());
    hud += " (";
    hud += drawStrategy;
    hud += ")\n";

    if (renderer.isGpuMeshEnabled()) {
        hud += "GPU Mesh Quads: ";
        hud += std::to_string(renderer.getGpuMeshQuadCount());
        hud += "\n";
    } else {
        // GPU timings are shown only on the CPU-meshed paths.
        char cullMsText[16];
        char drawMsText[16];
        snprintf(cullMsText, sizeof(cullMsText), "%.3f", renderer.getGpuCullTimeMs());
        snprintf(drawMsText, sizeof(drawMsText), "%.3f", renderer.getGpuDrawTimeMs());
        hud += "GPU Cull: ";
        hud += cullMsText;
        hud += " ms | Draw: ";
        hud += drawMsText;
        hud += " ms\n";
    }

    hud += "Topings: ";
    hud += std::to_string(topingSystem.getInstanceCount());
    hud += " instances, ";
    hud += std::to_string(renderer.getTopingDrawCalls());
    hud += " draws (";
    hud += std::to_string(topingSystem.getDefCount());
    hud += " types)\n";

    if (renderer.getSmoothVertexCount() > 0) {
        hud += "Smooth: ";
        hud += std::to_string(renderer.getSmoothVertexCount());
        hud += " verts (";
        hud += std::to_string(renderer.getSmoothVertexCount() / 3);
        hud += " tris), ";
        hud += std::to_string(renderer.getSmoothDrawCalls());
        hud += " draws\n";
    }

    // Debug takes precedence over plain ON in both shadow labels.
    const bool shadowsOn = renderer.isRTShadowsEnabled();

    if (!renderer.isRTAvailable()) {
        hud += "RT: not available\n";
    } else if (!renderer.isRTReady()) {
        hud += "RT: building...\n";
    } else {
        hud += "RT: TLAS ready | Blocky ";
        hud += std::to_string(renderer.getRTBlockyTriCount());
        hud += " tris | Smooth ";
        hud += std::to_string(renderer.getRTSmoothTriCount());
        hud += " tris";
        hud += " | Shadows ";
        hud += renderer.rtShadowDebug_ ? "DEBUG" : (shadowsOn ? "ON" : "OFF");
        hud += "\n";
    }

    hud += "WASD+Space/Ctrl: move | Shift: fast | Right-click: capture mouse\n";

    hud += "F2: console | F3: anim [";
    hud += animatedTerrain_ ? "ON" : "OFF";
    hud += "] | F4: dbg [";
    hud += renderer.debugBlend_ ? "ON" : "OFF";
    hud += "] | F5: shadows [";
    hud += renderer.rtShadowDebug_ ? "DBG" : (shadowsOn ? "ON" : "OFF");
    hud += "]";

    wi::font::Draw(hud, hudFont, cmd);
}
} // namespace voxel