#include "VoxelRenderer.h"
#include "wiJobSystem.h"
#include "wiPrimitive.h"

// NOTE(review): the names of the four system headers were lost when this file
// was extracted (angle-bracket contents stripped). The list below is the set of
// standard headers this translation unit visibly uses — confirm against VCS.
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

using namespace wi::graphics;

namespace voxel {

// ── VoxelRenderer Implementation ────────────────────────────────

VoxelRenderer::VoxelRenderer() = default;

VoxelRenderer::~VoxelRenderer() {
    shutdown();
}

/// Creates the pipeline, the procedural textures and every GPU buffer used by
/// the renderer. Leaves `initialized_` false when the pipeline cannot be built;
/// returns without touching `initialized_` when `dev` is null.
void VoxelRenderer::initialize(GraphicsDevice* dev) {
    device_ = dev;
    if (!device_) return;

    createPipeline();
    if (!pso_.IsValid()) {
        wi::backlog::post("VoxelRenderer: pipeline creation failed", wi::backlog::LogLevel::Error);
        initialized_ = false;
        return;
    }
    generateTextures();

    // Create mega quad buffer (SRV for vertex pulling)
    GPUBufferDesc megaDesc;
    megaDesc.size = MEGA_BUFFER_CAPACITY * sizeof(PackedQuad);
    megaDesc.bind_flags = BindFlag::SHADER_RESOURCE;
    megaDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
    megaDesc.stride = sizeof(PackedQuad);
    megaDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&megaDesc, nullptr, &megaQuadBuffer_);

    // Create chunk info buffer (SRV for VS chunk lookup)
    GPUBufferDesc infoDesc;
    infoDesc.size = MAX_CHUNKS * sizeof(GPUChunkInfo);
    infoDesc.bind_flags = BindFlag::SHADER_RESOURCE;
    infoDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
    infoDesc.stride = sizeof(GPUChunkInfo);
    infoDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&infoDesc, nullptr, &chunkInfoBuffer_);

    // Create indirect args buffer (for DrawInstancedIndirectCount, up to 6 draws per chunk)
    // UAV bind flag needed for GPU cull compute shader to write args
    GPUBufferDesc argsDesc;
    argsDesc.size = MAX_DRAWS * sizeof(IndirectDrawArgs);
    argsDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
    argsDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED | ResourceMiscFlag::INDIRECT_ARGS;
    argsDesc.stride = sizeof(IndirectDrawArgs);
    argsDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&argsDesc, nullptr, &indirectArgsBuffer_);

    // Create draw count buffer (single uint32, raw for RWByteAddressBuffer)
    // UAV bind flag needed for GPU cull compute shader atomic counter
    GPUBufferDesc countDesc;
    countDesc.size = sizeof(uint32_t);
    countDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
    countDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW | ResourceMiscFlag::INDIRECT_ARGS;
    countDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&countDesc, nullptr, &drawCountBuffer_);

    // ── GPU Timestamp Queries ──────────────────────────────────────
    GPUQueryHeapDesc queryDesc;
    queryDesc.type = GpuQueryType::TIMESTAMP;
    queryDesc.query_count = TS_COUNT;
    device_->CreateQueryHeap(&queryDesc, &timestampHeap_);

    GPUBufferDesc readbackDesc;
    readbackDesc.size = TS_COUNT * sizeof(uint64_t);
    readbackDesc.usage = Usage::READBACK;
    device_->CreateBuffer(&readbackDesc, nullptr, &timestampReadback_);

    // ── GPU Compute Mesher resources ─────────────────────────────
    wi::renderer::LoadShader(ShaderStage::CS, meshShader_, "voxel/voxelMeshCS.cso");
    gpuMesherAvailable_ = meshShader_.IsValid();

    if (gpuMesherAvailable_) {
        // Voxel data buffer: 1 chunk's worth (32^3 voxels / 2 per uint = 16384 uint)
        GPUBufferDesc voxDesc;
        voxDesc.size = (CHUNK_VOLUME / 2) * sizeof(uint32_t);
        voxDesc.bind_flags = BindFlag::SHADER_RESOURCE;
        voxDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        voxDesc.stride = sizeof(uint32_t);
        voxDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&voxDesc, nullptr, &voxelDataBuffer_);

        // GPU quad output: same capacity as mega-buffer
        GPUBufferDesc gpuQDesc;
        gpuQDesc.size = MEGA_BUFFER_CAPACITY * sizeof(uint64_t); // PackedQuad = 8 bytes
        gpuQDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
        gpuQDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
        gpuQDesc.stride = sizeof(uint64_t); // uint2 = 8 bytes
        gpuQDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&gpuQDesc, nullptr, &gpuQuadBuffer_);

        // Quad counter
        GPUBufferDesc cntDesc;
        cntDesc.size = sizeof(uint32_t);
        cntDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
        cntDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW;
        cntDesc.usage = Usage::DEFAULT;
        device_->CreateBuffer(&cntDesc, nullptr, &gpuQuadCounter_);

        // Readback buffer for quad counter (GPU → CPU)
        GPUBufferDesc rbDesc;
        rbDesc.size = sizeof(uint32_t);
        rbDesc.usage = Usage::READBACK;
        device_->CreateBuffer(&rbDesc, nullptr, &meshCounterReadback_);

        wi::backlog::post("VoxelRenderer: GPU compute mesher available");
    } else {
        wi::backlog::post("VoxelRenderer: GPU compute mesher not available", wi::backlog::LogLevel::Warning);
    }

    cpuMegaQuads_.reserve(MEGA_BUFFER_CAPACITY);
    cpuChunkInfo_.reserve(MAX_CHUNKS);
    chunkSlots_.reserve(MAX_CHUNKS);
    cpuIndirectArgs_.reserve(MAX_CHUNKS);

    initialized_ = true;
    wi::backlog::post("VoxelRenderer: initialized (mega-buffer: " +
        std::to_string(MEGA_BUFFER_CAPACITY) + " quads capacity)");
}

/// Drops the CPU-side chunk bookkeeping and marks the renderer uninitialized.
void VoxelRenderer::shutdown() {
    chunkSlots_.clear();
    cpuChunkInfo_.clear();
    cpuMegaQuads_.clear();
    cpuIndirectArgs_.clear();
    // render() indexes chunkSlots_/cpuChunkInfo_ by chunkCount_, so the counters
    // must not outlive the vectors they describe (matters on re-initialize).
    chunkCount_ = 0;
    totalQuads_ = 0;
    initialized_ = false;
}

/// Creates the per-frame constant buffer and sampler, loads the VS/PS/cull
/// shaders and builds the graphics pipeline. `pso_` is left untouched when the
/// vertex or pixel shader fails to load; a missing cull shader only disables
/// GPU culling.
void VoxelRenderer::createPipeline() {
    // Constant buffer for per-frame data
    GPUBufferDesc cbDesc;
    cbDesc.size = sizeof(VoxelConstants);
    cbDesc.bind_flags = BindFlag::CONSTANT_BUFFER;
    cbDesc.usage = Usage::DEFAULT;
    device_->CreateBuffer(&cbDesc, nullptr, &constantBuffer_);

    // Anisotropic wrap sampler
    SamplerDesc samplerDesc;
    samplerDesc.filter = Filter::ANISOTROPIC;
    samplerDesc.address_u = TextureAddressMode::WRAP;
    samplerDesc.address_v = TextureAddressMode::WRAP;
    samplerDesc.address_w = TextureAddressMode::WRAP;
    samplerDesc.max_anisotropy = 16;
    device_->CreateSampler(&samplerDesc, &sampler_);

    // Load shaders
    wi::renderer::LoadShader(ShaderStage::VS, vertexShader_, "voxel/voxelVS.cso");
    wi::renderer::LoadShader(ShaderStage::PS, pixelShader_, "voxel/voxelPS.cso");
    wi::renderer::LoadShader(ShaderStage::CS, cullShader_, "voxel/voxelCullCS.cso");

    if (!vertexShader_.IsValid() || !pixelShader_.IsValid()) {
        wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error);
        return;
    }

    if (cullShader_.IsValid()) {
        gpuCullingEnabled_ = true;
        wi::backlog::post("VoxelRenderer: GPU cull compute shader enabled");
    } else {
        gpuCullingEnabled_ = false;
        wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU fallback",
            wi::backlog::LogLevel::Warning);
    }

    // Pipeline: backface cull, depth test, opaque blend, triangle list
    PipelineStateDesc psoDesc;
    psoDesc.vs = &vertexShader_;
    psoDesc.ps = &pixelShader_;
    psoDesc.rs = wi::renderer::GetRasterizerState(wi::enums::RSTYPE_FRONT);
    psoDesc.dss = wi::renderer::GetDepthStencilState(wi::enums::DSSTYPE_DEFAULT);
    psoDesc.bs = wi::renderer::GetBlendState(wi::enums::BSTYPE_OPAQUE);
    psoDesc.pt = PrimitiveTopology::TRIANGLELIST;
    device_->CreatePipelineState(&psoDesc, &pso_);
}

// ── Procedural texture generation ───────────────────────────────

/// Fills a w*h RGBA8 image (`pixels` must hold w*h*4 bytes).
/// RGB is a blend between (r0,g0,b0) and (r1,g1,b1) driven by LCG noise mixed
/// with a sin/cos pattern; alpha carries a heightmap generated from a second,
/// independently seeded LCG stream, scaled by `heightContrast` and clamped to
/// [0,1] before quantization.
static void generateNoiseTexture(uint8_t* pixels, int w, int h,
                                 uint8_t r0, uint8_t g0, uint8_t b0,
                                 uint8_t r1, uint8_t g1, uint8_t b1,
                                 uint32_t seed,
                                 float heightFreq = 1.0f, float heightContrast = 1.0f) {
    uint32_t s = seed;
    uint32_t s2 = seed * 7919u + 104729u; // separate seed for heightmap
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            s = s * 1664525u + 1013904223u;
            float noise = (float)(s & 0xFFFF) / 65535.0f;
            float fx = (float)x / w;
            float fy = (float)y / h;
            float pattern = 0.5f + 0.5f * std::sin(fx * 20.0f + noise * 3.0f)
                                        * std::cos(fy * 20.0f + noise * 3.0f);
            float t = noise * 0.6f + pattern * 0.4f;

            int idx = (y * w + x) * 4;
            pixels[idx + 0] = (uint8_t)(r0 + (r1 - r0) * t);
            pixels[idx + 1] = (uint8_t)(g0 + (g1 - g0) * t);
            pixels[idx + 2] = (uint8_t)(b0 + (b1 - b0) * t);

            // Heightmap in alpha: separate noise for height-based material blending
            s2 = s2 * 1664525u + 1013904223u;
            float hn = (float)(s2 & 0xFFFF) / 65535.0f;
            float hPattern = 0.5f + 0.5f * std::sin(fx * 12.0f * heightFreq + hn * 2.0f)
                                         * std::cos(fy * 12.0f * heightFreq + hn * 2.0f);
            float heightVal = hn * 0.5f + hPattern * 0.5f;
            heightVal = std::clamp(heightVal * heightContrast, 0.0f, 1.0f);
            pixels[idx + 3] = (uint8_t)(heightVal * 255.0f);
        }
    }
}

/// Generates one 256x256 RGBA8 noise texture per material and uploads them as
/// a single-mip 2D texture array into `textureArray_`.
void VoxelRenderer::generateTextures() {
    const int TEX_SIZE = 256;
    const int NUM_MATERIALS = 5;

    std::vector<uint8_t> allPixels(TEX_SIZE * TEX_SIZE * 4 * NUM_MATERIALS);

    struct MatColor {
        uint8_t r0, g0, b0, r1, g1, b1;
        uint32_t seed;
        float heightFreq;     // heightmap noise frequency
        float heightContrast; // heightmap contrast (higher = more defined peaks)
    };
    MatColor colors[NUM_MATERIALS] = {
        {  60, 140,  40,  80, 180,  60, 101, 1.5f, 0.8f }, // Grass: medium bumps
        { 100,  70,  40, 140, 100,  60, 202, 0.8f, 0.6f }, // Dirt: smooth mounds
        {  80,  80,  90, 120, 120, 130, 303, 2.5f, 0.5f }, // Stone: darker blue-gray, moderate height (was 1.2, lowered so neighbors bleed onto it more)
        { 220, 200, 130, 245, 230, 160, 404, 3.0f, 0.4f }, // Sand: warmer yellow, fine
        { 220, 225, 230, 245, 248, 252, 505, 1.0f, 0.5f }, // Snow: smooth, soft
    };

    for (int i = 0; i < NUM_MATERIALS; i++) {
        auto& c = colors[i];
        generateNoiseTexture(
            allPixels.data() + i * TEX_SIZE * TEX_SIZE * 4,
            TEX_SIZE, TEX_SIZE,
            c.r0, c.g0, c.b0, c.r1, c.g1, c.b1, c.seed,
            c.heightFreq, c.heightContrast
        );
    }

    TextureDesc texDesc;
    texDesc.type = TextureDesc::Type::TEXTURE_2D;
    texDesc.width = TEX_SIZE;
    texDesc.height = TEX_SIZE;
    texDesc.array_size = NUM_MATERIALS;
    texDesc.mip_levels = 1;
    texDesc.format = Format::R8G8B8A8_UNORM;
    texDesc.bind_flags = BindFlag::SHADER_RESOURCE;
    texDesc.usage = Usage::DEFAULT;

    // One subresource per array slice, each pointing at its material's pixels.
    std::vector<SubresourceData> subData(NUM_MATERIALS);
    for (int i = 0; i < NUM_MATERIALS; i++) {
        subData[i].data_ptr = allPixels.data() + i * TEX_SIZE * TEX_SIZE * 4;
        subData[i].row_pitch = TEX_SIZE * 4;
        subData[i].slice_pitch = TEX_SIZE * TEX_SIZE * 4;
    }

    device_->CreateTexture(&texDesc, subData.data(), &textureArray_);
}

// ── Mega-buffer rebuild ─────────────────────────────────────────
// Packs all chunk quads contiguously into a single buffer.
// Simple strategy: full rebuild whenever any chunk is dirty.
// Chunks that would exceed MEGA_BUFFER_CAPACITY quads or MAX_CHUNKS entries
// are skipped, so the CPU arrays never outgrow the GPU buffers they feed.

void VoxelRenderer::rebuildMegaBuffer(VoxelWorld& world) {
    cpuMegaQuads_.clear();
    chunkSlots_.clear();
    cpuChunkInfo_.clear();

    uint32_t offset = 0;
    float debugFlag = debugFaceColors_ ? 1.0f : 0.0f;

    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        if (chunk.quadCount == 0) return;
        if (offset + chunk.quadCount > MEGA_BUFFER_CAPACITY) return; // overflow guard
        if (chunkSlots_.size() >= MAX_CHUNKS) return;                // chunkInfoBuffer_ holds MAX_CHUNKS entries

        ChunkSlot slot;
        slot.pos = pos;
        slot.quadOffset = offset;
        slot.quadCount = chunk.quadCount;
        chunkSlots_.push_back(slot);

        GPUChunkInfo info = {};
        info.worldPos = XMFLOAT4(
            (float)(pos.x * CHUNK_SIZE),
            (float)(pos.y * CHUNK_SIZE),
            (float)(pos.z * CHUNK_SIZE),
            debugFlag
        );
        info.quadOffset = offset;
        info.quadCount = chunk.quadCount;
        for (int f = 0; f < 6; f++) {
            info.faceOffsets[f] = chunk.faceOffsets[f];
            info.faceCounts[f] = chunk.faceCounts[f];
        }
        cpuChunkInfo_.push_back(info);

        cpuMegaQuads_.insert(cpuMegaQuads_.end(), chunk.quads.begin(), chunk.quads.end());
        offset += chunk.quadCount;
    });

    chunkCount_ = (uint32_t)chunkSlots_.size();
    totalQuads_ = offset;
}

// Build chunkInfoBuffer without CPU meshing (for GPU mesh path).
// Every chunk gets a slot; quad offsets/counts and per-face data stay zero.
void VoxelRenderer::rebuildChunkInfoOnly(VoxelWorld& world) {
    chunkSlots_.clear();
    cpuChunkInfo_.clear();

    float debugFlag = debugFaceColors_ ? 1.0f : 0.0f;

    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        (void)chunk; // only the position is needed here

        ChunkSlot slot;
        slot.pos = pos;
        slot.quadOffset = 0; // not used in GPU mesh path
        slot.quadCount = 0;
        chunkSlots_.push_back(slot);

        GPUChunkInfo info = {};
        info.worldPos = XMFLOAT4(
            (float)(pos.x * CHUNK_SIZE),
            (float)(pos.y * CHUNK_SIZE),
            (float)(pos.z * CHUNK_SIZE),
            debugFlag
        );
        info.quadOffset = 0;
        info.quadCount = 0;
        cpuChunkInfo_.push_back(info);
    });

    chunkCount_ = (uint32_t)chunkSlots_.size();
}

/// Brings the renderer's chunk data in sync with `world`.
/// GPU mesh path: clears the dirty flags and rebuilds chunk info only.
/// CPU path: re-meshes dirty chunks in parallel, then rebuilds the mega-buffer.
void VoxelRenderer::updateMeshes(VoxelWorld& world) {
    if (!device_) return;

    // GPU mesh path: skip CPU meshing entirely, just rebuild chunk info
    if (gpuMeshEnabled_ && gpuMesherAvailable_) {
        bool anyDirty = false;
        world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
            (void)pos;
            if (chunk.dirty) {
                anyDirty = true;
                chunk.dirty = false;
            }
        });

        if (anyDirty || megaBufferDirty_) {
            rebuildChunkInfoOnly(world);
            // If cache wasn't already filled by fused regen+pack, mark for repack
            if (!gpuMeshDirty_) {
                // Non-fused dirty (e.g. initial load): need both repack and GPU update
                voxelCacheDirty_ = true;
                gpuMeshDirty_ = true;
            }
            // else: fused path already set gpuMeshDirty_=true, cache is clean
            chunkInfoDirty_ = true;
            megaBufferDirty_ = false;
        }
        return;
    }

    // CPU meshing path (fallback)
    // Collect dirty chunks for parallel meshing
    std::vector<Chunk*> dirtyChunks;
    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
        (void)pos;
        if (chunk.dirty) dirtyChunks.push_back(&chunk);
    });

    const bool anyDirty = !dirtyChunks.empty();

    if (anyDirty) {
        // Parallel CPU greedy meshing via wi::jobsystem (one job per dirty chunk)
        const auto cpuStart = std::chrono::high_resolution_clock::now();

        wi::jobsystem::context ctx;
        wi::jobsystem::Dispatch(ctx, (uint32_t)dirtyChunks.size(), 1,
            [&dirtyChunks, &world](wi::jobsystem::JobArgs args) {
                VoxelMesher::meshChunk(*dirtyChunks[args.jobIndex], world);
            });
        wi::jobsystem::Wait(ctx);

        const auto cpuEnd = std::chrono::high_resolution_clock::now();
        // NOTE(review): duration template arguments were lost in extraction;
        // float milliseconds assumed from the member name and its "%.2f ms" log.
        cpuMeshTimeMs_ = std::chrono::duration<float, std::milli>(cpuEnd - cpuStart).count();

        // Trigger GPU benchmark on next render frame
        if (gpuMesherAvailable_ && benchState_ == BenchState::IDLE) {
            benchState_ = BenchState::DISPATCH;
        }
    }

    if (anyDirty || megaBufferDirty_) {
        rebuildMegaBuffer(world);
        megaBufferDirty_ = false;
    }
}

// ── GPU Mesh Benchmark (Phase 2.4) ──────────────────────────────
// Dispatches the baseline 1x1 GPU mesher for ALL chunks and measures timing.
// State machine: DISPATCH (frame N) → READBACK (frame N+1) → DONE.
void VoxelRenderer::dispatchGpuMeshBenchmark(CommandList cmd, const VoxelWorld& world) const { auto* dev = device_; // Zero the quad counter uint32_t zero = 0; dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t)); // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer GPUBarrier preBarriers[] = { GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), }; dev->Barrier(preBarriers, 2, cmd); dev->BindComputeShader(&meshShader_, cmd); // GPU timestamp: mesh begin dev->QueryEnd(×tampHeap_, TS_MESH_BEGIN, cmd); // Dispatch for each chunk uint32_t chunkIdx = 0; world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) { // Pack voxel data: 32^3 voxels → 16384 uint32s (2 voxels per uint) std::vector packed(CHUNK_VOLUME / 2, 0); for (int i = 0; i < CHUNK_VOLUME; i++) { uint32_t v = chunk.voxels[i].packed; if (i & 1) packed[i >> 1] |= (v << 16); else packed[i >> 1] = v; } // Upload voxel data (re-uses the single-chunk buffer) dev->UpdateBuffer(&voxelDataBuffer_, packed.data(), cmd, packed.size() * sizeof(uint32_t)); // Bind resources (after BindComputeShader, so PushConstants targets compute) dev->BindResource(&voxelDataBuffer_, 0, cmd); dev->BindUAV(&gpuQuadBuffer_, 0, cmd); dev->BindUAV(&gpuQuadCounter_, 1, cmd); // Push constants for this chunk struct MeshPush { uint32_t chunkIndex; uint32_t voxelBufferOffset; uint32_t quadBufferOffset; uint32_t maxOutputQuads; uint32_t pad[8]; }; MeshPush pushData = {}; pushData.chunkIndex = chunkIdx; pushData.voxelBufferOffset = 0; // single-chunk buffer, always at offset 0 pushData.quadBufferOffset = 0; // all chunks share global atomic counter pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY; dev->PushConstants(&pushData, sizeof(pushData), cmd); // Dispatch: 32/8 = 4 groups per axis → 64 groups total dev->Dispatch(4, 4, 4, cmd); chunkIdx++; }); // GPU timestamp: mesh end 
dev->QueryEnd(×tampHeap_, TS_MESH_END, cmd); // Copy quad counter to readback buffer GPUBarrier postBarrier = GPUBarrier::Buffer( &gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC); dev->Barrier(&postBarrier, 1, cmd); dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd); // Resolve timestamps dev->QueryResolve(×tampHeap_, TS_MESH_BEGIN, 2, ×tampReadback_, TS_MESH_BEGIN * sizeof(uint64_t), cmd); benchState_ = BenchState::READBACK; } void VoxelRenderer::readbackGpuMeshBenchmark() const { // Read quad count from readback buffer uint32_t* countData = (uint32_t*)meshCounterReadback_.mapped_data; if (countData) { gpuBaselineQuads_ = *countData; } // Read GPU mesh timestamps uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data; if (tsData) { double freq = (double)device_->GetTimestampFrequency(); if (freq > 0.0 && tsData[TS_MESH_END] > tsData[TS_MESH_BEGIN]) { gpuMeshTimeMs_ = (float)((double)(tsData[TS_MESH_END] - tsData[TS_MESH_BEGIN]) / freq * 1000.0); } } // Log benchmark results char msg[256]; snprintf(msg, sizeof(msg), "=== MESH BENCHMARK ===\n" " CPU greedy: %.2f ms, %u quads (%u chunks)\n" " GPU baseline: %.3f ms, %u quads (1x1, no merge)\n" " Ratio quads: %.1fx more (GPU baseline vs CPU greedy)", cpuMeshTimeMs_, totalQuads_, chunkCount_, gpuMeshTimeMs_, gpuBaselineQuads_, totalQuads_ > 0 ? (float)gpuBaselineQuads_ / totalQuads_ : 0.0f); wi::backlog::post(msg); benchState_ = BenchState::DONE; } // ── GPU Mesh Dispatch (production path) ───────────────────────── // Dispatches GPU mesher for ALL chunks every frame. Replaces CPU greedy meshing. // Uses the atomic quad counter for 1-frame-delayed readback of total quad count. 
void VoxelRenderer::dispatchGpuMesh(CommandList cmd, const VoxelWorld& world, ProfileAccum* profPack, ProfileAccum* profUpload, ProfileAccum* profDispatch) const { auto* dev = device_; // Zero the quad counter uint32_t zero = 0; dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t)); // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer GPUBarrier preBarriers[] = { GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), }; dev->Barrier(preBarriers, 2, cmd); dev->BindComputeShader(&meshShader_, cmd); // Pack and upload all chunks' voxel data // Each chunk = 32^3/2 = 16384 uint32 (two voxels per uint) const uint32_t wordsPerChunk = CHUNK_VOLUME / 2; uint32_t totalWords = chunkCount_ * wordsPerChunk; // Resize voxel data buffer if needed if (totalWords > voxelDataCapacity_) { voxelDataCapacity_ = totalWords; GPUBufferDesc voxDesc; voxDesc.size = totalWords * sizeof(uint32_t); voxDesc.bind_flags = BindFlag::SHADER_RESOURCE; voxDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED; voxDesc.stride = sizeof(uint32_t); voxDesc.usage = Usage::DEFAULT; dev->CreateBuffer(&voxDesc, nullptr, const_cast(&voxelDataBuffer_)); } // Pack voxel data — use cached copy, only update when dirty. // VoxelData is exactly uint16_t, so voxels[] is a packed uint16 array. // Two consecutive uint16 = one uint32 → direct memcpy, no bit manipulation. 
static_assert(sizeof(VoxelData) == sizeof(uint16_t), "VoxelData must be 2 bytes for direct memcpy to GPU buffer"); auto tPack0 = std::chrono::high_resolution_clock::now(); if (voxelCacheDirty_) { packedVoxelCache_.resize(totalWords); uint32_t chunkI = 0; world.forEachChunk([&](const ChunkPos& pos, const Chunk& chunk) { std::memcpy( packedVoxelCache_.data() + chunkI * wordsPerChunk, chunk.voxels, wordsPerChunk * sizeof(uint32_t) // = CHUNK_VOLUME * 2 bytes ); chunkI++; }); voxelCacheDirty_ = false; } auto tPack1 = std::chrono::high_resolution_clock::now(); if (profPack) profPack->add(std::chrono::duration(tPack1 - tPack0).count()); // Upload all voxel data at once auto tUpload0 = std::chrono::high_resolution_clock::now(); dev->UpdateBuffer(&voxelDataBuffer_, packedVoxelCache_.data(), cmd, totalWords * sizeof(uint32_t)); auto tUpload1 = std::chrono::high_resolution_clock::now(); if (profUpload) profUpload->add(std::chrono::duration(tUpload1 - tUpload0).count()); // Bind resources (shared across all chunk dispatches) dev->BindResource(&voxelDataBuffer_, 0, cmd); dev->BindUAV(&gpuQuadBuffer_, 0, cmd); dev->BindUAV(&gpuQuadCounter_, 1, cmd); // Dispatch for each chunk struct MeshPush { uint32_t chunkIndex; uint32_t voxelBufferOffset; uint32_t quadBufferOffset; uint32_t maxOutputQuads; uint32_t pad[8]; }; auto tDisp0 = std::chrono::high_resolution_clock::now(); uint32_t chunkIdx = 0; world.forEachChunk([&](const ChunkPos& pos, const Chunk& chunk) { MeshPush pushData = {}; pushData.chunkIndex = chunkIdx; pushData.voxelBufferOffset = chunkIdx * wordsPerChunk; pushData.quadBufferOffset = 0; // global atomic counter handles offsets pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY; dev->PushConstants(&pushData, sizeof(pushData), cmd); // Dispatch: 32/8 = 4 groups per axis → 64 groups per chunk dev->Dispatch(4, 4, 4, cmd); chunkIdx++; }); auto tDisp1 = std::chrono::high_resolution_clock::now(); if (profDispatch) profDispatch->add(std::chrono::duration(tDisp1 - tDisp0).count()); 
// Barriers: UAV → COPY_SRC for counter readback, UAV → SRV for quad buffer (rendering) GPUBarrier postBarriers[] = { GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC), GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE), }; dev->Barrier(postBarriers, 2, cmd); // Copy quad counter to readback buffer (result available next frame) dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd); totalQuads_ = gpuMeshQuadCount_; // display previous frame's count in HUD gpuMeshDirty_ = false; } // ── Frustum plane extraction (Gribb-Hartmann method) ──────────── static void extractFrustumPlanes(const XMMATRIX& vp, XMFLOAT4 planes[6]) { XMFLOAT4X4 m; XMStoreFloat4x4(&m, vp); // Left planes[0] = XMFLOAT4(m._14 + m._11, m._24 + m._21, m._34 + m._31, m._44 + m._41); // Right planes[1] = XMFLOAT4(m._14 - m._11, m._24 - m._21, m._34 - m._31, m._44 - m._41); // Bottom planes[2] = XMFLOAT4(m._14 + m._12, m._24 + m._22, m._34 + m._32, m._44 + m._42); // Top planes[3] = XMFLOAT4(m._14 - m._12, m._24 - m._22, m._34 - m._32, m._44 - m._42); // Near planes[4] = XMFLOAT4(m._13, m._23, m._33, m._43); // Far planes[5] = XMFLOAT4(m._14 - m._13, m._24 - m._23, m._34 - m._33, m._44 - m._43); // Normalize each plane for (int i = 0; i < 6; i++) { float len = std::sqrt(planes[i].x * planes[i].x + planes[i].y * planes[i].y + planes[i].z * planes[i].z); if (len > 0.0001f) { planes[i].x /= len; planes[i].y /= len; planes[i].z /= len; planes[i].w /= len; } } } // ── Render pass ───────────────────────────────────────────────── void VoxelRenderer::render( CommandList cmd, const wi::scene::CameraComponent& camera, const Texture& depthBuffer, const Texture& renderTarget ) const { if (!initialized_ || chunkCount_ == 0 || !pso_.IsValid()) return; auto* dev = device_; // ── GPU Mesh path: quads already dispatched in Render(), just draw ── if (gpuMeshEnabled_ && gpuMesherAvailable_) { // 
Upload chunk info only when chunks changed if (!cpuChunkInfo_.empty() && chunkInfoDirty_) { dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd, cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); chunkInfoDirty_ = false; } // Per-frame constants VoxelConstants cb = {}; XMMATRIX vpMatrix = camera.GetViewProjection(); XMStoreFloat4x4(&cb.viewProjection, vpMatrix); cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f); cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f); cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f); cb.chunkSize = (float)CHUNK_SIZE; cb.textureTiling = 0.25f; cb.blendEnabled = 1.0f; // Phase 3: PS-based blending enabled in GPU mesh path cb.debugBlend = debugBlend_ ? 1.0f : 0.0f; cb.chunkCount = chunkCount_; // Per-material blend flags (bit N = material N): // canBleed: material can overflow visually onto adjacent voxels // resistBleed: adjacent materials cannot overflow onto this material // Material IDs: 1=Grass, 2=Dirt, 3=Stone, 4=Sand, 5=Snow cb.bleedMask = (1u << 1) | (1u << 2) | (1u << 4) | (1u << 5); // Grass, Dirt, Sand, Snow can bleed (NOT Stone) cb.resistBleedMask = (1u << 1); // Grass resists bleed (she bleeds onto others, not the reverse) cb._cullPad2 = 0; dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb)); // Render pass RenderPassImage rp[] = { RenderPassImage::RenderTarget( &renderTarget, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::SHADER_RESOURCE, ResourceState::SHADER_RESOURCE ), RenderPassImage::DepthStencil( &depthBuffer, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL ), }; dev->RenderPassBegin(rp, 2, cmd); Viewport vp; vp.width = (float)renderTarget.GetDesc().width; vp.height = (float)renderTarget.GetDesc().height; vp.min_depth = 0.0f; vp.max_depth = 1.0f; dev->BindViewports(1, &vp, cmd); Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; 
dev->BindScissorRects(1, &scissor, cmd); dev->BindPipelineState(&pso_, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindResource(&gpuQuadBuffer_, 0, cmd); // GPU quads, not mega-buffer dev->BindResource(&textureArray_, 1, cmd); dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindResource(&voxelDataBuffer_, 3, cmd); // Phase 3: voxel data for PS neighbor lookups dev->BindSampler(&sampler_, 0, cmd); // GPU mesh mode: flags=2, MUST be after BindPipelineState struct VoxelPush { uint32_t chunkIndex; uint32_t quadOffset; uint32_t flags; uint32_t pad[9]; }; VoxelPush pushData = {}; pushData.flags = 2; // GPU mesh mode pushData.quadOffset = 0; dev->PushConstants(&pushData, sizeof(pushData), cmd); // Draw using previous frame's quad count (1-frame delay) if (gpuMeshQuadCount_ > 0) { dev->DrawInstanced(gpuMeshQuadCount_ * 6, 1, 0, 0, cmd); drawCalls_ = 1; } dev->RenderPassEnd(cmd); visibleChunks_ = chunkCount_; return; } // Upload mega-buffer and chunk info to GPU if (!cpuMegaQuads_.empty()) { dev->UpdateBuffer(&megaQuadBuffer_, cpuMegaQuads_.data(), cmd, cpuMegaQuads_.size() * sizeof(PackedQuad)); } if (!cpuChunkInfo_.empty()) { dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd, cpuChunkInfo_.size() * sizeof(GPUChunkInfo)); } // Per-frame constants (with frustum planes for GPU cull shader) VoxelConstants cb = {}; XMMATRIX vpMatrix = camera.GetViewProjection(); XMStoreFloat4x4(&cb.viewProjection, vpMatrix); cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f); cb.sunDirection = XMFLOAT4(-0.5f, -0.8f, -0.3f, 0.0f); cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f); cb.chunkSize = (float)CHUNK_SIZE; cb.textureTiling = 0.25f; cb.blendEnabled = 0.0f; // Phase 3: blending disabled in CPU/MDI paths (no voxel data SRV) cb.debugBlend = 0.0f; cb.bleedMask = 0; cb.resistBleedMask = 0; cb._cullPad2 = 0; cb.chunkCount = chunkCount_; extractFrustumPlanes(vpMatrix, cb.frustumPlanes); dev->UpdateBuffer(&constantBuffer_, &cb, cmd, 
sizeof(cb)); // Push constant structure (must be 48 bytes = 12 x uint32, matches b999) struct VoxelPush { uint32_t chunkIndex; uint32_t quadOffset; uint32_t flags; // bit 0: 1=MDI mode, 0=CPU mode uint32_t pad[9]; }; visibleChunks_ = 0; drawCalls_ = 0; // ── GPU Cull + MDI path ──────────────────────────────────────── if (gpuCullingEnabled_) { // DX12 buffer decay: all buffers return to COMMON after ExecuteCommandLists. // So every frame starts clean — no cross-frame state tracking needed. // Zero the draw count via UpdateBuffer (COMMON → COPY_DST implicit promotion) uint32_t zero = 0; dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t)); // Barriers to UAV for compute shader writes: // - drawCountBuffer_: COPY_DST → UAV (was promoted to COPY_DST by UpdateBuffer) // - indirectArgsBuffer_: COMMON → UAV (explicit, required because COMMON can't // be implicitly promoted to UAV) GPUBarrier preBarriers[] = { GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS), GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS), }; dev->Barrier(preBarriers, 2, cmd); // Timestamp: cull begin dev->QueryEnd(×tampHeap_, TS_CULL_BEGIN, cmd); // Dispatch GPU frustum + backface cull compute shader dev->BindComputeShader(&cullShader_, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindUAV(&indirectArgsBuffer_, 0, cmd); dev->BindUAV(&drawCountBuffer_, 1, cmd); dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd); // Timestamp: cull end dev->QueryEnd(×tampHeap_, TS_CULL_END, cmd); // Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount GPUBarrier postBarriers[] = { GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT), }; dev->Barrier(postBarriers, 2, cmd); // ── 
Render pass ──────────────────────────────────────────── RenderPassImage rp[] = { RenderPassImage::RenderTarget( &renderTarget, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::SHADER_RESOURCE, ResourceState::SHADER_RESOURCE ), RenderPassImage::DepthStencil( &depthBuffer, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL ), }; dev->RenderPassBegin(rp, 2, cmd); Viewport vp; vp.width = (float)renderTarget.GetDesc().width; vp.height = (float)renderTarget.GetDesc().height; vp.min_depth = 0.0f; vp.max_depth = 1.0f; dev->BindViewports(1, &vp, cmd); Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; dev->BindScissorRects(1, &scissor, cmd); dev->BindPipelineState(&pso_, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindResource(&megaQuadBuffer_, 0, cmd); dev->BindResource(&textureArray_, 1, cmd); dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindSampler(&sampler_, 0, cmd); // IMPORTANT: PushConstants must be called AFTER BindPipelineState. // Wicked Engine's PushConstants uses SetGraphicsRoot32BitConstants only // when active_pso is set. If called before (with active_cs from compute), // it would set COMPUTE push constants instead of GRAPHICS ones. 
VoxelPush pushData = {}; pushData.flags = 1; // MDI mode dev->PushConstants(&pushData, sizeof(pushData), cmd); // Timestamp: draw begin dev->QueryEnd(×tampHeap_, TS_DRAW_BEGIN, cmd); // Single MDI call: GPU cull shader filled the indirect args dev->DrawInstancedIndirectCount( &indirectArgsBuffer_, 0, &drawCountBuffer_, 0, MAX_DRAWS, cmd ); drawCalls_ = 1; // Timestamp: draw end dev->QueryEnd(×tampHeap_, TS_DRAW_END, cmd); dev->RenderPassEnd(cmd); // Resolve timestamps for readback (results available next frame) dev->QueryResolve(×tampHeap_, 0, TS_COUNT, ×tampReadback_, 0, cmd); // Read back previous frame's timestamps (persistently mapped READBACK buffer) uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data; if (tsData) { double freq = (double)dev->GetTimestampFrequency(); if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) { gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0); } if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) { gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0); } } // GPU cull handles visibility counting — approximate from chunkCount visibleChunks_ = chunkCount_; // exact count would require readback of drawCount return; } // ── CPU frustum + backface cull (shared by MDI and per-face paths) ── wi::primitive::Frustum frustum; frustum.Create(camera.GetViewProjection()); // ── Phase 2.2: CPU-filled indirect args + MDI draw ────────────── if (mdiEnabled_) { // CPU cull: fill indirect args with visible face groups cpuIndirectArgs_.clear(); uint32_t cpuDrawCount = 0; for (uint32_t i = 0; i < chunkCount_; i++) { const auto& slot = chunkSlots_[i]; if (slot.quadCount == 0) continue; XMFLOAT3 aabbMin( (float)(slot.pos.x * CHUNK_SIZE), (float)(slot.pos.y * CHUNK_SIZE), (float)(slot.pos.z * CHUNK_SIZE) ); XMFLOAT3 aabbMax( aabbMin.x + CHUNK_SIZE, aabbMin.y + CHUNK_SIZE, aabbMin.z + CHUNK_SIZE ); wi::primitive::AABB aabb(aabbMin, 
aabbMax); if (!frustum.CheckBoxFast(aabb)) continue; visibleChunks_++; const auto& info = cpuChunkInfo_[i]; for (uint32_t f = 0; f < 6; f++) { if (info.faceCounts[f] == 0) continue; bool backFacing = false; switch (f) { case 0: backFacing = (camera.Eye.x < aabbMin.x); break; case 1: backFacing = (camera.Eye.x > aabbMax.x); break; case 2: backFacing = (camera.Eye.y < aabbMin.y); break; case 3: backFacing = (camera.Eye.y > aabbMax.y); break; case 4: backFacing = (camera.Eye.z < aabbMin.z); break; case 5: backFacing = (camera.Eye.z > aabbMax.z); break; } if (backFacing) continue; IndirectDrawArgs args = {}; // Pack chunkIndex (low 16 bits) + faceIndex (high 16 bits) into push constant. // The shader unpacks this to look up quadOffset from GPUChunkInfo. // We do NOT use startVertexLocation because SV_VertexID may not include it // reliably in ExecuteIndirect context. args.pushConstant = i | (f << 16); args.vertexCountPerInstance = info.faceCounts[f] * 6; args.instanceCount = 1; args.startVertexLocation = 0; args.startInstanceLocation = 0; cpuIndirectArgs_.push_back(args); cpuDrawCount++; } } // Upload indirect args and draw count to GPU // Note: no explicit barriers needed here. Buffers start in COMMON each frame // (DX12 buffer decay after command list execution). COMMON is implicitly // promoted to COPY_DST by UpdateBuffer, then to INDIRECT_ARGUMENT by // DrawInstancedIndirectCount. This matches Phase 2.1 pattern (no barriers // between UpdateBuffer and SRV usage for megaQuadBuffer_/chunkInfoBuffer_). 
if (!cpuIndirectArgs_.empty()) { dev->UpdateBuffer(&indirectArgsBuffer_, cpuIndirectArgs_.data(), cmd, cpuIndirectArgs_.size() * sizeof(IndirectDrawArgs)); } dev->UpdateBuffer(&drawCountBuffer_, &cpuDrawCount, cmd, sizeof(uint32_t)); // ── Render pass ──────────────────────────────────────────── RenderPassImage rp[] = { RenderPassImage::RenderTarget( &renderTarget, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::SHADER_RESOURCE, ResourceState::SHADER_RESOURCE ), RenderPassImage::DepthStencil( &depthBuffer, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL ), }; dev->RenderPassBegin(rp, 2, cmd); Viewport vp; vp.width = (float)renderTarget.GetDesc().width; vp.height = (float)renderTarget.GetDesc().height; vp.min_depth = 0.0f; vp.max_depth = 1.0f; dev->BindViewports(1, &vp, cmd); Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; dev->BindScissorRects(1, &scissor, cmd); dev->BindPipelineState(&pso_, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindResource(&megaQuadBuffer_, 0, cmd); dev->BindResource(&textureArray_, 1, cmd); dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindSampler(&sampler_, 0, cmd); // MDI mode: VS uses binary search to find chunk from SV_VertexID VoxelPush pushData = {}; pushData.flags = 1; // MDI mode dev->PushConstants(&pushData, sizeof(pushData), cmd); dev->DrawInstancedIndirectCount( &indirectArgsBuffer_, 0, &drawCountBuffer_, 0, MAX_DRAWS, cmd ); drawCalls_ = 1; dev->RenderPassEnd(cmd); return; } // ── Phase 2.1 Fallback: per-face-group DrawInstanced ──────────── RenderPassImage rp[] = { RenderPassImage::RenderTarget( &renderTarget, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, ResourceState::SHADER_RESOURCE, ResourceState::SHADER_RESOURCE ), RenderPassImage::DepthStencil( &depthBuffer, RenderPassImage::LoadOp::CLEAR, RenderPassImage::StoreOp::STORE, 
ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL, ResourceState::DEPTHSTENCIL ), }; dev->RenderPassBegin(rp, 2, cmd); Viewport vp; vp.width = (float)renderTarget.GetDesc().width; vp.height = (float)renderTarget.GetDesc().height; vp.min_depth = 0.0f; vp.max_depth = 1.0f; dev->BindViewports(1, &vp, cmd); Rect scissor = { 0, 0, (int)vp.width, (int)vp.height }; dev->BindScissorRects(1, &scissor, cmd); dev->BindPipelineState(&pso_, cmd); dev->BindConstantBuffer(&constantBuffer_, 0, cmd); dev->BindResource(&megaQuadBuffer_, 0, cmd); dev->BindResource(&textureArray_, 1, cmd); dev->BindResource(&chunkInfoBuffer_, 2, cmd); dev->BindSampler(&sampler_, 0, cmd); for (uint32_t i = 0; i < chunkCount_; i++) { const auto& slot = chunkSlots_[i]; if (slot.quadCount == 0) continue; XMFLOAT3 aabbMin( (float)(slot.pos.x * CHUNK_SIZE), (float)(slot.pos.y * CHUNK_SIZE), (float)(slot.pos.z * CHUNK_SIZE) ); XMFLOAT3 aabbMax( aabbMin.x + CHUNK_SIZE, aabbMin.y + CHUNK_SIZE, aabbMin.z + CHUNK_SIZE ); wi::primitive::AABB aabb(aabbMin, aabbMax); if (!frustum.CheckBoxFast(aabb)) continue; visibleChunks_++; const auto& info = cpuChunkInfo_[i]; for (uint32_t f = 0; f < 6; f++) { if (info.faceCounts[f] == 0) continue; bool backFacing = false; switch (f) { case 0: backFacing = (camera.Eye.x < aabbMin.x); break; case 1: backFacing = (camera.Eye.x > aabbMax.x); break; case 2: backFacing = (camera.Eye.y < aabbMin.y); break; case 3: backFacing = (camera.Eye.y > aabbMax.y); break; case 4: backFacing = (camera.Eye.z < aabbMin.z); break; case 5: backFacing = (camera.Eye.z > aabbMax.z); break; } if (backFacing) continue; VoxelPush pushData = {}; pushData.chunkIndex = i; pushData.quadOffset = slot.quadOffset + info.faceOffsets[f]; pushData.flags = 0; // CPU mode dev->PushConstants(&pushData, sizeof(pushData), cmd); dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd); drawCalls_++; } } dev->RenderPassEnd(cmd); } // ── VoxelRenderPath (custom RenderPath3D) ─────────────────────── void 
VoxelRenderPath::Start() { RenderPath3D::Start(); auto* device = wi::graphics::GetDevice(); renderer.initialize(device); renderer.debugFaceColors_ = debugMode; // Generate world if (debugMode) { world.generateDebug(); cameraPos = { 10.0f, 10.0f, 0.0f }; cameraPitch = -0.4f; cameraYaw = 0.5f; } else { world.generateAround(cameraPos.x, cameraPos.y, cameraPos.z, 4); } if (renderer.isInitialized()) { renderer.updateMeshes(world); } // Phase 4: Initialize toping system and collect instances topingSystem.initialize(); topingSystem.collectInstances(world); { char msg[256]; snprintf(msg, sizeof(msg), "TopingSystem: %zu defs, %zu vertices, %zu instances", topingSystem.getDefCount(), topingSystem.getVertexCount(), topingSystem.getInstanceCount()); wi::backlog::post(msg); } worldGenerated_ = true; setAO(AO_DISABLED); setFXAAEnabled(true); setBloomEnabled(false); createRenderTargets(); } void VoxelRenderPath::createRenderTargets() { auto* device = wi::graphics::GetDevice(); if (!device) return; uint32_t w = GetPhysicalWidth(); uint32_t h = GetPhysicalHeight(); if (w == 0 || h == 0) { w = 1920; h = 1080; } wi::graphics::TextureDesc rtDesc; rtDesc.type = wi::graphics::TextureDesc::Type::TEXTURE_2D; rtDesc.width = w; rtDesc.height = h; rtDesc.format = wi::graphics::Format::R8G8B8A8_UNORM; rtDesc.bind_flags = wi::graphics::BindFlag::RENDER_TARGET | wi::graphics::BindFlag::SHADER_RESOURCE; rtDesc.mip_levels = 1; rtDesc.sample_count = 1; rtDesc.layout = wi::graphics::ResourceState::SHADER_RESOURCE; device->CreateTexture(&rtDesc, nullptr, &voxelRT_); wi::graphics::TextureDesc depthDesc; depthDesc.type = wi::graphics::TextureDesc::Type::TEXTURE_2D; depthDesc.width = w; depthDesc.height = h; depthDesc.format = wi::graphics::Format::D32_FLOAT; depthDesc.bind_flags = wi::graphics::BindFlag::DEPTH_STENCIL | wi::graphics::BindFlag::SHADER_RESOURCE; depthDesc.mip_levels = 1; depthDesc.sample_count = 1; depthDesc.layout = wi::graphics::ResourceState::DEPTHSTENCIL; 
device->CreateTexture(&depthDesc, nullptr, &voxelDepth_); rtCreated_ = voxelRT_.IsValid() && voxelDepth_.IsValid(); wi::backlog::post("VoxelRenderPath: render targets " + std::string(rtCreated_ ? "OK" : "FAILED") + " (" + std::to_string(w) + "x" + std::to_string(h) + ")"); } // ── WASD camera input ─────────────────────────────────────────── static constexpr wi::input::BUTTON KEY_W = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('W' - 'A')); static constexpr wi::input::BUTTON KEY_A = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('A' - 'A')); static constexpr wi::input::BUTTON KEY_S = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('S' - 'A')); static constexpr wi::input::BUTTON KEY_D = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('D' - 'A')); void VoxelRenderPath::handleInput(float dt) { // F2: toggle backlog console if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F2)) { wi::backlog::Toggle(); } // F3: toggle animated terrain if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) { animatedTerrain_ = !animatedTerrain_; wi::backlog::post(animatedTerrain_ ? "Animation: ON (60 Hz)" : "Animation: OFF"); } // F4: toggle blend debug visualization if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F4)) { renderer.debugBlend_ = !renderer.debugBlend_; wi::backlog::post(renderer.debugBlend_ ? 
"Blend debug: ON" : "Blend debug: OFF"); } if (wi::input::Press(wi::input::MOUSE_BUTTON_RIGHT)) { mouseCaptured = !mouseCaptured; wi::input::HidePointer(mouseCaptured); } if (mouseCaptured) { auto mouseState = wi::input::GetMouseState(); cameraYaw += mouseState.delta_position.x * cameraSensitivity; cameraPitch += mouseState.delta_position.y * cameraSensitivity; cameraPitch = std::clamp(cameraPitch, -1.5f, 1.5f); } float cosPitch = std::cos(cameraPitch); XMFLOAT3 forward( std::sin(cameraYaw) * cosPitch, -std::sin(cameraPitch), std::cos(cameraYaw) * cosPitch ); XMFLOAT3 right(std::cos(cameraYaw), 0.0f, -std::sin(cameraYaw)); float speed = cameraSpeed * dt; if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f; if (wi::input::Down(KEY_W)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; } if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; } if (wi::input::Down(KEY_A)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; } if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; } if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed; if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed; camera->Eye = cameraPos; camera->At = forward; camera->Up = XMFLOAT3(0, 1, 0); camera->UpdateCamera(); } void VoxelRenderPath::Update(float dt) { auto frameStart = std::chrono::high_resolution_clock::now(); lastDt_ = dt; float instantFps = (dt > 0.0f) ? 
(1.0f / dt) : 0.0f; smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f; if (camera) handleInput(dt); // Animated terrain: regenerate at 60 Hz with time-shifted noise // Fused: regenerate + pack voxel data in the same parallel pass if (animatedTerrain_ && renderer.isInitialized()) { animAccum_ += dt; if (animAccum_ >= ANIM_INTERVAL) { animAccum_ -= ANIM_INTERVAL; animTime_ += ANIM_INTERVAL; // Prepare pack cache for fused regenerate+pack const uint32_t wordsPerChunk = CHUNK_VOLUME / 2; uint32_t totalWords = (uint32_t)world.chunkCount() * wordsPerChunk; renderer.packedVoxelCache_.resize(totalWords); auto t0 = std::chrono::high_resolution_clock::now(); world.regenerateAnimated(animTime_, renderer.packedVoxelCache_.data(), totalWords); auto t1 = std::chrono::high_resolution_clock::now(); profRegenerate_.add(std::chrono::duration(t1 - t0).count()); renderer.voxelCacheDirty_ = false; // cache already filled by fused pack renderer.gpuMeshDirty_ = true; // GPU still needs upload + dispatch } } if (renderer.isInitialized()) { auto t0 = std::chrono::high_resolution_clock::now(); renderer.updateMeshes(world); auto t1 = std::chrono::high_resolution_clock::now(); profUpdateMeshes_.add(std::chrono::duration(t1 - t0).count()); } RenderPath3D::Update(dt); // Profiling: accumulate frame time (will be completed in Compose) auto frameEnd = std::chrono::high_resolution_clock::now(); profFrame_.add(std::chrono::duration(frameEnd - frameStart).count()); // Log averages every 5 seconds profTimer_ += dt; if (profTimer_ >= PROF_INTERVAL) { logProfilingAverages(); profTimer_ -= PROF_INTERVAL; } } void VoxelRenderPath::Render() const { RenderPath3D::Render(); if (renderer.isInitialized() && camera && rtCreated_) { auto* device = wi::graphics::GetDevice(); CommandList cmd = device->BeginCommandList(); // GPU mesh path: only re-dispatch when voxel data changed if (renderer.gpuMeshEnabled_ && renderer.gpuMesherAvailable_) { // Always readback previous frame's quad count uint32_t* countData = 
(uint32_t*)renderer.meshCounterReadback_.mapped_data; if (countData) { renderer.gpuMeshQuadCount_ = *countData; renderer.totalQuads_ = renderer.gpuMeshQuadCount_; } // Only re-dispatch compute mesher when data changed if (renderer.gpuMeshDirty_) { renderer.dispatchGpuMesh(cmd, world, &profVoxelPack_, &profGpuUpload_, &profGpuDispatch_); } } // GPU mesh benchmark state machine (runs once after world gen, CPU path only) if (!renderer.gpuMeshEnabled_) { if (renderer.benchState_ == VoxelRenderer::BenchState::DISPATCH) { renderer.dispatchGpuMeshBenchmark(cmd, world); } else if (renderer.benchState_ == VoxelRenderer::BenchState::READBACK) { renderer.readbackGpuMeshBenchmark(); } } auto tRender0 = std::chrono::high_resolution_clock::now(); renderer.render(cmd, *camera, voxelDepth_, voxelRT_); auto tRender1 = std::chrono::high_resolution_clock::now(); profRender_.add(std::chrono::duration(tRender1 - tRender0).count()); } } void VoxelRenderPath::logProfilingAverages() const { char msg[512]; snprintf(msg, sizeof(msg), "=== PERF PROFILE (avg over %.0fs) ===\n" " Regenerate: %7.2f ms (%u calls)\n" " UpdateMeshes: %7.2f ms (%u calls)\n" " VoxelPack: %7.2f ms (%u calls)\n" " GPU Upload: %7.2f ms (%u calls)\n" " GPU Dispatch: %7.2f ms (%u calls)\n" " Render: %7.2f ms (%u calls)\n" " Frame (Upd): %7.2f ms (%u calls, %.1f FPS)", PROF_INTERVAL, profRegenerate_.avg(), profRegenerate_.count, profUpdateMeshes_.avg(), profUpdateMeshes_.count, profVoxelPack_.avg(), profVoxelPack_.count, profGpuUpload_.avg(), profGpuUpload_.count, profGpuDispatch_.avg(), profGpuDispatch_.count, profRender_.avg(), profRender_.count, profFrame_.avg(), profFrame_.count, profFrame_.count > 0 ? 
(1000.0f / profFrame_.avg()) : 0.0f); wi::backlog::post(msg); profRegenerate_.reset(); profUpdateMeshes_.reset(); profVoxelPack_.reset(); profGpuUpload_.reset(); profGpuDispatch_.reset(); profRender_.reset(); profFrame_.reset(); } void VoxelRenderPath::Compose(CommandList cmd) const { frameCount_++; RenderPath3D::Compose(cmd); if (rtCreated_ && voxelRT_.IsValid()) { wi::image::Params fx; fx.enableFullScreen(); fx.blendFlag = wi::enums::BLENDMODE_OPAQUE; wi::image::Draw(&voxelRT_, fx, cmd); } // HUD overlay wi::font::Params fp; fp.posX = 10; fp.posY = 10; fp.size = 20; fp.color = wi::Color(255, 255, 255, 230); fp.shadowColor = wi::Color(0, 0, 0, 180); char fpsStr[16]; snprintf(fpsStr, sizeof(fpsStr), "%.1f", smoothFps_); char dtStr[16]; snprintf(dtStr, sizeof(dtStr), "%.2f", lastDt_ * 1000.0f); std::string stats = "BVLE Voxel Engine (Phase 4 — Toping)\n"; stats += "FPS: " + std::string(fpsStr) + " (" + std::string(dtStr) + " ms)\n"; if (debugMode) { stats += "=== DEBUG FACE MODE ===\n"; stats += "+X=Red -X=DkRed +Y=Green -Y=DkGreen +Z=Blue -Z=DkBlue\n"; } stats += "Chunks: " + std::to_string(renderer.getVisibleChunks()) + "/" + std::to_string(renderer.getChunkCount()) + "\n"; stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n"; std::string renderMode; if (renderer.isGpuMeshEnabled()) renderMode = "GPU mesh (1x1) + DrawInstanced"; else if (renderer.isGpuCulling()) renderMode = "CPU greedy + MDI + GPU cull"; else if (renderer.isMdiEnabled()) renderMode = "CPU greedy + MDI + CPU cull"; else renderMode = "CPU greedy + DrawInstanced + CPU cull"; stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls()) + " (" + renderMode + ")\n"; if (renderer.isGpuMeshEnabled()) { stats += "GPU Mesh Quads: " + std::to_string(renderer.getGpuMeshQuadCount()) + "\n"; } else { char cullStr[16], drawStr[16]; snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs()); snprintf(drawStr, sizeof(drawStr), "%.3f", renderer.getGpuDrawTimeMs()); stats += "GPU 
Cull: " + std::string(cullStr) + " ms | Draw: " + std::string(drawStr) + " ms\n"; } stats += "Topings: " + std::to_string(topingSystem.getInstanceCount()) + " instances (" + std::to_string(topingSystem.getDefCount()) + " types, " + std::to_string(topingSystem.getVertexCount()) + " verts)\n"; stats += "WASD+Space/Ctrl: move | Shift: fast | Right-click: capture mouse\n"; stats += "F2: console | F3: anim [" + std::string(animatedTerrain_ ? "ON" : "OFF") + "] | F4: dbg [" + std::string(renderer.debugBlend_ ? "ON" : "OFF") + "]"; wi::font::Draw(stats, fp, cmd); } } // namespace voxel