bvle-voxels/src/voxel/VoxelRenderer.h
Samuel Bouchet 6b41da0932 Phase 6.2: RT shadows — inline ray queries with BLAS/TLAS fix
Add shadow compute shader (voxelShadowCS.hlsl) that traces rays toward
the sun using DXR inline ray queries (RayQuery<>, SM 6.5). Shadows
modulate voxelRT_ in-place via RWTexture2D (no extra render target).

Key fixes to Phase 6.1 BLAS/TLAS infrastructure:
- Sequential index buffer required: Wicked treats IndexCount=0 with
  non-null IndexBuffer as "0 indexed triangles" → empty BLAS
- Memory barriers between BLAS→TLAS→RT: without GPUBarrier::Memory()
  the TLAS build races with BLAS builds, causing zero ray hits
- inverseViewProjection added to VoxelCB for depth reconstruction

F5 toggles shadows OFF→ON→DEBUG (red=hit, green=miss, blue=backface).
2026-03-28 20:01:18 +01:00

353 lines
17 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include "VoxelWorld.h"
#include "VoxelMesher.h"
#include "TopingSystem.h"
#include "WickedEngine.h"
namespace voxel {
// ── CPU profiling accumulator ────────────────────────────────────
// Collects timing samples (in milliseconds) and reports their arithmetic mean.
struct ProfileAccum {
    double totalMs = 0.0;   // sum of every sample added since the last reset()
    uint32_t count = 0;     // number of samples added since the last reset()

    // Record one sample.
    void add(float ms) {
        totalMs += ms;
        ++count;
    }

    // Mean of the recorded samples; 0 when nothing has been recorded yet
    // (avoids dividing by zero).
    float avg() const {
        if (count == 0) {
            return 0.0f;
        }
        return static_cast<float>(totalMs / count);
    }

    // Discard all recorded samples.
    void reset() {
        count = 0;
        totalMs = 0.0;
    }
};
// ── GPU-visible chunk info (must match HLSL GPUChunkInfo) ────────
// Per-chunk record stored in chunkInfoBuffer_ (one entry per chunk).
// Field order and padding are part of the CPU/GPU contract: do not reorder,
// resize or remove members without updating the HLSL declaration as well.
// Size check: the uint32_t members total 96 bytes; with a 16-byte worldPos
// (assumes XMFLOAT4 is four 32-bit floats, as in DirectXMath) the struct is
// 112 bytes, i.e. a whole number of float4 slots.
// NOTE(review): a static_assert(sizeof(GPUChunkInfo) == 112) next to this
// struct would lock the layout in at compile time.
struct GPUChunkInfo {
XMFLOAT4 worldPos; // xyz = chunk origin, w = debug flag
uint32_t quadOffset; // offset into mega quad buffer
uint32_t quadCount; // number of quads for this chunk
uint32_t pad[2]; // align to 32 bytes
// The three arrays below have one entry per face direction.
// NOTE(review): presumably all three use the face order documented on
// `neighbors` (+X,-X,+Y,-Y,+Z,-Z) — confirm against the mesher/shader.
uint32_t faceOffsets[6]; // per-face quad offset within this chunk's quads
uint32_t faceCounts[6]; // per-face quad count
uint32_t neighbors[6]; // chunk index of face neighbors (+X,-X,+Y,-Y,+Z,-Z), 0xFFFFFFFF = none
uint32_t pad2[2]; // pad to 112 bytes (7 × float4)
};
// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
// Holds the GPU-side state used to draw a VoxelWorld: shaders, pipeline
// states and buffers for the blocky voxel pass (Phase 2), topings (Phase 4),
// smooth surfaces (Phase 5), the GPU compute meshers, and the ray-tracing
// acceleration structures and shadow pass (Phase 6).
//
// Many members are declared `mutable` because they are updated from const
// member functions (render() and the const dispatch*/build* helpers).
//
// NOTE(review): member declaration order determines construction and
// destruction order of the GPU resources; keep it stable unless that order
// is known not to matter.
class VoxelRenderer {
// VoxelRenderPath reads and writes the private state below directly.
friend class VoxelRenderPath;
public:
VoxelRenderer();
~VoxelRenderer();
// Set up the renderer for the given graphics device.
// NOTE(review): presumably stores `device` in device_ and creates the
// pipelines/buffers declared below — confirm in the .cpp.
void initialize(wi::graphics::GraphicsDevice* device);
// Counterpart of initialize().
void shutdown();
// Mesh dirty chunks and repack the mega-buffer
void updateMeshes(VoxelWorld& world);
// Render all visible chunks
// `cmd` is the command list to record into; `depthBuffer`, `renderTarget`
// and `normalTarget` are the attachments/targets of the voxel pass.
// Declared const: statistics and dirty flags it touches are `mutable`.
void render(
wi::graphics::CommandList cmd,
const wi::scene::CameraComponent& camera,
const wi::graphics::Texture& depthBuffer,
const wi::graphics::Texture& renderTarget,
const wi::graphics::Texture& normalTarget
) const;
// Generate procedural textures for materials
void generateTextures();
// Stats
uint32_t getTotalQuads() const { return totalQuads_; }
uint32_t getVisibleChunks() const { return visibleChunks_; }
uint32_t getDrawCalls() const { return drawCalls_; }
uint32_t getChunkCount() const { return chunkCount_; }
bool isInitialized() const { return initialized_; }
bool isGpuCulling() const { return gpuCullingEnabled_; }
bool isMdiEnabled() const { return mdiEnabled_; }
// Public debug/animation inputs, written directly by callers.
bool debugFaceColors_ = false;
bool debugBlend_ = false;
float windTime_ = 0.0f; // set by VoxelRenderPath::Update each frame
private:
void createPipeline();
void rebuildMegaBuffer(VoxelWorld& world);
// Non-owning pointer to the graphics device passed to initialize().
wi::graphics::GraphicsDevice* device_ = nullptr;
// Shaders & Pipeline (voxels)
wi::graphics::Shader vertexShader_;
wi::graphics::Shader pixelShader_;
wi::graphics::PipelineState pso_;
wi::graphics::Shader cullShader_; // Frustum cull compute shader
// Shaders & Pipeline (topings, Phase 4)
wi::graphics::Shader topingVS_;
wi::graphics::Shader topingPS_;
wi::graphics::PipelineState topingPso_;
wi::graphics::GPUBuffer topingVertexBuffer_; // StructuredBuffer<TopingVertex>, SRV t4
wi::graphics::GPUBuffer topingInstanceBuffer_; // StructuredBuffer<float3>, SRV t5
static constexpr uint32_t MAX_TOPING_INSTANCES = 256 * 1024; // 256K instances max
// Persistent staging buffers for toping upload (avoids per-frame allocations)
// TopingSortedInst: world position plus type/variant keys.
// TopingGPUInst: position only (three floats, matching the float3 instance buffer above).
struct TopingSortedInst { float wx, wy, wz; uint16_t type, variant; };
struct TopingGPUInst { float x, y, z; };
std::vector<TopingSortedInst> topingSorted_;
std::vector<TopingGPUInst> topingGpuInsts_;
mutable uint32_t topingDrawCalls_ = 0;
// Shaders & Pipeline (smooth surfaces, Phase 5)
wi::graphics::Shader smoothVS_;
wi::graphics::Shader smoothPS_;
wi::graphics::RasterizerState smoothRasterizer_;
wi::graphics::PipelineState smoothPso_;
wi::graphics::GPUBuffer smoothVertexBuffer_; // StructuredBuffer<SmoothVertex>, SRV t6
std::vector<SmoothVertex> smoothStagingVerts_; // persistent staging buffer (avoids per-frame alloc)
static constexpr uint32_t MAX_SMOOTH_VERTICES = 4 * 1024 * 1024; // 4M vertices max
mutable uint32_t smoothVertexCount_ = 0;
mutable uint32_t smoothDrawCalls_ = 0;
bool smoothDirty_ = true;
// Texture array for materials (256x256, 5 layers for prototype)
wi::graphics::Texture textureArray_;
wi::graphics::Sampler sampler_;
// ── Mega-buffer architecture (Phase 2) ──────────────────────
// All chunks' quads live in one shared quad buffer; each chunk is described
// by an (offset, count) range into it.
static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
static constexpr uint32_t MAX_CHUNKS = 2048;
static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk
wi::graphics::GPUBuffer megaQuadBuffer_; // StructuredBuffer<PackedQuad>, SRV t0
wi::graphics::GPUBuffer chunkInfoBuffer_; // StructuredBuffer<GPUChunkInfo>, SRV t2
// CPU-side tracking
struct ChunkSlot {
ChunkPos pos;
uint32_t quadOffset; // offset into mega-buffer (in quads)
uint32_t quadCount;
};
std::vector<ChunkSlot> chunkSlots_;
std::vector<GPUChunkInfo> cpuChunkInfo_;
std::vector<PackedQuad> cpuMegaQuads_; // CPU staging for mega-buffer
uint32_t chunkCount_ = 0;
bool megaBufferDirty_ = true;
// ── Indirect draw (Phase 2 MDI) ─────────────────────────────
// Wicked Engine's DrawInstancedIndirectCount command signature includes a
// push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
// Total stride = 4 + 16 = 20 bytes per draw entry.
// The struct below is five uint32_t fields (20 bytes) in exactly that order;
// do not reorder or add members.
struct IndirectDrawArgs {
uint32_t pushConstant; // written to b999[0] by ExecuteIndirect
uint32_t vertexCountPerInstance;
uint32_t instanceCount;
uint32_t startVertexLocation;
uint32_t startInstanceLocation;
};
wi::graphics::GPUBuffer indirectArgsBuffer_; // IndirectDrawArgs[MAX_DRAWS]
wi::graphics::GPUBuffer drawCountBuffer_; // uint32_t[1]
mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
bool gpuCullingEnabled_ = true; // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
bool mdiEnabled_ = true; // Phase 2.2: MDI rendering with CPU-filled indirect args
// Constants buffer (must match HLSL VoxelCB)
// Field order is part of the CPU/GPU contract. The four scalar floats and
// the four trailing scalars each fill one 16-byte slot.
// NOTE(review): assuming 64-byte XMFLOAT4X4 and 16-byte XMFLOAT4, the struct
// is 304 bytes (19 × 16) — confirm against the HLSL cbuffer packing.
struct VoxelConstants {
XMFLOAT4X4 viewProjection;
XMFLOAT4X4 inverseViewProjection; // for depth-to-world reconstruction (RT shadows)
XMFLOAT4 cameraPosition;
XMFLOAT4 sunDirection;
XMFLOAT4 sunColor;
float chunkSize;
float textureTiling;
float blendEnabled;
float debugBlend;
XMFLOAT4 frustumPlanes[6]; // ax+by+cz+d=0
uint32_t chunkCount;
uint32_t bleedMask; // bit N set = material N can bleed onto neighbors
uint32_t resistBleedMask; // bit N set = material N resists bleed from neighbors
float windTime;
};
wi::graphics::GPUBuffer constantBuffer_;
// ── GPU Compute Mesher ──────────────────────────────────────────
wi::graphics::Shader meshShader_; // voxelMeshCS compute shader
mutable wi::graphics::GPUBuffer voxelDataBuffer_; // chunk voxel data (StructuredBuffer<uint>)
wi::graphics::GPUBuffer gpuQuadBuffer_; // GPU mesh output (RWStructuredBuffer<uint2>)
wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
bool gpuMesherAvailable_ = false;
bool gpuMeshEnabled_ = true; // Use GPU meshing instead of CPU greedy
mutable uint32_t gpuMeshQuadCount_ = 0; // Readback from previous frame (1-frame delay)
mutable uint32_t voxelDataCapacity_ = 0; // Current capacity of voxelDataBuffer_ (in uint32s)
mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
mutable bool voxelCacheDirty_ = true; // true: packedVoxelCache_ needs repack from chunks
mutable bool gpuMeshDirty_ = true; // true: GPU needs upload + re-dispatch
mutable bool chunkInfoDirty_ = true; // true: chunkInfoBuffer needs re-upload
// ── GPU Smooth Mesher (Phase 5.3) ─────────────────────────────
wi::graphics::Shader smoothCentroidShader_; // voxelSmoothCentroidCS (pass 1: centroid grid)
wi::graphics::Shader smoothMeshShader_; // voxelSmoothCS (pass 2: emit with smooth normals)
wi::graphics::GPUBuffer centroidGridBuffer_; // float4[34^3] per-chunk centroid grid (reused)
wi::graphics::GPUBuffer gpuSmoothVertexBuffer_; // RWStructuredBuffer<GPUSmoothVertex>, UAV+SRV
wi::graphics::GPUBuffer gpuSmoothCounter_; // atomic counter for smooth vertices
wi::graphics::GPUBuffer smoothCounterReadback_; // READBACK buffer for vertex counter
static constexpr uint32_t CENTROID_GRID_SIZE = 34 * 34 * 34; // 39304 entries per chunk
static constexpr uint32_t MAX_GPU_SMOOTH_VERTICES = 2 * 1024 * 1024; // 2M vertices max
mutable uint32_t gpuSmoothVertexCount_ = 0; // readback from previous frame
mutable bool gpuSmoothMeshDirty_ = true;
// ── Ray Tracing (Phase 6.1) ─────────────────────────────────────
// Two bottom-level structures (blocky and smooth geometry) and one
// top-level structure. The `rt` prefix here means "ray tracing".
wi::graphics::Shader blasExtractShader_; // voxelBLASExtractCS compute shader
mutable wi::graphics::GPUBuffer blasPositionBuffer_; // float3[] for blocky BLAS (6 verts per quad)
wi::graphics::GPUBuffer blasIndexBuffer_; // sequential uint32 indices [0,1,2,...] for BLAS
mutable wi::graphics::RaytracingAccelerationStructure blockyBLAS_;
mutable wi::graphics::RaytracingAccelerationStructure smoothBLAS_;
mutable wi::graphics::RaytracingAccelerationStructure tlas_;
static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
mutable bool rtAvailable_ = false; // GPU supports RT
mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild
mutable uint32_t rtBlockyVertexCount_ = 0; // current blocky BLAS vertex count
mutable uint32_t rtSmoothVertexCount_ = 0; // current smooth BLAS vertex count
void dispatchBLASExtract(wi::graphics::CommandList cmd) const;
void buildAccelerationStructures(wi::graphics::CommandList cmd) const;
// ── RT Shadows (Phase 6.2) ─────────────────────────────────────
wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader
mutable bool rtShadowsEnabled_ = false; // true when shader + TLAS ready
mutable bool rtShadowDebug_ = false; // debug visualization mode
void dispatchShadows(wi::graphics::CommandList cmd,
const wi::graphics::Texture& depthBuffer,
const wi::graphics::Texture& renderTarget,
const wi::graphics::Texture& normalTarget) const;
// Benchmark state machine: runs once after world gen
enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
mutable BenchState benchState_ = BenchState::IDLE;
mutable float cpuMeshTimeMs_ = 0.0f;
mutable uint32_t gpuBaselineQuads_ = 0;
void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
void readbackGpuMeshBenchmark() const;
// The three ProfileAccum pointers are optional (default nullptr).
// NOTE(review): presumably each receives the CPU time of the matching
// stage (pack / upload / dispatch) when non-null — confirm in the .cpp.
void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
ProfileAccum* profDispatch = nullptr) const;
void dispatchGpuSmoothMesh(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
void rebuildChunkInfoOnly(VoxelWorld& world);
// ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
// TS_* are slot indices into timestampHeap_, paired as begin/end per stage.
wi::graphics::GPUQueryHeap timestampHeap_;
wi::graphics::GPUBuffer timestampReadback_;
static constexpr uint32_t TS_CULL_BEGIN = 0;
static constexpr uint32_t TS_CULL_END = 1;
static constexpr uint32_t TS_DRAW_BEGIN = 2;
static constexpr uint32_t TS_DRAW_END = 3;
static constexpr uint32_t TS_MESH_BEGIN = 4;
static constexpr uint32_t TS_MESH_END = 5;
static constexpr uint32_t TS_COUNT = 6;
mutable float gpuCullTimeMs_ = 0.0f;
mutable float gpuDrawTimeMs_ = 0.0f;
// No public getter is declared for this one in this class (unlike cull/draw).
mutable float gpuMeshTimeMs_ = 0.0f;
// Stats (mutable: updated during const Render() call)
mutable uint32_t totalQuads_ = 0;
mutable uint32_t visibleChunks_ = 0;
mutable uint32_t drawCalls_ = 0;
bool initialized_ = false;
public:
float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
// True only when GPU meshing is both requested and available.
bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
// Phase 4: Toping rendering
void uploadTopingData(const TopingSystem& topingSystem);
void renderTopings(
wi::graphics::CommandList cmd,
const TopingSystem& topingSystem,
const wi::graphics::Texture& depthBuffer,
const wi::graphics::Texture& renderTarget,
const wi::graphics::Texture& normalTarget
) const;
uint32_t getTopingDrawCalls() const { return topingDrawCalls_; }
// Phase 5: Smooth surface rendering
void uploadSmoothData(VoxelWorld& world);
void uploadSmoothDataFast(VoxelWorld& world); // chunkIndex already stamped
void renderSmooth(
wi::graphics::CommandList cmd,
const wi::graphics::Texture& depthBuffer,
const wi::graphics::Texture& renderTarget,
const wi::graphics::Texture& normalTarget
) const;
// Returns gpuSmoothVertexCount_ when both smooth compute shaders are valid,
// otherwise smoothVertexCount_.
uint32_t getSmoothVertexCount() const { return (smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid()) ? gpuSmoothVertexCount_ : smoothVertexCount_; }
uint32_t getSmoothDrawCalls() const { return smoothDrawCalls_; }
// Phase 6: Ray Tracing
bool isRTAvailable() const { return rtAvailable_; }
// Ready = RT is available and the TLAS object is valid.
bool isRTReady() const { return rtAvailable_ && tlas_.IsValid(); }
bool isRTShadowsEnabled() const { return rtShadowsEnabled_; }
// Triangle counts are the vertex counts divided by 3 (integer division).
uint32_t getRTBlockyTriCount() const { return rtBlockyVertexCount_ / 3; }
uint32_t getRTSmoothTriCount() const { return rtSmoothVertexCount_ / 3; }
const wi::graphics::RaytracingAccelerationStructure& getTLAS() const { return tlas_; }
};
// ── Custom RenderPath that integrates voxel rendering ───────────
// Derives from wi::RenderPath3D and overrides Start/Update/Render/Compose.
// Owns the voxel world, the renderer and the toping system by value, plus
// free-fly camera settings, the voxel render targets and CPU profiling
// accumulators. VoxelRenderer declares this class as a friend.
class VoxelRenderPath : public wi::RenderPath3D {
public:
VoxelWorld world;
VoxelRenderer renderer;
TopingSystem topingSystem;
// Debug toggles.
bool debugMode = false;
bool debugSmooth = false;
// Camera settings and state.
// NOTE(review): units are not stated here; presumably world units per
// second for cameraSpeed and radians for pitch/yaw — confirm in handleInput().
float cameraSpeed = 50.0f;
float cameraSensitivity = 0.003f;
XMFLOAT3 cameraPos = { 256.0f, 100.0f, 256.0f };
float cameraPitch = -0.3f;
float cameraYaw = 0.0f;
bool mouseCaptured = false;
// wi::RenderPath3D overrides. Render() and Compose() are const, which is
// why the state they update below is declared `mutable`.
void Start() override;
void Update(float dt) override;
void Render() const override;
void Compose(wi::graphics::CommandList cmd) const override;
private:
void handleInput(float dt);
void createRenderTargets();
mutable bool worldGenerated_ = false;
mutable int frameCount_ = 0;
mutable float lastDt_ = 0.016f;
mutable float smoothFps_ = 60.0f;
// Wind animation (continuous, always running)
// NOTE(review): VoxelRenderer::windTime_ is documented as set from Update();
// presumably it is copied from this value — confirm in the .cpp.
float windTime_ = 0.0f;
// Animated terrain (wave effect at 60 Hz, toggled with F3)
bool animatedTerrain_ = false;
float animTime_ = 0.0f;
float animAccum_ = 0.0f;
static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz
wi::graphics::Texture voxelRT_;
wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO
wi::graphics::Texture voxelDepth_;
// NOTE(review): "rt" here presumably means "render targets" (see
// createRenderTargets()), not ray tracing as in VoxelRenderer::rt* — confirm.
mutable bool rtCreated_ = false;
// ── CPU Profiling (averages every 5 seconds) ─────────────────
mutable ProfileAccum profRegenerate_; // regenerateAnimated
mutable ProfileAccum profUpdateMeshes_; // updateMeshes (rebuildChunkInfoOnly or CPU mesh)
mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh
mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh
mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh
mutable ProfileAccum profRender_; // render() total
mutable ProfileAccum profFrame_; // full frame (Update + Render + Compose)
mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks)
mutable ProfileAccum profSmoothUpload_; // uploadSmoothData
mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances
mutable ProfileAccum profTopingUpload_; // uploadTopingData
// Time accumulated toward the next report; PROF_INTERVAL is the reporting
// period in seconds (see the section header above).
mutable float profTimer_ = 0.0f;
static constexpr float PROF_INTERVAL = 5.0f;
void logProfilingAverages() const;
};
} // namespace voxel