Add a shadow compute shader (voxelShadowCS.hlsl) that traces rays toward the sun using DXR inline ray queries (RayQuery<>, SM 6.5). Shadows modulate voxelRT_ in place via an RWTexture2D (no extra render target).

Key fixes to the Phase 6.1 BLAS/TLAS infrastructure:
- A sequential index buffer is required: Wicked treats IndexCount=0 with a non-null IndexBuffer as "0 indexed triangles", which produces an empty BLAS.
- Memory barriers between BLAS→TLAS→RT: without GPUBarrier::Memory() the TLAS build races with the BLAS builds, causing zero ray hits.
- inverseViewProjection added to VoxelCB for depth reconstruction.

F5 toggles shadows OFF→ON→DEBUG (red=hit, green=miss, blue=backface).
353 lines
17 KiB
C++
353 lines
17 KiB
C++
#pragma once
|
||
#include "VoxelWorld.h"
|
||
#include "VoxelMesher.h"
|
||
#include "TopingSystem.h"
|
||
#include "WickedEngine.h"
|
||
|
||
namespace voxel {
|
||
|
||
// ── CPU Profiling accumulator ────────────────────────────────────
// Running sum of millisecond samples plus a sample count, used to report
// an average over a window. Call reset() to begin a new window.
struct ProfileAccum {
    double   totalMs = 0.0;  // sum of all samples since the last reset(), in ms
    uint32_t count   = 0;    // number of samples since the last reset()

    // Record one sample (milliseconds). The sum is kept in double so that
    // many float samples can be accumulated before rounding matters.
    void add(float ms) { totalMs += ms; ++count; }

    // Mean of the recorded samples; returns 0 when nothing has been recorded
    // so the caller never divides by zero.
    float avg() const { return count > 0 ? static_cast<float>(totalMs / count) : 0.0f; }

    // Discard all recorded samples.
    void reset() { totalMs = 0.0; count = 0; }
};
|
||
|
||
// ── GPU-visible chunk info (must match HLSL GPUChunkInfo) ────────
|
||
struct GPUChunkInfo {
|
||
XMFLOAT4 worldPos; // xyz = chunk origin, w = debug flag
|
||
uint32_t quadOffset; // offset into mega quad buffer
|
||
uint32_t quadCount; // number of quads for this chunk
|
||
uint32_t pad[2]; // align to 32 bytes
|
||
uint32_t faceOffsets[6]; // per-face quad offset within this chunk's quads
|
||
uint32_t faceCounts[6]; // per-face quad count
|
||
uint32_t neighbors[6]; // chunk index of face neighbors (+X,-X,+Y,-Y,+Z,-Z), 0xFFFFFFFF = none
|
||
uint32_t pad2[2]; // pad to 112 bytes (7 × float4)
|
||
};
|
||
|
||
// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
|
||
class VoxelRenderer {
|
||
friend class VoxelRenderPath;
|
||
public:
|
||
VoxelRenderer();
|
||
~VoxelRenderer();
|
||
|
||
void initialize(wi::graphics::GraphicsDevice* device);
|
||
void shutdown();
|
||
|
||
// Mesh dirty chunks and repack the mega-buffer
|
||
void updateMeshes(VoxelWorld& world);
|
||
|
||
// Render all visible chunks
|
||
void render(
|
||
wi::graphics::CommandList cmd,
|
||
const wi::scene::CameraComponent& camera,
|
||
const wi::graphics::Texture& depthBuffer,
|
||
const wi::graphics::Texture& renderTarget,
|
||
const wi::graphics::Texture& normalTarget
|
||
) const;
|
||
|
||
// Generate procedural textures for materials
|
||
void generateTextures();
|
||
|
||
// Stats
|
||
uint32_t getTotalQuads() const { return totalQuads_; }
|
||
uint32_t getVisibleChunks() const { return visibleChunks_; }
|
||
uint32_t getDrawCalls() const { return drawCalls_; }
|
||
uint32_t getChunkCount() const { return chunkCount_; }
|
||
bool isInitialized() const { return initialized_; }
|
||
bool isGpuCulling() const { return gpuCullingEnabled_; }
|
||
bool isMdiEnabled() const { return mdiEnabled_; }
|
||
|
||
bool debugFaceColors_ = false;
|
||
bool debugBlend_ = false;
|
||
float windTime_ = 0.0f; // set by VoxelRenderPath::Update each frame
|
||
|
||
private:
|
||
void createPipeline();
|
||
void rebuildMegaBuffer(VoxelWorld& world);
|
||
|
||
wi::graphics::GraphicsDevice* device_ = nullptr;
|
||
|
||
// Shaders & Pipeline (voxels)
|
||
wi::graphics::Shader vertexShader_;
|
||
wi::graphics::Shader pixelShader_;
|
||
wi::graphics::PipelineState pso_;
|
||
wi::graphics::Shader cullShader_; // Frustum cull compute shader
|
||
|
||
// Shaders & Pipeline (topings, Phase 4)
|
||
wi::graphics::Shader topingVS_;
|
||
wi::graphics::Shader topingPS_;
|
||
wi::graphics::PipelineState topingPso_;
|
||
wi::graphics::GPUBuffer topingVertexBuffer_; // StructuredBuffer<TopingVertex>, SRV t4
|
||
wi::graphics::GPUBuffer topingInstanceBuffer_; // StructuredBuffer<float3>, SRV t5
|
||
static constexpr uint32_t MAX_TOPING_INSTANCES = 256 * 1024; // 256K instances max
|
||
// Persistent staging buffers for toping upload (avoids per-frame allocations)
|
||
struct TopingSortedInst { float wx, wy, wz; uint16_t type, variant; };
|
||
struct TopingGPUInst { float x, y, z; };
|
||
std::vector<TopingSortedInst> topingSorted_;
|
||
std::vector<TopingGPUInst> topingGpuInsts_;
|
||
mutable uint32_t topingDrawCalls_ = 0;
|
||
|
||
// Shaders & Pipeline (smooth surfaces, Phase 5)
|
||
wi::graphics::Shader smoothVS_;
|
||
wi::graphics::Shader smoothPS_;
|
||
wi::graphics::RasterizerState smoothRasterizer_;
|
||
wi::graphics::PipelineState smoothPso_;
|
||
wi::graphics::GPUBuffer smoothVertexBuffer_; // StructuredBuffer<SmoothVertex>, SRV t6
|
||
std::vector<SmoothVertex> smoothStagingVerts_; // persistent staging buffer (avoids per-frame alloc)
|
||
static constexpr uint32_t MAX_SMOOTH_VERTICES = 4 * 1024 * 1024; // 4M vertices max
|
||
mutable uint32_t smoothVertexCount_ = 0;
|
||
mutable uint32_t smoothDrawCalls_ = 0;
|
||
bool smoothDirty_ = true;
|
||
|
||
// Texture array for materials (256x256, 5 layers for prototype)
|
||
wi::graphics::Texture textureArray_;
|
||
wi::graphics::Sampler sampler_;
|
||
|
||
// ── Mega-buffer architecture (Phase 2) ──────────────────────
|
||
static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
|
||
static constexpr uint32_t MAX_CHUNKS = 2048;
|
||
static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk
|
||
|
||
wi::graphics::GPUBuffer megaQuadBuffer_; // StructuredBuffer<PackedQuad>, SRV t0
|
||
wi::graphics::GPUBuffer chunkInfoBuffer_; // StructuredBuffer<GPUChunkInfo>, SRV t2
|
||
|
||
// CPU-side tracking
|
||
struct ChunkSlot {
|
||
ChunkPos pos;
|
||
uint32_t quadOffset; // offset into mega-buffer (in quads)
|
||
uint32_t quadCount;
|
||
};
|
||
std::vector<ChunkSlot> chunkSlots_;
|
||
std::vector<GPUChunkInfo> cpuChunkInfo_;
|
||
std::vector<PackedQuad> cpuMegaQuads_; // CPU staging for mega-buffer
|
||
uint32_t chunkCount_ = 0;
|
||
bool megaBufferDirty_ = true;
|
||
|
||
// ── Indirect draw (Phase 2 MDI) ─────────────────────────────
|
||
// Wicked Engine's DrawInstancedIndirectCount command signature includes a
|
||
// push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
|
||
// Total stride = 4 + 16 = 20 bytes per draw entry.
|
||
struct IndirectDrawArgs {
|
||
uint32_t pushConstant; // written to b999[0] by ExecuteIndirect
|
||
uint32_t vertexCountPerInstance;
|
||
uint32_t instanceCount;
|
||
uint32_t startVertexLocation;
|
||
uint32_t startInstanceLocation;
|
||
};
|
||
wi::graphics::GPUBuffer indirectArgsBuffer_; // IndirectDrawArgs[MAX_DRAWS]
|
||
wi::graphics::GPUBuffer drawCountBuffer_; // uint32_t[1]
|
||
mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
|
||
bool gpuCullingEnabled_ = true; // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
|
||
bool mdiEnabled_ = true; // Phase 2.2: MDI rendering with CPU-filled indirect args
|
||
|
||
// Constants buffer (must match HLSL VoxelCB)
|
||
struct VoxelConstants {
|
||
XMFLOAT4X4 viewProjection;
|
||
XMFLOAT4X4 inverseViewProjection; // for depth-to-world reconstruction (RT shadows)
|
||
XMFLOAT4 cameraPosition;
|
||
XMFLOAT4 sunDirection;
|
||
XMFLOAT4 sunColor;
|
||
float chunkSize;
|
||
float textureTiling;
|
||
float blendEnabled;
|
||
float debugBlend;
|
||
XMFLOAT4 frustumPlanes[6]; // ax+by+cz+d=0
|
||
uint32_t chunkCount;
|
||
uint32_t bleedMask; // bit N set = material N can bleed onto neighbors
|
||
uint32_t resistBleedMask; // bit N set = material N resists bleed from neighbors
|
||
float windTime;
|
||
};
|
||
wi::graphics::GPUBuffer constantBuffer_;
|
||
|
||
// ── GPU Compute Mesher ──────────────────────────────────────────
|
||
wi::graphics::Shader meshShader_; // voxelMeshCS compute shader
|
||
mutable wi::graphics::GPUBuffer voxelDataBuffer_; // chunk voxel data (StructuredBuffer<uint>)
|
||
wi::graphics::GPUBuffer gpuQuadBuffer_; // GPU mesh output (RWStructuredBuffer<uint2>)
|
||
wi::graphics::GPUBuffer gpuQuadCounter_; // atomic counter for GPU mesh output
|
||
wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
|
||
bool gpuMesherAvailable_ = false;
|
||
bool gpuMeshEnabled_ = true; // Use GPU meshing instead of CPU greedy
|
||
mutable uint32_t gpuMeshQuadCount_ = 0; // Readback from previous frame (1-frame delay)
|
||
mutable uint32_t voxelDataCapacity_ = 0; // Current capacity of voxelDataBuffer_ (in uint32s)
|
||
mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
|
||
mutable bool voxelCacheDirty_ = true; // true: packedVoxelCache_ needs repack from chunks
|
||
mutable bool gpuMeshDirty_ = true; // true: GPU needs upload + re-dispatch
|
||
mutable bool chunkInfoDirty_ = true; // true: chunkInfoBuffer needs re-upload
|
||
|
||
// ── GPU Smooth Mesher (Phase 5.3) ─────────────────────────────
|
||
wi::graphics::Shader smoothCentroidShader_; // voxelSmoothCentroidCS (pass 1: centroid grid)
|
||
wi::graphics::Shader smoothMeshShader_; // voxelSmoothCS (pass 2: emit with smooth normals)
|
||
wi::graphics::GPUBuffer centroidGridBuffer_; // float4[34^3] per-chunk centroid grid (reused)
|
||
wi::graphics::GPUBuffer gpuSmoothVertexBuffer_; // RWStructuredBuffer<GPUSmoothVertex>, UAV+SRV
|
||
wi::graphics::GPUBuffer gpuSmoothCounter_; // atomic counter for smooth vertices
|
||
wi::graphics::GPUBuffer smoothCounterReadback_; // READBACK buffer for vertex counter
|
||
static constexpr uint32_t CENTROID_GRID_SIZE = 34 * 34 * 34; // 39304 entries per chunk
|
||
static constexpr uint32_t MAX_GPU_SMOOTH_VERTICES = 2 * 1024 * 1024; // 2M vertices max
|
||
mutable uint32_t gpuSmoothVertexCount_ = 0; // readback from previous frame
|
||
mutable bool gpuSmoothMeshDirty_ = true;
|
||
|
||
// ── Ray Tracing (Phase 6.1) ─────────────────────────────────────
|
||
wi::graphics::Shader blasExtractShader_; // voxelBLASExtractCS compute shader
|
||
mutable wi::graphics::GPUBuffer blasPositionBuffer_; // float3[] for blocky BLAS (6 verts per quad)
|
||
wi::graphics::GPUBuffer blasIndexBuffer_; // sequential uint32 indices [0,1,2,...] for BLAS
|
||
mutable wi::graphics::RaytracingAccelerationStructure blockyBLAS_;
|
||
mutable wi::graphics::RaytracingAccelerationStructure smoothBLAS_;
|
||
mutable wi::graphics::RaytracingAccelerationStructure tlas_;
|
||
static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
|
||
mutable bool rtAvailable_ = false; // GPU supports RT
|
||
mutable bool rtDirty_ = true; // BLAS/TLAS need rebuild
|
||
mutable uint32_t rtBlockyVertexCount_ = 0; // current blocky BLAS vertex count
|
||
mutable uint32_t rtSmoothVertexCount_ = 0; // current smooth BLAS vertex count
|
||
|
||
void dispatchBLASExtract(wi::graphics::CommandList cmd) const;
|
||
void buildAccelerationStructures(wi::graphics::CommandList cmd) const;
|
||
|
||
// ── RT Shadows (Phase 6.2) ─────────────────────────────────────
|
||
wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader
|
||
mutable bool rtShadowsEnabled_ = false; // true when shader + TLAS ready
|
||
mutable bool rtShadowDebug_ = false; // debug visualization mode
|
||
|
||
void dispatchShadows(wi::graphics::CommandList cmd,
|
||
const wi::graphics::Texture& depthBuffer,
|
||
const wi::graphics::Texture& renderTarget,
|
||
const wi::graphics::Texture& normalTarget) const;
|
||
|
||
// Benchmark state machine: runs once after world gen
|
||
enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
|
||
mutable BenchState benchState_ = BenchState::IDLE;
|
||
mutable float cpuMeshTimeMs_ = 0.0f;
|
||
mutable uint32_t gpuBaselineQuads_ = 0;
|
||
|
||
void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
|
||
void readbackGpuMeshBenchmark() const;
|
||
void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
|
||
ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
|
||
ProfileAccum* profDispatch = nullptr) const;
|
||
void dispatchGpuSmoothMesh(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
|
||
void rebuildChunkInfoOnly(VoxelWorld& world);
|
||
|
||
// ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
|
||
wi::graphics::GPUQueryHeap timestampHeap_;
|
||
wi::graphics::GPUBuffer timestampReadback_;
|
||
static constexpr uint32_t TS_CULL_BEGIN = 0;
|
||
static constexpr uint32_t TS_CULL_END = 1;
|
||
static constexpr uint32_t TS_DRAW_BEGIN = 2;
|
||
static constexpr uint32_t TS_DRAW_END = 3;
|
||
static constexpr uint32_t TS_MESH_BEGIN = 4;
|
||
static constexpr uint32_t TS_MESH_END = 5;
|
||
static constexpr uint32_t TS_COUNT = 6;
|
||
mutable float gpuCullTimeMs_ = 0.0f;
|
||
mutable float gpuDrawTimeMs_ = 0.0f;
|
||
mutable float gpuMeshTimeMs_ = 0.0f;
|
||
|
||
// Stats (mutable: updated during const Render() call)
|
||
mutable uint32_t totalQuads_ = 0;
|
||
mutable uint32_t visibleChunks_ = 0;
|
||
mutable uint32_t drawCalls_ = 0;
|
||
|
||
bool initialized_ = false;
|
||
|
||
public:
|
||
float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
|
||
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
|
||
bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
|
||
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
|
||
|
||
// Phase 4: Toping rendering
|
||
void uploadTopingData(const TopingSystem& topingSystem);
|
||
void renderTopings(
|
||
wi::graphics::CommandList cmd,
|
||
const TopingSystem& topingSystem,
|
||
const wi::graphics::Texture& depthBuffer,
|
||
const wi::graphics::Texture& renderTarget,
|
||
const wi::graphics::Texture& normalTarget
|
||
) const;
|
||
uint32_t getTopingDrawCalls() const { return topingDrawCalls_; }
|
||
|
||
// Phase 5: Smooth surface rendering
|
||
void uploadSmoothData(VoxelWorld& world);
|
||
void uploadSmoothDataFast(VoxelWorld& world); // chunkIndex already stamped
|
||
void renderSmooth(
|
||
wi::graphics::CommandList cmd,
|
||
const wi::graphics::Texture& depthBuffer,
|
||
const wi::graphics::Texture& renderTarget,
|
||
const wi::graphics::Texture& normalTarget
|
||
) const;
|
||
uint32_t getSmoothVertexCount() const { return (smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid()) ? gpuSmoothVertexCount_ : smoothVertexCount_; }
|
||
uint32_t getSmoothDrawCalls() const { return smoothDrawCalls_; }
|
||
|
||
// Phase 6: Ray Tracing
|
||
bool isRTAvailable() const { return rtAvailable_; }
|
||
bool isRTReady() const { return rtAvailable_ && tlas_.IsValid(); }
|
||
bool isRTShadowsEnabled() const { return rtShadowsEnabled_; }
|
||
uint32_t getRTBlockyTriCount() const { return rtBlockyVertexCount_ / 3; }
|
||
uint32_t getRTSmoothTriCount() const { return rtSmoothVertexCount_ / 3; }
|
||
const wi::graphics::RaytracingAccelerationStructure& getTLAS() const { return tlas_; }
|
||
};
|
||
|
||
// ── Custom RenderPath that integrates voxel rendering ───────────
|
||
class VoxelRenderPath : public wi::RenderPath3D {
|
||
public:
|
||
VoxelWorld world;
|
||
VoxelRenderer renderer;
|
||
TopingSystem topingSystem;
|
||
|
||
bool debugMode = false;
|
||
bool debugSmooth = false;
|
||
|
||
float cameraSpeed = 50.0f;
|
||
float cameraSensitivity = 0.003f;
|
||
XMFLOAT3 cameraPos = { 256.0f, 100.0f, 256.0f };
|
||
float cameraPitch = -0.3f;
|
||
float cameraYaw = 0.0f;
|
||
bool mouseCaptured = false;
|
||
|
||
void Start() override;
|
||
void Update(float dt) override;
|
||
void Render() const override;
|
||
void Compose(wi::graphics::CommandList cmd) const override;
|
||
|
||
private:
|
||
void handleInput(float dt);
|
||
void createRenderTargets();
|
||
mutable bool worldGenerated_ = false;
|
||
mutable int frameCount_ = 0;
|
||
mutable float lastDt_ = 0.016f;
|
||
mutable float smoothFps_ = 60.0f;
|
||
|
||
// Wind animation (continuous, always running)
|
||
float windTime_ = 0.0f;
|
||
|
||
// Animated terrain (wave effect at 60 Hz, toggled with F3)
|
||
bool animatedTerrain_ = false;
|
||
float animTime_ = 0.0f;
|
||
float animAccum_ = 0.0f;
|
||
static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz
|
||
|
||
wi::graphics::Texture voxelRT_;
|
||
wi::graphics::Texture voxelNormalRT_; // Phase 6: world-space normals for RT shadows/AO
|
||
wi::graphics::Texture voxelDepth_;
|
||
mutable bool rtCreated_ = false;
|
||
|
||
// ── CPU Profiling (averages every 5 seconds) ─────────────────
|
||
mutable ProfileAccum profRegenerate_; // regenerateAnimated
|
||
mutable ProfileAccum profUpdateMeshes_; // updateMeshes (rebuildChunkInfoOnly or CPU mesh)
|
||
mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh
|
||
mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh
|
||
mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh
|
||
mutable ProfileAccum profRender_; // render() total
|
||
mutable ProfileAccum profFrame_; // full frame (Update + Render + Compose)
|
||
mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks)
|
||
mutable ProfileAccum profSmoothUpload_; // uploadSmoothData
|
||
mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances
|
||
mutable ProfileAccum profTopingUpload_; // uploadTopingData
|
||
mutable float profTimer_ = 0.0f;
|
||
static constexpr float PROF_INTERVAL = 5.0f;
|
||
void logProfilingAverages() const;
|
||
};
|
||
|
||
} // namespace voxel
|