bvle-voxels/src/voxel/VoxelRenderer.h

#pragma once
#include "VoxelWorld.h"
#include "VoxelMesher.h"
#include "TopingSystem.h"
#include "WickedEngine.h"

namespace voxel {

// ── CPU Profiling accumulator ────────────────────────────────────
// Running sum of timing samples, used to report an average duration.
// totalMs is kept as a double so that many float samples can be summed
// without accumulating single-precision error.
struct ProfileAccum {
    double totalMs = 0.0;   // sum of all samples added since the last reset()
    uint32_t count = 0;     // number of samples added since the last reset()

    // Records one sample, in milliseconds.
    void add(float ms) {
        totalMs += ms;
        ++count;
    }

    // Mean of the recorded samples in milliseconds; 0 when nothing was recorded.
    float avg() const {
        if (count == 0) {
            return 0.0f;
        }
        return static_cast<float>(totalMs / count);
    }

    // Discards all recorded samples.
    void reset() {
        totalMs = 0.0;
        count = 0;
    }
};

// ── GPU-visible chunk info (must match HLSL GPUChunkInfo) ────────
// Per-chunk record read by the shaders. Member order and sizes form a binary
// contract with the HLSL GPUChunkInfo declaration: do not reorder, resize or
// insert members without changing the shader side in the same commit.
// Layout, assuming XMFLOAT4 is four packed 32-bit floats (TODO confirm):
//   bytes  0..31  worldPos, quadOffset, quadCount, pad   (16 + 4 + 4 + 8)
//   bytes 32..55  faceOffsets
//   bytes 56..79  faceCounts
// NOTE(review): which face direction each of the 6 indices denotes is not
// visible in this header — check the mesher / shader for the ordering.
struct GPUChunkInfo {
    XMFLOAT4 worldPos;         // xyz = chunk origin, w = debug flag
    uint32_t quadOffset;        // offset of this chunk's first quad in the mega quad buffer
    uint32_t quadCount;         // number of quads belonging to this chunk
    uint32_t pad[2];            // explicit padding: rounds the fields above up to 32 bytes
    uint32_t faceOffsets[6];    // per-face quad offset, relative to this chunk's quads
    uint32_t faceCounts[6];     // per-face quad count
};

// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
// Draws the voxel world from one shared ("mega") quad buffer using indirect
// draws (Phase 2: mega-buffer + MDI pipeline).
//
// Only declarations are visible in this header; the notes below restate what
// the member names, types and pre-existing comments establish.
//
// NOTE(review): a destructor is declared but no copy/move operations are, so
// the compiler-generated copy operations remain available. Whether copying a
// renderer (device pointer plus GPU resource handles) is meaningful depends on
// the out-of-view implementation — confirm, and consider deleting copy.
class VoxelRenderer {
    // Grants VoxelRenderPath access to the private members below.
    friend class VoxelRenderPath;
public:
    VoxelRenderer();
    ~VoxelRenderer();

    // Sets the renderer up for `device` (presumably kept in device_ — the
    // renderer does not appear to own it; verify in the implementation).
    void initialize(wi::graphics::GraphicsDevice* device);
    // Counterpart of initialize().
    void shutdown();

    // Mesh dirty chunks and repack the mega-buffer
    void updateMeshes(VoxelWorld& world);

    // Render all visible chunks.
    // Declared const; the per-frame state it updates is marked `mutable`.
    void render(
        wi::graphics::CommandList cmd,
        const wi::scene::CameraComponent& camera,
        const wi::graphics::Texture& depthBuffer,
        const wi::graphics::Texture& renderTarget
    ) const;

    // Generate procedural textures for materials
    void generateTextures();

    // Stats
    uint32_t getTotalQuads() const { return totalQuads_; }
    uint32_t getVisibleChunks() const { return visibleChunks_; }
    uint32_t getDrawCalls() const { return drawCalls_; }
    uint32_t getChunkCount() const { return chunkCount_; }
    bool isInitialized() const { return initialized_; }
    bool isGpuCulling() const { return gpuCullingEnabled_; }
    bool isMdiEnabled() const { return mdiEnabled_; }

    // Debug toggles, publicly writable. They keep the trailing underscore of
    // the private members; renaming them would break existing callers.
    bool debugFaceColors_ = false;
    bool debugBlend_ = false;

private:
    void createPipeline();
    void rebuildMegaBuffer(VoxelWorld& world);

    wi::graphics::GraphicsDevice* device_ = nullptr;

    // Shaders & Pipeline
    wi::graphics::Shader vertexShader_;
    wi::graphics::Shader pixelShader_;
    wi::graphics::PipelineState pso_;
    wi::graphics::Shader cullShader_; // Frustum cull compute shader

    // Texture array for materials (256x256, 5 layers for prototype)
    wi::graphics::Texture textureArray_;
    wi::graphics::Sampler sampler_;

    // ── Mega-buffer architecture (Phase 2) ──────────────────────
    static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB, i.e. 8 bytes per quad)
    static constexpr uint32_t MAX_CHUNKS = 2048;
    static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk

    wi::graphics::GPUBuffer megaQuadBuffer_;    // StructuredBuffer<PackedQuad>, SRV t0
    wi::graphics::GPUBuffer chunkInfoBuffer_;   // StructuredBuffer<GPUChunkInfo>, SRV t2

    // CPU-side tracking
    // One entry per chunk: where its quads live in the mega-buffer.
    struct ChunkSlot {
        ChunkPos pos;
        uint32_t quadOffset;    // offset into mega-buffer (in quads)
        uint32_t quadCount;
    };
    std::vector<ChunkSlot> chunkSlots_;
    std::vector<GPUChunkInfo> cpuChunkInfo_;     // CPU copy of the chunk-info records
    std::vector<PackedQuad> cpuMegaQuads_;       // CPU staging for mega-buffer
    uint32_t chunkCount_ = 0;
    bool megaBufferDirty_ = true;

    // ── Indirect draw (Phase 2 MDI) ─────────────────────────────
    // Wicked Engine's DrawInstancedIndirectCount command signature includes a
    // push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
    // Total stride = 4 + 16 = 20 bytes per draw entry.
    struct IndirectDrawArgs {
        uint32_t pushConstant;              // written to b999[0] by ExecuteIndirect
        uint32_t vertexCountPerInstance;
        uint32_t instanceCount;
        uint32_t startVertexLocation;
        uint32_t startInstanceLocation;
    };
    // The GPU consumes this struct with a fixed stride, so a layout change must
    // fail the build instead of silently corrupting the indirect draws.
    static_assert(sizeof(IndirectDrawArgs) == 20,
        "IndirectDrawArgs must be 20 bytes: 4-byte push constant + 16-byte draw arguments");
    wi::graphics::GPUBuffer indirectArgsBuffer_;   // IndirectDrawArgs[MAX_DRAWS]
    wi::graphics::GPUBuffer drawCountBuffer_;      // uint32_t[1]
    mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_; // CPU-side indirect args; mutable for use from const render()
    bool gpuCullingEnabled_ = true;                // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
    bool mdiEnabled_ = true;                       // Phase 2.2: MDI rendering with CPU-filled indirect args

    // Constants buffer (must match HLSL VoxelCB)
    // Member order and sizes are a binary contract with the shader: keep the
    // two declarations in sync. _cullPad2 is explicit tail padding.
    struct VoxelConstants {
        XMFLOAT4X4 viewProjection;
        XMFLOAT4 cameraPosition;
        XMFLOAT4 sunDirection;
        XMFLOAT4 sunColor;
        float chunkSize;
        float textureTiling;
        float blendEnabled;
        float debugBlend;
        XMFLOAT4 frustumPlanes[6]; // ax+by+cz+d=0
        uint32_t chunkCount;
        uint32_t bleedMask;       // bit N set = material N can bleed onto neighbors
        uint32_t resistBleedMask; // bit N set = material N resists bleed from neighbors
        uint32_t _cullPad2;
    };
    wi::graphics::GPUBuffer constantBuffer_;

    // ── GPU Compute Mesher ──────────────────────────────────────────
    // Members marked `mutable` here are presumably updated by the const
    // dispatch/readback helpers declared further down — verify in the .cpp.
    wi::graphics::Shader meshShader_;         // voxelMeshCS compute shader
    mutable wi::graphics::GPUBuffer voxelDataBuffer_; // chunk voxel data (StructuredBuffer<uint>)
    wi::graphics::GPUBuffer gpuQuadBuffer_;   // GPU mesh output (RWStructuredBuffer<uint2>)
    wi::graphics::GPUBuffer gpuQuadCounter_;  // atomic counter for GPU mesh output
    wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
    bool gpuMesherAvailable_ = false;
    bool gpuMeshEnabled_ = true;              // Use GPU meshing instead of CPU greedy
    mutable uint32_t gpuMeshQuadCount_ = 0;   // Readback from previous frame (1-frame delay)
    mutable uint32_t voxelDataCapacity_ = 0;  // Current capacity of voxelDataBuffer_ (in uint32s)
    mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
    mutable bool voxelCacheDirty_ = true;     // true: packedVoxelCache_ needs repack from chunks
    mutable bool gpuMeshDirty_ = true;        // true: GPU needs upload + re-dispatch
    mutable bool chunkInfoDirty_ = true;      // true: chunkInfoBuffer needs re-upload

    // Benchmark state machine: runs once after world gen
    enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
    mutable BenchState benchState_ = BenchState::IDLE;
    mutable float cpuMeshTimeMs_ = 0.0f;
    mutable uint32_t gpuBaselineQuads_ = 0;

    void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
    void readbackGpuMeshBenchmark() const;
    // The three optional accumulators receive CPU timings for the pack, upload
    // and dispatch stages (naming taken from the profilers in VoxelRenderPath);
    // each may be nullptr, which is the default.
    void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
        ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
        ProfileAccum* profDispatch = nullptr) const;
    void rebuildChunkInfoOnly(VoxelWorld& world);

    // ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
    // TS_* are slot indices into timestampHeap_, paired as begin/end per pass.
    wi::graphics::GPUQueryHeap timestampHeap_;
    wi::graphics::GPUBuffer timestampReadback_;
    static constexpr uint32_t TS_CULL_BEGIN = 0;
    static constexpr uint32_t TS_CULL_END = 1;
    static constexpr uint32_t TS_DRAW_BEGIN = 2;
    static constexpr uint32_t TS_DRAW_END = 3;
    static constexpr uint32_t TS_MESH_BEGIN = 4;
    static constexpr uint32_t TS_MESH_END = 5;
    static constexpr uint32_t TS_COUNT = 6;
    mutable float gpuCullTimeMs_ = 0.0f;
    mutable float gpuDrawTimeMs_ = 0.0f;
    mutable float gpuMeshTimeMs_ = 0.0f;

    // Stats (mutable: updated during const Render() call)
    mutable uint32_t totalQuads_ = 0;
    mutable uint32_t visibleChunks_ = 0;
    mutable uint32_t drawCalls_ = 0;

    bool initialized_ = false;

public:
    // Timing accessors, in milliseconds, one per tracked timing member.
    float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
    float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
    float getGpuMeshTimeMs() const { return gpuMeshTimeMs_; }
    float getCpuMeshTimeMs() const { return cpuMeshTimeMs_; }
    // True only when GPU meshing is both requested and available.
    bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
    uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
};

// ── Custom RenderPath that integrates voxel rendering ───────────
// Render path that owns the voxel world, its renderer and the toping system,
// and hooks them into the engine by overriding the wi::RenderPath3D
// Start / Update / Render / Compose callbacks.
// Render() and Compose() are const overrides, which is presumably why the
// per-frame bookkeeping below is declared `mutable` — verify in the .cpp.
class VoxelRenderPath : public wi::RenderPath3D {
public:
    // Owned subsystems. Members are constructed in declaration order and
    // destroyed in reverse, so keep this order unless the .cpp is checked.
    VoxelWorld world;
    VoxelRenderer renderer;
    TopingSystem topingSystem;

    bool debugMode = false;

    // Free-fly camera state.
    // NOTE(review): units are not stated here — speed is presumably world
    // units per second and pitch/yaw radians; confirm against handleInput().
    float cameraSpeed = 50.0f;
    float cameraSensitivity = 0.003f;
    XMFLOAT3 cameraPos = { 256.0f, 100.0f, 256.0f };
    float cameraPitch = -0.3f;
    float cameraYaw = 0.0f;
    bool mouseCaptured = false;

    // wi::RenderPath3D overrides.
    void Start() override;
    void Update(float dt) override;
    void Render() const override;
    void Compose(wi::graphics::CommandList cmd) const override;

private:
    void handleInput(float dt);
    void createRenderTargets();
    mutable bool worldGenerated_ = false;
    mutable int frameCount_ = 0;
    mutable float lastDt_ = 0.016f;      // initial value is about one 60 Hz frame, in seconds
    mutable float smoothFps_ = 60.0f;

    // Animated terrain (wave effect at 60 Hz, toggled with F3)
    bool animatedTerrain_ = false;
    float animTime_ = 0.0f;
    float animAccum_ = 0.0f;             // presumably time accumulated toward the next ANIM_INTERVAL step
    static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz

    // Off-screen color and depth targets for the voxel pass.
    wi::graphics::Texture voxelRT_;
    wi::graphics::Texture voxelDepth_;
    mutable bool rtCreated_ = false;

    // ── CPU Profiling (averages every 5 seconds) ─────────────────
    mutable ProfileAccum profRegenerate_;     // regenerateAnimated
    mutable ProfileAccum profUpdateMeshes_;   // updateMeshes (rebuildChunkInfoOnly or CPU mesh)
    mutable ProfileAccum profVoxelPack_;      // voxel data packing in dispatchGpuMesh
    mutable ProfileAccum profGpuUpload_;      // GPU upload in dispatchGpuMesh
    mutable ProfileAccum profGpuDispatch_;    // compute dispatches in dispatchGpuMesh
    mutable ProfileAccum profRender_;         // render() total
    mutable ProfileAccum profFrame_;          // full frame (Update + Render + Compose)
    mutable float profTimer_ = 0.0f;          // presumably time since the last report, in seconds
    static constexpr float PROF_INTERVAL = 5.0f; // reporting period, in seconds
    void logProfilingAverages() const;
};

} // namespace voxel