bvle-voxels/shaders/voxelVS.hlsl

// BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer)
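// Supports three draw paths selected by push.flags: CPU draw loop (explicit push constants),
// MDI (chunk and face packed into push.chunkIndex) and GPU mesh (chunk index stored per quad).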

#include "voxelCommon.hlsli"

// One voxel face quad packed into 64 bits; decoded by unpackQuad().
//   data.x (low word):  px [5:0], py [11:6], pz [17:12], w [23:18], h [29:24],
//                       face bits 0-1 in [31:30]
//   data.y (high word): face bit 2 in [0], matID [8:1], ao [16:9];
//                       the GPU mesh path in main() additionally reads an 11-bit
//                       chunk index from bits [27:17]
struct PackedQuad {
    uint2 data; // 8 bytes = 2 x uint32
};

// All packed quads; main() indexes it with a global quad index.
StructuredBuffer<PackedQuad> quadBuffer : register(t0);
// Per-chunk metadata indexed by chunk index; this file reads worldPos, quadOffset,
// quadCount and (through getFaceOffset) the per-face offsets.
StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);

// Push constants (48 bytes = 12 x uint32)
//   CPU path      (flags bits 0 and 1 clear): chunkIndex + quadOffset explicit
//   MDI path      (flags bit 0 set): chunkIndex packs chunk (low 16 bits) and
//                 face index (high 16 bits); quad offset comes from GPUChunkInfo
//   GPU mesh path (flags bit 1 set, tested first): quadOffset is the base offset,
//                 chunk index is read from each quad's high word
struct VoxelPush {
    uint chunkIndex;   // CPU: chunk index; MDI: chunk | (face << 16); unused by the GPU mesh path
    uint quadOffset;   // offset into mega quad buffer (in quads); not read by the MDI path
    uint flags;        // bit 0: MDI mode, bit 1: GPU mesh mode, neither: CPU mode
    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8; // padding up to 12 uints
};
[[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999);

// Per-vertex outputs of main(), consumed by the next pipeline stage.
struct VSOutput {
    float4 position : SV_POSITION; // worldPos transformed by viewProjection
    float3 worldPos : WORLDPOS;    // chunk origin (info.worldPos.xyz) + chunk-local vertex position
    float3 normal   : NORMAL;      // axis-aligned face normal looked up from faceNormals
    float2 uv       : TEXCOORD0;   // quad corner scaled by (w, h) and textureTiling
    nointerpolation uint materialID : MATERIALID; // 8-bit material id from the packed quad
    nointerpolation uint faceID : FACEID;         // 3-bit face id from the packed quad
    nointerpolation float debugFlag : DEBUGFLAG;  // copy of info.worldPos.w
    float ao        : AO;          // 1.0 - 0.4 * (2-bit AO level / 3), i.e. within [0.6, 1.0]
};

// Decodes one packed 64-bit quad (raw.x = low word, raw.y = high word).
//   low word:  px [5:0], py [11:6], pz [17:12], w [23:18], h [29:24], face bits 0-1 [31:30]
//   high word: face bit 2 [0], matID [8:1], ao [16:9]
// Bits above 16 of the high word are not decoded here.
void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,
                out uint w, out uint h, out uint face,
                out uint matID, out uint ao)
{
    const uint kSixBits = 0x3Fu;

    // Position: three consecutive 6-bit fields starting at bit 0 of the low word.
    const uint3 cell = (raw.xxx >> uint3(0u, 6u, 12u)) & kSixBits;
    px = cell.x;
    py = cell.y;
    pz = cell.z;

    // Extent: two 6-bit fields at bits 18 and 24 of the low word.
    const uint2 extent = (raw.xx >> uint2(18u, 24u)) & kSixBits;
    w = extent.x;
    h = extent.y;

    // The face id straddles the word boundary: its two low bits are the top two
    // bits of the low word, its third bit is bit 0 of the high word.
    face = ((raw.x >> 30) | (raw.y << 2)) & 0x7u;

    // Mask the 8-bit fields in place, then shift them down.
    matID = (raw.y & 0x1FEu) >> 1;
    ao    = (raw.y & 0x1FE00u) >> 9;
}

// Locates the chunk whose quad range contains a global quad index.
// Bisects chunkInfoBuffer[0 .. chunkCount) and returns the first chunk index whose
// range end (quadOffset + quadCount) is greater than globalQuadIndex, or chunkCount
// when no chunk qualifies.
// Relies on chunks being packed contiguously in the mega-buffer in ascending chunk
// index order, so that range ends never decrease.
// Roughly log2(chunkCount) iterations, e.g. ~11 for 2048 chunks.
uint findChunkIndex(uint globalQuadIndex) {
    uint first = 0;
    uint last = chunkCount; // exclusive upper bound of the search window
    [loop]
    while (first < last) {
        const uint probe = (first + last) / 2u;
        const GPUChunkInfo entry = chunkInfoBuffer[probe];
        const uint rangeEnd = entry.quadOffset + entry.quadCount;
        if (globalQuadIndex < rangeEnd) {
            // probe may still be the owner: keep it inside the window.
            last = probe;
        } else {
            // probe's range ends at or before the target: discard it.
            first = probe + 1;
        }
    }
    return first;
}

// Per-face lookup tables, all indexed by the face id decoded from the packed quad.
// Face normals: +X, -X, +Y, -Y, +Z, -Z (even ids are the positive-facing sides)
static const float3 faceNormals[6] = {
    float3( 1, 0, 0), float3(-1, 0, 0),
    float3( 0, 1, 0), float3( 0,-1, 0),
    float3( 0, 0, 1), float3( 0, 0,-1)
};

// Face U/V tangent axes for quad expansion
// U is the in-plane axis the quad width (w) extends along: Y for X faces, X otherwise.
static const float3 faceU[6] = {
    float3(0, 1, 0), float3(0, 1, 0),
    float3(1, 0, 0), float3(1, 0, 0),
    float3(1, 0, 0), float3(1, 0, 0)
};

// V is the in-plane axis the quad height (h) extends along: Y for Z faces, Z otherwise.
// cross(U, V) equals the normal for faces 0, 3, 4 and its negation for faces 1, 2, 5;
// main() picks the vertex winding accordingly.
static const float3 faceV[6] = {
    float3(0, 0, 1), float3(0, 0, 1),
    float3(0, 0, 1), float3(0, 0, 1),
    float3(0, 1, 0), float3(0, 1, 0)
};

// Vertex-pulling entry point: no vertex buffer is bound. Every 6 consecutive
// SV_VertexID values expand one PackedQuad from quadBuffer into two triangles.
// push.flags selects how the quad and its owning chunk are located:
//   bit 1 set          -> GPU mesh path (chunk index stored in the quad itself)
//   bit 0 set          -> MDI path (chunk and face packed into push.chunkIndex)
//   neither bit set    -> CPU path (explicit chunkIndex / quadOffset)
[RootSignature(VOXEL_ROOTSIG)]
VSOutput main(uint vertexID : SV_VertexID)
{
    VSOutput output;

    const bool gpuMeshPath = (push.flags & 2) != 0;

    // Determine quad index and chunk index based on rendering mode
    uint quadIndex;
    uint chunkIndex = 0;

    if (gpuMeshPath) {
        // GPU mesh path: quads are in a flat buffer, chunk index is embedded
        // in each quad's hi word (bits [27:17] = 11-bit chunk index) and is
        // extracted below, once the quad has been fetched.
        // push.quadOffset = base offset into the GPU quad buffer.
        quadIndex = push.quadOffset + (vertexID / 6);
    } else if (push.flags & 1) {
        // MDI path: push.chunkIndex is packed by ExecuteIndirect command signature:
        //   low 16 bits  = chunk index into chunkInfoBuffer
        //   high 16 bits = face index (0-5)
        // SV_VertexID starts at 0 (startVertexLocation=0), so we compute the
        // global quad index from the GPUChunkInfo face offset.
        chunkIndex = push.chunkIndex & 0xFFFF;
        uint faceIdx = push.chunkIndex >> 16;
        GPUChunkInfo ci = chunkInfoBuffer[chunkIndex];
        uint faceOff = getFaceOffset(ci, faceIdx);
        quadIndex = ci.quadOffset + faceOff + (vertexID / 6);
    } else {
        // CPU path: push constants provide explicit offsets
        quadIndex = push.quadOffset + (vertexID / 6);
        chunkIndex = push.chunkIndex;
    }

    // Which of the quad's 6 vertices (2 triangles) this invocation produces.
    uint cornerIndex = vertexID % 6;

    PackedQuad packed = quadBuffer[quadIndex];
    uint px, py, pz, w, h, face, matID, ao;
    unpackQuad(packed.data, px, py, pz, w, h, face, matID, ao);

    // GPU mesh path: extract the 11-bit chunk index from bits [27:17] of the hi word
    if (gpuMeshPath) {
        chunkIndex = (packed.data.y >> 17) & 0x7FF;
    }

    GPUChunkInfo info = chunkInfoBuffer[chunkIndex];

    // Corner offsets for 2 triangles (6 vertices per quad)
    // cross(U,V) matches N for faces: +X(0), -Y(3), +Z(4) -> CW corners
    // cross(U,V) opposes N for faces: -X(1), +Y(2), -Z(5) -> CCW corners
    static const float2 cornersCW[6] = {
        float2(0, 0), float2(0, 1), float2(1, 0),
        float2(1, 0), float2(0, 1), float2(1, 1)
    };
    static const float2 cornersCCW[6] = {
        float2(0, 0), float2(1, 0), float2(0, 1),
        float2(0, 1), float2(1, 0), float2(1, 1)
    };
    bool useCCW = (face == 1 || face == 2 || face == 5);
    float2 corner = useCCW ? cornersCCW[cornerIndex] : cornersCW[cornerIndex];

    // NOTE(review): face is a 3-bit field, so values 6-7 would index past the
    // 6-entry face tables; this assumes the mesher only emits 0-5 - confirm.
    float3 basePos = float3((float)px, (float)py, (float)pz);
    float3 normal = faceNormals[face];
    float3 uAxis = faceU[face];
    float3 vAxis = faceV[face];

    // Positive faces: offset by 1 in normal direction
    float3 faceOffset = (face % 2 == 0) ? normal : float3(0, 0, 0);

    // Expand quad
    float3 localPos = basePos + faceOffset + uAxis * corner.x * (float)w + vAxis * corner.y * (float)h;
    float3 worldPos = localPos + info.worldPos.xyz;

    output.position = mul(viewProjection, float4(worldPos, 1.0));
    output.worldPos = worldPos;
    output.normal = normal;
    output.uv = corner * float2((float)w, (float)h) * textureTiling;
    output.materialID = matID;
    output.faceID = face;
    output.debugFlag = info.worldPos.w;

    // AO: 4 corners x 2 bits
    // In both corner tables vertex 3 sits on the same quad corner as vertex 2 and
    // vertex 4 on the same corner as vertex 1, so they must read the same AO slot;
    // otherwise AO is discontinuous along the shared diagonal of the two triangles.
    // Vertices 0, 1, 2 and 5 keep slots 0, 1, 2 and 3.
    // NOTE(review): assumes the mesher packs AO in first-triangle vertex order
    // followed by the far corner - confirm against the mesher.
    static const uint aoSlotOfVertex[6] = { 0u, 1u, 2u, 2u, 1u, 3u };
    uint aoCorner = aoSlotOfVertex[cornerIndex];
    float aoValue = (float)((ao >> (aoCorner * 2u)) & 3u) / 3.0;
    output.ao = 1.0 - aoValue * 0.4;

    return output;
}
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`// BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer)`
Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`// Phase 2: supports both CPU draw loop (push constants) and GPU MDI (binary search).`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00
			`#include "voxelCommon.hlsli"`

			`struct PackedQuad {`
			`uint2 data; // 8 bytes = 2 x uint32`
			`};`

			`StructuredBuffer<PackedQuad> quadBuffer : register(t0);`
			`StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);`

Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`// Push constants (48 bytes = 12 x uint32)`
			`// CPU path: chunkIndex + quadOffset explicit`
			`// MDI path: flags bit 0 set, VS derives chunk from SV_VertexID via binary search`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`struct VoxelPush {`
			`uint chunkIndex;`
			`uint quadOffset; // offset into mega quad buffer (in quads)`
Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`uint flags; // bit 0: 1 = MDI mode (binary search), 0 = CPU mode`
			`uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8;`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`};`
			`[[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999);`

			`struct VSOutput {`
			`float4 position : SV_POSITION;`
			`float3 worldPos : WORLDPOS;`
			`float3 normal : NORMAL;`
			`float2 uv : TEXCOORD0;`
			`nointerpolation uint materialID : MATERIALID;`
			`nointerpolation uint faceID : FACEID;`
			`nointerpolation float debugFlag : DEBUGFLAG;`
			`float ao : AO;`
			`};`

			`// Unpack 64 bits from 2 x uint32`
			`void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,`
			`out uint w, out uint h, out uint face,`
			`out uint matID, out uint ao)`
			`{`
			`uint lo = raw.x;`
			`uint hi = raw.y;`
			`px = lo & 0x3F;`
			`py = (lo >> 6) & 0x3F;`
			`pz = (lo >> 12) & 0x3F;`
			`w = (lo >> 18) & 0x3F;`
			`h = (lo >> 24) & 0x3F;`
			`face = ((lo >> 30) & 0x3) \| ((hi & 0x1) << 2);`
			`matID = (hi >> 1) & 0xFF;`
			`ao = (hi >> 9) & 0xFF;`
			`}`

Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`// Binary search: find which chunk owns a given global quad index.`
			`// Chunks are packed contiguously in the mega-buffer, sorted by chunk index.`
			`// O(log2(chunkCount)) = ~11 iterations for 2048 chunks.`
			`uint findChunkIndex(uint globalQuadIndex) {`
			`uint lo = 0, hi = chunkCount;`
			`[loop]`
			`while (lo < hi) {`
			`uint mid = (lo + hi) >> 1;`
			`GPUChunkInfo ci = chunkInfoBuffer[mid];`
			`if (ci.quadOffset + ci.quadCount <= globalQuadIndex)`
			`lo = mid + 1;`
			`else`
			`hi = mid;`
			`}`
			`return lo;`
			`}`

Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`// Face normals: +X, -X, +Y, -Y, +Z, -Z`
			`static const float3 faceNormals[6] = {`
			`float3( 1, 0, 0), float3(-1, 0, 0),`
			`float3( 0, 1, 0), float3( 0,-1, 0),`
			`float3( 0, 0, 1), float3( 0, 0,-1)`
			`};`

			`// Face U/V tangent axes for quad expansion`
			`static const float3 faceU[6] = {`
			`float3(0, 1, 0), float3(0, 1, 0),`
			`float3(1, 0, 0), float3(1, 0, 0),`
			`float3(1, 0, 0), float3(1, 0, 0)`
			`};`

			`static const float3 faceV[6] = {`
			`float3(0, 0, 1), float3(0, 0, 1),`
			`float3(0, 0, 1), float3(0, 0, 1),`
			`float3(0, 1, 0), float3(0, 1, 0)`
			`};`

			`[RootSignature(VOXEL_ROOTSIG)]`
			`VSOutput main(uint vertexID : SV_VertexID)`
			`{`
			`VSOutput output;`

Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`// Determine quad index and chunk index based on rendering mode`
			`uint quadIndex;`
Phase 2.5: GPU meshing production pipeline + perf optimizations (80+ FPS) Replace CPU greedy mesher with GPU compute mesher as default rendering pipeline. Key optimizations identified via CPU profiling (ProfileAccum, 5s averages): - Fused regenerate+pack: parallel noise gen + memcpy in same jobsystem pass (6ms → 0ms) - VoxelData memcpy: sizeof(VoxelData)==2 enables direct memcpy instead of bit-shift loop (28ms → <1ms) - Dirty-skip: GPU dispatch/upload only when chunks change, not every frame - Animation: 2 fBm octaves + no caves in animation mode (54ms → 8ms) - Result: 80-110 FPS with 60Hz terrain animation, 700+ FPS static 2026-03-26 09:05:52 +01:00			`uint chunkIndex = 0;`
Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00
Phase 2.5: GPU meshing production pipeline + perf optimizations (80+ FPS) Replace CPU greedy mesher with GPU compute mesher as default rendering pipeline. Key optimizations identified via CPU profiling (ProfileAccum, 5s averages): - Fused regenerate+pack: parallel noise gen + memcpy in same jobsystem pass (6ms → 0ms) - VoxelData memcpy: sizeof(VoxelData)==2 enables direct memcpy instead of bit-shift loop (28ms → <1ms) - Dirty-skip: GPU dispatch/upload only when chunks change, not every frame - Animation: 2 fBm octaves + no caves in animation mode (54ms → 8ms) - Result: 80-110 FPS with 60Hz terrain animation, 700+ FPS static 2026-03-26 09:05:52 +01:00			`if (push.flags & 2) {`
			`// GPU mesh path: quads are in a flat buffer, chunk index is embedded`
			`// in each quad's flags field (bits [31:17] of hi word = 11-bit chunk index).`
			`// push.quadOffset = base offset into the GPU quad buffer.`
			`quadIndex = push.quadOffset + (vertexID / 6);`
			`} else if (push.flags & 1) {`
Phase 2.2: MDI rendering with CPU-filled indirect args Replace per-chunk DrawInstanced loop with a single DrawInstancedIndirectCount. CPU fills indirect args buffer with same frustum+backface cull logic as Phase 2.1. Key discoveries: - Wicked Engine command signature includes push constant (20-byte stride, not 16) - SV_VertexID does not reliably include startVertexLocation with ExecuteIndirect - Solution: pack chunkIndex\|(faceIndex<<16) in push constant, VS reconstructs quad offset from GPUChunkInfo lookup - No explicit DX12 barriers needed (implicit promotion from COMMON suffices) Also adds voxel_engine_spec.md and updates references from .docx to .md. 2026-03-25 22:07:22 +01:00			`// MDI path: push.chunkIndex is packed by ExecuteIndirect command signature:`
			`// low 16 bits = chunk index into chunkInfoBuffer`
			`// high 16 bits = face index (0-5)`
			`// SV_VertexID starts at 0 (startVertexLocation=0), so we compute the`
			`// global quad index from the GPUChunkInfo face offset.`
			`chunkIndex = push.chunkIndex & 0xFFFF;`
			`uint faceIdx = push.chunkIndex >> 16;`
			`GPUChunkInfo ci = chunkInfoBuffer[chunkIndex];`
			`uint faceOff = getFaceOffset(ci, faceIdx);`
			`quadIndex = ci.quadOffset + faceOff + (vertexID / 6);`
Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`} else {`
			`// CPU path: push constants provide explicit offsets`
			`quadIndex = push.quadOffset + (vertexID / 6);`
			`chunkIndex = push.chunkIndex;`
			`}`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00
Phase 2 complete: per-face-group backface culling, frustum planes, GPU cull infrastructure - VS supports dual mode: CPU path (push constants) and MDI path (binary search) - CPU render loop now does per-face-group draws with backface culling (6 draws/chunk max) - Frustum planes extracted and populated in constant buffer for GPU cull shader - GPU cull + MDI path fully implemented but disabled (barrier/state debugging needed) - GPU timestamp query infrastructure with readback for cull/draw timing - HUD shows rendering mode (GPU cull vs CPU fallback) 2026-03-25 14:50:55 +01:00			`uint cornerIndex = vertexID % 6;`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00
			`PackedQuad packed = quadBuffer[quadIndex];`
			`uint px, py, pz, w, h, face, matID, ao;`
			`unpackQuad(packed.data, px, py, pz, w, h, face, matID, ao);`

Phase 2.5: GPU meshing production pipeline + perf optimizations (80+ FPS) Replace CPU greedy mesher with GPU compute mesher as default rendering pipeline. Key optimizations identified via CPU profiling (ProfileAccum, 5s averages): - Fused regenerate+pack: parallel noise gen + memcpy in same jobsystem pass (6ms → 0ms) - VoxelData memcpy: sizeof(VoxelData)==2 enables direct memcpy instead of bit-shift loop (28ms → <1ms) - Dirty-skip: GPU dispatch/upload only when chunks change, not every frame - Animation: 2 fBm octaves + no caves in animation mode (54ms → 8ms) - Result: 80-110 FPS with 60Hz terrain animation, 700+ FPS static 2026-03-26 09:05:52 +01:00			`// GPU mesh path: extract chunk index from quad flags field (bits [31:17] of hi word)`
			`if (push.flags & 2) {`
			`chunkIndex = (packed.data.y >> 17) & 0x7FF;`
			`}`

			`GPUChunkInfo info = chunkInfoBuffer[chunkIndex];`

Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`// Corner offsets for 2 triangles (6 vertices per quad)`
			`// cross(U,V) matches N for faces: +X(0), -Y(3), +Z(4) -> CW corners`
			`// cross(U,V) opposes N for faces: -X(1), +Y(2), -Z(5) -> CCW corners`
			`static const float2 cornersCW[6] = {`
			`float2(0, 0), float2(0, 1), float2(1, 0),`
			`float2(1, 0), float2(0, 1), float2(1, 1)`
			`};`
			`static const float2 cornersCCW[6] = {`
			`float2(0, 0), float2(1, 0), float2(0, 1),`
			`float2(0, 1), float2(1, 0), float2(1, 1)`
			`};`
			`bool useCCW = (face == 1 \|\| face == 2 \|\| face == 5);`
			`float2 corner = useCCW ? cornersCCW[cornerIndex] : cornersCW[cornerIndex];`

			`float3 basePos = float3((float)px, (float)py, (float)pz);`
			`float3 normal = faceNormals[face];`
			`float3 uAxis = faceU[face];`
			`float3 vAxis = faceV[face];`

			`// Positive faces: offset by 1 in normal direction`
			`float3 faceOffset = (face % 2 == 0) ? normal : float3(0, 0, 0);`

			`// Expand quad`
			`float3 localPos = basePos + faceOffset + uAxis * corner.x * (float)w + vAxis * corner.y * (float)h;`
			`float3 worldPos = localPos + info.worldPos.xyz;`

			`output.position = mul(viewProjection, float4(worldPos, 1.0));`
			`output.worldPos = worldPos;`
			`output.normal = normal;`
			`output.uv = corner * float2((float)w, (float)h) * textureTiling;`
			`output.materialID = matID;`
			`output.faceID = face;`
			`output.debugFlag = info.worldPos.w;`

			`// AO: 4 corners x 2 bits`
			`uint aoCorner = min(cornerIndex, 3u);`
			`float aoValue = (float)((ao >> (aoCorner * 2u)) & 3u) / 3.0;`
			`output.ao = 1.0 - aoValue * 0.4;`

			`return output;`
			`}`