bvle-voxels/shaders/voxelMeshCS.hlsl

// BVLE Voxels - GPU Compute Mesher (Binary Face Culling only)
// 1 thread per voxel: checks 6 neighbors, emits 1x1 PackedQuad per visible face.
// No greedy merge — this is the simple GPU baseline.
// Phase 3: blend info is computed per-pixel in the PS (not pre-encoded here).

#include "voxelCommon.hlsli"

// Push constants for one mesher dispatch: which chunk to mesh, where its voxels
// start in the input buffer, and where its quads go in the output buffer.
// Bound at b999 and tagged as a Vulkan push constant block.
struct MeshPush {
    uint chunkIndex;          // chunk being meshed; packed into every emitted quad (packQuad keeps only the low 11 bits)
    uint voxelBufferOffset;   // start of this chunk in voxelData, counted in 32-bit words (each word = 2 x 16-bit voxels)
    uint quadBufferOffset;    // start of this chunk's region in outputQuads, counted in quads (uint2 elements)
    uint maxOutputQuads;      // capacity of that region: slots >= this value are never written
    uint pad[8];              // 4 fields + 8 pad = 12 uints (48 bytes when tightly packed).
                              // NOTE(review): D3D constant-buffer packing aligns each array element to 16 bytes,
                              // so the 48-byte figure assumes push-constant style tight packing; only this
                              // unused padding would be affected — confirm against the host-side struct.
};
[[vk::push_constant]] ConstantBuffer<MeshPush> push : register(b999);

// Input: packed voxel data. The chunk being meshed (32^3 = 32768 voxels) starts at
// push.voxelBufferOffset; see readVoxel() for the addressing.
// Each uint holds 2 voxels: low 16 bits = even flat index, high 16 bits = odd flat index.
StructuredBuffer<uint> voxelData : register(t0);

// Output: packed quads, appended via an atomic counter.
// Slots handed out by quadCounter are relative to push.quadBufferOffset.
RWStructuredBuffer<uint2> outputQuads : register(u0);  // uint2 = 8 bytes = PackedQuad
RWByteAddressBuffer quadCounter : register(u1);        // quad count lives at byte offset 0; incremented atomically

// Chunk dimensions
static const uint CSIZE = 32;                   // voxels along each chunk axis
static const uint CVOL = CSIZE * CSIZE * CSIZE; // 32768 voxels per chunk

// Fetch one 16-bit voxel from the packed input buffer.
// flatIndex is the voxel's linear index within the chunk; two consecutive voxels
// share one 32-bit word (even index -> low half, odd index -> high half).
// The word index is offset by push.voxelBufferOffset.
uint readVoxel(uint flatIndex) {
    const uint word = voxelData[push.voxelBufferOffset + (flatIndex >> 1)];
    const bool upperHalf = (flatIndex & 1) != 0;
    return (upperHalf ? (word >> 16) : word) & 0xFFFF;
}

// Returns true when the cell at pos + dir should be treated as empty.
// Cells outside the 0..CSIZE-1 range on any axis count as air, so faces on the
// chunk boundary are always reported as visible. Inside the chunk, a cell is
// air when its whole 16-bit voxel value is 0.
bool isNeighborAir(int3 pos, int3 dir) {
    const int3 cell = pos + dir;

    const bool insideChunk = all(cell >= 0) && all(cell < (int3)CSIZE);
    if (!insideChunk)
        return true;

    // Linearize as x + CSIZE * (y + CSIZE * z)
    const uint3 c = (uint3)cell;
    const uint flatCell = c.x + CSIZE * (c.y + CSIZE * c.z);
    return readVoxel(flatCell) == 0;
}

// Pack one quad into a uint2 (intended to match the CPU-side PackedQuad format).
//   .x (lo): x at bit 0, y at bit 6, z at bit 12, w at bit 18, h at bit 24,
//            low two bits of face at bits 30-31
//   .y (hi): bit 2 of face at bit 0, matID from bit 1 upward,
//            chunkIdx (low 11 bits only) at bits [27:17] for VS lookup
// Only chunkIdx is masked; every other field is assumed to already fit its slot.
uint2 packQuad(uint x, uint y, uint z, uint w, uint h, uint face, uint matID, uint chunkIdx) {
    uint2 packed;

    packed.x  = x;
    packed.x |= y << 6;
    packed.x |= z << 12;
    packed.x |= w << 18;
    packed.x |= h << 24;
    packed.x |= face << 30;   // bits above 31 fall off; face bit 2 goes into .y

    packed.y  = face >> 2;
    packed.y |= matID << 1;
    packed.y |= (chunkIdx & 0x7FF) << 17;

    return packed;
}

// Neighbor offset for each face id (the same id is passed to packQuad as `face`):
//   0 = +X, 1 = -X, 2 = +Y, 3 = -Y, 4 = +Z, 5 = -Z
static const int3 faceDirs[6] = {
    int3( 1, 0, 0), int3(-1, 0, 0),
    int3( 0, 1, 0), int3( 0,-1, 0),
    int3( 0, 0, 1), int3( 0, 0,-1)
};

// Entry point: one thread per voxel.
// Each non-air voxel tests its 6 neighbors and appends one 1x1 quad per face
// whose neighbor is air (or lies outside the chunk).
//
// Output slots are reserved with a single atomic add per voxel (all visible
// faces at once) instead of one atomic per face, which cuts traffic on the
// shared counter. Quad order in the output buffer is unspecified.
//
// Overflow: quads whose slot is >= push.maxOutputQuads are dropped, but the
// counter is not rolled back, so its final value can exceed maxOutputQuads.
// Anything consuming the counter should clamp it to maxOutputQuads.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(8, 8, 8)]  // 512 threads = covers 32^3 with 64 groups of 512
void main(uint3 DTid : SV_DispatchThreadID)
{
    // Threads beyond the chunk extent have no voxel to process.
    if (any(DTid >= CSIZE)) return;

    uint flatIdx = DTid.x + DTid.y * CSIZE + DTid.z * CSIZE * CSIZE;
    uint voxel = readVoxel(flatIdx);
    if (voxel == 0) return; // air voxel, nothing to emit

    uint matID = voxel >> 8; // upper 8 bits of the 16-bit voxel = material ID

    // Pass 1: record which faces are exposed (bit f set = face f visible).
    uint visibleMask = 0;
    [unroll]
    for (uint f = 0; f < 6; f++) {
        if (isNeighborAir((int3)DTid, faceDirs[f]))
            visibleMask |= (1u << f);
    }
    if (visibleMask == 0) return; // fully enclosed voxel

    // Reserve a contiguous run of slots for every visible face in one atomic.
    uint slot;
    quadCounter.InterlockedAdd(0, countbits(visibleMask), slot);

    // Pass 2: emit a 1x1 quad per visible face into the reserved slots.
    [unroll]
    for (uint face = 0; face < 6; face++) {
        if ((visibleMask & (1u << face)) == 0) continue;

        // Slots only increase from here, so the first overflow ends this thread.
        if (slot >= push.maxOutputQuads) return; // overflow guard

        outputQuads[push.quadBufferOffset + slot] = packQuad(
            DTid.x, DTid.y, DTid.z, 1, 1, face, matID, push.chunkIndex
        );
        slot++;
    }
}
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`// BVLE Voxels - GPU Compute Mesher (Binary Face Culling only)`
			`// 1 thread per voxel: checks 6 neighbors, emits 1x1 PackedQuad per visible face.`
Phase 3: PS-based texture blending with winner-takes-all heightmap Replace pre-encoded quad blend data (v1) with per-pixel voxel data lookups in the pixel shader. The PS reads voxelDataBuffer (SRV t3) to find neighbor materials dynamically, enabling 2 independent blend axes, stair-priority neighbor detection, and winner-takes-all heightmap-driven transitions. Key design decisions validated through 6 iterations (see blending_experiments.md): - Winner-takes-all: material with highest heightmap score wins 100% (sharp but organic transitions, not smooth gradient) - Symmetric bias: bias = 0.5 - weight ensures equal chance at border - Subtractive corner attenuation (param=0.80): xAdj = xEdge - saturate(yEdge - 0.80) reduces blend at corners naturally - Blend zone = 0.25 voxels from each edge (50% of face) - Debug mode (F4) visualizes blend zones as colors 2026-03-26 12:14:08 +01:00			`// No greedy merge — this is the simple GPU baseline.`
			`// Phase 3: blend info is computed per-pixel in the PS (not pre-encoded here).`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00
			`#include "voxelCommon.hlsli"`

			`// Push constants: chunk index + output offset`
			`struct MeshPush {`
			`uint chunkIndex; // which chunk to mesh`
			`uint voxelBufferOffset; // offset into the voxel data buffer (in uint16 pairs)`
			`uint quadBufferOffset; // offset into the output quad buffer (in quads)`
			`uint maxOutputQuads; // safety cap on output`
			`uint pad[8]; // pad to 48 bytes (12 x uint32)`
			`};`
			`[[vk::push_constant]] ConstantBuffer<MeshPush> push : register(b999);`

			`// Input: voxel data for one chunk (32^3 = 32768 voxels, packed as uint16 pairs in uint)`
			`// Each uint holds 2 voxels: low 16 bits = voxel A, high 16 bits = voxel B`
			`StructuredBuffer<uint> voxelData : register(t0);`

			`// Output: packed quads (append buffer with atomic counter)`
			`RWStructuredBuffer<uint2> outputQuads : register(u0); // uint2 = 8 bytes = PackedQuad`
			`RWByteAddressBuffer quadCounter : register(u1); // atomic counter`

			`// Constants`
			`static const uint CSIZE = 32;`
			`static const uint CVOL = CSIZE * CSIZE * CSIZE; // 32768`

			`// Read a single voxel (16-bit) from the packed buffer`
			`uint readVoxel(uint flatIndex) {`
			`uint pairIndex = flatIndex >> 1; // which uint (2 voxels per uint)`
			`uint shift = (flatIndex & 1) * 16; // 0 or 16`
			`return (voxelData[push.voxelBufferOffset + pairIndex] >> shift) & 0xFFFF;`
			`}`

			`// Check if neighbor is air (handles out-of-bounds as air for chunk boundaries)`
			`bool isNeighborAir(int3 pos, int3 dir) {`
			`int3 n = pos + dir;`
			`// Out-of-chunk = treat as air (boundary faces always visible)`
			`if (any(n < 0) \|\| any(n >= (int3)CSIZE))`
			`return true;`
			`uint flatN = (uint)n.x + (uint)n.y * CSIZE + (uint)n.z * CSIZE * CSIZE;`
			`return readVoxel(flatN) == 0; // materialID 0 = air`
			`}`

			`// Pack a quad into uint2 (matches CPU PackedQuad format)`
Phase 3: PS-based texture blending with winner-takes-all heightmap Replace pre-encoded quad blend data (v1) with per-pixel voxel data lookups in the pixel shader. The PS reads voxelDataBuffer (SRV t3) to find neighbor materials dynamically, enabling 2 independent blend axes, stair-priority neighbor detection, and winner-takes-all heightmap-driven transitions. Key design decisions validated through 6 iterations (see blending_experiments.md): - Winner-takes-all: material with highest heightmap score wins 100% (sharp but organic transitions, not smooth gradient) - Symmetric bias: bias = 0.5 - weight ensures equal chance at border - Subtractive corner attenuation (param=0.80): xAdj = xEdge - saturate(yEdge - 0.80) reduces blend at corners naturally - Blend zone = 0.25 voxels from each edge (50% of face) - Debug mode (F4) visualizes blend zones as colors 2026-03-26 12:14:08 +01:00			`// chunkIdx is stored in bits [27:17] of hi word for VS lookup`
Phase 2.5: GPU meshing production pipeline + perf optimizations (80+ FPS) Replace CPU greedy mesher with GPU compute mesher as default rendering pipeline. Key optimizations identified via CPU profiling (ProfileAccum, 5s averages): - Fused regenerate+pack: parallel noise gen + memcpy in same jobsystem pass (6ms → 0ms) - VoxelData memcpy: sizeof(VoxelData)==2 enables direct memcpy instead of bit-shift loop (28ms → <1ms) - Dirty-skip: GPU dispatch/upload only when chunks change, not every frame - Animation: 2 fBm octaves + no caves in animation mode (54ms → 8ms) - Result: 80-110 FPS with 60Hz terrain animation, 700+ FPS static 2026-03-26 09:05:52 +01:00			`uint2 packQuad(uint x, uint y, uint z, uint w, uint h, uint face, uint matID, uint chunkIdx) {`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`uint lo = x \| (y << 6) \| (z << 12) \| (w << 18) \| (h << 24) \| (face << 30);`
Phase 3: PS-based texture blending with winner-takes-all heightmap Replace pre-encoded quad blend data (v1) with per-pixel voxel data lookups in the pixel shader. The PS reads voxelDataBuffer (SRV t3) to find neighbor materials dynamically, enabling 2 independent blend axes, stair-priority neighbor detection, and winner-takes-all heightmap-driven transitions. Key design decisions validated through 6 iterations (see blending_experiments.md): - Winner-takes-all: material with highest heightmap score wins 100% (sharp but organic transitions, not smooth gradient) - Symmetric bias: bias = 0.5 - weight ensures equal chance at border - Subtractive corner attenuation (param=0.80): xAdj = xEdge - saturate(yEdge - 0.80) reduces blend at corners naturally - Blend zone = 0.25 voxels from each edge (50% of face) - Debug mode (F4) visualizes blend zones as colors 2026-03-26 12:14:08 +01:00			`uint hi = (face >> 2) \| (matID << 1) \| ((chunkIdx & 0x7FF) << 17);`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`return uint2(lo, hi);`
			`}`

			`// Face directions`
			`static const int3 faceDirs[6] = {`
			`int3( 1, 0, 0), int3(-1, 0, 0),`
			`int3( 0, 1, 0), int3( 0,-1, 0),`
			`int3( 0, 0, 1), int3( 0, 0,-1)`
			`};`

			`[RootSignature(VOXEL_ROOTSIG)]`
			`[numthreads(8, 8, 8)] // 512 threads = covers 32^3 with 64 groups of 512`
			`void main(uint3 DTid : SV_DispatchThreadID)`
			`{`
			`if (any(DTid >= CSIZE)) return;`

			`uint flatIdx = DTid.x + DTid.y * CSIZE + DTid.z * CSIZE * CSIZE;`
			`uint voxel = readVoxel(flatIdx);`
			`if (voxel == 0) return; // air voxel, nothing to emit`

			`uint matID = voxel >> 8; // high 8 bits = material ID`

			`// Check each face direction`
			`[unroll]`
			`for (uint f = 0; f < 6; f++) {`
			`if (!isNeighborAir((int3)DTid, faceDirs[f])) continue;`

			`// Emit a 1x1 quad`
			`uint slot;`
			`quadCounter.InterlockedAdd(0, 1, slot);`
			`if (slot >= push.maxOutputQuads) return; // overflow guard`

			`outputQuads[push.quadBufferOffset + slot] = packQuad(`
Phase 2.5: GPU meshing production pipeline + perf optimizations (80+ FPS) Replace CPU greedy mesher with GPU compute mesher as default rendering pipeline. Key optimizations identified via CPU profiling (ProfileAccum, 5s averages): - Fused regenerate+pack: parallel noise gen + memcpy in same jobsystem pass (6ms → 0ms) - VoxelData memcpy: sizeof(VoxelData)==2 enables direct memcpy instead of bit-shift loop (28ms → <1ms) - Dirty-skip: GPU dispatch/upload only when chunks change, not every frame - Animation: 2 fBm octaves + no caves in animation mode (54ms → 8ms) - Result: 80-110 FPS with 60Hz terrain animation, 700+ FPS static 2026-03-26 09:05:52 +01:00			`DTid.x, DTid.y, DTid.z, 1, 1, f, matID, push.chunkIndex`
Phase 2: GPU-driven voxel rendering pipeline Mega-buffer architecture replacing per-chunk GPU buffers: - Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB) - StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups) - VS reads chunk info via push constants (b999) for driver-safe chunk indexing - CPU frustum culling with wi::primitive::Frustum + AABB per chunk - Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts) - GPU frustum + backface cull compute shader (voxelCullCS.hlsl) - GPU binary mesher compute shader baseline (voxelMeshCS.hlsl) - Indirect draw buffers and timestamp query infrastructure - README with build instructions and project architecture 2026-03-25 14:24:05 +01:00			`);`
			`}`
			`}`