bvle-voxels/shaders/voxelMeshCS.hlsl

// BVLE Voxels - GPU Compute Mesher (Binary Face Culling only)
// 1 thread per voxel: checks 6 neighbors, emits 1x1 PackedQuad per visible face.
// No greedy merge — this is the simple GPU baseline for benchmark comparison.

#include "voxelCommon.hlsli"

// Push constants for one mesher dispatch: identifies the chunk and tells the
// shader where to read voxels from and where to write quads to.
struct MeshPush {
    uint chunkIndex;          // which chunk to mesh (not read anywhere in the visible shader code)
    uint voxelBufferOffset;   // added to the pair index in readVoxel(): offset in uint elements of voxelData (2 voxels each)
    uint quadBufferOffset;    // added to the reserved slot in main(): offset in quads (uint2 elements) into outputQuads
    uint maxOutputQuads;      // safety cap: a reserved slot >= this value is not written
    uint pad[8];              // pad to 48 bytes (12 x uint32): 4 fields above + 8 here
                              // NOTE(review): the 48-byte figure assumes tightly packed array elements;
                              // confirm against the layout rules in effect for each backend.
};
// Bound as a Vulkan push-constant block; register b999 is the slot used on the register-based path.
[[vk::push_constant]] ConstantBuffer<MeshPush> push : register(b999);

// Input: voxel data for one chunk (32^3 = 32768 voxels, packed as uint16 pairs in uint)
// Each uint holds 2 voxels: low 16 bits = voxel A (even flat index),
// high 16 bits = voxel B (odd flat index).
// Reads are offset by push.voxelBufferOffset, so the chunk does not have to
// start at element 0 of this buffer.
StructuredBuffer<uint> voxelData : register(t0);

// Output: packed quads, appended through a separate atomic counter.
// Slots are reserved by atomically incrementing the uint at byte offset 0 of
// quadCounter; quads land at outputQuads[push.quadBufferOffset + slot].
// The counter is incremented before the maxOutputQuads check, so it can end
// up larger than the number of quads actually written.
// NOTE(review): nothing in this shader resets the counter — presumably it is
// cleared before the dispatch; confirm on the CPU side.
RWStructuredBuffer<uint2> outputQuads : register(u0);  // uint2 = 8 bytes = PackedQuad
RWByteAddressBuffer quadCounter : register(u1);        // atomic counter (uint at byte offset 0)

// Chunk dimensions.
// CSIZE is the chunk edge length in voxels; it is used for bounds checks and
// for flattening (x, y, z) into a linear index (x + y*CSIZE + z*CSIZE*CSIZE).
static const uint CSIZE = 32;
// Total voxels per chunk. Not referenced by the shader code visible in this file.
static const uint CVOL = CSIZE * CSIZE * CSIZE; // 32768

// Fetch the 16-bit voxel stored at `flatIndex` within the current chunk.
// Two voxels share one uint: even indices live in the low half, odd indices
// in the high half. The element index is biased by push.voxelBufferOffset.
// Returns the voxel value in the low 16 bits of the result.
uint readVoxel(uint flatIndex) {
    const uint packedPair = voxelData[push.voxelBufferOffset + (flatIndex >> 1)];
    const bool wantsHighHalf = (flatIndex & 1) != 0;
    const uint aligned = wantsHighHalf ? (packedPair >> 16) : packedPair;
    return aligned & 0xFFFF;
}

// Returns true when the cell adjacent to `pos` in direction `dir` is empty.
// A neighbor outside the [0, CSIZE) cube on any axis counts as empty, so
// faces on the chunk boundary are always reported as visible.
// Inside the chunk, a neighbor is empty when its stored 16-bit value is 0.
bool isNeighborAir(int3 pos, int3 dir) {
    const int3 neighbor = pos + dir;

    const bool insideChunk = all(neighbor >= 0) && all(neighbor < (int3)CSIZE);
    if (!insideChunk)
        return true;

    // Flatten as x + CSIZE * (y + CSIZE * z).
    const uint3 cell = (uint3)neighbor;
    const uint linear = cell.x + CSIZE * (cell.y + CSIZE * cell.z);
    return readVoxel(linear) == 0;
}

// Encode one quad as two 32-bit words (intended to mirror the CPU-side
// PackedQuad layout, which is not visible in this file).
//   result.x: x at bit 0, y at bit 6, z at bit 12, w at bit 18, h at bit 24,
//             low two bits of face at bits 30-31
//   result.y: bit 2 of face at bit 0, matID starting at bit 1;
//             the AO (bit 9) and flags (bit 17) fields are left at zero
// Inputs are not masked: a value wider than its slot spills into the next field.
uint2 packQuad(uint x, uint y, uint z, uint w, uint h, uint face, uint matID) {
    uint2 packed;

    packed.x  = x;
    packed.x |= y << 6;
    packed.x |= z << 12;
    packed.x |= w << 18;
    packed.x |= h << 24;
    packed.x |= face << 30;   // only the two low face bits survive the shift

    packed.y  = face >> 2;    // remaining face bit
    packed.y |= matID << 1;

    return packed;
}

// Unit offset to the neighboring cell for each face index (0..5).
// The index into this table is the `face` value passed to packQuad().
static const int3 faceDirs[6] = {
    int3( 1,  0,  0),   // 0: +X
    int3(-1,  0,  0),   // 1: -X
    int3( 0,  1,  0),   // 2: +Y
    int3( 0, -1,  0),   // 3: -Y
    int3( 0,  0,  1),   // 4: +Z
    int3( 0,  0, -1)    // 5: -Z
};

// Entry point: one thread per voxel.
// Each thread handling a non-empty voxel tests its six neighbors and appends
// one 1x1 quad for every face whose neighbor is empty.
// Slots are reserved one at a time via an atomic add on quadCounter; the
// counter is incremented before the cap is checked, so after an overflow it
// may exceed push.maxOutputQuads.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(8, 8, 8)]  // 512 threads per group; 64 such groups cover 32^3 voxels
void main(uint3 DTid : SV_DispatchThreadID)
{
    // Threads outside the chunk have nothing to do.
    const bool insideChunk = all(DTid < CSIZE);
    if (!insideChunk) return;

    const uint self = readVoxel(DTid.x + CSIZE * (DTid.y + CSIZE * DTid.z));
    if (self == 0) return; // empty voxel: no faces

    const uint material = self >> 8; // upper 8 bits of the 16-bit voxel
    const int3 cell = (int3)DTid;

    [unroll]
    for (uint face = 0; face < 6; face++) {
        if (isNeighborAir(cell, faceDirs[face])) {
            // Reserve the next output slot.
            uint reserved;
            quadCounter.InterlockedAdd(0, 1, reserved);

            // Output is full: stop emitting from this thread altogether.
            if (reserved >= push.maxOutputQuads) return;

            const uint2 quad = packQuad(DTid.x, DTid.y, DTid.z, 1, 1, face, material);
            outputQuads[push.quadBufferOffset + reserved] = quad;
        }
    }
}
Phase 2: GPU-driven voxel rendering pipeline

Mega-buffer architecture replacing per-chunk GPU buffers:
- Single StructuredBuffer<PackedQuad> for all chunks (2M quads, 16 MB)
- StructuredBuffer<GPUChunkInfo> with per-chunk metadata (position, quad offsets, face groups)
- VS reads chunk info via push constants (b999) for driver-safe chunk indexing
- CPU frustum culling with wi::primitive::Frustum + AABB per chunk
- Quads sorted by face direction in greedy mesher (faceOffsets/faceCounts)
- GPU frustum + backface cull compute shader (voxelCullCS.hlsl)
- GPU binary mesher compute shader baseline (voxelMeshCS.hlsl)
- Indirect draw buffers and timestamp query infrastructure
- README with build instructions and project architecture

2026-03-25 14:24:05 +01:00
			`// BVLE Voxels - GPU Compute Mesher (Binary Face Culling only)`
			`// 1 thread per voxel: checks 6 neighbors, emits 1x1 PackedQuad per visible face.`
			`// No greedy merge — this is the simple GPU baseline for benchmark comparison.`

			`#include "voxelCommon.hlsli"`

			`// Push constants: chunk index + output offset`
			`struct MeshPush {`
			`uint chunkIndex; // which chunk to mesh`
			`uint voxelBufferOffset; // offset into the voxel data buffer (in uint16 pairs)`
			`uint quadBufferOffset; // offset into the output quad buffer (in quads)`
			`uint maxOutputQuads; // safety cap on output`
			`uint pad[8]; // pad to 48 bytes (12 x uint32)`
			`};`
			`[[vk::push_constant]] ConstantBuffer<MeshPush> push : register(b999);`

			`// Input: voxel data for one chunk (32^3 = 32768 voxels, packed as uint16 pairs in uint)`
			`// Each uint holds 2 voxels: low 16 bits = voxel A, high 16 bits = voxel B`
			`StructuredBuffer<uint> voxelData : register(t0);`

			`// Output: packed quads (append buffer with atomic counter)`
			`RWStructuredBuffer<uint2> outputQuads : register(u0); // uint2 = 8 bytes = PackedQuad`
			`RWByteAddressBuffer quadCounter : register(u1); // atomic counter`

			`// Constants`
			`static const uint CSIZE = 32;`
			`static const uint CVOL = CSIZE * CSIZE * CSIZE; // 32768`

			`// Read a single voxel (16-bit) from the packed buffer`
			`uint readVoxel(uint flatIndex) {`
			`uint pairIndex = flatIndex >> 1; // which uint (2 voxels per uint)`
			`uint shift = (flatIndex & 1) * 16; // 0 or 16`
			`return (voxelData[push.voxelBufferOffset + pairIndex] >> shift) & 0xFFFF;`
			`}`

			`// Check if neighbor is air (handles out-of-bounds as air for chunk boundaries)`
			`bool isNeighborAir(int3 pos, int3 dir) {`
			`int3 n = pos + dir;`
			`// Out-of-chunk = treat as air (boundary faces always visible)`
			`if (any(n < 0) \|\| any(n >= (int3)CSIZE))`
			`return true;`
			`uint flatN = (uint)n.x + (uint)n.y * CSIZE + (uint)n.z * CSIZE * CSIZE;`
			`return readVoxel(flatN) == 0; // materialID 0 = air`
			`}`

			`// Pack a quad into uint2 (matches CPU PackedQuad format)`
			`uint2 packQuad(uint x, uint y, uint z, uint w, uint h, uint face, uint matID) {`
			`uint lo = x \| (y << 6) \| (z << 12) \| (w << 18) \| (h << 24) \| (face << 30);`
			`uint hi = (face >> 2) \| (matID << 1) \| (0 << 9) \| (0 << 17); // AO=0, flags=0`
			`return uint2(lo, hi);`
			`}`

			`// Face directions`
			`static const int3 faceDirs[6] = {`
			`int3( 1, 0, 0), int3(-1, 0, 0),`
			`int3( 0, 1, 0), int3( 0,-1, 0),`
			`int3( 0, 0, 1), int3( 0, 0,-1)`
			`};`

			`[RootSignature(VOXEL_ROOTSIG)]`
			`[numthreads(8, 8, 8)] // 512 threads = covers 32^3 with 64 groups of 512`
			`void main(uint3 DTid : SV_DispatchThreadID)`
			`{`
			`if (any(DTid >= CSIZE)) return;`

			`uint flatIdx = DTid.x + DTid.y * CSIZE + DTid.z * CSIZE * CSIZE;`
			`uint voxel = readVoxel(flatIdx);`
			`if (voxel == 0) return; // air voxel, nothing to emit`

			`uint matID = voxel >> 8; // high 8 bits = material ID`

			`// Check each face direction`
			`[unroll]`
			`for (uint f = 0; f < 6; f++) {`
			`if (!isNeighborAir((int3)DTid, faceDirs[f])) continue;`

			`// Emit a 1x1 quad`
			`uint slot;`
			`quadCounter.InterlockedAdd(0, 1, slot);`
			`if (slot >= push.maxOutputQuads) return; // overflow guard`

			`outputQuads[push.quadBufferOffset + slot] = packQuad(`
			`DTid.x, DTid.y, DTid.z, 1, 1, f, matID`
			`);`
			`}`
			`}`