87 lines
3.2 KiB
HLSL
87 lines
3.2 KiB
HLSL
|
|
// BVLE Voxels - GPU Compute Mesher (Binary Face Culling only)
|
||
|
|
// 1 thread per voxel: checks 6 neighbors, emits 1x1 PackedQuad per visible face.
|
||
|
|
// No greedy merge — this is the simple GPU baseline for benchmark comparison.
|
||
|
|
|
||
|
|
#include "voxelCommon.hlsli"
|
||
|
|
|
||
|
|
// Push constants for one meshing dispatch: which chunk to mesh and where its
// input voxels and output quads live. Total size is 48 bytes (12 x uint32).
struct MeshPush {
    uint chunkIndex;        // which chunk to mesh (not read by the code in this file)
    uint voxelBufferOffset; // base of this chunk in voxelData, in uint elements (one uint = 2 voxels)
    uint quadBufferOffset;  // base of this chunk's output in outputQuads, in quads
    uint maxOutputQuads;    // safety cap: slots at or beyond this value are never written

    // Explicit padding up to 48 bytes. Two uint4s are used instead of `uint pad[8]`
    // because HLSL constant-buffer packing starts every array element on a new
    // 16-byte register, which would stretch an 8-element uint array far past
    // 32 bytes on the D3D path. uint4 members occupy exactly 16 bytes each under
    // D3D constant-buffer packing as well as std140/std430 rules.
    uint4 pad0;
    uint4 pad1;
};

// Bound as a Vulkan push-constant block; register b999 is the slot used otherwise.
[[vk::push_constant]] ConstantBuffer<MeshPush> push : register(b999);
|
||
|
|
|
||
|
|
// Input voxels for the chunk being meshed (32^3 = 32768 voxels).
// Two 16-bit voxels are packed into every uint element:
//   bits  0-15 -> voxel A (even flat index)
//   bits 16-31 -> voxel B (odd flat index)
StructuredBuffer<uint> voxelData : register(t0);

// Output quads. One uint2 (8 bytes) per PackedQuad; slots are claimed by
// atomically incrementing the uint stored at byte offset 0 of quadCounter.
RWStructuredBuffer<uint2> outputQuads : register(u0);
RWByteAddressBuffer quadCounter : register(u1);
|
||
|
|
|
||
|
|
// Chunk dimensions
static const uint CSIZE = 32;                    // voxels along one chunk edge
static const uint CVOL  = CSIZE * CSIZE * CSIZE; // voxels per chunk (32768)
|
||
|
|
|
||
|
|
// Fetch one 16-bit voxel from the packed voxel buffer.
//   flatIndex: index of the voxel relative to push.voxelBufferOffset
//              (callers in this file pass x + y*CSIZE + z*CSIZE*CSIZE).
// Returns the voxel value in the low 16 bits; the upper 16 bits are zero.
uint readVoxel(uint flatIndex) {
    // Each uint element stores two voxels, so the element index is half the voxel index.
    const uint word = voxelData[push.voxelBufferOffset + flatIndex / 2];

    // Even voxel indices sit in the low half of the word, odd ones in the high half.
    const bool inHighHalf = (flatIndex % 2) != 0;
    const uint aligned = inHighHalf ? (word >> 16) : word;
    return aligned & 0xFFFF;
}
|
||
|
|
|
||
|
|
// Returns true when the cell next to `pos` in direction `dir` is empty.
// Cells outside the 32^3 chunk are reported as empty, so faces lying on the
// chunk boundary are always treated as visible.
// NOTE(review): emptiness is tested on the whole 16-bit voxel value (== 0), not
// only on the material-ID byte - confirm this matches the voxel format.
bool isNeighborAir(int3 pos, int3 dir) {
    const int3 cell = pos + dir;
    const int edge = (int)CSIZE;

    const bool insideChunk =
        cell.x >= 0 && cell.x < edge &&
        cell.y >= 0 && cell.y < edge &&
        cell.z >= 0 && cell.z < edge;
    if (!insideChunk)
        return true;

    // Same x-fastest, then y, then z flattening used by main().
    const uint3 c = (uint3)cell;
    const uint flatCell = c.x + CSIZE * (c.y + CSIZE * c.z);
    return readVoxel(flatCell) == 0;
}
|
||
|
|
|
||
|
|
// Encode one quad as a uint2 (intended to match the CPU-side PackedQuad format).
//   result.x: x @ bit 0 | y @ bit 6 | z @ bit 12 | w @ bit 18 | h @ bit 24 | face @ bit 30
//             (only the two low bits of `face` survive the 32-bit shift)
//   result.y: face bit 2 and above @ bit 0 | matID @ bit 1 | AO @ bit 9 | flags @ bit 17
// AO and flags are always written as zero here. Inputs are not masked, so callers
// must pass values that fit their fields.
uint2 packQuad(uint x, uint y, uint z, uint w, uint h, uint face, uint matID) {
    const uint ao = 0;    // ambient occlusion is not computed by this mesher
    const uint flags = 0; // no flags are set by this mesher

    uint2 packed;

    packed.x  = x;
    packed.x |= y << 6;
    packed.x |= z << 12;
    packed.x |= w << 18;
    packed.x |= h << 24;
    packed.x |= face << 30;

    packed.y  = face >> 2;
    packed.y |= matID << 1;
    packed.y |= ao << 9;
    packed.y |= flags << 17;

    return packed;
}
|
||
|
|
|
||
|
|
// Unit offsets to the six face neighbours, indexed by face ID:
//   0 = +X, 1 = -X, 2 = +Y, 3 = -Y, 4 = +Z, 5 = -Z
static const int3 faceDirs[6] = {
    int3( 1,  0,  0),
    int3(-1,  0,  0),
    int3( 0,  1,  0),
    int3( 0, -1,  0),
    int3( 0,  0,  1),
    int3( 0,  0, -1)
};
|
||
|
|
|
||
|
|
// Entry point: one thread per voxel. For every non-empty voxel, each of the six
// faces whose neighbour is empty (or outside the chunk) is emitted as a 1x1 quad.
//
// Output slots are claimed one face at a time through the atomic counter at byte
// offset 0 of quadCounter. The counter is incremented before the cap check, so
// after an overflow its value can exceed push.maxOutputQuads.
// NOTE(review): presumably the consumer clamps the counter to maxOutputQuads - verify.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(8, 8, 8)] // 512 threads per group; 64 such groups cover 32^3 voxels
void main(uint3 DTid : SV_DispatchThreadID)
{
    // Threads that fall outside the chunk have no voxel to process.
    if (DTid.x >= CSIZE || DTid.y >= CSIZE || DTid.z >= CSIZE)
        return;

    const uint cell = readVoxel(DTid.x + CSIZE * (DTid.y + CSIZE * DTid.z));
    if (cell == 0)
        return; // empty voxel: no faces

    const uint material = cell >> 8; // upper 8 bits of the 16-bit voxel
    const int3 here = (int3)DTid;

    [unroll]
    for (uint face = 0; face < 6; ++face) {
        if (isNeighborAir(here, faceDirs[face])) {
            // Claim the next free output slot.
            uint claimed;
            quadCounter.InterlockedAdd(0, 1, claimed);

            // Output is full. The counter only grows, so later faces of this
            // thread could not get a valid slot either.
            if (claimed >= push.maxOutputQuads)
                return;

            outputQuads[push.quadBufferOffset + claimed] =
                packQuad(DTid.x, DTid.y, DTid.z, 1, 1, face, material);
        }
    }
}
|