Replace CPU greedy mesher with GPU compute mesher as default rendering pipeline.

Key optimizations identified via CPU profiling (ProfileAccum, 5s averages):
- Fused regenerate+pack: parallel noise gen + memcpy in same jobsystem pass (6ms → 0ms)
- VoxelData memcpy: sizeof(VoxelData)==2 enables direct memcpy instead of bit-shift loop (28ms → <1ms)
- Dirty-skip: GPU dispatch/upload only when chunks change, not every frame
- Animation: 2 fBm octaves + no caves in animation mode (54ms → 8ms)
- Result: 80-110 FPS with 60Hz terrain animation, 700+ FPS static
173 lines
6 KiB
HLSL
173 lines
6 KiB
HLSL
// BVLE Voxels - Vertex Shader (Vertex Pulling from mega-buffer)
|
|
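// Phase 2: supports the CPU draw loop (explicit push constants), GPU MDI (chunk and
// face index packed into push.chunkIndex) and the GPU mesh path (chunk index
// embedded in each quad); the mode is selected by push.flags.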
|
|
|
|
#include "voxelCommon.hlsli"
|
|
|
|
// One quad packed into 64 bits; decoded by unpackQuad() and main().
//   data.x (low word):  px [5:0], py [11:6], pz [17:12], w [23:18], h [29:24],
//                       face bits 0-1 [31:30]
//   data.y (high word): face bit 2 [0], matID [8:1], ao [16:9],
//                       chunk index [27:17] (read only on the GPU mesh path)
struct PackedQuad
{
    uint2 data; // .x = low 32 bits, .y = high 32 bits (8 bytes total)
};
|
|
|
|
// Packed quads for all chunks; main() indexes this by global quad index.
StructuredBuffer<PackedQuad>   quadBuffer      : register(t0);
// Per-chunk info (quad range and world position); indexed by chunk index.
StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);
|
|
|
|
// Push constants: 12 x uint32 = 48 bytes.
// The vertex shader picks one of three paths from `flags`:
//   bit 1 set  -> GPU mesh path: quadOffset is the base offset into the quad
//                 buffer; the chunk index is read from each quad.
//   bit 0 set  -> MDI path: chunkIndex packs the chunk index (low 16 bits) and
//                 the face index (high 16 bits); quadOffset is not used.
//   neither    -> CPU path: chunkIndex and quadOffset are used as given.
struct VoxelPush
{
    uint chunkIndex; // chunk index (CPU path) or packed chunk/face (MDI path)
    uint quadOffset; // offset into the mega quad buffer, in quads
    uint flags;      // bit 0: MDI mode, bit 1: GPU mesh mode, 0: CPU mode

    // Padding: brings the block up to 12 dwords.
    uint pad0;
    uint pad1;
    uint pad2;
    uint pad3;
    uint pad4;
    uint pad5;
    uint pad6;
    uint pad7;
    uint pad8;
};

[[vk::push_constant]] ConstantBuffer<VoxelPush> push : register(b999);
|
|
|
|
// Vertex shader outputs. Member order and semantics are part of the interface
// to the next pipeline stage and must not be changed here.
struct VSOutput
{
    float4 position                  : SV_POSITION; // viewProjection * world position
    float3 worldPos                  : WORLDPOS;    // chunk-local position + chunk world offset
    float3 normal                    : NORMAL;      // axis-aligned face normal
    float2 uv                        : TEXCOORD0;   // corner * (w, h) * textureTiling
    nointerpolation uint  materialID : MATERIALID;  // 8-bit material id from the packed quad
    nointerpolation uint  faceID     : FACEID;      // face index (0..5: +X, -X, +Y, -Y, +Z, -Z)
    nointerpolation float debugFlag  : DEBUGFLAG;   // copied from GPUChunkInfo.worldPos.w
    float  ao                        : AO;          // 1.0 (unoccluded) down to 0.6
};
|
|
|
|
// Decode the fields of a 64-bit packed quad (raw.x = low word, raw.y = high word).
//   px, py, pz : 6 bits each, low word bits [5:0], [11:6], [17:12]
//   w, h       : 6 bits each, low word bits [23:18], [29:24]
//   face       : 3 bits, low word bits [31:30] plus high word bit [0] as bit 2
//   matID      : 8 bits, high word bits [8:1]
//   ao         : 8 bits, high word bits [16:9]
// High word bits above 16 are not decoded here.
void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,
                out uint w, out uint h, out uint face,
                out uint matID, out uint ao)
{
    const uint SIX_BITS   = 0x3Fu;
    const uint EIGHT_BITS = 0xFFu;

    const uint lowWord  = raw.x;
    const uint highWord = raw.y;

    // Position and extent are five consecutive 6-bit fields in the low word.
    const uint3 origin = (lowWord.xxx >> uint3(0u, 6u, 12u)) & SIX_BITS;
    const uint2 extent = (lowWord.xx >> uint2(18u, 24u)) & SIX_BITS;

    px = origin.x;
    py = origin.y;
    pz = origin.z;
    w  = extent.x;
    h  = extent.y;

    // The face index straddles the word boundary: 2 bits in lo, 1 bit in hi.
    const uint faceLowBits = (lowWord >> 30) & 0x3u;
    const uint faceHighBit = (highWord & 0x1u) << 2;
    face = faceLowBits | faceHighBit;

    matID = (highWord >> 1) & EIGHT_BITS;
    ao    = (highWord >> 9) & EIGHT_BITS;
}
|
|
|
|
// Binary search for the chunk that owns a given global quad index.
// Relies on chunkInfoBuffer entries being laid out in increasing quad order
// (chunks packed contiguously in the mega-buffer, sorted by chunk index).
// Returns the first index whose quad range ends after globalQuadIndex, or
// chunkCount when no such chunk exists.
// O(log2(chunkCount)): about 11 iterations for 2048 chunks.
uint findChunkIndex(uint globalQuadIndex) {
    // Half-open search window [lower, upper).
    uint lower = 0;
    uint upper = chunkCount;

    [loop]
    while (lower < upper) {
        const uint middle = (lower + upper) / 2u;
        const GPUChunkInfo entry = chunkInfoBuffer[middle];
        const uint rangeEnd = entry.quadOffset + entry.quadCount;

        if (globalQuadIndex < rangeEnd) {
            // Quad lies in this chunk or an earlier one.
            upper = middle;
        } else {
            // This chunk ends at or before the quad; look further right.
            lower = middle + 1;
        }
    }

    return lower;
}
|
|
|
|
// Outward unit normal for each face index.
static const float3 faceNormals[6] = {
    float3( 1,  0,  0), // 0: +X
    float3(-1,  0,  0), // 1: -X
    float3( 0,  1,  0), // 2: +Y
    float3( 0, -1,  0), // 3: -Y
    float3( 0,  0,  1), // 4: +Z
    float3( 0,  0, -1)  // 5: -Z
};
|
|
|
|
// Per-face U tangent axis: the direction a quad's width (w) extends along.
static const float3 faceU[6] = {
    float3(0, 1, 0), // 0: +X -> U = Y
    float3(0, 1, 0), // 1: -X -> U = Y
    float3(1, 0, 0), // 2: +Y -> U = X
    float3(1, 0, 0), // 3: -Y -> U = X
    float3(1, 0, 0), // 4: +Z -> U = X
    float3(1, 0, 0)  // 5: -Z -> U = X
};
|
|
|
|
// Per-face V tangent axis: the direction a quad's height (h) extends along.
static const float3 faceV[6] = {
    float3(0, 0, 1), // 0: +X -> V = Z
    float3(0, 0, 1), // 1: -X -> V = Z
    float3(0, 0, 1), // 2: +Y -> V = Z
    float3(0, 0, 1), // 3: -Y -> V = Z
    float3(0, 1, 0), // 4: +Z -> V = Y
    float3(0, 1, 0)  // 5: -Z -> V = Y
};
|
|
|
|
// Vertex shader entry point. There is no vertex buffer: every 6 consecutive
// SV_VertexID values form one quad (two triangles) that is pulled from
// quadBuffer and expanded here.
//
// The source of the quad index and chunk index depends on push.flags:
//   bit 1 set -> GPU mesh path: quadIndex = push.quadOffset + vertexID / 6 and
//                the chunk index is read from bits [27:17] of the quad's high
//                word (11 bits). Takes precedence over bit 0.
//   bit 0 set -> MDI path: push.chunkIndex packs the chunk index (low 16 bits)
//                and face index (high 16 bits); the quad index is the chunk's
//                quadOffset plus getFaceOffset() plus vertexID / 6.
//   neither   -> CPU path: push.quadOffset and push.chunkIndex are explicit.
//
// Returns the clip-space position plus world position, normal, tiled UV,
// material id, face id, per-chunk debug flag and an AO factor in [0.6, 1.0].
[RootSignature(VOXEL_ROOTSIG)]
VSOutput main(uint vertexID : SV_VertexID)
{
    VSOutput output;

    const uint FLAG_MDI      = 1u;
    const uint FLAG_GPU_MESH = 2u;
    const bool gpuMeshPath = (push.flags & FLAG_GPU_MESH) != 0u;
    const bool mdiPath     = !gpuMeshPath && ((push.flags & FLAG_MDI) != 0u);

    const uint quadInDraw  = vertexID / 6u; // which quad of this draw
    const uint cornerIndex = vertexID % 6u; // which of the quad's 6 vertices

    // Determine quad index and chunk index based on rendering mode.
    uint quadIndex;
    uint chunkIndex = 0;

    if (gpuMeshPath) {
        // GPU mesh path: quads live in a flat buffer; the chunk index is
        // taken from the quad itself once it has been loaded (below).
        quadIndex = push.quadOffset + quadInDraw;
    } else if (mdiPath) {
        // MDI path: push.chunkIndex is packed by the ExecuteIndirect command
        // signature (low 16 bits = chunk index, high 16 bits = face index).
        // SV_VertexID starts at 0 (startVertexLocation = 0), so the global
        // quad index is rebuilt from the chunk's base and per-face offset.
        chunkIndex = push.chunkIndex & 0xFFFF;
        uint faceIdx = push.chunkIndex >> 16;
        GPUChunkInfo ci = chunkInfoBuffer[chunkIndex];
        uint faceOff = getFaceOffset(ci, faceIdx);
        quadIndex = ci.quadOffset + faceOff + quadInDraw;
    } else {
        // CPU path: push constants provide explicit offsets.
        quadIndex = push.quadOffset + quadInDraw;
        chunkIndex = push.chunkIndex;
    }

    PackedQuad packed = quadBuffer[quadIndex];
    uint px, py, pz, w, h, face, matID, ao;
    unpackQuad(packed.data, px, py, pz, w, h, face, matID, ao);

    // The face field is 3 bits wide (0..7) but the lookup tables only have
    // 6 entries; clamp so malformed data cannot index out of bounds.
    face = min(face, 5u);

    // GPU mesh path: chunk index is stored in bits [27:17] of the high word.
    if (gpuMeshPath) {
        chunkIndex = (packed.data.y >> 17) & 0x7FF;
    }

    GPUChunkInfo info = chunkInfoBuffer[chunkIndex];

    // Corner offsets for 2 triangles (6 vertices per quad).
    // cross(U,V) matches N for faces: +X(0), -Y(3), +Z(4) -> CW corners
    // cross(U,V) opposes N for faces: -X(1), +Y(2), -Z(5) -> CCW corners
    static const float2 cornersCW[6] = {
        float2(0, 0), float2(0, 1), float2(1, 0),
        float2(1, 0), float2(0, 1), float2(1, 1)
    };
    static const float2 cornersCCW[6] = {
        float2(0, 0), float2(1, 0), float2(0, 1),
        float2(0, 1), float2(1, 0), float2(1, 1)
    };
    bool useCCW = (face == 1 || face == 2 || face == 5);
    float2 corner = useCCW ? cornersCCW[cornerIndex] : cornersCW[cornerIndex];

    float3 basePos = float3((float)px, (float)py, (float)pz);
    float3 normal = faceNormals[face];
    float3 uAxis = faceU[face];
    float3 vAxis = faceV[face];

    // Positive faces (even index) sit on the far side of the voxel:
    // offset by 1 along the normal.
    float3 faceOffset = (face % 2 == 0) ? normal : float3(0, 0, 0);

    // Expand the quad: w along U, h along V.
    float3 localPos = basePos + faceOffset
                    + uAxis * corner.x * (float)w
                    + vAxis * corner.y * (float)h;
    float3 worldPos = localPos + info.worldPos.xyz;

    output.position = mul(viewProjection, float4(worldPos, 1.0));
    output.worldPos = worldPos;
    output.normal = normal;
    output.uv = corner * float2((float)w, (float)h) * textureTiling;
    output.materialID = matID;
    output.faceID = face;
    output.debugFlag = info.worldPos.w;

    // AO: 4 corners x 2 bits. In both corner tables vertex 3 repeats the
    // corner of vertex 2 and vertex 4 repeats the corner of vertex 1, so they
    // must read the same AO slot; otherwise AO is discontinuous across the
    // quad's diagonal. Vertices 0, 1, 2 and 5 keep slots 0, 1, 2 and 3.
    // NOTE(review): slots 1 and 2 refer to different (u, v) corners for CW and
    // CCW faces - confirm this matches how the mesher packs AO.
    static const uint aoSlotForVertex[6] = { 0u, 1u, 2u, 2u, 1u, 3u };
    uint aoCorner = aoSlotForVertex[cornerIndex];
    float aoValue = (float)((ao >> (aoCorner * 2u)) & 3u) / 3.0;
    output.ao = 1.0 - aoValue * 0.4;

    return output;
}
|