// bvle-voxels/shaders/voxelBLASExtractCS.hlsl

// BVLE Voxels - BLAS Position Extraction Compute Shader (Phase 6.1)
// Reads GPU-generated PackedQuads and writes flat float3 positions
// suitable for DXR BLAS construction (non-indexed, 6 vertices per quad).
//
// Uses the exact same unpack + winding logic as voxelVS.hlsl.
// Output is RWByteAddressBuffer (raw buffer) for BLAS vertex compatibility.

#include "voxelCommon.hlsli"

// One greedy-meshed quad in its packed GPU form. The two 32-bit words are
// decoded field by field in unpackQuad(); this struct only fixes the
// element stride of the structured buffer it is read from.
struct PackedQuad {
    uint2 data; // 8 bytes = 2 x uint32 (x = low word, y = high word)
};

// Input: packed quads, indexed by the dispatch thread ID (one thread per quad).
StructuredBuffer<PackedQuad> quadBuffer : register(t0);
// Input: per-chunk data, indexed by the chunk index decoded from each quad.
// GPUChunkInfo comes from voxelCommon.hlsli; only its worldPos.xyz is read here.
StructuredBuffer<GPUChunkInfo> chunkInfoBuffer : register(t2);

// Output: raw float3 positions (12 bytes each), 6 per quad
// (72 bytes per quad, written at byte offset quadIdx * 72).
RWByteAddressBuffer blasPositions : register(u0);

// Push constants (b999)
// Only quadCount is read by this shader; threads with an index at or above
// it exit without writing anything.
struct BLASPush {
    uint quadCount;
    // 11 unused words, bringing the struct to 12 uints (48 bytes).
    // NOTE(review): presumably padding to a push-constant size shared with
    // other voxel shaders - confirm against the host-side layout.
    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10;
};
[[vk::push_constant]] ConstantBuffer<BLASPush> push : register(b999);

// ── Face direction tables (SAME as voxelVS.hlsl) ───────────────────
// All three tables are indexed by the face ID decoded in unpackQuad().
// Only indices 0..5 exist; the packed face field is 3 bits wide, so the
// encodings 6 and 7 have no table entry.

// Outward unit normal per face: 0=+X, 1=-X, 2=+Y, 3=-Y, 4=+Z, 5=-Z.
static const float3 faceNormals[6] = {
    float3( 1, 0, 0), float3(-1, 0, 0),
    float3( 0, 1, 0), float3( 0,-1, 0),
    float3( 0, 0, 1), float3( 0, 0,-1)
};

// In-plane axis along which the quad's width (w) extends:
// Y for the X faces, X for the Y and Z faces.
static const float3 faceU[6] = {
    float3(0, 1, 0), float3(0, 1, 0),
    float3(1, 0, 0), float3(1, 0, 0),
    float3(1, 0, 0), float3(1, 0, 0)
};

// In-plane axis along which the quad's height (h) extends:
// Z for the X and Y faces, Y for the Z faces.
static const float3 faceV[6] = {
    float3(0, 0, 1), float3(0, 0, 1),
    float3(0, 0, 1), float3(0, 0, 1),
    float3(0, 1, 0), float3(0, 1, 0)
};

// Helper: write one float3 (x, y, z as three consecutive 32-bit words,
// 12 bytes total) into the raw output buffer starting at byteOffset.
// The bit patterns are stored unchanged via asuint().
void storeFloat3(uint byteOffset, float3 v) {
    const uint3 bits = asuint(v);
    blasPositions.Store3(byteOffset, bits);
}

// ── Quad unpacking (SAME as voxelVS.hlsl + chunkIndex from GPU mesh bits) ──
// Splits the two packed words into their fields.
//
//   raw.x  bits [ 5: 0]  px        bits [11: 6]  py      bits [17:12]  pz
//          bits [23:18]  w         bits [29:24]  h       bits [31:30]  face (low 2 bits)
//   raw.y  bit  [0]      face (bit 2)
//          bits [27:17]  chunkIdx (GPU mesh path)
//
// face is therefore a 3-bit value (0..7) and chunkIdx an 11-bit value (0..2047).
void unpackQuad(uint2 raw, out uint px, out uint py, out uint pz,
                out uint w, out uint h, out uint face, out uint chunkIdx)
{
    const uint kSixBits    = 0x3F;
    const uint kElevenBits = 0x7FF;

    const uint word0 = raw.x;
    const uint word1 = raw.y;

    // Five consecutive 6-bit fields in the low word.
    px = word0 & kSixBits;
    py = (word0 >> 6) & kSixBits;
    pz = (word0 >> 12) & kSixBits;
    w  = (word0 >> 18) & kSixBits;
    h  = (word0 >> 24) & kSixBits;

    // The face ID straddles the word boundary: the top two bits of the low
    // word (the logical shift leaves only those two bits) plus bit 0 of the
    // high word as the third bit.
    const uint faceLowBits = word0 >> 30;
    const uint faceTopBit  = (word1 & 0x1) << 2;
    face = faceLowBits | faceTopBit;

    // GPU mesh path: chunkIndex in bits [27:17] of the high word.
    chunkIdx = (word1 >> 17) & kElevenBits;
}

// Entry point: one thread per packed quad.
//
// Each thread decodes quadBuffer[DTid.x], looks up the owning chunk's
// worldPos, computes the quad's four corners and writes them as two
// triangles (6 x float3 = 72 bytes) at byte offset quadIdx * 72 in
// blasPositions. Threads with DTid.x >= push.quadCount write nothing.
//
// The packed face field is 3 bits wide but only faces 0..5 have table
// entries. A quad carrying face 6 or 7 is emitted with zero-length axes, so
// all six vertices land on the quad's own origin (a zero-area quad) instead
// of positions derived from an out-of-bounds table read.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(64, 1, 1)]
void main(uint3 DTid : SV_DispatchThreadID) {
    const uint quadIdx = DTid.x;
    if (quadIdx >= push.quadCount) return;

    const PackedQuad packed = quadBuffer[quadIdx];
    uint px, py, pz, w, h, face, chunkIdx;
    unpackQuad(packed.data, px, py, pz, w, h, face, chunkIdx);

    const GPUChunkInfo info = chunkInfoBuffer[chunkIdx];

    // Guard the 6-entry tables against the two unused 3-bit encodings.
    // tableIdx is always in range; for an invalid face the axes are zeroed
    // so the quad collapses to a single point.
    const bool   faceValid = (face < 6);
    const uint   tableIdx  = faceValid ? face : 0;
    const float3 zero3     = float3(0, 0, 0);

    // ── Compute 4 corner world positions (same math as voxelVS.hlsl) ──
    const float3 basePos = float3((float)px, (float)py, (float)pz);
    const float3 normal  = faceValid ? faceNormals[tableIdx] : zero3;
    const float3 uAxis   = faceValid ? faceU[tableIdx]       : zero3;
    const float3 vAxis   = faceValid ? faceV[tableIdx]       : zero3;

    // Positive faces (even IDs): offset by 1 in normal direction
    const float3 faceOffset = (face % 2 == 0) ? normal : zero3;

    const float3 origin = basePos + faceOffset + info.worldPos.xyz;
    const float3 p00 = origin;
    const float3 p10 = origin + uAxis * (float)w;
    const float3 p01 = origin + vAxis * (float)h;
    const float3 p11 = origin + uAxis * (float)w + vAxis * (float)h;

    // ── Winding: must match voxelVS.hlsl ──
    // CW for faces 0,3,4 ; CCW for faces 1,2,5
    //   CCW: (0,0)(1,0)(0,1), (0,1)(1,0)(1,1)
    //   CW:  (0,0)(0,1)(1,0), (1,0)(0,1)(1,1)
    // Both orders are p00, a, b, b, a, p11 with a/b swapped, so only the
    // two middle corners need to be selected.
    const bool   useCCW  = (face == 1 || face == 2 || face == 5);
    const float3 cornerA = useCCW ? p10 : p01;
    const float3 cornerB = useCCW ? p01 : p10;

    // 6 vertices x 12 bytes (float3) = 72 bytes per quad
    const uint byteBase = quadIdx * 72;
    storeFloat3(byteBase +  0, p00);
    storeFloat3(byteBase + 12, cornerA);
    storeFloat3(byteBase + 24, cornerB);
    storeFloat3(byteBase + 36, cornerB);
    storeFloat3(byteBase + 48, cornerA);
    storeFloat3(byteBase + 60, p11);
}