bvle-voxels/shaders/voxelSmoothCS.hlsl

// BVLE Voxels - GPU Smooth Mesher Pass 2: Emit with Smooth Normals
// Reads ONLY from centroid grid (written by pass 1). No voxel buffer access.
// This keeps the shader simple and fast to compile.
//
// Centroid grid format (float4 per cell, cells [-1..32]):
//   xyz = chunk-local position (valid for surface cells)
//   w   = packed flags: bit24=valid, bit25=solid, [7:0]=mat, [15:8]=secMat, [23:16]=blend
//
// Dispatch: 4x4x4 groups of 8x8x8 threads per chunk (cells [0..31])

#include "voxelCommon.hlsli"

// Per-dispatch parameters delivered as push constants.
// NOTE(review): field order/size presumably has to mirror a host-side struct — confirm against the dispatch code.
struct SmoothPush {
    // Index into chunkInfo for this dispatch; its low 16 bits are also stamped into every emitted vertex.
    uint chunkIndex;
    uint voxelBufferOffset;   // unused in this shader
    // Capacity limit: emitQuad drops a quad whose 6 vertices would end past this slot count.
    uint maxOutputVerts;
    // Offset, in float4 elements, added to every centroidGrid index (see gridIndex).
    uint centroidGridOffset;
    // Never read in this shader; brings the struct to 12 uints in total.
    uint pad[8];
};
[[vk::push_constant]] ConstantBuffer<SmoothPush> push : register(b999);

// Per-chunk records; main() reads worldPos.xyz of entry push.chunkIndex.
// GPUChunkInfo is not declared in this file (presumably in voxelCommon.hlsli — confirm).
StructuredBuffer<GPUChunkInfo> chunkInfo : register(t1);
// Centroid grid, one float4 per cell: xyz = position, w = packed flag bits (read via asuint).
StructuredBuffer<float4> centroidGrid : register(t2);

// One emitted vertex, as written by emitVertex().
struct GPUSmoothVertex {
    // Position: chunk world position + centroid position from the grid.
    float px, py, pz;
    // Normal produced by computeSmoothNormal().
    float nx, ny, nz;
    // [7:0] = primary material, [15:8] = secondary material, [23:16] = blend weight.
    uint packedMat;
    // push.chunkIndex masked to its low 16 bits.
    uint packedChunk;
};
// Vertex output; each quad occupies 6 consecutive slots (two triangles).
RWStructuredBuffer<GPUSmoothVertex> outputVerts : register(u0);
// The uint at byte offset 0 is the running vertex count, advanced atomically by 6 per quad.
RWByteAddressBuffer vertCounter : register(u1);

// Threads with any DTid component >= CSIZE exit immediately in main(), so cells [0..31] are processed.
static const uint CSIZE = 32;
// Row and slice stride of the centroid grid in gridIndex(): 34 entries per axis for cells [-1..32].
static const uint GRID_DIM = 34;

// ── Grid access helpers ─────────────────────────────────────────────
// Linear element index of a cell inside centroidGrid.
// Cell coordinates are shifted by +1 so that cell -1 maps to grid coordinate 0.
// The caller is responsible for keeping cellPos inside [-1..32].
uint gridIndex(int3 cellPos) {
    const uint3 g = uint3(cellPos + 1);
    const uint local = (g.z * GRID_DIM + g.y) * GRID_DIM + g.x;
    return push.centroidGridOffset + local;
}

// Returns the packed flag word (w channel, reinterpreted as uint) of a cell.
// Cells outside the stored [-1..32] range yield 0, i.e. no flag bits set.
uint readGridPacked(int3 cellPos) {
    const bool insideGrid = all(cellPos >= -1) && all(cellPos <= 32);
    if (!insideGrid) return 0;

    const float4 cell = centroidGrid[gridIndex(cellPos)];
    return asuint(cell.w);
}

// True when bit 24 ("valid") of the cell's packed word is set.
// Out-of-range cells are never valid because readGridPacked returns 0 for them.
bool isCentroidValid(int3 cellPos) {
    const uint flags = readGridPacked(cellPos);
    return (flags & (1u << 24)) != 0;
}

// True when bit 25 ("solid") of the cell's packed word is set.
// Out-of-range cells read as not solid because readGridPacked returns 0 for them.
bool isCellSolid(int3 cellPos) {
    const uint flags = readGridPacked(cellPos);
    return (flags & (1u << 25)) != 0;
}

// Position (xyz channels) stored for a cell.
// No range check is done here: callers test isCentroidValid() first.
float3 readCentroidPos(int3 cellPos) {
    const float4 cell = centroidGrid[gridIndex(cellPos)];
    return cell.xyz;
}

// ── Face normal for one quad (4 sharing cells) ──────────────────────
// Un-normalized face normal of the quad spanned by cells c0..c3.
//   c0..c3   : the four cells sharing the edge; all must have a valid centroid,
//              otherwise the result is (0,0,0).
//   solid0   : solidity of the cell on the low side of the edge.
//   edgeAxis : 0 = X, 1 = Y, anything else = Z.
// The normal is built from the triangle (c0, c1, c3); c2 is only validated.
// Its component along edgeAxis is made positive when solid0 is true and
// non-positive otherwise.
float3 computeQuadFaceNormal(int3 c0, int3 c1, int3 c2, int3 c3,
                              bool solid0, int edgeAxis) {
    const bool allValid = isCentroidValid(c0) && isCentroidValid(c1) &&
                          isCentroidValid(c2) && isCentroidValid(c3);
    if (!allValid)
        return float3(0, 0, 0);

    const float3 origin = readCentroidPos(c0);
    const float3 sideA = readCentroidPos(c1) - origin;
    const float3 sideB = readCentroidPos(c3) - origin;

    float3 faceN = cross(sideA, sideB);

    // Component of the normal along the edge axis.
    float alongAxis;
    if (edgeAxis == 0)      alongAxis = faceN.x;
    else if (edgeAxis == 1) alongAxis = faceN.y;
    else                    alongAxis = faceN.z;

    // Flip so the normal points from the solid side toward the empty side.
    const bool pointsPositive = alongAxis > 0.0;
    if (pointsPositive != solid0)
        faceN = -faceN;

    return faceN; // length carries the area weight; not normalized
}

// ── Smooth normal for a vertex at cell v ────────────────────────────

// Face normal of the quad generated by the edge that starts at cell e and
// runs along edgeAxis (0 = X, 1 = Y, 2 = Z).
// The quad's cells are e and its three neighbours one step back along the two
// axes perpendicular to the edge: (e-u-w, e-w, e-u, e).
// solidAtE is the solidity of cell e and is used to orient the normal.
float3 edgeQuadNormal(int3 e, int edgeAxis, bool solidAtE) {
    const int3 u = (edgeAxis == 0) ? int3(0, 1, 0) : int3(1, 0, 0);
    const int3 w = (edgeAxis == 2) ? int3(0, 1, 0) : int3(0, 0, 1);
    return computeQuadFaceNormal(e - u - w, e - w, e - u, e, solidAtE, edgeAxis);
}

// Checks all 12 incident edges (4 per axis), sums the face normals of the
// quads they generate, and normalizes the sum. All reads come from the
// centroid grid only.
//   v : cell whose centroid is the vertex.
// Returns the unit normal, or (0,1,0) when the sum is shorter than 0.0001.
float3 computeSmoothNormal(int3 v) {
    // Solidity of the 2x2x2 block of cells [v .. v+1], read once.
    // Naming: sXYZ is the cell at v + (X, Y, Z).
    const bool s000 = isCellSolid(v);
    const bool s100 = isCellSolid(v + int3(1, 0, 0));
    const bool s010 = isCellSolid(v + int3(0, 1, 0));
    const bool s110 = isCellSolid(v + int3(1, 1, 0));
    const bool s001 = isCellSolid(v + int3(0, 0, 1));
    const bool s101 = isCellSolid(v + int3(1, 0, 1));
    const bool s011 = isCellSolid(v + int3(0, 1, 1));
    const bool s111 = isCellSolid(v + int3(1, 1, 1));

    float3 accum = float3(0, 0, 0);

    // An edge contributes only when it crosses the surface, i.e. its two end
    // cells differ in solidity.

    // X-edges: start at v + (0, dy, dz) for dy,dz in {0,1}
    if (s000 != s100) accum += edgeQuadNormal(v,                  0, s000);
    if (s010 != s110) accum += edgeQuadNormal(v + int3(0, 1, 0),  0, s010);
    if (s001 != s101) accum += edgeQuadNormal(v + int3(0, 0, 1),  0, s001);
    if (s011 != s111) accum += edgeQuadNormal(v + int3(0, 1, 1),  0, s011);

    // Y-edges: start at v + (dx, 0, dz) for dx,dz in {0,1}
    if (s000 != s010) accum += edgeQuadNormal(v,                  1, s000);
    if (s100 != s110) accum += edgeQuadNormal(v + int3(1, 0, 0),  1, s100);
    if (s001 != s011) accum += edgeQuadNormal(v + int3(0, 0, 1),  1, s001);
    if (s101 != s111) accum += edgeQuadNormal(v + int3(1, 0, 1),  1, s101);

    // Z-edges: start at v + (dx, dy, 0) for dx,dy in {0,1}
    if (s000 != s001) accum += edgeQuadNormal(v,                  2, s000);
    if (s100 != s101) accum += edgeQuadNormal(v + int3(1, 0, 0),  2, s100);
    if (s010 != s011) accum += edgeQuadNormal(v + int3(0, 1, 0),  2, s010);
    if (s110 != s111) accum += edgeQuadNormal(v + int3(1, 1, 0),  2, s110);

    // Guard against a degenerate (near-zero) sum before normalizing.
    const float len = length(accum);
    return (len > 0.0001) ? accum / len : float3(0, 1, 0);
}

// ── Emit helpers ────────────────────────────────────────────────────
// Writes one vertex into outputVerts[slot].
//   pos, normal  : stored component-wise as given.
//   primaryMat   : low 8 bits go to packedMat[7:0].
//   secondaryMat : low 8 bits go to packedMat[15:8].
//   blendWeight  : low 8 bits go to packedMat[23:16].
// packedChunk receives the low 16 bits of push.chunkIndex.
void emitVertex(uint slot, float3 pos, float3 normal, uint primaryMat, uint secondaryMat, uint blendWeight) {
    const uint matBits   = primaryMat & 0xFF;
    const uint secBits   = (secondaryMat & 0xFF) << 8;
    const uint blendBits = (blendWeight & 0xFF) << 16;

    GPUSmoothVertex outV;
    outV.px = pos.x;
    outV.py = pos.y;
    outV.pz = pos.z;
    outV.nx = normal.x;
    outV.ny = normal.y;
    outV.nz = normal.z;
    outV.packedMat = matBits | secBits | blendBits;
    outV.packedChunk = push.chunkIndex & 0xFFFF;

    outputVerts[slot] = outV;
}

// Emits one quad as two triangles (6 vertices) with a shared material.
//   p, n     : positions and normals of the four quad corners.
//   windingA : true  -> corner order (0,1,3) (0,3,2)
//              false -> corner order (0,3,1) (0,2,3)
// Six slots are reserved atomically from vertCounter. If they would end past
// push.maxOutputVerts nothing is written, but the counter stays advanced.
void emitQuad(float3 p[4], float3 n[4], uint mat, uint secMat, uint blendW, bool windingA) {
    uint base;
    vertCounter.InterlockedAdd(0, 6, base);
    if (base + 6 > push.maxOutputVerts) return;

    // Both windings start each triangle at corner 0; only the other two
    // corners of each triangle swap places.
    const uint a1 = windingA ? 1 : 3;
    const uint a2 = windingA ? 3 : 1;
    const uint b1 = windingA ? 3 : 2;
    const uint b2 = windingA ? 2 : 3;

    // First triangle
    emitVertex(base + 0, p[0],  n[0],  mat, secMat, blendW);
    emitVertex(base + 1, p[a1], n[a1], mat, secMat, blendW);
    emitVertex(base + 2, p[a2], n[a2], mat, secMat, blendW);
    // Second triangle
    emitVertex(base + 3, p[0],  n[0],  mat, secMat, blendW);
    emitVertex(base + 4, p[b1], n[b1], mat, secMat, blendW);
    emitVertex(base + 5, p[b2], n[b2], mat, secMat, blendW);
}

// ── Main ────────────────────────────────────────────────────────────

// Emits the quad for the surface-crossing edge that starts at cellPos and runs
// along edgeAxis (0 = X, 1 = Y, 2 = Z).
//   cellPos       : cell at the low end of the edge.
//   edgeAxis      : axis of the edge.
//   windingA      : triangle winding selector forwarded to emitQuad.
//   chunkWorldPos : added to every chunk-local centroid position.
// The quad's corners are the centroids of cellPos and its three neighbours one
// step back along the two axes perpendicular to the edge. Nothing is emitted
// unless all four centroids are valid. Material data is taken from cellPos.
void emitEdgeQuad(int3 cellPos, int edgeAxis, bool windingA, float3 chunkWorldPos) {
    const int3 u = (edgeAxis == 0) ? int3(0, 1, 0) : int3(1, 0, 0);
    const int3 w = (edgeAxis == 2) ? int3(0, 1, 0) : int3(0, 0, 1);

    int3 cells[4] = {
        cellPos - u - w,
        cellPos - w,
        cellPos - u,
        cellPos
    };

    if (!isCentroidValid(cells[0]) || !isCentroidValid(cells[1]) ||
        !isCentroidValid(cells[2]) || !isCentroidValid(cells[3]))
        return;

    float3 p[4], n[4];
    [loop] for (uint i = 0; i < 4; i++) {
        p[i] = chunkWorldPos + readCentroidPos(cells[i]);
        n[i] = computeSmoothNormal(cells[i]);
    }

    // Unpack material fields from the edge's own cell.
    const uint packed = readGridPacked(cells[3]);
    const uint mat    = packed & 0xFF;
    const uint secMat = (packed >> 8) & 0xFF;
    const uint blendW = (packed >> 16) & 0xFF;

    emitQuad(p, n, mat, secMat, blendW, windingA);
}

// One thread per cell in [0..31]^3. For each of the three edges leaving the
// cell in +X, +Y and +Z, a quad is emitted when the edge crosses the surface
// (the two end cells differ in solidity).
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(8, 8, 8)]
void main(uint3 DTid : SV_DispatchThreadID)
{
    if (any(DTid >= CSIZE)) return;
    const int3 cellPos = int3(DTid);

    const bool cellSolid = isCellSolid(cellPos);
    const float3 chunkWorldPos = chunkInfo[push.chunkIndex].worldPos.xyz;

    // ── X-edge: cellPos → cellPos + (1,0,0) ────────────────────────
    if (cellSolid != isCellSolid(cellPos + int3(1, 0, 0)))
        emitEdgeQuad(cellPos, 0, !cellSolid, chunkWorldPos);

    // ── Y-edge: cellPos → cellPos + (0,1,0) ────────────────────────
    // The winding selector is inverted for the Y axis relative to X and Z.
    if (cellSolid != isCellSolid(cellPos + int3(0, 1, 0)))
        emitEdgeQuad(cellPos, 1, cellSolid, chunkWorldPos);

    // ── Z-edge: cellPos → cellPos + (0,0,1) ────────────────────────
    if (cellSolid != isCellSolid(cellPos + int3(0, 0, 1)))
        emitEdgeQuad(cellPos, 2, !cellSolid, chunkWorldPos);
}
Phase 5.2-5.3: CPU perf optimizations + GPU compute Surface Nets CPU smooth mesher optimizations (560ms → 17ms): - VoxelData grid cache eliminates redundant readVoxel calls - Pre-cached 27 neighbor chunk pointers (readVoxelFast) - smoothNear dilation (8 lookups/cell instead of 56) - Early exit via containsSmooth flag on chunks - Thread-local scratch buffers (SmoothScratch ~600KB) - wi::jobsystem parallelization across all cores - Persistent staging vectors for upload TopingSystem optimizations (58ms → 6ms): - collectInstancesParallel() with per-chunk local vectors - Neighbor chunk pointer caching GPU compute Surface Nets (Phase 5.3): - Two-pass compute shader: centroid grid + emit with smooth normals - Pass 1 (voxelSmoothCentroidCS): computes centroids + solid flags for cells [-1..32], cross-chunk neighbor voxel reading - Pass 2 (voxelSmoothCS): reads ONLY from centroid grid, computes area-weighted smooth normals from 12 incident edges per vertex - Batched dispatch: all centroid passes then all emit passes with single UAV→SRV barrier (instead of 2 barriers per chunk) - Smooth chunk filtering: only dispatches chunks with containsSmooth - Centroid grid buffer dynamically sized per smooth chunk count - 1-frame readback delay with auto-redispatch on first frame 2026-03-27 22:30:43 +01:00			`// BVLE Voxels - GPU Smooth Mesher Pass 2: Emit with Smooth Normals`
			`// Reads ONLY from centroid grid (written by pass 1). No voxel buffer access.`
			`// This keeps the shader simple and fast to compile.`
			`//`
			`// Centroid grid format (float4 per cell, cells [-1..32]):`
			`// xyz = chunk-local position (valid for surface cells)`
			`// w = packed flags: bit24=valid, bit25=solid, [7:0]=mat, [15:8]=secMat, [23:16]=blend`
			`//`
			`// Dispatch: 4x4x4 groups of 8x8x8 threads per chunk (cells [0..31])`

			`#include "voxelCommon.hlsli"`

			`struct SmoothPush {`
			`uint chunkIndex;`
			`uint voxelBufferOffset; // unused in this shader`
			`uint maxOutputVerts;`
			`uint centroidGridOffset;`
			`uint pad[8];`
			`};`
			`[[vk::push_constant]] ConstantBuffer<SmoothPush> push : register(b999);`

			`StructuredBuffer<GPUChunkInfo> chunkInfo : register(t1);`
			`StructuredBuffer<float4> centroidGrid : register(t2);`

			`struct GPUSmoothVertex {`
			`float px, py, pz;`
			`float nx, ny, nz;`
			`uint packedMat;`
			`uint packedChunk;`
			`};`
			`RWStructuredBuffer<GPUSmoothVertex> outputVerts : register(u0);`
			`RWByteAddressBuffer vertCounter : register(u1);`

			`static const uint CSIZE = 32;`
			`static const uint GRID_DIM = 34;`

			`// ── Grid access helpers ─────────────────────────────────────────────`
			`uint gridIndex(int3 cellPos) {`
			`return push.centroidGridOffset +`
			`(uint)(cellPos.z + 1) * GRID_DIM * GRID_DIM +`
			`(uint)(cellPos.y + 1) * GRID_DIM +`
			`(uint)(cellPos.x + 1);`
			`}`

			`uint readGridPacked(int3 cellPos) {`
			`if (any(cellPos < -1) \|\| any(cellPos > 32)) return 0;`
			`return asuint(centroidGrid[gridIndex(cellPos)].w);`
			`}`

			`bool isCentroidValid(int3 cellPos) {`
			`return (readGridPacked(cellPos) >> 24) & 1;`
			`}`

			`bool isCellSolid(int3 cellPos) {`
			`return ((readGridPacked(cellPos) >> 25) & 1) != 0;`
			`}`

			`float3 readCentroidPos(int3 cellPos) {`
			`return centroidGrid[gridIndex(cellPos)].xyz;`
			`}`

			`// ── Face normal for one quad (4 sharing cells) ──────────────────────`
			`float3 computeQuadFaceNormal(int3 c0, int3 c1, int3 c2, int3 c3,`
			`bool solid0, int edgeAxis) {`
			`if (!isCentroidValid(c0) \|\| !isCentroidValid(c1) \|\|`
			`!isCentroidValid(c2) \|\| !isCentroidValid(c3))`
			`return float3(0, 0, 0);`

			`float3 p0 = readCentroidPos(c0);`
			`float3 p1 = readCentroidPos(c1);`
			`float3 p3 = readCentroidPos(c3);`

			`float3 fn = cross(p1 - p0, p3 - p0);`

			`// Orient: solid→empty direction`
			`int s = solid0 ? +1 : -1;`
			`float fnAxis = (edgeAxis == 0) ? fn.x : ((edgeAxis == 1) ? fn.y : fn.z);`
			`if ((fnAxis > 0.0) != (s > 0)) fn = -fn;`

			`return fn; // area-weighted (not normalized)`
			`}`

			`// ── Smooth normal for a vertex at cell v ────────────────────────────`
			`// Checks all 12 incident edges (4 per axis), computes face normals from`
			`// centroid grid, averages them. All reads from grid only.`
			`float3 computeSmoothNormal(int3 v) {`
			`float3 accum = float3(0, 0, 0);`

			`// X-edges: at (v.x, v.y+dy, v.z+dz) for dy,dz in {0,1}`
			`{`
			`bool sv = isCellSolid(v);`
			`bool sv_x1 = isCellSolid(v + int3(1,0,0));`
			`bool sv_01 = isCellSolid(int3(v.x, v.y+1, v.z));`
			`bool sv_01_x1 = isCellSolid(int3(v.x+1, v.y+1, v.z));`
			`bool sv_10 = isCellSolid(int3(v.x, v.y, v.z+1));`
			`bool sv_10_x1 = isCellSolid(int3(v.x+1, v.y, v.z+1));`
			`bool sv_11 = isCellSolid(int3(v.x, v.y+1, v.z+1));`
			`bool sv_11_x1 = isCellSolid(int3(v.x+1, v.y+1, v.z+1));`

			`// Edge (v.x, v.y, v.z)`
			`if (sv != sv_x1) {`
			`accum += computeQuadFaceNormal(`
			`v + int3(0,-1,-1), v + int3(0,0,-1),`
			`v + int3(0,-1,0), v, sv, 0);`
			`}`
			`// Edge (v.x, v.y+1, v.z)`
			`if (sv_01 != sv_01_x1) {`
			`accum += computeQuadFaceNormal(`
			`int3(v.x, v.y, v.z-1), int3(v.x, v.y+1, v.z-1),`
			`v, int3(v.x, v.y+1, v.z), sv_01, 0);`
			`}`
			`// Edge (v.x, v.y, v.z+1)`
			`if (sv_10 != sv_10_x1) {`
			`accum += computeQuadFaceNormal(`
			`int3(v.x, v.y-1, v.z), v,`
			`int3(v.x, v.y-1, v.z+1), int3(v.x, v.y, v.z+1), sv_10, 0);`
			`}`
			`// Edge (v.x, v.y+1, v.z+1)`
			`if (sv_11 != sv_11_x1) {`
			`accum += computeQuadFaceNormal(`
			`v, int3(v.x, v.y+1, v.z),`
			`int3(v.x, v.y, v.z+1), int3(v.x, v.y+1, v.z+1), sv_11, 0);`
			`}`
			`}`

			`// Y-edges: at (v.x+dx, v.y, v.z+dz) for dx,dz in {0,1}`
			`{`
			`bool sv = isCellSolid(v);`
			`bool sv_y1 = isCellSolid(v + int3(0,1,0));`
			`bool sv_10 = isCellSolid(int3(v.x+1, v.y, v.z));`
			`bool sv_10_y1 = isCellSolid(int3(v.x+1, v.y+1, v.z));`
			`bool sv_01 = isCellSolid(int3(v.x, v.y, v.z+1));`
			`bool sv_01_y1 = isCellSolid(int3(v.x, v.y+1, v.z+1));`
			`bool sv_11 = isCellSolid(int3(v.x+1, v.y, v.z+1));`
			`bool sv_11_y1 = isCellSolid(int3(v.x+1, v.y+1, v.z+1));`

			`if (sv != sv_y1) {`
			`accum += computeQuadFaceNormal(`
			`v + int3(-1,0,-1), v + int3(0,0,-1),`
			`v + int3(-1,0,0), v, sv, 1);`
			`}`
			`if (sv_10 != sv_10_y1) {`
			`accum += computeQuadFaceNormal(`
			`int3(v.x, v.y, v.z-1), int3(v.x+1, v.y, v.z-1),`
			`v, int3(v.x+1, v.y, v.z), sv_10, 1);`
			`}`
			`if (sv_01 != sv_01_y1) {`
			`accum += computeQuadFaceNormal(`
			`int3(v.x-1, v.y, v.z), v,`
			`int3(v.x-1, v.y, v.z+1), int3(v.x, v.y, v.z+1), sv_01, 1);`
			`}`
			`if (sv_11 != sv_11_y1) {`
			`accum += computeQuadFaceNormal(`
			`v, int3(v.x+1, v.y, v.z),`
			`int3(v.x, v.y, v.z+1), int3(v.x+1, v.y, v.z+1), sv_11, 1);`
			`}`
			`}`

			`// Z-edges: at (v.x+dx, v.y+dy, v.z) for dx,dy in {0,1}`
			`{`
			`bool sv = isCellSolid(v);`
			`bool sv_z1 = isCellSolid(v + int3(0,0,1));`
			`bool sv_10 = isCellSolid(int3(v.x+1, v.y, v.z));`
			`bool sv_10_z1 = isCellSolid(int3(v.x+1, v.y, v.z+1));`
			`bool sv_01 = isCellSolid(int3(v.x, v.y+1, v.z));`
			`bool sv_01_z1 = isCellSolid(int3(v.x, v.y+1, v.z+1));`
			`bool sv_11 = isCellSolid(int3(v.x+1, v.y+1, v.z));`
			`bool sv_11_z1 = isCellSolid(int3(v.x+1, v.y+1, v.z+1));`

			`if (sv != sv_z1) {`
			`accum += computeQuadFaceNormal(`
			`v + int3(-1,-1,0), v + int3(0,-1,0),`
			`v + int3(-1,0,0), v, sv, 2);`
			`}`
			`if (sv_10 != sv_10_z1) {`
			`accum += computeQuadFaceNormal(`
			`int3(v.x, v.y-1, v.z), int3(v.x+1, v.y-1, v.z),`
			`v, int3(v.x+1, v.y, v.z), sv_10, 2);`
			`}`
			`if (sv_01 != sv_01_z1) {`
			`accum += computeQuadFaceNormal(`
			`int3(v.x-1, v.y, v.z), v,`
			`int3(v.x-1, v.y+1, v.z), int3(v.x, v.y+1, v.z), sv_01, 2);`
			`}`
			`if (sv_11 != sv_11_z1) {`
			`accum += computeQuadFaceNormal(`
			`v, int3(v.x+1, v.y, v.z),`
			`int3(v.x, v.y+1, v.z), int3(v.x+1, v.y+1, v.z), sv_11, 2);`
			`}`
			`}`

			`float len = length(accum);`
			`return (len > 0.0001) ? accum / len : float3(0, 1, 0);`
			`}`

			`// ── Emit helpers ────────────────────────────────────────────────────`
			`void emitVertex(uint slot, float3 pos, float3 normal, uint primaryMat, uint secondaryMat, uint blendWeight) {`
			`GPUSmoothVertex vert;`
			`vert.px = pos.x; vert.py = pos.y; vert.pz = pos.z;`
			`vert.nx = normal.x; vert.ny = normal.y; vert.nz = normal.z;`
			`vert.packedMat = (primaryMat & 0xFF) \| ((secondaryMat & 0xFF) << 8) \| ((blendWeight & 0xFF) << 16);`
			`vert.packedChunk = push.chunkIndex & 0xFFFF;`
			`outputVerts[slot] = vert;`
			`}`

			`void emitQuad(float3 p[4], float3 n[4], uint mat, uint secMat, uint blendW, bool windingA) {`
			`uint slot;`
			`vertCounter.InterlockedAdd(0, 6, slot);`
			`if (slot + 6 > push.maxOutputVerts) return;`

			`if (windingA) {`
			`emitVertex(slot + 0, p[0], n[0], mat, secMat, blendW);`
			`emitVertex(slot + 1, p[1], n[1], mat, secMat, blendW);`
			`emitVertex(slot + 2, p[3], n[3], mat, secMat, blendW);`
			`emitVertex(slot + 3, p[0], n[0], mat, secMat, blendW);`
			`emitVertex(slot + 4, p[3], n[3], mat, secMat, blendW);`
			`emitVertex(slot + 5, p[2], n[2], mat, secMat, blendW);`
			`} else {`
			`emitVertex(slot + 0, p[0], n[0], mat, secMat, blendW);`
			`emitVertex(slot + 1, p[3], n[3], mat, secMat, blendW);`
			`emitVertex(slot + 2, p[1], n[1], mat, secMat, blendW);`
			`emitVertex(slot + 3, p[0], n[0], mat, secMat, blendW);`
			`emitVertex(slot + 4, p[2], n[2], mat, secMat, blendW);`
			`emitVertex(slot + 5, p[3], n[3], mat, secMat, blendW);`
			`}`
			`}`

			`// ── Main ────────────────────────────────────────────────────────────`
			`[RootSignature(VOXEL_ROOTSIG)]`
			`[numthreads(8, 8, 8)]`
			`void main(uint3 DTid : SV_DispatchThreadID)`
			`{`
			`if (any(DTid >= CSIZE)) return;`
			`int3 cellPos = int3(DTid);`

			`bool cellSolid = isCellSolid(cellPos);`
			`float3 chunkWorldPos = chunkInfo[push.chunkIndex].worldPos.xyz;`

			`// ── X-edge: cellPos → cellPos + (1,0,0) ────────────────────────`
			`{`
			`bool neighborSolid = isCellSolid(cellPos + int3(1, 0, 0));`
			`if (cellSolid != neighborSolid) {`
			`int3 cells[4] = {`
			`cellPos + int3(0, -1, -1),`
			`cellPos + int3(0, 0, -1),`
			`cellPos + int3(0, -1, 0),`
			`cellPos`
			`};`
			`if (isCentroidValid(cells[0]) && isCentroidValid(cells[1]) &&`
			`isCentroidValid(cells[2]) && isCentroidValid(cells[3])) {`
			`float3 p[4], n[4];`
			`[loop] for (uint i = 0; i < 4; i++)`
			`p[i] = chunkWorldPos + readCentroidPos(cells[i]);`
			`[loop] for (uint i = 0; i < 4; i++)`
			`n[i] = computeSmoothNormal(cells[i]);`

			`float3 fn = cross(p[1] - p[0], p[3] - p[0]);`
			`int s = cellSolid ? +1 : -1;`
			`if ((fn.x > 0.0) != (s > 0)) fn = -fn;`
			`bool windingA = !cellSolid;`

			`uint packed = readGridPacked(cells[3]);`
			`uint mat = packed & 0xFF;`
			`uint secMat = (packed >> 8) & 0xFF;`
			`uint blendW = (packed >> 16) & 0xFF;`
			`emitQuad(p, n, mat, secMat, blendW, windingA);`
			`}`
			`}`
			`}`

			`// ── Y-edge: cellPos → cellPos + (0,1,0) ────────────────────────`
			`{`
			`bool neighborSolid = isCellSolid(cellPos + int3(0, 1, 0));`
			`if (cellSolid != neighborSolid) {`
			`int3 cells[4] = {`
			`cellPos + int3(-1, 0, -1),`
			`cellPos + int3( 0, 0, -1),`
			`cellPos + int3(-1, 0, 0),`
			`cellPos`
			`};`
			`if (isCentroidValid(cells[0]) && isCentroidValid(cells[1]) &&`
			`isCentroidValid(cells[2]) && isCentroidValid(cells[3])) {`
			`float3 p[4], n[4];`
			`[loop] for (uint i = 0; i < 4; i++)`
			`p[i] = chunkWorldPos + readCentroidPos(cells[i]);`
			`[loop] for (uint i = 0; i < 4; i++)`
			`n[i] = computeSmoothNormal(cells[i]);`

			`float3 fn = cross(p[1] - p[0], p[3] - p[0]);`
			`int s = cellSolid ? +1 : -1;`
			`if ((fn.y > 0.0) != (s > 0)) fn = -fn;`
			`bool windingA = !cellSolid;`
			`windingA = !windingA; // Y-axis winding flip`

			`uint packed = readGridPacked(cells[3]);`
			`uint mat = packed & 0xFF;`
			`uint secMat = (packed >> 8) & 0xFF;`
			`uint blendW = (packed >> 16) & 0xFF;`
			`emitQuad(p, n, mat, secMat, blendW, windingA);`
			`}`
			`}`
			`}`

			`// ── Z-edge: cellPos → cellPos + (0,0,1) ────────────────────────`
			`{`
			`bool neighborSolid = isCellSolid(cellPos + int3(0, 0, 1));`
			`if (cellSolid != neighborSolid) {`
			`int3 cells[4] = {`
			`cellPos + int3(-1, -1, 0),`
			`cellPos + int3( 0, -1, 0),`
			`cellPos + int3(-1, 0, 0),`
			`cellPos`
			`};`
			`if (isCentroidValid(cells[0]) && isCentroidValid(cells[1]) &&`
			`isCentroidValid(cells[2]) && isCentroidValid(cells[3])) {`
			`float3 p[4], n[4];`
			`[loop] for (uint i = 0; i < 4; i++)`
			`p[i] = chunkWorldPos + readCentroidPos(cells[i]);`
			`[loop] for (uint i = 0; i < 4; i++)`
			`n[i] = computeSmoothNormal(cells[i]);`

			`float3 fn = cross(p[1] - p[0], p[3] - p[0]);`
			`int s = cellSolid ? +1 : -1;`
			`if ((fn.z > 0.0) != (s > 0)) fn = -fn;`
			`bool windingA = !cellSolid;`

			`uint packed = readGridPacked(cells[3]);`
			`uint mat = packed & 0xFF;`
			`uint secMat = (packed >> 8) & 0xFF;`
			`uint blendW = (packed >> 16) & 0xFF;`
			`emitQuad(p, n, mat, secMat, blendW, windingA);`
			`}`
			`}`
			`}`
			`}`