From 626fbaea80b10c5c6a2d1ef96e18bc03ba1e2dc3 Mon Sep 17 00:00:00 2001
From: Samuel Bouchet <contact@samuel-bouchet.fr>
Date: Wed, 1 Apr 2026 20:35:42 +0200
Subject: [PATCH] Fix smooth Surface Nets rendering: eliminate faceting, fix
 blocky junction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove geoN (ddx/ddy) from smooth PS entirely — use smooth interpolated
  normal N for all triplanar sampling (albedo, heightmap, normal map).
  geoN changes discontinuously at triangle edges, causing per-triangle
  faceting in texture weights and normal perturbation.
- Tune consistency-based vertex normal blend to smoothstep(0.70, 0.90):
  snaps to face normal at 90° boundaries (seamless blocky join) while
  preserving smooth normals on curved terrain.
- Unify all 3 edge axes (X/Y/Z) to same smoothstep formula (was mixed
  smoothstep + pow4).
- Remove grass-specific hardcoded shading from both PS (side darkening,
  warm shift, ambient boost) — will be data-driven per-material later.
- Remove CPU SmoothMesher code (GPU-only path).
- Document all findings in TROUBLESHOOTING.md with calibration table.
---
 TROUBLESHOOTING.md          |  61 ++++
 shaders/voxelPS.hlsl        |  18 --
 shaders/voxelSmoothCS.hlsl  | 157 ++++++-----
 shaders/voxelSmoothPS.hlsl  |  41 ++-
 src/voxel/VoxelMesher.cpp   | 537 +-----------------------------------
 src/voxel/VoxelMesher.h     |  21 --
 src/voxel/VoxelRenderer.cpp | 155 +----------
 src/voxel/VoxelRenderer.h   |  15 +-
 src/voxel/VoxelWorld.h      |   5 +-
 9 files changed, 186 insertions(+), 824 deletions(-)

diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md
index 54db1fd..f478c3a 100644
--- a/TROUBLESHOOTING.md
+++ b/TROUBLESHOOTING.md
@@ -19,6 +19,7 @@
 - [CreateBuffer avec capacity > data size](#createbuffer-avec-capacity--data-size)
 - [BLAS/TLAS per-frame recreation — VRAM leak](#blastlas-per-frame-recreation--vram-leak)
 - [Diagnostics et debugging](#diagnostics-et-debugging)
+- [Smooth Surface Nets — Rendu facetté et jointure blocky](#smooth-surface-nets--rendu-facetté-et-jointure-blocky)
 - [Gestion des resource states DX12 (buffers)](#gestion-des-resource-states-dx12-buffers)
 
 ---
@@ -320,6 +321,66 @@ dev->BuildRaytracingAccelerationStructure(&blas, cmd, nullptr);
 
 ---
 
+## Smooth Surface Nets — Rendu facetté et jointure blocky
+
+### Problème 1 : Rendu smooth facetté malgré normales lisses
+
+**Symptôme** : en mode debug (FLAT, NdotL, NORMAL), la surface smooth est parfaitement lisse. Mais en rendu final (ALL), elle apparaît facettée avec des arêtes de triangles visibles.
+
+**Cause racine** : `geoN` (geometric normal via `ddx(worldPos)`/`ddy(worldPos)`) était utilisé pour le triplanar sampling (poids de projection) ET le normal mapping. Cette valeur est la **face normal du triangle à l'écran** — elle change de manière **discontinue** à chaque arête de triangle. Résultat :
+
+1. **Poids triplanar discontinus** → la texture saute aux arêtes (coutures visibles)
+2. **Normal map discontinue** → la perturbation normale diffère par triangle → NdotL facetté
+
+Les modes debug étaient lisses car ils utilisaient `flatN` (smooth normal **avant** perturbation normal map), pas le `N` perturbé.
+
+**Correction** : utiliser `N` (smooth interpolated normal) pour **tout** le triplanar dans `voxelSmoothPS.hlsl` :
+- Poids triplanar albedo/heightmap → `N` (pas `geoN`)
+- Normal map sampling → `N` (pas `geoN`)
+- `geoN` n'est plus calculé/utilisé du tout
+
+`N` varie continûment entre vertices → transitions lisses partout.
+
+### Problème 2 : Jointure visible smooth/blocky
+
+**Symptôme** : contraste visible entre faces smooth et blocky adjacentes, quasi-coplanaires.
+
+**Causes racines** (cumulatives) :
+
+1. **Traitements per-material dans un seul PS** — le blocky PS avait un shading spécifique grass (faces latérales à 60% de luminosité, warm shift chromatique, ambient boost ×1.15) absent du smooth PS. Pour une face grass +X, ça créait ~40% d'écart de luminosité.
+
+2. **Smooth normals biaisées aux frontières** — les vertex normals aux arêtes 90° (mur smooth → sol) étaient moyennées entre faces perpendiculaires (consistency ≈ 0.707), produisant une normale biaisée vers +Y au lieu de +X pur.
+
+**Correction** :
+- **Supprimer les traitements per-material hardcodés** des deux PS. Quand on aura besoin de shading par matériau, le rendre data-driven et l'appliquer identiquement dans les deux shaders.
+- **Consistency-based vertex normal blend** dans `voxelSmoothCS.hlsl` : métrique `|Σfn| / Σ|fn|` qui mesure l'accord des face normals incidentes. Les vertices à faible consistency (arêtes nettes, frontières) reçoivent la face normal pure ; les vertices à haute consistency (surfaces courbes) gardent la smooth normal.
+
+### Calibration du seuil de consistency
+
+Le seuil `smoothstep(low, high, consistency)` contrôle le compromis lisse/net :
+
+| Seuil | con=0.707 (90° edge) | con=0.85 (courbe) | con=0.95 (pente) | Résultat |
+|---|---|---|---|---|
+| `(0.85, 1.0)` | t=0 face ✓ | t=0 face ✗ | t=0.74 (26% face) ✗ | Trop agressif, tout facetté |
+| `(0.60, 0.85)` | t=0.39 ≈ 61% face | t=1.0 smooth ✓ | t=1.0 smooth ✓ | Frontière visible, intérieur lisse |
+| `(0.70, 0.90)` | t≈0 face ✓ | t=0.84 smooth ✓ | t=1.0 smooth ✓ | **Bon compromis** |
+
+**Valeur retenue : `smoothstep(0.70, 0.90)`** — les arêtes 90° (con ≤ 0.707) reçoivent 100% face normal (jointure nette avec blocky), les courbes modérées (con > 0.85) restent smooth.
+
+### Normal map strength
+
+Le smooth PS utilise `nmStrength * 0.7` (vs `nmStrength * 1.0` pour blocky). Les surfaces courbes nécessitent des normal maps atténuées pour que les perturbations ne cassent pas la continuité visuelle du smooth shading.
+
+### Règles
+
+- **Toute modification de lighting/texturing** dans `voxelPS.hlsl` doit être portée dans `voxelSmoothPS.hlsl` (et vice-versa)
+- **Ne JAMAIS utiliser `geoN`** (ddx/ddy) dans le smooth PS pour le triplanar ou le normal mapping — utiliser `N` exclusivement
+- Les deux PS doivent produire un résultat identique sur des faces coplanaires de même matériau (à l'atténuation ×0.7 du normal map côté smooth près — voir « Normal map strength »)
+
+**Fichiers** : `shaders/voxelSmoothCS.hlsl` (consistency blend), `shaders/voxelSmoothPS.hlsl` (triplanar + normal map), `shaders/voxelPS.hlsl` (blocky reference)
+
+---
+
 ## Gestion des resource states DX12 (buffers)
 
 **Wicked Engine ne fait AUCUN tracking automatique d'état pour les buffers.** Les `GPUBarrier::Buffer(buf, before, after)` sont passées directement à D3D12 sans validation. **Le `state_before` DOIT correspondre à l'état DX12 réel, sinon → DXGI_ERROR_INVALID_CALL.**
diff --git a/shaders/voxelPS.hlsl b/shaders/voxelPS.hlsl
index 9be2527..b73ee27 100644
--- a/shaders/voxelPS.hlsl
+++ b/shaders/voxelPS.hlsl
@@ -356,24 +356,6 @@ PSOutput main(PSInput input)
     float3 ambient = lerp(groundAmbient.rgb, skyAmbient.rgb, hemiLerp);
     float3 diffuse = sunColor.rgb * NdotL;
 
-    // Grass-specific shading (Wonderbox style)
-    bool isGrass = (texIndex == 0); // material 1 = grass = texture layer 0
-    if (isGrass) {
-        // Vertical face darkening: use FLAT normal for consistency
-        float verticalDarken = saturate(abs(flatN.y)); // 1=top, 0=side
-        float sideFactor = lerp(0.60, 1.0, verticalDarken); // sides at 60% brightness
-        albedo *= sideFactor;
-
-        // Subtle warm shift: sunlit grass slightly warmer
-        if (NdotL > 0.0) {
-            float3 warmShift = float3(0.08, 0.05, -0.03) * NdotL;
-            diffuse += warmShift;
-        }
-
-        // Boost ambient for grass: inter-reflection from dense foliage
-        ambient *= 1.15;
-    }
-
     // ── Debug lighting modes (F9 cycle) ──
     uint dbgLight = (uint)toneMapParams.w;
     if (dbgLight == 2) {
diff --git a/shaders/voxelSmoothCS.hlsl b/shaders/voxelSmoothCS.hlsl
index 0ddb7f7..f4bfb0d 100644
--- a/shaders/voxelSmoothCS.hlsl
+++ b/shaders/voxelSmoothCS.hlsl
@@ -80,11 +80,25 @@ float3 computeQuadFaceNormal(int3 c0, int3 c1, int3 c2, int3 c3,
     return fn; // area-weighted (not normalized)
 }
 
-// ── Smooth normal for a vertex at cell v ────────────────────────────
+// ── Smooth normal + consistency for a vertex at cell v ──────────────
 // Checks all 12 incident edges (4 per axis), computes face normals from
-// centroid grid, averages them. All reads from grid only.
-float3 computeSmoothNormal(int3 v) {
+// centroid grid, averages them. Also returns a consistency metric:
+//   consistency = |sum(fn)| / sum(|fn|)
+//   = 1.0 when all face normals agree (flat surface)
+//   ≈ 0.707 at a 90° edge (two perpendicular faces)
+//   → 0 when faces cancel out
+// Used at emission time to blend between smooth normal (interior) and
+// face normal (edge vertices).
+float3 computeSmoothNormal(int3 v, out float consistency) {
     float3 accum = float3(0, 0, 0);
+    float totalMag = 0;
+
+    // Helper macro: accumulate one quad's face normal + its magnitude
+    #define ACCUM_QUAD(c0,c1,c2,c3,solid,axis) { \
+        float3 fn_ = computeQuadFaceNormal(c0,c1,c2,c3,solid,axis); \
+        accum += fn_; \
+        totalMag += length(fn_); \
+    }
 
     // X-edges: at (v.x, v.y+dy, v.z+dz) for dy,dz in {0,1}
     {
@@ -97,30 +111,14 @@ float3 computeSmoothNormal(int3 v) {
         bool sv_11 = isCellSolid(int3(v.x, v.y+1, v.z+1));
         bool sv_11_x1 = isCellSolid(int3(v.x+1, v.y+1, v.z+1));
 
-        // Edge (v.x, v.y, v.z)
-        if (sv != sv_x1) {
-            accum += computeQuadFaceNormal(
-                v + int3(0,-1,-1), v + int3(0,0,-1),
-                v + int3(0,-1,0),  v, sv, 0);
-        }
-        // Edge (v.x, v.y+1, v.z)
-        if (sv_01 != sv_01_x1) {
-            accum += computeQuadFaceNormal(
-                int3(v.x, v.y, v.z-1), int3(v.x, v.y+1, v.z-1),
-                v, int3(v.x, v.y+1, v.z), sv_01, 0);
-        }
-        // Edge (v.x, v.y, v.z+1)
-        if (sv_10 != sv_10_x1) {
-            accum += computeQuadFaceNormal(
-                int3(v.x, v.y-1, v.z), v,
-                int3(v.x, v.y-1, v.z+1), int3(v.x, v.y, v.z+1), sv_10, 0);
-        }
-        // Edge (v.x, v.y+1, v.z+1)
-        if (sv_11 != sv_11_x1) {
-            accum += computeQuadFaceNormal(
-                v, int3(v.x, v.y+1, v.z),
-                int3(v.x, v.y, v.z+1), int3(v.x, v.y+1, v.z+1), sv_11, 0);
-        }
+        if (sv != sv_x1)
+            ACCUM_QUAD(v+int3(0,-1,-1), v+int3(0,0,-1), v+int3(0,-1,0), v, sv, 0)
+        if (sv_01 != sv_01_x1)
+            ACCUM_QUAD(int3(v.x,v.y,v.z-1), int3(v.x,v.y+1,v.z-1), v, int3(v.x,v.y+1,v.z), sv_01, 0)
+        if (sv_10 != sv_10_x1)
+            ACCUM_QUAD(int3(v.x,v.y-1,v.z), v, int3(v.x,v.y-1,v.z+1), int3(v.x,v.y,v.z+1), sv_10, 0)
+        if (sv_11 != sv_11_x1)
+            ACCUM_QUAD(v, int3(v.x,v.y+1,v.z), int3(v.x,v.y,v.z+1), int3(v.x,v.y+1,v.z+1), sv_11, 0)
     }
 
     // Y-edges: at (v.x+dx, v.y, v.z+dz) for dx,dz in {0,1}
@@ -134,26 +132,14 @@ float3 computeSmoothNormal(int3 v) {
         bool sv_11 = isCellSolid(int3(v.x+1, v.y, v.z+1));
         bool sv_11_y1 = isCellSolid(int3(v.x+1, v.y+1, v.z+1));
 
-        if (sv != sv_y1) {
-            accum += computeQuadFaceNormal(
-                v + int3(-1,0,-1), v + int3(0,0,-1),
-                v + int3(-1,0,0),  v, sv, 1);
-        }
-        if (sv_10 != sv_10_y1) {
-            accum += computeQuadFaceNormal(
-                int3(v.x, v.y, v.z-1), int3(v.x+1, v.y, v.z-1),
-                v, int3(v.x+1, v.y, v.z), sv_10, 1);
-        }
-        if (sv_01 != sv_01_y1) {
-            accum += computeQuadFaceNormal(
-                int3(v.x-1, v.y, v.z), v,
-                int3(v.x-1, v.y, v.z+1), int3(v.x, v.y, v.z+1), sv_01, 1);
-        }
-        if (sv_11 != sv_11_y1) {
-            accum += computeQuadFaceNormal(
-                v, int3(v.x+1, v.y, v.z),
-                int3(v.x, v.y, v.z+1), int3(v.x+1, v.y, v.z+1), sv_11, 1);
-        }
+        if (sv != sv_y1)
+            ACCUM_QUAD(v+int3(-1,0,-1), v+int3(0,0,-1), v+int3(-1,0,0), v, sv, 1)
+        if (sv_10 != sv_10_y1)
+            ACCUM_QUAD(int3(v.x,v.y,v.z-1), int3(v.x+1,v.y,v.z-1), v, int3(v.x+1,v.y,v.z), sv_10, 1)
+        if (sv_01 != sv_01_y1)
+            ACCUM_QUAD(int3(v.x-1,v.y,v.z), v, int3(v.x-1,v.y,v.z+1), int3(v.x,v.y,v.z+1), sv_01, 1)
+        if (sv_11 != sv_11_y1)
+            ACCUM_QUAD(v, int3(v.x+1,v.y,v.z), int3(v.x,v.y,v.z+1), int3(v.x+1,v.y,v.z+1), sv_11, 1)
     }
 
     // Z-edges: at (v.x+dx, v.y+dy, v.z) for dx,dy in {0,1}
@@ -167,30 +153,21 @@ float3 computeSmoothNormal(int3 v) {
         bool sv_11 = isCellSolid(int3(v.x+1, v.y+1, v.z));
         bool sv_11_z1 = isCellSolid(int3(v.x+1, v.y+1, v.z+1));
 
-        if (sv != sv_z1) {
-            accum += computeQuadFaceNormal(
-                v + int3(-1,-1,0), v + int3(0,-1,0),
-                v + int3(-1,0,0),  v, sv, 2);
-        }
-        if (sv_10 != sv_10_z1) {
-            accum += computeQuadFaceNormal(
-                int3(v.x, v.y-1, v.z), int3(v.x+1, v.y-1, v.z),
-                v, int3(v.x+1, v.y, v.z), sv_10, 2);
-        }
-        if (sv_01 != sv_01_z1) {
-            accum += computeQuadFaceNormal(
-                int3(v.x-1, v.y, v.z), v,
-                int3(v.x-1, v.y+1, v.z), int3(v.x, v.y+1, v.z), sv_01, 2);
-        }
-        if (sv_11 != sv_11_z1) {
-            accum += computeQuadFaceNormal(
-                v, int3(v.x+1, v.y, v.z),
-                int3(v.x, v.y+1, v.z), int3(v.x+1, v.y+1, v.z), sv_11, 2);
-        }
+        if (sv != sv_z1)
+            ACCUM_QUAD(v+int3(-1,-1,0), v+int3(0,-1,0), v+int3(-1,0,0), v, sv, 2)
+        if (sv_10 != sv_10_z1)
+            ACCUM_QUAD(int3(v.x,v.y-1,v.z), int3(v.x+1,v.y-1,v.z), v, int3(v.x+1,v.y,v.z), sv_10, 2)
+        if (sv_01 != sv_01_z1)
+            ACCUM_QUAD(int3(v.x-1,v.y,v.z), v, int3(v.x-1,v.y+1,v.z), int3(v.x,v.y+1,v.z), sv_01, 2)
+        if (sv_11 != sv_11_z1)
+            ACCUM_QUAD(v, int3(v.x+1,v.y,v.z), int3(v.x,v.y+1,v.z), int3(v.x+1,v.y+1,v.z), sv_11, 2)
     }
+    #undef ACCUM_QUAD
 
-    float len = length(accum);
-    return (len > 0.0001) ? accum / len : float3(0, 1, 0);
+    float accumLen = length(accum);
+    // consistency: 1.0 = all faces agree, <1.0 = diverging face directions
+    consistency = (totalMag > 0.0001) ? accumLen / totalMag : 1.0;
+    return (accumLen > 0.0001) ? accum / accumLen : float3(0, 1, 0);
 }
 
 // ── Emit helpers ────────────────────────────────────────────────────
@@ -249,16 +226,30 @@ void main(uint3 DTid : SV_DispatchThreadID)
             if (isCentroidValid(cells[0]) && isCentroidValid(cells[1]) &&
                 isCentroidValid(cells[2]) && isCentroidValid(cells[3])) {
                 float3 p[4], n[4];
+                float con[4];
                 [loop] for (uint i = 0; i < 4; i++)
                     p[i] = chunkWorldPos + readCentroidPos(cells[i]);
                 [loop] for (uint i = 0; i < 4; i++)
-                    n[i] = computeSmoothNormal(cells[i]);
+                    n[i] = computeSmoothNormal(cells[i], con[i]);
 
                 float3 fn = cross(p[1] - p[0], p[3] - p[0]);
                 int s = cellSolid ? +1 : -1;
                 if ((fn.x > 0.0) != (s > 0)) fn = -fn;
                 bool windingA = !cellSolid;
 
+                // Consistency-based blend: sharp edge vertices → face normal, curved → smooth
+                // consistency ≈ 1.0 = flat, ≈ 0.707 = 90° edge, < 0.5 = sharp corner
+                // smoothstep(0.70, 0.90): snaps to face normal at 90° boundaries (con≈0.707 → t≈0)
+                // for seamless join with blocky, preserves smooth for terrain curves (con>0.90)
+                float fnLen = length(fn);
+                if (fnLen > 0.0001) {
+                    float3 fnN = fn / fnLen;
+                    [loop] for (uint i = 0; i < 4; i++) {
+                        float t = smoothstep(0.70, 0.90, con[i]);
+                        n[i] = normalize(lerp(fnN, n[i], t));
+                    }
+                }
+
                 uint packed = readGridPacked(cells[3]);
                 uint mat = packed & 0xFF;
                 uint secMat = (packed >> 8) & 0xFF;
@@ -281,10 +272,11 @@ void main(uint3 DTid : SV_DispatchThreadID)
             if (isCentroidValid(cells[0]) && isCentroidValid(cells[1]) &&
                 isCentroidValid(cells[2]) && isCentroidValid(cells[3])) {
                 float3 p[4], n[4];
+                float con[4];
                 [loop] for (uint i = 0; i < 4; i++)
                     p[i] = chunkWorldPos + readCentroidPos(cells[i]);
                 [loop] for (uint i = 0; i < 4; i++)
-                    n[i] = computeSmoothNormal(cells[i]);
+                    n[i] = computeSmoothNormal(cells[i], con[i]);
 
                 float3 fn = cross(p[1] - p[0], p[3] - p[0]);
                 int s = cellSolid ? +1 : -1;
@@ -292,6 +284,16 @@ void main(uint3 DTid : SV_DispatchThreadID)
                 bool windingA = !cellSolid;
                 windingA = !windingA; // Y-axis winding flip
 
+                // Consistency-based blend (same formula as X-edge)
+                float fnLen = length(fn);
+                if (fnLen > 0.0001) {
+                    float3 fnN = fn / fnLen;
+                    [loop] for (uint i = 0; i < 4; i++) {
+                        float t = smoothstep(0.70, 0.90, con[i]);
+                        n[i] = normalize(lerp(fnN, n[i], t));
+                    }
+                }
+
                 uint packed = readGridPacked(cells[3]);
                 uint mat = packed & 0xFF;
                 uint secMat = (packed >> 8) & 0xFF;
@@ -314,16 +316,27 @@ void main(uint3 DTid : SV_DispatchThreadID)
             if (isCentroidValid(cells[0]) && isCentroidValid(cells[1]) &&
                 isCentroidValid(cells[2]) && isCentroidValid(cells[3])) {
                 float3 p[4], n[4];
+                float con[4];
                 [loop] for (uint i = 0; i < 4; i++)
                     p[i] = chunkWorldPos + readCentroidPos(cells[i]);
                 [loop] for (uint i = 0; i < 4; i++)
-                    n[i] = computeSmoothNormal(cells[i]);
+                    n[i] = computeSmoothNormal(cells[i], con[i]);
 
                 float3 fn = cross(p[1] - p[0], p[3] - p[0]);
                 int s = cellSolid ? +1 : -1;
                 if ((fn.z > 0.0) != (s > 0)) fn = -fn;
                 bool windingA = !cellSolid;
 
+                // Consistency-based blend (same formula as X-edge)
+                float fnLen = length(fn);
+                if (fnLen > 0.0001) {
+                    float3 fnN = fn / fnLen;
+                    [loop] for (uint i = 0; i < 4; i++) {
+                        float t = smoothstep(0.70, 0.90, con[i]);
+                        n[i] = normalize(lerp(fnN, n[i], t));
+                    }
+                }
+
                 uint packed = readGridPacked(cells[3]);
                 uint mat = packed & 0xFF;
                 uint secMat = (packed >> 8) & 0xFF;
diff --git a/shaders/voxelSmoothPS.hlsl b/shaders/voxelSmoothPS.hlsl
index c411a0d..2469ca9 100644
--- a/shaders/voxelSmoothPS.hlsl
+++ b/shaders/voxelSmoothPS.hlsl
@@ -124,14 +124,11 @@ PSOutput main(PSInput input) {
     PSOutput output;
     float3 N = normalize(input.normal); // smooth normal (for lighting)
 
-    // Geometric normal from screen-space derivatives of worldPos.
-    // This is the true triangle face normal — use it for triplanar weights
-    // to avoid texture stretching caused by smooth normal interpolation.
-    float3 dpx = ddx(input.worldPos);
-    float3 dpy = ddy(input.worldPos);
-    float3 geoN = normalize(cross(dpx, dpy));
-    // Ensure geometric normal faces same hemisphere as smooth normal
-    if (dot(geoN, N) < 0.0) geoN = -geoN;
+    // NOTE: geoN (ddx/ddy geometric normal) is NOT used for triplanar sampling
+    // or normal mapping on smooth surfaces. It changes abruptly at triangle edges,
+    // causing per-triangle faceting in texture weights, normal perturbation, and
+    // therefore lighting (NdotL). All triplanar operations use N (smooth interpolated
+    // normal) which varies continuously across vertices → seamless result.
 
     float tiling = textureTiling;
 
@@ -209,13 +206,13 @@ PSOutput main(PSInput input) {
     float3 albedo;
 
     if (uBlend || vBlend) {
-        float4 mainTex = sampleTriplanarRGBA(input.worldPos, geoN, selfTexIdx, tiling);
+        float4 mainTex = sampleTriplanarRGBA(input.worldPos, N, selfTexIdx, tiling);
         float3 result = mainTex.rgb;
         float sharpness = 16.0;
 
         if (uBlend) {
             uint uTexIdx = clamp(uNeighborMat - 1u, 0u, 5u);
-            float4 uTex = sampleTriplanarRGBA(input.worldPos, geoN, uTexIdx, tiling);
+            float4 uTex = sampleTriplanarRGBA(input.worldPos, N, uTexIdx, tiling);
             float bias;
             if (uNeighResists) {
                 bias = 0.5 - uWeight * 1.6;
@@ -230,7 +227,7 @@ PSOutput main(PSInput input) {
 
         if (vBlend) {
             uint vTexIdx = clamp(vNeighborMat - 1u, 0u, 5u);
-            float4 vTex = sampleTriplanarRGBA(input.worldPos, geoN, vTexIdx, tiling);
+            float4 vTex = sampleTriplanarRGBA(input.worldPos, N, vTexIdx, tiling);
             float bias;
             if (vNeighResists) {
                 bias = 0.5 - vWeight * 1.6;
@@ -245,17 +242,24 @@ PSOutput main(PSInput input) {
 
         albedo = result;
     } else {
-        albedo = sampleTriplanar(input.worldPos, geoN, selfTexIdx, tiling);
+        albedo = sampleTriplanar(input.worldPos, N, selfTexIdx, tiling);
     }
 
     // ── Normal map perturbation ──
     float3 flatN = N; // preserve for ambient
     float nmStrength = toneMapParams.z;
     if (nmStrength > 0.0) {
-        float3 perturbedN = sampleTriplanarNormal(input.worldPos, geoN, selfTexIdx, tiling);
-        N = normalize(lerp(N, perturbedN, nmStrength * 0.7)); // lighter on smooth
+        float3 perturbedN = sampleTriplanarNormal(input.worldPos, N, selfTexIdx, tiling);
+        N = normalize(lerp(N, perturbedN, nmStrength * 0.7)); // lighter on smooth for softer transitions
     }
 
+    // ── Lighting ──
+    float3 L = normalize(-sunDirection.xyz);
+    float NdotL = max(dot(N, L), 0.0);
+    float hemiLerp = flatN.y * 0.5 + 0.5;
+    float3 ambient = lerp(groundAmbient.rgb, skyAmbient.rgb, hemiLerp);
+    float3 diffuse = sunColor.rgb * NdotL;
+
     // ── Debug lighting modes (F9 cycle) ──
     uint dbgLight = (uint)toneMapParams.w;
     if (dbgLight == 2) {
@@ -275,7 +279,7 @@ PSOutput main(PSInput input) {
         return output;
     }
     if (dbgLight == 4) {
-        // NdotL only: grayscale NdotL with geometric normal (no normal map)
+        // NdotL only: grayscale NdotL with flat normal (no normal map)
         float flatNdotL = max(dot(flatN, normalize(-sunDirection.xyz)), 0.0);
         output.color = float4(flatNdotL, flatNdotL, flatNdotL, 1.0);
         output.normal = float4(flatN, 0.0);
@@ -288,12 +292,7 @@ PSOutput main(PSInput input) {
         return output;
     }
 
-    // Lighting: flat normal for ambient (consistent), perturbed for NdotL (detail)
-    float3 L = normalize(-sunDirection.xyz);
-    float NdotL = max(dot(N, L), 0.0);
-    float hemiLerp = flatN.y * 0.5 + 0.5;
-    float3 ambient = lerp(groundAmbient.rgb, skyAmbient.rgb, hemiLerp);
-    float3 color = albedo * (sunColor.rgb * NdotL + ambient);
+    float3 color = albedo * (ambient + diffuse);
 
     // ── Rim light ──
     float3 V = normalize(cameraPosition.xyz - input.worldPos);
diff --git a/src/voxel/VoxelMesher.cpp b/src/voxel/VoxelMesher.cpp
index 8239323..bae5b45 100644
--- a/src/voxel/VoxelMesher.cpp
+++ b/src/voxel/VoxelMesher.cpp
@@ -243,538 +243,11 @@ uint8_t VoxelMesher::calcAO(const VoxelWorld& world, const ChunkPos& cpos,
 }
 
 // ══════════════════════════════════════════════════════════════════
-// ── Naive Surface Nets Mesher (Phase 5) ─────────────────────────
+// ── Smooth meshing (Phase 5) ────────────────────────────────────
 // ══════════════════════════════════════════════════════════════════
-//
-// Algorithm:
-//   1. Compute SDF for each voxel: smooth solid = -1, empty = +1
-//      Non-smooth solid voxels act as hard walls (SDF crushed to -1).
-//   2. For each cell on the surface (SDF sign differs from at least one neighbor),
-//      place a vertex at the centroid of edge crossings.
-//   3. For each edge (pair of adjacent cells) with a sign change,
-//      emit a quad connecting the 4 cells that share that edge, then split to 2 triangles.
-//   4. Normals derived from SDF gradient (central differences).
-
-// Padded grid: +2 border for cross-chunk SDF lookups and neighbor smooth detection
-static constexpr int PAD = 2;
-static constexpr int GRID = CHUNK_SIZE + 2 * PAD; // 36
-
-static inline int gridIdx(int x, int y, int z) {
-    return (x + PAD) + (y + PAD) * GRID + (z + PAD) * GRID * GRID;
-}
-
-// Helper: read voxel data at chunk-local coords (with cross-chunk fallback)
-static VoxelData readVoxel(const Chunk& chunk, const VoxelWorld& world, int x, int y, int z) {
-    if (chunk.isInBounds(x, y, z))
-        return chunk.at(x, y, z);
-    return world.getVoxel(
-        chunk.pos.x * CHUNK_SIZE + x,
-        chunk.pos.y * CHUNK_SIZE + y,
-        chunk.pos.z * CHUNK_SIZE + z);
-}
-
-float SmoothMesher::computeSDF(const Chunk& chunk, const VoxelWorld& world,
-                                int x, int y, int z) {
-    VoxelData v = readVoxel(chunk, world, x, y, z);
-    if (v.isEmpty()) return 1.0f;       // empty → positive SDF
-    return -1.0f;                        // any solid → negative SDF
-}
-
-void SmoothMesher::computeNormal(const Chunk& chunk, const VoxelWorld& world,
-                                  int x, int y, int z,
-                                  float& nx, float& ny, float& nz) {
-    // Central differences of the SDF
-    float dx = computeSDF(chunk, world, x+1, y, z) - computeSDF(chunk, world, x-1, y, z);
-    float dy = computeSDF(chunk, world, x, y+1, z) - computeSDF(chunk, world, x, y-1, z);
-    float dz = computeSDF(chunk, world, x, y, z+1) - computeSDF(chunk, world, x, y, z-1);
-
-    float len = std::sqrt(dx*dx + dy*dy + dz*dz);
-    if (len > 0.0001f) {
-        nx = dx / len;
-        ny = dy / len;
-        nz = dz / len;
-    } else {
-        nx = 0.0f; ny = 1.0f; nz = 0.0f;
-    }
-}
-
-// Thread-local scratch buffers to avoid per-chunk allocation overhead.
-// Each worker thread gets its own set, eliminating malloc/free thrashing.
-struct SmoothScratch {
-    float sdf[GRID * GRID * GRID];
-    uint8_t smoothGrid[GRID * GRID * GRID];
-    uint8_t smoothNear[GRID * GRID * GRID]; // dilated: 1 if smooth OR face-adjacent to smooth
-    VoxelData voxelGrid[GRID * GRID * GRID];
-    int32_t vertexMap[33 * 33 * 33]; // VERT_RANGE³
-};
-static thread_local SmoothScratch* tls_scratch = nullptr;
-
-uint32_t SmoothMesher::meshChunk(Chunk& chunk, const VoxelWorld& world) {
-    chunk.smoothVertices.clear();
-    chunk.hasSmooth = false;
-
-    // ── Early exit: skip chunks far from any smooth voxels ──────
-    // Check this chunk + 26 neighbors for containsSmooth flag.
-    // This avoids the expensive 36³ grid fill for ~70% of chunks.
-    {
-        bool nearSmooth = chunk.containsSmooth;
-        if (!nearSmooth) {
-            for (int dz = -1; dz <= 1 && !nearSmooth; dz++)
-            for (int dy = -1; dy <= 1 && !nearSmooth; dy++)
-            for (int dx = -1; dx <= 1 && !nearSmooth; dx++) {
-                if (dx == 0 && dy == 0 && dz == 0) continue;
-                const Chunk* nc = world.getChunk(
-                    ChunkPos{chunk.pos.x + dx, chunk.pos.y + dy, chunk.pos.z + dz});
-                if (nc && nc->containsSmooth) nearSmooth = true;
-            }
-        }
-        if (!nearSmooth) return 0;
-    }
-
-    // Allocate thread-local scratch once per thread (persists across calls)
-    if (!tls_scratch) tls_scratch = new SmoothScratch();
-    auto& scratch = *tls_scratch;
-
-    // ── Step 1: Build SDF grid + smooth flag grid + voxel cache ──
-    // PAD=2 so we have SDF data for cells at [-1..CHUNK_SIZE] (all 8 corners accessible)
-    // Also build a "isSmooth" grid for the same range to detect proximity to smooth voxels.
-    // voxelGrid caches VoxelData to avoid repeated cross-chunk hashmap lookups later.
-    float* sdf = scratch.sdf;
-    uint8_t* smoothGrid = scratch.smoothGrid;
-    VoxelData* voxelGrid = scratch.voxelGrid;
-    constexpr int GRID3 = GRID * GRID * GRID;
-    std::memset(smoothGrid, 0, GRID3);
-    // SDF defaults to 1.0f (empty) — fill below
-    for (int i = 0; i < GRID3; i++) sdf[i] = 1.0f;
-    bool anySmooth = false;
-
-    // Pre-cache neighbor chunk pointers for fast cross-chunk access
-    const Chunk* neighborChunks[3][3][3] = {};
-    for (int dz = -1; dz <= 1; dz++)
-    for (int dy = -1; dy <= 1; dy++)
-    for (int dx = -1; dx <= 1; dx++) {
-        neighborChunks[dx+1][dy+1][dz+1] = world.getChunk(
-            ChunkPos{chunk.pos.x + dx, chunk.pos.y + dy, chunk.pos.z + dz});
-    }
-
-    // Helper: fast voxel read using cached neighbor chunk pointers
-    auto readVoxelFast = [&](int x, int y, int z) -> VoxelData {
-        if (x >= 0 && x < CHUNK_SIZE && y >= 0 && y < CHUNK_SIZE && z >= 0 && z < CHUNK_SIZE)
-            return chunk.at(x, y, z);
-        // Determine which neighbor chunk
-        int cx = (x < 0) ? 0 : (x >= CHUNK_SIZE) ? 2 : 1;
-        int cy = (y < 0) ? 0 : (y >= CHUNK_SIZE) ? 2 : 1;
-        int cz = (z < 0) ? 0 : (z >= CHUNK_SIZE) ? 2 : 1;
-        const Chunk* nc = neighborChunks[cx][cy][cz];
-        if (!nc) return VoxelData{};  // empty if chunk not loaded
-        int lx = ((x % CHUNK_SIZE) + CHUNK_SIZE) % CHUNK_SIZE;
-        int ly = ((y % CHUNK_SIZE) + CHUNK_SIZE) % CHUNK_SIZE;
-        int lz = ((z % CHUNK_SIZE) + CHUNK_SIZE) % CHUNK_SIZE;
-        return nc->at(lx, ly, lz);
-    };
-
-    for (int z = -PAD; z < CHUNK_SIZE + PAD; z++) {
-        for (int y = -PAD; y < CHUNK_SIZE + PAD; y++) {
-            for (int x = -PAD; x < CHUNK_SIZE + PAD; x++) {
-                int gi = gridIdx(x, y, z);
-                VoxelData v = readVoxelFast(x, y, z);
-                voxelGrid[gi] = v;
-                sdf[gi] = v.isEmpty() ? 1.0f : -1.0f;
-                if (v.isSmooth()) {
-                    smoothGrid[gi] = 1;
-                    // Only need anySmooth for this chunk's own voxels
-                    if (chunk.isInBounds(x, y, z)) anySmooth = true;
-                }
-            }
-        }
-    }
-
-    // Also check 1 beyond the chunk (neighbor chunks may have smooth voxels that
-    // affect cells at the chunk boundary)
-    if (!anySmooth) {
-        // Check if any neighbor voxels just outside the chunk are smooth
-        for (int z = -1; z <= CHUNK_SIZE && !anySmooth; z++)
-        for (int y = -1; y <= CHUNK_SIZE && !anySmooth; y++)
-        for (int x = -1; x <= CHUNK_SIZE && !anySmooth; x++) {
-            if (chunk.isInBounds(x, y, z)) continue; // already checked
-            if (smoothGrid[gridIdx(x, y, z)]) anySmooth = true;
-        }
-    }
-
-    if (!anySmooth) return 0;
-    chunk.hasSmooth = true;
-
-    // ── Step 1b: Dilate smoothGrid → smoothNear ──────────────────
-    // Pre-compute "smooth or face-adjacent to smooth" to reduce the
-    // per-cell hasSmooth check from 56 lookups to 8 lookups.
-    uint8_t* smoothNear = scratch.smoothNear;
-    std::memcpy(smoothNear, smoothGrid, GRID3);
-    for (int z = -PAD + 1; z < CHUNK_SIZE + PAD - 1; z++)
-    for (int y = -PAD + 1; y < CHUNK_SIZE + PAD - 1; y++)
-    for (int x = -PAD + 1; x < CHUNK_SIZE + PAD - 1; x++) {
-        if (smoothGrid[gridIdx(x, y, z)]) {
-            smoothNear[gridIdx(x+1, y, z)] = 1;
-            smoothNear[gridIdx(x-1, y, z)] = 1;
-            smoothNear[gridIdx(x, y+1, z)] = 1;
-            smoothNear[gridIdx(x, y-1, z)] = 1;
-            smoothNear[gridIdx(x, y, z+1)] = 1;
-            smoothNear[gridIdx(x, y, z-1)] = 1;
-        }
-    }
-
-    // ── Step 2: Generate vertices for surface cells ──────────────
-    // Extended range: [-1, CHUNK_SIZE) for cross-chunk connectivity.
-    // This chunk generates vertices for cells at [-1..CHUNK_SIZE-1].
-    // The vertex map covers [-1..CHUNK_SIZE-1] → size = CHUNK_SIZE+1, offset by +1.
-    static constexpr int VERT_MIN = -1;
-    static constexpr int VERT_MAX = CHUNK_SIZE; // exclusive
-    static constexpr int VERT_RANGE = VERT_MAX - VERT_MIN; // CHUNK_SIZE + 1 = 33
-    int32_t* vertexMap = scratch.vertexMap;
-    std::memset(vertexMap, -1, VERT_RANGE * VERT_RANGE * VERT_RANGE * sizeof(int32_t));
-
-    auto vertMapIdx = [](int x, int y, int z) -> int {
-        // shift coordinates by -VERT_MIN = +1 so index range is [0, VERT_RANGE)
-        return (x - VERT_MIN) + (y - VERT_MIN) * VERT_RANGE + (z - VERT_MIN) * VERT_RANGE * VERT_RANGE;
-    };
-
-    // World offset for this chunk
-    float ox = (float)(chunk.pos.x * CHUNK_SIZE);
-    float oy = (float)(chunk.pos.y * CHUNK_SIZE);
-    float oz = (float)(chunk.pos.z * CHUNK_SIZE);
-
-    // Corner offsets: (dx,dy,dz) for corner index 0-7 of a cell
-    static const int cornerOff[8][3] = {
-        {0,0,0}, {1,0,0}, {0,1,0}, {1,1,0},
-        {0,0,1}, {1,0,1}, {0,1,1}, {1,1,1},
-    };
-    static const float cornerOffF[8][3] = {
-        {0,0,0}, {1,0,0}, {0,1,0}, {1,1,0},
-        {0,0,1}, {1,0,1}, {0,1,1}, {1,1,1},
-    };
-    static const int edges[12][2] = {
-        {0,1}, {2,3}, {4,5}, {6,7}, // X-axis edges
-        {0,2}, {1,3}, {4,6}, {5,7}, // Y-axis edges
-        {0,4}, {1,5}, {2,6}, {3,7}, // Z-axis edges
-    };
-
-    for (int z = VERT_MIN; z < VERT_MAX; z++) {
-        for (int y = VERT_MIN; y < VERT_MAX; y++) {
-            for (int x = VERT_MIN; x < VERT_MAX; x++) {
-                // hasSmooth check via dilated grid: at least one corner must be
-                // smooth or face-adjacent to smooth. Uses pre-dilated smoothNear
-                // grid → only 8 lookups instead of 56.
-                bool hasSmooth = false;
-                for (int c = 0; c < 8 && !hasSmooth; c++) {
-                    if (smoothNear[gridIdx(x + cornerOff[c][0], y + cornerOff[c][1], z + cornerOff[c][2])])
-                        hasSmooth = true;
-                }
-                if (!hasSmooth) continue;
-
-                // Get SDF at 8 corners of cell (x,y,z)
-                float corner[8];
-                bool hasPos = false, hasNeg = false;
-                for (int c = 0; c < 8; c++) {
-                    corner[c] = sdf[gridIdx(x + cornerOff[c][0], y + cornerOff[c][1], z + cornerOff[c][2])];
-                    if (corner[c] < 0.0f) hasNeg = true;
-                    else hasPos = true;
-                }
-
-                if (!hasPos || !hasNeg) continue; // no sign change → not on surface
-
-                // Compute vertex position as centroid of edge crossings.
-                // +0.5 offset: SDF is sampled at voxel centers, so the cell spans
-                // from (x+0.5) to (x+1.5) in world space. This naturally aligns
-                // the isosurface with the integer grid (voxel face positions).
-                float sumX = 0, sumY = 0, sumZ = 0;
-                int crossCount = 0;
-
-                for (int e = 0; e < 12; e++) {
-                    float s0 = corner[edges[e][0]];
-                    float s1 = corner[edges[e][1]];
-                    if ((s0 < 0.0f) == (s1 < 0.0f)) continue;
-
-                    float t = s0 / (s0 - s1);
-                    t = std::clamp(t, 0.01f, 0.99f);
-
-                    const float* c0 = cornerOffF[edges[e][0]];
-                    const float* c1 = cornerOffF[edges[e][1]];
-                    sumX += c0[0] + t * (c1[0] - c0[0]);
-                    sumY += c0[1] + t * (c1[1] - c0[1]);
-                    sumZ += c0[2] + t * (c1[2] - c0[2]);
-                    crossCount++;
-                }
-
-                if (crossCount == 0) continue;
-
-                float invCross = 1.0f / (float)crossCount;
-                // centroid in [0,1] within the cell
-                float cx = sumX * invCross;
-                float cy = sumY * invCross;
-                float cz = sumZ * invCross;
-
-                // ── Per-axis clamping at blocky boundaries ───────────
-                // With +0.5 offset, the cell spans [x+0.5, x+1.5] in world space.
-                // The integer grid (blocky faces) is at x+1. In centroid coords,
-                // that's centroid = 0.5 (the midpoint of the cell).
-                // If the +side corners (dx=1) contain a blocky solid, clamp centroid ≤ 0.5
-                // If the -side corners (dx=0) contain a blocky solid, clamp centroid ≥ 0.5
-                // This prevents the smooth mesh from extending into blocky territory.
-                bool blockyXlo = false, blockyXhi = false;
-                bool blockyYlo = false, blockyYhi = false;
-                bool blockyZlo = false, blockyZhi = false;
-                for (int c = 0; c < 8; c++) {
-                    if (corner[c] >= 0.0f) continue; // empty corner
-                    VoxelData v = voxelGrid[gridIdx(
-                        x + cornerOff[c][0], y + cornerOff[c][1], z + cornerOff[c][2])];
-                    if (!v.isEmpty() && !v.isSmooth()) {
-                        // This corner is a blocky solid
-                        if (cornerOff[c][0] == 0) blockyXlo = true; else blockyXhi = true;
-                        if (cornerOff[c][1] == 0) blockyYlo = true; else blockyYhi = true;
-                        if (cornerOff[c][2] == 0) blockyZlo = true; else blockyZhi = true;
-                    }
-                }
-                if (blockyXhi) cx = std::min(cx, 0.5f);
-                if (blockyXlo) cx = std::max(cx, 0.5f);
-                if (blockyYhi) cy = std::min(cy, 0.5f);
-                if (blockyYlo) cy = std::max(cy, 0.5f);
-                if (blockyZhi) cz = std::min(cz, 0.5f);
-                if (blockyZlo) cz = std::max(cz, 0.5f);
-
-                // World position with +0.5 offset (SDF at voxel centers)
-                float vx = (float)x + 0.5f + cx;
-                float vy = (float)y + 0.5f + cy;
-                float vz = (float)z + 0.5f + cz;
-
-                // Determine material: prefer smooth voxels' materials to avoid
-                // picking up subsurface blocky materials (e.g., dirt under stone)
-                uint8_t smoothMatCounts[256] = {};
-                uint8_t allMatCounts[256] = {};
-                int smoothCount = 0;
-                for (int c = 0; c < 8; c++) {
-                    if (corner[c] < 0.0f) {
-                        VoxelData v = voxelGrid[gridIdx(
-                            x + cornerOff[c][0], y + cornerOff[c][1], z + cornerOff[c][2])];
-                        if (!v.isEmpty()) {
-                            allMatCounts[v.getMaterialID()]++;
-                            if (v.isSmooth()) {
-                                smoothMatCounts[v.getMaterialID()]++;
-                                smoothCount++;
-                            }
-                        }
-                    }
-                }
-                // Primary material: prefer smooth-only counts to avoid subsurface bleed
-                uint8_t* primaryCounts = (smoothCount > 0) ? smoothMatCounts : allMatCounts;
-                uint8_t bestMat = 6, bestCount = 0;
-                for (int m = 1; m < 256; m++) {
-                    if (primaryCounts[m] > bestCount) {
-                        bestMat = (uint8_t)m; bestCount = primaryCounts[m];
-                    }
-                }
-                // Secondary material: only count SURFACE-EXPOSED voxels (at least one
-                // empty neighbor). This prevents underground materials (dirt under stone)
-                // from bleeding through — same principle as blocky face blending.
-                static const int dirs6[6][3] = {{1,0,0},{-1,0,0},{0,1,0},{0,-1,0},{0,0,1},{0,0,-1}};
-                uint8_t surfaceMatCounts[256] = {};
-                for (int c = 0; c < 8; c++) {
-                    if (corner[c] >= 0.0f) continue;
-                    int cx = x + cornerOff[c][0], cy = y + cornerOff[c][1], cz = z + cornerOff[c][2];
-                    VoxelData v = voxelGrid[gridIdx(cx, cy, cz)];
-                    if (v.isEmpty()) continue;
-                    // Check if this voxel is on the surface
-                    bool onSurface = false;
-                    for (int d = 0; d < 6 && !onSurface; d++) {
-                        if (sdf[gridIdx(cx + dirs6[d][0], cy + dirs6[d][1], cz + dirs6[d][2])] > 0.0f)
-                            onSurface = true;
-                    }
-                    if (onSurface) surfaceMatCounts[v.getMaterialID()]++;
-                }
-                uint8_t secMat = bestMat, secCount = 0;
-                for (int m = 1; m < 256; m++) {
-                    if (m == bestMat) continue;
-                    if (surfaceMatCounts[m] > secCount) {
-                        secMat = (uint8_t)m; secCount = surfaceMatCounts[m];
-                    }
-                }
-                // blendWeight: binary flag — 255 at material boundary, 0 at interior.
-                // GPU interpolation creates the smooth edge-to-interior falloff.
-                uint8_t blendW = (secCount > 0 && secMat != bestMat) ? 255 : 0;
-
-                // Store vertex (normals zeroed — computed later from face normals in Step 4)
-                int32_t vertIdx = (int32_t)chunk.smoothVertices.size();
-                vertexMap[vertMapIdx(x, y, z)] = vertIdx;
-
-                SmoothVertex sv;
-                sv.px = ox + vx;
-                sv.py = oy + vy;
-                sv.pz = oz + vz;
-                sv.nx = 0;
-                sv.ny = 0;
-                sv.nz = 0;
-                sv.materialID = bestMat;
-                sv.secondaryMat = secMat;
-                sv.blendWeight = blendW;
-                sv._pad1 = 0;
-                sv.chunkIndex = 0;
-                sv._pad2 = 0;
-                chunk.smoothVertices.push_back(sv);
-            }
-        }
-    }
-
-    if (chunk.smoothVertices.empty()) {
-        chunk.hasSmooth = false;
-        return 0;
-    }
-
-    // ── Step 3: Emit quads for edges with sign change ────────────
-    // Canonical ownership: this chunk owns edges whose lower endpoint
-    // is in [0, CHUNK_SIZE). Extended to check edges at the chunk
-    // boundary (lower endpoint at CHUNK_SIZE-1, upper at CHUNK_SIZE).
-    // The sharing cells may be at [-1..CHUNK_SIZE-1], all covered by vertex map.
-
-    // Tri with edge axis info for correct normal orientation.
-    // normalAxis: 0=X, 1=Y, 2=Z — the axis of the edge that generated this quad.
-    // normalSign: +1 if the normal should point in +axis direction, -1 for -axis.
-    struct Tri { int32_t a, b, c; int8_t normalAxis; int8_t normalSign; };
-    std::vector<Tri> triangles;
-    triangles.reserve(chunk.smoothVertices.size() * 2);
-
-    // Helper: safe vertex map lookup (returns -1 if out of range)
-    auto safeVertMap = [&](int x, int y, int z) -> int32_t {
-        if (x < VERT_MIN || x >= VERT_MAX ||
-            y < VERT_MIN || y >= VERT_MAX ||
-            z < VERT_MIN || z >= VERT_MAX) return -1;
-        return vertexMap[vertMapIdx(x, y, z)];
-    };
-
-    // Helper: emit 2 triangles for a quad (a,b,c,d) with known desired normal.
-    // The Y-axis sharing cells have a different spatial arrangement from X and Z,
-    // requiring opposite winding to produce correct front-facing triangles.
-    auto emitQuad = [&](int a, int b, int c, int d, float s0, int8_t axis) {
-        if (a < 0 || b < 0 || c < 0 || d < 0) return;
-        int8_t sign = (s0 < 0.0f) ? +1 : -1;
-        // Y-axis has natural winding swapped relative to X and Z
-        bool useWindingA = (s0 > 0.0f);
-        if (axis == 1) useWindingA = !useWindingA;
-        if (useWindingA) {
-            triangles.push_back({a, b, d, axis, sign});
-            triangles.push_back({a, d, c, axis, sign});
-        } else {
-            triangles.push_back({a, d, b, axis, sign});
-            triangles.push_back({a, c, d, axis, sign});
-        }
-    };
-
-    // Iterate over edges owned by this chunk: grid points [0, CHUNK_SIZE)
-    for (int z = 0; z < CHUNK_SIZE; z++) {
-        for (int y = 0; y < CHUNK_SIZE; y++) {
-            for (int x = 0; x < CHUNK_SIZE; x++) {
-                float s0 = sdf[gridIdx(x, y, z)];
-
-                // X-axis edge: (x,y,z) → (x+1,y,z)
-                {
-                    float s1 = sdf[gridIdx(x+1, y, z)];
-                    if ((s0 < 0.0f) != (s1 < 0.0f)) {
-                        emitQuad(
-                            safeVertMap(x, y-1, z-1), safeVertMap(x, y, z-1),
-                            safeVertMap(x, y-1, z),   safeVertMap(x, y, z),
-                            s0, 0);
-                    }
-                }
-
-                // Y-axis edge: (x,y,z) → (x,y+1,z)
-                {
-                    float s1 = sdf[gridIdx(x, y+1, z)];
-                    if ((s0 < 0.0f) != (s1 < 0.0f)) {
-                        emitQuad(
-                            safeVertMap(x-1, y, z-1), safeVertMap(x, y, z-1),
-                            safeVertMap(x-1, y, z),   safeVertMap(x, y, z),
-                            s0, 1);
-                    }
-                }
-
-                // Z-axis edge: (x,y,z) → (x,y,z+1)
-                {
-                    float s1 = sdf[gridIdx(x, y, z+1)];
-                    if ((s0 < 0.0f) != (s1 < 0.0f)) {
-                        emitQuad(
-                            safeVertMap(x-1, y-1, z), safeVertMap(x, y-1, z),
-                            safeVertMap(x-1, y, z),   safeVertMap(x, y, z),
-                            s0, 2);
-                    }
-                }
-            }
-        }
-    }
-
-    // ── Step 4: Compute smooth vertex normals ──────────────────────
-    // Accumulate area-weighted face normals into each indexed vertex,
-    // then normalize. This gives Gouraud-style smooth shading across
-    // the Surface Nets mesh without adding geometry.
-
-    const int vertCount = (int)chunk.smoothVertices.size();
-
-    // Zero out vertex normals (will accumulate face normals)
-    for (auto& sv : chunk.smoothVertices) {
-        sv.nx = 0; sv.ny = 0; sv.nz = 0;
-    }
-
-    // For each triangle: compute oriented face normal, accumulate into vertices.
-    // The cross product magnitude is proportional to triangle area, so larger
-    // triangles contribute more — this is the standard area-weighted approach.
-    for (const auto& tri : triangles) {
-        const SmoothVertex& va = chunk.smoothVertices[tri.a];
-        const SmoothVertex& vb = chunk.smoothVertices[tri.b];
-        const SmoothVertex& vc = chunk.smoothVertices[tri.c];
-
-        float e1x = vb.px - va.px, e1y = vb.py - va.py, e1z = vb.pz - va.pz;
-        float e2x = vc.px - va.px, e2y = vc.py - va.py, e2z = vc.pz - va.pz;
-        float fnx = e1y * e2z - e1z * e2y;
-        float fny = e1z * e2x - e1x * e2z;
-        float fnz = e1x * e2y - e1y * e2x;
-
-        // Orient using the known edge axis (same logic as before)
-        float component = (tri.normalAxis == 0) ? fnx : (tri.normalAxis == 1) ? fny : fnz;
-        if ((component > 0.0f) != (tri.normalSign > 0)) {
-            fnx = -fnx; fny = -fny; fnz = -fnz;
-        }
-
-        // Accumulate (area-weighted — cross product magnitude IS the area×2)
-        chunk.smoothVertices[tri.a].nx += fnx;
-        chunk.smoothVertices[tri.a].ny += fny;
-        chunk.smoothVertices[tri.a].nz += fnz;
-        chunk.smoothVertices[tri.b].nx += fnx;
-        chunk.smoothVertices[tri.b].ny += fny;
-        chunk.smoothVertices[tri.b].nz += fnz;
-        chunk.smoothVertices[tri.c].nx += fnx;
-        chunk.smoothVertices[tri.c].ny += fny;
-        chunk.smoothVertices[tri.c].nz += fnz;
-    }
-
-    // Normalize accumulated vertex normals
-    for (auto& sv : chunk.smoothVertices) {
-        float len = std::sqrt(sv.nx*sv.nx + sv.ny*sv.ny + sv.nz*sv.nz);
-        if (len > 0.0001f) {
-            sv.nx /= len; sv.ny /= len; sv.nz /= len;
-        } else {
-            sv.nx = 0; sv.ny = 1; sv.nz = 0;
-        }
-    }
-
-    // ── Step 5: Expand indexed triangles to triangle list ─────────
-    std::vector<SmoothVertex> expanded;
-    expanded.reserve(triangles.size() * 3);
-    for (const auto& tri : triangles) {
-        expanded.push_back(chunk.smoothVertices[tri.a]);
-        expanded.push_back(chunk.smoothVertices[tri.b]);
-        expanded.push_back(chunk.smoothVertices[tri.c]);
-    }
-
-    chunk.smoothVertices = std::move(expanded);
-    chunk.smoothVertexCount = (uint32_t)chunk.smoothVertices.size();
-
-    return chunk.smoothVertexCount;
-}
+// The CPU SmoothMesher has been removed. Smooth meshing is now handled
+// exclusively by the GPU compute shaders (voxelSmoothCentroidCS.hlsl
+// + voxelSmoothCS.hlsl) which include crease-angle correction for
+// correct normals at sharp edges (e.g. vertical walls).
 
 } // namespace voxel
diff --git a/src/voxel/VoxelMesher.h b/src/voxel/VoxelMesher.h
index b0322a9..49f329f 100644
--- a/src/voxel/VoxelMesher.h
+++ b/src/voxel/VoxelMesher.h
@@ -37,25 +37,4 @@ private:
                           int x, int y, int z, uint8_t face);
 };
 
-// ── Naive Surface Nets Mesher (Phase 5) ─────────────────────────
-// Generates smooth triangle mesh for voxels marked FLAG_SMOOTH.
-// Algorithm: one vertex per surface cell, positioned at edge-crossing centroid.
-// Quads emitted for each edge with sign change, then split into 2 triangles.
-class SmoothMesher {
-public:
-    // Mesh smooth voxels in a chunk, populating chunk.smoothVertices.
-    // Returns number of smooth vertices generated (always multiple of 3, triangle list).
-    static uint32_t meshChunk(Chunk& chunk, const VoxelWorld& world);
-
-private:
-    // SDF value at a voxel position (solid smooth = -1, empty = +1)
-    // Non-smooth solid voxels are treated as walls (SDF = -1 at boundary)
-    static float computeSDF(const Chunk& chunk, const VoxelWorld& world,
-                            int x, int y, int z);
-
-    // Compute SDF gradient (numerical central differences) for normal
-    static void computeNormal(const Chunk& chunk, const VoxelWorld& world,
-                              int x, int y, int z, float& nx, float& ny, float& nz);
-};
-
 } // namespace voxel
diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp
index 1ad7a69..678fb1f 100644
--- a/src/voxel/VoxelRenderer.cpp
+++ b/src/voxel/VoxelRenderer.cpp
@@ -1108,84 +1108,7 @@ void VoxelRenderer::renderTopings(
     dev->RenderPassEnd(cmd);
 }
 
-// ── Phase 5: Smooth Surface Nets upload + rendering ─────────────
-
-void VoxelRenderer::uploadSmoothData(VoxelWorld& world) {
-    if (!device_ || !smoothPso_.IsValid()) return;
-
-    // Collect all smooth vertices from all chunks, stamping each with its chunkIndex.
-    // The chunkIndex must match the order in chunkInfoBuffer_ (assigned by forEachChunk).
-    // Reuse a persistent staging vector to avoid per-frame allocations.
-    smoothStagingVerts_.clear();
-    if (smoothStagingVerts_.capacity() < 64 * 1024)
-        smoothStagingVerts_.reserve(64 * 1024);
-
-    uint32_t chunkIdx = 0;
-    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-        if (chunk.hasSmooth && chunk.smoothVertexCount > 0) {
-            for (auto& sv : chunk.smoothVertices) {
-                sv.chunkIndex = (uint16_t)chunkIdx;
-            }
-            smoothStagingVerts_.insert(smoothStagingVerts_.end(),
-                chunk.smoothVertices.begin(),
-                chunk.smoothVertices.end());
-        }
-        chunkIdx++;
-    });
-
-    smoothVertexCount_ = (uint32_t)std::min(smoothStagingVerts_.size(), (size_t)MAX_SMOOTH_VERTICES);
-
-    if (smoothVertexCount_ == 0) {
-        smoothDirty_ = false;
-        return;
-    }
-
-    // Pre-allocate smooth buffer; only recreate when capacity needs to grow.
-    if (smoothVertexBuf_.ensureCapacity(device_, smoothVertexCount_, sizeof(SmoothVertex),
-            BindFlag::SHADER_RESOURCE)) {
-        wi::backlog::post("Smooth: allocated vertex buffer (" + std::to_string(smoothVertexBuf_.capacity)
-            + " capacity, " + std::to_string(smoothVertexBuf_.capacity * sizeof(SmoothVertex) / 1024) + " KB)");
-    } else {
-        smoothVertexBuf_.markDirty(); // deferred upload in Render()
-    }
-
-    smoothDirty_ = false;
-}
-
-void VoxelRenderer::uploadSmoothDataFast(VoxelWorld& world) {
-    if (!device_ || !smoothPso_.IsValid()) return;
-
-    // Fast path: chunkIndex already stamped during parallel meshChunk.
-    // Just collect vertices (no per-vertex stamping needed).
-    smoothStagingVerts_.clear();
-    if (smoothStagingVerts_.capacity() < 64 * 1024)
-        smoothStagingVerts_.reserve(64 * 1024);
-
-    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-        if (chunk.hasSmooth && chunk.smoothVertexCount > 0) {
-            smoothStagingVerts_.insert(smoothStagingVerts_.end(),
-                chunk.smoothVertices.begin(),
-                chunk.smoothVertices.end());
-        }
-    });
-
-    smoothVertexCount_ = (uint32_t)std::min(smoothStagingVerts_.size(), (size_t)MAX_SMOOTH_VERTICES);
-
-    if (smoothVertexCount_ == 0) {
-        smoothDirty_ = false;
-        return;
-    }
-
-    // Pre-allocate smooth buffer; only recreate when capacity needs to grow.
-    if (smoothVertexBuf_.ensureCapacity(device_, smoothVertexCount_, sizeof(SmoothVertex),
-            BindFlag::SHADER_RESOURCE)) {
-        // Buffer recreated with 25% headroom
-    } else {
-        smoothVertexBuf_.markDirty(); // deferred upload in Render()
-    }
-
-    smoothDirty_ = false;
-}
+// ── Phase 5: Smooth Surface Nets rendering (GPU compute only) ───
 
 void VoxelRenderer::renderSmooth(
     CommandList cmd,
@@ -1193,10 +1116,9 @@ void VoxelRenderer::renderSmooth(
     const Texture& renderTarget,
     const Texture& normalTarget
 ) const {
-    // Use GPU-generated smooth buffer if available, otherwise CPU buffer
-    const bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid();
-    const auto& smoothBuf = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuf_.gpu;
-    uint32_t vertCount = useGpuSmooth ? gpuSmoothVertexCount_ : smoothVertexCount_;
+    // GPU compute smooth buffer only (CPU fallback removed)
+    const auto& smoothBuf = gpuSmoothVertexBuffer_;
+    uint32_t vertCount = gpuSmoothVertexCount_;
 
     if (!smoothPso_.IsValid() || !smoothBuf.IsValid() || vertCount == 0) return;
 
@@ -1306,41 +1228,10 @@ void VoxelRenderPath::Start() {
         wi::backlog::post(msg);
     }
 
-    // Phase 5: Smooth surface mesh — GPU path or CPU fallback
+    // Phase 5: Smooth surface mesh — GPU compute only, dispatched in first Render()
     if (renderer.isInitialized()) {
-        if (renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
-            // GPU smooth mesher available — will dispatch in first Render()
-            renderer.gpuSmoothMeshDirty_ = true;
-            wi::backlog::post("SmoothMesher: GPU path active, dispatch deferred to Render()");
-        } else {
-            // CPU fallback: Surface Nets mesh for smooth voxels (parallelized)
-            std::vector<Chunk*> chunkPtrs;
-            world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-                chunkPtrs.push_back(&chunk);
-            });
-            const VoxelWorld& worldRef = world;
-            wi::jobsystem::context smoothCtx;
-            wi::jobsystem::Dispatch(smoothCtx, (uint32_t)chunkPtrs.size(), 1,
-                [&chunkPtrs, &worldRef](wi::jobsystem::JobArgs args) {
-                    SmoothMesher::meshChunk(*chunkPtrs[args.jobIndex], worldRef);
-                });
-            wi::jobsystem::Wait(smoothCtx);
-
-            uint32_t totalSmooth = 0;
-            uint32_t smoothChunks = 0;
-            for (auto* c : chunkPtrs) {
-                if (c->smoothVertexCount > 0) {
-                    totalSmooth += c->smoothVertexCount;
-                    smoothChunks++;
-                }
-            }
-            renderer.uploadSmoothData(world);
-            char msg[256];
-            snprintf(msg, sizeof(msg),
-                "SmoothMesher: %u vertices (%u tris) in %u chunks",
-                totalSmooth, totalSmooth / 3, smoothChunks);
-            wi::backlog::post(msg);
-        }
+        renderer.gpuSmoothMeshDirty_ = true;
+        wi::backlog::post("SmoothMesher: GPU path active, dispatch deferred to Render()");
     }
 
     worldGenerated_ = true;
@@ -1584,31 +1475,8 @@ void VoxelRenderPath::Update(float dt) {
         renderer.gpuMeshDirty_ = true;
         renderer.rt_.aoHistoryValid = false;
 
-        // Re-mesh smooth surfaces — GPU path or CPU fallback
-        if (renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
-            renderer.gpuSmoothMeshDirty_ = true;
-        } else {
-            auto ts0 = std::chrono::high_resolution_clock::now();
-            std::vector<Chunk*> chunkPtrs;
-            world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-                chunkPtrs.push_back(&chunk);
-            });
-            const VoxelWorld& worldRef = world;
-            wi::jobsystem::context ctx;
-            wi::jobsystem::Dispatch(ctx, (uint32_t)chunkPtrs.size(), 1,
-                [&chunkPtrs, &worldRef](wi::jobsystem::JobArgs args) {
-                    uint32_t idx = args.jobIndex;
-                    SmoothMesher::meshChunk(*chunkPtrs[idx], worldRef);
-                    for (auto& sv : chunkPtrs[idx]->smoothVertices)
-                        sv.chunkIndex = (uint16_t)idx;
-                });
-            wi::jobsystem::Wait(ctx);
-            auto ts1 = std::chrono::high_resolution_clock::now();
-            prof_.smoothMesh.add(std::chrono::duration<float, std::milli>(ts1 - ts0).count());
-            renderer.uploadSmoothDataFast(world);
-            auto ts2 = std::chrono::high_resolution_clock::now();
-            prof_.smoothUpload.add(std::chrono::duration<float, std::milli>(ts2 - ts1).count());
-        }
+        // Re-mesh smooth surfaces — GPU compute only
+        renderer.gpuSmoothMeshDirty_ = true;
 
         // Re-collect toping instances — parallelized
         {
@@ -1710,8 +1578,6 @@ void VoxelRenderPath::Render() const {
             // topingInstanceBuf_ must be filled before dispatchTopingBLASExtract reads it (t5)
             renderer.topingInstanceBuf_.upload(device, cmd,
                 renderer.topingGpuInsts_.data(), (uint32_t)renderer.topingGpuInsts_.size());
-            renderer.smoothVertexBuf_.upload(device, cmd,
-                renderer.smoothStagingVerts_.data(), renderer.smoothVertexCount_);
 
             // ── GPU compute toping BLAS extraction ──
             // Skip during animation (toping BLAS is skipped to save ~130ms GPU)
@@ -1746,8 +1612,7 @@ void VoxelRenderPath::Render() const {
                     rt.dispatchBLASExtract(cmd, renderer.gpuQuadBuffer_,
                         renderer.chunkInfoBuffer_, renderer.gpuMeshQuadCount_);
 
-                    bool useGpuSmooth = renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid();
-                    const auto& smoothVB = useGpuSmooth ? renderer.gpuSmoothVertexBuffer_ : renderer.smoothVertexBuf_.gpu;
+                    const auto& smoothVB = renderer.gpuSmoothVertexBuffer_;
 
                     if (anim_.terrainAnimated) {
                         uint32_t flags = (rtBuildSkipCounter_ & 1)
diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h
index f965251..7231593 100644
--- a/src/voxel/VoxelRenderer.h
+++ b/src/voxel/VoxelRenderer.h
@@ -115,12 +115,7 @@ private:
     wi::graphics::Shader smoothPS_;
     wi::graphics::RasterizerState smoothRasterizer_;
     wi::graphics::PipelineState smoothPso_;
-    DeferredGPUBuffer smoothVertexBuf_;              // StructuredBuffer<SmoothVertex>, SRV t6
-    std::vector<SmoothVertex> smoothStagingVerts_;  // persistent staging buffer (avoids per-frame alloc)
-    static constexpr uint32_t MAX_SMOOTH_VERTICES = 4 * 1024 * 1024; // 4M vertices max
-    mutable uint32_t smoothVertexCount_ = 0;
     mutable uint32_t smoothDrawCalls_ = 0;
-    bool smoothDirty_ = true;
 
     // Texture arrays for materials (512x512, 6 layers each)
     wi::graphics::Texture textureArray_;      // RGBA: RGB=albedo, A=heightmap (t1)
@@ -262,16 +257,14 @@ public:
     ) const;
     uint32_t getTopingDrawCalls() const { return topingDrawCalls_; }
 
-    // Phase 5: Smooth surface rendering
-    void uploadSmoothData(VoxelWorld& world);
-    void uploadSmoothDataFast(VoxelWorld& world); // chunkIndex already stamped
+    // Phase 5: Smooth surface rendering (GPU compute only)
     void renderSmooth(
         wi::graphics::CommandList cmd,
         const wi::graphics::Texture& depthBuffer,
         const wi::graphics::Texture& renderTarget,
         const wi::graphics::Texture& normalTarget
     ) const;
-    uint32_t getSmoothVertexCount() const { return (smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid()) ? gpuSmoothVertexCount_ : smoothVertexCount_; }
+    uint32_t getSmoothVertexCount() const { return gpuSmoothVertexCount_; }
     uint32_t getSmoothDrawCalls() const { return smoothDrawCalls_; }
 
     // Phase 6: Ray Tracing (delegated to VoxelRTManager)
@@ -333,8 +326,8 @@ struct VoxelProfiler {
     ProfileAccum updateMeshes;    // updateMeshes (rebuildChunkInfoOnly)
     ProfileAccum topingCollect;   // topingSystem.collectInstances
     ProfileAccum topingUpload;    // uploadTopingData
-    ProfileAccum smoothMesh;      // SmoothMesher::meshChunk (all chunks)
-    ProfileAccum smoothUpload;    // uploadSmoothData
+    ProfileAccum smoothMesh;      // (legacy, unused — GPU smooth only)
+    ProfileAccum smoothUpload;    // (legacy, unused — GPU smooth only)
     ProfileAccum frame;           // full frame (Update only - legacy)
 
     // Render() phase
diff --git a/src/voxel/VoxelWorld.h b/src/voxel/VoxelWorld.h
index 6cebecb..f3a0ab8 100644
--- a/src/voxel/VoxelWorld.h
+++ b/src/voxel/VoxelWorld.h
@@ -19,10 +19,7 @@ struct Chunk {
     uint32_t faceOffsets[6] = {}; // offset (in quads) for each face group within quads[]
     uint32_t faceCounts[6] = {};  // number of quads per face group
 
-    // Smooth mesh data (output of Surface Nets mesher, Phase 5)
-    std::vector<SmoothVertex> smoothVertices;
-    uint32_t smoothVertexCount = 0;
-    bool hasSmooth = false; // true if chunk has smooth mesh output (set by mesher)
+    // Smooth voxel flag (used by the GPU smooth mesher to decide which chunks to dispatch)
     bool containsSmooth = false; // true if chunk contains any FLAG_SMOOTH voxels (set during generation)
 
     // Cached surface material per column (set during initial generation, reused during animation)