diff --git a/shaders/voxelTopingBLASCS.hlsl b/shaders/voxelTopingBLASCS.hlsl
new file mode 100644
index 0000000..18269df
--- /dev/null
+++ b/shaders/voxelTopingBLASCS.hlsl
@@ -0,0 +1,80 @@
+// BVLE Voxels - Toping BLAS Position Extraction Compute Shader
+// Replaces the 196ms CPU loop that computed world-space toping positions.
+// Reads vertex templates (t4) + instance positions (t5) + group table (t7),
+// writes flat float3 positions (u0) for DXR BLAS construction.
+//
+// One thread per output vertex. Group table maps global vertex index to
+// the correct (instance, local vertex) pair via prefix-sum offsets.
+
+#include "voxelCommon.hlsli"
+
+// Toping mesh vertex (must match C++ TopingVertex, 24 bytes)
+struct TopingVtx {
+    float3 position; // local to voxel [0,1]^3
+    float3 normal;   // unused here, but struct must match
+};
+
+// Toping instance (just the world position, 12 bytes)
+struct TopingInst {
+    float3 worldPos;
+};
+
+// Draw group descriptor for BLAS extraction (must match C++ TopingBLASGroupGPU, 20 bytes)
+struct TopingBLASGroup {
+    uint globalVertexOffset;    // prefix sum: first global vertex index for this group; main()'s early-exit scan needs entries in ascending order
+    uint vertexTemplateOffset;  // offset into topingVertices (t4)
+    uint vertexCount;           // vertices per instance (mesh slice count); used as divisor when splitting a local index
+    uint instanceOffset;        // offset into topingInstances (t5)
+    uint instanceCount;         // number of instances in this group
+};
+
+StructuredBuffer<TopingVtx>       topingVertices  : register(t4);
+StructuredBuffer<TopingInst>      topingInstances : register(t5);
+StructuredBuffer<TopingBLASGroup> topingGroups    : register(t7);
+
+// Output: raw float3 positions (12 bytes each)
+RWByteAddressBuffer blasPositions : register(u0);
+
+// Push constants (b999): 12 uints = 48 bytes; only the first two fields are read by this shader
+struct TopingBLASPush {
+    uint totalVertices; // number of output vertices; threads at or past this index exit immediately
+    uint groupCount;    // number of entries of topingGroups scanned by main()
+    uint pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9;
+};
+[[vk::push_constant]] ConstantBuffer<TopingBLASPush> push : register(b999);
+
+// Writes one tightly packed float3 (3 dwords, 12 bytes) to blasPositions at byteOffset.
+void storeFloat3(uint byteOffset, float3 v) {
+    // Store3 emits the x/y/z dwords at byteOffset, +4 and +8 in a single call.
+    blasPositions.Store3(byteOffset, asuint(v));
+}
+
+[RootSignature(VOXEL_ROOTSIG)]
+[numthreads(64, 1, 1)]
+void main(uint3 DTid : SV_DispatchThreadID) {
+    uint globalIdx = DTid.x;
+    if (globalIdx >= push.totalVertices) return; // surplus threads of the last 64-wide group
+
+    // Find which group this vertex belongs to (linear scan over the ascending table, max ~32 groups)
+    uint groupIdx = 0;
+    for (uint g = 1; g < push.groupCount; g++) {
+        if (globalIdx < topingGroups[g].globalVertexOffset) break;
+        groupIdx = g;
+    }
+    TopingBLASGroup grp = topingGroups[groupIdx];
+
+    // Map global vertex to (instance, local vertex) within this group.
+    // A malformed table (empty group, or index past the group's last instance)
+    // would divide by zero or read a foreign instance: write the origin instead.
+    uint localIdx = globalIdx - grp.globalVertexOffset;
+    if (grp.vertexCount == 0 || localIdx / grp.vertexCount >= grp.instanceCount) {
+        storeFloat3(globalIdx * 12, float3(0, 0, 0));
+        return;
+    }
+    uint instanceIdx = grp.instanceOffset + localIdx / grp.vertexCount;
+    uint vertexIdx   = grp.vertexTemplateOffset + localIdx % grp.vertexCount;
+
+    // Output is world space: instance origin plus the template's voxel-local offset.
+    float3 worldPos = topingInstances[instanceIdx].worldPos + topingVertices[vertexIdx].position;
+    storeFloat3(globalIdx * 12, worldPos);
+}
diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp
index 42e267f..eed1f8b 100644
--- a/src/voxel/VoxelRenderer.cpp
+++ b/src/voxel/VoxelRenderer.cpp
@@ -28,15 +28,6 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) {
     }
     generateTextures();
 
-    // Create mega quad buffer (SRV for vertex pulling)
-    GPUBufferDesc megaDesc;
-    megaDesc.size = MEGA_BUFFER_CAPACITY * sizeof(PackedQuad);
-    megaDesc.bind_flags = BindFlag::SHADER_RESOURCE;
-    megaDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
-    megaDesc.stride = sizeof(PackedQuad);
-    megaDesc.usage = Usage::DEFAULT;
-    device_->CreateBuffer(&megaDesc, nullptr, &megaQuadBuffer_);
-
     // Create chunk info buffer (SRV for VS chunk lookup)
     GPUBufferDesc infoDesc;
     infoDesc.size = MAX_CHUNKS * sizeof(GPUChunkInfo);
@@ -46,25 +37,6 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) {
     infoDesc.usage = Usage::DEFAULT;
     device_->CreateBuffer(&infoDesc, nullptr, &chunkInfoBuffer_);
 
-    // Create indirect args buffer (for DrawInstancedIndirectCount, up to 6 draws per chunk)
-    // UAV bind flag needed for GPU cull compute shader to write args
-    GPUBufferDesc argsDesc;
-    argsDesc.size = MAX_DRAWS * sizeof(IndirectDrawArgs);
-    argsDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
-    argsDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED | ResourceMiscFlag::INDIRECT_ARGS;
-    argsDesc.stride = sizeof(IndirectDrawArgs);
-    argsDesc.usage = Usage::DEFAULT;
-    device_->CreateBuffer(&argsDesc, nullptr, &indirectArgsBuffer_);
-
-    // Create draw count buffer (single uint32, raw for RWByteAddressBuffer)
-    // UAV bind flag needed for GPU cull compute shader atomic counter
-    GPUBufferDesc countDesc;
-    countDesc.size = sizeof(uint32_t);
-    countDesc.bind_flags = BindFlag::UNORDERED_ACCESS;
-    countDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW | ResourceMiscFlag::INDIRECT_ARGS;
-    countDesc.usage = Usage::DEFAULT;
-    device_->CreateBuffer(&countDesc, nullptr, &drawCountBuffer_);
-
     // ── GPU Timestamp Queries ──────────────────────────────────────
     GPUQueryHeapDesc queryDesc;
     queryDesc.type = GpuQueryType::TIMESTAMP;
@@ -197,6 +169,22 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) {
             rtAvailable_ = false;
             wi::backlog::post("VoxelRenderer: RT available but BLAS extraction shader failed", wi::backlog::LogLevel::Warning);
         }
+        // ── Toping BLAS CS (replaces 196ms CPU loop) ─────────────────
+        wi::renderer::LoadShader(ShaderStage::CS, topingBLASShader_, "voxel/voxelTopingBLASCS.cso");
+        if (topingBLASShader_.IsValid()) {
+            // Pre-allocate small group table buffer (max 64 groups × 20 bytes = 1.25 KB)
+            GPUBufferDesc grpDesc;
+            grpDesc.size = MAX_TOPING_BLAS_GROUPS * sizeof(TopingBLASGroupGPU);
+            grpDesc.bind_flags = BindFlag::SHADER_RESOURCE;
+            grpDesc.misc_flags = ResourceMiscFlag::BUFFER_STRUCTURED;
+            grpDesc.stride = sizeof(TopingBLASGroupGPU);
+            grpDesc.usage = Usage::DEFAULT;
+            device_->CreateBuffer(&grpDesc, nullptr, &topingBLASGroupBuffer_);
+            wi::backlog::post("VoxelRenderer: toping BLAS CS available");
+        } else {
+            wi::backlog::post("VoxelRenderer: toping BLAS CS failed", wi::backlog::LogLevel::Warning);
+        }
+
         // ── RT Shadows + AO (Phase 6.2 + 6.3) ────────────────────────
         wi::renderer::LoadShader(ShaderStage::CS, shadowShader_, "voxel/voxelShadowCS.cso",
             wi::graphics::ShaderModel::SM_6_5);
@@ -213,20 +201,16 @@ void VoxelRenderer::initialize(GraphicsDevice* dev) {
         wi::backlog::post("VoxelRenderer: RT not available (GPU does not support ray tracing)");
     }
 
-    cpuMegaQuads_.reserve(MEGA_BUFFER_CAPACITY);
     cpuChunkInfo_.reserve(MAX_CHUNKS);
     chunkSlots_.reserve(MAX_CHUNKS);
-    cpuIndirectArgs_.reserve(MAX_CHUNKS);
 
     initialized_ = true;
-    wi::backlog::post("VoxelRenderer: initialized (mega-buffer: "
-        + std::to_string(MEGA_BUFFER_CAPACITY) + " quads capacity)");
+    wi::backlog::post("VoxelRenderer: initialized (GPU mesh pipeline)");
 }
 
 void VoxelRenderer::shutdown() {
     chunkSlots_.clear();
     cpuChunkInfo_.clear();
-    cpuMegaQuads_.clear();
     initialized_ = false;
 }
 
@@ -250,19 +234,10 @@ void VoxelRenderer::createPipeline() {
     // Load shaders
     wi::renderer::LoadShader(ShaderStage::VS, vertexShader_, "voxel/voxelVS.cso");
     wi::renderer::LoadShader(ShaderStage::PS, pixelShader_, "voxel/voxelPS.cso");
-    wi::renderer::LoadShader(ShaderStage::CS, cullShader_, "voxel/voxelCullCS.cso");
-
     if (!vertexShader_.IsValid() || !pixelShader_.IsValid()) {
         wi::backlog::post("VoxelRenderer: shader loading failed", wi::backlog::LogLevel::Error);
         return;
     }
-    if (cullShader_.IsValid()) {
-        gpuCullingEnabled_ = true;
-        wi::backlog::post("VoxelRenderer: GPU cull compute shader enabled");
-    } else {
-        gpuCullingEnabled_ = false;
-        wi::backlog::post("VoxelRenderer: cull compute shader not available, using CPU fallback", wi::backlog::LogLevel::Warning);
-    }
 
     // Pipeline: backface cull, depth test, opaque blend, triangle list
     PipelineStateDesc psoDesc;
@@ -406,74 +381,7 @@ void VoxelRenderer::generateTextures() {
 
 // ── Mega-buffer rebuild ─────────────────────────────────────────
 // Packs all chunk quads contiguously into a single buffer.
-// Simple strategy: full rebuild whenever any chunk is dirty.
-
-void VoxelRenderer::rebuildMegaBuffer(VoxelWorld& world) {
-    cpuMegaQuads_.clear();
-    chunkSlots_.clear();
-    cpuChunkInfo_.clear();
-
-    // Position → index map for neighbor lookup
-    std::unordered_map<uint64_t, uint32_t> posToIdx;
-    auto posKey = [](const ChunkPos& p) -> uint64_t {
-        return ((uint64_t)(uint16_t)p.x) | ((uint64_t)(uint16_t)p.y << 16) | ((uint64_t)(uint16_t)p.z << 32);
-    };
-
-    uint32_t offset = 0;
-    float debugFlag = debugFaceColors_ ? 1.0f : 0.0f;
-
-    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-        if (chunk.quadCount == 0) return;
-        if (offset + chunk.quadCount > MEGA_BUFFER_CAPACITY) return;
-
-        uint32_t curIdx = (uint32_t)chunkSlots_.size();
-        ChunkSlot slot;
-        slot.pos = pos;
-        slot.quadOffset = offset;
-        slot.quadCount = chunk.quadCount;
-        chunkSlots_.push_back(slot);
-
-        GPUChunkInfo info = {};
-        info.worldPos = XMFLOAT4(
-            (float)(pos.x * CHUNK_SIZE),
-            (float)(pos.y * CHUNK_SIZE),
-            (float)(pos.z * CHUNK_SIZE),
-            debugFlag
-        );
-        info.quadOffset = offset;
-        info.quadCount = chunk.quadCount;
-        for (int f = 0; f < 6; f++) {
-            info.faceOffsets[f] = chunk.faceOffsets[f];
-            info.faceCounts[f] = chunk.faceCounts[f];
-            info.neighbors[f] = 0xFFFFFFFF;
-        }
-        cpuChunkInfo_.push_back(info);
-        posToIdx[posKey(pos)] = curIdx;
-
-        cpuMegaQuads_.insert(cpuMegaQuads_.end(), chunk.quads.begin(), chunk.quads.end());
-        offset += chunk.quadCount;
-    });
-
-    // Fill neighbor indices
-    static const int offsets[6][3] = {
-        {1,0,0}, {-1,0,0}, {0,1,0}, {0,-1,0}, {0,0,1}, {0,0,-1}
-    };
-    for (uint32_t i = 0; i < (uint32_t)chunkSlots_.size(); i++) {
-        const auto& pos = chunkSlots_[i].pos;
-        for (int f = 0; f < 6; f++) {
-            ChunkPos npos = { pos.x + offsets[f][0], pos.y + offsets[f][1], pos.z + offsets[f][2] };
-            auto it = posToIdx.find(posKey(npos));
-            if (it != posToIdx.end()) {
-                cpuChunkInfo_[i].neighbors[f] = it->second;
-            }
-        }
-    }
-
-    chunkCount_ = (uint32_t)chunkSlots_.size();
-    totalQuads_ = offset;
-}
-
-// Build chunkInfoBuffer without CPU meshing (for GPU mesh path)
+// Build chunkInfoBuffer (chunk metadata for GPU mesh path)
 void VoxelRenderer::rebuildChunkInfoOnly(VoxelWorld& world) {
     chunkSlots_.clear();
     cpuChunkInfo_.clear();
@@ -530,175 +438,24 @@ void VoxelRenderer::rebuildChunkInfoOnly(VoxelWorld& world) {
 void VoxelRenderer::updateMeshes(VoxelWorld& world) {
     if (!device_) return;
 
-    // GPU mesh path: skip CPU meshing entirely, just rebuild chunk info
-    if (gpuMeshEnabled_ && gpuMesherAvailable_) {
-        bool anyDirty = false;
-        world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-            if (chunk.dirty) { anyDirty = true; chunk.dirty = false; }
-        });
-        if (anyDirty || megaBufferDirty_) {
-            rebuildChunkInfoOnly(world);
-            // If cache wasn't already filled by fused regen+pack, mark for repack
-            if (!gpuMeshDirty_) {
-                // Non-fused dirty (e.g. initial load): need both repack and GPU update
-                voxelCacheDirty_ = true;
-                gpuMeshDirty_ = true;
-            }
-            // else: fused path already set gpuMeshDirty_=true, cache is clean
-            chunkInfoDirty_ = true;
-            megaBufferDirty_ = false;
-        }
-        return;
-    }
-
-    // CPU meshing path (fallback)
-    // Collect dirty chunks for parallel meshing
-    std::vector<Chunk*> dirtyChunks;
+    // Scan all chunks: note whether any was dirty and clear the flags as we go
+    bool anyDirty = false;
     world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-        if (chunk.dirty) dirtyChunks.push_back(&chunk);
+        if (chunk.dirty) { anyDirty = true; chunk.dirty = false; }
     });
-    bool anyDirty = !dirtyChunks.empty();
-
-    // Parallel CPU greedy meshing via wi::jobsystem
-    auto cpuStart = std::chrono::high_resolution_clock::now();
-    if (anyDirty) {
-        wi::jobsystem::context ctx;
-        wi::jobsystem::Dispatch(ctx, (uint32_t)dirtyChunks.size(), 1,
-            [&dirtyChunks, &world](wi::jobsystem::JobArgs args) {
-                VoxelMesher::meshChunk(*dirtyChunks[args.jobIndex], world);
-            });
-        wi::jobsystem::Wait(ctx);
-    }
-    auto cpuEnd = std::chrono::high_resolution_clock::now();
-
-    if (anyDirty) {
-        cpuMeshTimeMs_ = std::chrono::duration<float, std::milli>(cpuEnd - cpuStart).count();
-        // Trigger GPU benchmark on next render frame
-        if (gpuMesherAvailable_ && benchState_ == BenchState::IDLE) {
-            benchState_ = BenchState::DISPATCH;
-        }
-    }
-
     if (anyDirty || megaBufferDirty_) {
-        rebuildMegaBuffer(world);
+        rebuildChunkInfoOnly(world);
+        // If cache wasn't already filled by fused regen+pack, mark for repack
+        if (!gpuMeshDirty_) {
+            voxelCacheDirty_ = true;
+            gpuMeshDirty_ = true;
+        }
+        chunkInfoDirty_ = true;
         megaBufferDirty_ = false;
     }
 }
 
-// ── GPU Mesh Benchmark (Phase 2.4) ──────────────────────────────
-// Dispatches the baseline 1x1 GPU mesher for ALL chunks and measures timing.
-// State machine: DISPATCH (frame N) → READBACK (frame N+1) → DONE.
-
-void VoxelRenderer::dispatchGpuMeshBenchmark(CommandList cmd, const VoxelWorld& world) const {
-    auto* dev = device_;
-
-    // Zero the quad counter
-    uint32_t zero = 0;
-    dev->UpdateBuffer(&gpuQuadCounter_, &zero, cmd, sizeof(uint32_t));
-
-    // Barrier: COPY_DST → UAV for counter, UNDEFINED → UAV for output buffer
-    GPUBarrier preBarriers[] = {
-        GPUBarrier::Buffer(&gpuQuadCounter_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
-        GPUBarrier::Buffer(&gpuQuadBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
-    };
-    dev->Barrier(preBarriers, 2, cmd);
-
-    dev->BindComputeShader(&meshShader_, cmd);
-
-    // GPU timestamp: mesh begin
-    dev->QueryEnd(&timestampHeap_, TS_MESH_BEGIN, cmd);
-
-    // Dispatch for each chunk
-    uint32_t chunkIdx = 0;
-    world.forEachChunk([&](const ChunkPos& pos, Chunk& chunk) {
-        // Pack voxel data: 32^3 voxels → 16384 uint32s (2 voxels per uint)
-        std::vector<uint32_t> packed(CHUNK_VOLUME / 2, 0);
-        for (int i = 0; i < CHUNK_VOLUME; i++) {
-            uint32_t v = chunk.voxels[i].packed;
-            if (i & 1)
-                packed[i >> 1] |= (v << 16);
-            else
-                packed[i >> 1] = v;
-        }
-
-        // Upload voxel data (re-uses the single-chunk buffer)
-        dev->UpdateBuffer(&voxelDataBuffer_, packed.data(), cmd,
-            packed.size() * sizeof(uint32_t));
-
-        // Bind resources (after BindComputeShader, so PushConstants targets compute)
-        dev->BindResource(&voxelDataBuffer_, 0, cmd);
-        dev->BindUAV(&gpuQuadBuffer_, 0, cmd);
-        dev->BindUAV(&gpuQuadCounter_, 1, cmd);
-
-        // Push constants for this chunk
-        struct MeshPush {
-            uint32_t chunkIndex;
-            uint32_t voxelBufferOffset;
-            uint32_t quadBufferOffset;
-            uint32_t maxOutputQuads;
-            uint32_t pad[8];
-        };
-        MeshPush pushData = {};
-        pushData.chunkIndex = chunkIdx;
-        pushData.voxelBufferOffset = 0; // single-chunk buffer, always at offset 0
-        pushData.quadBufferOffset = 0;  // all chunks share global atomic counter
-        pushData.maxOutputQuads = MEGA_BUFFER_CAPACITY;
-        dev->PushConstants(&pushData, sizeof(pushData), cmd);
-
-        // Dispatch: 32/8 = 4 groups per axis → 64 groups total
-        dev->Dispatch(4, 4, 4, cmd);
-
-        chunkIdx++;
-    });
-
-    // GPU timestamp: mesh end
-    dev->QueryEnd(&timestampHeap_, TS_MESH_END, cmd);
-
-    // Copy quad counter to readback buffer
-    GPUBarrier postBarrier = GPUBarrier::Buffer(
-        &gpuQuadCounter_, ResourceState::UNORDERED_ACCESS, ResourceState::COPY_SRC);
-    dev->Barrier(&postBarrier, 1, cmd);
-    dev->CopyBuffer(&meshCounterReadback_, 0, &gpuQuadCounter_, 0, sizeof(uint32_t), cmd);
-
-    // Resolve timestamps
-    dev->QueryResolve(&timestampHeap_, TS_MESH_BEGIN, 2, &timestampReadback_,
-        TS_MESH_BEGIN * sizeof(uint64_t), cmd);
-
-    benchState_ = BenchState::READBACK;
-}
-
-void VoxelRenderer::readbackGpuMeshBenchmark() const {
-    // Read quad count from readback buffer
-    uint32_t* countData = (uint32_t*)meshCounterReadback_.mapped_data;
-    if (countData) {
-        gpuBaselineQuads_ = *countData;
-    }
-
-    // Read GPU mesh timestamps
-    uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
-    if (tsData) {
-        double freq = (double)device_->GetTimestampFrequency();
-        if (freq > 0.0 && tsData[TS_MESH_END] > tsData[TS_MESH_BEGIN]) {
-            gpuMeshTimeMs_ = (float)((double)(tsData[TS_MESH_END] - tsData[TS_MESH_BEGIN]) / freq * 1000.0);
-        }
-    }
-
-    // Log benchmark results
-    char msg[256];
-    snprintf(msg, sizeof(msg),
-        "=== MESH BENCHMARK ===\n"
-        "  CPU greedy:    %.2f ms, %u quads (%u chunks)\n"
-        "  GPU baseline:  %.3f ms, %u quads (1x1, no merge)\n"
-        "  Ratio quads: %.1fx more (GPU baseline vs CPU greedy)",
-        cpuMeshTimeMs_, totalQuads_, chunkCount_,
-        gpuMeshTimeMs_, gpuBaselineQuads_,
-        totalQuads_ > 0 ? (float)gpuBaselineQuads_ / totalQuads_ : 0.0f);
-    wi::backlog::post(msg);
-
-    benchState_ = BenchState::DONE;
-}
-
-// ── GPU Mesh Dispatch (production path) ─────────────────────────
+// ── GPU Mesh Dispatch ────────────────────────────────────────────
 // Dispatches GPU mesher for ALL chunks every frame. Replaces CPU greedy meshing.
 // Uses the atomic quad counter for 1-frame-delayed readback of total quad count.
 
@@ -1004,6 +761,63 @@ void VoxelRenderer::dispatchBLASExtract(CommandList cmd) const {
     rtBlockyVertexCount_ = quadCount * 6;
 }
 
+// ── GPU compute toping BLAS position extraction ─────────────────
+// Replaces the 196ms CPU nested loop. Reads vertex templates (t4) +
+// instance positions (t5) + group table (t7), writes positions to u0.
+// Skips the dispatch (all flags left untouched) when a resource is missing or the
+// group table would overflow topingBLASGroupBuffer_ (MAX_TOPING_BLAS_GROUPS entries).
+void VoxelRenderer::dispatchTopingBLASExtract(CommandList cmd) const {
+    const size_t groupCount = topingBLASGroupsGPU_.size();
+    if (!topingBLASShader_.IsValid() || !topingBLASGroupBuffer_.IsValid() ||
+        !topingBLASPositionBuffer_.IsValid() || !topingVertexBuffer_.IsValid() ||
+        !topingInstanceBuffer_.IsValid() || topingBLASTotalVertices_ == 0 ||
+        groupCount == 0 || groupCount > (size_t)MAX_TOPING_BLAS_GROUPS) return;
+
+    auto* dev = device_;
+
+    // Upload group table (tiny: ~32 × 20 bytes = 640 bytes)
+    dev->UpdateBuffer(&topingBLASGroupBuffer_, topingBLASGroupsGPU_.data(), cmd,
+        groupCount * sizeof(TopingBLASGroupGPU));
+
+    // Pre-barriers: group table COPY_DST → SRV, position buffer → UAV for the CS writes
+    GPUBarrier preBarriers[] = {
+        GPUBarrier::Buffer(&topingBLASGroupBuffer_,
+            ResourceState::COPY_DST, ResourceState::SHADER_RESOURCE),
+        GPUBarrier::Buffer(&topingBLASPositionBuffer_,
+            ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
+    };
+    dev->Barrier(preBarriers, 2, cmd);
+
+    // Bind compute shader + resources
+    dev->BindComputeShader(&topingBLASShader_, cmd);
+    dev->BindResource(&topingVertexBuffer_, 4, cmd);    // t4
+    dev->BindResource(&topingInstanceBuffer_, 5, cmd);  // t5
+    dev->BindResource(&topingBLASGroupBuffer_, 7, cmd); // t7
+    dev->BindUAV(&topingBLASPositionBuffer_, 0, cmd);   // u0
+
+    // Push constants: 12 × uint32 = 48 bytes, same layout as the shader's TopingBLASPush
+    struct {
+        uint32_t totalVertices;
+        uint32_t groupCount;
+        uint32_t pad[10];
+    } pushData = {};
+    pushData.totalVertices = topingBLASTotalVertices_;
+    pushData.groupCount = (uint32_t)groupCount;
+    dev->PushConstants(&pushData, sizeof(pushData), cmd);
+
+    // Dispatch: one thread per output vertex, 64 threads per group (rounded up)
+    dev->Dispatch((topingBLASTotalVertices_ + 63) / 64, 1, 1, cmd);
+
+    // Post-barrier: UAV → SHADER_RESOURCE (for BLAS build)
+    GPUBarrier postBarrier = GPUBarrier::Buffer(&topingBLASPositionBuffer_,
+        ResourceState::UNORDERED_ACCESS, ResourceState::SHADER_RESOURCE);
+    dev->Barrier(&postBarrier, 1, cmd);
+
+    rtTopingVertexCount_ = topingBLASTotalVertices_;
+    rtDirty_ = true;
+    topingBLASDirty_ = false;
+}
+
 void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
     if (!rtAvailable_) return;
 
@@ -1477,14 +1291,12 @@ void VoxelRenderer::render(
 
     auto* dev = device_;
 
-    // ── GPU Mesh path: quads already dispatched in Render(), just draw ──
-    if (gpuMeshEnabled_ && gpuMesherAvailable_) {
-        // Upload chunk info only when chunks changed
-        if (!cpuChunkInfo_.empty() && chunkInfoDirty_) {
-            dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd,
-                cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
-            chunkInfoDirty_ = false;
-        }
+    // Upload chunk info only when chunks changed
+    if (!cpuChunkInfo_.empty() && chunkInfoDirty_) {
+        dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd,
+            cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
+        chunkInfoDirty_ = false;
+    }
 
         // Per-frame constants
         VoxelConstants cb = {};
@@ -1580,402 +1392,11 @@ void VoxelRenderer::render(
             drawCalls_ = 1;
         }
 
-        dev->RenderPassEnd(cmd);
-        visibleChunks_ = chunkCount_;
-        return;
-    }
-
-    // Upload mega-buffer and chunk info to GPU
-    if (!cpuMegaQuads_.empty()) {
-        dev->UpdateBuffer(&megaQuadBuffer_, cpuMegaQuads_.data(), cmd,
-            cpuMegaQuads_.size() * sizeof(PackedQuad));
-    }
-    if (!cpuChunkInfo_.empty()) {
-        dev->UpdateBuffer(&chunkInfoBuffer_, cpuChunkInfo_.data(), cmd,
-            cpuChunkInfo_.size() * sizeof(GPUChunkInfo));
-    }
-
-    // Per-frame constants (with frustum planes for GPU cull shader)
-    VoxelConstants cb = {};
-    XMMATRIX vpMatrix = camera.GetViewProjection();
-    XMStoreFloat4x4(&cb.viewProjection, vpMatrix);
-    cb.cameraPosition = XMFLOAT4(camera.Eye.x, camera.Eye.y, camera.Eye.z, 1.0f);
-    cb.sunDirection = XMFLOAT4(-0.7f, -0.4f, -0.3f, 0.0f); // lower sun = longer cast shadows
-    cb.sunColor = XMFLOAT4(1.2f, 1.1f, 0.9f, 1.0f);
-    cb.chunkSize = (float)CHUNK_SIZE;
-    cb.textureTiling = 0.25f;
-    cb.blendEnabled = 0.0f; // Phase 3: blending disabled in CPU/MDI paths (no voxel data SRV)
-    cb.debugBlend = 0.0f;
-    cb.bleedMask = 0;
-    cb.resistBleedMask = 0;
-    cb.windTime = windTime_;
-    cb.chunkCount = chunkCount_;
-    extractFrustumPlanes(vpMatrix, cb.frustumPlanes);
-    dev->UpdateBuffer(&constantBuffer_, &cb, cmd, sizeof(cb));
-
-    // Push constant structure (must be 48 bytes = 12 x uint32, matches b999)
-    struct VoxelPush {
-        uint32_t chunkIndex;
-        uint32_t quadOffset;
-        uint32_t flags;       // bit 0: 1=MDI mode, 0=CPU mode
-        uint32_t pad[9];
-    };
-
-    visibleChunks_ = 0;
-    drawCalls_ = 0;
-
-    // ── GPU Cull + MDI path ────────────────────────────────────────
-    if (gpuCullingEnabled_) {
-        // DX12 buffer decay: all buffers return to COMMON after ExecuteCommandLists.
-        // So every frame starts clean — no cross-frame state tracking needed.
-
-        // Zero the draw count via UpdateBuffer (COMMON → COPY_DST implicit promotion)
-        uint32_t zero = 0;
-        dev->UpdateBuffer(&drawCountBuffer_, &zero, cmd, sizeof(uint32_t));
-
-        // Barriers to UAV for compute shader writes:
-        // - drawCountBuffer_: COPY_DST → UAV (was promoted to COPY_DST by UpdateBuffer)
-        // - indirectArgsBuffer_: COMMON → UAV (explicit, required because COMMON can't
-        //   be implicitly promoted to UAV)
-        GPUBarrier preBarriers[] = {
-            GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::COPY_DST, ResourceState::UNORDERED_ACCESS),
-            GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNDEFINED, ResourceState::UNORDERED_ACCESS),
-        };
-        dev->Barrier(preBarriers, 2, cmd);
-
-        // Timestamp: cull begin
-        dev->QueryEnd(&timestampHeap_, TS_CULL_BEGIN, cmd);
-
-        // Dispatch GPU frustum + backface cull compute shader
-        dev->BindComputeShader(&cullShader_, cmd);
-        dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
-        dev->BindResource(&chunkInfoBuffer_, 2, cmd);
-        dev->BindUAV(&indirectArgsBuffer_, 0, cmd);
-        dev->BindUAV(&drawCountBuffer_, 1, cmd);
-        dev->Dispatch((chunkCount_ + 63) / 64, 1, 1, cmd);
-
-        // Timestamp: cull end
-        dev->QueryEnd(&timestampHeap_, TS_CULL_END, cmd);
-
-        // Barriers: UAV → INDIRECT_ARGUMENT for DrawInstancedIndirectCount
-        GPUBarrier postBarriers[] = {
-            GPUBarrier::Buffer(&indirectArgsBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
-            GPUBarrier::Buffer(&drawCountBuffer_, ResourceState::UNORDERED_ACCESS, ResourceState::INDIRECT_ARGUMENT),
-        };
-        dev->Barrier(postBarriers, 2, cmd);
-
-        // ── Render pass (MRT: color + normals + depth) ──────────────
-        RenderPassImage rp[] = {
-            RenderPassImage::RenderTarget(
-                &renderTarget,
-                RenderPassImage::LoadOp::CLEAR,
-                RenderPassImage::StoreOp::STORE,
-                ResourceState::SHADER_RESOURCE,
-                ResourceState::SHADER_RESOURCE
-            ),
-            RenderPassImage::RenderTarget(
-                &normalTarget,
-                RenderPassImage::LoadOp::CLEAR,
-                RenderPassImage::StoreOp::STORE,
-                ResourceState::SHADER_RESOURCE,
-                ResourceState::SHADER_RESOURCE
-            ),
-            RenderPassImage::DepthStencil(
-                &depthBuffer,
-                RenderPassImage::LoadOp::CLEAR,
-                RenderPassImage::StoreOp::STORE,
-                ResourceState::DEPTHSTENCIL,
-                ResourceState::DEPTHSTENCIL,
-                ResourceState::DEPTHSTENCIL
-            ),
-        };
-        dev->RenderPassBegin(rp, 3, cmd);
-
-        Viewport vp;
-        vp.width = (float)renderTarget.GetDesc().width;
-        vp.height = (float)renderTarget.GetDesc().height;
-        vp.min_depth = 0.0f;
-        vp.max_depth = 1.0f;
-        dev->BindViewports(1, &vp, cmd);
-
-        Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
-        dev->BindScissorRects(1, &scissor, cmd);
-
-        dev->BindPipelineState(&pso_, cmd);
-        dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
-        dev->BindResource(&megaQuadBuffer_, 0, cmd);
-        dev->BindResource(&textureArray_, 1, cmd);
-        dev->BindResource(&chunkInfoBuffer_, 2, cmd);
-        dev->BindSampler(&sampler_, 0, cmd);
-
-        // IMPORTANT: PushConstants must be called AFTER BindPipelineState.
-        // Wicked Engine's PushConstants uses SetGraphicsRoot32BitConstants only
-        // when active_pso is set. If called before (with active_cs from compute),
-        // it would set COMPUTE push constants instead of GRAPHICS ones.
-        VoxelPush pushData = {};
-        pushData.flags = 1; // MDI mode
-        dev->PushConstants(&pushData, sizeof(pushData), cmd);
-
-        // Timestamp: draw begin
-        dev->QueryEnd(&timestampHeap_, TS_DRAW_BEGIN, cmd);
-
-        // Single MDI call: GPU cull shader filled the indirect args
-        dev->DrawInstancedIndirectCount(
-            &indirectArgsBuffer_, 0,
-            &drawCountBuffer_, 0,
-            MAX_DRAWS, cmd
-        );
-        drawCalls_ = 1;
-
-        // Timestamp: draw end
-        dev->QueryEnd(&timestampHeap_, TS_DRAW_END, cmd);
-
-        dev->RenderPassEnd(cmd);
-
-        // Resolve timestamps for readback (results available next frame)
-        dev->QueryResolve(&timestampHeap_, 0, TS_COUNT, &timestampReadback_, 0, cmd);
-
-        // Read back previous frame's timestamps (persistently mapped READBACK buffer)
-        uint64_t* tsData = (uint64_t*)timestampReadback_.mapped_data;
-        if (tsData) {
-            double freq = (double)dev->GetTimestampFrequency();
-            if (freq > 0.0 && tsData[TS_CULL_END] > tsData[TS_CULL_BEGIN]) {
-                gpuCullTimeMs_ = (float)((double)(tsData[TS_CULL_END] - tsData[TS_CULL_BEGIN]) / freq * 1000.0);
-            }
-            if (freq > 0.0 && tsData[TS_DRAW_END] > tsData[TS_DRAW_BEGIN]) {
-                gpuDrawTimeMs_ = (float)((double)(tsData[TS_DRAW_END] - tsData[TS_DRAW_BEGIN]) / freq * 1000.0);
-            }
-        }
-
-        // GPU cull handles visibility counting — approximate from chunkCount
-        visibleChunks_ = chunkCount_; // exact count would require readback of drawCount
-
-        return;
-    }
-
-    // ── CPU frustum + backface cull (shared by MDI and per-face paths) ──
-    wi::primitive::Frustum frustum;
-    frustum.Create(camera.GetViewProjection());
-
-    // ── Phase 2.2: CPU-filled indirect args + MDI draw ──────────────
-    if (mdiEnabled_) {
-        // CPU cull: fill indirect args with visible face groups
-        cpuIndirectArgs_.clear();
-        uint32_t cpuDrawCount = 0;
-
-        for (uint32_t i = 0; i < chunkCount_; i++) {
-            const auto& slot = chunkSlots_[i];
-            if (slot.quadCount == 0) continue;
-
-            XMFLOAT3 aabbMin(
-                (float)(slot.pos.x * CHUNK_SIZE),
-                (float)(slot.pos.y * CHUNK_SIZE),
-                (float)(slot.pos.z * CHUNK_SIZE)
-            );
-            XMFLOAT3 aabbMax(
-                aabbMin.x + CHUNK_SIZE,
-                aabbMin.y + CHUNK_SIZE,
-                aabbMin.z + CHUNK_SIZE
-            );
-            wi::primitive::AABB aabb(aabbMin, aabbMax);
-            if (!frustum.CheckBoxFast(aabb)) continue;
-
-            visibleChunks_++;
-            const auto& info = cpuChunkInfo_[i];
-
-            for (uint32_t f = 0; f < 6; f++) {
-                if (info.faceCounts[f] == 0) continue;
-
-                bool backFacing = false;
-                switch (f) {
-                case 0: backFacing = (camera.Eye.x < aabbMin.x); break;
-                case 1: backFacing = (camera.Eye.x > aabbMax.x); break;
-                case 2: backFacing = (camera.Eye.y < aabbMin.y); break;
-                case 3: backFacing = (camera.Eye.y > aabbMax.y); break;
-                case 4: backFacing = (camera.Eye.z < aabbMin.z); break;
-                case 5: backFacing = (camera.Eye.z > aabbMax.z); break;
-                }
-                if (backFacing) continue;
-
-                IndirectDrawArgs args = {};
-                // Pack chunkIndex (low 16 bits) + faceIndex (high 16 bits) into push constant.
-                // The shader unpacks this to look up quadOffset from GPUChunkInfo.
-                // We do NOT use startVertexLocation because SV_VertexID may not include it
-                // reliably in ExecuteIndirect context.
-                args.pushConstant = i | (f << 16);
-                args.vertexCountPerInstance = info.faceCounts[f] * 6;
-                args.instanceCount = 1;
-                args.startVertexLocation = 0;
-                args.startInstanceLocation = 0;
-                cpuIndirectArgs_.push_back(args);
-                cpuDrawCount++;
-            }
-        }
-
-        // Upload indirect args and draw count to GPU
-        // Note: no explicit barriers needed here. Buffers start in COMMON each frame
-        // (DX12 buffer decay after command list execution). COMMON is implicitly
-        // promoted to COPY_DST by UpdateBuffer, then to INDIRECT_ARGUMENT by
-        // DrawInstancedIndirectCount. This matches Phase 2.1 pattern (no barriers
-        // between UpdateBuffer and SRV usage for megaQuadBuffer_/chunkInfoBuffer_).
-        if (!cpuIndirectArgs_.empty()) {
-            dev->UpdateBuffer(&indirectArgsBuffer_, cpuIndirectArgs_.data(), cmd,
-                cpuIndirectArgs_.size() * sizeof(IndirectDrawArgs));
-        }
-        dev->UpdateBuffer(&drawCountBuffer_, &cpuDrawCount, cmd, sizeof(uint32_t));
-
-        // ── Render pass (MRT: color + normals + depth) ──────────────
-        RenderPassImage rp[] = {
-            RenderPassImage::RenderTarget(
-                &renderTarget,
-                RenderPassImage::LoadOp::CLEAR,
-                RenderPassImage::StoreOp::STORE,
-                ResourceState::SHADER_RESOURCE,
-                ResourceState::SHADER_RESOURCE
-            ),
-            RenderPassImage::RenderTarget(
-                &normalTarget,
-                RenderPassImage::LoadOp::CLEAR,
-                RenderPassImage::StoreOp::STORE,
-                ResourceState::SHADER_RESOURCE,
-                ResourceState::SHADER_RESOURCE
-            ),
-            RenderPassImage::DepthStencil(
-                &depthBuffer,
-                RenderPassImage::LoadOp::CLEAR,
-                RenderPassImage::StoreOp::STORE,
-                ResourceState::DEPTHSTENCIL,
-                ResourceState::DEPTHSTENCIL,
-                ResourceState::DEPTHSTENCIL
-            ),
-        };
-        dev->RenderPassBegin(rp, 3, cmd);
-
-        Viewport vp;
-        vp.width = (float)renderTarget.GetDesc().width;
-        vp.height = (float)renderTarget.GetDesc().height;
-        vp.min_depth = 0.0f;
-        vp.max_depth = 1.0f;
-        dev->BindViewports(1, &vp, cmd);
-
-        Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
-        dev->BindScissorRects(1, &scissor, cmd);
-
-        dev->BindPipelineState(&pso_, cmd);
-        dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
-        dev->BindResource(&megaQuadBuffer_, 0, cmd);
-        dev->BindResource(&textureArray_, 1, cmd);
-        dev->BindResource(&chunkInfoBuffer_, 2, cmd);
-        dev->BindSampler(&sampler_, 0, cmd);
-
-        // MDI mode: VS uses binary search to find chunk from SV_VertexID
-        VoxelPush pushData = {};
-        pushData.flags = 1; // MDI mode
-        dev->PushConstants(&pushData, sizeof(pushData), cmd);
-
-        dev->DrawInstancedIndirectCount(
-            &indirectArgsBuffer_, 0,
-            &drawCountBuffer_, 0,
-            MAX_DRAWS, cmd
-        );
-        drawCalls_ = 1;
-
-        dev->RenderPassEnd(cmd);
-        return;
-    }
-
-    // ── Phase 2.1 Fallback: per-face-group DrawInstanced ────────────
-    RenderPassImage rp[] = {
-        RenderPassImage::RenderTarget(
-            &renderTarget,
-            RenderPassImage::LoadOp::CLEAR,
-            RenderPassImage::StoreOp::STORE,
-            ResourceState::SHADER_RESOURCE,
-            ResourceState::SHADER_RESOURCE
-        ),
-        RenderPassImage::RenderTarget(
-            &normalTarget,
-            RenderPassImage::LoadOp::CLEAR,
-            RenderPassImage::StoreOp::STORE,
-            ResourceState::SHADER_RESOURCE,
-            ResourceState::SHADER_RESOURCE
-        ),
-        RenderPassImage::DepthStencil(
-            &depthBuffer,
-            RenderPassImage::LoadOp::CLEAR,
-            RenderPassImage::StoreOp::STORE,
-            ResourceState::DEPTHSTENCIL,
-            ResourceState::DEPTHSTENCIL,
-            ResourceState::DEPTHSTENCIL
-        ),
-    };
-    dev->RenderPassBegin(rp, 3, cmd);
-
-    Viewport vp;
-    vp.width = (float)renderTarget.GetDesc().width;
-    vp.height = (float)renderTarget.GetDesc().height;
-    vp.min_depth = 0.0f;
-    vp.max_depth = 1.0f;
-    dev->BindViewports(1, &vp, cmd);
-
-    Rect scissor = { 0, 0, (int)vp.width, (int)vp.height };
-    dev->BindScissorRects(1, &scissor, cmd);
-
-    dev->BindPipelineState(&pso_, cmd);
-    dev->BindConstantBuffer(&constantBuffer_, 0, cmd);
-    dev->BindResource(&megaQuadBuffer_, 0, cmd);
-    dev->BindResource(&textureArray_, 1, cmd);
-    dev->BindResource(&chunkInfoBuffer_, 2, cmd);
-    dev->BindSampler(&sampler_, 0, cmd);
-
-    for (uint32_t i = 0; i < chunkCount_; i++) {
-        const auto& slot = chunkSlots_[i];
-        if (slot.quadCount == 0) continue;
-
-        XMFLOAT3 aabbMin(
-            (float)(slot.pos.x * CHUNK_SIZE),
-            (float)(slot.pos.y * CHUNK_SIZE),
-            (float)(slot.pos.z * CHUNK_SIZE)
-        );
-        XMFLOAT3 aabbMax(
-            aabbMin.x + CHUNK_SIZE,
-            aabbMin.y + CHUNK_SIZE,
-            aabbMin.z + CHUNK_SIZE
-        );
-        wi::primitive::AABB aabb(aabbMin, aabbMax);
-        if (!frustum.CheckBoxFast(aabb)) continue;
-
-        visibleChunks_++;
-        const auto& info = cpuChunkInfo_[i];
-
-        for (uint32_t f = 0; f < 6; f++) {
-            if (info.faceCounts[f] == 0) continue;
-
-            bool backFacing = false;
-            switch (f) {
-            case 0: backFacing = (camera.Eye.x < aabbMin.x); break;
-            case 1: backFacing = (camera.Eye.x > aabbMax.x); break;
-            case 2: backFacing = (camera.Eye.y < aabbMin.y); break;
-            case 3: backFacing = (camera.Eye.y > aabbMax.y); break;
-            case 4: backFacing = (camera.Eye.z < aabbMin.z); break;
-            case 5: backFacing = (camera.Eye.z > aabbMax.z); break;
-            }
-            if (backFacing) continue;
-
-            VoxelPush pushData = {};
-            pushData.chunkIndex = i;
-            pushData.quadOffset = slot.quadOffset + info.faceOffsets[f];
-            pushData.flags = 0; // CPU mode
-            dev->PushConstants(&pushData, sizeof(pushData), cmd);
-
-            dev->DrawInstanced(info.faceCounts[f] * 6, 1, 0, 0, cmd);
-            drawCalls_++;
-        }
-    }
-
     dev->RenderPassEnd(cmd);
+    visibleChunks_ = chunkCount_;
 }
 
+
 // ── Phase 4: Toping GPU upload + rendering ─────────────────────
 
 void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) {
@@ -2061,48 +1482,74 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) {
         topingInstanceDirty_ = true; // deferred upload in Render()
     }
 
-    // ── Build toping BLAS position data ────────────────────────
-    // Compute world-space positions for all toping instances.
-    // Buffer is pre-allocated once (no per-frame CreateBuffer), data uploaded
-    // in Render() via UpdateBuffer (deferred — needs CommandList).
+    // ── Build draw groups + BLAS group table ───────────────────
+    // Draw groups are built once here from sorted instances and reused by:
+    // 1. renderTopings() — for instanced draw calls
+    // 2. dispatchTopingBLASExtract() — GPU compute shader fills BLAS positions
+    // This replaces the 196ms CPU loop that computed world-space positions.
     const auto& defs = topingSystem.getDefs();
-    uint32_t totalTopingVerts = 0;
-    for (uint32_t i = 0; i < instCount; i++) {
-        const auto& si = topingSorted_[i];
-        if (si.type >= defs.size()) continue;
-        totalTopingVerts += defs[si.type].variants[si.variant].count;
-    }
-
-    if (totalTopingVerts > 0 && !verts.empty()) {
-        // Fill staging buffer with world-space positions
-        topingBLASPositionStaging_.resize(totalTopingVerts * 3);
-        uint32_t outIdx = 0;
-        for (uint32_t i = 0; i < instCount; i++) {
-            const auto& si = topingSorted_[i];
-            if (si.type >= defs.size()) continue;
-            if (si.variant >= 16) continue;
-            const auto& slice = defs[si.type].variants[si.variant];
-            for (uint32_t v = 0; v < slice.count; v++) {
-                if (slice.offset + v >= verts.size()) break;
-                if (outIdx >= totalTopingVerts) break;
-                const auto& vtx = verts[slice.offset + v];
-                topingBLASPositionStaging_[outIdx * 3 + 0] = vtx.px + si.wx;
-                topingBLASPositionStaging_[outIdx * 3 + 1] = vtx.py + si.wy;
-                topingBLASPositionStaging_[outIdx * 3 + 2] = vtx.pz + si.wz;
-                outIdx++;
+    topingDrawGroups_.clear();
+    if (instCount > 0) {
+        TopingDrawGroup g;
+        g.type = topingSorted_[0].type;
+        g.variant = topingSorted_[0].variant;
+        g.instanceOffset = 0;
+        g.instanceCount = 1;
+        g.vertexTemplateOffset = 0;
+        g.vertexCount = 0;
+        if (g.type < defs.size() && g.variant < 16) {
+            const auto& slice = defs[g.type].variants[g.variant];
+            g.vertexTemplateOffset = slice.offset;
+            g.vertexCount = slice.count;
+        }
+        for (uint32_t i = 1; i < instCount; i++) {
+            if (topingSorted_[i].type == g.type && topingSorted_[i].variant == g.variant) {
+                g.instanceCount++;
+            } else {
+                if (g.vertexCount > 0) topingDrawGroups_.push_back(g);
+                g.type = topingSorted_[i].type;
+                g.variant = topingSorted_[i].variant;
+                g.instanceOffset = i;
+                g.instanceCount = 1;
+                g.vertexTemplateOffset = 0;
+                g.vertexCount = 0;
+                if (g.type < defs.size() && g.variant < 16) {
+                    const auto& slice = defs[g.type].variants[g.variant];
+                    g.vertexTemplateOffset = slice.offset;
+                    g.vertexCount = slice.count;
+                }
             }
         }
-        topingBLASVertexCount_ = outIdx;
+        if (g.vertexCount > 0) topingDrawGroups_.push_back(g);
+    }
 
-        // Pre-allocate GPU buffer once; grow only when needed.
-        // No RAY_TRACING flag — BLAS vertex buffers work with SHADER_RESOURCE
-        // (same pattern as blocky blasPositionBuffer_). This allows UpdateBuffer.
-        if (!topingBLASPositionBuffer_.IsValid() || topingBLASPositionCapacity_ < outIdx) {
-            topingBLASPositionCapacity_ = outIdx + outIdx / 4; // 25% headroom
+    // Build GPU group table with prefix sums for the BLAS CS
+    topingBLASGroupsGPU_.resize(topingDrawGroups_.size());
+    uint32_t totalTopingVerts = 0;
+    for (size_t i = 0; i < topingDrawGroups_.size(); i++) {
+        const auto& dg = topingDrawGroups_[i];
+        topingBLASGroupsGPU_[i] = {
+            totalTopingVerts,          // globalVertexOffset (prefix sum)
+            dg.vertexTemplateOffset,
+            dg.vertexCount,
+            dg.instanceOffset,
+            dg.instanceCount
+        };
+        totalTopingVerts += dg.vertexCount * dg.instanceCount;
+    }
+    topingBLASTotalVertices_ = totalTopingVerts;
+    topingBLASVertexCount_ = totalTopingVerts;
+
+    // Pre-allocate GPU BLAS position buffer (UAV+SRV, raw for RWByteAddressBuffer).
+    // Same pattern as blocky blasPositionBuffer_.
+    if (totalTopingVerts > 0) {
+        if (!topingBLASPositionBuffer_.IsValid() || topingBLASPositionCapacity_ < totalTopingVerts) {
+            topingBLASPositionCapacity_ = totalTopingVerts + totalTopingVerts / 4; // 25% headroom
             GPUBufferDesc posDesc;
             posDesc.size = (size_t)topingBLASPositionCapacity_ * 3 * sizeof(float);
-            posDesc.bind_flags = BindFlag::SHADER_RESOURCE;
-            posDesc.misc_flags = ResourceMiscFlag::NONE;
+            posDesc.bind_flags = BindFlag::UNORDERED_ACCESS | BindFlag::SHADER_RESOURCE;
+            posDesc.misc_flags = ResourceMiscFlag::BUFFER_RAW;
+            posDesc.stride = 0;
             posDesc.usage = Usage::DEFAULT;
             device_->CreateBuffer(&posDesc, nullptr, &topingBLASPositionBuffer_);
 
@@ -2127,7 +1574,7 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) {
             topingBLASIndexCount_ = idxCount;
         }
 
-        topingBLASDirty_ = true; // deferred upload + BLAS rebuild in Render()
+        topingBLASDirty_ = true; // GPU compute dispatch + BLAS rebuild in Render()
     }
 }
 
@@ -2192,38 +1639,7 @@ void VoxelRenderer::renderTopings(
     dev->BindResource(&topingInstanceBuffer_, 5, cmd); // t5
     dev->BindSampler(&sampler_, 0, cmd);
 
-    // Build sorted draw groups (same sort order as uploadTopingData)
-    struct DrawGroup {
-        uint16_t type, variant;
-        uint32_t instanceOffset, instanceCount;
-    };
-    struct SortKey { uint16_t type, variant; };
-    std::vector<SortKey> sortedKeys(instances.size());
-    for (size_t i = 0; i < instances.size(); i++) {
-        sortedKeys[i] = { instances[i].topingType, instances[i].variant };
-    }
-    std::sort(sortedKeys.begin(), sortedKeys.end(), [](const SortKey& a, const SortKey& b) {
-        if (a.type != b.type) return a.type < b.type;
-        return a.variant < b.variant;
-    });
-
-    // Identify contiguous groups
-    std::vector<DrawGroup> groups;
-    uint32_t instCount = (uint32_t)std::min(sortedKeys.size(), (size_t)MAX_TOPING_INSTANCES);
-    if (instCount > 0) {
-        DrawGroup g = { sortedKeys[0].type, sortedKeys[0].variant, 0, 1 };
-        for (uint32_t i = 1; i < instCount; i++) {
-            if (sortedKeys[i].type == g.type && sortedKeys[i].variant == g.variant) {
-                g.instanceCount++;
-            } else {
-                groups.push_back(g);
-                g = { sortedKeys[i].type, sortedKeys[i].variant, i, 1 };
-            }
-        }
-        groups.push_back(g);
-    }
-
-    // Issue one DrawInstanced per group
+    // Reuse draw groups built in uploadTopingData (avoids redundant sort)
     topingDrawCalls_ = 0;
     struct TopingPush {
         uint32_t vertexOffset;
@@ -2232,19 +1648,17 @@ void VoxelRenderer::renderTopings(
         uint32_t pad[9];
     };
 
-    for (const auto& g : groups) {
+    for (const auto& g : topingDrawGroups_) {
         if (g.type >= defs.size()) continue;
-        const TopingDef& def = defs[g.type];
-        const MeshSlice& slice = def.variants[g.variant];
-        if (slice.count == 0) continue; // empty mesh (all neighbors present)
+        if (g.vertexCount == 0) continue;
 
         TopingPush pushData = {};
-        pushData.vertexOffset = slice.offset;
+        pushData.vertexOffset = g.vertexTemplateOffset;
         pushData.instanceOffset = g.instanceOffset;
-        pushData.materialID = def.materialID;
+        pushData.materialID = defs[g.type].materialID;
         dev->PushConstants(&pushData, sizeof(pushData), cmd);
 
-        dev->DrawInstanced(slice.count, g.instanceCount, 0, 0, cmd);
+        dev->DrawInstanced(g.vertexCount, g.instanceCount, 0, 0, cmd);
         topingDrawCalls_++;
     }
 
@@ -2596,7 +2010,7 @@ void VoxelRenderPath::handleInput(float dt) {
     // F3: toggle animated terrain
     if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) {
         animatedTerrain_ = !animatedTerrain_;
-        wi::backlog::post(animatedTerrain_ ? "Animation: ON (60 Hz)" : "Animation: OFF");
+        wi::backlog::post(animatedTerrain_ ? "Animation: ON (30 Hz)" : "Animation: OFF");
     }
     // F4: toggle blend debug visualization
     if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F4)) {
@@ -2666,7 +2080,7 @@ void VoxelRenderPath::Update(float dt) {
     windTime_ += dt;
     renderer.windTime_ = windTime_;
 
-    // Animated terrain: regenerate at 60 Hz with time-shifted noise
+    // Animated terrain: regenerate at 30 Hz with time-shifted noise
     // Fused: regenerate + pack voxel data in the same parallel pass
     if (animatedTerrain_ && renderer.isInitialized()) {
         animAccum_ += dt;
@@ -2758,7 +2172,7 @@ void VoxelRenderPath::Render() const {
         CommandList cmd = device->BeginCommandList();
 
         // GPU mesh path: only re-dispatch when voxel data changed
-        if (renderer.gpuMeshEnabled_ && renderer.gpuMesherAvailable_) {
+        if (renderer.gpuMesherAvailable_) {
             // Always readback previous frame's quad count
             uint32_t* countData = (uint32_t*)renderer.meshCounterReadback_.mapped_data;
             if (countData) {
@@ -2786,19 +2200,11 @@ void VoxelRenderPath::Render() const {
                 renderer.gpuSmoothMeshDirty_ = true;
             }
 
-            // ── Deferred toping BLAS position upload (must happen BEFORE BLAS build) ──
-            if (renderer.topingBLASDirty_ && renderer.topingBLASPositionBuffer_.IsValid() &&
-                renderer.topingBLASVertexCount_ > 0 &&
-                !renderer.topingBLASPositionStaging_.empty()) {
-                size_t uploadSize = (size_t)renderer.topingBLASVertexCount_ * 3 * sizeof(float);
-                size_t bufferSize = (size_t)renderer.topingBLASPositionCapacity_ * 3 * sizeof(float);
-                if (uploadSize <= bufferSize) {
-                    device->UpdateBuffer(&renderer.topingBLASPositionBuffer_,
-                        renderer.topingBLASPositionStaging_.data(), cmd, uploadSize);
-                }
-                renderer.rtTopingVertexCount_ = renderer.topingBLASVertexCount_;
-                renderer.rtDirty_ = true; // trigger BLAS/TLAS rebuild
-                renderer.topingBLASDirty_ = false;
+            // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
+            // Must happen BEFORE BLAS build. Fills topingBLASPositionBuffer_ via CS,
+            // sets rtTopingVertexCount_ and rtDirty_ to trigger BLAS/TLAS rebuild.
+            if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid()) {
+                renderer.dispatchTopingBLASExtract(cmd);
             }
 
             // Phase 6.1: BLAS extraction + acceleration structure build
@@ -2810,15 +2216,6 @@ void VoxelRenderPath::Render() const {
             }
         }
 
-        // GPU mesh benchmark state machine (runs once after world gen, CPU path only)
-        if (!renderer.gpuMeshEnabled_) {
-            if (renderer.benchState_ == VoxelRenderer::BenchState::DISPATCH) {
-                renderer.dispatchGpuMeshBenchmark(cmd, world);
-            } else if (renderer.benchState_ == VoxelRenderer::BenchState::READBACK) {
-                renderer.readbackGpuMeshBenchmark();
-            }
-        }
-
         // ── Deferred GPU uploads (dirty flags set in Update(), need CommandList) ──
         if (renderer.topingInstanceDirty_ && renderer.topingInstanceBuffer_.IsValid() &&
             !renderer.topingGpuInsts_.empty()) {
@@ -2938,26 +2335,9 @@ void VoxelRenderPath::Compose(CommandList cmd) const {
     stats += "Chunks: " + std::to_string(renderer.getVisibleChunks())
            + "/" + std::to_string(renderer.getChunkCount()) + "\n";
     stats += "Quads: " + std::to_string(renderer.getTotalQuads()) + "\n";
-    std::string renderMode;
-    if (renderer.isGpuMeshEnabled())
-        renderMode = "GPU mesh (1x1) + DrawInstanced";
-    else if (renderer.isGpuCulling())
-        renderMode = "CPU greedy + MDI + GPU cull";
-    else if (renderer.isMdiEnabled())
-        renderMode = "CPU greedy + MDI + CPU cull";
-    else
-        renderMode = "CPU greedy + DrawInstanced + CPU cull";
     stats += "Draw Calls: " + std::to_string(renderer.getDrawCalls())
-           + " (" + renderMode + ")\n";
-
-    if (renderer.isGpuMeshEnabled()) {
-        stats += "GPU Mesh Quads: " + std::to_string(renderer.getGpuMeshQuadCount()) + "\n";
-    } else {
-        char cullStr[16], drawStr[16];
-        snprintf(cullStr, sizeof(cullStr), "%.3f", renderer.getGpuCullTimeMs());
-        snprintf(drawStr, sizeof(drawStr), "%.3f", renderer.getGpuDrawTimeMs());
-        stats += "GPU Cull: " + std::string(cullStr) + " ms | Draw: " + std::string(drawStr) + " ms\n";
-    }
+           + " (GPU mesh + DrawInstanced)\n";
+    stats += "GPU Mesh Quads: " + std::to_string(renderer.getGpuMeshQuadCount()) + "\n";
     stats += "Topings: " + std::to_string(topingSystem.getInstanceCount())
            + " instances, " + std::to_string(renderer.getTopingDrawCalls())
            + " draws (" + std::to_string(topingSystem.getDefCount()) + " types)\n";
diff --git a/src/voxel/VoxelRenderer.h b/src/voxel/VoxelRenderer.h
index 42f9281..172a186 100644
--- a/src/voxel/VoxelRenderer.h
+++ b/src/voxel/VoxelRenderer.h
@@ -27,7 +27,7 @@ struct GPUChunkInfo {
     uint32_t pad2[2];          // pad to 112 bytes (7 × float4)
 };
 
-// ── Voxel Renderer (Phase 2: mega-buffer + MDI pipeline) ────────
+// ── Voxel Renderer (GPU mesh pipeline) ──────────────────────────
 class VoxelRenderer {
     friend class VoxelRenderPath;
 public:
@@ -58,8 +58,6 @@ public:
     uint32_t getDrawCalls() const { return drawCalls_; }
     uint32_t getChunkCount() const { return chunkCount_; }
     bool isInitialized() const { return initialized_; }
-    bool isGpuCulling() const { return gpuCullingEnabled_; }
-    bool isMdiEnabled() const { return mdiEnabled_; }
 
     bool debugFaceColors_ = false;
     bool debugBlend_ = false;
@@ -67,7 +65,6 @@ public:
 
 private:
     void createPipeline();
-    void rebuildMegaBuffer(VoxelWorld& world);
 
     wi::graphics::GraphicsDevice* device_ = nullptr;
 
@@ -75,8 +72,6 @@ private:
     wi::graphics::Shader vertexShader_;
     wi::graphics::Shader pixelShader_;
     wi::graphics::PipelineState pso_;
-    wi::graphics::Shader cullShader_; // Frustum cull compute shader
-
     // Shaders & Pipeline (topings, Phase 4)
     wi::graphics::Shader topingVS_;
     wi::graphics::Shader topingPS_;
@@ -93,6 +88,29 @@ private:
     std::vector<TopingGPUInst> topingGpuInsts_;
     mutable uint32_t topingDrawCalls_ = 0;
 
+    // ── Toping draw groups (shared between render + BLAS CS) ─────
+    struct TopingDrawGroup { // one contiguous run of topingSorted_ entries sharing (type, variant)
+        uint16_t type, variant; // key copied from the first sorted instance of the run
+        uint32_t instanceOffset, instanceCount; // index of the run's first instance in the sorted list, and the run length
+        uint32_t vertexTemplateOffset, vertexCount; // from TopingDef::variants[]; groups whose vertexCount is 0 are never stored
+    };
+    std::vector<TopingDrawGroup> topingDrawGroups_; // built in uploadTopingData, reused in renderTopings
+
+    // ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
+    wi::graphics::Shader topingBLASShader_;         // voxelTopingBLASCS compute shader
+    struct TopingBLASGroupGPU { // must match HLSL TopingBLASGroup in voxelTopingBLASCS.hlsl (5 x uint32 = 20 bytes)
+        uint32_t globalVertexOffset;    // prefix sum of total vertices before this group (sum of vertexCount * instanceCount)
+        uint32_t vertexTemplateOffset;  // offset into topingVertices (t4)
+        uint32_t vertexCount;           // vertices per instance
+        uint32_t instanceOffset;        // offset into topingInstances (t5)
+        uint32_t instanceCount;         // instances in this group
+    };
+    wi::graphics::GPUBuffer topingBLASGroupBuffer_; // StructuredBuffer<TopingBLASGroupGPU>, SRV t7
+    std::vector<TopingBLASGroupGPU> topingBLASGroupsGPU_; // CPU staging for group table
+    mutable uint32_t topingBLASTotalVertices_ = 0; // total output vertices over all groups; uploadTopingData sets it to the same value as topingBLASVertexCount_
+    static constexpr uint32_t MAX_TOPING_BLAS_GROUPS = 64; // NOTE(review): uploadTopingData sizes the group table from topingDrawGroups_.size() without this cap -- confirm where it is enforced
+    void dispatchTopingBLASExtract(wi::graphics::CommandList cmd) const; // invoked from VoxelRenderPath::Render() while topingBLASDirty_ is set, ahead of the BLAS build
+
     // Shaders & Pipeline (smooth surfaces, Phase 5)
     wi::graphics::Shader smoothVS_;
     wi::graphics::Shader smoothPS_;
@@ -114,9 +132,7 @@ private:
     // ── Mega-buffer architecture (Phase 2) ──────────────────────
     static constexpr uint32_t MEGA_BUFFER_CAPACITY = 2 * 1024 * 1024; // 2M quads max (16 MB)
     static constexpr uint32_t MAX_CHUNKS = 2048;
-    static constexpr uint32_t MAX_DRAWS = MAX_CHUNKS * 6; // up to 6 face groups per chunk
 
-    wi::graphics::GPUBuffer megaQuadBuffer_;    // StructuredBuffer<PackedQuad>, SRV t0
     wi::graphics::GPUBuffer chunkInfoBuffer_;   // StructuredBuffer<GPUChunkInfo>, SRV t2
 
     // CPU-side tracking
@@ -127,27 +143,9 @@ private:
     };
     std::vector<ChunkSlot> chunkSlots_;
     std::vector<GPUChunkInfo> cpuChunkInfo_;
-    std::vector<PackedQuad> cpuMegaQuads_;       // CPU staging for mega-buffer
     uint32_t chunkCount_ = 0;
     bool megaBufferDirty_ = true;
 
-    // ── Indirect draw (Phase 2 MDI) ─────────────────────────────
-    // Wicked Engine's DrawInstancedIndirectCount command signature includes a
-    // push constant (1 × uint32 at b999) BEFORE each D3D12_DRAW_ARGUMENTS.
-    // Total stride = 4 + 16 = 20 bytes per draw entry.
-    struct IndirectDrawArgs {
-        uint32_t pushConstant;              // written to b999[0] by ExecuteIndirect
-        uint32_t vertexCountPerInstance;
-        uint32_t instanceCount;
-        uint32_t startVertexLocation;
-        uint32_t startInstanceLocation;
-    };
-    wi::graphics::GPUBuffer indirectArgsBuffer_;   // IndirectDrawArgs[MAX_DRAWS]
-    wi::graphics::GPUBuffer drawCountBuffer_;      // uint32_t[1]
-    mutable std::vector<IndirectDrawArgs> cpuIndirectArgs_;
-    bool gpuCullingEnabled_ = true;                // Phase 2.3: GPU compute cull (true) vs CPU fallback (false)
-    bool mdiEnabled_ = true;                       // Phase 2.2: MDI rendering with CPU-filled indirect args
-
     // Constants buffer (must match HLSL VoxelCB)
     struct VoxelConstants {
         XMFLOAT4X4 viewProjection;
@@ -184,7 +182,6 @@ private:
     wi::graphics::GPUBuffer gpuQuadCounter_;  // atomic counter for GPU mesh output
     wi::graphics::GPUBuffer meshCounterReadback_; // READBACK buffer for quad counter
     bool gpuMesherAvailable_ = false;
-    bool gpuMeshEnabled_ = true;              // Use GPU meshing instead of CPU greedy
     mutable uint32_t gpuMeshQuadCount_ = 0;   // Readback from previous frame (1-frame delay)
     mutable uint32_t voxelDataCapacity_ = 0;  // Current capacity of voxelDataBuffer_ (in uint32s)
     mutable std::vector<uint32_t> packedVoxelCache_; // cached packed voxel data for all chunks
@@ -216,9 +213,8 @@ private:
     mutable wi::graphics::GPUBuffer topingBLASIndexBuffer_;    // sequential indices for toping BLAS
     mutable uint32_t topingBLASPositionCapacity_ = 0;          // pre-allocated capacity (vertices)
     mutable uint32_t topingBLASIndexCount_ = 0;                // size of toping index buffer
-    mutable bool topingBLASDirty_ = false;                     // deferred BLAS position upload + rebuild
+    mutable bool topingBLASDirty_ = false;                     // GPU compute BLAS extract + rebuild needed
     mutable uint32_t topingBLASVertexCount_ = 0;               // actual vertex count for current frame
-    std::vector<float> topingBLASPositionStaging_;             // CPU staging for deferred upload
     static constexpr uint32_t MAX_BLAS_VERTICES = MEGA_BUFFER_CAPACITY * 6; // 6 verts per quad
     mutable bool rtAvailable_ = false;                    // GPU supports RT
     mutable bool rtDirty_ = true;                         // BLAS/TLAS need rebuild
@@ -252,14 +248,6 @@ private:
                          const wi::graphics::Texture& renderTarget,
                          const wi::graphics::Texture& normalTarget) const;
 
-    // Benchmark state machine: runs once after world gen
-    enum class BenchState { IDLE, DISPATCH, READBACK, DONE };
-    mutable BenchState benchState_ = BenchState::IDLE;
-    mutable float cpuMeshTimeMs_ = 0.0f;
-    mutable uint32_t gpuBaselineQuads_ = 0;
-
-    void dispatchGpuMeshBenchmark(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
-    void readbackGpuMeshBenchmark() const;
     void dispatchGpuMesh(wi::graphics::CommandList cmd, const VoxelWorld& world,
         ProfileAccum* profPack = nullptr, ProfileAccum* profUpload = nullptr,
         ProfileAccum* profDispatch = nullptr) const;
@@ -290,7 +278,7 @@ private:
 public:
     float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
     float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
-    bool isGpuMeshEnabled() const { return gpuMeshEnabled_ && gpuMesherAvailable_; }
+    bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
     uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
 
     // Phase 4: Toping rendering
@@ -364,11 +352,11 @@ private:
     // Wind animation (continuous, always running)
     float windTime_ = 0.0f;
 
-    // Animated terrain (wave effect at 60 Hz, toggled with F3)
+    // Animated terrain (wave effect at 30 Hz, toggled with F3)
     bool animatedTerrain_ = false;
     float animTime_ = 0.0f;
     float animAccum_ = 0.0f;
-    static constexpr float ANIM_INTERVAL = 1.0f / 60.0f; // ~16.7ms = 60 Hz
+    static constexpr float ANIM_INTERVAL = 1.0f / 30.0f; // ~33.3ms = 30 Hz
 
     wi::graphics::Texture voxelRT_;
     wi::graphics::Texture voxelNormalRT_;  // Phase 6: world-space normals for RT shadows/AO