diff --git a/CLAUDE.md b/CLAUDE.md index 54a8e49..438155d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -549,7 +549,12 @@ Système de biseaux décoratifs (« topings ») sur les faces +Y exposées pour - **Smooth BLAS** : single BLAS from `gpuSmoothVertexBuffer_` directly (no extraction needed) - Position at offset 0, stride 32 bytes (SmoothVtx struct) - Same `PREFER_FAST_BUILD` flag -- **TLAS** : 2 instances (blocky + smooth), identity transforms (all positions are world-space) +- **Toping BLAS** : single BLAS from expanded toping vertices (mesh × instances → world-space float3) + - CPU-side expansion in `uploadTopingData()`: iterates sorted instances, transforms local vertices to world positions + - Dedicated `topingBLASPositionBuffer_` + `topingBLASIndexBuffer_` (separate from blocky) + - `PREFER_FAST_TRACE` flag (optimizes BVH traversal, important for 23M tris) + - ~23M triangles for ~153K instances (varies with blade count/segments) +- **TLAS** : 3 instances (blocky + smooth + topings), identity transforms (all positions are world-space) - Instance buffer created via `CreateBuffer2` with pre-filled instance data (callback) - `instance_mask = 0xFF` for both instances - Recreated each rebuild (avoids `UpdateBuffer` on RAY_TRACING flagged buffers) @@ -573,13 +578,21 @@ Système de biseaux décoratifs (« topings ») sur les faces +Y exposées pour - **Compute shader** (`voxelShadowCS.hlsl`) avec inline ray queries (`RayQuery<>`, SM 6.5) - Lit `voxelDepth_` (t0, D32→R32_FLOAT) + `voxelNormalRT_` (t1) + TLAS (t2) - Reconstruit worldPos depuis depth via `inverseViewProjection` (ajouté au VoxelCB) - - Trace un rayon vers le soleil : `L = normalize(-sunDirection.xyz)` + - 3 shadow rays jittered vers le soleil (cone 0.012 rad ≈ 0.7°) pour soft shadows - `RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH` + `RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES` (shadow binaire) - - Normal bias (0.15) pour éviter l'auto-intersection - - Surfaces back-facing (NdotL ≤ 0) : assombries sans ray trace + - Surfaces 
back-facing (NdotL ≤ 0) : assombries sans ray trace (shadowFactor=0.45) - **In-place modulation** : `RWTexture2D` sur `voxelRT_` (u0), chaque thread lit/modifie son pixel (pas de race) - - Shadow factor : `color.rgb *= 0.3` pour les pixels en ombre + - Colored shadows : `lerp(color, color * shadowTint, shadowAmount)` au lieu de simple darkening - `voxelRT_` créé avec `UNORDERED_ACCESS` additionnel pour permettre l'écriture compute +- **Shadow origin bias — PIÈGE MAJEUR** : + - L'origine du shadow ray utilise `shadowOrigin = worldPos` (zéro bias position) au lieu du normal bias de l'AO + - Le normal bias (`worldPos + N * 0.15`) pousse l'origine AU-DESSUS des bases de brins d'herbe → gap entre le brin et son ombre + - Le light bias (`worldPos + L * offset`) crée aussi un gap (proportionnel au bias) + - **TMin adaptatif** résout le dilemme self-hit vs gap : + - `TMin = lerp(0.002, 0.10, 1.0 - abs(N.y))` + - Sol (N.y ≈ 1) → TMin=0.002 : ombres collées aux bases des brins + - Brins d'herbe (N.y ≈ 0) → TMin=0.10 : skip la propre géométrie du brin + - **Screen-space contact shadows ne fonctionnent PAS** pour l'herbe : les brins sont trop fins pour que le delta de profondeur NDC soit distinguable du bruit. 
Testé en 4 itérations (NDC comparison, world-space reconstruction, height filter) — toutes échouent - **Dispatch** : 8×8 thread groups, `ceil(w/8) × ceil(h/8)`, après les 3 render passes (blocky+topings+smooth) - **Barriers** : - Pre : `voxelDepth_` DEPTHSTENCIL→SHADER_RESOURCE + `voxelRT_` SHADER_RESOURCE→UAV @@ -591,7 +604,7 @@ Système de biseaux décoratifs (« topings ») sur les faces +Y exposées pour #### Phase 6.3 - RT AO [FAIT] -- **Intégré dans `voxelShadowCS.hlsl`** : 8 rayons hémisphère cosine-weighted par pixel + 1 rayon soleil +- **Intégré dans `voxelShadowCS.hlsl`** : 4 rayons AO hémisphère cosine-weighted + 3 rayons shadow jittered par pixel (7 total) - **Distance-weighted AO** : `(1 - hitT/aoRadius)²` — falloff quadratique, valeurs continues au lieu de binaire hit/miss - **Interleaved Gradient Noise (IGN)** : remplace le hash world-space pour le sampling. Bruit structuré screen-space avec excellentes propriétés spectrales (Jorge Jimenez, 2014) - **Cranley-Patterson rotation** : `frac(IGN + frameIndex * φ)` — chaque frame explore de nouvelles directions de rayons. 
Golden ratio (φ ≈ 0.618) assure une couverture maximale de l'hémisphère au fil des frames diff --git a/shaders/voxelShadowCS.hlsl b/shaders/voxelShadowCS.hlsl index 8ec3bde..624153a 100644 --- a/shaders/voxelShadowCS.hlsl +++ b/shaders/voxelShadowCS.hlsl @@ -99,12 +99,17 @@ void main(uint3 DTid : SV_DispatchThreadID) { float3 worldPos = worldPos4.xyz / worldPos4.w; float3 N = normalTexture[DTid.xy].xyz; - float3 origin = worldPos + N * push.normalBias; + // Two bias strategies: normal-bias for AO (hemisphere rays), light-bias for shadows + float3 aoOrigin = worldPos + N * push.normalBias; // push along normal for AO self-avoidance // ── Soft shadow: multiple jittered rays toward sun ───────── float3 L = normalize(-sunDirection.xyz); float NdotL = dot(N, L); + // Shadow origin: bias along L (not N) so grass blade bases aren't skipped + // Minimal bias to reduce gap between blade base and its shadow + float3 shadowOrigin = worldPos; + float shadowFactor = 1.0; if (NdotL <= 0.0) { shadowFactor = 0.45; // back-facing = fully in shadow @@ -113,9 +118,10 @@ void main(uint3 DTid : SV_DispatchThreadID) { float3 sunT, sunB; buildBasis(L, sunT, sunB); - // 2 shadow rays with IGN-based jitter (soft penumbra, temporally accumulated) - const uint shadowRays = 2; - const float coneAngle = 0.04; // ~2.3° cone = soft sun + // 3 shadow rays with tight jitter (sharper shadows for thin geometry like grass) + // Softness comes from temporal accumulation over ~20 frames, not cone spread + const uint shadowRays = 3; + const float coneAngle = 0.012; // ~0.7° cone = sharp but not pixel-perfect float shadowHits = 0; float ignBase = interleavedGradientNoise(float2(DTid.xy)); float frameRot = float(push.frameIndex) * GOLDEN_RATIO; @@ -132,9 +138,11 @@ void main(uint3 DTid : SV_DispatchThreadID) { float3 jitteredL = normalize(L + r * cos(phi) * sunT + r * sin(phi) * sunB); RayDesc ray; - ray.Origin = origin; + ray.Origin = shadowOrigin; ray.Direction = jitteredL; - ray.TMin = 0.01; + // 
Adaptive TMin: tight for ground (N.y≈1) to catch blade bases, + // larger for blade surfaces (N.y≈0) to skip own geometry + ray.TMin = lerp(0.002, 0.10, 1.0 - abs(N.y)); ray.TMax = push.shadowMaxDist; RayQuery q; @@ -179,7 +187,7 @@ void main(uint3 DTid : SV_DispatchThreadID) { float3 dir = cosineSampleHemisphere(u1, u2, N, T, B); RayDesc aoRay; - aoRay.Origin = origin; + aoRay.Origin = aoOrigin; aoRay.Direction = dir; aoRay.TMin = 0.05; aoRay.TMax = push.aoRadius; diff --git a/src/voxel/VoxelRenderer.cpp b/src/voxel/VoxelRenderer.cpp index 6b004c7..2bb5e3f 100644 --- a/src/voxel/VoxelRenderer.cpp +++ b/src/voxel/VoxelRenderer.cpp @@ -1103,10 +1103,44 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { rtSmoothVertexCount_ = smoothVertCount; } - // ── Toping BLAS: SKIPPED ───────────────────────────────────── - // Topings generate 23M+ tris which massively slows ray traversal - // for negligible shadow contribution (blades too thin). - // Toping BLAS build and TLAS inclusion both disabled for performance. 
+ // ── Toping BLAS ────────────────────────────────────────────── + uint32_t topingVertCount = rtTopingVertexCount_; + if (topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) { + if (!topingBLAS_.IsValid() || topingBLAS_.desc.bottom_level.geometries.empty() || + topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count != topingVertCount) { + + RaytracingAccelerationStructureDesc desc; + desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL; + desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_TRACE; // optimize traversal + + desc.bottom_level.geometries.resize(1); + auto& geom = desc.bottom_level.geometries[0]; + geom.type = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::Type::TRIANGLES; + geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE; + geom.triangles.vertex_buffer = topingBLASPositionBuffer_; + geom.triangles.vertex_byte_offset = 0; + geom.triangles.vertex_count = topingVertCount; + geom.triangles.vertex_stride = sizeof(float) * 3; + geom.triangles.vertex_format = Format::R32G32B32_FLOAT; + geom.triangles.index_buffer = topingBLASIndexBuffer_; + geom.triangles.index_count = topingVertCount; + geom.triangles.index_format = IndexBufferFormat::UINT32; + geom.triangles.index_offset = 0; + + bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &topingBLAS_); + if (ok) { + dev->SetName(&topingBLAS_, "VoxelRenderer::topingBLAS"); + wi::backlog::post("VoxelRenderer: toping BLAS created (" + + std::to_string(topingVertCount / 3) + " tris)"); + } else { + wi::backlog::post("VoxelRenderer: failed to create toping BLAS", wi::backlog::LogLevel::Error); + } + } + + if (topingBLAS_.IsValid()) { + dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr); + } + } // ── Memory barrier: sync BLAS builds before TLAS ────────────── // Without this, TLAS build can execute before BLASes are complete. 
@@ -1116,15 +1150,14 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { dev->Barrier(barriers, 1, cmd); } - // ── TLAS (2 instances: blocky + smooth) ───────────────────── - // Topings excluded from TLAS: 23M+ tris slows all ray traversal - // for negligible shadow contribution (blades too thin). + // ── TLAS (up to 3 instances: blocky + smooth + topings) ──── // Always recreate TLAS with pre-filled instance data via CreateBuffer2. // RAY_TRACING instance buffers have special resource state requirements, // so UpdateBuffer (CopyBufferRegion) would crash on state mismatch. uint32_t instanceCount = 0; if (blockyBLAS_.IsValid()) instanceCount++; if (smoothBLAS_.IsValid() && smoothVertCount > 0) instanceCount++; + if (topingBLAS_.IsValid() && topingVertCount >= 3) instanceCount++; if (instanceCount == 0) { rtDirty_ = false; return; } const size_t instSize = dev->GetTopLevelAccelerationStructureInstanceSize(); @@ -1140,6 +1173,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { // Capture BLAS pointers for the lambda (can't capture member references) const RaytracingAccelerationStructure* blockyBLASPtr = blockyBLAS_.IsValid() ? &blockyBLAS_ : nullptr; const RaytracingAccelerationStructure* smoothBLASPtr = (smoothBLAS_.IsValid() && smoothVertCount > 0) ? &smoothBLAS_ : nullptr; + const RaytracingAccelerationStructure* topingBLASPtr = (topingBLAS_.IsValid() && topingVertCount >= 3) ? &topingBLAS_ : nullptr; // Create TLAS with instance data pre-filled in the creation callback. // This avoids any UpdateBuffer on RAY_TRACING flagged buffers. 
@@ -1180,7 +1214,17 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const { idx++; } - // Topings excluded from TLAS for performance (23M+ tris, negligible shadows) + if (topingBLASPtr) { + RaytracingAccelerationStructureDesc::TopLevel::Instance inst; + setIdentity(inst.transform); + inst.instance_id = 2; + inst.instance_mask = 0xFF; + inst.instance_contribution_to_hit_group_index = 0; + inst.flags = 0; + inst.bottom_level = topingBLASPtr; + dev->WriteTopLevelAccelerationStructureInstance(&inst, (uint8_t*)dest + idx * instSize); + idx++; + } }; bool ok = dev->CreateBuffer2(&bufdesc, initInstances, &desc.top_level.instance_buffer); @@ -1264,7 +1308,7 @@ void VoxelRenderer::dispatchShadows(CommandList cmd, pushData.shadowMaxDist = 512.0f; pushData.debugMode = rtShadowDebug_; pushData.aoRadius = 8.0f; - pushData.aoRayCount = 8; + pushData.aoRayCount = 4; pushData.aoStrength = 0.7f; pushData.frameIndex = frameCounter_++; pushData.historyValid = aoHistoryValid_ ? 1u : 0u; @@ -2003,7 +2047,62 @@ void VoxelRenderer::uploadTopingData(const TopingSystem& topingSystem) { ibDesc.usage = Usage::DEFAULT; device_->CreateBuffer(&ibDesc, topingGpuInsts_.data(), &topingInstanceBuffer_); - // Toping BLAS: SKIPPED (23M+ tris slows RT for negligible shadow contribution) + // ── Build toping BLAS position buffer ─────────────────────── + // Expand (mesh vertices × instances) → world-space float3 positions. 
+ const auto& defs = topingSystem.getDefs(); + uint32_t totalTopingVerts = 0; + for (uint32_t i = 0; i < instCount; i++) { + const auto& si = topingSorted_[i]; + if (si.type >= defs.size()) continue; + totalTopingVerts += defs[si.type].variants[si.variant].count; + } + + if (totalTopingVerts > 0 && !verts.empty()) { + std::vector positions(totalTopingVerts * 3); // float3 per vertex + uint32_t outIdx = 0; + for (uint32_t i = 0; i < instCount; i++) { + const auto& si = topingSorted_[i]; + if (si.type >= defs.size()) continue; + const auto& slice = defs[si.type].variants[si.variant]; + for (uint32_t v = 0; v < slice.count; v++) { + const auto& vtx = verts[slice.offset + v]; + positions[outIdx * 3 + 0] = vtx.px + si.wx; + positions[outIdx * 3 + 1] = vtx.py + si.wy; + positions[outIdx * 3 + 2] = vtx.pz + si.wz; + outIdx++; + } + } + + // Create position buffer + GPUBufferDesc posDesc; + posDesc.size = totalTopingVerts * sizeof(float) * 3; + posDesc.bind_flags = BindFlag::SHADER_RESOURCE; + posDesc.misc_flags = ResourceMiscFlag::RAY_TRACING; + posDesc.usage = Usage::DEFAULT; + device_->CreateBuffer(&posDesc, positions.data(), &topingBLASPositionBuffer_); + + // Create sequential index buffer (Wicked requires valid index buffer for BLAS) + if (topingBLASIndexCount_ < totalTopingVerts) { + std::vector indices(totalTopingVerts); + for (uint32_t j = 0; j < totalTopingVerts; j++) indices[j] = j; + + GPUBufferDesc idxDesc; + idxDesc.size = totalTopingVerts * sizeof(uint32_t); + idxDesc.bind_flags = BindFlag::SHADER_RESOURCE; + idxDesc.misc_flags = ResourceMiscFlag::RAY_TRACING; + idxDesc.usage = Usage::DEFAULT; + device_->CreateBuffer(&idxDesc, indices.data(), &topingBLASIndexBuffer_); + topingBLASIndexCount_ = totalTopingVerts; + } + + rtTopingVertexCount_ = totalTopingVerts; + rtDirty_ = true; + + char msg[128]; + snprintf(msg, sizeof(msg), "Toping BLAS: %u vertices (%u tris)", + totalTopingVerts, totalTopingVerts / 3); + wi::backlog::post(msg); + } } void 
VoxelRenderer::renderTopings(