bvle-voxels/shaders/voxelShadowCS.hlsl
Samuel Bouchet dac63e3be5 Phase 6.2: toping BLAS shadows + adaptive TMin + perf optimization
- Re-enable toping BLAS in TLAS (3 instances: blocky + smooth + topings)
  with PREFER_FAST_TRACE for optimized BVH traversal (23M tris)
- Separate shadow/AO ray origins: shadow uses worldPos directly (zero bias),
  AO keeps normal bias (0.15) for hemisphere self-avoidance
- Adaptive TMin solves self-hit vs gap dilemma:
  ground (N.y≈1) → TMin=0.002 for tight blade shadows,
  blade surfaces (N.y≈0) → TMin=0.10 to skip own geometry
- Shadow rays 4→3 with tight cone (0.012 rad), AO rays 8→4
  (7 total rays/pixel, temporal accumulation compensates)
- Remove screen-space contact shadows (doesn't work for thin geometry)
2026-03-30 13:58:57 +02:00

249 lines
10 KiB
HLSL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// BVLE Voxels - RT Shadow + AO Compute Shader (Phase 6.2 + 6.3)
// Per-pixel: traces 3 jittered shadow rays toward the sun + N hemisphere rays for AO.
// Temporal accumulation: blends current AO with reprojected history.
// Modulates voxelRT_ in-place via RWTexture2D.
#include "voxelCommon.hlsli"
// SRV bindings (read-only inputs)
Texture2D<float> depthTexture : register(t0); // voxelDepth_ (D32_FLOAT as R32_FLOAT SRV); a value of exactly 0.0 is treated as "no geometry" by main()
Texture2D<float4> normalTexture : register(t1); // voxelNormalRT_ (R16G16B16A16_SNORM); only .xyz is read, and it is used without renormalization
RaytracingAccelerationStructure tlas : register(t2); // TLAS traced by both shadow and AO rays (blocky + smooth + toping instances since Phase 6.2)
Texture2D<float> aoHistory : register(t3); // previous frame's AO (temporally accumulated), sampled at the reprojected pixel
// UAV outputs
RWTexture2D<float4> colorOutput : register(u0); // voxelRT_ (read-modify-write: shadow tint applied in-place, or debug colors)
RWTexture2D<float> aoOutput : register(u1); // AO factor after the temporal blend (1.0 = unoccluded); blurred separately
// Push constants
// Per-dispatch parameters for the shadow + AO pass. Field order and types define the
// binary layout shared with the host, so they must not be reordered.
struct ShadowPush {
uint width; // render target width in pixels; threads with DTid.x >= width exit immediately
uint height; // render target height in pixels; threads with DTid.y >= height exit immediately
float normalBias; // offset along the surface normal applied to the AO ray origin only (shadow rays start at the unbiased position)
float shadowMaxDist; // TMax of the shadow rays
uint debugMode; // 0=normal, 1=debug shadows (blue/red/green classification), 2=debug AO (color forced to white)
float aoRadius; // max distance for AO rays (e.g. 8.0 voxels); also the falloff normalization distance
uint aoRayCount; // number of hemisphere rays (e.g. 6); 0 disables AO tracing (factor stays 1.0)
float aoStrength; // darkening at full occlusion: AO factor = 1 - occlusion * aoStrength (e.g. 0.35 -> factor 0.65, i.e. 35% darkening)
uint frameIndex; // for temporal rotation of noise pattern
uint historyValid; // 0 = no history (first frame), 1 = blend with history
uint pad[2]; // not read by this shader. NOTE(review): constant-buffer packing may give array elements a 16-byte stride — confirm against the host-side struct
};
// The block is declared as a Vulkan push constant; register(b999) is presumably the matching
// root-constant slot declared by VOXEL_ROOTSIG on D3D12 — TODO confirm.
[[vk::push_constant]] ConstantBuffer<ShadowPush> push : register(b999);
// ── Interleaved Gradient Noise (Jorge Jimenez, 2014) ────────────
// Maps a pixel coordinate to a scalar in [0, 1). Purely a function of the
// coordinate: callers add their own per-frame / per-ray offsets and re-wrap
// with frac() (Cranley-Patterson rotation) to vary the pattern over time.
float interleavedGradientNoise(float2 pixelCoord) {
    const float2 gradient = float2(0.06711056, 0.00583715);
    const float wrappedDot = frac(dot(pixelCoord, gradient));
    return frac(52.9829189 * wrappedDot);
}
// Cranley-Patterson rotation step. Note the value is the fractional part of the
// golden ratio (φ − 1 = 1/φ ≈ 0.618), not φ itself (≈ 1.618); every use in this
// file feeds it through frac(), where only that fractional part matters.
static const float GOLDEN_RATIO = 0.618033988749895;
// ── Hash (kept for voxel-coord seed) ────────────────────────────
// Folds two 32-bit values into one: b is spread with a multiplicative constant
// and XOR-ed into a, followed by one xor-shift and one multiply. All arithmetic
// wraps modulo 2^32. Not referenced by the code visible in this file.
uint hashU(uint a, uint b) {
    uint mixed = a ^ (b * 0x9E3779B9u);
    mixed ^= (mixed >> 16);
    return mixed * 0x45d9f3bu;
}
// Scrambles a 32-bit integer (xor-shift, multiply, xor-shift) and maps the low
// 24 bits of the result to a float in [0, 1] (1.0 is reached when all 24 bits are set).
float hashF(uint x) {
    uint bits = x ^ (x >> 16);
    bits *= 0x45d9f3bu;
    bits ^= (bits >> 16);
    // 24 bits fit exactly in a float mantissa, so the numerator converts without rounding.
    const uint low24 = bits & 0xFFFFFF;
    return float(low24) / float(0xFFFFFF);
}
// Builds a tangent frame (T, B) around N using Frisvad's branch-on-z construction.
//   N : axis of the frame; expected to be unit length (it is not normalized here)
//   T : out, first tangent
//   B : out, second tangent
// The closed form divides by (1 + N.z), so directions very close to (0, 0, -1)
// take a fixed hard-coded frame instead.
void buildBasis(float3 N, out float3 T, out float3 B) {
    const bool nearNegativeZ = (N.z < -0.9999);
    if (!nearNegativeZ) {
        const float invOnePlusZ = 1.0 / (1.0 + N.z);
        const float xyTerm = -N.x * N.y * invOnePlusZ;
        T = float3(1.0 - N.x * N.x * invOnePlusZ, xyTerm, -N.x);
        B = float3(xyTerm, 1.0 - N.y * N.y * invOnePlusZ, -N.y);
    } else {
        // Singular case: N ≈ -Z.
        T = float3(0, -1, 0);
        B = float3(-1, 0, 0);
    }
}
// Cosine-weighted hemisphere sample (probability ∝ cos(θ)) around N.
//   u1, u2  : random numbers, expected in [0, 1)
//   N, T, B : frame of the hemisphere (N is the pole), e.g. from buildBasis()
// A point is picked on the unit disk (radius sqrt(u1), angle 2π·u2) and lifted
// onto the hemisphere; the result is renormalized before returning.
float3 cosineSampleHemisphere(float u1, float u2, float3 N, float3 T, float3 B) {
    const float diskRadius = sqrt(u1);
    const float diskAngle = 6.28318530718 * u2;
    const float tangentX = diskRadius * cos(diskAngle);
    const float tangentY = diskRadius * sin(diskAngle);
    // max() guards the sqrt against a slightly negative argument when u1 ≈ 1.
    const float height = sqrt(max(0.0, 1.0 - u1));
    return normalize(tangentX * T + tangentY * B + height * N);
}
// Light multiplier for pixels that are fully shadowed or face away from the sun
// (1.0 = fully lit). Also bounds the shadow tint blend: amount = 1 - factor <= 0.55.
static const float SHADOW_FLOOR = 0.45;

// frameIndex is wrapped to this power-of-two period before it is turned into a
// float noise offset. Without the wrap, float(frameIndex) * GOLDEN_RATIO grows
// without bound and loses its fractional bits (the only part frac() keeps), so
// the per-pixel noise would become coarsely quantized in long sessions.
static const uint NOISE_FRAME_PERIOD = 1024u;

// Per-frame Cranley-Patterson offset shared by the shadow and AO samplers.
// Identical to float(frameIndex) * GOLDEN_RATIO for the first NOISE_FRAME_PERIOD frames.
float noiseFrameRotation() {
    return float(push.frameIndex & (NOISE_FRAME_PERIOD - 1u)) * GOLDEN_RATIO;
}

// Traces one inline ray against the TLAS, stopping at the first triangle hit.
//   origin/direction : ray in world space
//   tMin/tMax        : valid ray interval
//   hitT             : out, distance of the committed hit, or tMax on a miss
// Returns true when a triangle was hit.
bool traceOccluder(float3 origin, float3 direction, float tMin, float tMax, out float hitT) {
    RayDesc ray;
    ray.Origin = origin;
    ray.Direction = direction;
    ray.TMin = tMin;
    ray.TMax = tMax;

    RayQuery<RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES | RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH> query;
    query.TraceRayInline(tlas, 0, 0xFF, ray);
    [loop] while (query.Proceed()) {}

    hitT = tMax;
    if (query.CommittedStatus() == COMMITTED_TRIANGLE_HIT) {
        hitT = query.CommittedRayT();
        return true;
    }
    return false;
}

// Sun visibility for one pixel as a light multiplier in [SHADOW_FLOOR, 1].
//   pixel    : pixel coordinate (noise seed)
//   worldPos : reconstructed surface position; used as the ray origin with no bias
//   N        : surface normal
//   L        : unit direction toward the sun
//   NdotL    : dot(N, L)
//   frameRot : per-frame noise offset from noiseFrameRotation()
float sunShadowFactor(uint2 pixel, float3 worldPos, float3 N, float3 L, float NdotL, float frameRot) {
    // Surfaces facing away from the sun get the shadow floor without tracing.
    if (NdotL <= 0.0) {
        return SHADOW_FLOOR;
    }

    // Basis around the sun direction for the jitter cone.
    float3 sunT, sunB;
    buildBasis(L, sunT, sunB);

    // 3 rays in a tight cone keep shadows of thin geometry (grass blades) sharp.
    // The jitter changes every frame; this shader does not accumulate the shadow
    // term itself, so any temporal smoothing is assumed to happen downstream — TODO confirm.
    const uint shadowRays = 3;
    const float coneAngle = 0.012; // ~0.7° cone = sharp but not pixel-perfect

    // Adaptive TMin replaces an origin bias: tight for ground (|N.y|≈1) so blade
    // bases still cast shadows, larger for blade surfaces (N.y≈0) to skip their
    // own geometry. It depends only on N, so it is computed once for all rays.
    const float tMin = lerp(0.002, 0.10, 1.0 - abs(N.y));

    const float ignBase = interleavedGradientNoise(float2(pixel));
    float shadowHits = 0.0;

    [loop]
    for (uint si = 0; si < shadowRays; si++) {
        // Per-ray noise with temporal variation.
        float xi1 = frac(ignBase + frameRot + float(si) * GOLDEN_RATIO);
        float xi2 = frac(ignBase * 1.7 + frameRot * 0.7 + float(si) * 0.3819);

        // Uniform disk → direction inside the cone.
        float r = sqrt(xi1) * coneAngle;
        float phi = 6.28318530718 * xi2;
        float3 jitteredL = normalize(L + r * cos(phi) * sunT + r * sin(phi) * sunB);

        float unusedHitT;
        if (traceOccluder(worldPos, jitteredL, tMin, push.shadowMaxDist, unusedHitT)) {
            shadowHits += 1.0;
        }
    }

    float shadowAmount = shadowHits / float(shadowRays); // 0 = fully lit, 1 = fully shadowed
    return lerp(1.0, SHADOW_FLOOR, shadowAmount);
}

// Ambient occlusion for one pixel as a multiplier (1.0 = unoccluded).
// Traces push.aoRayCount cosine-weighted hemisphere rays of length push.aoRadius;
// each hit contributes (1 - t/aoRadius)^2 so nearby occluders weigh more.
//   pixel    : pixel coordinate (noise seed)
//   worldPos : reconstructed surface position
//   N        : surface normal (hemisphere pole)
//   frameRot : per-frame noise offset from noiseFrameRotation()
float hemisphereAO(uint2 pixel, float3 worldPos, float3 N, float frameRot) {
    const uint rayCount = push.aoRayCount;
    if (rayCount == 0) {
        return 1.0;
    }

    // AO rays start slightly above the surface so the hemisphere does not self-hit.
    const float3 aoOrigin = worldPos + N * push.normalBias;

    float3 T, B;
    buildBasis(N, T, B);

    float totalOcclusion = 0.0;
    [loop]
    for (uint i = 0; i < rayCount; i++) {
        // Each ray reads IGN at a different pixel offset to decorrelate the rays,
        // then shifts it per frame and per ray (Cranley-Patterson rotation).
        float2 rayPixel = float2(pixel) + float2(i * 7.0, i * 3.0);
        float u1 = frac(interleavedGradientNoise(rayPixel)
                        + frameRot + float(i) * GOLDEN_RATIO);
        float u2 = frac(interleavedGradientNoise(rayPixel + float2(47.0, 17.0))
                        + frameRot + float(i) * 0.381966011250105); // 1/φ²

        float3 dir = cosineSampleHemisphere(u1, u2, N, T, B);

        float hitT;
        if (traceOccluder(aoOrigin, dir, 0.05, push.aoRadius, hitT)) {
            float falloff = 1.0 - saturate(hitT / push.aoRadius);
            totalOcclusion += falloff * falloff;
        }
    }

    float occlusionRatio = totalOcclusion / float(rayCount);
    return 1.0 - occlusionRatio * push.aoStrength;
}

// Blends this frame's AO with the previous frame's value at the reprojected pixel.
// Returns currentAO unchanged when there is no usable history: historyValid is 0,
// the point was behind the previous camera, or it reprojects off-screen.
// NOTE(review): there is no depth/normal disocclusion test, so newly revealed
// surfaces still inherit whatever history sits at the reprojected pixel.
float accumulateAO(float3 worldPos, float currentAO) {
    if (push.historyValid == 0) {
        return currentAO;
    }

    float4 prevClip = mul(prevViewProjection, float4(worldPos, 1.0));

    // A point behind the previous camera has w <= 0; the perspective divide would
    // mirror it into apparently valid screen coordinates and pull in unrelated history.
    if (prevClip.w <= 0.0) {
        return currentAO;
    }

    float2 prevNDC = prevClip.xy / prevClip.w;
    float2 prevUV = float2(prevNDC.x * 0.5 + 0.5, 0.5 - prevNDC.y * 0.5);

    const bool onScreen = prevUV.x >= 0.0 && prevUV.x < 1.0
                       && prevUV.y >= 0.0 && prevUV.y < 1.0;
    if (!onScreen) {
        return currentAO;
    }

    int2 prevPixel = int2(prevUV * float2(push.width, push.height));
    float historyAO = aoHistory.Load(int3(prevPixel, 0));

    // Low alpha = keep more history (smoother), high alpha = more responsive.
    const float blendAlpha = 0.05; // accumulate ~20 frames
    return lerp(historyAO, currentAO, blendAlpha);
}

// Entry point: one thread per pixel.
// Reads depth + normal, reconstructs the world position, traces sun-shadow and AO
// rays against the TLAS, writes the temporally blended AO to aoOutput and applies
// the shadow tint to colorOutput in place (or writes debug colors, see push.debugMode).
// inverseViewProjection, prevViewProjection, sunDirection and shadowTint are not
// declared in this file; presumably they come from voxelCommon.hlsli.
[RootSignature(VOXEL_ROOTSIG)]
[numthreads(8, 8, 1)]
void main(uint3 DTid : SV_DispatchThreadID) {
    const uint2 pixel = DTid.xy;
    if (pixel.x >= push.width || pixel.y >= push.height) return;

    // Depth of exactly 0.0 means nothing was rendered here (presumably the cleared
    // far value of a reverse-Z buffer — TODO confirm): no occlusion, color untouched
    // unless a debug view is active.
    const float depth = depthTexture[pixel];
    if (depth == 0.0) {
        aoOutput[pixel] = 1.0;
        if (push.debugMode > 0) colorOutput[pixel] = float4(0.1, 0.1, 0.1, 1);
        return;
    }

    // Reconstruct world position from depth via inverse VP (pixel center, Y flipped).
    float2 uv = (float2(pixel) + 0.5) / float2(push.width, push.height);
    float2 ndc = float2(uv.x * 2.0 - 1.0, (1.0 - uv.y) * 2.0 - 1.0);
    float4 worldPos4 = mul(inverseViewProjection, float4(ndc, depth, 1.0));
    float3 worldPos = worldPos4.xyz / worldPos4.w;

    float3 N = normalTexture[pixel].xyz;
    float3 L = normalize(-sunDirection.xyz);
    float NdotL = dot(N, L);

    const float frameRot = noiseFrameRotation();

    // ── Soft shadow: jittered rays toward the sun ───────────────
    float shadowFactor = sunShadowFactor(pixel, worldPos, N, L, NdotL, frameRot);

    // ── AO: hemisphere rays + temporal accumulation ─────────────
    float aoFactor = hemisphereAO(pixel, worldPos, N, frameRot);
    aoFactor = accumulateAO(worldPos, aoFactor);

    // ── Write AO to separate buffer (will be blurred), apply shadow in-place ──
    aoOutput[pixel] = aoFactor;

    if (push.debugMode == 1) {
        // Shadow classification: blue = facing away from sun, red = at least one
        // shadow ray blocked, green = fully lit.
        if (NdotL <= 0.0)
            colorOutput[pixel] = float4(0, 0, 0.5, 1);
        else if (shadowFactor < 1.0)
            colorOutput[pixel] = float4(1, 0, 0, 1);
        else
            colorOutput[pixel] = float4(0, 1, 0, 1);
    } else if (push.debugMode == 2) {
        // AO debug: color is forced to white here; the AO term itself only lives in
        // aoOutput, so presumably a later pass multiplies it in — TODO confirm.
        colorOutput[pixel] = float4(1, 1, 1, 1);
    } else {
        float4 color = colorOutput[pixel];
        // Colored shadows: blend toward (color × shadowTint) instead of just darkening.
        // shadowAmount is 0 when lit and at most 1 - SHADOW_FLOOR (0.55) in full shadow.
        float shadowAmount = 1.0 - shadowFactor;
        float3 tintedColor = color.rgb * shadowTint.rgb;
        color.rgb = lerp(color.rgb, tintedColor, shadowAmount);
        colorOutput[pixel] = color;
    }
}