GPU profiling + staggered BLAS builds + RT disable during animation
- Add comprehensive GPU timestamp queries for all major operations (mesh, smooth mesh, BLAS extract, BLAS build, draw, RT shadows)
- Add full-frame profiling: Wicked Render, GPU Wait/Sync, true FPS
- Stagger BLAS builds during animation: alternate blocky/smooth per frame, skip toping BLAS entirely (~130ms savings per frame)
- Auto-disable RT shadows on F3 animation start (prevents stale shadow artifacts), auto-restore on F3 stop with full BLAS rebuild
- Split buildAccelerationStructures() with selective build flags
- Result: animation ~24 FPS (CPU-bound on Regenerate 27ms) vs previous 2 FPS (GPU-bound on BLAS Build 1368ms)
This commit is contained in:
parent
0d3f8200b4
commit
0d93cef8f1
2 changed files with 237 additions and 74 deletions
|
|
@ -818,18 +818,16 @@ void VoxelRenderer::dispatchTopingBLASExtract(CommandList cmd) const {
|
||||||
topingBLASDirty_ = false;
|
topingBLASDirty_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
void VoxelRenderer::buildAccelerationStructures(CommandList cmd,
|
||||||
|
uint32_t buildFlags) const {
|
||||||
if (!rtAvailable_) return;
|
if (!rtAvailable_) return;
|
||||||
|
|
||||||
auto* dev = device_;
|
auto* dev = device_;
|
||||||
|
|
||||||
// ── Blocky BLAS ──────────────────────────────────────────────
|
// ── Blocky BLAS ──────────────────────────────────────────────
|
||||||
uint32_t blockyVertCount = rtBlockyVertexCount_;
|
uint32_t blockyVertCount = rtBlockyVertexCount_;
|
||||||
if (blockyVertCount < 3) blockyVertCount = 0; // Need at least 1 triangle
|
if (blockyVertCount < 3) blockyVertCount = 0;
|
||||||
if (blockyVertCount > 0 && blasPositionBuffer_.IsValid()) {
|
if ((buildFlags & RT_BUILD_BLOCKY) && blockyVertCount > 0 && blasPositionBuffer_.IsValid()) {
|
||||||
// Only (re)create BLAS when vertex count exceeds allocated capacity.
|
|
||||||
// CreateRaytracingAccelerationStructure allocates GPU memory — calling it per-frame leaks VRAM.
|
|
||||||
// We allocate with headroom and update desc.vertex_count before each Build.
|
|
||||||
if (!blockyBLAS_.IsValid() || blockyVertCount > blockyBLASCapacity_) {
|
if (!blockyBLAS_.IsValid() || blockyVertCount > blockyBLASCapacity_) {
|
||||||
blockyBLASCapacity_ = blockyVertCount + blockyVertCount / 4; // 25% headroom
|
blockyBLASCapacity_ = blockyVertCount + blockyVertCount / 4; // 25% headroom
|
||||||
|
|
||||||
|
|
@ -843,7 +841,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE;
|
geom.flags = RaytracingAccelerationStructureDesc::BottomLevel::Geometry::FLAG_OPAQUE;
|
||||||
geom.triangles.vertex_buffer = blasPositionBuffer_;
|
geom.triangles.vertex_buffer = blasPositionBuffer_;
|
||||||
geom.triangles.vertex_byte_offset = 0;
|
geom.triangles.vertex_byte_offset = 0;
|
||||||
geom.triangles.vertex_count = blockyBLASCapacity_; // allocate for capacity
|
geom.triangles.vertex_count = blockyBLASCapacity_;
|
||||||
geom.triangles.vertex_stride = sizeof(float) * 3;
|
geom.triangles.vertex_stride = sizeof(float) * 3;
|
||||||
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
|
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
|
||||||
geom.triangles.index_buffer = blasIndexBuffer_;
|
geom.triangles.index_buffer = blasIndexBuffer_;
|
||||||
|
|
@ -851,8 +849,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
geom.triangles.index_format = IndexBufferFormat::UINT32;
|
geom.triangles.index_format = IndexBufferFormat::UINT32;
|
||||||
geom.triangles.index_offset = 0;
|
geom.triangles.index_offset = 0;
|
||||||
|
|
||||||
bool ok = dev->CreateRaytracingAccelerationStructure(&desc,
|
bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &blockyBLAS_);
|
||||||
&blockyBLAS_);
|
|
||||||
if (ok) {
|
if (ok) {
|
||||||
dev->SetName(&blockyBLAS_, "VoxelRenderer::blockyBLAS");
|
dev->SetName(&blockyBLAS_, "VoxelRenderer::blockyBLAS");
|
||||||
wi::backlog::post("VoxelRenderer: blocky BLAS created (capacity "
|
wi::backlog::post("VoxelRenderer: blocky BLAS created (capacity "
|
||||||
|
|
@ -864,7 +861,7 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update actual vertex count in descriptor before Build
|
// Update actual vertex count, then Build
|
||||||
blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = blockyVertCount;
|
blockyBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = blockyVertCount;
|
||||||
blockyBLAS_.desc.bottom_level.geometries[0].triangles.index_count = blockyVertCount;
|
blockyBLAS_.desc.bottom_level.geometries[0].triangles.index_count = blockyVertCount;
|
||||||
dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr);
|
dev->BuildRaytracingAccelerationStructure(&blockyBLAS_, cmd, nullptr);
|
||||||
|
|
@ -877,10 +874,9 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid();
|
bool useGpuSmooth = smoothCentroidShader_.IsValid() && smoothMeshShader_.IsValid();
|
||||||
const GPUBuffer& smoothVB = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_;
|
const GPUBuffer& smoothVB = useGpuSmooth ? gpuSmoothVertexBuffer_ : smoothVertexBuffer_;
|
||||||
|
|
||||||
if (smoothVertCount > 0 && smoothVB.IsValid()) {
|
if ((buildFlags & RT_BUILD_SMOOTH) && smoothVertCount > 0 && smoothVB.IsValid()) {
|
||||||
// Capacity-based: only recreate when exceeding allocated capacity
|
|
||||||
if (!smoothBLAS_.IsValid() || smoothVertCount > smoothBLASCapacity_) {
|
if (!smoothBLAS_.IsValid() || smoothVertCount > smoothBLASCapacity_) {
|
||||||
smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4; // 25% headroom
|
smoothBLASCapacity_ = smoothVertCount + smoothVertCount / 4;
|
||||||
|
|
||||||
RaytracingAccelerationStructureDesc desc;
|
RaytracingAccelerationStructureDesc desc;
|
||||||
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
|
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
|
||||||
|
|
@ -893,15 +889,14 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
geom.triangles.vertex_buffer = smoothVB;
|
geom.triangles.vertex_buffer = smoothVB;
|
||||||
geom.triangles.vertex_byte_offset = 0;
|
geom.triangles.vertex_byte_offset = 0;
|
||||||
geom.triangles.vertex_count = smoothBLASCapacity_;
|
geom.triangles.vertex_count = smoothBLASCapacity_;
|
||||||
geom.triangles.vertex_stride = 32; // SmoothVtx struct = 32 bytes, position at offset 0
|
geom.triangles.vertex_stride = 32;
|
||||||
geom.triangles.index_buffer = blasIndexBuffer_;
|
geom.triangles.index_buffer = blasIndexBuffer_;
|
||||||
geom.triangles.index_count = smoothBLASCapacity_;
|
geom.triangles.index_count = smoothBLASCapacity_;
|
||||||
geom.triangles.index_format = IndexBufferFormat::UINT32;
|
geom.triangles.index_format = IndexBufferFormat::UINT32;
|
||||||
geom.triangles.index_offset = 0;
|
geom.triangles.index_offset = 0;
|
||||||
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
|
geom.triangles.vertex_format = Format::R32G32B32_FLOAT;
|
||||||
|
|
||||||
bool ok = dev->CreateRaytracingAccelerationStructure(&desc,
|
bool ok = dev->CreateRaytracingAccelerationStructure(&desc, &smoothBLAS_);
|
||||||
&smoothBLAS_);
|
|
||||||
if (ok) {
|
if (ok) {
|
||||||
dev->SetName(&smoothBLAS_, "VoxelRenderer::smoothBLAS");
|
dev->SetName(&smoothBLAS_, "VoxelRenderer::smoothBLAS");
|
||||||
wi::backlog::post("VoxelRenderer: smooth BLAS created (capacity "
|
wi::backlog::post("VoxelRenderer: smooth BLAS created (capacity "
|
||||||
|
|
@ -912,7 +907,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smoothBLAS_.IsValid()) {
|
if (smoothBLAS_.IsValid()) {
|
||||||
// Update actual vertex count before Build
|
|
||||||
smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = smoothVertCount;
|
smoothBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = smoothVertCount;
|
||||||
smoothBLAS_.desc.bottom_level.geometries[0].triangles.index_count = smoothVertCount;
|
smoothBLAS_.desc.bottom_level.geometries[0].triangles.index_count = smoothVertCount;
|
||||||
dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr);
|
dev->BuildRaytracingAccelerationStructure(&smoothBLAS_, cmd, nullptr);
|
||||||
|
|
@ -923,14 +917,13 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
|
|
||||||
// ── Toping BLAS ──────────────────────────────────────────────
|
// ── Toping BLAS ──────────────────────────────────────────────
|
||||||
uint32_t topingVertCount = rtTopingVertexCount_;
|
uint32_t topingVertCount = rtTopingVertexCount_;
|
||||||
if (topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) {
|
if ((buildFlags & RT_BUILD_TOPING) && topingVertCount >= 3 && topingBLASPositionBuffer_.IsValid()) {
|
||||||
// Capacity-based: only recreate when exceeding allocated capacity
|
|
||||||
if (!topingBLAS_.IsValid() || topingVertCount > topingBLASASCapacity_) {
|
if (!topingBLAS_.IsValid() || topingVertCount > topingBLASASCapacity_) {
|
||||||
topingBLASASCapacity_ = topingVertCount + topingVertCount / 4; // 25% headroom
|
topingBLASASCapacity_ = topingVertCount + topingVertCount / 4;
|
||||||
|
|
||||||
RaytracingAccelerationStructureDesc desc;
|
RaytracingAccelerationStructureDesc desc;
|
||||||
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
|
desc.type = RaytracingAccelerationStructureDesc::Type::BOTTOMLEVEL;
|
||||||
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD; // fast rebuild for animation
|
desc.flags = RaytracingAccelerationStructureDesc::FLAG_PREFER_FAST_BUILD;
|
||||||
|
|
||||||
desc.bottom_level.geometries.resize(1);
|
desc.bottom_level.geometries.resize(1);
|
||||||
auto& geom = desc.bottom_level.geometries[0];
|
auto& geom = desc.bottom_level.geometries[0];
|
||||||
|
|
@ -957,7 +950,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (topingBLAS_.IsValid()) {
|
if (topingBLAS_.IsValid()) {
|
||||||
// Update actual vertex count before Build
|
|
||||||
topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = topingVertCount;
|
topingBLAS_.desc.bottom_level.geometries[0].triangles.vertex_count = topingVertCount;
|
||||||
topingBLAS_.desc.bottom_level.geometries[0].triangles.index_count = topingVertCount;
|
topingBLAS_.desc.bottom_level.geometries[0].triangles.index_count = topingVertCount;
|
||||||
dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr);
|
dev->BuildRaytracingAccelerationStructure(&topingBLAS_, cmd, nullptr);
|
||||||
|
|
@ -1056,7 +1048,6 @@ void VoxelRenderer::buildAccelerationStructures(CommandList cmd) const {
|
||||||
wi::backlog::post("VoxelRenderer: TLAS created (" + std::to_string(instanceCount) + " instances)");
|
wi::backlog::post("VoxelRenderer: TLAS created (" + std::to_string(instanceCount) + " instances)");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rebuild TLAS (picks up rebuilt BLASes with new vertex data)
|
|
||||||
dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr);
|
dev->BuildRaytracingAccelerationStructure(&tlas_, cmd, nullptr);
|
||||||
|
|
||||||
// Memory barrier: sync TLAS build before ray queries can use it
|
// Memory barrier: sync TLAS build before ray queries can use it
|
||||||
|
|
@ -1998,7 +1989,9 @@ void VoxelRenderPath::createRenderTargets() {
|
||||||
// ── WASD camera input ───────────────────────────────────────────
|
// ── WASD camera input ───────────────────────────────────────────
|
||||||
|
|
||||||
static constexpr wi::input::BUTTON KEY_W = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('W' - 'A'));
|
static constexpr wi::input::BUTTON KEY_W = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('W' - 'A'));
|
||||||
|
static constexpr wi::input::BUTTON KEY_Z = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('Z' - 'A'));
|
||||||
static constexpr wi::input::BUTTON KEY_A = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('A' - 'A'));
|
static constexpr wi::input::BUTTON KEY_A = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('A' - 'A'));
|
||||||
|
static constexpr wi::input::BUTTON KEY_Q = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('Q' - 'A'));
|
||||||
static constexpr wi::input::BUTTON KEY_S = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('S' - 'A'));
|
static constexpr wi::input::BUTTON KEY_S = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('S' - 'A'));
|
||||||
static constexpr wi::input::BUTTON KEY_D = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('D' - 'A'));
|
static constexpr wi::input::BUTTON KEY_D = (wi::input::BUTTON)(wi::input::CHARACTER_RANGE_START + ('D' - 'A'));
|
||||||
|
|
||||||
|
|
@ -2010,6 +2003,16 @@ void VoxelRenderPath::handleInput(float dt) {
|
||||||
// F3: toggle animated terrain
|
// F3: toggle animated terrain
|
||||||
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) {
|
if (wi::input::Press(wi::input::KEYBOARD_BUTTON_F3)) {
|
||||||
animatedTerrain_ = !animatedTerrain_;
|
animatedTerrain_ = !animatedTerrain_;
|
||||||
|
if (animatedTerrain_) {
|
||||||
|
// Save RT state and disable shadows during animation (stale BLAS = wrong shadows)
|
||||||
|
rtWasEnabled_ = renderer.rtShadowsEnabled_;
|
||||||
|
renderer.rtShadowsEnabled_ = false;
|
||||||
|
} else {
|
||||||
|
// Force full RT rebuild (including topings) when animation stops
|
||||||
|
renderer.rtDirty_ = true;
|
||||||
|
renderer.topingBLASDirty_ = true;
|
||||||
|
renderer.rtShadowsEnabled_ = rtWasEnabled_;
|
||||||
|
}
|
||||||
wi::backlog::post(animatedTerrain_ ? "Animation: ON (30 Hz)" : "Animation: OFF");
|
wi::backlog::post(animatedTerrain_ ? "Animation: ON (30 Hz)" : "Animation: OFF");
|
||||||
}
|
}
|
||||||
// F4: toggle blend debug visualization
|
// F4: toggle blend debug visualization
|
||||||
|
|
@ -2058,9 +2061,9 @@ void VoxelRenderPath::handleInput(float dt) {
|
||||||
float speed = cameraSpeed * dt;
|
float speed = cameraSpeed * dt;
|
||||||
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f;
|
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LSHIFT)) speed *= 3.0f;
|
||||||
|
|
||||||
if (wi::input::Down(KEY_W)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; }
|
if (wi::input::Down(KEY_Z)) { cameraPos.x += forward.x * speed; cameraPos.y += forward.y * speed; cameraPos.z += forward.z * speed; }
|
||||||
if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; }
|
if (wi::input::Down(KEY_S)) { cameraPos.x -= forward.x * speed; cameraPos.y -= forward.y * speed; cameraPos.z -= forward.z * speed; }
|
||||||
if (wi::input::Down(KEY_A)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; }
|
if (wi::input::Down(KEY_Q)) { cameraPos.x -= right.x * speed; cameraPos.z -= right.z * speed; }
|
||||||
if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; }
|
if (wi::input::Down(KEY_D)) { cameraPos.x += right.x * speed; cameraPos.z += right.z * speed; }
|
||||||
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed;
|
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_SPACE)) cameraPos.y += speed;
|
||||||
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed;
|
if (wi::input::Down(wi::input::KEYBOARD_BUTTON_LCONTROL)) cameraPos.y -= speed;
|
||||||
|
|
@ -2073,6 +2076,13 @@ void VoxelRenderPath::handleInput(float dt) {
|
||||||
|
|
||||||
void VoxelRenderPath::Update(float dt) {
|
void VoxelRenderPath::Update(float dt) {
|
||||||
auto frameStart = std::chrono::high_resolution_clock::now();
|
auto frameStart = std::chrono::high_resolution_clock::now();
|
||||||
|
frameStartTime_ = frameStart;
|
||||||
|
// Measure GPU wait: time from last Compose() end to this Update() start
|
||||||
|
// This captures Present() GPU sync + OS scheduling
|
||||||
|
if (lastComposeEndValid_) {
|
||||||
|
float gpuWaitMs = std::chrono::duration<float, std::milli>(frameStart - lastComposeEnd_).count();
|
||||||
|
if (gpuWaitMs > 0.01f) profGpuWait_.add(gpuWaitMs);
|
||||||
|
}
|
||||||
lastDt_ = dt;
|
lastDt_ = dt;
|
||||||
float instantFps = (dt > 0.0f) ? (1.0f / dt) : 0.0f;
|
float instantFps = (dt > 0.0f) ? (1.0f / dt) : 0.0f;
|
||||||
smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f;
|
smoothFps_ = smoothFps_ * 0.95f + instantFps * 0.05f;
|
||||||
|
|
@ -2165,7 +2175,10 @@ void VoxelRenderPath::Update(float dt) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void VoxelRenderPath::Render() const {
|
void VoxelRenderPath::Render() const {
|
||||||
|
auto tWicked0 = std::chrono::high_resolution_clock::now();
|
||||||
RenderPath3D::Render();
|
RenderPath3D::Render();
|
||||||
|
auto tWicked1 = std::chrono::high_resolution_clock::now();
|
||||||
|
profWickedRender_.add(std::chrono::duration<float, std::milli>(tWicked1 - tWicked0).count());
|
||||||
|
|
||||||
if (renderer.isInitialized() && camera && rtCreated_) {
|
if (renderer.isInitialized() && camera && rtCreated_) {
|
||||||
auto* device = wi::graphics::GetDevice();
|
auto* device = wi::graphics::GetDevice();
|
||||||
|
|
@ -2179,10 +2192,34 @@ void VoxelRenderPath::Render() const {
|
||||||
renderer.gpuMeshQuadCount_ = *countData;
|
renderer.gpuMeshQuadCount_ = *countData;
|
||||||
renderer.totalQuads_ = renderer.gpuMeshQuadCount_;
|
renderer.totalQuads_ = renderer.gpuMeshQuadCount_;
|
||||||
}
|
}
|
||||||
|
// ── GPU Timestamp readback (previous frame's results) ──
|
||||||
|
{
|
||||||
|
uint64_t* tsData = (uint64_t*)renderer.timestampReadback_.mapped_data;
|
||||||
|
if (tsData) {
|
||||||
|
double freq = (double)device->GetTimestampFrequency();
|
||||||
|
auto readTs = [&](uint32_t begin, uint32_t end) -> float {
|
||||||
|
if (freq > 0.0 && tsData[end] > tsData[begin])
|
||||||
|
return (float)((double)(tsData[end] - tsData[begin]) / freq * 1000.0);
|
||||||
|
return 0.0f;
|
||||||
|
};
|
||||||
|
renderer.gpuMeshTimeMs_ = readTs(VoxelRenderer::TS_GPU_MESH_BEGIN, VoxelRenderer::TS_GPU_MESH_END);
|
||||||
|
renderer.gpuSmoothMeshTimeMs_ = readTs(VoxelRenderer::TS_GPU_SMOOTH_BEGIN, VoxelRenderer::TS_GPU_SMOOTH_END);
|
||||||
|
renderer.gpuBLASExtractTimeMs_ = readTs(VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, VoxelRenderer::TS_BLAS_EXTRACT_END);
|
||||||
|
renderer.gpuBLASBuildTimeMs_ = readTs(VoxelRenderer::TS_BLAS_BUILD_BEGIN, VoxelRenderer::TS_BLAS_BUILD_END);
|
||||||
|
renderer.gpuDrawTimeMs_ = readTs(VoxelRenderer::TS_DRAW_BEGIN, VoxelRenderer::TS_DRAW_END);
|
||||||
|
renderer.gpuRTShadowsTimeMs_ = readTs(VoxelRenderer::TS_RT_SHADOWS_BEGIN, VoxelRenderer::TS_RT_SHADOWS_END);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Only re-dispatch compute mesher when data changed
|
// Only re-dispatch compute mesher when data changed
|
||||||
if (renderer.gpuMeshDirty_) {
|
if (renderer.gpuMeshDirty_) {
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_BEGIN, cmd);
|
||||||
|
auto t0 = std::chrono::high_resolution_clock::now();
|
||||||
renderer.dispatchGpuMesh(cmd, world,
|
renderer.dispatchGpuMesh(cmd, world,
|
||||||
&profVoxelPack_, &profGpuUpload_, &profGpuDispatch_);
|
&profVoxelPack_, &profGpuUpload_, &profGpuDispatch_);
|
||||||
|
auto t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
profGpuMeshDispatch_.add(std::chrono::duration<float, std::milli>(t1 - t0).count());
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_MESH_END, cmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPU smooth mesh: readback previous frame's vertex count
|
// GPU smooth mesh: readback previous frame's vertex count
|
||||||
|
|
@ -2192,7 +2229,9 @@ void VoxelRenderPath::Render() const {
|
||||||
}
|
}
|
||||||
// GPU smooth mesh dispatch (uses same voxelDataBuffer_ already uploaded)
|
// GPU smooth mesh dispatch (uses same voxelDataBuffer_ already uploaded)
|
||||||
if (renderer.gpuSmoothMeshDirty_ && renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
|
if (renderer.gpuSmoothMeshDirty_ && renderer.smoothCentroidShader_.IsValid() && renderer.smoothMeshShader_.IsValid()) {
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_SMOOTH_BEGIN, cmd);
|
||||||
renderer.dispatchGpuSmoothMesh(cmd, world);
|
renderer.dispatchGpuSmoothMesh(cmd, world);
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_GPU_SMOOTH_END, cmd);
|
||||||
}
|
}
|
||||||
// Re-dispatch next frame if readback not yet available (1-frame delay)
|
// Re-dispatch next frame if readback not yet available (1-frame delay)
|
||||||
if (renderer.gpuSmoothVertexCount_ == 0 &&
|
if (renderer.gpuSmoothVertexCount_ == 0 &&
|
||||||
|
|
@ -2200,19 +2239,41 @@ void VoxelRenderPath::Render() const {
|
||||||
renderer.gpuSmoothMeshDirty_ = true;
|
renderer.gpuSmoothMeshDirty_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── GPU compute toping BLAS extraction (replaces 196ms CPU loop) ──
|
// ── GPU compute toping BLAS extraction ──
|
||||||
// Must happen BEFORE BLAS build. Fills topingBLASPositionBuffer_ via CS,
|
// Skip during animation (toping BLAS is skipped to save ~130ms GPU)
|
||||||
// sets rtTopingVertexCount_ and rtDirty_ to trigger BLAS/TLAS rebuild.
|
if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid() && !animatedTerrain_) {
|
||||||
if (renderer.topingBLASDirty_ && renderer.topingBLASShader_.IsValid()) {
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_BEGIN, cmd);
|
||||||
renderer.dispatchTopingBLASExtract(cmd);
|
renderer.dispatchTopingBLASExtract(cmd);
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_EXTRACT_END, cmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase 6.1: BLAS extraction + acceleration structure build
|
// Phase 6.1: BLAS extraction + acceleration structure build
|
||||||
if (renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() &&
|
// During animation, stagger builds to avoid 200ms+ GPU spikes:
|
||||||
|
// - Skip toping BLAS entirely (7.7M tris = ~130ms, decorative only)
|
||||||
|
// - Alternate blocky/smooth BLAS builds across animation frames
|
||||||
|
// When not animating, rebuild all immediately.
|
||||||
|
{
|
||||||
|
bool needsBuild = renderer.rtAvailable_ && renderer.blasExtractShader_.IsValid() &&
|
||||||
renderer.gpuMeshQuadCount_ > 0 &&
|
renderer.gpuMeshQuadCount_ > 0 &&
|
||||||
(renderer.rtDirty_ || renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6)) {
|
(renderer.rtDirty_ || renderer.gpuMeshQuadCount_ != renderer.rtBlockyVertexCount_ / 6);
|
||||||
|
|
||||||
|
if (needsBuild) {
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_BEGIN, cmd);
|
||||||
renderer.dispatchBLASExtract(cmd);
|
renderer.dispatchBLASExtract(cmd);
|
||||||
renderer.buildAccelerationStructures(cmd);
|
|
||||||
|
if (animatedTerrain_) {
|
||||||
|
// Stagger: alternate blocky/smooth each animation frame, skip topings
|
||||||
|
uint32_t flags = (rtBuildSkipCounter_ & 1)
|
||||||
|
? VoxelRenderer::RT_BUILD_BLOCKY
|
||||||
|
: VoxelRenderer::RT_BUILD_SMOOTH;
|
||||||
|
rtBuildSkipCounter_++;
|
||||||
|
renderer.buildAccelerationStructures(cmd, flags);
|
||||||
|
} else {
|
||||||
|
renderer.buildAccelerationStructures(cmd, VoxelRenderer::RT_BUILD_ALL);
|
||||||
|
}
|
||||||
|
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_BLAS_BUILD_END, cmd);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2239,52 +2300,96 @@ void VoxelRenderPath::Render() const {
|
||||||
renderer.smoothVertexDirty_ = false;
|
renderer.smoothVertexDirty_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto tRender0 = std::chrono::high_resolution_clock::now();
|
// ── Draw passes ──
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_DRAW_BEGIN, cmd);
|
||||||
renderer.render(cmd, *camera, voxelDepth_, voxelRT_, voxelNormalRT_);
|
renderer.render(cmd, *camera, voxelDepth_, voxelRT_, voxelNormalRT_);
|
||||||
|
|
||||||
// Phase 4: render topings (separate render pass, preserves voxel output)
|
|
||||||
renderer.renderTopings(cmd, topingSystem, voxelDepth_, voxelRT_, voxelNormalRT_);
|
renderer.renderTopings(cmd, topingSystem, voxelDepth_, voxelRT_, voxelNormalRT_);
|
||||||
|
|
||||||
// Phase 5: render smooth surfaces (separate render pass, preserves all prior output)
|
|
||||||
renderer.renderSmooth(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
|
renderer.renderSmooth(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_DRAW_END, cmd);
|
||||||
|
|
||||||
// Phase 6.2: RT Shadows (modulates voxelRT_ in-place after all geometry is rendered)
|
// Phase 6.2: RT Shadows + AO
|
||||||
if (renderer.isRTShadowsEnabled() && renderer.isRTReady()) {
|
if (renderer.isRTShadowsEnabled() && renderer.isRTReady()) {
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_BEGIN, cmd);
|
||||||
renderer.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
|
renderer.dispatchShadows(cmd, voxelDepth_, voxelRT_, voxelNormalRT_);
|
||||||
|
device->QueryEnd(&renderer.timestampHeap_, VoxelRenderer::TS_RT_SHADOWS_END, cmd);
|
||||||
}
|
}
|
||||||
auto tRender1 = std::chrono::high_resolution_clock::now();
|
|
||||||
profRender_.add(std::chrono::duration<float, std::milli>(tRender1 - tRender0).count());
|
// Resolve GPU timestamps for readback next frame
|
||||||
|
device->QueryResolve(&renderer.timestampHeap_, 0, VoxelRenderer::TS_COUNT,
|
||||||
|
&renderer.timestampReadback_, 0, cmd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void VoxelRenderPath::logProfilingAverages() const {
|
void VoxelRenderPath::logProfilingAverages() const {
|
||||||
char msg[1024];
|
char msg[2048];
|
||||||
snprintf(msg, sizeof(msg),
|
snprintf(msg, sizeof(msg),
|
||||||
"=== PERF PROFILE (avg over %.0fs) ===\n"
|
"=== PERF PROFILE (avg over %.0fs) ===\n"
|
||||||
|
" ── Update() ──\n"
|
||||||
" Regenerate: %7.2f ms (%u calls)\n"
|
" Regenerate: %7.2f ms (%u calls)\n"
|
||||||
" UpdateMeshes: %7.2f ms (%u calls)\n"
|
" UpdateMeshes: %7.2f ms (%u calls)\n"
|
||||||
" VoxelPack: %7.2f ms (%u calls)\n"
|
|
||||||
" GPU Upload: %7.2f ms (%u calls)\n"
|
|
||||||
" GPU Dispatch: %7.2f ms (%u calls)\n"
|
|
||||||
" SmoothMesh: %7.2f ms (%u calls)\n"
|
|
||||||
" SmoothUpload: %7.2f ms (%u calls)\n"
|
|
||||||
" TopingCollect: %7.2f ms (%u calls)\n"
|
" TopingCollect: %7.2f ms (%u calls)\n"
|
||||||
" TopingUpload: %7.2f ms (%u calls)\n"
|
" TopingUpload: %7.2f ms (%u calls)\n"
|
||||||
" Render: %7.2f ms (%u calls)\n"
|
" SmoothMesh(CPU): %7.2f ms (%u calls)\n"
|
||||||
" Frame (Upd): %7.2f ms (%u calls, %.1f FPS)",
|
" SmoothUpload: %7.2f ms (%u calls)\n"
|
||||||
|
" Update total: %7.2f ms (%u calls)\n"
|
||||||
|
" ── Render() ──\n"
|
||||||
|
" GPU Mesh (pack): %7.2f ms (%u calls)\n"
|
||||||
|
" GPU Mesh (up): %7.2f ms (%u calls)\n"
|
||||||
|
" GPU Mesh (disp): %7.2f ms (%u calls)\n"
|
||||||
|
" GPU MeshTotal: %7.2f ms (%u calls)\n"
|
||||||
|
" GPU SmoothMesh: %7.2f ms (%u calls)\n"
|
||||||
|
" BLAS Extract: %7.2f ms (%u calls)\n"
|
||||||
|
" BLAS/TLAS Build: %7.2f ms (%u calls)\n"
|
||||||
|
" Deferred Upload: %7.2f ms (%u calls)\n"
|
||||||
|
" Draw (3 passes): %7.2f ms (%u calls)\n"
|
||||||
|
" RT Shadows+AO: %7.2f ms (%u calls)\n"
|
||||||
|
" ── GPU Timings (hardware timestamps) ──\n"
|
||||||
|
" GPU Mesh: %7.2f ms\n"
|
||||||
|
" GPU SmoothMesh: %7.2f ms\n"
|
||||||
|
" GPU BLAS Extract: %7.2f ms\n"
|
||||||
|
" GPU BLAS Build: %7.2f ms\n"
|
||||||
|
" GPU Draw: %7.2f ms\n"
|
||||||
|
" GPU RT Shad+AO: %7.2f ms\n"
|
||||||
|
" GPU Total: %7.2f ms\n"
|
||||||
|
" ── Totals ──\n"
|
||||||
|
" Wicked Render: %7.2f ms (%u calls)\n"
|
||||||
|
" GPU Wait/Sync: %7.2f ms (%u calls)\n"
|
||||||
|
" CPU Frame: %7.2f ms (Update→Compose start)\n"
|
||||||
|
" True Frame: %7.2f ms (Update→Compose end)\n"
|
||||||
|
" Wall FPS: %7.1f (%u frames in %.0fs)",
|
||||||
PROF_INTERVAL,
|
PROF_INTERVAL,
|
||||||
profRegenerate_.avg(), profRegenerate_.count,
|
profRegenerate_.avg(), profRegenerate_.count,
|
||||||
profUpdateMeshes_.avg(), profUpdateMeshes_.count,
|
profUpdateMeshes_.avg(), profUpdateMeshes_.count,
|
||||||
|
profTopingCollect_.avg(), profTopingCollect_.count,
|
||||||
|
profTopingUpload_.avg(), profTopingUpload_.count,
|
||||||
|
profSmoothMesh_.avg(), profSmoothMesh_.count,
|
||||||
|
profSmoothUpload_.avg(), profSmoothUpload_.count,
|
||||||
|
profFrame_.avg(), profFrame_.count,
|
||||||
profVoxelPack_.avg(), profVoxelPack_.count,
|
profVoxelPack_.avg(), profVoxelPack_.count,
|
||||||
profGpuUpload_.avg(), profGpuUpload_.count,
|
profGpuUpload_.avg(), profGpuUpload_.count,
|
||||||
profGpuDispatch_.avg(), profGpuDispatch_.count,
|
profGpuDispatch_.avg(), profGpuDispatch_.count,
|
||||||
profSmoothMesh_.avg(), profSmoothMesh_.count,
|
profGpuMeshDispatch_.avg(), profGpuMeshDispatch_.count,
|
||||||
profSmoothUpload_.avg(), profSmoothUpload_.count,
|
profGpuSmoothDispatch_.avg(), profGpuSmoothDispatch_.count,
|
||||||
profTopingCollect_.avg(), profTopingCollect_.count,
|
profBLASExtract_.avg(), profBLASExtract_.count,
|
||||||
profTopingUpload_.avg(), profTopingUpload_.count,
|
profBLASBuild_.avg(), profBLASBuild_.count,
|
||||||
|
profDeferredUpload_.avg(), profDeferredUpload_.count,
|
||||||
profRender_.avg(), profRender_.count,
|
profRender_.avg(), profRender_.count,
|
||||||
profFrame_.avg(), profFrame_.count,
|
profRTShadows_.avg(), profRTShadows_.count,
|
||||||
profFrame_.count > 0 ? (1000.0f / profFrame_.avg()) : 0.0f);
|
renderer.gpuMeshTimeMs_,
|
||||||
|
renderer.gpuSmoothMeshTimeMs_,
|
||||||
|
renderer.gpuBLASExtractTimeMs_,
|
||||||
|
renderer.gpuBLASBuildTimeMs_,
|
||||||
|
renderer.gpuDrawTimeMs_,
|
||||||
|
renderer.gpuRTShadowsTimeMs_,
|
||||||
|
renderer.gpuMeshTimeMs_ + renderer.gpuSmoothMeshTimeMs_ +
|
||||||
|
renderer.gpuBLASExtractTimeMs_ + renderer.gpuBLASBuildTimeMs_ +
|
||||||
|
renderer.gpuDrawTimeMs_ + renderer.gpuRTShadowsTimeMs_,
|
||||||
|
profWickedRender_.avg(), profWickedRender_.count,
|
||||||
|
profGpuWait_.avg(), profGpuWait_.count,
|
||||||
|
profFullFrame_.avg(),
|
||||||
|
profTrueFrame_.avg(),
|
||||||
|
profTrueFrame_.count > 0 ? (1000.0f / profTrueFrame_.avg()) : 0.0f,
|
||||||
|
profTrueFrame_.count, PROF_INTERVAL);
|
||||||
wi::backlog::post(msg);
|
wi::backlog::post(msg);
|
||||||
|
|
||||||
profRegenerate_.reset();
|
profRegenerate_.reset();
|
||||||
|
|
@ -2292,17 +2397,32 @@ void VoxelRenderPath::logProfilingAverages() const {
|
||||||
profVoxelPack_.reset();
|
profVoxelPack_.reset();
|
||||||
profGpuUpload_.reset();
|
profGpuUpload_.reset();
|
||||||
profGpuDispatch_.reset();
|
profGpuDispatch_.reset();
|
||||||
|
profGpuMeshDispatch_.reset();
|
||||||
|
profGpuSmoothDispatch_.reset();
|
||||||
profSmoothMesh_.reset();
|
profSmoothMesh_.reset();
|
||||||
profSmoothUpload_.reset();
|
profSmoothUpload_.reset();
|
||||||
profTopingCollect_.reset();
|
profTopingCollect_.reset();
|
||||||
profTopingUpload_.reset();
|
profTopingUpload_.reset();
|
||||||
|
profBLASExtract_.reset();
|
||||||
|
profBLASBuild_.reset();
|
||||||
|
profDeferredUpload_.reset();
|
||||||
profRender_.reset();
|
profRender_.reset();
|
||||||
|
profRTShadows_.reset();
|
||||||
profFrame_.reset();
|
profFrame_.reset();
|
||||||
|
profFullFrame_.reset();
|
||||||
|
profGpuWait_.reset();
|
||||||
|
profWickedRender_.reset();
|
||||||
|
profTrueFrame_.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
void VoxelRenderPath::Compose(CommandList cmd) const {
|
void VoxelRenderPath::Compose(CommandList cmd) const {
|
||||||
frameCount_++;
|
frameCount_++;
|
||||||
|
|
||||||
|
// Measure full frame time (Update + Render + Compose start)
|
||||||
|
auto composeStart = std::chrono::high_resolution_clock::now();
|
||||||
|
float fullFrameMs = std::chrono::duration<float, std::milli>(composeStart - frameStartTime_).count();
|
||||||
|
if (fullFrameMs > 0.1f) profFullFrame_.add(fullFrameMs);
|
||||||
|
|
||||||
RenderPath3D::Compose(cmd);
|
RenderPath3D::Compose(cmd);
|
||||||
|
|
||||||
if (rtCreated_ && voxelRT_.IsValid()) {
|
if (rtCreated_ && voxelRT_.IsValid()) {
|
||||||
|
|
@ -2365,6 +2485,13 @@ void VoxelRenderPath::Compose(CommandList cmd) const {
|
||||||
+ "] | F5: shd+ao [" + std::string(renderer.rtShadowDebug_ == 1 ? "SHD" : (renderer.rtShadowDebug_ == 2 ? "AO" : (renderer.isRTShadowsEnabled() ? "ON" : "OFF"))) + "]";
|
+ "] | F5: shd+ao [" + std::string(renderer.rtShadowDebug_ == 1 ? "SHD" : (renderer.rtShadowDebug_ == 2 ? "AO" : (renderer.isRTShadowsEnabled() ? "ON" : "OFF"))) + "]";
|
||||||
|
|
||||||
wi::font::Draw(stats, fp, cmd);
|
wi::font::Draw(stats, fp, cmd);
|
||||||
|
|
||||||
|
// Save compose end time for GPU wait measurement
|
||||||
|
lastComposeEnd_ = std::chrono::high_resolution_clock::now();
|
||||||
|
lastComposeEndValid_ = true;
|
||||||
|
// True frame-to-frame time
|
||||||
|
float trueFrameMs = std::chrono::duration<float, std::milli>(lastComposeEnd_ - frameStartTime_).count();
|
||||||
|
if (trueFrameMs > 0.1f) profTrueFrame_.add(trueFrameMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
void VoxelRenderPath::setCamera(float x, float y, float z, float pitch, float yaw) {
|
void VoxelRenderPath::setCamera(float x, float y, float z, float pitch, float yaw) {
|
||||||
|
|
|
||||||
|
|
@ -228,7 +228,13 @@ private:
|
||||||
mutable uint32_t tlasInstanceCount_ = 0; // track TLAS instance count to avoid per-frame recreation
|
mutable uint32_t tlasInstanceCount_ = 0; // track TLAS instance count to avoid per-frame recreation
|
||||||
|
|
||||||
void dispatchBLASExtract(wi::graphics::CommandList cmd) const;
|
void dispatchBLASExtract(wi::graphics::CommandList cmd) const;
|
||||||
void buildAccelerationStructures(wi::graphics::CommandList cmd) const;
|
// Flags for selective BLAS rebuild
|
||||||
|
static constexpr uint32_t RT_BUILD_BLOCKY = 1 << 0;
|
||||||
|
static constexpr uint32_t RT_BUILD_SMOOTH = 1 << 1;
|
||||||
|
static constexpr uint32_t RT_BUILD_TOPING = 1 << 2;
|
||||||
|
static constexpr uint32_t RT_BUILD_ALL = RT_BUILD_BLOCKY | RT_BUILD_SMOOTH | RT_BUILD_TOPING;
|
||||||
|
void buildAccelerationStructures(wi::graphics::CommandList cmd,
|
||||||
|
uint32_t buildFlags = RT_BUILD_ALL) const;
|
||||||
|
|
||||||
// ── RT Shadows + AO (Phase 6.2 + 6.3) ──────────────────────────
|
// ── RT Shadows + AO (Phase 6.2 + 6.3) ──────────────────────────
|
||||||
wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader
|
wi::graphics::Shader shadowShader_; // voxelShadowCS compute shader
|
||||||
|
|
@ -254,19 +260,29 @@ private:
|
||||||
void dispatchGpuSmoothMesh(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
|
void dispatchGpuSmoothMesh(wi::graphics::CommandList cmd, const VoxelWorld& world) const;
|
||||||
void rebuildChunkInfoOnly(VoxelWorld& world);
|
void rebuildChunkInfoOnly(VoxelWorld& world);
|
||||||
|
|
||||||
// ── GPU Timestamp Queries (Phase 2 benchmark) ────────────────
|
// ── GPU Timestamp Queries (comprehensive GPU profiling) ────────
|
||||||
wi::graphics::GPUQueryHeap timestampHeap_;
|
wi::graphics::GPUQueryHeap timestampHeap_;
|
||||||
wi::graphics::GPUBuffer timestampReadback_;
|
wi::graphics::GPUBuffer timestampReadback_;
|
||||||
static constexpr uint32_t TS_CULL_BEGIN = 0;
|
// Timestamp slots: pairs of (BEGIN, END) for each GPU phase
|
||||||
static constexpr uint32_t TS_CULL_END = 1;
|
static constexpr uint32_t TS_GPU_MESH_BEGIN = 0;
|
||||||
static constexpr uint32_t TS_DRAW_BEGIN = 2;
|
static constexpr uint32_t TS_GPU_MESH_END = 1;
|
||||||
static constexpr uint32_t TS_DRAW_END = 3;
|
static constexpr uint32_t TS_GPU_SMOOTH_BEGIN = 2;
|
||||||
static constexpr uint32_t TS_MESH_BEGIN = 4;
|
static constexpr uint32_t TS_GPU_SMOOTH_END = 3;
|
||||||
static constexpr uint32_t TS_MESH_END = 5;
|
static constexpr uint32_t TS_BLAS_EXTRACT_BEGIN = 4;
|
||||||
static constexpr uint32_t TS_COUNT = 6;
|
static constexpr uint32_t TS_BLAS_EXTRACT_END = 5;
|
||||||
mutable float gpuCullTimeMs_ = 0.0f;
|
static constexpr uint32_t TS_BLAS_BUILD_BEGIN = 6;
|
||||||
mutable float gpuDrawTimeMs_ = 0.0f;
|
static constexpr uint32_t TS_BLAS_BUILD_END = 7;
|
||||||
|
static constexpr uint32_t TS_DRAW_BEGIN = 8;
|
||||||
|
static constexpr uint32_t TS_DRAW_END = 9;
|
||||||
|
static constexpr uint32_t TS_RT_SHADOWS_BEGIN = 10;
|
||||||
|
static constexpr uint32_t TS_RT_SHADOWS_END = 11;
|
||||||
|
static constexpr uint32_t TS_COUNT = 12;
|
||||||
mutable float gpuMeshTimeMs_ = 0.0f;
|
mutable float gpuMeshTimeMs_ = 0.0f;
|
||||||
|
mutable float gpuSmoothMeshTimeMs_ = 0.0f;
|
||||||
|
mutable float gpuBLASExtractTimeMs_ = 0.0f;
|
||||||
|
mutable float gpuBLASBuildTimeMs_ = 0.0f;
|
||||||
|
mutable float gpuDrawTimeMs_ = 0.0f;
|
||||||
|
mutable float gpuRTShadowsTimeMs_ = 0.0f;
|
||||||
|
|
||||||
// Stats (mutable: updated during const Render() call)
|
// Stats (mutable: updated during const Render() call)
|
||||||
mutable uint32_t totalQuads_ = 0;
|
mutable uint32_t totalQuads_ = 0;
|
||||||
|
|
@ -276,8 +292,13 @@ private:
|
||||||
bool initialized_ = false;
|
bool initialized_ = false;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
float getGpuCullTimeMs() const { return gpuCullTimeMs_; }
|
|
||||||
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
|
float getGpuDrawTimeMs() const { return gpuDrawTimeMs_; }
|
||||||
|
float getGpuMeshTimeMs() const { return gpuMeshTimeMs_; }
|
||||||
|
float getGpuSmoothMeshTimeMs() const { return gpuSmoothMeshTimeMs_; }
|
||||||
|
float getGpuBLASExtractTimeMs() const { return gpuBLASExtractTimeMs_; }
|
||||||
|
float getGpuBLASBuildTimeMs() const { return gpuBLASBuildTimeMs_; }
|
||||||
|
float getGpuRTShadowsTimeMs() const { return gpuRTShadowsTimeMs_; }
|
||||||
|
void toggleRTShadows() { rtShadowsEnabled_ = !rtShadowsEnabled_; }
|
||||||
bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
|
bool isGpuMeshEnabled() const { return gpuMesherAvailable_; }
|
||||||
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
|
uint32_t getGpuMeshQuadCount() const { return gpuMeshQuadCount_; }
|
||||||
|
|
||||||
|
|
@ -369,12 +390,27 @@ private:
|
||||||
mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh
|
mutable ProfileAccum profVoxelPack_; // voxel data packing in dispatchGpuMesh
|
||||||
mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh
|
mutable ProfileAccum profGpuUpload_; // GPU upload in dispatchGpuMesh
|
||||||
mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh
|
mutable ProfileAccum profGpuDispatch_; // compute dispatches in dispatchGpuMesh
|
||||||
mutable ProfileAccum profRender_; // render() total
|
mutable ProfileAccum profRender_; // render() draw calls
|
||||||
mutable ProfileAccum profFrame_; // full frame (Update + Render + Compose)
|
mutable ProfileAccum profFrame_; // full frame (Update only - legacy)
|
||||||
|
mutable ProfileAccum profFullFrame_; // true full frame (Update + Render + Compose)
|
||||||
mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks)
|
mutable ProfileAccum profSmoothMesh_; // SmoothMesher::meshChunk (all chunks)
|
||||||
mutable ProfileAccum profSmoothUpload_; // uploadSmoothData
|
mutable ProfileAccum profSmoothUpload_; // uploadSmoothData
|
||||||
mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances
|
mutable ProfileAccum profTopingCollect_; // topingSystem.collectInstances
|
||||||
mutable ProfileAccum profTopingUpload_; // uploadTopingData
|
mutable ProfileAccum profTopingUpload_; // uploadTopingData
|
||||||
|
mutable ProfileAccum profGpuMeshDispatch_; // GPU mesh compute dispatch (in Render)
|
||||||
|
mutable ProfileAccum profGpuSmoothDispatch_; // GPU smooth mesh dispatch (in Render)
|
||||||
|
mutable ProfileAccum profBLASExtract_; // BLAS position extraction compute
|
||||||
|
mutable ProfileAccum profBLASBuild_; // BLAS/TLAS build
|
||||||
|
mutable ProfileAccum profDeferredUpload_; // deferred GPU buffer uploads
|
||||||
|
mutable ProfileAccum profRTShadows_; // RT shadows + AO dispatch
|
||||||
|
mutable ProfileAccum profGpuWait_; // GPU sync: time between Compose end and next Update start
|
||||||
|
mutable ProfileAccum profWickedRender_; // RenderPath3D::Render() (Wicked internal)
|
||||||
|
mutable ProfileAccum profTrueFrame_; // wall-clock frame-to-frame time
|
||||||
|
mutable std::chrono::high_resolution_clock::time_point frameStartTime_; // for full frame timing
|
||||||
|
mutable std::chrono::high_resolution_clock::time_point lastComposeEnd_; // for GPU wait measurement
|
||||||
|
mutable bool lastComposeEndValid_ = false;
|
||||||
|
mutable uint32_t rtBuildSkipCounter_ = 0; // stagger BLAS builds during animation
|
||||||
|
mutable bool rtWasEnabled_ = false; // saved RT state before animation
|
||||||
mutable float profTimer_ = 0.0f;
|
mutable float profTimer_ = 0.0f;
|
||||||
static constexpr float PROF_INTERVAL = 5.0f;
|
static constexpr float PROF_INTERVAL = 5.0f;
|
||||||
void logProfilingAverages() const;
|
void logProfilingAverages() const;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue