From aea2aa3fb5e63ccd9614c24a465f12fde4563d43 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sun, 23 Mar 2025 15:55:16 +0700
Subject: [PATCH 01/27] GTAO depth prefilter pipeline.

---
 CMakeLists.txt                                |   3 +
 assets/maps/sampleScene.willmap               |  47 +++++--
 assets/models/sponza2/Sponza.willmodel        |   8 ++
 .../ambient_occlusion/ground_truth/gtao.comp  |  92 ++++++++++++++
 .../ground_truth/gtaodepthprefilter.comp      |   5 +
 shaders/deferredResolve.comp                  |   5 +-
 src/core/engine.cpp                           |   1 -
 src/renderer/environment/environment.cpp      |   8 +-
 src/renderer/imgui_wrapper.cpp                |   2 +-
 .../ambient_occlusion_types.h                 |  34 +++++
 .../ground_truth_ambient_occlusion.cpp        | 119 ++++++++++++++++++
 .../ground_truth_ambient_occlusion.h          |  56 +++++++++
 .../lighting/shadows/cascaded_shadow_map.cpp  |  10 +-
 src/renderer/resource_manager.cpp             |  29 ++++-
 src/renderer/resource_manager.h               |   2 +
 15 files changed, 405 insertions(+), 16 deletions(-)
 create mode 100644 assets/models/sponza2/Sponza.willmodel
 create mode 100644 shaders/ambient_occlusion/ground_truth/gtao.comp
 create mode 100644 shaders/ambient_occlusion/ground_truth/gtaodepthprefilter.comp
 create mode 100644 src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
 create mode 100644 src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
 create mode 100644 src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 17e4f71e..01c37f27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -247,6 +247,9 @@ set(TEMP_SOURCES
         src/renderer/assets/texture/texture_resource.cpp
         src/renderer/assets/texture/texture_resource.h
         src/renderer/assets/texture/texture_types.h
+        src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+        src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+        src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
 
 )
 
diff --git a/assets/maps/sampleScene.willmap b/assets/maps/sampleScene.willmap
index 6995a167..7cf48529 100644
--- a/assets/maps/sampleScene.willmap
+++ b/assets/maps/sampleScene.willmap
@@ -6,7 +6,7 @@
     },
     "metadata": {
         "name": "sampleScene",
-        "created": "2025-03-21 18:18:15",
+        "created": "2025-03-22 17:23:20",
         "formatVersion": 1
     },
     "rootComponents": {
@@ -280,15 +280,15 @@
                 "name": "Cube\u0000ing Orb",
                 "transform": {
                     "position": {
-                        "x": 13.52876091003418,
-                        "y": 21.364248275756836,
-                        "z": 37.19429397583008
+                        "x": 13.52898120880127,
+                        "y": 21.364240646362305,
+                        "z": 37.19432067871094
                     },
                     "rotation": {
-                        "x": 0.19281113147735596,
-                        "y": 0.20267827808856964,
-                        "z": 0.6591735482215881,
-                        "w": 0.6980226635932922
+                        "x": 0.19284461438655853,
+                        "y": 0.2026415914297104,
+                        "z": 0.6591105461120605,
+                        "w": 0.6980835795402527
                     },
                     "scale": {
                         "x": 1.0,
@@ -319,6 +319,37 @@
                         "componentName": "New RigidBodyComponent"
                     }
                 }
+            },
+            {
+                "id": 4,
+                "name": "Sponza\u0000ect_4",
+                "transform": {
+                    "position": {
+                        "x": 102.19999694824219,
+                        "y": 77.0,
+                        "z": -49.099998474121094
+                    },
+                    "rotation": {
+                        "x": 5.711166295441217e-08,
+                        "y": -0.3826834559440613,
+                        "z": 5.711166295441217e-08,
+                        "w": 0.9238795042037964
+                    },
+                    "scale": {
+                        "x": 0.05000000074505806,
+                        "y": 0.05000000074505806,
+                        "z": 0.05000000074505806
+                    }
+                },
+                "components": {
+                    "MeshRendererComponent": {
+                        "renderReference": 195023067,
+                        "renderMeshIndex": 0,
+                        "renderIsVisible": true,
+                        "renderIsShadowCaster": true,
+                        "componentName": "MeshRendererComponent"
+                    }
+                }
             }
         ]
     }
diff --git a/assets/models/sponza2/Sponza.willmodel b/assets/models/sponza2/Sponza.willmodel
new file mode 100644
index 00000000..d945e982
--- /dev/null
+++ b/assets/models/sponza2/Sponza.willmodel
@@ -0,0 +1,8 @@
+{
+    "renderObject": {
+        "gltfPath": "assets\\models\\sponza2\\Sponza.gltf",
+        "id": 195023067,
+        "name": "Sponza"
+    },
+    "version": 1
+}
diff --git a/shaders/ambient_occlusion/ground_truth/gtao.comp b/shaders/ambient_occlusion/ground_truth/gtao.comp
new file mode 100644
index 00000000..c48a172d
--- /dev/null
+++ b/shaders/ambient_occlusion/ground_truth/gtao.comp
@@ -0,0 +1,92 @@
+#version 460
+// NOTE(review): work-in-progress port of XeGTAO's depth prefilter. GTAOConstants, g_scratchDepths and the XeGTAO_* helpers are not declared in this file yet.
+void main() {
+    // Intentionally empty for now: XeGTAO_PrefilterDepths16x16 below is not called yet.
+}
+// Stores converted depth for a 2x2 quad into outDepth0, then filters it down into outDepth1..outDepth4 through g_scratchDepths.
+void XeGTAO_PrefilterDepths16x16(
+uvec2 dispatchThreadID,
+uvec2 groupThreadID,
+const GTAOConstants consts,
+sampler2D sourceNDCDepth,
+writeonly image2D outDepth0, // opaque types cannot be out/inout parameters, so no 'out' qualifier
+writeonly image2D outDepth1,
+writeonly image2D outDepth2,
+writeonly image2D outDepth3,
+writeonly image2D outDepth4
+) {
+    // MIP 0
+    const uvec2 baseCoord = dispatchThreadID;
+    // 2x because every invocation handles a 2x2 quad of mip 0 (see the four stores below)
+    const uvec2 pixCoord = baseCoord * 2u;
+
+    // +0.5 puts each fetch on a texel centre; the bare pixel coordinate is the corner shared by four texels
+    // todo: get width and height from sceneData
+    vec2 uvCoord = (vec2(pixCoord) + 0.5) * consts.ViewportPixelSize;
+
+    vec4 depths4;
+    depths4.w = texture(sourceNDCDepth, uvCoord + vec2(0.0, 0.0) * consts.ViewportPixelSize).r;
+    depths4.z = texture(sourceNDCDepth, uvCoord + vec2(1.0, 0.0) * consts.ViewportPixelSize).r;
+    depths4.x = texture(sourceNDCDepth, uvCoord + vec2(0.0, 1.0) * consts.ViewportPixelSize).r;
+    depths4.y = texture(sourceNDCDepth, uvCoord + vec2(1.0, 1.0) * consts.ViewportPixelSize).r;
+
+    float depth0 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.w, consts));
+    float depth1 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.z, consts));
+    float depth2 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.x, consts));
+    float depth3 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.y, consts));
+
+    imageStore(outDepth0, ivec2(pixCoord + uvec2(0, 0)), vec4(depth0, 0.0, 0.0, 0.0));
+    imageStore(outDepth0, ivec2(pixCoord + uvec2(1, 0)), vec4(depth1, 0.0, 0.0, 0.0));
+    imageStore(outDepth0, ivec2(pixCoord + uvec2(0, 1)), vec4(depth2, 0.0, 0.0, 0.0));
+    imageStore(outDepth0, ivec2(pixCoord + uvec2(1, 1)), vec4(depth3, 0.0, 0.0, 0.0));
+
+    // MIP 1
+    float dm1 = XeGTAO_DepthMIPFilter(depth0, depth1, depth2, depth3, consts);
+    imageStore(outDepth1, ivec2(baseCoord), vec4(dm1, 0.0, 0.0, 0.0));
+    g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
+
+    memoryBarrierShared();
+    barrier();
+
+    // MIP 2
+    if (all(equal(groupThreadID.xy % 2u, uvec2(0u, 0u)))) {
+        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
+        float inTR = g_scratchDepths[groupThreadID.x+1u][groupThreadID.y+0u];
+        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+1u];
+        float inBR = g_scratchDepths[groupThreadID.x+1u][groupThreadID.y+1u];
+
+        float dm2 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
+        imageStore(outDepth2, ivec2(baseCoord / 2u), vec4(dm2, 0.0, 0.0, 0.0));
+        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm2;
+    }
+
+    memoryBarrierShared();
+    barrier();
+
+    // MIP 3
+    if (all(equal(groupThreadID.xy % 4u, uvec2(0u, 0u)))) {
+        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
+        float inTR = g_scratchDepths[groupThreadID.x+2u][groupThreadID.y+0u];
+        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+2u];
+        float inBR = g_scratchDepths[groupThreadID.x+2u][groupThreadID.y+2u];
+
+        float dm3 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
+        imageStore(outDepth3, ivec2(baseCoord / 4u), vec4(dm3, 0.0, 0.0, 0.0));
+        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm3;
+    }
+
+    memoryBarrierShared();
+    barrier();
+
+    // MIP 4
+    if (all(equal(groupThreadID.xy % 8u, uvec2(0u, 0u)))) {
+        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
+        float inTR = g_scratchDepths[groupThreadID.x+4u][groupThreadID.y+0u];
+        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+4u];
+        float inBR = g_scratchDepths[groupThreadID.x+4u][groupThreadID.y+4u];
+
+        float dm4 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
+        imageStore(outDepth4, ivec2(baseCoord / 8u), vec4(dm4, 0.0, 0.0, 0.0));
+        // g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4; // commented out as in original
+    }
+}
\ No newline at end of file
diff --git a/shaders/ambient_occlusion/ground_truth/gtaodepthprefilter.comp b/shaders/ambient_occlusion/ground_truth/gtaodepthprefilter.comp
new file mode 100644
index 00000000..747d43d5
--- /dev/null
+++ b/shaders/ambient_occlusion/ground_truth/gtaodepthprefilter.comp
@@ -0,0 +1,5 @@
+#version 460
+// GTAO depth prefilter compute shader. Placeholder: the entry point below has no body yet.
+void main() {
+
+}
diff --git a/shaders/deferredResolve.comp b/shaders/deferredResolve.comp
index 6b5837aa..811c53ed 100644
--- a/shaders/deferredResolve.comp
+++ b/shaders/deferredResolve.comp
@@ -130,7 +130,7 @@ void main() {
         shadowFactor = 1.0f;
     }
 
-    float indirectAttenuation = mix(0.2, 1.0, shadowFactor);
+    float indirectAttenuation = mix(0.5, 1.0, shadowFactor);
 
     // IBL REFLECTIONS
     vec3 irradiance = DiffuseIrradiance(environmentDiffuseAndSpecular, N);
@@ -186,5 +186,8 @@ void main() {
         case 8:
             imageStore(outputImage, screenPos, vec4(vec3(dot(N, L)), 1.0));
             break;
+        case 9:
+        // imageStore(outputImage, screenPos, vec4(vec3(ao), 1.0f));
+            break;
     }
 }
\ No newline at end of file
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index 5a8d7cbb..4100a18b 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -490,7 +490,6 @@ void Engine::draw(float deltaTime)
         environmentMap->getCubemapDescriptorBuffer().getDescriptorBufferBindingInfo(),
         environmentMap->getCubemapDescriptorBuffer().getDescriptorBufferSize() * environmentMapIndex,
     };
-    // todo: make environment pipeline draw to the render targets rather than directly to the draw image!!!
     environmentPipeline->draw(cmd, environmentPipelineDrawInfo);
 
 
diff --git a/src/renderer/environment/environment.cpp b/src/renderer/environment/environment.cpp
index c0345fb9..8942f4f8 100644
--- a/src/renderer/environment/environment.cpp
+++ b/src/renderer/environment/environment.cpp
@@ -215,7 +215,13 @@ will_engine::environment::Environment::Environment(ResourceManager& resourceMana
     {
         lutDescriptorBuffer = resourceManager.createDescriptorBufferSampler(lutLayout, 1);
 
-        lutImage = resourceManager.createImage(LUT_IMAGE_EXTENT, VK_FORMAT_R32G32_SFLOAT, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT);
+        VkImageUsageFlags usage{};
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+
+        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(VK_FORMAT_R32G32_SFLOAT, usage, LUT_IMAGE_EXTENT);
+        lutImage = resourceManager.createImage(imgInfo);
 
         VkDescriptorImageInfo lutDescriptorInfo{};
         lutDescriptorInfo.sampler = nullptr; // not sampled (storage)
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index 5d9376e6..e70f3f3a 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -642,7 +642,7 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                                 gltfPath = IGFD::FileDialog::Instance()->GetFilePathName();
                                 gltfPath = file::getRelativePath(gltfPath);
 
-                                willmodelPath = std::filesystem::current_path() / "assets" / "willmodels" / gltfPath.filename().string();
+                                willmodelPath = gltfPath.parent_path() / gltfPath.filename().string();
                                 willmodelPath = file::getRelativePath(willmodelPath);
                                 willmodelPath.replace_extension(".willmodel");
                             }
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
new file mode 100644
index 00000000..a65394df
--- /dev/null
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -0,0 +1,34 @@
+//
+// Created by William on 2025-03-23.
+//
+
+#ifndef AMBIENT_OCCLUSION_TYPES_H
+#define AMBIENT_OCCLUSION_TYPES_H
+#include <glm/glm.hpp>
+
+namespace will_engine::ambient_occlusion
+{
+static constexpr int32_t DEPTH_PREFILTER_MIP_COUNT = 5; // mip levels of the GTAO prefiltered depth image (one image view is created per level)
+// CPU-side struct whose sizeof() sets the compute push-constant range of the GTAO depth-prefilter pipeline layout. NOTE(review): presumably must mirror a shader-side block - confirm.
+struct GTAOPushConstants
+{
+    glm::vec2 viewportSize;
+    glm::vec2 viewportPixelSize; // presumably 1 / viewportSize - TODO confirm against the shader side
+
+    // AO parameters
+    float radius;
+    float falloff;
+    float strength;
+    float radiusMultiplier;
+
+    // Sampling parameters
+    uint32_t numDirections;
+    uint32_t numSteps;
+
+    // Temporal/filter parameters
+    float temporalWeight;
+    float spatialFilterRadius;
+};
+}
+
+#endif //AMBIENT_OCCLUSION_TYPES_H
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
new file mode 100644
index 00000000..580d1a91
--- /dev/null
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -0,0 +1,119 @@
+//
+// Created by William on 2025-03-23.
+//
+
+#include "ground_truth_ambient_occlusion.h"
+
+#include "src/renderer/renderer_constants.h"
+#include "src/renderer/vk_descriptors.h"
+#include "src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h"
+
+// Builds the Vulkan objects of the depth-prefilter stage; no other GTAO stage is set up here.
+will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruthAmbientOcclusionPipeline(ResourceManager& resourceManager) : resourceManager(resourceManager)
+{
+    // Depth pre-filter stage: set layout, pipeline layout, pipeline, descriptor buffer, then the mip-chained target image.
+    {
+        // Binding 0: sampled depth image. Bindings 1..5: AO depth mips 0..4 as storage images.
+        DescriptorLayoutBuilder prefilterBindings;
+        prefilterBindings.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+        for (int32_t binding = 1; binding <= DEPTH_PREFILTER_MIP_COUNT; ++binding) {
+            prefilterBindings.addBinding(binding, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
+        }
+
+        depthPrefilterSetLayout = resourceManager.createDescriptorSetLayout(prefilterBindings, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+
+        // A single compute-stage push-constant range sized to GTAOPushConstants.
+        VkPushConstantRange gtaoPushRange{};
+        gtaoPushRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+        gtaoPushRange.size = sizeof(GTAOPushConstants);
+        gtaoPushRange.offset = 0;
+
+        // Set 0 is the scene data layout, set 1 the prefilter layout built above.
+        VkDescriptorSetLayout pipelineSets[2]{resourceManager.getSceneDataLayout(), depthPrefilterSetLayout};
+
+        VkPipelineLayoutCreateInfo pipelineLayoutInfo{};
+        pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+        pipelineLayoutInfo.pNext = nullptr;
+        pipelineLayoutInfo.setLayoutCount = 2;
+        pipelineLayoutInfo.pSetLayouts = pipelineSets;
+        pipelineLayoutInfo.pushConstantRangeCount = 1;
+        pipelineLayoutInfo.pPushConstantRanges = &gtaoPushRange;
+
+        depthPrefilterPipelineLayout = resourceManager.createPipelineLayout(pipelineLayoutInfo);
+
+        // Uses depthPrefilterPipelineLayout, so it has to run after the layout exists.
+        createDepthPrefilterPipeline();
+
+        depthPrefilterDescriptorBuffer = resourceManager.createDescriptorBufferSampler(depthPrefilterSetLayout, 1);
+
+        // Render-extent image usable both as a storage image and as a sampled image.
+        VkImageUsageFlags prefilterUsage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
+
+        VkImageCreateInfo prefilterImageInfo = vk_helpers::imageCreateInfo(depthPrefilterFormat, prefilterUsage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+
+        // 5 mips, suggested by Intel's implementation
+        // https://github.com/GameTechDev/XeGTAO
+        prefilterImageInfo.mipLevels = DEPTH_PREFILTER_MIP_COUNT;
+
+        depthPrefilterImage = resourceManager.createImage(prefilterImageInfo);
+
+        // One view per mip level; only baseMipLevel differs between the views.
+        VkImageViewCreateInfo mipViewInfo = vk_helpers::imageviewCreateInfo(depthPrefilterFormat, depthPrefilterImage.image, VK_IMAGE_ASPECT_COLOR_BIT);
+
+        for (int32_t mip = 0; mip < DEPTH_PREFILTER_MIP_COUNT; ++mip) {
+            mipViewInfo.subresourceRange.baseMipLevel = mip;
+            depthPrefilterImageViews[mip] = resourceManager.createImageView(mipViewInfo);
+        }
+    }
+}
+// Releases the prefilter pipeline. NOTE(review): the layouts, image, mip views and descriptor buffer created by the constructor are still not released here; add the matching ResourceManager calls once confirmed.
+will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTruthAmbientOcclusionPipeline()
+{ resourceManager.destroyPipeline(depthPrefilterPipeline); }
+
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(VkSampler depthImageSampler, VkImageView depthImageView)
+{
+    // Entry 0 is the sampled scene depth; it is followed by one storage-image entry per prefilter mip view.
+    std::vector<DescriptorImageData> descriptors{};
+    descriptors.reserve(1 + DEPTH_PREFILTER_MIP_COUNT);
+
+    DescriptorImageData sceneDepth{};
+    sceneDepth.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    sceneDepth.imageInfo = {depthImageSampler, depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL};
+    sceneDepth.bIsPadding = false;
+    descriptors.push_back(sceneDepth);
+
+    for (const VkImageView mipView : depthPrefilterImageViews) {
+        descriptors.push_back(
+            {
+                VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+                {VK_NULL_HANDLE, mipView, VK_IMAGE_LAYOUT_GENERAL}, // no sampler for storage images
+                false
+            }
+        );
+    }
+
+    resourceManager.setupDescriptorBufferSampler(depthPrefilterDescriptorBuffer, descriptors, 0);
+}
+
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createDepthPrefilterPipeline()
+{
+    resourceManager.destroyPipeline(depthPrefilterPipeline); // drop the pipeline from any earlier call before rebuilding
+    // NOTE(review): this patch adds the shader under shaders/ambient_occlusion/ground_truth/ - confirm this path resolves to it.
+    VkShaderModule prefilterShader = resourceManager.createShaderModule("shaders/gtaodepthprefilter.comp");
+
+    VkPipelineShaderStageCreateInfo computeStage{};
+    computeStage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    computeStage.pNext = nullptr;
+    computeStage.pName = "main";
+    computeStage.module = prefilterShader;
+    computeStage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkComputePipelineCreateInfo createInfo{};
+    createInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    createInfo.pNext = nullptr;
+    createInfo.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT;
+    createInfo.stage = computeStage;
+    createInfo.layout = depthPrefilterPipelineLayout;
+    depthPrefilterPipeline = resourceManager.createComputePipeline(createInfo);
+    resourceManager.destroyShaderModule(prefilterShader); // the module is released right after pipeline creation
+}
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
new file mode 100644
index 00000000..25a78d3b
--- /dev/null
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -0,0 +1,56 @@
+//
+// Created by William on 2025-03-23.
+//
+
+#ifndef GROUND_TRUTH_AMBIENT_OCCLUSION_H
+#define GROUND_TRUTH_AMBIENT_OCCLUSION_H
+
+#include <array>
+#include <volk/volk.h>
+
+#include "src/renderer/resource_manager.h"
+#include "src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h"
+
+
+namespace will_engine::ambient_occlusion
+{
+// Holds the Vulkan objects for the GTAO passes and keeps a reference to the ResourceManager it is given (which must outlive it).
+class GroundTruthAmbientOcclusionPipeline
+{
+public:
+    explicit GroundTruthAmbientOcclusionPipeline(ResourceManager& resourceManager); // explicit: no implicit ResourceManager -> pipeline conversion
+    ~GroundTruthAmbientOcclusionPipeline();
+
+    // Fills the depth-prefilter descriptor buffer from the given depth sampler/view and the per-mip storage views.
+    void setupDepthPrefilterDescriptorBuffer(VkSampler depthImageSampler, VkImageView depthImageView);
+
+private:
+    // (Re)builds depthPrefilterPipeline; any previously created pipeline handle is destroyed first.
+    void createDepthPrefilterPipeline();
+
+private:
+    VkDescriptorSetLayout depthPrefilterSetLayout{VK_NULL_HANDLE};
+    VkPipelineLayout depthPrefilterPipelineLayout{VK_NULL_HANDLE};
+    VkPipeline depthPrefilterPipeline{VK_NULL_HANDLE};
+
+    // 16-bit vs 32-bit float storage: cost still to be measured.
+    VkFormat depthPrefilterFormat{VK_FORMAT_R16_SFLOAT};
+    AllocatedImage depthPrefilterImage{VK_NULL_HANDLE};
+    std::array<VkImageView, DEPTH_PREFILTER_MIP_COUNT> depthPrefilterImageViews{}; // one view per mip level
+    DescriptorBufferSampler depthPrefilterDescriptorBuffer;
+
+    // NOTE(review): the handles below are only declared; nothing visible in this patch creates them yet.
+    VkPipelineLayout ambientOcclusionPipelineLayout{VK_NULL_HANDLE};
+    VkPipeline ambientOcclusionPipeline{VK_NULL_HANDLE};
+    VkPipelineLayout spatialFilteringPipelineLayout{VK_NULL_HANDLE};
+    VkPipeline spatialFilteringPipeline{VK_NULL_HANDLE};
+    VkPipelineLayout temporalAccumulationPipelineLayout{VK_NULL_HANDLE};
+    VkPipeline temporalAccumulationPipeline{VK_NULL_HANDLE};
+
+private:
+    ResourceManager& resourceManager;
+};
+}
+
+
+#endif //GROUND_TRUTH_AMBIENT_OCCLUSION_H
diff --git a/src/renderer/lighting/shadows/cascaded_shadow_map.cpp b/src/renderer/lighting/shadows/cascaded_shadow_map.cpp
index 0627dd41..3bd9b091 100644
--- a/src/renderer/lighting/shadows/cascaded_shadow_map.cpp
+++ b/src/renderer/lighting/shadows/cascaded_shadow_map.cpp
@@ -159,8 +159,14 @@ will_engine::cascaded_shadows::CascadedShadowMap::CascadedShadowMap(ResourceMana
     sampler = resourceManager.createSampler(samplerCreateInfo);
 
     for (CascadeShadowMapData& cascadeShadowMapData : shadowMaps) {
-        cascadeShadowMapData.depthShadowMap = resourceManager.createImage({shadows::CASCADE_WIDTH, shadows::CASCADE_HEIGHT, 1}, shadows::CASCADE_DEPTH_FORMAT,
-                                                                          VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT);
+
+        VkImageUsageFlags usage{};
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+
+        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(shadows::CASCADE_DEPTH_FORMAT, usage, {shadows::CASCADE_WIDTH, shadows::CASCADE_HEIGHT, 1});
+        cascadeShadowMapData.depthShadowMap = resourceManager.createImage(imgInfo);
     }
 
     //
diff --git a/src/renderer/resource_manager.cpp b/src/renderer/resource_manager.cpp
index 3943a1e0..7767e8dc 100644
--- a/src/renderer/resource_manager.cpp
+++ b/src/renderer/resource_manager.cpp
@@ -308,6 +308,31 @@ VkSampler will_engine::ResourceManager::createSampler(const VkSamplerCreateInfo&
     return newSampler;
 }
 
+// Creates a device-local image from a caller-filled VkImageCreateInfo, plus a default view covering all of its mip levels.
+AllocatedImage will_engine::ResourceManager::createImage(const VkImageCreateInfo& createInfo) const
+{
+    AllocatedImage newImage{};
+    newImage.imageFormat = createInfo.format;
+    newImage.imageExtent = createInfo.extent;
+
+    VmaAllocationCreateInfo allocInfo = {};
+    allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;
+    allocInfo.requiredFlags = static_cast<VkMemoryPropertyFlags>(VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+    VK_CHECK(vmaCreateImage(context.allocator, &createInfo, &allocInfo, &newImage.image, &newImage.allocation, nullptr));
+
+    // Every depth-carrying format needs the depth aspect, not only D32_SFLOAT; a colour aspect on those is invalid.
+    const VkFormat fmt = createInfo.format;
+    const bool hasDepth = fmt == VK_FORMAT_D16_UNORM || fmt == VK_FORMAT_X8_D24_UNORM_PACK32 || fmt == VK_FORMAT_D32_SFLOAT || fmt == VK_FORMAT_D16_UNORM_S8_UINT || fmt == VK_FORMAT_D24_UNORM_S8_UINT || fmt == VK_FORMAT_D32_SFLOAT_S8_UINT;
+    const VkImageAspectFlags aspectFlag = hasDepth ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
+
+    VkImageViewCreateInfo view_info = vk_helpers::imageviewCreateInfo(createInfo.format, newImage.image, aspectFlag);
+    view_info.subresourceRange.levelCount = createInfo.mipLevels; // the default view spans the whole mip chain
+    VK_CHECK(vkCreateImageView(context.device, &view_info, nullptr, &newImage.imageView));
+
+    return newImage;
+}
+
 AllocatedImage will_engine::ResourceManager::createImage(const VkExtent3D size, const VkFormat format, const VkImageUsageFlags usage, const bool mipmapped) const
 {
     AllocatedImage newImage{};
@@ -416,7 +441,7 @@ will_engine::DescriptorBufferSampler will_engine::ResourceManager::createDescrip
     return DescriptorBufferSampler(context, layout, maxObjectCount);
 }
 
-int32_t will_engine::ResourceManager::setupDescriptorBufferSampler(DescriptorBufferSampler& descriptorBuffer, const std::vector<will_engine::DescriptorImageData>& imageBuffers, const int index) const
+int32_t will_engine::ResourceManager::setupDescriptorBufferSampler(DescriptorBufferSampler& descriptorBuffer, const std::vector<DescriptorImageData>& imageBuffers, const int index) const
 {
     return descriptorBuffer.setupData(context.device, imageBuffers, index);
 }
@@ -426,7 +451,7 @@ will_engine::DescriptorBufferUniform will_engine::ResourceManager::createDescrip
     return DescriptorBufferUniform(context, layout, maxObjectCount);
 }
 
-int32_t will_engine::ResourceManager::setupDescriptorBufferUniform(DescriptorBufferUniform& descriptorBuffer, const std::vector<will_engine::DescriptorUniformData>& uniformBuffers, const int index) const
+int32_t will_engine::ResourceManager::setupDescriptorBufferUniform(DescriptorBufferUniform& descriptorBuffer, const std::vector<DescriptorUniformData>& uniformBuffers, const int index) const
 {
     return descriptorBuffer.setupData(context.device, uniformBuffers, index);
 }
diff --git a/src/renderer/resource_manager.h b/src/renderer/resource_manager.h
index 705011c0..d26134af 100644
--- a/src/renderer/resource_manager.h
+++ b/src/renderer/resource_manager.h
@@ -58,6 +58,8 @@ class ResourceManager
     void destroySampler(const VkSampler& sampler) const;
 
 
+    [[nodiscard]] AllocatedImage createImage(const VkImageCreateInfo& createInfo) const;
+
     [[nodiscard]] AllocatedImage createImage(VkExtent3D size, VkFormat format, VkImageUsageFlags usage, bool mipmapped = false) const;
 
     [[nodiscard]] AllocatedImage createImage(const void* data, size_t dataSize, VkExtent3D size, VkFormat format, VkImageUsageFlags usage, bool mipmapped = false) const;

From 103eb9c012fb5c18f70c42141beed8cfe2b301f5 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sun, 23 Mar 2025 16:15:47 +0700
Subject: [PATCH 02/27] Improved scene data buffer. Minor improvements

---
 assets/maps/sampleScene.willmap               | 28 +++++++++----------
 shaders/include/scene.glsl                    |  2 ++
 shaders/taa.comp                              | 23 +++++++--------
 src/core/engine.cpp                           |  1 +
 src/renderer/imgui_wrapper.cpp                | 20 ++++++-------
 .../temporal_antialiasing_pipeline.cpp        |  3 --
 .../temporal_antialiasing_pipeline.h          |  3 --
 src/renderer/resource_manager.cpp             |  7 ++++-
 src/renderer/resource_manager.h               |  2 +-
 src/renderer/vk_types.h                       |  1 +
 10 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/assets/maps/sampleScene.willmap b/assets/maps/sampleScene.willmap
index 7cf48529..cb7e5575 100644
--- a/assets/maps/sampleScene.willmap
+++ b/assets/maps/sampleScene.willmap
@@ -6,7 +6,7 @@
     },
     "metadata": {
         "name": "sampleScene",
-        "created": "2025-03-22 17:23:20",
+        "created": "2025-03-23 16:02:52",
         "formatVersion": 1
     },
     "rootComponents": {
@@ -34,9 +34,9 @@
                         "y": 50.0
                     },
                     "baseColor": {
-                        "x": 0.5343137383460999,
-                        "y": 0.5343137383460999,
-                        "z": 0.5343137383460999,
+                        "x": 0.8725489974021912,
+                        "y": 0.8725489974021912,
+                        "z": 0.8725489974021912,
                         "w": 1.0
                     }
                 },
@@ -109,9 +109,9 @@
                             "y": 50.0
                         },
                         "baseColor": {
-                            "x": 0.5343137383460999,
-                            "y": 0.5343137383460999,
-                            "z": 0.5343137383460999,
+                            "x": 0.8725489974021912,
+                            "y": 0.8725489974021912,
+                            "z": 0.8725489974021912,
                             "w": 1.0
                         }
                     },
@@ -280,15 +280,15 @@
                 "name": "Cube\u0000ing Orb",
                 "transform": {
                     "position": {
-                        "x": 13.52898120880127,
-                        "y": 21.364240646362305,
-                        "z": 37.19432067871094
+                        "x": 13.530956268310547,
+                        "y": 21.363996505737305,
+                        "z": 37.1947135925293
                     },
                     "rotation": {
-                        "x": 0.19284461438655853,
-                        "y": 0.2026415914297104,
-                        "z": 0.6591105461120605,
-                        "w": 0.6980835795402527
+                        "x": 0.19317233562469482,
+                        "y": 0.2023322433233261,
+                        "z": 0.6584249138832092,
+                        "w": 0.698729395866394
                     },
                     "scale": {
                         "x": 1.0,
diff --git a/shaders/include/scene.glsl b/shaders/include/scene.glsl
index f511e143..9da8db70 100644
--- a/shaders/include/scene.glsl
+++ b/shaders/include/scene.glsl
@@ -25,5 +25,7 @@ layout (std140, set = 0, binding = 0) uniform SceneData {
     vec4 jitter;
 
     vec2 renderTargetSize;
+    // equal to 1 / renderTargetSize
+    vec2 texelSize;
     float deltaTime;
 } sceneData;
\ No newline at end of file
diff --git a/shaders/taa.comp b/shaders/taa.comp
index ea5c6b25..f0027bdf 100644
--- a/shaders/taa.comp
+++ b/shaders/taa.comp
@@ -15,9 +15,6 @@ layout (set = 1, binding = 3) uniform sampler2D velocityBuffer;
 layout (rgba16f, set = 1, binding = 4) uniform image2D outputImage;
 
 layout (push_constant) uniform PushConstants {
-    vec2 texelSize;
-    int width;
-    int height;
     float blendValue;
     int debug;
 } push;
@@ -42,8 +39,8 @@ vec3 clipAABB(vec3 cMin, vec3 cMax, vec3 cAvg, vec3 color) {
 
 vec3 findClosestFragment3x3(vec2 uv)
 {
-    vec2 du = vec2(push.texelSize.x);
-    vec2 dv = vec2(push.texelSize.y);
+    vec2 du = vec2(sceneData.texelSize.x);
+    vec2 dv = vec2(sceneData.texelSize.y);
 
     vec3 topL = vec3(-1, -1, texture(depthImage, uv - dv - du).x);
     vec3 topM = vec3(0, -1, texture(depthImage, uv - dv).x);
@@ -68,15 +65,15 @@ vec3 findClosestFragment3x3(vec2 uv)
     if (botM.z > dMin.z) dMin = botM;
     if (botR.z > dMin.z) dMin = botR;
 
-    return vec3(uv + push.texelSize * dMin.xy, dMin.z);
+    return vec3(uv + sceneData.texelSize * dMin.xy, dMin.z);
 }
 
 vec3 temporalReprojection(vec2 uv, vec2 velocity) {
     vec3 currentSample = RGBToYCoCg(texture(drawImage, uv).rgb);
     vec3 historySample = RGBToYCoCg(texture(drawHistory, uv - velocity).rgb);
 
-    vec2 du = vec2(push.texelSize.x);
-    vec2 dv = vec2(push.texelSize.y);
+    vec2 du = vec2(sceneData.texelSize.x);
+    vec2 dv = vec2(sceneData.texelSize.y);
 
     // MINMAX_3X3_ROUNDED
     vec3 topL = RGBToYCoCg(texture(drawImage, uv - dv - du).rgb);
@@ -138,7 +135,7 @@ vec3 varianceClipping(vec3 color, vec2 uv) {
 
     for (int y = -1; y <= 1; y++) {
         for (int x = -1; x <= 1; x++) {
-            vec2 offset = vec2(x, y) * push.texelSize;
+            vec2 offset = vec2(x, y) * sceneData.texelSize;
             vec3 sampleRGB = texture(drawImage, uv + offset).rgb;
             vec3 sampleYCoCg = RGBToYCoCg(sampleRGB);
 
@@ -166,8 +163,8 @@ vec3 varianceClipping(vec3 color, vec2 uv) {
 }
 
 vec3 neighborhoodClamping(vec3 historyColor, vec2 uv) {
-    vec2 du = vec2(push.texelSize.x);
-    vec2 dv = vec2(push.texelSize.y);
+    vec2 du = vec2(sceneData.texelSize.x);
+    vec2 dv = vec2(sceneData.texelSize.y);
 
     // Sample current frame neighborhood
     vec3 topL = texture(drawImage, uv - dv - du).rgb;
@@ -192,10 +189,10 @@ vec3 neighborhoodClamping(vec3 historyColor, vec2 uv) {
 
 void main() {
     ivec2 pixel = ivec2(gl_GlobalInvocationID.xy);
-    if (pixel.x >= push.width || pixel.y >= push.height) {
+    if (pixel.x >= sceneData.renderTargetSize.x || pixel.y >= sceneData.renderTargetSize.y) {
         return;
     }
-    vec2 uv = (vec2(pixel) + 0.5) * push.texelSize;
+    vec2 uv = (vec2(pixel) + 0.5) * sceneData.texelSize;
 
     if (push.debug == 1) {
         // disabled
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index 4100a18b..de85f276 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -389,6 +389,7 @@ void Engine::updateRender(const float deltaTime, const int32_t currentFrameOverl
 
 
     pSceneData->renderTargetSize = {RENDER_EXTENT_WIDTH, RENDER_EXTENT_HEIGHT};
+    pSceneData->texelSize = {1.0f / RENDER_EXTENT_WIDTH, 1.0f / RENDER_EXTENT_HEIGHT};
     pSceneData->deltaTime = deltaTime;
 
 
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index e70f3f3a..4178d9f1 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -933,19 +933,17 @@ void ImguiWrapper::drawSceneGraph(Engine* engine)
         if (ImGui::BeginTabItem("Terrain")) {
             ImGui::Checkbox("Draw Vertex Lines Only", &engine->bDrawTerrainLines);
 
-            if (ImGui::BeginTabBar("Terrain Tab Bar")) {
-                auto currentTerrainComponent = selectedMap->getComponent<components::TerrainComponent>();
+            const auto currentTerrainComponent = selectedMap->getComponent<components::TerrainComponent>();
+            ImGui::BeginDisabled(!currentTerrainComponent);
+            if (ImGui::Button("Save Terrain as HeightMap")) {
+                const std::vector<float> heightmapData = currentTerrainComponent->getHeightMapData();
+                const std::filesystem::path path = file::imagesSavePath / "TerrainHeightMap.png";
+                vk_helpers::saveHeightmap(heightmapData, NOISE_MAP_DIMENSIONS, NOISE_MAP_DIMENSIONS, path);
+            }
+            ImGui::EndDisabled();
 
+            if (ImGui::BeginTabBar("Terrain Tab Bar")) {
                 if (ImGui::BeginTabItem("Terrain Generation")) {
-                    ImGui::BeginDisabled(!currentTerrainComponent);
-                    if (ImGui::Button("Save Terrain as HeightMap")) {
-                        std::vector<float> heightmapData = currentTerrainComponent->getHeightMapData();
-                        std::filesystem::path path = file::imagesSavePath / "TerrainHeightMap.png";
-                        vk_helpers::saveHeightmap(heightmapData, NOISE_MAP_DIMENSIONS, NOISE_MAP_DIMENSIONS, path);
-                    }
-                    ImGui::EndDisabled();
-
-
                     ImGui::Separator();
 
                     if (ImGui::CollapsingHeader("Noise Settings", ImGuiTreeNodeFlags_DefaultOpen)) {
diff --git a/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.cpp b/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.cpp
index e682da01..3b188961 100644
--- a/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.cpp
+++ b/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.cpp
@@ -105,9 +105,6 @@ void will_engine::temporal_antialiasing_pipeline::TemporalAntialiasingPipeline::
     vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
 
     TemporalAntialiasingPushConstants properties{};;
-    properties.width = RENDER_EXTENT_WIDTH;
-    properties.height = RENDER_EXTENT_HEIGHT;
-    properties.texelSize = {1.0f / properties.width, 1.0f / properties.height};
     properties.blendValue = drawInfo.blendValue;
     properties.taaDebug = drawInfo.debugMode;
 
diff --git a/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.h b/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.h
index 0f8a7400..e3541bbc 100644
--- a/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.h
+++ b/src/renderer/pipelines/temporal_antialiasing_pipeline/temporal_antialiasing_pipeline.h
@@ -11,9 +11,6 @@ namespace will_engine::temporal_antialiasing_pipeline
 {
 struct TemporalAntialiasingPushConstants
 {
-    glm::vec2 texelSize;
-    int32_t width;
-    int32_t height;
     float blendValue;
     int32_t taaDebug;
 };
diff --git a/src/renderer/resource_manager.cpp b/src/renderer/resource_manager.cpp
index 7767e8dc..14f2b7a5 100644
--- a/src/renderer/resource_manager.cpp
+++ b/src/renderer/resource_manager.cpp
@@ -425,10 +425,15 @@ AllocatedImage will_engine::ResourceManager::createCubemap(const VkExtent3D size
     return newImage;
 }
 
-void will_engine::ResourceManager::destroyImage(const AllocatedImage& img) const
+void will_engine::ResourceManager::destroyImage(AllocatedImage& img) const
 {
     vkDestroyImageView(context.device, img.imageView, nullptr);
     vmaDestroyImage(context.allocator, img.image, img.allocation);
+    img.image = VK_NULL_HANDLE;
+    img.imageView = VK_NULL_HANDLE;
+    img.allocation = VK_NULL_HANDLE;
+    img.imageExtent = {};
+    img.imageFormat = {};
 }
 
 void will_engine::ResourceManager::destroySampler(const VkSampler& sampler) const
diff --git a/src/renderer/resource_manager.h b/src/renderer/resource_manager.h
index d26134af..99f392cd 100644
--- a/src/renderer/resource_manager.h
+++ b/src/renderer/resource_manager.h
@@ -66,7 +66,7 @@ class ResourceManager
 
     [[nodiscard]] AllocatedImage createCubemap(VkExtent3D size, VkFormat format, VkImageUsageFlags usage, bool mipmapped = false) const;
 
-    void destroyImage(const AllocatedImage& img) const;
+    void destroyImage(AllocatedImage& img) const;
 
 
     [[nodiscard]] DescriptorBufferSampler createDescriptorBufferSampler(VkDescriptorSetLayout layout, int32_t maxObjectCount) const;
diff --git a/src/renderer/vk_types.h b/src/renderer/vk_types.h
index 7e82567f..634e270a 100644
--- a/src/renderer/vk_types.h
+++ b/src/renderer/vk_types.h
@@ -71,6 +71,7 @@ struct SceneData
     glm::vec4 jitter{0.0f};
 
     glm::vec2 renderTargetSize{};
+    glm::vec2 texelSize{};
     float deltaTime{};
 };
 

From 14bedb80cf528d73af1ff42057b02c30e15363d6 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sun, 23 Mar 2025 17:37:12 +0700
Subject: [PATCH 03/27] GTAO main ambient occlusion calculation pipeline.

---
 ...efilter.comp => gtao_depth_prefilter.comp} |   0
 .../ambient_occlusion_types.h                 |   7 +
 .../ground_truth_ambient_occlusion.cpp        | 241 +++++++++++++++++-
 .../ground_truth_ambient_occlusion.h          |  23 +-
 src/renderer/resource_manager.cpp             |   3 +-
 src/renderer/resource_manager.h               |   2 +-
 6 files changed, 264 insertions(+), 12 deletions(-)
 rename shaders/ambient_occlusion/ground_truth/{gtaodepthprefilter.comp => gtao_depth_prefilter.comp} (100%)

diff --git a/shaders/ambient_occlusion/ground_truth/gtaodepthprefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
similarity index 100%
rename from shaders/ambient_occlusion/ground_truth/gtaodepthprefilter.comp
rename to shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index a65394df..f9d7f0cd 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -29,6 +29,13 @@ struct GTAOPushConstants
     float temporalWeight;
     float spatialFilterRadius;
 };
+
+struct GTAODrawInfo
+{
+    GTAOPushConstants pushConstants{};
+    VkDescriptorBufferBindingInfoEXT sceneDataBinding{};
+    VkDeviceSize sceneDataOffset{0};
+};
 }
 
 #endif //AMBIENT_OCCLUSION_TYPES_H
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 580d1a91..be502b76 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -42,6 +42,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         depthPrefilterPipelineLayout = resourceManager.createPipelineLayout(layoutInfo);
 
         createDepthPrefilterPipeline();
+        // The AO pipeline is created in the "AO Calculation" section below, once ambientOcclusionPipelineLayout exists.
 
         depthPrefilterDescriptorBuffer = resourceManager.createDescriptorBufferSampler(depthPrefilterSetLayout, 1);
 
@@ -51,26 +52,138 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
 
         VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(depthPrefilterFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
-
         // 5 mips, suggested by Intel's implementation
         // https://github.com/GameTechDev/XeGTAO
         imgInfo.mipLevels = DEPTH_PREFILTER_MIP_COUNT;
-
         depthPrefilterImage = resourceManager.createImage(imgInfo);
-
         VkImageViewCreateInfo viewInfo = vk_helpers::imageviewCreateInfo(depthPrefilterFormat, depthPrefilterImage.image, VK_IMAGE_ASPECT_COLOR_BIT);
 
         for (int32_t i = 0; i < DEPTH_PREFILTER_MIP_COUNT; ++i) {
             viewInfo.subresourceRange.baseMipLevel = i;
             depthPrefilterImageViews[i] = resourceManager.createImageView(viewInfo);
         }
+
+        VkSamplerCreateInfo samplerInfo = {.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO};
+        samplerInfo.magFilter = VK_FILTER_NEAREST;
+        samplerInfo.minFilter = VK_FILTER_NEAREST;
+        samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+        samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        samplerInfo.anisotropyEnable = VK_FALSE;
+        samplerInfo.maxAnisotropy = 1.0f;
+        samplerInfo.compareEnable = VK_FALSE;
+        samplerInfo.minLod = 0.0f;
+        samplerInfo.maxLod = 0.0f;
+
+        depthPrefilterSampler = resourceManager.createSampler(samplerInfo);
+    }
+
+    // AO Calculation
+    {
+        DescriptorLayoutBuilder layoutBuilder;
+        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // pre-filtered depth
+        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // normal buffer
+        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao output
+
+        ambientOcclusionSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+
+        VkPushConstantRange pushConstants{};
+        pushConstants.offset = 0;
+        pushConstants.size = sizeof(GTAOPushConstants);
+        pushConstants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+        VkDescriptorSetLayout setLayouts[2];
+        setLayouts[0] = resourceManager.getSceneDataLayout();
+        setLayouts[1] = ambientOcclusionSetLayout;
+
+        VkPipelineLayoutCreateInfo layoutInfo{};
+        layoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+        layoutInfo.pNext = nullptr;
+        layoutInfo.pSetLayouts = setLayouts;
+        layoutInfo.setLayoutCount = 2;
+        layoutInfo.pPushConstantRanges = &pushConstants;
+        layoutInfo.pushConstantRangeCount = 1;
+
+        ambientOcclusionPipelineLayout = resourceManager.createPipelineLayout(layoutInfo);
+        createAmbientOcclusionPipeline();
+
+        ambientOcclusionDescriptorBuffer = resourceManager.createDescriptorBufferSampler(ambientOcclusionSetLayout, 1);
+
+        VkImageUsageFlags usage{};
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+
+        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+        ambientOcclusionImage = resourceManager.createImage(imgInfo);
+
+        // Depth Mip sampler
+        {
+            VkSamplerCreateInfo samplerInfo = {.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO};
+            samplerInfo.magFilter = VK_FILTER_NEAREST;
+            samplerInfo.minFilter = VK_FILTER_NEAREST;
+            samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+            samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+            samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+            samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+            samplerInfo.anisotropyEnable = VK_FALSE;
+            samplerInfo.maxAnisotropy = 1.0f;
+            samplerInfo.compareEnable = VK_FALSE;
+            samplerInfo.minLod = 0.0f;
+            samplerInfo.maxLod = DEPTH_PREFILTER_MIP_COUNT - 1;
+
+            ambientOcclusionDepthSampler = resourceManager.createSampler(samplerInfo);
+        }
+
+        // Normals sampler
+        {
+            VkSamplerCreateInfo samplerInfo = {.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO};
+            samplerInfo.magFilter = VK_FILTER_NEAREST;
+            samplerInfo.minFilter = VK_FILTER_NEAREST;
+            samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+            samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+            samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+            samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+            samplerInfo.anisotropyEnable = VK_FALSE;
+            samplerInfo.maxAnisotropy = 1.0f;
+            samplerInfo.compareEnable = VK_FALSE;
+            samplerInfo.minLod = 0.0f;
+            samplerInfo.maxLod = 0.0f;
+
+            ambientOcclusionNormalsSampler = resourceManager.createSampler(samplerInfo);
+        }
     }
 }
 
 will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTruthAmbientOcclusionPipeline()
-{}
+{
+    // Depth Prefilter Resources
+    resourceManager.destroyDescriptorSetLayout(depthPrefilterSetLayout);
+    resourceManager.destroyPipelineLayout(depthPrefilterPipelineLayout);
+    resourceManager.destroyPipeline(depthPrefilterPipeline);
+
+    for (int32_t i = 0; i < DEPTH_PREFILTER_MIP_COUNT; ++i) {
+        resourceManager.destroyImageView(depthPrefilterImageViews[i]);
+    }
+
+    resourceManager.destroyImage(depthPrefilterImage);
+    resourceManager.destroySampler(depthPrefilterSampler);
+
+    resourceManager.destroyDescriptorBuffer(depthPrefilterDescriptorBuffer);
+
+    // AO Resources
+    resourceManager.destroyDescriptorSetLayout(ambientOcclusionSetLayout);
+    resourceManager.destroyPipelineLayout(ambientOcclusionPipelineLayout);
+    resourceManager.destroyPipeline(ambientOcclusionPipeline);
 
-void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(VkSampler depthImageSampler, VkImageView depthImageView)
+    resourceManager.destroySampler(ambientOcclusionDepthSampler);
+    resourceManager.destroySampler(ambientOcclusionNormalsSampler);
+    resourceManager.destroyImage(ambientOcclusionImage);
+
+    resourceManager.destroyDescriptorBuffer(ambientOcclusionDescriptorBuffer);
+}
+
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(const VkImageView depthImageView)
 {
     std::vector<DescriptorImageData> imageDescriptors{};
     imageDescriptors.reserve(1 + DEPTH_PREFILTER_MIP_COUNT);
@@ -78,7 +191,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupD
     imageDescriptors.push_back(
         {
             VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            {depthImageSampler, depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            {depthPrefilterSampler, depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
             false
         }
     );
@@ -95,10 +208,101 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupD
     resourceManager.setupDescriptorBufferSampler(depthPrefilterDescriptorBuffer, imageDescriptors, 0);
 }
 
+// Writes the AO pass descriptors into ambientOcclusionDescriptorBuffer: prefiltered depth, normals, AO output image.
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView)
+{
+    std::vector<DescriptorImageData> imageDescriptors{};
+    imageDescriptors.reserve(3);
+
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+        {ambientOcclusionDepthSampler, depthPrefilterImage.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+        false
+    });
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+        {ambientOcclusionNormalsSampler, normalsImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+        false
+    });
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+        {VK_NULL_HANDLE, ambientOcclusionImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+        false
+    });
+    // Upload the descriptors; without this call the vector above is discarded and the buffer stays unwritten.
+    resourceManager.setupDescriptorBufferSampler(ambientOcclusionDescriptorBuffer, imageDescriptors, 0);
+}
+
+// Records the GTAO passes into cmd: the depth prefilter dispatch followed by the AO dispatch, with the image layout
+// transitions each one needs. Both passes are compute-only, so no dynamic-rendering scope is begun or ended here.
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo)
+{
+    VkDebugUtilsLabelEXT label{};
+    label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
+    label.pLabelName = "GT Ambient Occlusion";
+    vkCmdBeginDebugUtilsLabelEXT(cmd, &label);
+
+    // Old layout is UNDEFINED: whatever the prefiltered depth image held before this frame is not preserved.
+    vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+
+    // Depth Prefilter
+    {
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipeline);
+        vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
+
+        VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
+        bindingInfos[0] = drawInfo.sceneDataBinding;
+        bindingInfos[1] = depthPrefilterDescriptorBuffer.getDescriptorBufferBindingInfo();
+        vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
+
+        constexpr VkDeviceSize zeroOffset{0};
+        constexpr uint32_t sceneDataIndex{0};
+        constexpr uint32_t descriptorIndex{1};
+
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 1, &sceneDataIndex, &drawInfo.sceneDataOffset);
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 1, 1, &descriptorIndex, &zeroOffset);
+
+        // Group counts round up so partial edge tiles are covered; assumes a 16x16 local size in the shader - TODO confirm.
+        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        vkCmdDispatch(cmd, x, y, 1);
+    }
+
+    vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+
+    // Ambient Occlusion
+    {
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
+        vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
+
+        VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
+        bindingInfos[0] = drawInfo.sceneDataBinding;
+        bindingInfos[1] = ambientOcclusionDescriptorBuffer.getDescriptorBufferBindingInfo();
+        vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
+
+        constexpr VkDeviceSize zeroOffset{0};
+        constexpr uint32_t sceneDataIndex{0};
+        constexpr uint32_t descriptorIndex{1};
+
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 0, 1, &sceneDataIndex, &drawInfo.sceneDataOffset);
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 1, 1, &descriptorIndex, &zeroOffset);
+
+        // Group counts round up so partial edge tiles are covered; assumes a 16x16 local size in the shader - TODO confirm.
+        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        vkCmdDispatch(cmd, x, y, 1);
+    }
+
+    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+
+    vkCmdEndDebugUtilsLabelEXT(cmd);
+}
+
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createDepthPrefilterPipeline()
 {
     resourceManager.destroyPipeline(depthPrefilterPipeline);
-    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/gtaodepthprefilter.comp");
+    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/gtao_depth_prefilter.comp");
 
     VkPipelineShaderStageCreateInfo stageInfo{};
     stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
@@ -117,3 +321,26 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::create
     depthPrefilterPipeline = resourceManager.createComputePipeline(pipelineInfo);
     resourceManager.destroyShaderModule(computeShader);
 }
+
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createAmbientOcclusionPipeline()
+{
+    resourceManager.destroyPipeline(ambientOcclusionPipeline);
+    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/gtao_ambient_occlusion.comp");
+
+    VkPipelineShaderStageCreateInfo stageInfo{};
+    stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stageInfo.pNext = nullptr;
+    stageInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stageInfo.module = computeShader;
+    stageInfo.pName = "main";
+
+    VkComputePipelineCreateInfo pipelineInfo{};
+    pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipelineInfo.pNext = nullptr;
+    pipelineInfo.layout = ambientOcclusionPipelineLayout;
+    pipelineInfo.stage = stageInfo;
+    pipelineInfo.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT;
+
+    ambientOcclusionPipeline = resourceManager.createComputePipeline(pipelineInfo);
+    resourceManager.destroyShaderModule(computeShader);
+}
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index 25a78d3b..4abba7b3 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -17,20 +17,26 @@ namespace will_engine::ambient_occlusion
 class GroundTruthAmbientOcclusionPipeline
 {
 public:
-    GroundTruthAmbientOcclusionPipeline(ResourceManager& resourceManager);
+    explicit GroundTruthAmbientOcclusionPipeline(ResourceManager& resourceManager);
 
     ~GroundTruthAmbientOcclusionPipeline();
 
-    void setupDepthPrefilterDescriptorBuffer(VkSampler depthImageSampler, VkImageView depthImageView);
+    void setupDepthPrefilterDescriptorBuffer(VkImageView depthImageView);
+
+    void setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView);
+
+    void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo);
 
 private:
     void createDepthPrefilterPipeline();
+    void createAmbientOcclusionPipeline();
 
-private:
+private: // Depth prefilter
     VkDescriptorSetLayout depthPrefilterSetLayout{VK_NULL_HANDLE};
     VkPipelineLayout depthPrefilterPipelineLayout{VK_NULL_HANDLE};
     VkPipeline depthPrefilterPipeline{VK_NULL_HANDLE};
 
+    VkSampler depthPrefilterSampler{VK_NULL_HANDLE};
     // 16 vs 32. look at cost later.
     VkFormat depthPrefilterFormat{VK_FORMAT_R16_SFLOAT};
     AllocatedImage depthPrefilterImage{VK_NULL_HANDLE};
@@ -38,9 +44,20 @@ class GroundTruthAmbientOcclusionPipeline
 
     DescriptorBufferSampler depthPrefilterDescriptorBuffer;
 
+private: // ao
+    VkDescriptorSetLayout ambientOcclusionSetLayout{VK_NULL_HANDLE};
     VkPipelineLayout ambientOcclusionPipelineLayout{VK_NULL_HANDLE};
     VkPipeline ambientOcclusionPipeline{VK_NULL_HANDLE};
 
+    VkSampler ambientOcclusionDepthSampler{VK_NULL_HANDLE};
+    VkSampler ambientOcclusionNormalsSampler{VK_NULL_HANDLE};
+    VkFormat ambientOcclusionFormat{VK_FORMAT_R8_UNORM};
+    AllocatedImage ambientOcclusionImage{VK_NULL_HANDLE};
+
+    DescriptorBufferSampler ambientOcclusionDescriptorBuffer;
+
+private: //
+
     VkPipelineLayout spatialFilteringPipelineLayout{VK_NULL_HANDLE};
     VkPipeline spatialFilteringPipeline{VK_NULL_HANDLE};
 
diff --git a/src/renderer/resource_manager.cpp b/src/renderer/resource_manager.cpp
index 14f2b7a5..de3c8bfc 100644
--- a/src/renderer/resource_manager.cpp
+++ b/src/renderer/resource_manager.cpp
@@ -436,9 +436,10 @@ void will_engine::ResourceManager::destroyImage(AllocatedImage& img) const
     img.imageFormat = {};
 }
 
-void will_engine::ResourceManager::destroySampler(const VkSampler& sampler) const
+void will_engine::ResourceManager::destroySampler(VkSampler& sampler) const
 {
     vkDestroySampler(context.device, sampler, nullptr);
+    sampler = VK_NULL_HANDLE;
 }
 
 will_engine::DescriptorBufferSampler will_engine::ResourceManager::createDescriptorBufferSampler(VkDescriptorSetLayout layout, int32_t maxObjectCount) const
diff --git a/src/renderer/resource_manager.h b/src/renderer/resource_manager.h
index 99f392cd..c767589a 100644
--- a/src/renderer/resource_manager.h
+++ b/src/renderer/resource_manager.h
@@ -55,7 +55,7 @@ class ResourceManager
 
     [[nodiscard]] VkSampler createSampler(const VkSamplerCreateInfo& createInfo) const;
 
-    void destroySampler(const VkSampler& sampler) const;
+    void destroySampler(VkSampler& sampler) const;
 
 
     [[nodiscard]] AllocatedImage createImage(const VkImageCreateInfo& createInfo) const;

From 3731e2302aadeded370e0a90c029c0d211c73752 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 24 Mar 2025 21:18:32 +0700
Subject: [PATCH 04/27] Spatial Prefilter and Temporal Accumulation.

---
 .../ground_truth/gtao_depth_prefilter.comp    |  11 ++
 .../ground_truth/gtao_main_pass.comp          |  13 ++
 .../ground_truth/gtao_spatial_filter.comp     |  14 ++
 .../gtao_temporal_accumulation.comp           |  15 ++
 src/core/engine.cpp                           |   6 +
 src/core/engine.h                             |   8 +
 .../assets/render_object/render_object.cpp    |   2 +-
 .../ground_truth_ambient_occlusion.cpp        | 171 +++++++++++++++++-
 .../ground_truth_ambient_occlusion.h          |  29 ++-
 9 files changed, 254 insertions(+), 15 deletions(-)
 create mode 100644 shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
 create mode 100644 shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
 create mode 100644 shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 747d43d5..e92bdd91 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -1,5 +1,16 @@
 #version 460
 
+#include "scene.glsl"
+
+// layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
+
+layout (set = 1, binding = 0) uniform sampler2D depthImage;
+layout (r16f, set = 1, binding = 1) uniform image2D depthMip0;
+layout (r16f, set = 1, binding = 2) uniform image2D depthMip1;
+layout (r16f, set = 1, binding = 3) uniform image2D depthMip2;
+layout (r16f, set = 1, binding = 4) uniform image2D depthMip3;
+layout (r16f, set = 1, binding = 5) uniform image2D depthMip4;
+
 void main() {
 
 }
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
new file mode 100644
index 00000000..6f9db5be
--- /dev/null
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -0,0 +1,13 @@
+#version 460
+
+#include "scene.glsl"
+
+// layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
+
+layout (set = 1, binding = 0) uniform sampler2D prefilteredDepth;
+layout (set = 1, binding = 1) uniform sampler2D normalBuffer;
+layout (r8, set = 1, binding = 2) uniform image2D aoOutput;
+
+void main() {
+
+}
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
new file mode 100644
index 00000000..ee6acbe2
--- /dev/null
+++ b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
@@ -0,0 +1,14 @@
+#version 460
+
+#include "scene.glsl"
+
+// layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
+
+layout (set = 1, binding = 0) uniform sampler2D rawAO;
+layout (set = 1, binding = 1) uniform sampler2D depthBuffer;
+layout (set = 1, binding = 2) uniform sampler2D normalBuffer;
+layout (r8, set = 1, binding = 3) uniform image2D filteredAO;
+
+void main() {
+
+}
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp b/shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp
new file mode 100644
index 00000000..bf32d9ad
--- /dev/null
+++ b/shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp
@@ -0,0 +1,15 @@
+#version 460
+
+#include "scene.glsl"
+
+// layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
+
+layout (set = 1, binding = 0) uniform sampler2D filteredAO;
+layout (set = 1, binding = 1) uniform sampler2D historyOutputAO;
+layout (set = 1, binding = 2) uniform sampler2D depthBuffer;
+layout (set = 1, binding = 3) uniform sampler2D velocityBuffer;
+layout (r8, set = 1, binding = 4) uniform image2D outputAO;
+
+void main() {
+    // TODO: temporal accumulation is not implemented yet.
+}
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index de85f276..6e5b420c 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -27,6 +27,7 @@
 #include "src/renderer/assets/render_object/render_object.h"
 #include "src/renderer/descriptor_buffer/descriptor_buffer_uniform.h"
 #include "src/renderer/environment/environment.h"
+#include "src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h"
 #include "src/renderer/lighting/shadows/cascaded_shadow_map.h"
 #include "src/renderer/pipelines/deferred_mrt/deferred_mrt.h"
 #include "src/renderer/pipelines/deferred_resolve/deferred_resolve.h"
@@ -199,11 +200,15 @@ void Engine::initRenderer()
     environmentPipeline = new environment_pipeline::EnvironmentPipeline(*resourceManager, environmentMap->getCubemapDescriptorSetLayout());
     terrainPipeline = new terrain::TerrainPipeline(*resourceManager);
     deferredMrtPipeline = new deferred_mrt::DeferredMrtPipeline(*resourceManager);
+    ambientOcclusionPipeline = new ambient_occlusion::GroundTruthAmbientOcclusionPipeline(*resourceManager);
     deferredResolvePipeline = new deferred_resolve::DeferredResolvePipeline(*resourceManager, environmentMap->getDiffSpecMapDescriptorSetlayout(),
                                                                             cascadedShadowMap->getCascadedShadowMapUniformLayout(), cascadedShadowMap->getCascadedShadowMapSamplerLayout());
     temporalAntialiasingPipeline = new temporal_antialiasing_pipeline::TemporalAntialiasingPipeline(*resourceManager);
     postProcessPipeline = new post_process_pipeline::PostProcessPipeline(*resourceManager);
 
+    ambientOcclusionPipeline->setupDepthPrefilterDescriptorBuffer(depthImage.imageView);
+    ambientOcclusionPipeline->setupAmbientOcclusionDescriptorBuffer(normalRenderTarget.imageView);
+
     const deferred_resolve::DeferredResolveDescriptor deferredResolveDescriptor{
         normalRenderTarget.imageView,
         albedoRenderTarget.imageView,
@@ -654,6 +659,7 @@ void Engine::cleanup()
     delete environmentPipeline;
     delete terrainPipeline;
     delete deferredMrtPipeline;
+    delete ambientOcclusionPipeline;
     delete deferredResolvePipeline;
     delete temporalAntialiasingPipeline;
     delete postProcessPipeline;
diff --git a/src/core/engine.h b/src/core/engine.h
index a5abaa7f..547d5b5c 100644
--- a/src/core/engine.h
+++ b/src/core/engine.h
@@ -26,6 +26,11 @@ class VulkanContext;
 
 namespace will_engine
 {
+namespace ambient_occlusion
+{
+    class GroundTruthAmbientOcclusionPipeline;
+}
+
 namespace terrain
 {
     class TerrainChunk;
@@ -142,8 +147,10 @@ class Engine
     ImmediateSubmitter* immediate = nullptr;
     ResourceManager* resourceManager = nullptr;
     identifier::IdentifierManager* identifierManager = nullptr;
+
     environment::Environment* environmentMap{nullptr};
     cascaded_shadows::CascadedShadowMap* cascadedShadowMap{nullptr};
+
     terrain::TerrainManager* terrainManager{nullptr};
     ImguiWrapper* imguiWrapper = nullptr;
 
@@ -195,6 +202,7 @@ class Engine
     terrain::TerrainPipeline* terrainPipeline{nullptr};
     deferred_mrt::DeferredMrtPipeline* deferredMrtPipeline{nullptr};
     deferred_resolve::DeferredResolvePipeline* deferredResolvePipeline{nullptr};
+    ambient_occlusion::GroundTruthAmbientOcclusionPipeline* ambientOcclusionPipeline{nullptr};
     temporal_antialiasing_pipeline::TemporalAntialiasingPipeline* temporalAntialiasingPipeline{nullptr};
     post_process_pipeline::PostProcessPipeline* postProcessPipeline{nullptr};
 
diff --git a/src/renderer/assets/render_object/render_object.cpp b/src/renderer/assets/render_object/render_object.cpp
index 9041b37c..9de1b356 100644
--- a/src/renderer/assets/render_object/render_object.cpp
+++ b/src/renderer/assets/render_object/render_object.cpp
@@ -599,7 +599,7 @@ void RenderObject::unload()
         resourceManager.destroyImage(image);
     }
 
-    for (const auto& sampler : samplers) {
+    for (auto& sampler : samplers) {
         if (sampler == resourceManager.getDefaultSamplerNearest() || sampler == resourceManager.getDefaultSamplerLinear()) {
             //dont destroy the default samplers
             continue;
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index be502b76..14231bda 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -4,6 +4,8 @@
 
 #include "ground_truth_ambient_occlusion.h"
 
+#include <volk/volk.h>
+
 #include "src/renderer/renderer_constants.h"
 #include "src/renderer/vk_descriptors.h"
 #include "src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h"
@@ -13,7 +15,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
     // Depth Pre-filtering
     {
         DescriptorLayoutBuilder layoutBuilder;
-        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // depth image
+        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT depth buffer
         layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao depth mip 0
         layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao depth mip 1
         layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao depth mip 2
@@ -40,9 +42,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         layoutInfo.pushConstantRangeCount = 1;
 
         depthPrefilterPipelineLayout = resourceManager.createPipelineLayout(layoutInfo);
-
         createDepthPrefilterPipeline();
-        createAmbientOcclusionPipeline();
 
         depthPrefilterDescriptorBuffer = resourceManager.createDescriptorBufferSampler(depthPrefilterSetLayout, 1);
 
@@ -83,7 +83,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
     {
         DescriptorLayoutBuilder layoutBuilder;
         layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // pre-filtered depth
-        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // normal buffer
+        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT normal buffer
         layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao output
 
         ambientOcclusionSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
@@ -153,6 +153,94 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
             ambientOcclusionNormalsSampler = resourceManager.createSampler(samplerInfo);
         }
     }
+
+    // Spatial Filtering
+    {
+        DescriptorLayoutBuilder layoutBuilder;
+        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // raw ao
+        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT depth buffer
+        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT normal buffer
+        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // filtered ao
+
+        spatialFilteringSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+
+        VkPushConstantRange pushConstants{};
+        pushConstants.offset = 0;
+        pushConstants.size = sizeof(GTAOPushConstants);
+        pushConstants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+        VkDescriptorSetLayout setLayouts[2];
+        setLayouts[0] = resourceManager.getSceneDataLayout();
+        setLayouts[1] = spatialFilteringSetLayout;
+
+        VkPipelineLayoutCreateInfo layoutInfo{};
+        layoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+        layoutInfo.pNext = nullptr;
+        layoutInfo.pSetLayouts = setLayouts;
+        layoutInfo.setLayoutCount = 2;
+        layoutInfo.pPushConstantRanges = &pushConstants;
+        layoutInfo.pushConstantRangeCount = 1;
+
+        spatialFilteringPipelineLayout = resourceManager.createPipelineLayout(layoutInfo);
+        createSpatialFilteringPipeline();
+
+        spatialFilteringDescriptorBuffer = resourceManager.createDescriptorBufferSampler(spatialFilteringSetLayout, 1);
+
+        VkImageUsageFlags usage{};
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+
+        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+        spatialFilteringImage = resourceManager.createImage(imgInfo);
+    }
+
+    // Temporal Accumulation
+    {
+        DescriptorLayoutBuilder layoutBuilder;
+        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // filtered ao
+        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // final output history
+        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT velocity buffer
+        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT depth buffer
+        layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // final output
+
+        temporalAccumulationSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+
+        VkPushConstantRange pushConstants{};
+        pushConstants.offset = 0;
+        pushConstants.size = sizeof(GTAOPushConstants);
+        pushConstants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+        VkDescriptorSetLayout setLayouts[2];
+        setLayouts[0] = resourceManager.getSceneDataLayout();
+        setLayouts[1] = temporalAccumulationSetLayout;
+
+        VkPipelineLayoutCreateInfo layoutInfo{};
+        layoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+        layoutInfo.pNext = nullptr;
+        layoutInfo.pSetLayouts = setLayouts;
+        layoutInfo.setLayoutCount = 2;
+        layoutInfo.pPushConstantRanges = &pushConstants;
+        layoutInfo.pushConstantRangeCount = 1;
+
+        temporalAccumulationPipelineLayout = resourceManager.createPipelineLayout(layoutInfo);
+        createTemporalAccumulationPipeline();
+
+        temporalAccumulationDescriptorBuffer = resourceManager.createDescriptorBufferSampler(temporalAccumulationSetLayout, 1);
+
+
+        VkImageUsageFlags usage{};
+        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+        historyOutputImage = resourceManager.createImage(imgInfo);
+
+        usage = {};
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+        ambientOcclusionOutputImage = resourceManager.createImage(imgInfo);
+    }
 }
 
 will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTruthAmbientOcclusionPipeline()
@@ -181,6 +269,26 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTrut
     resourceManager.destroyImage(ambientOcclusionImage);
 
     resourceManager.destroyDescriptorBuffer(ambientOcclusionDescriptorBuffer);
+
+    // Spatial Filtering Resources
+    resourceManager.destroyDescriptorSetLayout(spatialFilteringSetLayout);
+    resourceManager.destroyPipelineLayout(spatialFilteringPipelineLayout);
+    resourceManager.destroyPipeline(spatialFilteringPipeline);
+
+    resourceManager.destroyImage(spatialFilteringImage);
+
+    resourceManager.destroyDescriptorBuffer(spatialFilteringDescriptorBuffer);
+
+
+    // Temporal Accumulation Resources
+    resourceManager.destroyDescriptorSetLayout(temporalAccumulationSetLayout);
+    resourceManager.destroyPipelineLayout(temporalAccumulationPipelineLayout);
+    resourceManager.destroyPipeline(temporalAccumulationPipeline);
+
+    resourceManager.destroyImage(historyOutputImage);
+    resourceManager.destroyImage(ambientOcclusionOutputImage);
+
+    resourceManager.destroyDescriptorBuffer(temporalAccumulationDescriptorBuffer);
 }
 
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(const VkImageView depthImageView)
@@ -231,9 +339,11 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupA
             {VK_NULL_HANDLE, ambientOcclusionImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
             false
         });
+
+    resourceManager.setupDescriptorBufferSampler(ambientOcclusionDescriptorBuffer, imageDescriptors, 0);
 }
 
-void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo)
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const
 {
     VkDebugUtilsLabelEXT label{};
     label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
@@ -295,14 +405,13 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
 
-
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
 
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createDepthPrefilterPipeline()
 {
     resourceManager.destroyPipeline(depthPrefilterPipeline);
-    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/gtao_depth_prefilter.comp");
+    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp");
 
     VkPipelineShaderStageCreateInfo stageInfo{};
     stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
@@ -325,7 +434,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::create
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createAmbientOcclusionPipeline()
 {
     resourceManager.destroyPipeline(ambientOcclusionPipeline);
-    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/gtao_ambient_occlusion.comp");
+    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp");
 
     VkPipelineShaderStageCreateInfo stageInfo{};
     stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
@@ -344,3 +453,49 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::create
     ambientOcclusionPipeline = resourceManager.createComputePipeline(pipelineInfo);
     resourceManager.destroyShaderModule(computeShader);
 }
+
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createSpatialFilteringPipeline()
+{
+    resourceManager.destroyPipeline(spatialFilteringPipeline);
+    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp");
+
+    VkPipelineShaderStageCreateInfo stageInfo{};
+    stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stageInfo.pNext = nullptr;
+    stageInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stageInfo.module = computeShader;
+    stageInfo.pName = "main";
+
+    VkComputePipelineCreateInfo pipelineInfo{};
+    pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipelineInfo.pNext = nullptr;
+    pipelineInfo.layout = spatialFilteringPipelineLayout;
+    pipelineInfo.stage = stageInfo;
+    pipelineInfo.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT;
+
+    spatialFilteringPipeline = resourceManager.createComputePipeline(pipelineInfo);
+    resourceManager.destroyShaderModule(computeShader);
+}
+
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createTemporalAccumulationPipeline()
+{
+    resourceManager.destroyPipeline(temporalAccumulationPipeline);
+    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp");
+
+    VkPipelineShaderStageCreateInfo stageInfo{};
+    stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stageInfo.pNext = nullptr;
+    stageInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stageInfo.module = computeShader;
+    stageInfo.pName = "main";
+
+    VkComputePipelineCreateInfo pipelineInfo{};
+    pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipelineInfo.pNext = nullptr;
+    pipelineInfo.layout = temporalAccumulationPipelineLayout;
+    pipelineInfo.stage = stageInfo;
+    pipelineInfo.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT;
+
+    temporalAccumulationPipeline = resourceManager.createComputePipeline(pipelineInfo);
+    resourceManager.destroyShaderModule(computeShader);
+}
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index 4abba7b3..48fa8840 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -6,7 +6,6 @@
 #define GROUND_TRUTH_AMBIENT_OCCLUSION_H
 
 #include <array>
-#include <volk/volk.h>
 
 #include "src/renderer/resource_manager.h"
 #include "src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h"
@@ -25,18 +24,21 @@ class GroundTruthAmbientOcclusionPipeline
 
     void setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView);
 
-    void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo);
+    void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const;
 
 private:
     void createDepthPrefilterPipeline();
     void createAmbientOcclusionPipeline();
+    void createSpatialFilteringPipeline();
+    void createTemporalAccumulationPipeline();
 
-private: // Depth prefilter
+private: // Depth Pre-filter
     VkDescriptorSetLayout depthPrefilterSetLayout{VK_NULL_HANDLE};
     VkPipelineLayout depthPrefilterPipelineLayout{VK_NULL_HANDLE};
     VkPipeline depthPrefilterPipeline{VK_NULL_HANDLE};
 
     VkSampler depthPrefilterSampler{VK_NULL_HANDLE};
+
     // 16 vs 32. look at cost later.
     VkFormat depthPrefilterFormat{VK_FORMAT_R16_SFLOAT};
     AllocatedImage depthPrefilterImage{VK_NULL_HANDLE};
@@ -44,26 +46,41 @@ class GroundTruthAmbientOcclusionPipeline
 
     DescriptorBufferSampler depthPrefilterDescriptorBuffer;
 
-private: // ao
+private: // Ambient Occlusion
     VkDescriptorSetLayout ambientOcclusionSetLayout{VK_NULL_HANDLE};
     VkPipelineLayout ambientOcclusionPipelineLayout{VK_NULL_HANDLE};
     VkPipeline ambientOcclusionPipeline{VK_NULL_HANDLE};
 
     VkSampler ambientOcclusionDepthSampler{VK_NULL_HANDLE};
     VkSampler ambientOcclusionNormalsSampler{VK_NULL_HANDLE};
+
+    // 8 is supposedly enough?
     VkFormat ambientOcclusionFormat{VK_FORMAT_R8_UNORM};
     AllocatedImage ambientOcclusionImage{VK_NULL_HANDLE};
 
     DescriptorBufferSampler ambientOcclusionDescriptorBuffer;
 
-private: //
-
+private: // Spatial Filtering
+    VkDescriptorSetLayout spatialFilteringSetLayout{VK_NULL_HANDLE};
     VkPipelineLayout spatialFilteringPipelineLayout{VK_NULL_HANDLE};
     VkPipeline spatialFilteringPipeline{VK_NULL_HANDLE};
 
+    AllocatedImage spatialFilteringImage{VK_NULL_HANDLE};
+
+    DescriptorBufferSampler spatialFilteringDescriptorBuffer;
+
+private: // Temporal Accumulation
+    VkDescriptorSetLayout temporalAccumulationSetLayout{VK_NULL_HANDLE};
     VkPipelineLayout temporalAccumulationPipelineLayout{VK_NULL_HANDLE};
     VkPipeline temporalAccumulationPipeline{VK_NULL_HANDLE};
 
+    AllocatedImage historyOutputImage{VK_NULL_HANDLE};
+
+    DescriptorBufferSampler temporalAccumulationDescriptorBuffer;
+
+private: // Output
+    AllocatedImage ambientOcclusionOutputImage{VK_NULL_HANDLE};
+
 private:
     ResourceManager& resourceManager;
 };

From 7d73ffbc4bfd2abec65d7bd7acb78a5cf2dd4f4a Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 24 Mar 2025 23:09:34 +0700
Subject: [PATCH 05/27] Depth prefilter shader progress.

---
 .../ground_truth/gtao_depth_prefilter.comp    | 66 ++++++++++++++--
 shaders/include/scene.glsl                    |  1 +
 src/core/engine.cpp                           |  2 +-
 .../ambient_occlusion_types.h                 | 11 ++-
 .../ground_truth_ambient_occlusion.cpp        | 77 ++++++++++---------
 src/renderer/vk_types.h                       |  2 +
 6 files changed, 115 insertions(+), 44 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index e92bdd91..7b04259e 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -5,12 +5,66 @@
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 layout (set = 1, binding = 0) uniform sampler2D depthImage;
-layout (r16f, set = 1, binding = 1) uniform image2D depthMip0;
-layout (r16f, set = 1, binding = 2) uniform image2D depthMip1;
-layout (r16f, set = 1, binding = 3) uniform image2D depthMip2;
-layout (r16f, set = 1, binding = 4) uniform image2D depthMip3;
-layout (r16f, set = 1, binding = 5) uniform image2D depthMip4;
+layout (r16f, set = 1, binding = 1) uniform image2D outDepth0;
+layout (r16f, set = 1, binding = 2) uniform image2D outDepth1;
+layout (r16f, set = 1, binding = 3) uniform image2D outDepth2;
+layout (r16f, set = 1, binding = 4) uniform image2D outDepth3;
+layout (r16f, set = 1, binding = 5) uniform image2D outDepth4;
 
-void main() {
+layout (push_constant) uniform PushConstants {
+    float depthLinearizeMult;
+    float depthLinearizeAdd;
+
+    vec2 ndcToViewMult;
+    vec2 ndcToViewAdd;
+
+    float radius;
+    float faloff;
+    float strength;
+    float radiusMultiplier;
+
+    int numDirections;
+    int numSteps;
+
+    float spatialFilterRadius;
+
+    float temporalWeight;
+} pushConstants;
 
+
+shared float g_scratchDepths[16][16];
+
+layout(local_size_x = 16, local_size_y = 16) in;
+
+//float screenSpaceToViewSpaceDepth(float screenDepth) {
+//    vec4 clipSpacePos = vec4(0.0, 0.0, screenDepth, 1.0);
+//    vec4 viewSpacePos = sceneData.invProjection * clipSpacePos;
+//    return -viewSpacePos.z / viewSpacePos.w;
+//}
+//
+//float clampDepth(){
+//    // using half float precision
+//    return clamp(depth, 0.0, 65504.0);
+//}
+
+void main() {
+//    vec2 dispatchID = gl_GlobalInvocationID.xy;
+//    uvec2 groupThreadID = gl_LocalInvocationID.xy;
+//
+//    // mip 0
+//    const uvec2 baseCoord = dispatchThreadID;
+//    const uvec2 pixCoord = baseCoord * 2;// We process 2x2 pixels in MIP 0
+//
+//    vec2 uvCoord = vec2(pixCoord) * sceneData.texelSize;
+//
+//    vec4 depths4;
+//    depths4.w = texture(depthImage, uvCoord + vec2(0.0, 0.0) * sceneData.texelSize).r;
+//    depths4.z = texture(depthImage, uvCoord + vec2(1.0, 0.0) * sceneData.texelSize).r;
+//    depths4.x = texture(depthImage, uvCoord + vec2(0.0, 1.0) * sceneData.texelSize).r;
+//    depths4.y = texture(depthImage, uvCoord + vec2(1.0, 1.0) * sceneData.texelSize).r;
+//
+//    float depth0 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.w));
+//    float depth1 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.z));
+//    float depth2 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.x));
+//    float depth3 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.y));
 }
diff --git a/shaders/include/scene.glsl b/shaders/include/scene.glsl
index 9da8db70..04176d1c 100644
--- a/shaders/include/scene.glsl
+++ b/shaders/include/scene.glsl
@@ -27,5 +27,6 @@ layout (std140, set = 0, binding = 0) uniform SceneData {
     vec2 renderTargetSize;
     // equal to 1 / renderTargetSize
     vec2 texelSize;
+    vec2 cameraPlanes;
     float deltaTime;
 } sceneData;
\ No newline at end of file
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index 6e5b420c..6c567e87 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -395,6 +395,7 @@ void Engine::updateRender(const float deltaTime, const int32_t currentFrameOverl
 
     pSceneData->renderTargetSize = {RENDER_EXTENT_WIDTH, RENDER_EXTENT_HEIGHT};
     pSceneData->texelSize = {1.0f / RENDER_EXTENT_WIDTH, 1.0f / RENDER_EXTENT_HEIGHT};
+    pSceneData->cameraPlanes = {camera->getNearPlane(), camera->getFarPlane()};
     pSceneData->deltaTime = deltaTime;
 
 
@@ -415,7 +416,6 @@ void Engine::updateRender(const float deltaTime, const int32_t currentFrameOverl
     pDebugSceneData->prevCameraWorldPos = glm::vec4(0.0f);
     pDebugSceneData->cameraWorldPos = glm::vec4(0.0f);
 
-
     pDebugSceneData->renderTargetSize = {RENDER_EXTENT_WIDTH, RENDER_EXTENT_HEIGHT};
     pDebugSceneData->deltaTime = deltaTime;
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index f9d7f0cd..918fbd39 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -6,14 +6,20 @@
 #define AMBIENT_OCCLUSION_TYPES_H
 #include <glm/glm.hpp>
 
+#include "src/core/camera/camera.h"
+
 namespace will_engine::ambient_occlusion
 {
 static constexpr int32_t DEPTH_PREFILTER_MIP_COUNT = 5;
 
 struct GTAOPushConstants
 {
-    glm::vec2 viewportSize;
-    glm::vec2 viewportPixelSize;
+    // Depth prefilter parameters
+    float depthLinearizeMult;
+    float depthLinearizeAdd;
+
+    glm::vec2 ndcToViewMult;
+    glm::vec2 ndcToViewAdd;
 
     // AO parameters
     float radius;
@@ -32,6 +38,7 @@ struct GTAOPushConstants
 
 struct GTAODrawInfo
 {
+    Camera* camera{nullptr};
     GTAOPushConstants pushConstants{};
     VkDescriptorBufferBindingInfoEXT sceneDataBinding{};
     VkDeviceSize sceneDataOffset{0};
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 14231bda..bddb0fd7 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -350,6 +350,12 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     label.pLabelName = "GT Ambient Occlusion";
     vkCmdBeginDebugUtilsLabelEXT(cmd, &label);
 
+    GTAOPushConstants push = drawInfo.pushConstants;
+    glm::mat4 projMatrix = drawInfo.camera->getProjMatrix();
+    push.depthLinearizeMult = -projMatrix[2][3];
+    push.depthLinearizeAdd = projMatrix[2][2];
+
+
     vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
     // Depth Prefilter
@@ -362,48 +368,49 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         bindingInfos[1] = depthPrefilterDescriptorBuffer.getDescriptorBufferBindingInfo();
         vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
 
-        constexpr VkDeviceSize zeroOffset{0};
-        constexpr uint32_t sceneDataIndex{0};
-        constexpr uint32_t descriptorIndex{1};
+        constexpr std::array<uint32_t, 2> indices{0, 1};
+        const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
 
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 1, &sceneDataIndex, &drawInfo.sceneDataOffset);
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 1, 1, &descriptorIndex, &zeroOffset);
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 2, indices.data(), offsets.data());
 
-        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
-        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        // Each invocation covers a 2x2 pixel footprint, so one 16x16 workgroup spans 32x32 pixels.
+        // Divide by 32 before rounding up: halving an already rounded-up 16-pixel group count
+        // truncates when that count is odd and would leave the last partial tile undispatched.
+        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 32.0f));
+        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 32.0f));
         vkCmdDispatch(cmd, x, y, 1);
         vkCmdEndRendering(cmd);
     }
 
     vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
-
-    // Ambient Occlusion
-    {
-        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
-        vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
-
-        VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
-        bindingInfos[0] = drawInfo.sceneDataBinding;
-        bindingInfos[1] = ambientOcclusionDescriptorBuffer.getDescriptorBufferBindingInfo();
-        vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
-
-        constexpr VkDeviceSize zeroOffset{0};
-        constexpr uint32_t sceneDataIndex{0};
-        constexpr uint32_t descriptorIndex{1};
-
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 0, 1, &sceneDataIndex, &drawInfo.sceneDataOffset);
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 1, 1, &descriptorIndex, &zeroOffset);
-
-        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
-        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
-        vkCmdDispatch(cmd, x, y, 1);
-        vkCmdEndRendering(cmd);
-    }
-
-
-    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-
+    // vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    //
+    // // Ambient Occlusion
+    // {
+    //     vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
+    //     vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
+    //
+    //     VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
+    //     bindingInfos[0] = drawInfo.sceneDataBinding;
+    //     bindingInfos[1] = ambientOcclusionDescriptorBuffer.getDescriptorBufferBindingInfo();
+    //     vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
+    //
+    //     constexpr VkDeviceSize zeroOffset{0};
+    //     constexpr uint32_t sceneDataIndex{0};
+    //     constexpr uint32_t descriptorIndex{1};
+    //
+    //     vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 0, 1, &sceneDataIndex, &drawInfo.sceneDataOffset);
+    //     vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 1, 1, &descriptorIndex, &zeroOffset);
+    //
+    //     const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+    //     const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+    //     vkCmdDispatch(cmd, x, y, 1);
+    //     vkCmdEndRendering(cmd);
+    // }
+    //
+    //
+    // vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    //
 
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
diff --git a/src/renderer/vk_types.h b/src/renderer/vk_types.h
index 634e270a..daafdd3a 100644
--- a/src/renderer/vk_types.h
+++ b/src/renderer/vk_types.h
@@ -72,6 +72,8 @@ struct SceneData
 
     glm::vec2 renderTargetSize{};
     glm::vec2 texelSize{};
+
+    glm::vec2 cameraPlanes{1000.0f, 0.1f};
     float deltaTime{};
 };
 

From 41663b05406621cce3b4a05f959393aa3ce93508 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Tue, 25 Mar 2025 20:19:25 +0700
Subject: [PATCH 06/27] GTAO prefilter helper functions.

---
 .../ground_truth/gtao_depth_prefilter.comp     | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 7b04259e..557bf783 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -36,16 +36,26 @@ shared float g_scratchDepths[16][16];
 
 layout(local_size_x = 16, local_size_y = 16) in;
 
+// Using this technique because
+// depthLinearizeMul / (depthLinearizeAdd - screenDepth)
+// doesn't seem to be stable, maybe because of the use of a reversed depth buffer?
+float screenToViewSpaceDepth(float projDepth, float clipNear, float clipFar) {
+    return -clipNear / (clipFar - projDepth * (clipFar - clipNear)) * clipFar;
+}
+
+float clampDepth(){
+    // using half float precision
+    return clamp(depth, 0.0, 65504.0);
+}
+
+
 //float screenSpaceToViewSpaceDepth(float screenDepth) {
 //    vec4 clipSpacePos = vec4(0.0, 0.0, screenDepth, 1.0);
 //    vec4 viewSpacePos = sceneData.invProjection * clipSpacePos;
 //    return -viewSpacePos.z / viewSpacePos.w;
 //}
 //
-//float clampDepth(){
-//    // using half float precision
-//    return clamp(depth, 0.0, 65504.0);
-//}
+
 
 void main() {
 //    vec2 dispatchID = gl_GlobalInvocationID.xy;

From 2176da22840cc3aa3e4bba4afe9d3eb5b5dda47b Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Tue, 25 Mar 2025 21:03:18 +0700
Subject: [PATCH 07/27] Revert "GTAO prefilter helper functions."

This reverts commit 41663b05406621cce3b4a05f959393aa3ce93508.
---
 .../ground_truth/gtao_depth_prefilter.comp     | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 557bf783..7b04259e 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -36,26 +36,16 @@ shared float g_scratchDepths[16][16];
 
 layout(local_size_x = 16, local_size_y = 16) in;
 
-// Using this technique because
-// depthLinearizeMul / (depthLinearizeAdd - screenDepth)
-// doesn't seem to be stable, maybe because of the use of a reversed depth buffer?
-float screenToViewSpaceDepth(float projDepth, float clipNear, float clipFar) {
-    return -clipNear / (clipFar - projDepth * (clipFar - clipNear)) * clipFar;
-}
-
-float clampDepth(){
-    // using half float precision
-    return clamp(depth, 0.0, 65504.0);
-}
-
-
 //float screenSpaceToViewSpaceDepth(float screenDepth) {
 //    vec4 clipSpacePos = vec4(0.0, 0.0, screenDepth, 1.0);
 //    vec4 viewSpacePos = sceneData.invProjection * clipSpacePos;
 //    return -viewSpacePos.z / viewSpacePos.w;
 //}
 //
-
+//float clampDepth(){
+//    // using half float precision
+//    return clamp(depth, 0.0, 65504.0);
+//}
 
 void main() {
 //    vec2 dispatchID = gl_GlobalInvocationID.xy;

From 5f770136111ec389222865a40e65bad44875fee5 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Tue, 25 Mar 2025 22:25:32 +0700
Subject: [PATCH 08/27] GTAO depth prefilter mip 0.

---
 .../ambient_occlusion/ground_truth/gtao.comp  | 174 +++++++++---------
 .../ground_truth/gtao_depth_prefilter.comp    |  65 ++++---
 src/core/engine.cpp                           |   8 +
 .../ground_truth_ambient_occlusion.cpp        |   5 +-
 4 files changed, 133 insertions(+), 119 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao.comp b/shaders/ambient_occlusion/ground_truth/gtao.comp
index c48a172d..a7cc35fb 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao.comp
@@ -3,90 +3,90 @@
 void main() {
 
 }
-
-void XeGTAO_PrefilterDepths16x16(
-uvec2 dispatchThreadID,
-uvec2 groupThreadID,
-const GTAOConstants consts,
-sampler2D sourceNDCDepth,
-out writeonly image2D outDepth0,
-out writeonly image2D outDepth1,
-out writeonly image2D outDepth2,
-out writeonly image2D outDepth3,
-out writeonly image2D outDepth4
-) {
-    // MIP 0
-    const uvec2 baseCoord = dispatchThreadID;
-    // 2x because ao image is downsampled
-    const uvec2 pixCoord = baseCoord * 2u;
-
-
-    // todo: get width and height from sceneData
-    vec2 uvCoord = vec2(pixCoord) * consts.ViewportPixelSize;
-
-    vec4 depths 4;
-    depths4.w = texture(sourceNDCDepth, uvCoord + vec2(0.0, 0.0) * consts.ViewportPixelSize).r;
-    depths4.z = texture(sourceNDCDepth, uvCoord + vec2(1.0, 0.0) * consts.ViewportPixelSize).r;
-    depths4.x = texture(sourceNDCDepth, uvCoord + vec2(0.0, 1.0) * consts.ViewportPixelSize).r;
-    depths4.y = texture(sourceNDCDepth, uvCoord + vec2(1.0, 1.0) * consts.ViewportPixelSize).r;
-
-    float depth0 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.w, consts));
-    float depth1 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.z, consts));
-    float depth2 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.x, consts));
-    float depth3 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.y, consts));
-
-    imageStore(outDepth0, ivec2(pixCoord + uvec2(0, 0)), vec4(depth0, 0.0, 0.0, 0.0));
-    imageStore(outDepth0, ivec2(pixCoord + uvec2(1, 0)), vec4(depth1, 0.0, 0.0, 0.0));
-    imageStore(outDepth0, ivec2(pixCoord + uvec2(0, 1)), vec4(depth2, 0.0, 0.0, 0.0));
-    imageStore(outDepth0, ivec2(pixCoord + uvec2(1, 1)), vec4(depth3, 0.0, 0.0, 0.0));
-
-    // MIP 1
-    float dm1 = XeGTAO_DepthMIPFilter(depth0, depth1, depth2, depth3, consts);
-    imageStore(outDepth1, ivec2(baseCoord), vec4(dm1, 0.0, 0.0, 0.0));
-    g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
-
-    memoryBarrierShared();
-    barrier();
-
-    // MIP 2
-    if (all(equal(groupThreadID.xy % 2u, uvec2(0u, 0u)))) {
-        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
-        float inTR = g_scratchDepths[groupThreadID.x+1u][groupThreadID.y+0u];
-        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+1u];
-        float inBR = g_scratchDepths[groupThreadID.x+1u][groupThreadID.y+1u];
-
-        float dm2 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
-        imageStore(outDepth2, ivec2(baseCoord / 2u), vec4(dm2, 0.0, 0.0, 0.0));
-        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm2;
-    }
-
-    memoryBarrierShared();
-    barrier();
-
-    // MIP 3
-    if (all(equal(groupThreadID.xy % 4u, uvec2(0u, 0u)))) {
-        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
-        float inTR = g_scratchDepths[groupThreadID.x+2u][groupThreadID.y+0u];
-        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+2u];
-        float inBR = g_scratchDepths[groupThreadID.x+2u][groupThreadID.y+2u];
-
-        float dm3 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
-        imageStore(outDepth3, ivec2(baseCoord / 4u), vec4(dm3, 0.0, 0.0, 0.0));
-        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm3;
-    }
-
-    memoryBarrierShared();
-    barrier();
-
-    // MIP 4
-    if (all(equal(groupThreadID.xy % 8u, uvec2(0u, 0u)))) {
-        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
-        float inTR = g_scratchDepths[groupThreadID.x+4u][groupThreadID.y+0u];
-        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+4u];
-        float inBR = g_scratchDepths[groupThreadID.x+4u][groupThreadID.y+4u];
-
-        float dm4 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
-        imageStore(outDepth4, ivec2(baseCoord / 8u), vec4(dm4, 0.0, 0.0, 0.0));
-        // g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4; // commented out as in original
-    }
-}
\ No newline at end of file
+//
+//void XeGTAO_PrefilterDepths16x16(
+//uvec2 dispatchThreadID,
+//uvec2 groupThreadID,
+//const GTAOConstants consts,
+//sampler2D sourceNDCDepth,
+//out writeonly image2D outDepth0,
+//out writeonly image2D outDepth1,
+//out writeonly image2D outDepth2,
+//out writeonly image2D outDepth3,
+//out writeonly image2D outDepth4
+//) {
+//    // MIP 0
+//    const uvec2 baseCoord = dispatchThreadID;
+//    // 2x because ao image is downsampled
+//    const uvec2 pixCoord = baseCoord * 2u;
+//
+//
+//    // todo: get width and height from sceneData
+//    vec2 uvCoord = vec2(pixCoord) * consts.ViewportPixelSize;
+//
+//    vec4 depths 4;
+//    depths4.w = texture(sourceNDCDepth, uvCoord + vec2(0.0, 0.0) * consts.ViewportPixelSize).r;
+//    depths4.z = texture(sourceNDCDepth, uvCoord + vec2(1.0, 0.0) * consts.ViewportPixelSize).r;
+//    depths4.x = texture(sourceNDCDepth, uvCoord + vec2(0.0, 1.0) * consts.ViewportPixelSize).r;
+//    depths4.y = texture(sourceNDCDepth, uvCoord + vec2(1.0, 1.0) * consts.ViewportPixelSize).r;
+//
+//    float depth0 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.w, consts));
+//    float depth1 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.z, consts));
+//    float depth2 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.x, consts));
+//    float depth3 = XeGTAO_ClampDepth(XeGTAO_ScreenSpaceToViewSpaceDepth(depths4.y, consts));
+//
+//    imageStore(outDepth0, ivec2(pixCoord + uvec2(0, 0)), vec4(depth0, 0.0, 0.0, 0.0));
+//    imageStore(outDepth0, ivec2(pixCoord + uvec2(1, 0)), vec4(depth1, 0.0, 0.0, 0.0));
+//    imageStore(outDepth0, ivec2(pixCoord + uvec2(0, 1)), vec4(depth2, 0.0, 0.0, 0.0));
+//    imageStore(outDepth0, ivec2(pixCoord + uvec2(1, 1)), vec4(depth3, 0.0, 0.0, 0.0));
+//
+//    // MIP 1
+//    float dm1 = XeGTAO_DepthMIPFilter(depth0, depth1, depth2, depth3, consts);
+//    imageStore(outDepth1, ivec2(baseCoord), vec4(dm1, 0.0, 0.0, 0.0));
+//    g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
+//
+//    memoryBarrierShared();
+//    barrier();
+//
+//    // MIP 2
+//    if (all(equal(groupThreadID.xy % 2u, uvec2(0u, 0u)))) {
+//        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
+//        float inTR = g_scratchDepths[groupThreadID.x+1u][groupThreadID.y+0u];
+//        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+1u];
+//        float inBR = g_scratchDepths[groupThreadID.x+1u][groupThreadID.y+1u];
+//
+//        float dm2 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
+//        imageStore(outDepth2, ivec2(baseCoord / 2u), vec4(dm2, 0.0, 0.0, 0.0));
+//        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm2;
+//    }
+//
+//    memoryBarrierShared();
+//    barrier();
+//
+//    // MIP 3
+//    if (all(equal(groupThreadID.xy % 4u, uvec2(0u, 0u)))) {
+//        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
+//        float inTR = g_scratchDepths[groupThreadID.x+2u][groupThreadID.y+0u];
+//        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+2u];
+//        float inBR = g_scratchDepths[groupThreadID.x+2u][groupThreadID.y+2u];
+//
+//        float dm3 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
+//        imageStore(outDepth3, ivec2(baseCoord / 4u), vec4(dm3, 0.0, 0.0, 0.0));
+//        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm3;
+//    }
+//
+//    memoryBarrierShared();
+//    barrier();
+//
+//    // MIP 4
+//    if (all(equal(groupThreadID.xy % 8u, uvec2(0u, 0u)))) {
+//        float inTL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+0u];
+//        float inTR = g_scratchDepths[groupThreadID.x+4u][groupThreadID.y+0u];
+//        float inBL = g_scratchDepths[groupThreadID.x+0u][groupThreadID.y+4u];
+//        float inBR = g_scratchDepths[groupThreadID.x+4u][groupThreadID.y+4u];
+//
+//        float dm4 = XeGTAO_DepthMIPFilter(inTL, inTR, inBL, inBR, consts);
+//        imageStore(outDepth4, ivec2(baseCoord / 8u), vec4(dm4, 0.0, 0.0, 0.0));
+//        // g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4; // commented out as in original
+//    }
+//}
\ No newline at end of file
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 7b04259e..ebe3c57e 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -36,35 +36,42 @@ shared float g_scratchDepths[16][16];
 
 layout(local_size_x = 16, local_size_y = 16) in;
 
-//float screenSpaceToViewSpaceDepth(float screenDepth) {
-//    vec4 clipSpacePos = vec4(0.0, 0.0, screenDepth, 1.0);
-//    vec4 viewSpacePos = sceneData.invProjection * clipSpacePos;
-//    return -viewSpacePos.z / viewSpacePos.w;
-//}
-//
-//float clampDepth(){
-//    // using half float precision
-//    return clamp(depth, 0.0, 65504.0);
-//}
+float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float depthLinearizeAdd) {
+    // Optimization by XeGTAO
+    // https://github.com/GameTechDev/XeGTAO/blob/a5b1686c7ea37788eeb3576b5be47f7c03db532c/Source/Rendering/Shaders/XeGTAO.hlsli#L112
+    return depthLinearizeMul / (depthLinearizeAdd - screenDepth);
+}
+
+float clampDepth(float depth){
+    // using half float precision
+    return clamp(depth, 0.0, 65504.0);
+}
 
 void main() {
-//    vec2 dispatchID = gl_GlobalInvocationID.xy;
-//    uvec2 groupThreadID = gl_LocalInvocationID.xy;
-//
-//    // mip 0
-//    const uvec2 baseCoord = dispatchThreadID;
-//    const uvec2 pixCoord = baseCoord * 2;// We process 2x2 pixels in MIP 0
-//
-//    vec2 uvCoord = vec2(pixCoord) * sceneData.texelSize;
-//
-//    vec4 depths4;
-//    depths4.w = texture(depthImage, uvCoord + vec2(0.0, 0.0) * sceneData.texelSize).r;
-//    depths4.z = texture(depthImage, uvCoord + vec2(1.0, 0.0) * sceneData.texelSize).r;
-//    depths4.x = texture(depthImage, uvCoord + vec2(0.0, 1.0) * sceneData.texelSize).r;
-//    depths4.y = texture(depthImage, uvCoord + vec2(1.0, 1.0) * sceneData.texelSize).r;
-//
-//    float depth0 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.w));
-//    float depth1 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.z));
-//    float depth2 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.x));
-//    float depth3 = clampDepth(ScreenSpaceToViewSpaceDepth(depths4.y));
+    ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
+
+    ivec2 dispatchID = ivec2(gl_GlobalInvocationID.xy);
+    ivec2 groupThreadID = ivec2(gl_LocalInvocationID.xy);
+
+    // MIP 0
+    const ivec2 baseCoord = dispatchID;
+    const ivec2 pixCoord = baseCoord * 2;// We process 2x2 pixels in MIP 0
+
+    vec2 uvCoord = vec2(pixCoord) * sceneData.texelSize;
+
+    vec4 depths4;
+    depths4.x = texture(depthImage, uvCoord + vec2(0.0, 0.0) * sceneData.texelSize).r;
+    depths4.y = texture(depthImage, uvCoord + vec2(1.0, 0.0) * sceneData.texelSize).r;
+    depths4.z = texture(depthImage, uvCoord + vec2(0.0, 1.0) * sceneData.texelSize).r;
+    depths4.w = texture(depthImage, uvCoord + vec2(1.0, 1.0) * sceneData.texelSize).r;
+
+    float depth0 = clampDepth(screenToViewSpaceDepth(depths4.x, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float depth1 = clampDepth(screenToViewSpaceDepth(depths4.y, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float depth2 = clampDepth(screenToViewSpaceDepth(depths4.z, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float depth3 = clampDepth(screenToViewSpaceDepth(depths4.w, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+
+    imageStore(outDepth0, pixCoord + ivec2(0, 0), vec4(depth0, 0.0f, 0.0f, 0.0f));
+    imageStore(outDepth0, pixCoord + ivec2(1, 0), vec4(depth1, 0.0f, 0.0f, 0.0f));
+    imageStore(outDepth0, pixCoord + ivec2(0, 1), vec4(depth2, 0.0f, 0.0f, 0.0f));
+    imageStore(outDepth0, pixCoord + ivec2(1, 1), vec4(depth3, 0.0f, 0.0f, 0.0f));
 }
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index 6c567e87..d58a8c6e 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -557,6 +557,14 @@ void Engine::draw(float deltaTime)
         deferredMrtPipeline->draw(cmd, debugDeferredMrtDrawInfo);
     }
 
+    ambient_occlusion::GTAODrawInfo gtaoDrawInfo{
+        camera,
+        {},
+        sceneDataDescriptorBuffer.getDescriptorBufferBindingInfo(),
+        sceneDataDescriptorBuffer.getDescriptorBufferSize() * FRAME_OVERLAP
+    };
+    ambientOcclusionPipeline->draw(cmd, gtaoDrawInfo);
+
     vk_helpers::transitionImage(cmd, normalRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, albedoRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, pbrRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index bddb0fd7..6d64d039 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -352,14 +352,14 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
     GTAOPushConstants push = drawInfo.pushConstants;
     glm::mat4 projMatrix = drawInfo.camera->getProjMatrix();
-    push.depthLinearizeMult = -projMatrix[2][3];
+    push.depthLinearizeMult = -projMatrix[3][2];
     push.depthLinearizeAdd = projMatrix[2][2];
 
 
     vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
     // Depth Prefilter
-    {
+    {;
         vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipeline);
         vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
 
@@ -379,7 +379,6 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         x /= 2;
         y /= 2;
         vkCmdDispatch(cmd, x, y, 1);
-        vkCmdEndRendering(cmd);
     }
 
     vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);

From 3a3444a7d731d988d1029dbadcfbf39c7fe20080 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Tue, 25 Mar 2025 22:51:10 +0700
Subject: [PATCH 09/27] GTAO depth prefilter mip 1.

---
 .../ground_truth/gtao_depth_prefilter.comp    | 39 +++++++++++++++++--
 .../ambient_occlusion_types.h                 |  9 +++--
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index ebe3c57e..2257837f 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -15,13 +15,14 @@ layout (push_constant) uniform PushConstants {
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
+    float radius;
+    float radiusMultiplier;
+    float falloff;
+
     vec2 ndcToViewMult;
     vec2 ndcToViewAdd;
 
-    float radius;
-    float faloff;
     float strength;
-    float radiusMultiplier;
 
     int numDirections;
     int numSteps;
@@ -47,6 +48,29 @@ float clampDepth(float depth){
     return clamp(depth, 0.0, 65504.0);
 }
 
+float depthMipFilter(float depth0, float depth1, float depth2, float depth3, float effectRadius, float radiusMultiplier, float falloffRange){
+    float maxDepth = max(max(depth0, depth1), max(depth2, depth3));
+
+    // https://github.com/GameTechDev/XeGTAO/blob/e7698f874e90f2516fca26c696ec3cd2c70e505a/Source/Rendering/Shaders/XeGTAO.hlsli#L583C13-L583C14
+    const float depthRangeScaleFactor = 0.75f;// found empirically :)
+
+
+    const float _effectRadius = depthRangeScaleFactor * effectRadius * radiusMultiplier;
+    const float _falloffRange = falloffRange * _effectRadius;
+    const float falloffFrom = _effectRadius * (1 - falloffRange);
+
+    const float falloffMul = -1.0 /  _falloffRange;
+    const float falloffAdd = falloffFrom / (_falloffRange) + 1.0;
+
+    float weight0 = clamp((maxDepth-depth0) * falloffMul + falloffAdd, 0.0f, 1.0f);
+    float weight1 = clamp((maxDepth-depth1) * falloffMul + falloffAdd, 0.0f, 1.0f);
+    float weight2 = clamp((maxDepth-depth2) * falloffMul + falloffAdd, 0.0f, 1.0f);
+    float weight3 = clamp((maxDepth-depth3) * falloffMul + falloffAdd, 0.0f, 1.0f);
+
+    float weightSum = weight0 + weight1 + weight2 + weight3;
+    return (weight0 * depth0 + weight1 * depth1 + weight2 * depth2 + weight3 * depth3) / weightSum;
+}
+
 void main() {
     ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
 
@@ -74,4 +98,13 @@ void main() {
     imageStore(outDepth0, pixCoord + ivec2(1, 0), vec4(depth1, 0.0f, 0.0f, 0.0f));
     imageStore(outDepth0, pixCoord + ivec2(0, 1), vec4(depth2, 0.0f, 0.0f, 0.0f));
     imageStore(outDepth0, pixCoord + ivec2(1, 1), vec4(depth3, 0.0f, 0.0f, 0.0f));
+
+    // MIP 1
+    float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+    imageStore(outDepth1, ivec2(baseCoord), vec4(dm1));
+
+    g_scratchDepths[gl_LocalInvocationID.x][gl_LocalInvocationID.y] = dm1;
+
+    memoryBarrierShared();
+    barrier();
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 918fbd39..26589ab1 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -18,14 +18,17 @@ struct GTAOPushConstants
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
+    // Defaults follow Intel's implementation
+    float radius = 0.5f;
+    float falloff = 0.615f;
+    float radiusMultiplier = 1.457f;
+
     glm::vec2 ndcToViewMult;
     glm::vec2 ndcToViewAdd;
 
     // AO parameters
-    float radius;
-    float falloff;
     float strength;
-    float radiusMultiplier;
+
 
     // Sampling parameters
     uint32_t numDirections;

From d068040a3912280d56a33a9ef35f660faeac7dfb Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Wed, 26 Mar 2025 23:21:34 +0700
Subject: [PATCH 10/27] GTAO depth prefilter shader final.

---
 .../ground_truth/gtao_depth_prefilter.comp    | 47 +++++++++++++++++--
 .../ground_truth_ambient_occlusion.cpp        |  4 +-
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 2257837f..2551c2e7 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -33,9 +33,9 @@ layout (push_constant) uniform PushConstants {
 } pushConstants;
 
 
-shared float g_scratchDepths[16][16];
+shared float g_scratchDepths[8][8];
 
-layout(local_size_x = 16, local_size_y = 16) in;
+layout(local_size_x = 8, local_size_y = 8) in;
 
 float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float depthLinearizeAdd) {
     // Optimization by XeGTAO
@@ -102,9 +102,50 @@ void main() {
     // MIP 1
     float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
     imageStore(outDepth1, ivec2(baseCoord), vec4(dm1));
+    g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
 
-    g_scratchDepths[gl_LocalInvocationID.x][gl_LocalInvocationID.y] = dm1;
+    memoryBarrierShared();
+    barrier();
+
+    // MIP 2
+    if (all(equal(groupThreadID.xy % 2u, uvec2(0u)))){
+        float inTL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+0];
+        float inTR = g_scratchDepths[groupThreadID.x+1][groupThreadID.y+0];
+        float inBL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+1];
+        float inBR = g_scratchDepths[groupThreadID.x+1][groupThreadID.y+1];
+
+        float dm2 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+        imageStore(outDepth2, ivec2(baseCoord/2u), vec4(dm2));
+        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm2;
+    }
+
+    memoryBarrierShared();
+    barrier();
+
+    // MIP 3
+    if (all(equal(groupThreadID.xy % 4u, uvec2(0u)))){
+        float inTL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+0];
+        float inTR = g_scratchDepths[groupThreadID.x+2][groupThreadID.y+0];
+        float inBL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+2];
+        float inBR = g_scratchDepths[groupThreadID.x+2][groupThreadID.y+2];
+
+        float dm3 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+        imageStore(outDepth3, ivec2(baseCoord/4u), vec4(dm3));
+        g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm3;
+    }
 
     memoryBarrierShared();
     barrier();
+
+    // MIP 4
+    if (all(equal(groupThreadID.xy % 8u, uvec2(0u)))){
+        float inTL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+0];
+        float inTR = g_scratchDepths[groupThreadID.x+4][groupThreadID.y+0];
+        float inBL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+4];
+        float inBR = g_scratchDepths[groupThreadID.x+4][groupThreadID.y+4];
+
+        float dm4 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+        imageStore(outDepth4, ivec2(baseCoord/8u), vec4(dm4));
+        //g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4;
+    }
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 6d64d039..6d111e6f 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -373,8 +373,8 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
         vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 2, indices.data(), offsets.data());
 
-        auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
-        auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 8.0f));
+        auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 8.0f));
         // divided by 2 because depth prepass operates on 2x2 (still input4 -> output4)
         x /= 2;
         y /= 2;

From f559a9ad199ccae39854525479f3406aa640de6f Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Thu, 27 Mar 2025 22:27:19 +0700
Subject: [PATCH 11/27] Debugging depth prefilter for GTAO

---
 .../ground_truth/gtao_depth_prefilter.comp    | 49 ++++++++++---------
 shaders/deferredResolve.comp                  |  9 ++--
 src/core/engine.cpp                           | 16 +++---
 src/renderer/imgui_wrapper.cpp                | 27 ++++++++++
 .../ambient_occlusion_types.h                 |  7 ++-
 .../ground_truth_ambient_occlusion.cpp        | 32 +++++-------
 .../ground_truth_ambient_occlusion.h          |  5 +-
 .../pipelines/deferred_mrt/deferred_mrt.cpp   |  1 -
 src/renderer/vk_helpers.cpp                   | 47 ++++++++++++++++++
 src/renderer/vk_helpers.h                     |  4 ++
 10 files changed, 136 insertions(+), 61 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 2551c2e7..ee53f7bf 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -5,31 +5,30 @@
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 layout (set = 1, binding = 0) uniform sampler2D depthImage;
-layout (r16f, set = 1, binding = 1) uniform image2D outDepth0;
-layout (r16f, set = 1, binding = 2) uniform image2D outDepth1;
-layout (r16f, set = 1, binding = 3) uniform image2D outDepth2;
-layout (r16f, set = 1, binding = 4) uniform image2D outDepth3;
-layout (r16f, set = 1, binding = 5) uniform image2D outDepth4;
+layout (r32f, set = 1, binding = 1) uniform image2D outDepth0;
+layout (r32f, set = 1, binding = 2) uniform image2D outDepth1;
+layout (r32f, set = 1, binding = 3) uniform image2D outDepth2;
+layout (r32f, set = 1, binding = 4) uniform image2D outDepth3;
+layout (r32f, set = 1, binding = 5) uniform image2D outDepth4;
 
 layout (push_constant) uniform PushConstants {
+    vec2 ndcToViewMult;
+    vec2 ndcToViewAdd;
+
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
     float radius;
-    float radiusMultiplier;
     float falloff;
-
-    vec2 ndcToViewMult;
-    vec2 ndcToViewAdd;
+    float radiusMultiplier;
 
     float strength;
 
     int numDirections;
     int numSteps;
 
-    float spatialFilterRadius;
-
     float temporalWeight;
+    float spatialFilterRadius;
 } pushConstants;
 
 
@@ -72,17 +71,12 @@ float depthMipFilter(float depth0, float depth1, float depth2, float depth3, flo
 }
 
 void main() {
-    ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
-
-    ivec2 dispatchID = ivec2(gl_GlobalInvocationID.xy);
     ivec2 groupThreadID = ivec2(gl_LocalInvocationID.xy);
 
     // MIP 0
-    const ivec2 baseCoord = dispatchID;
-    const ivec2 pixCoord = baseCoord * 2;// We process 2x2 pixels in MIP 0
-
-    vec2 uvCoord = vec2(pixCoord) * sceneData.texelSize;
+    const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy) * 2;// We process 2x2 pixels in MIP 0
 
+    vec2 uvCoord = vec2(screenPos) * sceneData.texelSize;
     vec4 depths4;
     depths4.x = texture(depthImage, uvCoord + vec2(0.0, 0.0) * sceneData.texelSize).r;
     depths4.y = texture(depthImage, uvCoord + vec2(1.0, 0.0) * sceneData.texelSize).r;
@@ -94,12 +88,19 @@ void main() {
     float depth2 = clampDepth(screenToViewSpaceDepth(depths4.z, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
     float depth3 = clampDepth(screenToViewSpaceDepth(depths4.w, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
 
-    imageStore(outDepth0, pixCoord + ivec2(0, 0), vec4(depth0, 0.0f, 0.0f, 0.0f));
-    imageStore(outDepth0, pixCoord + ivec2(1, 0), vec4(depth1, 0.0f, 0.0f, 0.0f));
-    imageStore(outDepth0, pixCoord + ivec2(0, 1), vec4(depth2, 0.0f, 0.0f, 0.0f));
-    imageStore(outDepth0, pixCoord + ivec2(1, 1), vec4(depth3, 0.0f, 0.0f, 0.0f));
+//    imageStore(outDepth0, pixCoord + ivec2(0, 0), vec4(depth0));
+//    imageStore(outDepth0, pixCoord + ivec2(1, 0), vec4(depth1));
+//    imageStore(outDepth0, pixCoord + ivec2(0, 1), vec4(depth2));
+//    imageStore(outDepth0, pixCoord + ivec2(1, 1), vec4(depth3));
+
+    float test = texture(depthImage, uvCoord).r;
+    imageStore(outDepth0, screenPos + ivec2(0, 0), vec4(test));
+    imageStore(outDepth0, screenPos + ivec2(1, 0), vec4(depths4.y));
+    imageStore(outDepth0, screenPos + ivec2(0, 1), vec4(depths4.z));
+    imageStore(outDepth0, screenPos + ivec2(1, 1), vec4(depths4.w));
 
-    // MIP 1
+
+/*    // MIP 1
     float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
     imageStore(outDepth1, ivec2(baseCoord), vec4(dm1));
     g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
@@ -147,5 +148,5 @@ void main() {
         float dm4 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
         imageStore(outDepth4, ivec2(baseCoord/8u), vec4(dm4));
         //g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4;
-    }
+    }*/
 }
diff --git a/shaders/deferredResolve.comp b/shaders/deferredResolve.comp
index 811c53ed..a6de0812 100644
--- a/shaders/deferredResolve.comp
+++ b/shaders/deferredResolve.comp
@@ -50,10 +50,9 @@ layout (push_constant) uniform PushConstants {
     float farPlane;
 } pushConstants;
 
-vec3 reconstructPosition(ivec2 texCoord, float depth) {
+vec3 reconstructPosition(ivec2 texCoord, vec2 texelSize, float depth) {
     // Get normalized device coordinates
-    vec2 texSize = vec2(pushConstants.width, pushConstants.height);
-    vec2 ndc = (vec2(texCoord) + 0.5) / texSize * 2.0 - 1.0;
+    vec2 ndc = (vec2(texCoord) + 0.5) * texelSize * 2.0 - 1.0;
 
     // Reconstruct view-space position
     vec4 positionVS = sceneData.invProjection * vec4(ndc, depth, 1.0);
@@ -72,7 +71,7 @@ void main() {
         return;
     }
 
-    vec2 uv = (vec2(screenPos) + 0.5) / vec2(pushConstants.width, pushConstants.height);
+    vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
     vec4 albedo = texture(albedoRenderTarget, uv);
     if (albedo.w != 1) {
         if (pushConstants.debug == 2) {
@@ -88,7 +87,7 @@ void main() {
     float depth = texture(depthBuffer, uv).r;
     vec3 normal = texture(normalRenderTarget, uv).rgb;
     vec4 pbrData = texture(pbrRenderTarget, uv);
-    vec3 position = reconstructPosition(screenPos, depth);
+    vec3 position = reconstructPosition(screenPos, sceneData.texelSize, depth);
 
     float roughness = pbrData.g;
     float metallic = pbrData.r;
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index d58a8c6e..65f61d9a 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -417,6 +417,7 @@ void Engine::updateRender(const float deltaTime, const int32_t currentFrameOverl
     pDebugSceneData->cameraWorldPos = glm::vec4(0.0f);
 
     pDebugSceneData->renderTargetSize = {RENDER_EXTENT_WIDTH, RENDER_EXTENT_HEIGHT};
+    pDebugSceneData->texelSize = {1.0f / RENDER_EXTENT_WIDTH, 1.0f / RENDER_EXTENT_HEIGHT};
     pDebugSceneData->deltaTime = deltaTime;
 }
 
@@ -557,20 +558,21 @@ void Engine::draw(float deltaTime)
         deferredMrtPipeline->draw(cmd, debugDeferredMrtDrawInfo);
     }
 
+    vk_helpers::transitionImage(cmd, normalRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, albedoRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, pbrRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, velocityRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, depthImage.image, VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_DEPTH_BIT);
+    vk_helpers::transitionImage(cmd, drawImage.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+
     ambient_occlusion::GTAODrawInfo gtaoDrawInfo{
         camera,
         {},
         sceneDataDescriptorBuffer.getDescriptorBufferBindingInfo(),
-        sceneDataDescriptorBuffer.getDescriptorBufferSize() * FRAME_OVERLAP
+        sceneDataDescriptorBuffer.getDescriptorBufferSize() * currentFrameOverlap
     };
     ambientOcclusionPipeline->draw(cmd, gtaoDrawInfo);
 
-    vk_helpers::transitionImage(cmd, normalRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    vk_helpers::transitionImage(cmd, albedoRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    vk_helpers::transitionImage(cmd, pbrRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    vk_helpers::transitionImage(cmd, velocityRenderTarget.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    vk_helpers::transitionImage(cmd, depthImage.image, VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_DEPTH_BIT);
-    vk_helpers::transitionImage(cmd, drawImage.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
     const deferred_resolve::DeferredResolveDrawInfo deferredResolveDrawInfo{
         deferredDebug,
         csmPcf,
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index 4178d9f1..b40098b6 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -12,6 +12,7 @@
 
 
 #include "environment/environment.h"
+#include "lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h"
 #include "lighting/shadows/cascaded_shadow_map.h"
 #include "lighting/shadows/shadow_constants.h"
 #include "src/core/engine.h"
@@ -867,6 +868,32 @@ void ImguiWrapper::imguiInterface(Engine* engine)
     }
     ImGui::End();
 
+    if (ImGui::Begin("Discardable Debug")) {
+        if (ImGui::Button("Save GTAO depth image")) {
+            if (file::getOrCreateDirectory(file::imagesSavePath)) {
+                const std::filesystem::path path = file::imagesSavePath / "gtao_depth.png";
+
+                auto depthNormalize = [](const float depth) {
+                    return depth;
+                };
+
+                vk_helpers::saveImageR32F(
+                    *engine->resourceManager,
+                    *engine->immediate,
+                    engine->ambientOcclusionPipeline->depthPrefilterImage,
+                    //engine->depthImage,
+                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                    VK_IMAGE_ASPECT_COLOR_BIT,
+                    path.string().c_str(),
+                    depthNormalize
+                );
+            }
+            else {
+                fmt::print(" Failed to find/create image save path directory");
+            }
+        }
+    }
+    ImGui::End();
 
     if (selectedItem) {
         if (IImguiRenderable* imguiRenderable = dynamic_cast<IImguiRenderable*>(selectedItem)) {
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 26589ab1..55338e44 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -14,6 +14,9 @@ static constexpr int32_t DEPTH_PREFILTER_MIP_COUNT = 5;
 
 struct GTAOPushConstants
 {
+    glm::vec2 ndcToViewMult;
+    glm::vec2 ndcToViewAdd;
+
     // Depth prefilter parameters
     float depthLinearizeMult;
     float depthLinearizeAdd;
@@ -23,13 +26,9 @@ struct GTAOPushConstants
     float falloff = 0.615f;
     float radiusMultiplier = 1.457f;
 
-    glm::vec2 ndcToViewMult;
-    glm::vec2 ndcToViewAdd;
-
     // AO parameters
     float strength;
 
-
     // Sampling parameters
     uint32_t numDirections;
     uint32_t numSteps;
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 6d111e6f..b6e7fad1 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -46,10 +46,11 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
 
         depthPrefilterDescriptorBuffer = resourceManager.createDescriptorBufferSampler(depthPrefilterSetLayout, 1);
 
-
         VkImageUsageFlags usage{};
         usage |= VK_IMAGE_USAGE_STORAGE_BIT;
         usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
 
         VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(depthPrefilterFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
         // 5 mips, suggested by Intel's implementation
@@ -66,15 +67,6 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         VkSamplerCreateInfo samplerInfo = {.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO};
         samplerInfo.magFilter = VK_FILTER_NEAREST;
         samplerInfo.minFilter = VK_FILTER_NEAREST;
-        samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
-        samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
-        samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
-        samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
-        samplerInfo.anisotropyEnable = VK_FALSE;
-        samplerInfo.maxAnisotropy = 1.0f;
-        samplerInfo.compareEnable = VK_FALSE;
-        samplerInfo.minLod = 0.0f;
-        samplerInfo.maxLod = 0.0f;
 
         depthPrefilterSampler = resourceManager.createSampler(samplerInfo);
     }
@@ -299,7 +291,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupD
     imageDescriptors.push_back(
         {
             VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            {depthPrefilterSampler, depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            {resourceManager.getDefaultSamplerLinear(), depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
             false
         }
     );
@@ -356,28 +348,30 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     push.depthLinearizeAdd = projMatrix[2][2];
 
 
-    vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
-
+    //vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::clearColorImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL);
     // Depth Prefilter
     {;
         vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipeline);
-        vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
+        vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
 
         VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
         bindingInfos[0] = drawInfo.sceneDataBinding;
         bindingInfos[1] = depthPrefilterDescriptorBuffer.getDescriptorBufferBindingInfo();
         vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
 
-        constexpr std::array<uint32_t, 2> indices{0, 1};
-        const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
+        uint32_t index0 = 0;
+        uint32_t index1 = 1;
+        VkDeviceSize offset0 = drawInfo.sceneDataOffset;
 
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 2, indices.data(), offsets.data());
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 1, &index0, &offset0);
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 1, 1, &index1, &ZERO_DEVICE_SIZE);
 
         auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 8.0f));
         auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 8.0f));
         // divided by 2 because depth prepass operates on 2x2 (still input4 -> output4)
-        x /= 2;
-        y /= 2;
+        x = x / 2 + 1;
+        y = y / 2 + 1;
         vkCmdDispatch(cmd, x, y, 1);
     }
 
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index 48fa8840..a9d7c110 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -7,6 +7,7 @@
 
 #include <array>
 
+#include "src/renderer/imgui_wrapper.h"
 #include "src/renderer/resource_manager.h"
 #include "src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h"
 
@@ -40,7 +41,7 @@ class GroundTruthAmbientOcclusionPipeline
     VkSampler depthPrefilterSampler{VK_NULL_HANDLE};
 
     // 16 vs 32. look at cost later.
-    VkFormat depthPrefilterFormat{VK_FORMAT_R16_SFLOAT};
+    VkFormat depthPrefilterFormat{VK_FORMAT_R32_SFLOAT};
     AllocatedImage depthPrefilterImage{VK_NULL_HANDLE};
     std::array<VkImageView, DEPTH_PREFILTER_MIP_COUNT> depthPrefilterImageViews{};
 
@@ -83,6 +84,8 @@ class GroundTruthAmbientOcclusionPipeline
 
 private:
     ResourceManager& resourceManager;
+
+    friend void ImguiWrapper::imguiInterface(Engine* engine);
 };
 }
 
diff --git a/src/renderer/pipelines/deferred_mrt/deferred_mrt.cpp b/src/renderer/pipelines/deferred_mrt/deferred_mrt.cpp
index 51540dfe..862c8592 100644
--- a/src/renderer/pipelines/deferred_mrt/deferred_mrt.cpp
+++ b/src/renderer/pipelines/deferred_mrt/deferred_mrt.cpp
@@ -89,7 +89,6 @@ void will_engine::deferred_mrt::DeferredMrtPipeline::draw(VkCommandBuffer cmd, c
     scissor.extent.width = RENDER_EXTENTS.width;
     scissor.extent.height = RENDER_EXTENTS.height;
     vkCmdSetScissor(cmd, 0, 1, &scissor);
-
     constexpr VkDeviceSize zeroOffset{0};
 
     for (RenderObject* renderObject : drawInfo.renderObjects) {
diff --git a/src/renderer/vk_helpers.cpp b/src/renderer/vk_helpers.cpp
index 3edacf13..91325ca5 100644
--- a/src/renderer/vk_helpers.cpp
+++ b/src/renderer/vk_helpers.cpp
@@ -773,6 +773,53 @@ void will_engine::vk_helpers::saveImageR32F(const ResourceManager& resourceManag
     resourceManager.destroyBuffer(receivingBuffer);
 }
 
+// Copies mip 0 of `image` into a receiving buffer and writes it to `savePath` as a greyscale RGBA PNG.
+// `valueTransform` maps each half texel to a float; the result is clamped to [0, 1] and quantized to 8 bits.
+void will_engine::vk_helpers::saveImageR16F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout,
+    VkImageAspectFlags aspectFlag, const char* savePath, const std::function<float(half_float::half)>& valueTransform)
+{
+    using half_float::half;
+    const size_t texelCount = static_cast<size_t>(image.imageExtent.width) * image.imageExtent.height;
+    AllocatedBuffer receivingBuffer = resourceManager.createReceivingBuffer(texelCount * sizeof(half));
+
+    immediate.submit([&](VkCommandBuffer cmd) {
+        VkBufferImageCopy bufferCopyRegion{};
+        bufferCopyRegion.imageSubresource.aspectMask = aspectFlag;
+        bufferCopyRegion.imageSubresource.mipLevel = 0;
+        bufferCopyRegion.imageSubresource.baseArrayLayer = 0;
+        bufferCopyRegion.imageSubresource.layerCount = 1;
+        bufferCopyRegion.imageExtent = image.imageExtent;
+        bufferCopyRegion.bufferOffset = 0;
+        bufferCopyRegion.bufferRowLength = 0; // 0 means rows are tightly packed
+        bufferCopyRegion.bufferImageHeight = 0;
+
+        // The copy is recorded with the image in GENERAL; the caller's layout is restored right after it.
+        vk_helpers::transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, aspectFlag);
+        vkCmdCopyImageToBuffer(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, receivingBuffer.buffer, 1, &bufferCopyRegion);
+        vk_helpers::transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, aspectFlag);
+    });
+
+    const auto* imageData = static_cast<const half*>(receivingBuffer.info.pMappedData);
+    // RAII-owned pixel storage, four bytes per texel: grey, grey, grey, opaque alpha.
+    std::vector<uint8_t> byteImageData(texelCount * 4);
+    for (size_t i = 0; i < texelCount; ++i) {
+        // Converting an out-of-range float to uint8_t is undefined behaviour, so clamp first (NaN maps to 0).
+        const float transformed = valueTransform(imageData[i]);
+        const float normalized = transformed > 0.0f ? (transformed < 1.0f ? transformed : 1.0f) : 0.0f;
+        const auto value = static_cast<uint8_t>(normalized * 255.0f);
+        byteImageData[i * 4 + 0] = value;
+        byteImageData[i * 4 + 1] = value;
+        byteImageData[i * 4 + 2] = value;
+        byteImageData[i * 4 + 3] = 255;
+    }
+
+    const auto width = static_cast<int>(image.imageExtent.width);
+    const auto height = static_cast<int>(image.imageExtent.height);
+    stbi_write_png(savePath, width, height, 4, byteImageData.data(), width * 4);
+
+    resourceManager.destroyBuffer(receivingBuffer);
+}
+
 void will_engine::vk_helpers::saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha)
 {
     const auto byteImageData = new uint8_t[width * height * 4];
diff --git a/src/renderer/vk_helpers.h b/src/renderer/vk_helpers.h
index 18cde1be..6ceaa419 100644
--- a/src/renderer/vk_helpers.h
+++ b/src/renderer/vk_helpers.h
@@ -12,6 +12,7 @@
 
 #include <fmt/format.h>
 #include <glm/glm.hpp>
+#include <half/half/half.hpp>
 
 #include "vk_types.h"
 
@@ -115,6 +116,9 @@ namespace vk_helpers
     void saveImageR32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
                        const char* savePath, const std::function<float(float)>& valueTransform);
 
+    void saveImageR16F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
+                       const char* savePath, const std::function<float(half_float::half)>& valueTransform);
+
     void saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha = true);
 
     void saveHeightmap(const std::vector<float>& heightData, int width, int height, const std::filesystem::path& filename);

From 01654ca0b0b4e45032ebcc13119b0867f6d3251b Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Fri, 28 Mar 2025 16:04:17 +0700
Subject: [PATCH 12/27] Finalize depth prefilter for GTAO.

---
 .../ground_truth/gtao_depth_prefilter.comp    | 50 +++++++++----------
 shaders/environment/environment.vert          |  2 +-
 src/core/engine.cpp                           |  1 +
 src/renderer/imgui_wrapper.cpp                | 11 +++-
 .../ground_truth_ambient_occlusion.cpp        | 14 ++++--
 .../ground_truth_ambient_occlusion.h          |  3 +-
 src/renderer/vk_helpers.cpp                   | 23 +++++----
 src/renderer/vk_helpers.h                     |  2 +-
 8 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index ee53f7bf..f6f01790 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -1,7 +1,12 @@
 #version 460
+#extension GL_EXT_nonuniform_qualifier: enable
 
 #include "scene.glsl"
 
+layout(local_size_x = 8, local_size_y = 8) in;
+
+shared float g_scratchDepths[8][8];
+
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 layout (set = 1, binding = 0) uniform sampler2D depthImage;
@@ -31,11 +36,6 @@ layout (push_constant) uniform PushConstants {
     float spatialFilterRadius;
 } pushConstants;
 
-
-shared float g_scratchDepths[8][8];
-
-layout(local_size_x = 8, local_size_y = 8) in;
-
 float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float depthLinearizeAdd) {
     // Optimization by XeGTAO
     // https://github.com/GameTechDev/XeGTAO/blob/a5b1686c7ea37788eeb3576b5be47f7c03db532c/Source/Rendering/Shaders/XeGTAO.hlsli#L112
@@ -43,6 +43,7 @@ float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float d
 }
 
 float clampDepth(float depth){
+    // kind of redundant, the view space depth can only be as far as the depth buffer (which is 1000.0f at time of writing)
     // using half float precision
     return clamp(depth, 0.0, 65504.0);
 }
@@ -74,33 +75,28 @@ void main() {
     ivec2 groupThreadID = ivec2(gl_LocalInvocationID.xy);
 
     // MIP 0
-    const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy) * 2;// We process 2x2 pixels in MIP 0
+    const uvec2 baseCoord = gl_GlobalInvocationID.xy;
+    const ivec2 screenPos = ivec2(baseCoord.xy) * 2;// We process 2x2 pixels in MIP 0
 
-    vec2 uvCoord = vec2(screenPos) * sceneData.texelSize;
-    vec4 depths4;
-    depths4.x = texture(depthImage, uvCoord + vec2(0.0, 0.0) * sceneData.texelSize).r;
-    depths4.y = texture(depthImage, uvCoord + vec2(1.0, 0.0) * sceneData.texelSize).r;
-    depths4.z = texture(depthImage, uvCoord + vec2(0.0, 1.0) * sceneData.texelSize).r;
-    depths4.w = texture(depthImage, uvCoord + vec2(1.0, 1.0) * sceneData.texelSize).r;
+    vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    float depth0 = clampDepth(screenToViewSpaceDepth(depths4.x, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
-    float depth1 = clampDepth(screenToViewSpaceDepth(depths4.y, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
-    float depth2 = clampDepth(screenToViewSpaceDepth(depths4.z, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
-    float depth3 = clampDepth(screenToViewSpaceDepth(depths4.w, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float rDepth0 = texture(depthImage, uv + vec2(0.0, 0.0) * sceneData.texelSize).r;
+    float rDepth1 = texture(depthImage, uv + vec2(1.0, 0.0) * sceneData.texelSize).r;
+    float rDepth2 = texture(depthImage, uv + vec2(0.0, 1.0) * sceneData.texelSize).r;
+    float rDepth3 = texture(depthImage, uv + vec2(1.0, 1.0) * sceneData.texelSize).r;
 
-//    imageStore(outDepth0, pixCoord + ivec2(0, 0), vec4(depth0));
-//    imageStore(outDepth0, pixCoord + ivec2(1, 0), vec4(depth1));
-//    imageStore(outDepth0, pixCoord + ivec2(0, 1), vec4(depth2));
-//    imageStore(outDepth0, pixCoord + ivec2(1, 1), vec4(depth3));
+    float depth0 = clampDepth(screenToViewSpaceDepth(rDepth0, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float depth1 = clampDepth(screenToViewSpaceDepth(rDepth1, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float depth2 = clampDepth(screenToViewSpaceDepth(rDepth2, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
+    float depth3 = clampDepth(screenToViewSpaceDepth(rDepth3, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
 
-    float test = texture(depthImage, uvCoord).r;
-    imageStore(outDepth0, screenPos + ivec2(0, 0), vec4(test));
-    imageStore(outDepth0, screenPos + ivec2(1, 0), vec4(depths4.y));
-    imageStore(outDepth0, screenPos + ivec2(0, 1), vec4(depths4.z));
-    imageStore(outDepth0, screenPos + ivec2(1, 1), vec4(depths4.w));
+    imageStore(outDepth0, screenPos + ivec2(0, 0), vec4(depth0));
+    imageStore(outDepth0, screenPos + ivec2(1, 0), vec4(depth1));
+    imageStore(outDepth0, screenPos + ivec2(0, 1), vec4(depth2));
+    imageStore(outDepth0, screenPos + ivec2(1, 1), vec4(depth3));
 
 
-/*    // MIP 1
+    // MIP 1
     float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
     imageStore(outDepth1, ivec2(baseCoord), vec4(dm1));
     g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
@@ -148,5 +144,5 @@ void main() {
         float dm4 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
         imageStore(outDepth4, ivec2(baseCoord/8u), vec4(dm4));
         //g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4;
-    }*/
+    }
 }
diff --git a/shaders/environment/environment.vert b/shaders/environment/environment.vert
index af66d555..6ea6da69 100644
--- a/shaders/environment/environment.vert
+++ b/shaders/environment/environment.vert
@@ -9,7 +9,7 @@ layout (location = 2) out vec4 outPrevMvpPosition;
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 void main() {
-    const vec3 vertices[3] = vec3[3](vec3(-1, -1, 0.00001), vec3(3, -1, 0.00001), vec3(-1, 3, 0.00001));
+    const vec3 vertices[3] = vec3[3](vec3(-1, -1, 0.0002), vec3(3, -1, 0.0002), vec3(-1, 3, 0.0002));
 
 
     vec4 currClipPos = vec4(vertices[gl_VertexIndex], 1);
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index 65f61d9a..fc8ca46b 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -955,6 +955,7 @@ void Engine::hotReloadShaders() const
     environmentPipeline->reloadShaders();
     terrainPipeline->reloadShaders();
     deferredMrtPipeline->reloadShaders();
+    ambientOcclusionPipeline->reloadShaders();
     deferredResolvePipeline->reloadShaders();
     temporalAntialiasingPipeline->reloadShaders();
     postProcessPipeline->reloadShaders();
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index b40098b6..421a7cac 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -869,12 +869,18 @@ void ImguiWrapper::imguiInterface(Engine* engine)
     ImGui::End();
 
     if (ImGui::Begin("Discardable Debug")) {
+        static int32_t gtaoMip;
+        constexpr uint32_t minMip = 0;
+        constexpr uint32_t maxMip = 4;
+
+        ImGui::SliderScalar("GTAO level", ImGuiDataType_S32, &gtaoMip, &minMip, &maxMip);
+
         if (ImGui::Button("Save GTAO depth image")) {
             if (file::getOrCreateDirectory(file::imagesSavePath)) {
                 const std::filesystem::path path = file::imagesSavePath / "gtao_depth.png";
 
                 auto depthNormalize = [](const float depth) {
-                    return depth;
+                    return depth / 1000.f;
                 };
 
                 vk_helpers::saveImageR32F(
@@ -885,7 +891,8 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                     VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                     VK_IMAGE_ASPECT_COLOR_BIT,
                     path.string().c_str(),
-                    depthNormalize
+                    depthNormalize,
+                    gtaoMip
                 );
             }
             else {
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index b6e7fad1..fc22b271 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -283,7 +283,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTrut
     resourceManager.destroyDescriptorBuffer(temporalAccumulationDescriptorBuffer);
 }
 
-void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(const VkImageView depthImageView)
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(const VkImageView& depthImageView)
 {
     std::vector<DescriptorImageData> imageDescriptors{};
     imageDescriptors.reserve(1 + DEPTH_PREFILTER_MIP_COUNT);
@@ -370,8 +370,8 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 8.0f));
         auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 8.0f));
         // divided by 2 because depth prepass operates on 2x2 (still input4 -> output4)
-        x = x / 2 + 1;
-        y = y / 2 + 1;
+        x = x / 2;
+        y = y / 2;
         vkCmdDispatch(cmd, x, y, 1);
     }
 
@@ -408,6 +408,14 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
 
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::reloadShaders()
+{ // Invoked from Engine::hotReloadShaders(); re-runs the creation routine of each of the four GTAO pipelines.
+    createDepthPrefilterPipeline(); // begins by destroying the current depthPrefilterPipeline
+    createAmbientOcclusionPipeline(); // NOTE(review): presumably the remaining three also destroy their old pipeline first - confirm
+    createSpatialFilteringPipeline();
+    createTemporalAccumulationPipeline();
+}
+
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createDepthPrefilterPipeline()
 {
     resourceManager.destroyPipeline(depthPrefilterPipeline);
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index a9d7c110..f45c6a72 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -21,12 +21,13 @@ class GroundTruthAmbientOcclusionPipeline
 
     ~GroundTruthAmbientOcclusionPipeline();
 
-    void setupDepthPrefilterDescriptorBuffer(VkImageView depthImageView);
+    void setupDepthPrefilterDescriptorBuffer(const VkImageView& depthImageView);
 
     void setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView);
 
     void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const;
 
+    void reloadShaders();
 private:
     void createDepthPrefilterPipeline();
     void createAmbientOcclusionPipeline();
diff --git a/src/renderer/vk_helpers.cpp b/src/renderer/vk_helpers.cpp
index 91325ca5..73e5e9e0 100644
--- a/src/renderer/vk_helpers.cpp
+++ b/src/renderer/vk_helpers.cpp
@@ -730,35 +730,38 @@ void will_engine::vk_helpers::savePacked64Bit(const ResourceManager& resourceMan
 }
 
 void will_engine::vk_helpers::saveImageR32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                               const char* savePath, const std::function<float(float)>& valueTransform)
+                               const char* savePath, const std::function<float(float)>& valueTransform, int32_t mipLevel)
 {
-    const size_t dataSize = image.imageExtent.width * image.imageExtent.height * 1 * sizeof(float);
+    size_t newXSize = image.imageExtent.width / static_cast<size_t>(std::pow(2, mipLevel));
+    size_t newYSize = image.imageExtent.height / static_cast<size_t>(std::pow(2, mipLevel));
+    const size_t texelCount = newXSize * newYSize;
+    const size_t dataSize = texelCount * 1 * sizeof(float);
     AllocatedBuffer receivingBuffer = resourceManager.createReceivingBuffer(dataSize);
 
-    immediate.submit([&](VkCommandBuffer cmd) {
+    immediate.submit([&, mipLevel](VkCommandBuffer cmd) {
         VkBufferImageCopy bufferCopyRegion{};
         bufferCopyRegion.imageSubresource.aspectMask = aspectFlag;
-        bufferCopyRegion.imageSubresource.mipLevel = 0;
+        bufferCopyRegion.imageSubresource.mipLevel = mipLevel;
         bufferCopyRegion.imageSubresource.baseArrayLayer = 0;
         bufferCopyRegion.imageSubresource.layerCount = 1;
-        bufferCopyRegion.imageExtent = image.imageExtent;
+        bufferCopyRegion.imageExtent = {static_cast<uint32_t>(newXSize), static_cast<uint32_t>(newYSize), 1u};
         bufferCopyRegion.bufferOffset = 0;
         bufferCopyRegion.bufferRowLength = 0;
         bufferCopyRegion.bufferImageHeight = 0;
 
-        vk_helpers::transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, aspectFlag);
+        transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, aspectFlag);
 
         vkCmdCopyImageToBuffer(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, receivingBuffer.buffer, 1, &bufferCopyRegion);
 
-        vk_helpers::transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, aspectFlag);
+        transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, aspectFlag);
     });
 
     void* data = receivingBuffer.info.pMappedData;
     const auto imageData = static_cast<float*>(data);
 
-    const auto byteImageData = new uint8_t[image.imageExtent.width * image.imageExtent.height * 4];
+    const auto byteImageData = new uint8_t[texelCount * 4];
     const auto powEight = static_cast<float>(pow(2, 8) - 1);
-    for (size_t i = 0; i < image.imageExtent.width * image.imageExtent.height; ++i) {
+    for (size_t i = 0; i < texelCount; ++i) {
         const float floatValue = valueTransform(imageData[i]);
         const auto value = static_cast<uint8_t>(floatValue * powEight);
         byteImageData[i * 4 + 0] = value;
@@ -767,7 +770,7 @@ void will_engine::vk_helpers::saveImageR32F(const ResourceManager& resourceManag
         byteImageData[i * 4 + 3] = 255;
     }
 
-    stbi_write_png(savePath, image.imageExtent.width, image.imageExtent.height, 4, byteImageData, image.imageExtent.width * 4);
+    stbi_write_png(savePath, static_cast<int>(newXSize), static_cast<int>(newYSize), 4, byteImageData, static_cast<int>(newXSize) * 4);
 
     delete[] byteImageData;
     resourceManager.destroyBuffer(receivingBuffer);
diff --git a/src/renderer/vk_helpers.h b/src/renderer/vk_helpers.h
index 6ceaa419..3a5358ce 100644
--- a/src/renderer/vk_helpers.h
+++ b/src/renderer/vk_helpers.h
@@ -114,7 +114,7 @@ namespace vk_helpers
      * Save the Allocated image as a grayscaled image. The image must be a format with only 1 channel (e.g. R32 or D32)
      */
     void saveImageR32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                       const char* savePath, const std::function<float(float)>& valueTransform);
+                       const char* savePath, const std::function<float(float)>& valueTransform, int32_t mipLevel = 0);
 
     void saveImageR16F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
                        const char* savePath, const std::function<float(half_float::half)>& valueTransform);

From dea858e2bd997f2078bb35f0d568a6f865af0c4c Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Fri, 28 Mar 2025 17:18:01 +0700
Subject: [PATCH 13/27] removed half library

---
 .gitmodules | 3 ---
 extern/half | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 extern/half

diff --git a/.gitmodules b/.gitmodules
index e3987ac8..51bf7966 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,6 +7,3 @@
 [submodule "extern/JoltPhysics"]
 	path = extern/JoltPhysics
 	url = https://github.com/jrouwe/JoltPhysics.git
-[submodule "extern/half"]
-	path = extern/half
-	url = https://github.com/melowntech/half.git
diff --git a/extern/half b/extern/half
deleted file mode 160000
index 972e0409..00000000
--- a/extern/half
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 972e040989383764f30c1d45a9b666593e802741

From b695208824599f1e79e8ec2bc8c38a819ccd430a Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Fri, 28 Mar 2025 18:01:46 +0700
Subject: [PATCH 14/27] change depth prefilter format to FP16.

---
 CMakeLists.txt                                |    7 -
 extern/half/half.hpp                          | 4603 +++++++++++++++++
 .../ground_truth/gtao_depth_prefilter.comp    |   10 +-
 src/renderer/imgui_wrapper.cpp                |   36 +-
 .../ground_truth_ambient_occlusion.cpp        |    6 +-
 .../ground_truth_ambient_occlusion.h          |    2 +-
 src/renderer/vk_helpers.cpp                   |   36 +-
 src/renderer/vk_helpers.h                     |   39 +-
 src/util/math_utils.h                         |   26 +
 9 files changed, 4702 insertions(+), 63 deletions(-)
 create mode 100644 extern/half/half.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 01c37f27..6ea9f6a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,12 +62,6 @@ set(FASTGLTF_SOURCES
         ${FASTGLTF_DIR}/src/io.cpp
 )
 
-set(HALF_DIR ${CMAKE_CURRENT_SOURCE_DIR}/extern/half)
-set(HALF_SOURCES
-        ${HALF_DIR}/half/dummy.cpp
-        ${HALF_DIR}/half/half.hpp
-)
-
 set(VK_BOOTSTRAP_SOURCES
         ${CMAKE_CURRENT_SOURCE_DIR}/extern/vk-bootstrap/VkBootstrap.cpp
 )
@@ -256,7 +250,6 @@ set(TEMP_SOURCES
 add_executable(WillEngine main.cpp
         ${IMGUI_SOURCES}
         ${FASTGLTF_SOURCES}
-        ${HALF_SOURCES}
         ${VK_BOOTSTRAP_SOURCES}
         ${VOLK_SOURCES}
         ${ENGINE_SOURCES}
diff --git a/extern/half/half.hpp b/extern/half/half.hpp
new file mode 100644
index 00000000..cb658f22
--- /dev/null
+++ b/extern/half/half.hpp
@@ -0,0 +1,4603 @@
+// half - IEEE 754-based half-precision floating-point library.
+//
+// Copyright (c) 2012-2025 Christian Rau <rauy@users.sourceforge.net>
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+// Version 2.2.1
+
+/// \file
+/// Main header file for half-precision functionality.
+
+#ifndef HALF_HALF_HPP
+#define HALF_HALF_HPP
+
+#define HALF_GCC_VERSION (__GNUC__*100+__GNUC_MINOR__)
+
+#if defined(__INTEL_COMPILER)
+	#define HALF_ICC_VERSION __INTEL_COMPILER
+#elif defined(__ICC)
+	#define HALF_ICC_VERSION __ICC
+#elif defined(__ICL)
+	#define HALF_ICC_VERSION __ICL
+#else
+	#define HALF_ICC_VERSION 0
+#endif
+
+// check C++11 language features
+#if defined(__clang__)										// clang
+	#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if __has_feature(cxx_thread_local) && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+#elif HALF_ICC_VERSION && defined(__INTEL_CXX11_MODE__)		// Intel C++
+	#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if HALF_ICC_VERSION >= 1500 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if HALF_ICC_VERSION >= 1400 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if HALF_ICC_VERSION >= 1110 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+#elif defined(__GNUC__)										// gcc
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+		#if HALF_GCC_VERSION >= 408 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+			#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+		#endif
+		#if HALF_GCC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+			#define HALF_ENABLE_CPP11_USER_LITERALS 1
+		#endif
+		#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+			#define HALF_ENABLE_CPP11_CONSTEXPR 1
+		#endif
+		#if HALF_GCC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+			#define HALF_ENABLE_CPP11_NOEXCEPT 1
+		#endif
+		#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+			#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+		#endif
+		#if !defined(HALF_ENABLE_CPP11_LONG_LONG)
+			#define HALF_ENABLE_CPP11_LONG_LONG 1
+		#endif
+	#endif
+	#define HALF_TWOS_COMPLEMENT_INT 1
+#elif defined(_MSC_VER)										// Visual C++
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_THREAD_LOCAL)
+		#define HALF_ENABLE_CPP11_THREAD_LOCAL 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+		#define HALF_ENABLE_CPP11_USER_LITERALS 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+		#define HALF_ENABLE_CPP11_CONSTEXPR 1
+	#endif
+	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+		#define HALF_ENABLE_CPP11_NOEXCEPT 1
+	#endif
+	#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+	#endif
+	#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+		#define HALF_ENABLE_CPP11_LONG_LONG 1
+	#endif
+	#define HALF_TWOS_COMPLEMENT_INT 1
+	#define HALF_POP_WARNINGS 1
+	#pragma warning(push)
+	#pragma warning(disable : 4099 4127 4146)	//struct vs class, constant in if, negative unsigned
+#endif
+
+// check C++11 library features
+#include <utility>
+#if defined(_LIBCPP_VERSION)								// libc++
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+		#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+			#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CSTDINT
+			#define HALF_ENABLE_CPP11_CSTDINT 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CMATH
+			#define HALF_ENABLE_CPP11_CMATH 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_HASH
+			#define HALF_ENABLE_CPP11_HASH 1
+		#endif
+		#ifndef HALF_ENABLE_CPP11_CFENV
+			#define HALF_ENABLE_CPP11_CFENV 1
+		#endif
+	#endif
+#elif defined(__GLIBCXX__)									// libstdc++
+	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+		#ifdef __clang__
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+				#define HALF_ENABLE_CPP11_CSTDINT 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH)
+				#define HALF_ENABLE_CPP11_CMATH 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH)
+				#define HALF_ENABLE_CPP11_HASH 1
+			#endif
+			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CFENV)
+				#define HALF_ENABLE_CPP11_CFENV 1
+			#endif
+		#else
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+				#define HALF_ENABLE_CPP11_CSTDINT 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH)
+				#define HALF_ENABLE_CPP11_CMATH 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH)
+				#define HALF_ENABLE_CPP11_HASH 1
+			#endif
+			#if HALF_GCC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CFENV)
+				#define HALF_ENABLE_CPP11_CFENV 1
+			#endif
+		#endif
+	#endif
+#elif defined(_CPPLIB_VER)									// Dinkumware/Visual C++
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+		#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+	#endif
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+			#define HALF_ENABLE_CPP11_CSTDINT 1
+	#endif
+	#if _CPPLIB_VER >= 520 && !defined(HALF_ENABLE_CPP11_HASH)
+		#define HALF_ENABLE_CPP11_HASH 1
+	#endif
+	#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CMATH)
+		#define HALF_ENABLE_CPP11_CMATH 1
+	#endif
+	#if _CPPLIB_VER >= 610 && !defined(HALF_ENABLE_CPP11_CFENV)
+		#define HALF_ENABLE_CPP11_CFENV 1
+	#endif
+#endif
+#undef HALF_GCC_VERSION
+#undef HALF_ICC_VERSION
+
+// any error throwing C++ exceptions?
+#if defined(HALF_ERRHANDLING_THROW_INVALID) || defined(HALF_ERRHANDLING_THROW_DIVBYZERO) || defined(HALF_ERRHANDLING_THROW_OVERFLOW) || defined(HALF_ERRHANDLING_THROW_UNDERFLOW) || defined(HALF_ERRHANDLING_THROW_INEXACT)
+#define HALF_ERRHANDLING_THROWS 1
+#endif
+
+// any error handling enabled?
+#define HALF_ERRHANDLING	(HALF_ERRHANDLING_FLAGS||HALF_ERRHANDLING_ERRNO||HALF_ERRHANDLING_FENV||HALF_ERRHANDLING_THROWS)
+
+#if HALF_ERRHANDLING
+	#define HALF_UNUSED_NOERR(name) name
+#else
+	#define HALF_UNUSED_NOERR(name)
+#endif
+
+// support constexpr
+#if HALF_ENABLE_CPP11_CONSTEXPR
+	#define HALF_CONSTEXPR				constexpr
+	#define HALF_CONSTEXPR_CONST		constexpr
+	#if HALF_ERRHANDLING
+		#define HALF_CONSTEXPR_NOERR
+	#else
+		#define HALF_CONSTEXPR_NOERR	constexpr
+	#endif
+#else
+	#define HALF_CONSTEXPR
+	#define HALF_CONSTEXPR_CONST		const
+	#define HALF_CONSTEXPR_NOERR
+#endif
+
+// support noexcept
+#if HALF_ENABLE_CPP11_NOEXCEPT
+	#define HALF_NOEXCEPT	noexcept
+	#define HALF_NOTHROW	noexcept
+#else
+	#define HALF_NOEXCEPT
+	#define HALF_NOTHROW	throw()
+#endif
+
+// support thread storage
+#if HALF_ENABLE_CPP11_THREAD_LOCAL
+	#define HALF_THREAD_LOCAL	thread_local
+#else
+	#define HALF_THREAD_LOCAL	static
+#endif
+
+#include <utility>
+#include <algorithm>
+#include <istream>
+#include <ostream>
+#include <limits>
+#include <stdexcept>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <cstdlib>
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+	#include <type_traits>
+#endif
+#if HALF_ENABLE_CPP11_CSTDINT
+	#include <cstdint>
+#endif
+#if HALF_ERRHANDLING_ERRNO
+	#include <cerrno>
+#endif
+#if HALF_ENABLE_CPP11_CFENV
+	#include <cfenv>
+#endif
+#if HALF_ENABLE_CPP11_HASH
+	#include <functional>
+#endif
+
+
+#ifndef HALF_ENABLE_F16C_INTRINSICS
+	/// Enable F16C instruction set intrinsics.
+	/// Defining this to 1 enables the use of [F16C compiler intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between 
+	/// half-precision and single-precision values which may result in improved performance. This will not perform additional checks 
+	/// for support of the F16C instruction set, so an appropriate target platform is required when enabling this feature.
+	///
+	/// Unless predefined it will be enabled automatically when the `__F16C__` symbol is defined, which some compilers do on supporting platforms.
+	#define HALF_ENABLE_F16C_INTRINSICS __F16C__
+#endif
+#if HALF_ENABLE_F16C_INTRINSICS
+	#include <immintrin.h>
+#endif
+
+#ifdef HALF_DOXYGEN_ONLY
+/// Type for internal floating-point computations.
+/// This can be predefined to a built-in floating-point type (`float`, `double` or `long double`) to override the internal 
+/// half-precision implementation to use this type for computing arithmetic operations and mathematical functions (if available). 
+/// This can result in improved performance for arithmetic operators and mathematical functions but might cause results to 
+/// deviate from the specified half-precision rounding mode and inhibits proper detection of half-precision exceptions.
+#define HALF_ARITHMETIC_TYPE (undefined)
+
+/// Enable internal exception flags.
+/// Defining this to 1 causes operations on half-precision values to raise internal floating-point exception flags according to 
+/// the IEEE 754 standard. These can then be cleared and checked with clearexcept(), testexcept().
+#define HALF_ERRHANDLING_FLAGS	0
+
+/// Enable exception propagation to `errno`.
+/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to 
+/// [errno](https://en.cppreference.com/w/cpp/error/errno) from `<cerrno>`. Specifically this will propagate domain errors as 
+/// [EDOM](https://en.cppreference.com/w/cpp/error/errno_macros) and pole, overflow and underflow errors as 
+/// [ERANGE](https://en.cppreference.com/w/cpp/error/errno_macros). Inexact errors won't be propagated.
+#define HALF_ERRHANDLING_ERRNO	0
+
+/// Enable exception propagation to built-in floating-point platform.
+/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to the built-in 
+/// single- and double-precision implementation's exception flags using the 
+/// [C++11 floating-point environment control](https://en.cppreference.com/w/cpp/numeric/fenv) from `<cfenv>`. However, this 
+/// does not work in reverse and single- or double-precision exceptions will not raise the corresponding half-precision 
+/// exception flags, nor will explicitly clearing flags clear the corresponding built-in flags.
+#define HALF_ERRHANDLING_FENV	0
+
+/// Throw C++ exception on domain errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on domain errors.
+#define HALF_ERRHANDLING_THROW_INVALID		(undefined)
+
+/// Throw C++ exception on pole errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on pole errors.
+#define HALF_ERRHANDLING_THROW_DIVBYZERO	(undefined)
+
+/// Throw C++ exception on overflow errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::overflow_error](https://en.cppreference.com/w/cpp/error/overflow_error) with the specified message on overflows.
+#define HALF_ERRHANDLING_THROW_OVERFLOW		(undefined)
+
+/// Throw C++ exception on underflow errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::underflow_error](https://en.cppreference.com/w/cpp/error/underflow_error) with the specified message on underflows.
+#define HALF_ERRHANDLING_THROW_UNDERFLOW	(undefined)
+
+/// Throw C++ exception on rounding errors.
+/// Defining this to a string literal causes operations on half-precision values to throw a 
+/// [std::range_error](https://en.cppreference.com/w/cpp/error/range_error) with the specified message on general rounding errors.
+#define HALF_ERRHANDLING_THROW_INEXACT		(undefined)
+#endif
+
+#ifndef HALF_ERRHANDLING_OVERFLOW_TO_INEXACT
+/// Raise INEXACT exception on overflow.
+/// Defining this to 1 (default) causes overflow errors to also raise inexact exceptions.
+/// These will be raised after any possible handling of the overflow exception.
+#define HALF_ERRHANDLING_OVERFLOW_TO_INEXACT	1
+#endif
+
+#ifndef HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+/// Raise INEXACT exception on underflow.
+/// Defining this to 1 (default) causes underflow errors to also raise inexact exceptions.
+/// These will be raised after any possible handling of the underflow exception.
+///
+/// **Note:** This will actually cause underflow (and the accompanying inexact) exceptions to be raised *only* when the result 
+/// is inexact, while if disabled bare underflow errors will be raised for *any* (possibly exact) subnormal result.
+#define HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT	1
+#endif
+
+/// Default rounding mode.
+/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and more precise types 
+/// (unless using half_cast() and specifying the rounding mode directly) as well as in arithmetic operations and mathematical 
+/// functions. It can be redefined (before including half.hpp) to one of the standard rounding modes using their respective 
+/// constants or the equivalent values of 
+/// [std::float_round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/float_round_style):
+///
+/// `std::float_round_style`         | value | rounding
+/// ---------------------------------|-------|-------------------------
+/// `std::round_indeterminate`       | -1    | fastest
+/// `std::round_toward_zero`         | 0     | toward zero
+/// `std::round_to_nearest`          | 1     | to nearest (default)
+/// `std::round_toward_infinity`     | 2     | toward positive infinity
+/// `std::round_toward_neg_infinity` | 3     | toward negative infinity
+///
+/// By default this is set to `1` (`std::round_to_nearest`), which rounds results to the nearest representable value. It can even 
+/// be set to [std::numeric_limits<float>::round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/round_style) to synchronize 
+/// the rounding mode with that of the built-in single-precision implementation (which is likely `std::round_to_nearest`, though).
+#ifndef HALF_ROUND_STYLE
+	#define HALF_ROUND_STYLE	1		// = std::round_to_nearest
+#endif
+
+/// Value signaling overflow.
+/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow of an 
+/// operation, in particular it just evaluates to positive infinity.
+///
+/// **See also:** Documentation for [HUGE_VAL](https://en.cppreference.com/w/cpp/numeric/math/HUGE_VAL)
+#define HUGE_VALH	std::numeric_limits<half_float::half>::infinity()
+
+/// Fast half-precision fma function.
+/// This symbol is defined if the fma() function generally executes as fast as, or faster than, a separate 
+/// half-precision multiplication followed by an addition, which is always the case.
+///
+/// **See also:** Documentation for [FP_FAST_FMA](https://en.cppreference.com/w/cpp/numeric/math/fma)
+#define FP_FAST_FMAH	1
+
+///	Half rounding mode.
+/// In correspondence with `FLT_ROUNDS` from `<cfloat>` this symbol expands to the rounding mode used for 
+/// half-precision operations. It is an alias for [HALF_ROUND_STYLE](\ref HALF_ROUND_STYLE).
+///
+/// **See also:** Documentation for [FLT_ROUNDS](https://en.cppreference.com/w/cpp/types/climits/FLT_ROUNDS)
+#define HLF_ROUNDS	HALF_ROUND_STYLE
+
+#ifndef FP_ILOGB0
+	#define FP_ILOGB0		INT_MIN
+#endif
+#ifndef FP_ILOGBNAN
+	#define FP_ILOGBNAN		INT_MAX
+#endif
+#ifndef FP_SUBNORMAL
+	#define FP_SUBNORMAL	0
+#endif
+#ifndef FP_ZERO
+	#define FP_ZERO			1
+#endif
+#ifndef FP_NAN
+	#define FP_NAN			2
+#endif
+#ifndef FP_INFINITE
+	#define FP_INFINITE		3
+#endif
+#ifndef FP_NORMAL
+	#define FP_NORMAL		4
+#endif
+
+#if !HALF_ENABLE_CPP11_CFENV && !defined(FE_ALL_EXCEPT)
+	#define FE_INVALID		0x10
+	#define FE_DIVBYZERO	0x08
+	#define FE_OVERFLOW		0x04
+	#define FE_UNDERFLOW	0x02
+	#define FE_INEXACT		0x01
+	#define FE_ALL_EXCEPT	(FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT)
+#endif
+
+
+/// Main namespace for half-precision functionality.
+/// This namespace contains all the functionality provided by the library.
+namespace half_float
+{
+	class half;
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+	/// Library-defined half-precision literals.
+	/// Import this namespace to enable half-precision floating-point literals:
+	/// ~~~~{.cpp}
+	/// using namespace half_float::literal;
+	/// half_float::half h = 4.2_h;
+	/// ~~~~
+	namespace literal
+	{
+		half operator "" _h(long double);
+	}
+#endif
+
+	/// \internal
+	/// \brief Implementation details.
+	namespace detail
+	{
+	#if HALF_ENABLE_CPP11_TYPE_TRAITS
+		/// Conditional type.
+		template<bool B,typename T,typename F> struct conditional : std::conditional<B,T,F> {};
+
+		/// Helper for tag dispatching.
+		template<bool B> struct bool_type : std::integral_constant<bool,B> {};
+		using std::true_type;
+		using std::false_type;
+
+		/// Type traits for floating-point types.
+		template<typename T> struct is_float : std::is_floating_point<T> {};
+	#else
+		/// Conditional type.
+		template<bool,typename T,typename> struct conditional { typedef T type; };
+		template<typename T,typename F> struct conditional<false,T,F> { typedef F type; };
+
+		/// Helper for tag dispatching.
+		template<bool> struct bool_type {};
+		typedef bool_type<true> true_type;
+		typedef bool_type<false> false_type;
+
+		/// Type traits for floating-point types.
+		template<typename> struct is_float : false_type {};
+		template<typename T> struct is_float<const T> : is_float<T> {};
+		template<typename T> struct is_float<volatile T> : is_float<T> {};
+		template<typename T> struct is_float<const volatile T> : is_float<T> {};
+		template<> struct is_float<float> : true_type {};
+		template<> struct is_float<double> : true_type {};
+		template<> struct is_float<long double> : true_type {};
+	#endif
+
+		/// Type traits for floating-point bits.
+		template<typename T> struct bits { typedef unsigned char type; };
+		template<typename T> struct bits<const T> : bits<T> {};
+		template<typename T> struct bits<volatile T> : bits<T> {};
+		template<typename T> struct bits<const volatile T> : bits<T> {};
+
+	#if HALF_ENABLE_CPP11_CSTDINT
+		/// Unsigned integer of (at least) 16 bits width.
+		typedef std::uint_least16_t uint16;
+
+		/// Fastest unsigned integer of (at least) 32 bits width.
+		typedef std::uint_fast32_t uint32;
+
+		/// Fastest signed integer of (at least) 32 bits width.
+		typedef std::int_fast32_t int32;
+
+		/// Unsigned integer of (at least) 32 bits width.
+		template<> struct bits<float> { typedef std::uint_least32_t type; };
+
+		/// Unsigned integer of (at least) 64 bits width.
+		template<> struct bits<double> { typedef std::uint_least64_t type; };
+	#else
+		/// Unsigned integer of (at least) 16 bits width.
+		typedef unsigned short uint16;
+
+		/// Fastest unsigned integer of (at least) 32 bits width.
+		typedef unsigned long uint32;
+
+		/// Fastest signed integer of (at least) 32 bits width.
+		typedef long int32;
+
+		/// Unsigned integer of (at least) 32 bits width.
+		template<> struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits>=32,unsigned int,unsigned long> {};
+
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			/// Unsigned integer of (at least) 64 bits width.
+			template<> struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits>=64,unsigned long,unsigned long long> {};
+		#else
+			/// Unsigned integer of (at least) 64 bits width.
+			template<> struct bits<double> { typedef unsigned long type; };
+		#endif
+	#endif
+
+	#ifdef HALF_ARITHMETIC_TYPE
+		/// Type to use for arithmetic computations and mathematical functions internally.
+		typedef HALF_ARITHMETIC_TYPE internal_t;
+	#endif
+
+		/// Tag type for binary construction.
+		struct binary_t {};
+
+		/// Tag for binary construction.
+		HALF_CONSTEXPR_CONST binary_t binary = binary_t();
+
+		/// \name Implementation defined classification and arithmetic
+		/// \{
+
+		/// Check for infinity.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if infinity
+		/// \retval false else
+		template<typename T> bool builtin_isinf(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::isinf(arg);
+		#elif defined(_MSC_VER)
+			return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
+		#else
+			return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
+		#endif
+		}
+
+		/// Check for NaN.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if not a number
+		/// \retval false else
+		template<typename T> bool builtin_isnan(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::isnan(arg);
+		#elif defined(_MSC_VER)
+			return ::_isnan(static_cast<double>(arg)) != 0;
+		#else
+			return arg != arg;
+		#endif
+		}
+
+		/// Check sign.
+		/// \tparam T argument type (builtin floating-point type)
+		/// \param arg value to query
+		/// \retval true if signbit set
+		/// \retval false else
+		template<typename T> bool builtin_signbit(T arg)
+		{
+		#if HALF_ENABLE_CPP11_CMATH
+			return std::signbit(arg);
+		#else
+			return arg < T() || (arg == T() && T(1)/arg < T());
+		#endif
+		}
+
+		/// Platform-independent sign mask.
+		/// \param arg integer value in two's complement
+		/// \retval -1 if \a arg negative
+		/// \retval 0 if \a arg positive
+		inline uint32 sign_mask(uint32 arg)
+		{
+			static const int N = std::numeric_limits<uint32>::digits - 1;
+		#if HALF_TWOS_COMPLEMENT_INT
+			return static_cast<int32>(arg) >> N;
+		#else
+			return -((arg>>N)&1);
+		#endif
+		}
+
+		/// Platform-independent arithmetic right shift.
+		/// \param arg integer value in two's complement
+		/// \param i shift amount (at most 31)
+		/// \return \a arg right shifted for \a i bits with possible sign extension
+		inline uint32 arithmetic_shift(uint32 arg, int i)
+		{
+		#if HALF_TWOS_COMPLEMENT_INT
+			return static_cast<int32>(arg) >> i;
+		#else
+			return static_cast<int32>(arg)/(static_cast<int32>(1)<<i) - ((arg>>(std::numeric_limits<uint32>::digits-1))&1);
+		#endif
+		}
+
+		/// \}
+		/// \name Error handling
+		/// \{
+
+		/// Internal exception flags.
+		/// \return reference to global exception flags
+		inline int& errflags() { HALF_THREAD_LOCAL int flags = 0; return flags; }
+
+		/// Raise floating-point exception through whichever reporting mechanisms the HALF_ERRHANDLING_* macros enable.
+		/// \param flags exceptions to raise (tested below against the FE_* masks)
+		/// \param cond condition to raise exceptions for (nothing is reported when `false`)
+		inline void raise(int HALF_UNUSED_NOERR(flags), bool HALF_UNUSED_NOERR(cond) = true)
+		{
+		#if HALF_ERRHANDLING
+			if(!cond)
+				return;
+		#if HALF_ERRHANDLING_FLAGS
+			errflags() |= flags;	// accumulate into the library's own flag word
+		#endif
+		#if HALF_ERRHANDLING_ERRNO
+			if(flags & FE_INVALID)
+				errno = EDOM;	// FE_INVALID takes precedence over the range errors below
+			else if(flags & (FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW))
+				errno = ERANGE;
+		#endif
+		#if HALF_ERRHANDLING_FENV && HALF_ENABLE_CPP11_CFENV
+			std::feraiseexcept(flags);	// forward to the floating-point environment
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_INVALID
+			if(flags & FE_INVALID)
+				throw std::domain_error(HALF_ERRHANDLING_THROW_INVALID);	// the macro's value is passed as the exception's constructor argument
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_DIVBYZERO
+			if(flags & FE_DIVBYZERO)
+				throw std::domain_error(HALF_ERRHANDLING_THROW_DIVBYZERO);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_OVERFLOW
+			if(flags & FE_OVERFLOW)
+				throw std::overflow_error(HALF_ERRHANDLING_THROW_OVERFLOW);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_UNDERFLOW
+			if(flags & FE_UNDERFLOW)
+				throw std::underflow_error(HALF_ERRHANDLING_THROW_UNDERFLOW);
+		#endif
+		#ifdef HALF_ERRHANDLING_THROW_INEXACT
+			if(flags & FE_INEXACT)
+				throw std::range_error(HALF_ERRHANDLING_THROW_INEXACT);
+		#endif
+		#if HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+			if((flags & FE_UNDERFLOW) && !(flags & FE_INEXACT))
+				detail::raise(FE_INEXACT);	// re-enter to also report FE_INEXACT; not reached if one of the throws above fired
+		#endif
+		#if HALF_ERRHANDLING_OVERFLOW_TO_INEXACT
+			if((flags & FE_OVERFLOW) && !(flags & FE_INEXACT))
+				detail::raise(FE_INEXACT);	// same forwarding for overflow
+		#endif
+		#endif
+		}
+
+		/// Check and signal for any NaN (quiet or signaling).
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \retval true if either \a x or \a y is NaN
+		/// \retval false else
+		/// \exception FE_INVALID if \a x or \a y is NaN
+		inline HALF_CONSTEXPR_NOERR bool compsignal(unsigned int x, unsigned int y)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_INVALID, (x&0x7FFF)>0x7C00 || (y&0x7FFF)>0x7C00);	// same predicate as the return value below
+		#endif
+			return (x&0x7FFF) > 0x7C00 || (y&0x7FFF) > 0x7C00;	// with the sign bit masked off, anything above the infinity pattern 0x7C00 is a NaN
+		}
+
+		/// Signal and silence signaling NaN.
+		/// \param nan half-precision NaN value
+		/// \return quiet NaN (\a nan with bit 0x200 set)
+		/// \exception FE_INVALID if \a nan is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int nan)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_INVALID, !(nan&0x200));	// a clear 0x200 bit marks the NaN as signaling
+		#endif
+			return nan | 0x200;
+		}
+
+		/// Signal and silence signaling NaNs.
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \return quiet NaN (\a x if it is NaN, otherwise \a y, each with bit 0x200 set)
+		/// \exception FE_INVALID if \a x or \a y is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_INVALID, ((x&0x7FFF)>0x7C00 && !(x&0x200)) || ((y&0x7FFF)>0x7C00 && !(y&0x200)));	// signaling = NaN pattern with bit 0x200 clear
+		#endif
+			return ((x&0x7FFF)>0x7C00) ? (x|0x200) : (y|0x200);	// y is not re-checked here; presumably callers only pass it when at least one argument is NaN
+		}
+
+		/// Signal and silence signaling NaNs.
+		/// \param x first half-precision value to check
+		/// \param y second half-precision value to check
+		/// \param z third half-precision value to check
+		/// \return quiet NaN (the first of \a x, \a y that is NaN, otherwise \a z, each with bit 0x200 set)
+		/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int signal(unsigned int x, unsigned int y, unsigned int z)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_INVALID, ((x&0x7FFF)>0x7C00 && !(x&0x200)) || ((y&0x7FFF)>0x7C00 && !(y&0x200)) || ((z&0x7FFF)>0x7C00 && !(z&0x200)));	// signaling = NaN pattern with bit 0x200 clear
+		#endif
+			return ((x&0x7FFF)>0x7C00) ? (x|0x200) : ((y&0x7FFF)>0x7C00) ? (y|0x200) : (z|0x200);	// z is the unchecked last resort
+		}
+
+		/// Select value or signaling NaN.
+		/// \param x preferred half-precision value
+		/// \param y ignored half-precision value except for signaling NaN
+		/// \return quieted \a y if it is a signaling NaN (checked only when HALF_ERRHANDLING is enabled), \a x otherwise
+		/// \exception FE_INVALID if \a y is signaling NaN
+		inline HALF_CONSTEXPR_NOERR unsigned int select(unsigned int x, unsigned int HALF_UNUSED_NOERR(y))
+		{
+		#if HALF_ERRHANDLING
+			return (((y&0x7FFF)>0x7C00) && !(y&0x200)) ? detail::signal(y) : x;	// detail::signal raises FE_INVALID and sets the 0x200 bit
+		#else
+			return x;
+		#endif
+		}
+
+		/// Raise domain error and return NaN.
+		/// \return quiet NaN (bit pattern 0x7FFF)
+		/// \exception FE_INVALID
+		inline HALF_CONSTEXPR_NOERR unsigned int invalid()
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_INVALID);
+		#endif
+			return 0x7FFF;
+		}
+
+		/// Raise pole error and return infinity.
+		/// \param sign half-precision value with sign bit only
+		/// \return half-precision infinity with sign of \a sign (\a sign combined with 0x7C00)
+		/// \exception FE_DIVBYZERO
+		inline HALF_CONSTEXPR_NOERR unsigned int pole(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_DIVBYZERO);
+		#endif
+			return sign | 0x7C00;
+		}
+
+		/// Check value for underflow.
+		/// \param arg non-zero half-precision value to check
+		/// \return \a arg (always returned unchanged)
+		/// \exception FE_UNDERFLOW if arg is subnormal (not raised when HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT is set)
+		inline HALF_CONSTEXPR_NOERR unsigned int check_underflow(unsigned int arg)
+		{
+		#if HALF_ERRHANDLING && !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT
+			detail::raise(FE_UNDERFLOW, !(arg&0x7C00));	// an all-zero exponent field (mask 0x7C00) marks a subnormal
+		#endif
+			return arg;
+		}
+
+		/// \}
+		/// \name Conversion and rounding
+		/// \{
+
+		/// Half-precision overflow.
+		/// \tparam R rounding mode to use
+		/// \param sign half-precision value with sign bit only
+		/// \return rounded overflowing half-precision value (infinity 0x7C00 or the pattern just below it, 0x7BFF, with \a sign applied)
+		/// \exception FE_OVERFLOW
+		template<std::float_round_style R> HALF_CONSTEXPR_NOERR unsigned int overflow(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_OVERFLOW);
+		#endif
+			return	(R==std::round_toward_infinity) ? (sign+0x7C00-(sign>>15)) :	// positive -> 0x7C00, negative (sign = 0x8000) -> 0xFBFF
+					(R==std::round_toward_neg_infinity) ? (sign+0x7BFF+(sign>>15)) :	// positive -> 0x7BFF, negative -> 0xFC00
+					(R==std::round_toward_zero) ? (sign|0x7BFF) :	// never reaches the infinity pattern
+					(sign|0x7C00);	// remaining modes: signed infinity
+		}
+
+		/// Half-precision underflow.
+		/// \tparam R rounding mode to use
+		/// \param sign half-precision value with sign bit only
+		/// \return rounded underflowing half-precision value (a signed zero, or the pattern with only the lowest mantissa bit set)
+		/// \exception FE_UNDERFLOW
+		template<std::float_round_style R> HALF_CONSTEXPR_NOERR unsigned int underflow(unsigned int sign = 0)
+		{
+		#if HALF_ERRHANDLING
+			detail::raise(FE_UNDERFLOW);
+		#endif
+			return	(R==std::round_toward_infinity) ? (sign+1-(sign>>15)) :	// positive -> 0x0001, negative (sign = 0x8000) -> 0x8000
+					(R==std::round_toward_neg_infinity) ? (sign+(sign>>15)) :	// positive -> 0x0000, negative -> 0x8001
+					sign;	// remaining modes: signed zero
+		}
+
+		/// Round half-precision number by conditionally adding 1 to its bit pattern.
+		/// \tparam R rounding mode to use
+		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
+		/// \param value finite half-precision number to round
+		/// \param g guard bit (most significant discarded bit)
+		/// \param s sticky bit (or of all but the most significant discarded bits)
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
+		template<std::float_round_style R,bool I> HALF_CONSTEXPR_NOERR unsigned int rounded(unsigned int value, int g, int s)
+		{
+		#if HALF_ERRHANDLING
+			value +=	(R==std::round_to_nearest) ? (g&(s|value)) :	// round up if guard is set and either sticky or the lowest kept bit is set (ties to even)
+						(R==std::round_toward_infinity) ? (~(value>>15)&(g|s)) :	// round up only when the sign bit is clear and something was discarded
+						(R==std::round_toward_neg_infinity) ? ((value>>15)&(g|s)) : 0;	// round up (in magnitude) only when the sign bit is set
+			if((value&0x7C00) == 0x7C00)	// exponent field all ones after rounding
+				detail::raise(FE_OVERFLOW);
+			else if(value & 0x7C00)	// exponent field non-zero
+				detail::raise(FE_INEXACT, I || (g|s)!=0);
+			else	// exponent field zero
+				detail::raise(FE_UNDERFLOW, !(HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT) || I || (g|s)!=0);
+			return value;
+		#else
+			return	(R==std::round_to_nearest) ? (value+(g&(s|value))) :	// same increments as above, without any exception reporting
+					(R==std::round_toward_infinity) ? (value+(~(value>>15)&(g|s))) :
+					(R==std::round_toward_neg_infinity) ? (value+((value>>15)&(g|s))) :
+					value;
+		#endif
+		}
+
+		/// Round half-precision number to nearest integer value.
+		/// \tparam R rounding mode to use
+		/// \tparam E `true` for round to even, `false` for round away from zero
+		/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it
+		/// \param value half-precision value to round
+		/// \return nearest integral half-precision value
+		/// \exception FE_INVALID for signaling NaN
+		/// \exception FE_INEXACT if value had to be rounded and \a I is `true`
+		template<std::float_round_style R,bool E,bool I> unsigned int integral(unsigned int value)
+		{
+			unsigned int abs = value & 0x7FFF;	// magnitude bits without the sign
+			if(abs < 0x3C00)	// |value| < 1 (0x3C00 is the pattern for 1.0): the result is a signed 0 or 1
+			{
+				detail::raise(FE_INEXACT, I);	// NOTE(review): raised whenever I is true, including for a zero input - confirm intended
+				return ((R==std::round_to_nearest) ? (0x3C00&-static_cast<unsigned>(abs>=(0x3800+E))) :	// 1 if at least 0.5 (0x3800); with E an exact 0.5 goes to 0
+						(R==std::round_toward_infinity) ? (0x3C00&-(~(value>>15)&(abs!=0))) :	// 1 only for positive non-zero input
+						(R==std::round_toward_neg_infinity) ? (0x3C00&-static_cast<unsigned>(value>0x8000)) :	// 1 (negated by the sign below) only for negative non-zero input
+						0) | (value&0x8000);	// keep the original sign bit
+			}
+			if(abs >= 0x6400)	// biased exponent >= 25: no fractional bits remain, or the value is infinity/NaN
+				return (abs>0x7C00) ? detail::signal(value) : value;
+			unsigned int exp = 25 - (abs>>10), mask = (1<<exp) - 1;	// exp = number of fractional mantissa bits, mask selects them
+			detail::raise(FE_INEXACT, I && (value&mask));
+			return ((	(R==std::round_to_nearest) ? ((1<<(exp-1))-(~(value>>exp)&E)) :	// add half a unit, one less when E is set and the integer part is even
+						(R==std::round_toward_infinity) ? (mask&((value>>15)-1)) :	// add mask for positive values only
+						(R==std::round_toward_neg_infinity) ? (mask&-(value>>15)) :	// add mask for negative values only
+						0) + value) & ~mask;	// then clear the fractional bits
+		}
+
+		/// Convert fixed point to half-precision floating-point.
+		/// \tparam R rounding mode to use
+		/// \tparam F number of fractional bits in [11,31]
+		/// \tparam S `true` for signed, `false` for unsigned
+		/// \tparam N `true` for additional normalization step, `false` if already normalized to 1.F
+		/// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results
+		/// \param m mantissa in Q1.F fixed point format
+		/// \param exp biased exponent - 1
+		/// \param sign half-precision value with sign bit only (overwritten from the sign of \a m when \a S is `true`)
+		/// \param s sticky bit (or of all but the most significant already discarded bits)
+		/// \return value converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded or \a I is `true`
+		template<std::float_round_style R,unsigned int F,bool S,bool N,bool I> unsigned int fixed2half(uint32 m, int exp = 14, unsigned int sign = 0, int s = 0)
+		{
+			if(S)
+			{
+				uint32 msign = sign_mask(m);	// all ones if m is negative, else 0
+				m = (m^msign) - msign;	// two's-complement absolute value
+				sign = msign & 0x8000;
+			}
+			if(N)
+				for(; m<(static_cast<uint32>(1)<<F) && exp; m<<=1,--exp) ;	// shift left until bit F is set or exp reaches 0
+			else if(exp < 0)	// only reached when N is false: shift right by an extra -exp bits and emit no exponent field
+				return rounded<R,I>(sign+static_cast<unsigned int>(m>>(F-10-exp)), static_cast<int>((m>>(F-11-exp))&1), s|((m&((static_cast<uint32>(1)<<(F-11-exp))-1))!=0));
+			return rounded<R,I>(sign+(exp<<10)+static_cast<unsigned int>(m>>(F-10)), static_cast<int>((m>>(F-11))&1), s|((m&((static_cast<uint32>(1)<<(F-11))-1))!=0));	// bit F of m adds 1 to the exponent field, hence "biased exponent - 1"
+		}
+
+		/// Convert IEEE single-precision to half-precision, via the F16C intrinsic when HALF_ENABLE_F16C_INTRINSICS is set, otherwise by classifying the raw bit pattern.
+		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+		/// \tparam R rounding mode to use
+		/// \param value single-precision value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int float2half_impl(float value, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(value),
+				(R==std::round_to_nearest) ? _MM_FROUND_TO_NEAREST_INT :
+				(R==std::round_toward_zero) ? _MM_FROUND_TO_ZERO :
+				(R==std::round_toward_infinity) ? _MM_FROUND_TO_POS_INF :
+				(R==std::round_toward_neg_infinity) ? _MM_FROUND_TO_NEG_INF :
+				_MM_FROUND_CUR_DIRECTION));
+		#else
+			bits<float>::type fbits;
+			std::memcpy(&fbits, &value, sizeof(float));	// reinterpret the float as its bit pattern
+		#if 1
+			unsigned int sign = (fbits>>16) & 0x8000;	// move the float sign (bit 31) to the half sign position (bit 15)
+			fbits &= 0x7FFFFFFF;	// from here on fbits holds the magnitude only
+			if(fbits >= 0x7F800000)	// exponent all ones: infinity or NaN
+				return sign | 0x7C00 | ((fbits>0x7F800000) ? (0x200|((fbits>>13)&0x3FF)) : 0);	// a NaN keeps its top 10 mantissa bits and gets bit 0x200 set
+			if(fbits >= 0x47800000)	// magnitude >= 2^16
+				return overflow<R>(sign);
+			if(fbits >= 0x38800000)	// magnitude >= 2^-14: rebias the exponent by 112, bit 12 is the guard, lower bits are sticky
+				return rounded<R,false>(sign|(((fbits>>23)-112)<<10)|((fbits>>13)&0x3FF), (fbits>>12)&1, (fbits&0xFFF)!=0);
+			if(fbits >= 0x33000000)	// magnitude >= 2^-25: result has a zero exponent field before rounding
+			{
+				int i = 125 - (fbits>>23);	// position of the guard bit for this exponent
+				fbits = (fbits&0x7FFFFF) | 0x800000;	// mantissa with the implicit leading 1 made explicit
+				return rounded<R,false>(sign|(fbits>>(i+1)), (fbits>>i)&1, (fbits&((static_cast<uint32>(1)<<i)-1))!=0);
+			}
+			if(fbits != 0)	// non-zero but below 2^-25
+				return underflow<R>(sign);
+			return sign;	// signed zero
+		#else
+			static const uint16 base_table[512] = {
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
+				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 
+				0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 
+				0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 
+				0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7C00, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
+				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 
+				0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 
+				0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 
+				0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFC00 };
+			static const unsigned char shift_table[256] = {
+				24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 
+				25, 25, 25, 25, 25, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
+				13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
+				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };
+			int sexp = fbits >> 23, exp = sexp & 0xFF, i = shift_table[exp];	// compiled out by the `#if 1` above: sexp = sign+exponent (indexes base_table), exp = exponent only
+			fbits &= 0x7FFFFF;
+			uint32 m = (fbits|((exp!=0)<<23)) & -static_cast<uint32>(exp!=0xFF);
+			return rounded<R,false>(base_table[sexp]+(fbits>>i), (m>>(i-1))&1, (((static_cast<uint32>(1)<<(i-1))-1)&m)!=0);
+		#endif
+		#endif
+		}
+
+		/// Convert IEEE double-precision to half-precision by classifying the upper 32 bits of its bit pattern.
+		/// \tparam R rounding mode to use
+		/// \param value double-precision value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int float2half_impl(double value, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			if(R == std::round_indeterminate)	// the intrinsic path is taken only for this rounding mode
+				return _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_cvtpd_ps(_mm_set_sd(value)), _MM_FROUND_CUR_DIRECTION));
+		#endif
+			bits<double>::type dbits;
+			std::memcpy(&dbits, &value, sizeof(double));	// reinterpret the double as its bit pattern
+			uint32 hi = dbits >> 32, lo = dbits & 0xFFFFFFFF;	// hi: sign, exponent and top 20 mantissa bits; lo: remaining mantissa bits
+			unsigned int sign = (hi>>16) & 0x8000;	// move the sign (bit 31 of hi) to the half sign position (bit 15)
+			hi &= 0x7FFFFFFF;
+			if(hi >= 0x7FF00000)	// exponent all ones: infinity or NaN
+				return sign | 0x7C00 | ((dbits&0xFFFFFFFFFFFFF) ? (0x200|((hi>>10)&0x3FF)) : 0);	// any non-zero mantissa makes it a NaN, returned with bit 0x200 set
+			if(hi >= 0x40F00000)	// magnitude >= 2^16
+				return overflow<R>(sign);
+			if(hi >= 0x3F100000)	// magnitude >= 2^-14: rebias the exponent by 1008, bit 9 of hi is the guard, everything below is sticky
+				return rounded<R,false>(sign|static_cast<unsigned int>(((hi>>20)-1008)<<10)|static_cast<unsigned int>((hi>>10)&0x3FF), static_cast<unsigned int>((hi>>9)&1), ((hi&0x1FF)|lo)!=0);
+			if(hi >= 0x3E600000)	// magnitude >= 2^-25: result has a zero exponent field before rounding
+			{
+				int i = static_cast<int>(1018 - (hi>>20));	// position of the guard bit within hi for this exponent
+				hi = (hi&0xFFFFF) | 0x100000;	// top mantissa bits with the implicit leading 1 made explicit
+				return rounded<R,false>(sign|static_cast<unsigned int>(hi>>(i+1)), static_cast<unsigned int>((hi>>i)&1), ((hi&((static_cast<uint32>(1)<<i)-1))|lo)!=0);
+			}
+			if((hi|lo) != 0)	// non-zero but below 2^-25
+				return underflow<R>(sign);
+			return sign;	// signed zero
+		}
+
+		/// Convert non-IEEE floating-point to half-precision using only arithmetic on the value (frexp/ldexp/modf), never its bit pattern.
+		/// \tparam R rounding mode to use
+		/// \tparam T source type (builtin floating-point type)
+		/// \param value floating-point value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int float2half_impl(T value, ...)
+		{
+			unsigned int hbits = static_cast<unsigned>(builtin_signbit(value)) << 15;	// start with just the sign bit
+			if(value == T())
+				return hbits;	// signed zero
+			if(builtin_isnan(value))
+				return hbits | 0x7FFF;
+			if(builtin_isinf(value))
+				return hbits | 0x7C00;
+			int exp;
+			std::frexp(value, &exp);	// only the binary exponent is needed; the returned fraction is discarded
+			if(exp > 16)	// |value| >= 2^16
+				return overflow<R>(hbits);
+			if(exp < -13)	// |value| < 2^-14: scale by a fixed 2^25 and leave the exponent field zero
+				value = std::ldexp(value, 25);
+			else
+			{
+				value = std::ldexp(value, 12-exp);	// scale so the integer part carries the leading 1, 10 mantissa bits and a guard bit
+				hbits |= ((exp+13)<<10);	// one less than the final exponent field; the leading 1 of m>>1 below adds the missing 1
+			}
+			T ival, frac = std::modf(value, &ival);	// ival: bits to keep plus guard; frac: what remains, used as sticky
+			int m = std::abs(static_cast<int>(ival));
+			return rounded<R,false>(hbits+(m>>1), m&1, frac!=T());
+		}
+
+		/// Convert floating-point to half-precision by dispatching to one of the float2half_impl overloads.
+		/// \tparam R rounding mode to use
+		/// \tparam T source type (builtin floating-point type)
+		/// \param value floating-point value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int float2half(T value)
+		{
+			return float2half_impl<R>(value, bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>());	// tag is true only if T is IEC 559 and bits<T>::type has T's size; presumably that selects the true_type overloads, anything else the variadic fallback
+		}
+
+		/// Convert integer to half-precision floating-point.
+		/// \tparam R rounding mode to use
+		/// \tparam T type to convert (builtin integer type)
+		/// \param value integral value to convert
+		/// \return rounded half-precision value
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R,typename T> unsigned int int2half(T value)
+		{
+			unsigned int bits = static_cast<unsigned>(value<0) << 15;	// half sign bit
+			if(!value)
+				return bits;
+			if(value > 0xFFE0 || (bits&&value < -0xFFE0))	// magnitudes above 0xFFE0 are handed to overflow<R>
+				return overflow<R>(bits);
+			const unsigned int mag = bits ? -static_cast<unsigned int>(value) : static_cast<unsigned int>(value);	// |value|, negated in unsigned arithmetic
+			unsigned int m = mag, exp = 24;
+			for(; m<0x400; m<<=1,--exp) ;	// normalize small magnitudes up so that bit 10 is the leading 1 ...
+			for(; m>0x7FF; m>>=1,++exp) ;	// ... and large magnitudes down, counting the discarded bits in exp
+			bits |= (exp<<10) + m;	// the leading 1 of m (bit 10) adds 1 to the exponent field
+			// Guard and sticky come from the magnitude: the low bits of a negative two's-complement value differ from those of its absolute value.
+			return (exp>24) ? rounded<R,false>(bits, static_cast<int>((mag>>(exp-25))&1), (((1u<<(exp-25))-1)&mag)!=0) : bits;
+		}
+
+		/// Convert half-precision to IEEE single-precision.
+		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
+		/// \param value half-precision value to convert
+		/// \return single-precision value
+		inline float half2float_impl(unsigned int value, float, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(value)));
+		#else
+		#if 0
+			bits<float>::type fbits = static_cast<bits<float>::type>(value&0x8000) << 16;
+			int abs = value & 0x7FFF;
+			if(abs)
+			{
+				fbits |= 0x38000000 << static_cast<unsigned>(abs>=0x7C00);
+				for(; abs<0x400; abs<<=1,fbits-=0x800000) ;
+				fbits += static_cast<bits<float>::type>(abs) << 13;
+			}
+		#else
+			static const bits<float>::type mantissa_table[2048] = {
+				0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 
+				0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 
+				0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 
+				0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 
+				0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, 
+				0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 
+				0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 
+				0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 
+				0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 
+				0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 
+				0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 
+				0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 
+				0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 
+				0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, 
+				0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, 
+				0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 
+				0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 
+				0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, 
+				0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, 
+				0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 
+				0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 
+				0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 
+				0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 
+				0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 
+				0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 
+				0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 
+				0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 
+				0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 
+				0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 
+				0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 
+				0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 
+				0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, 
+				0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, 
+				0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 
+				0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 
+				0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, 
+				0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 
+				0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 
+				0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 
+				0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 
+				0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 
+				0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, 
+				0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 
+				0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 
+				0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, 
+				0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 
+				0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 
+				0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 
+				0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 
+				0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, 
+				0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 
+				0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
+				0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 
+				0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, 
+				0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 
+				0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 
+				0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 
+				0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 
+				0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 
+				0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, 
+				0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 
+				0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 
+				0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, 
+				0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 
+				0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 
+				0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 
+				0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 
+				0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, 
+				0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 
+				0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 
+				0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 
+				0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, 
+				0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 
+				0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 
+				0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 
+				0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 
+				0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, 
+				0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, 
+				0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 
+				0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 
+				0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, 
+				0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, 
+				0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 
+				0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 
+				0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 
+				0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 
+				0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, 
+				0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 
+				0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 
+				0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, 
+				0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 
+				0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 
+				0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 
+				0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 
+				0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, 
+				0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, 
+				0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 
+				0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 
+				0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, 
+				0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 
+				0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 
+				0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 
+				0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 
+				0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 
+				0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, 
+				0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 
+				0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 
+				0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 
+				0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, 
+				0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 
+				0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 
+				0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 
+				0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, 
+				0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 
+				0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 
+				0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 
+				0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, 
+				0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 
+				0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 
+				0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 
+				0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 
+				0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, 
+				0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, 
+				0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 
+				0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 
+				0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, 
+				0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, 
+				0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };
+			static const bits<float>::type exponent_table[64] = {
+				0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, 
+				0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 
+				0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 
+				0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };
+			static const unsigned short offset_table[64] = {
+				0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 
+				0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };
+			bits<float>::type fbits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];
+		#endif
+			float out;
+			std::memcpy(&out, &fbits, sizeof(float));
+			return out;
+		#endif
+		}
+
+		/// Convert half-precision to IEEE double-precision by direct bit manipulation (or F16C intrinsics if enabled).
+		/// \param value half-precision value to convert (only the low 16 bits are inspected)
+		/// \return double-precision value
+		inline double half2float_impl(unsigned int value, double, true_type)
+		{
+		#if HALF_ENABLE_F16C_INTRINSICS
+			return _mm_cvtsd_f64(_mm_cvtps_pd(_mm_cvtph_ps(_mm_cvtsi32_si128(value))));	// half -> single -> double
+		#else
+			uint32 hi = static_cast<uint32>(value&0x8000) << 16;	// sign bit moved to bit 31 of the upper 32-bit word
+			unsigned int abs = value & 0x7FFF;	// exponent and mantissa without the sign
+			if(abs)	// +-0 keeps only the sign bit
+			{
+				hi |= 0x3F000000 << static_cast<unsigned>(abs>=0x7C00);	// exponent rebias of 1008 (1023-15); doubled for Inf/NaN so the exponent field ends up all ones
+				for(; abs<0x400; abs<<=1,hi-=0x100000) ;	// normalize subnormals: one exponent step (bit 20) per left shift
+				hi += static_cast<uint32>(abs) << 10;	// half exponent is added onto the bias, mantissa lands in bits 19..10
+			}
+			bits<double>::type dbits = static_cast<bits<double>::type>(hi) << 32;	// lower 32 bits of the double stay zero
+			double out;
+			std::memcpy(&out, &dbits, sizeof(double));	// bit copy instead of a pointer cast
+			return out;
+		#endif
+		}
+
+		/// Convert half-precision to non-IEEE floating-point (fallback selected through the variadic parameter).
+		/// \tparam T type to convert to (builtin floating-point type)
+		/// \param value half-precision value to convert
+		/// \return floating-point value (NaN/infinity are replaced by T() / max() if \a T lacks them)
+		template<typename T> T half2float_impl(unsigned int value, T, ...)
+		{
+			T out;
+			unsigned int abs = value & 0x7FFF;	// magnitude bits without the sign
+			if(abs > 0x7C00)	// NaN: signaling if the quiet bit 0x200 is clear and T supports it, else quiet, else T()
+				out = (std::numeric_limits<T>::has_signaling_NaN && !(abs&0x200)) ? std::numeric_limits<T>::signaling_NaN() :
+					std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T();
+			else if(abs == 0x7C00)	// infinity, or the largest finite value if T has no infinity
+				out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
+			else if(abs > 0x3FF)	// normal: 11-bit significand with implicit bit, scaled by 2^(exponent-15-10)
+				out = std::ldexp(static_cast<T>((abs&0x3FF)|0x400), (abs>>10)-25);
+			else	// zero or subnormal: mantissa scaled by 2^-24
+				out = std::ldexp(static_cast<T>(abs), -24);
+			return (value&0x8000) ? -out : out;	// apply the sign last
+		}
+
+		/// Convert half-precision to floating-point, dispatching to the matching half2float_impl overload.
+		/// \tparam T type to convert to (builtin floating-point type)
+		/// \param value half-precision value to convert
+		/// \return floating-point value
+		template<typename T> T half2float(unsigned int value)
+		{
+			return half2float_impl(value, T(), bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>());	// tag is true only for IEC 559 types with a same-sized integer type in bits<T>
+		}
+
+		/// Convert half-precision floating-point to integer.
+		/// \tparam R rounding mode to use
+		/// \tparam E `true` for round to even, `false` for round away from zero
+		/// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it
+		/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
+		/// \param value half-precision value to convert
+		/// \return rounded integer value (min()/max() of \a T for NaN and infinity, depending on the sign bit)
+		/// \exception FE_INVALID if value is not representable in type \a T
+		/// \exception FE_INEXACT if value had to be rounded and \a I is `true`
+		template<std::float_round_style R,bool E,bool I,typename T> T half2int(unsigned int value)
+		{
+			unsigned int abs = value & 0x7FFF;	// magnitude bits without the sign
+			if(abs >= 0x7C00)	// infinity or NaN
+			{
+				detail::raise(FE_INVALID);
+				return (value&0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+			}
+			if(abs < 0x3800)	// magnitude below 0.5 (0x3800 encodes 0.5)
+			{
+				detail::raise(FE_INEXACT, I);	// NOTE(review): second argument presumably gates the raise - confirm against detail::raise
+				return	(R==std::round_toward_infinity) ? T(~(value>>15)&(abs!=0)) :	// 1 for positive nonzero input
+						(R==std::round_toward_neg_infinity) ? -T(value>0x8000) :	// -1 for negative nonzero input
+						T();	// 0 for all other rounding modes
+			}
+			int exp = 25 - (abs>>10);	// number of fractional bits in the 11-bit significand (<=0 means shift left)
+			unsigned int m = (value&0x3FF) | 0x400;	// significand with implicit leading bit
+			int32 i = static_cast<int32>((exp<=0) ? (m<<-exp) : ((m+(	// add a mode-dependent rounding term, then drop the fractional bits
+				(R==std::round_to_nearest) ? ((1<<(exp-1))-(~(m>>exp)&E)) :	// half ulp, minus one if E and the integer part is even
+				(R==std::round_toward_infinity) ? (((1<<exp)-1)&((value>>15)-1)) :	// all-ones fraction for positive values only
+				(R==std::round_toward_neg_infinity) ? (((1<<exp)-1)&-(value>>15)) : 0))>>exp));	// all-ones fraction for negative values only
+			if((!std::numeric_limits<T>::is_signed && (value&0x8000)) || (std::numeric_limits<T>::digits<16 &&	// negative into unsigned, or out of range for a narrow T
+				((value&0x8000) ? (-i<std::numeric_limits<T>::min()) : (i>std::numeric_limits<T>::max()))))
+				detail::raise(FE_INVALID);
+			else if(I && exp > 0 && (m&((1<<exp)-1)))	// some discarded fractional bit was set
+				detail::raise(FE_INEXACT);
+			return static_cast<T>((value&0x8000) ? -i : i);	// apply the sign
+		}
+
+		/// \}
+		/// \name Mathematics
+		/// \{
+
+		/// Upper part of 64-bit multiplication, built from 16-bit partial products using 32-bit arithmetic only.
+		/// \tparam R rounding mode to use
+		/// \param x first factor
+		/// \param y second factor
+		/// \return upper 32 bit of \a x * \a y
+		template<std::float_round_style R> uint32 mulhi(uint32 x, uint32 y)
+		{
+			uint32 xy = (x>>16) * (y&0xFFFF), yx = (x&0xFFFF) * (y>>16), c = (xy&0xFFFF) + (yx&0xFFFF) + (((x&0xFFFF)*(y&0xFFFF))>>16);	// cross products; c sums everything that lands in product bits 16..31 (carry in its upper half)
+			return (x>>16)*(y>>16) + (xy>>16) + (yx>>16) + (c>>16) +	// high product plus upper halves of the cross products plus carry
+				((R==std::round_to_nearest) ? ((c>>15)&1) : (R==std::round_toward_infinity) ? ((c&0xFFFF)!=0) : 0);	// nearest: add product bit 31; toward infinity: add 1 if (c&0xFFFF) is nonzero
+		}
+
+		/// 64-bit multiplication of two 32-bit factors, keeping only the rounded upper half.
+		/// \param x first factor
+		/// \param y second factor
+		/// \return upper 32 bit of \a x * \a y rounded to nearest
+		inline uint32 multiply64(uint32 x, uint32 y)
+		{
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			return static_cast<uint32>((static_cast<unsigned long long>(x)*static_cast<unsigned long long>(y)+0x80000000)>>32);	// adding half of 2^32 before the shift rounds to nearest
+		#else
+			return mulhi<std::round_to_nearest>(x, y);	// 32-bit-only fallback
+		#endif
+		}
+
+		/// 64-bit division of (\a x << 32) by \a y, truncated to 32 bits.
+		/// \param x upper 32 bit of dividend
+		/// \param y divisor
+		/// \param s variable to store sticky bit for rounding (nonzero if the division left a remainder)
+		/// \return (\a x << 32) / \a y
+		inline uint32 divide64(uint32 x, uint32 y, int &s)
+		{
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			unsigned long long xx = static_cast<unsigned long long>(x) << 32;
+			return s = (xx%y!=0), static_cast<uint32>(xx/y);	// sticky bit from the remainder, quotient truncated to its low 32 bits
+		#else
+			y >>= 1;	// work with the halved divisor; NOTE(review): presumably keeps rem<<=1 below from overflowing 32 bits - confirm
+			uint32 rem = x, div = 0;
+			for(unsigned int i=0; i<32; ++i)	// shift-and-subtract division, one quotient bit per iteration
+			{
+				div <<= 1;
+				if(rem >= y)
+				{
+					rem -= y;
+					div |= 1;
+				}
+				rem <<= 1;
+			}
+			return s = rem > 1, div;	// sticky bit set when the final shifted remainder exceeds 1
+		#endif
+		}
+
+		/// Half precision positive modulus.
+		/// \tparam Q `true` to compute full quotient, `false` else
+		/// \tparam R `true` to compute signed remainder, `false` for positive remainder
+		/// \param x first operand as positive finite half-precision value
+		/// \param y second operand as positive finite half-precision value
+		/// \param quo address to store quotient at (must be valid if \a Q is `true`, unused and may be null otherwise)
+		/// \return modulus of \a x / \a y
+		template<bool Q,bool R> unsigned int mod(unsigned int x, unsigned int y, int *quo = NULL)
+		{
+			unsigned int q = 0;
+			if(x > y)	// reduction is only needed when x exceeds y
+			{
+				int absx = x, absy = y, expx = 0, expy = 0;
+				for(; absx<0x400; absx<<=1,--expx) ;	// normalize a subnormal x
+				for(; absy<0x400; absy<<=1,--expy) ;	// normalize a subnormal y
+				expx += absx >> 10;
+				expy += absy >> 10;
+				int mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;	// significands with implicit bit
+				for(int d=expx-expy; d; --d)	// binary long division, one step per bit of exponent difference
+				{
+					if(!Q && mx == my)	// exact multiple and no quotient wanted: remainder is zero
+						return 0;
+					if(mx >= my)
+					{
+						mx -= my;
+						q += Q;
+					}
+					mx <<= 1;
+					q <<= static_cast<int>(Q);	// quotient only shifts when it is being computed
+				}
+				if(!Q && mx == my)
+					return 0;
+				if(mx >= my)	// final subtraction at equal exponents
+				{
+					mx -= my;
+					++q;
+				}
+				if(Q)
+				{
+					q &= (1<<(std::numeric_limits<int>::digits-1)) - 1;	// keep only the low digits-1 bits of the quotient
+					if(!mx)
+						return *quo = q, 0;	// zero remainder: store quotient and return +0
+				}
+				for(; mx<0x400; mx<<=1,--expy) ;	// renormalize the remainder
+				x = (expy>0) ? ((expy<<10)|(mx&0x3FF)) : (mx>>(1-expy));	// repack as normal or subnormal half bits
+			}
+			if(R)	// signed remainder: choose between x and x - y
+			{
+				unsigned int a, b;
+				if(y < 0x800)	// small y: compare 2*x against y
+				{
+					a = (x<0x400) ? (x<<1) : (x+0x400);	// doubles x (mantissa shift for subnormals, exponent increment otherwise)
+					b = y;
+				}
+				else	// otherwise compare x against y/2 (exponent decrement)
+				{
+					a = x;
+					b = y - 0x400;
+				}
+				if(a > b || (a == b && (q&1)))	// x above y/2, or exactly y/2 with an odd quotient
+				{
+					int exp = (y>>10) + (y<=0x3FF), d = exp - (x>>10) - (x<=0x3FF);
+					int m = (((y&0x3FF)|((y>0x3FF)<<10))<<1) - (((x&0x3FF)|((x>0x3FF)<<10))<<(1-d));	// y - x on aligned significands with one extra bit
+					for(; m<0x800 && exp>1; m<<=1,--exp) ;	// renormalize, stopping at the subnormal range
+					x = 0x8000 + ((exp-1)<<10) + (m>>1);	// result carries the sign bit, i.e. the negative remainder x - y
+					q += Q;
+				}
+			}
+			if(Q)
+				*quo = q;
+			return x;
+		}
+
+		/// Fixed point square root (bit-by-bit method).
+		/// \tparam F number of fractional bits
+		/// \param r radicand in Q1.F fixed point format (modified in place; holds the leftover remainder on return)
+		/// \param exp exponent (halved in place; an odd exponent is first folded into \a r by doubling it)
+		/// \return square root as Q1.F/2
+		template<unsigned int F> uint32 sqrt(uint32 &r, int &exp)
+		{
+			int i = exp & 1;	// 1 if the exponent is odd
+			r <<= i;
+			exp = (exp-i) / 2;	// exp-i is even, so the division is exact
+			uint32 m = 0;
+			for(uint32 bit=static_cast<uint32>(1)<<F; bit; bit>>=2)	// try one result bit per iteration, from the top
+			{
+				if(r < m+bit)	// candidate bit does not fit
+					m >>= 1;
+				else	// candidate bit fits: subtract and record it
+				{
+					r -= m + bit;
+					m = (m>>1) + bit;
+				}
+			}
+			return m;
+		}
+
+		/// Fixed point binary exponential.
+		/// This uses the BKM algorithm in E-mode.
+		/// \param m exponent in [0,1) as Q0.31
+		/// \param n number of iterations (at most 32, the size of the table)
+		/// \return 2 ^ \a m as Q1.31
+		inline uint32 exp2(uint32 m, unsigned int n = 32)
+		{
+			static const uint32 logs[] = {	// logs[i] = log2(1+2^-i) scaled by 2^31; entry 0 is not read by the loop below
+				0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B,
+				0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153,
+				0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171,
+				0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 };
+			if(!m)	// 2^0 = 1.0
+				return 0x80000000;
+			uint32 mx = 0x80000000, my = 0;	// mx: running product starting at 1.0, my: sum of the logs taken so far
+			for(unsigned int i=1; i<n; ++i)
+			{
+				uint32 mz = my + logs[i];
+				if(mz <= m)	// accept the factor (1+2^-i) while the accumulated log stays within m
+				{
+					my = mz;
+					mx += mx >> i;	// mx *= (1+2^-i)
+				}
+			}
+			return mx;
+		}
+
+		/// Fixed point binary logarithm.
+		/// This uses the BKM algorithm in L-mode.
+		/// \param m mantissa in [1,2) as Q1.30
+		/// \param n number of iterations (at most 32, the size of the table)
+		/// \return log2(\a m) as Q0.31
+		inline uint32 log2(uint32 m, unsigned int n = 32)
+		{
+			static const uint32 logs[] = {	// logs[i] = log2(1+2^-i) scaled by 2^31; entry 0 is not read by the loop below
+				0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B,
+				0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153,
+				0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171,
+				0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 };
+			if(m == 0x40000000)	// log2(1.0) = 0
+				return 0;
+			uint32 mx = 0x40000000, my = 0;	// mx: running product starting at 1.0, my: accumulated logarithm
+			for(unsigned int i=1; i<n; ++i)
+			{
+				uint32 mz = mx + (mx>>i);	// candidate product mx*(1+2^-i)
+				if(mz <= m)	// accept the factor while the product stays within m
+				{
+					mx = mz;
+					my += logs[i];
+				}
+			}
+			return my;
+		}
+
+		/// Fixed point sine and cosine.
+		/// This uses the CORDIC algorithm in rotation mode.
+		/// \param mz angle in [-pi/2,pi/2] as Q1.30
+		/// \param n number of iterations (at most 31, the size of the table)
+		/// \return sine (first) and cosine (second) of \a mz as Q1.30
+		inline std::pair<uint32,uint32> sincos(uint32 mz, unsigned int n = 31)
+		{
+			static const uint32 angles[] = {	// angles[i] = atan(2^-i) scaled by 2^30 (entry 0 is pi/4)
+				0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55,
+				0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000,
+				0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080,
+				0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 };
+			uint32 mx = 0x26DD3B6A, my = 0;	// start vector (K,0), K ~ 0.60725 in Q1.30 compensating the CORDIC gain
+			for(unsigned int i=0; i<n; ++i)
+			{
+				uint32 sign = sign_mask(mz);	// NOTE(review): presumably all ones when mz is negative, else 0 - confirm against sign_mask
+				uint32 tx = mx - (arithmetic_shift(my, i)^sign) + sign;	// (v^sign)-sign negates v when sign is all ones
+				uint32 ty = my + (arithmetic_shift(mx, i)^sign) - sign;
+				mx = tx; my = ty; mz -= (angles[i]^sign) - sign;	// rotate and drive the residual angle toward zero
+			}
+			return std::make_pair(my, mx);
+		}
+
+		/// Fixed point arc tangent.
+		/// This uses the CORDIC algorithm in vectoring mode.
+		/// \param my y coordinate as Q0.30
+		/// \param mx x coordinate as Q0.30
+		/// \param n number of iterations (at most 31, the size of the table)
+		/// \return arc tangent of \a my / \a mx as Q1.30
+		inline uint32 atan2(uint32 my, uint32 mx, unsigned int n = 31)
+		{
+			static const uint32 angles[] = {	// angles[i] = atan(2^-i) scaled by 2^30 (entry 0 is pi/4)
+				0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55,
+				0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000,
+				0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080,
+				0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 };
+			uint32 mz = 0;	// accumulated rotation angle
+			for(unsigned int i=0; i<n; ++i)
+			{
+				uint32 sign = sign_mask(my);	// NOTE(review): presumably all ones when my is negative, else 0 - confirm against sign_mask
+				uint32 tx = mx + (arithmetic_shift(my, i)^sign) - sign;	// (v^sign)-sign negates v when sign is all ones
+				uint32 ty = my - (arithmetic_shift(mx, i)^sign) + sign;
+				mx = tx; my = ty; mz += (angles[i]^sign) - sign;	// rotate the vector toward the x axis, summing the applied angles
+			}
+			return mz;
+		}
+
+		/// Reduce argument for trigonometric functions.
+		/// \param abs half-precision floating-point value
+		/// \param k variable to take the quarter period count (set to 0 if no reduction is needed)
+		/// \return \a abs reduced to [-pi/4,pi/4] as Q0.30
+		inline uint32 angle_arg(unsigned int abs, int &k)
+		{
+			uint32 m = (abs&0x3FF) | ((abs>0x3FF)<<10);	// significand, with implicit bit for normal values
+			int exp = (abs>>10) + (abs<=0x3FF) - 15;	// unbiased exponent (subnormals share the exponent of the smallest normal)
+			if(abs < 0x3A48)	// below 0x3A48 (~0.785, just under pi/4): only convert to fixed point
+				return k = 0, m << (exp+20);
+		#if HALF_ENABLE_CPP11_LONG_LONG
+			unsigned long long y = m * 0xA2F9836E4E442, mask = (1ULL<<(62-exp)) - 1, yi = (y+(mask>>1)) & ~mask, f = y - yi;	// y = value * 2/pi; yi = y rounded to an integer count; f = signed fraction
+			uint32 sign = -static_cast<uint32>(f>>63);	// all ones if the fraction is negative
+			k = static_cast<int>(yi>>(62-exp));
+			return (multiply64(static_cast<uint32>((sign ? -f : f)>>(31-exp)), 0xC90FDAA2)^sign) - sign;	// scale |f| by the pi constant 0xC90FDAA2, then restore the sign
+		#else
+			uint32 yh = m*0xA2F98 + mulhi<std::round_toward_zero>(m, 0x36E4E442), yl = (m*0x36E4E442) & 0xFFFFFFFF;	// same product by 2/pi, split into high and low 32-bit words
+			uint32 mask = (static_cast<uint32>(1)<<(30-exp)) - 1, yi = (yh+(mask>>1)) & ~mask, sign = -static_cast<uint32>(yi>yh);	// rounded integer part; sign set if rounding went up
+			k = static_cast<int>(yi>>(30-exp));
+			uint32 fh = (yh^sign) + (yi^~sign) - ~sign, fl = (yl^sign) - sign;	// magnitude of the fraction in two words
+			return (multiply64((exp>-1) ? (((fh<<(1+exp))&0xFFFFFFFF)|((fl&0xFFFFFFFF)>>(31-exp))) : fh, 0xC90FDAA2)^sign) - sign;	// scale by the pi constant, then restore the sign
+		#endif
+		}
+
+		/// Get arguments for atan2 function.
+		/// \param abs half-precision floating-point value
+		/// \return \a abs (first) and sqrt(1 - \a abs^2) (second) as Q0.30
+		inline std::pair<uint32,uint32> atan2_args(unsigned int abs)
+		{
+			int exp = -15;	// starts at minus the exponent bias
+			for(; abs<0x400; abs<<=1,--exp) ;	// normalize subnormals
+			exp += abs >> 10;
+			uint32 my = ((abs&0x3FF)|0x400) << 5, r = my * my;	// 16-bit significand and its 32-bit square
+			int rexp = 2 * exp;	// exponent of the square
+			r = 0x40000000 - ((rexp>-31) ? ((r>>-rexp)|((r&((static_cast<uint32>(1)<<-rexp)-1))!=0)) : 1);	// 1 - abs^2, with shifted-out bits folded into a sticky bit
+			for(rexp=0; r<0x40000000; r<<=1,--rexp) ;	// normalize the difference
+			uint32 mx = sqrt<30>(r, rexp);	// r is left holding the remainder of the root
+			int d = exp - rexp;	// exponent difference between abs and the root
+			if(d < 0)	// abs has the smaller exponent: shift my down (rounded) or up to align
+				return std::make_pair((d<-14) ? ((my>>(-d-14))+((my>>(-d-15))&1)) : (my<<(14+d)), (mx<<14)+(r<<13)/mx);	// NOTE(review): the r/mx term looks like a refinement from the sqrt remainder - confirm
+			if(d > 0)	// the root has the smaller exponent: shift mx instead
+				return std::make_pair(my<<14, (d>14) ? ((mx>>(d-14))+((mx>>(d-15))&1)) : ((d==14) ? mx : ((mx<<(14-d))+(r<<(13-d))/mx)));
+			return std::make_pair(my<<13, (mx<<13)+(r<<12)/mx);	// equal exponents
+		}
+
+		/// Get exponentials for hyperbolic computation: the argument is scaled by 0xB8AA3B29 (log2(e) as Q1.31), split into integer and fractional parts, and the fraction is passed to exp2().
+		/// \param abs half-precision floating-point value (absolute value, sign bit cleared)
+		/// \param exp variable to take unbiased exponent of larger result
+		/// \param n number of BKM iterations (at most 32), forwarded to exp2()
+		/// \return exp(\a abs) and exp(-\a abs) as Q1.31 with same exponent
+		inline std::pair<uint32,uint32> hyperbolic_args(unsigned int abs, int &exp, unsigned int n = 32)
+		{
+			uint32 mx = detail::multiply64(static_cast<uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29), my;	// mantissa (with implicit bit for normals) times log2(e)
+			int e = (abs>>10) + (abs<=0x3FF);	// biased exponent, with subnormals corrected by +1
+			if(e < 14)
+			{
+				exp = 0;	// no integer part: the whole product is the fraction
+				mx >>= 14 - e;
+			}
+			else
+			{
+				exp = static_cast<int>(mx >> (45-e));	// integer part of the scaled argument
+				mx = (mx<<(e-14)) & 0x7FFFFFFF;	// remaining fraction
+			}
+			mx = exp2(mx, n);	// 2^fraction
+			int d = exp << 1, s;	// d = right shift that aligns the smaller result with the exponent of the larger one
+			if(mx > 0x80000000)
+			{
+				my = divide64(0x80000000, mx, s);	// reciprocal of mx
+				my |= s;	// keep the sticky bit reported by divide64
+				++d;
+			}
+			else
+				my = mx;	// mx is at most 0x80000000 here, so it is used unchanged
+			return std::make_pair(mx, (d<31) ? ((my>>d)|((my&((static_cast<uint32>(1)<<d)-1))!=0)) : 1);	// shifted-out bits collapse into a sticky bit; for d >= 31 only the sticky bit remains
+		}
+
+		/// Finish a binary exponential and convert it to half-precision.
+		/// \tparam R rounding mode to use
+		/// \param m fractional part of exponent as Q0.31
+		/// \param exp absolute value of unbiased exponent
+		/// \param esign sign of actual exponent (`true` means negative: the result is formed as a reciprocal)
+		/// \param sign sign bit of result
+		/// \param n number of BKM iterations (at most 32), forwarded to exp2()
+		/// \return value converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded (NOTE(review): earlier text also named a flag `I`, which is not a parameter here - presumably the last template argument of fixed2half; confirm)
+		template<std::float_round_style R> unsigned int exp2_post(uint32 m, int exp, bool esign, unsigned int sign = 0, unsigned int n = 32)
+		{
+			if(esign)
+			{
+				exp = -exp - (m!=0);	// a non-zero fraction moves a negative exponent one step further down
+				if(exp < -25)
+					return underflow<R>(sign);
+				if(exp == -25)
+					return rounded<R,false>(sign, 1, m!=0);
+			}
+			else if(exp > 15)
+				return overflow<R>(sign);
+			if(m == 0)	// exact power of two: assemble the bit pattern directly
+				return sign | (((exp+=15)>0) ? (exp<<10) : check_underflow(0x200>>-exp));
+			m = exp2(m, n);
+			int sticky = 0;
+			if(esign)
+				m = divide64(0x80000000, m, sticky);	// negative exponent: take the reciprocal of 2^fraction
+			return fixed2half<R,31,false,false,true>(m, exp+14, sign, sticky);
+		}
+
+		/// Finish a binary logarithm: merge integer and fractional parts, divide by the base constant and convert.
+		/// \tparam R rounding mode to use
+		/// \tparam L logarithm for base transformation as Q1.31 (used as divisor)
+		/// \param m fractional part of logarithm as Q0.31
+		/// \param ilog signed integer part of logarithm
+		/// \param exp biased exponent of result
+		/// \param sign sign bit of result
+		/// \return value base-transformed and converted to half-precision (0 if the combined logarithm is zero)
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,uint32 L> unsigned int log2_post(uint32 m, int ilog, int exp, unsigned int sign = 0)
+		{
+			const uint32 negmask = sign_mask(ilog);
+			m = (((m>>4)+(static_cast<uint32>(ilog)<<27))^negmask) - negmask;	// combine both parts and take the magnitude via (v^mask)-mask
+			if(m == 0)
+				return 0;
+			while(m < 0x80000000) { m <<= 1; --exp; }	// normalize so that the top bit is set
+			const int carry = (m>=L) ? 1 : 0;	// one extra shift when the dividend is not below the divisor
+			int sticky;
+			m >>= carry + 1;
+			sign ^= negmask & 0x8000;	// a negative logarithm flips the result sign
+			if((exp+=carry) < -11)
+				return underflow<R>(sign);
+			m = divide64(m, L, sticky);	// the sticky output is not forwarded; a constant 1 is passed below instead
+			return fixed2half<R,30,false,false,true>(m, exp, sign, 1);
+		}
+
+		/// Square root and final conversion for the hypotenuse functions.
+		/// \tparam R rounding mode to use
+		/// \param r mantissa as Q2.30 (the radicand)
+		/// \param exp biased exponent
+		/// \return square root converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if value had to be rounded
+		template<std::float_round_style R> unsigned int hypot_post(uint32 r, int exp)
+		{
+			const int top = static_cast<int>(r >> 31);	// 1 when the radicand uses bit 31
+			if((exp+=top) > 46)
+				return overflow<R>();
+			if(exp < -34)
+				return underflow<R>();
+			r = (r&top) | (r>>top);	// drop one bit if needed, keeping the lost bit as sticky
+			const uint32 root = sqrt<30>(r, exp+=15);	// must run before the return: r and exp are read again there
+			return fixed2half<R,15,false,false,false>(root, exp-1, 0, r!=0);
+		}
+
+		/// Divide two fixed-point values and convert the quotient to half-precision (used for tangents).
+		/// \tparam R rounding mode to use
+		/// \param my dividend as Q1.31
+		/// \param mx divisor as Q1.31
+		/// \param exp biased exponent of result
+		/// \param sign sign bit of result
+		/// \return quotient converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R> unsigned int tangent_post(uint32 my, uint32 mx, int exp, unsigned int sign = 0)
+		{
+			const int carry = (my>=mx) ? 1 : 0;	// one extra shift when the dividend is not below the divisor
+			int sticky;
+			if((exp+=carry) > 29)
+				return overflow<R>(sign);
+			if(exp < -11)
+				return underflow<R>(sign);
+			const uint32 quotient = divide64(my>>(carry+1), mx, sticky);
+			return fixed2half<R,30,false,false,true>(quotient, exp, sign, sticky);
+		}
+
+		/// Area function and postprocessing.
+		/// This computes the value directly in Q2.30 using the representation `asinh(x) = log(x+sqrt(x^2+1))` and `acosh(x) = log(x+sqrt(x^2-1))`.
+		/// \tparam R rounding mode to use
+		/// \tparam S `true` for asinh, `false` for acosh
+		/// \param arg half-precision argument (NOTE(review): the normalization loop below never ends for a zero magnitude - presumably callers filter zero; confirm)
+		/// \return asinh|acosh(\a arg) converted to half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,bool S> unsigned int area(unsigned int arg)
+		{
+			int abs = arg & 0x7FFF, expx = (abs>>10) + (abs<=0x3FF) - 15, expy = -15, ilog;	// expx: unbiased exponent of x, with subnormals corrected by +1
+			uint32 mx = static_cast<uint32>((abs&0x3FF)|((abs>0x3FF)<<10)) << 20, my, r;	// mx: mantissa of x (implicit bit only for normals), not normalized
+			for(; abs<0x400; abs<<=1,--expy) ;	// normalize subnormals for the squaring below
+			expy += abs >> 10;
+			r = ((abs&0x3FF)|0x400) << 5;
+			r *= r;	// r = squared mantissa
+			int i = static_cast<int>(r >> 31);	// 1 if the square uses bit 31
+			expy = 2*expy + i;	// exponent of x^2
+			r >>= i;
+			if(S)
+			{
+				if(expy < 0)
+				{
+					r = 0x40000000 + ((expy>-30) ? ((r>>-expy)|((r&((static_cast<uint32>(1)<<-expy)-1))!=0)) : 1);	// x^2 < 1: align x^2 to the 1 (0x40000000), folding lost bits into a sticky bit
+					expy = 0;
+				}
+				else
+				{
+					r += 0x40000000 >> expy;	// x^2 >= 1: align the 1 to x^2 instead
+					i = static_cast<int>(r >> 31);
+					r = (r>>i) | (r&i);	// renormalize after a carry, keeping the dropped bit as sticky
+					expy += i;
+				}
+			}
+			else
+			{
+				r -= 0x40000000 >> expy;	// acosh: x^2 - 1
+				for(; r<0x40000000; r<<=1,--expy) ;	// renormalize after the subtraction
+			}
+			my = sqrt<30>(r, expy);	// square root of x^2 +/- 1; r is reused on the next line (presumably as the remainder left by sqrt<30> - confirm)
+			my = (my<<15) + (r<<14)/my;	// widen the root and add a correction term derived from r
+			if(S)
+			{
+				mx >>= expy - expx;	// asinh: align x to the square root
+				ilog = expy;
+			}
+			else
+			{
+				my >>= expx - expy;	// acosh: align the square root to x
+				ilog = expx;
+			}
+			my += mx;	// x + sqrt(x^2 +/- 1)
+			i = static_cast<int>(my >> 31);	// carry out of the addition
+			static const int G = S && (R==std::round_to_nearest);	// 1 only for asinh in round-to-nearest mode: one more log2 step plus the +8 bias below
+			return log2_post<R,0xB8AA3B2A>(log2(my>>i, 26+S+G)+(G<<3), ilog+i, 17, arg&(static_cast<unsigned>(S)<<15));	// divides by 0xB8AA3B2A (log2(e) as Q1.31, last digit one above the 0xB8AA3B29 used elsewhere) to get the natural log; only asinh keeps the argument's sign
+		}
+
+		/// Class for 1.31 unsigned floating-point computation: a 32-bit mantissa plus a separate integer exponent.
+		struct f31
+		{
+			/// Constructor from raw parts (no normalization is performed).
+			/// \param mant mantissa as 1.31
+			/// \param e exponent
+			HALF_CONSTEXPR f31(uint32 mant, int e) : m(mant), exp(e) {}
+
+			/// Constructor from a half-precision bit pattern; subnormals are normalized.
+			/// \param abs unsigned half-precision value; must be non-zero, because the normalization loop never ends for 0
+			f31(unsigned int abs) : exp(-15)
+			{
+				for(; abs<0x400; abs<<=1,--exp) ;	// shift subnormals up until the implicit bit position is set
+				m = static_cast<uint32>((abs&0x3FF)|0x400) << 21;	// 11-bit mantissa moved to the top of the word
+				exp += (abs>>10);	// add the biased exponent field
+			}
+
+			/// Addition operator; the operands are swapped so that the larger exponent comes first.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a + \a b
+			friend f31 operator+(f31 a, f31 b)
+			{
+				if(b.exp > a.exp)
+					std::swap(a, b);
+				int d = a.exp - b.exp;
+				uint32 m = a.m + ((d<32) ? (b.m>>d) : 0);	// align b to a; b vanishes when 32 or more bits apart
+				int i = (m&0xFFFFFFFF) < a.m;	// 1 if the addition wrapped around (carry out)
+				return f31(((m+i)>>i)|0x80000000, a.exp+i);	// on carry: round, shift down and restore the top bit
+			}
+
+			/// Subtraction operator. NOTE(review): unlike operator+ the operands are not swapped, so this appears to assume \a a >= \a b - confirm.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a - \a b, or mantissa 0 with exponent -32 if the mantissa difference is zero
+			friend f31 operator-(f31 a, f31 b)
+			{
+				int d = a.exp - b.exp, exp = a.exp;
+				uint32 m = a.m - ((d<32) ? (b.m>>d) : 0);	// align b to a before subtracting
+				if(!m)
+					return f31(0, -32);
+				for(; m<0x80000000; m<<=1,--exp) ;	// renormalize after cancellation
+				return f31(m, exp);
+			}
+
+			/// Multiplication operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a * \a b
+			friend f31 operator*(f31 a, f31 b)
+			{
+				uint32 m = multiply64(a.m, b.m);
+				int i = static_cast<int>(m >> 31);	// 1 if the product already has its top bit set
+				return f31(m<<(1-i), a.exp + b.exp + i);	// otherwise shift up by one to renormalize
+			}
+
+			/// Division operator.
+			/// \param a first operand
+			/// \param b second operand
+			/// \return \a a / \a b
+			friend f31 operator/(f31 a, f31 b)
+			{
+				int i = a.m >= b.m, s;	// s receives the sticky output of divide64 and is not used afterwards
+				uint32 m = divide64((a.m+i)>>i, b.m, s);	// pre-shift the dividend when it is not below the divisor
+				return f31(m, a.exp - b.exp + i - 1);
+			}
+
+			uint32 m;			///< mantissa as 1.31.
+			int exp;			///< exponent.
+		};
+
+		/// Error function and postprocessing.
+		/// This computes the value directly in Q1.31 using the approximations given
+		/// [here](https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions).
+		/// \tparam R rounding mode to use
+		/// \tparam C `true` for complementary error function, `false` else
+		/// \param arg half-precision function argument (NOTE(review): the f31 constructor used below never terminates for a zero magnitude - presumably callers filter zero; confirm)
+		/// \return approximated value of error function in half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if no other exception occurred
+		template<std::float_round_style R,bool C> unsigned int erf(unsigned int arg)
+		{
+			unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;	// split magnitude and sign bit
+			f31 x(abs), x2 = x * x * f31(0xB8AA3B29, 0), t = f31(0x80000000, 0) / (f31(0x80000000, 0)+f31(0xA7BA054A, -2)*x), t2 = t * t;	// x2 = x^2*log2(e) (exponent for exp2), t = 1/(1+p*x) with p = f31(0xA7BA054A,-2)
+			f31 e = ((f31(0x87DC2213, 0)*t2+f31(0xB5F0E2AE, 0))*t2+f31(0x82790637, -2)-(f31(0xBA00E2B8, 0)*t2+f31(0x91A98E62, -2))*t) * t /	// polynomial in t with even and odd powers grouped, ...
+					((x2.exp<0) ? f31(exp2((x2.exp>-32) ? (x2.m>>-x2.exp) : 0, 30), 0) : f31(exp2((x2.m<<x2.exp)&0x7FFFFFFF, 22), static_cast<int>(x2.m>>(31-x2.exp))));	// ... divided by 2^x2, with x2 split into integer and fractional parts
+			return (!C || sign) ? fixed2half<R,31,false,true,true>(0x80000000-(e.m>>(C-e.exp)), 14+C, sign&(C-1U)) :	// erf, or erfc of a negative argument: result built from 1 - e; erfc never gets a sign bit
+					(e.exp<-25) ? underflow<R>() : fixed2half<R,30,false,false,true>(e.m>>1, e.exp+14, 0, e.m&1);	// erfc of a positive argument: e itself is the result
+		}
+
+		/// Gamma function and postprocessing.
+		/// This approximates the value of either the gamma function or its logarithm directly in Q1.31, using f31 arithmetic.
+		/// \tparam R rounding mode to use
+		/// \tparam L `true` for logarithm of gamma function, `false` for gamma function
+		/// \param arg half-precision floating-point value (NOTE(review): the f31 constructor never terminates for a zero magnitude - presumably callers filter zero; confirm)
+		/// \return lgamma/tgamma(\a arg) in half-precision
+		/// \exception FE_OVERFLOW on overflows
+		/// \exception FE_UNDERFLOW on underflows
+		/// \exception FE_INEXACT if \a arg is not a positive integer
+		template<std::float_round_style R,bool L> unsigned int gamma(unsigned int arg)
+		{
+/*			static const double p[] ={ 2.50662827563479526904, 225.525584619175212544, -268.295973841304927459, 80.9030806934622512966, -5.00757863970517583837, 0.0114684895434781459556 };
+			double t = arg + 4.65, s = p[0];
+			for(unsigned int i=0; i<5; ++i)
+				s += p[i+1] / (arg+i);
+			return std::log(s) + (arg-0.5)*std::log(t) - t;
+*/			static const f31 pi(0xC90FDAA2, 1), lbe(0xB8AA3B29, 0);	// the block comment above is a floating-point version kept for reference; pi and log2(e) as f31
+			unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000;
+			bool bsign = sign != 0;
+			f31 z(abs), x = sign ? (z+f31(0x80000000, 0)) : z, t = x + f31(0x94CCCCCD, 2), s =	// negative arguments are evaluated at |arg|+1; t = x + 4.65
+				f31(0xA06C9901, 1) + f31(0xBBE654E2, -7)/(x+f31(0x80000000, 2)) + f31(0xA1CE6098, 6)/(x+f31(0x80000000, 1))	// series terms, positive ones first; the constants presumably encode p[] from the reference above - confirm
+				+ f31(0xE1868CB7, 7)/x - f31(0x8625E279, 8)/(x+f31(0x80000000, 0)) - f31(0xA03E158F, 2)/(x+f31(0xC0000000, 1));
+			int i = (s.exp>=2) + (s.exp>=4) + (s.exp>=8) + (s.exp>=16);	// bits needed to hold the integer part of log2(s)
+			s = f31((static_cast<uint32>(s.exp)<<(31-i))+(log2(s.m>>1, 28)>>i), i) / lbe;	// s = log2(s)/log2(e), i.e. the natural logarithm of s
+			if(x.exp != -1 || x.m != 0x80000000)	// skipped for x == 0.5, where the (x-0.5) factor below vanishes
+			{
+				i = (t.exp>=2) + (t.exp>=4) + (t.exp>=8);
+				f31 l = f31((static_cast<uint32>(t.exp)<<(31-i))+(log2(t.m>>1, 30)>>i), i) / lbe;	// l = natural logarithm of t
+				s = (x.exp<-1) ? (s-(f31(0x80000000, -1)-x)*l) : (s+(x-f31(0x80000000, -1))*l);	// add (x-0.5)*log(t), arranged so that f31 never subtracts larger from smaller
+			}
+			s = x.exp ? (s-t) : (t-s);	// subtract t; for x.exp == 0 the operands are swapped and the sign is fixed up later
+			if(bsign)
+			{
+				if(z.exp >= 0)
+				{
+					sign &= (L|((z.m>>(31-z.exp))&1)) - 1;	// the sign bit survives only for tgamma (L false) when the lowest integer bit of |arg| is clear
+					for(z=f31((z.m<<(1+z.exp))&0xFFFFFFFF, -1); z.m<0x80000000; z.m<<=1,--z.exp) ;	// keep only the fractional part of |arg| and renormalize
+				}
+				if(z.exp == -1)
+					z = f31(0x80000000, 0) - z;	// fraction in [0.5,1): replace it by 1 - fraction
+				if(z.exp < -1)
+				{
+					z = z * pi;
+					z.m = sincos(z.m>>(1-z.exp), 30).first;	// z = sin(pi*z), presumably for the reflection formula - confirm
+					for(z.exp=1; z.m<0x80000000; z.m<<=1,--z.exp) ;	// renormalize the sine
+				}
+				else
+					z = f31(0x80000000, 0);	// otherwise the factor is exactly 1
+			}
+			if(L)
+			{
+				if(bsign)
+				{
+					f31 l(0x92868247, 0);	// NOTE(review): looks like log(pi) - confirm
+					if(z.exp < 0)
+					{
+						uint32 m = log2((z.m+1)>>1, 27);
+						z = f31(-((static_cast<uint32>(z.exp)<<26)+(m>>5)), 5);	// z = -log2(z), positive because z < 1 here
+						for(; z.m<0x80000000; z.m<<=1,--z.exp) ;
+						l = l + z / lbe;	// add the natural logarithm of 1/z
+					}
+					sign = static_cast<unsigned>(x.exp&&(l.exp<s.exp||(l.exp==s.exp&&l.m<s.m))) << 15;	// result is negative when l < s (and x.exp != 0)
+					s = sign ? (s-l) : x.exp ? (l-s) : (l+s);	// always subtract the smaller from the larger
+				}
+				else
+				{
+					sign = static_cast<unsigned>(x.exp==0) << 15;	// negative sign exactly when x.exp == 0, where s holds t-s from above
+					if(s.exp < -24)
+						return underflow<R>(sign);
+					if(s.exp > 15)
+						return overflow<R>(sign);
+				}
+			}
+			else
+			{
+				s = s * lbe;	// convert to a base-2 exponent
+				uint32 m;
+				if(s.exp < 0)
+				{
+					m = s.m >> -s.exp;	// pure fraction
+					s.exp = 0;
+				}
+				else
+				{
+					m = (s.m<<s.exp) & 0x7FFFFFFF;	// fractional part
+					s.exp = static_cast<int>(s.m>>(31-s.exp));	// integer part becomes the exponent
+				}
+				s.m = exp2(m, 27);
+				if(!x.exp)
+					s = f31(0x80000000, 0) / s;	// s held the negated exponent (see t-s above), so invert
+				if(bsign)
+				{
+					if(z.exp < 0)
+						s = s * z;
+					s = pi / s;	// pi divided by the product with the sine factor
+					if(s.exp < -24)
+						return underflow<R>(sign);
+				}
+				else if(z.exp > 0 && !(z.m&((1<<(31-z.exp))-1)))	// positive argument without fractional bits: pack directly, bypassing fixed2half
+					return ((s.exp+14)<<10) + static_cast<unsigned int>(s.m>>21);
+				if(s.exp > 15)
+					return overflow<R>(sign);
+			}
+			return fixed2half<R,31,false,false,true>(s.m, s.exp+14, sign);
+		}
+		/// \}
+
+		template<typename,typename,std::float_round_style> struct half_caster;	// forward declaration so that class half can befriend it; the definition follows the class
+	}
+
+	/// Half-precision floating-point type.
+	/// This class implements an IEEE-conformant half-precision floating-point type with the usual arithmetic
+	/// operators and conversions. It is implicitly convertible to single-precision floating-point, which makes arithmetic
+	/// expressions and functions with mixed-type operands to be of the most precise operand type.
+	///
+	/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
+	/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
+	/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the
+	/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of
+	/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most
+	/// probably not involve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit
+	/// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if
+	/// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on
+	/// nearly any reasonable platform.
+	///
+	/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable
+	/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
+	class half
+	{
+	public:
+		/// \name Construction and assignment
+		/// \{
+
+		/// Default constructor.
+		/// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
+		/// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
+		HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
+
+		/// Conversion constructor (explicit, so floats are never silently narrowed to half).
+		/// \param rhs float to convert
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		explicit half(float rhs) : data_(static_cast<detail::uint16>(detail::float2half<round_style>(rhs))) {}
+	
+		/// Conversion to single-precision (implicit).
+		/// \return single precision value representing expression value
+		operator float() const { return detail::half2float<float>(data_); }
+
+		/// Assignment operator.
+		/// \param rhs single-precision value to copy from
+		/// \return reference to this half
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		half& operator=(float rhs) { data_ = static_cast<detail::uint16>(detail::float2half<round_style>(rhs)); return *this; }
+
+		/// \}
+		/// \name Arithmetic updates
+		/// \{
+
+		/// Arithmetic assignment (addition).
+		/// Equivalent to `*this = *this + rhs`.
+		/// \param rhs half expression to add
+		/// \return reference to this half
+		/// \exception FE_... according to operator+(half,half)
+		half& operator+=(half rhs) { return *this = *this + rhs; }
+
+		/// Arithmetic assignment (subtraction).
+		/// Equivalent to `*this = *this - rhs`.
+		/// \param rhs half expression to subtract
+		/// \return reference to this half
+		/// \exception FE_... according to operator-(half,half)
+		half& operator-=(half rhs) { return *this = *this - rhs; }
+
+		/// Arithmetic assignment (multiplication).
+		/// Equivalent to `*this = *this * rhs`.
+		/// \param rhs half expression to multiply with
+		/// \return reference to this half
+		/// \exception FE_... according to operator*(half,half)
+		half& operator*=(half rhs) { return *this = *this * rhs; }
+
+		/// Arithmetic assignment (division).
+		/// Equivalent to `*this = *this / rhs`.
+		/// \param rhs half expression to divide by
+		/// \return reference to this half
+		/// \exception FE_... according to operator/(half,half)
+		half& operator/=(half rhs) { return *this = *this / rhs; }
+
+		/// Arithmetic assignment (addition of a single-precision value).
+		/// \param rhs single-precision value to add
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator+=(float rhs) { return *this = *this + rhs; }
+
+		/// Arithmetic assignment (subtraction of a single-precision value).
+		/// \param rhs single-precision value to subtract
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator-=(float rhs) { return *this = *this - rhs; }
+
+		/// Arithmetic assignment (multiplication with a single-precision value).
+		/// \param rhs single-precision value to multiply with
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator*=(float rhs) { return *this = *this * rhs; }
+
+		/// Arithmetic assignment (division by a single-precision value).
+		/// \param rhs single-precision value to divide by
+		/// \return reference to this half
+		/// \exception FE_... according to operator=()
+		half& operator/=(float rhs) { return *this = *this / rhs; }
+
+		/// \}
+		/// \name Increment and decrement
+		/// \{
+
+		/// Prefix increment (adds 0x3C00, the bit pattern of 1.0).
+		/// \return incremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half& operator++() { return *this = *this + half(detail::binary, 0x3C00); }
+
+		/// Prefix decrement (adds 0xBC00, the bit pattern of -1.0).
+		/// \return decremented half value
+		/// \exception FE_... according to operator+(half,half), through which the decrement is implemented
+		half& operator--() { return *this = *this + half(detail::binary, 0xBC00); }
+
+		/// Postfix increment.
+		/// \return non-incremented half value
+		/// \exception FE_... according to operator+(half,half)
+		half operator++(int) { half out(*this); ++*this; return out; }
+
+		/// Postfix decrement.
+		/// \return non-decremented half value
+		/// \exception FE_... according to operator+(half,half), as for the prefix decrement
+		half operator--(int) { half out(*this); --*this; return out; }
+		/// \}
+	
+	private:
+		/// Rounding mode to use
+		static const std::float_round_style round_style = static_cast<std::float_round_style>(HALF_ROUND_STYLE);
+
+		/// Constructor from a raw bit pattern (private; reachable only by the friends listed below).
+		/// \param bits binary representation to set half to
+		HALF_CONSTEXPR half(detail::binary_t, unsigned int bits) HALF_NOEXCEPT : data_(static_cast<detail::uint16>(bits)) {}
+
+		/// Internal binary representation
+		detail::uint16 data_;
+
+	#ifndef HALF_DOXYGEN_ONLY
+		friend HALF_CONSTEXPR_NOERR bool operator==(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator!=(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator<(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator>(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator<=(half, half);
+		friend HALF_CONSTEXPR_NOERR bool operator>=(half, half);
+		friend HALF_CONSTEXPR half operator-(half);
+		friend half operator+(half, half);
+		friend half operator-(half, half);
+		friend half operator*(half, half);
+		friend half operator/(half, half);
+		template<typename charT,typename traits> friend std::basic_ostream<charT,traits>& operator<<(std::basic_ostream<charT,traits>&, half);
+		template<typename charT,typename traits> friend std::basic_istream<charT,traits>& operator>>(std::basic_istream<charT,traits>&, half&);
+		friend HALF_CONSTEXPR half fabs(half);
+		friend half fmod(half, half);
+		friend half remainder(half, half);
+		friend half remquo(half, half, int*);
+		friend half fma(half, half, half);
+		friend HALF_CONSTEXPR_NOERR half fmax(half, half);
+		friend HALF_CONSTEXPR_NOERR half fmin(half, half);
+		friend half fdim(half, half);
+		friend half nanh(const char*);
+		friend half exp(half);
+		friend half exp2(half);
+		friend half expm1(half);
+		friend half log(half);
+		friend half log10(half);
+		friend half log2(half);
+		friend half log1p(half);
+		friend half sqrt(half);
+		friend half rsqrt(half);
+		friend half cbrt(half);
+		friend half hypot(half, half);
+		friend half hypot(half, half, half);
+		friend half pow(half, half);
+		friend void sincos(half, half*, half*);
+		friend half sin(half);
+		friend half cos(half);
+		friend half tan(half);
+		friend half asin(half);
+		friend half acos(half);
+		friend half atan(half);
+		friend half atan2(half, half);
+		friend half sinh(half);
+		friend half cosh(half);
+		friend half tanh(half);
+		friend half asinh(half);
+		friend half acosh(half);
+		friend half atanh(half);
+		friend half erf(half);
+		friend half erfc(half);
+		friend half lgamma(half);
+		friend half tgamma(half);
+		friend half ceil(half);
+		friend half floor(half);
+		friend half trunc(half);
+		friend half round(half);
+		friend long lround(half);
+		friend half rint(half);
+		friend long lrint(half);
+		friend half nearbyint(half);
+	#if HALF_ENABLE_CPP11_LONG_LONG
+		friend long long llround(half);
+		friend long long llrint(half);
+	#endif
+		friend half frexp(half, int*);
+		friend half scalbln(half, long);
+		friend half modf(half, half*);
+		friend int ilogb(half);
+		friend half logb(half);
+		friend half nextafter(half, half);
+		friend half nexttoward(half, long double);
+		friend HALF_CONSTEXPR half copysign(half, half);
+		friend HALF_CONSTEXPR int fpclassify(half);
+		friend HALF_CONSTEXPR bool isfinite(half);
+		friend HALF_CONSTEXPR bool isinf(half);
+		friend HALF_CONSTEXPR bool isnan(half);
+		friend HALF_CONSTEXPR bool isnormal(half);
+		friend HALF_CONSTEXPR bool signbit(half);
+		friend HALF_CONSTEXPR bool isgreater(half, half);
+		friend HALF_CONSTEXPR bool isgreaterequal(half, half);
+		friend HALF_CONSTEXPR bool isless(half, half);
+		friend HALF_CONSTEXPR bool islessequal(half, half);
+		friend HALF_CONSTEXPR bool islessgreater(half, half);
+		template<typename,typename,std::float_round_style> friend struct detail::half_caster;
+		friend class std::numeric_limits<half>;
+	#if HALF_ENABLE_CPP11_HASH
+		friend struct std::hash<half>;
+	#endif
+	#if HALF_ENABLE_CPP11_USER_LITERALS
+		friend half literal::operator "" _h(long double);
+	#endif
+	#endif
+	};
+
+#if HALF_ENABLE_CPP11_USER_LITERALS
+	namespace literal
+	{
+		/// Half literal (suffix `_h`), converted with the rounding mode `half::round_style`.
+		/// While this returns a properly rounded half-precision value, half literals can unfortunately not be constant
+		/// expressions due to rather involved conversions. So don't expect this to be a literal literal without involving
+		/// conversion operations at runtime. It is a convenience feature, not a performance optimization.
+		/// \param value literal value
+		/// \return half with the given value (possibly rounded)
+		/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+		inline half operator "" _h(long double value) { return half(detail::binary, detail::float2half<half::round_style>(value)); }
+	}
+#endif
+
+	namespace detail
+	{
+		/// Helper class for half casts.
+		/// This class template has to be specialized for all valid cast arguments to define an appropriate static
+		/// `cast` member function; the primary template is empty, so unsupported casts have no `cast` member.
+		/// \tparam T destination type
+		/// \tparam U source type
+		/// \tparam R rounding mode to use
+		template<typename T,typename U,std::float_round_style R=static_cast<std::float_round_style>(HALF_ROUND_STYLE)> struct half_caster {};
+		template<typename U,std::float_round_style R> struct half_caster<half,U,R>	// arithmetic type -> half
+		{
+		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+			static_assert(std::is_arithmetic<U>::value, "half_cast from non-arithmetic type not supported");
+		#endif
+
+			static half cast(U arg) { return cast_impl(arg, is_float<U>()); }	// dispatch on is_float<U>
+
+		private:
+			static half cast_impl(U arg, true_type) { return half(binary, float2half<R>(arg)); }	// floating-point source
+			static half cast_impl(U arg, false_type) { return half(binary, int2half<R>(arg)); }	// any other source goes through int2half
+		};
+		template<typename T,std::float_round_style R> struct half_caster<T,half,R>	// half -> arithmetic type
+		{
+		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
+			static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type not supported");
+		#endif
+
+			static T cast(half arg) { return cast_impl(arg, is_float<T>()); }	// dispatch on is_float<T>
+
+		private:
+			static T cast_impl(half arg, true_type) { return half2float<T>(arg.data_); }	// floating-point destination
+			static T cast_impl(half arg, false_type) { return half2int<R,true,true,T>(arg.data_); }	// any other destination goes through half2int
+		};
+		template<std::float_round_style R> struct half_caster<half,half,R>	// half -> half: identity
+		{
+			static half cast(half arg) { return arg; }
+		};
+	}
+}
+
+/// Extensions to the C++ standard library.
+namespace std
+{
+	/// Numeric limits for half-precision floats; the value functions return fixed bit patterns through the private binary constructor of half.
+	/// **See also:** Documentation for [std::numeric_limits](https://en.cppreference.com/w/cpp/types/numeric_limits)
+	template<> class numeric_limits<half_float::half>
+	{
+	public:
+		/// Is template specialization.
+		static HALF_CONSTEXPR_CONST bool is_specialized = true;
+
+		/// Supports signed values.
+		static HALF_CONSTEXPR_CONST bool is_signed = true;
+
+		/// Is not an integer type.
+		static HALF_CONSTEXPR_CONST bool is_integer = false;
+
+		/// Is not exact.
+		static HALF_CONSTEXPR_CONST bool is_exact = false;
+
+		/// Doesn't provide modulo arithmetic.
+		static HALF_CONSTEXPR_CONST bool is_modulo = false;
+
+		/// Has a finite set of values.
+		static HALF_CONSTEXPR_CONST bool is_bounded = true;
+
+		/// IEEE conformant.
+		static HALF_CONSTEXPR_CONST bool is_iec559 = true;
+
+		/// Supports infinity.
+		static HALF_CONSTEXPR_CONST bool has_infinity = true;
+
+		/// Supports quiet NaNs.
+		static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
+
+		/// Supports signaling NaNs.
+		static HALF_CONSTEXPR_CONST bool has_signaling_NaN = true;
+
+		/// Supports subnormal values.
+		static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
+
+		/// Does not support denormalization detection.
+		static HALF_CONSTEXPR_CONST bool has_denorm_loss = false;
+
+	#if HALF_ERRHANDLING_THROWS
+		static HALF_CONSTEXPR_CONST bool traps = true;
+	#else
+		/// Traps only if [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID) is activated.
+		static HALF_CONSTEXPR_CONST bool traps = false;
+	#endif
+
+		/// Does not support pre-rounding underflow detection.
+		static HALF_CONSTEXPR_CONST bool tinyness_before = false;
+
+		/// Rounding mode (taken from half::round_style).
+		static HALF_CONSTEXPR_CONST float_round_style round_style = half_float::half::round_style;
+
+		/// Significant digits (10 stored mantissa bits plus the implicit bit).
+		static HALF_CONSTEXPR_CONST int digits = 11;
+
+		/// Significant decimal digits.
+		static HALF_CONSTEXPR_CONST int digits10 = 3;
+
+		/// Required decimal digits to represent all possible values.
+		static HALF_CONSTEXPR_CONST int max_digits10 = 5;
+
+		/// Number base.
+		static HALF_CONSTEXPR_CONST int radix = 2;
+
+		/// One more than smallest exponent.
+		static HALF_CONSTEXPR_CONST int min_exponent = -13;
+
+		/// Smallest normalized representable power of 10.
+		static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
+
+		/// One more than largest exponent
+		static HALF_CONSTEXPR_CONST int max_exponent = 16;
+
+		/// Largest finitely representable power of 10.
+		static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
+
+		/// Smallest positive normal value (bit pattern 0x0400).
+		static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); }
+
+		/// Smallest (most negative) finite value (bit pattern 0xFBFF).
+		static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); }
+
+		/// Largest finite value (bit pattern 0x7BFF).
+		static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); }
+
+		/// Difference between 1 and next representable value (bit pattern 0x1400).
+		static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); }
+
+		/// Maximum rounding error in ULP (units in the last place): 0x3800 (0.5) for round-to-nearest, 0x3C00 (1.0) otherwise.
+		static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
+			{ return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); }
+
+		/// Positive infinity (bit pattern 0x7C00).
+		static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); }
+
+		/// Quiet NaN (bit pattern 0x7FFF).
+		static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); }
+
+		/// Signaling NaN (bit pattern 0x7DFF).
+		static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); }
+
+		/// Smallest positive subnormal value (bit pattern 0x0001).
+		static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); }
+	};
+
+#if HALF_ENABLE_CPP11_HASH
+	/// Hash functor for half-precision floats, delegating to the hash of the underlying 16-bit pattern.
+	/// Only available when C++11 `std::hash` support is enabled (HALF_ENABLE_CPP11_HASH).
+	///
+	/// **See also:** Documentation for [std::hash](https://en.cppreference.com/w/cpp/utility/hash)
+	template<> struct hash<half_float::half>
+	{
+		/// Type of function argument.
+		typedef half_float::half argument_type;
+
+		/// Function return type.
+		typedef size_t result_type;
+
+		/// Compute hash function; negative zero (0x8000) is hashed as positive zero so that both zeros get the same hash.
+		/// \param arg half to hash
+		/// \return hash value
+		result_type operator()(argument_type arg) const { return hash<half_float::detail::uint16>()((arg.data_==0x8000) ? half_float::detail::uint16() : arg.data_); }
+	};
+#endif
+}
+
+namespace half_float
+{
+	/// \anchor compop
+	/// \name Comparison operators
+	/// \{
+
+	/// Equality comparison: identical bit patterns, or both operands zero regardless of sign.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if operands equal
+	/// \retval false else (in particular whenever detail::compsignal reports a NaN operand)
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator==(half x, half y)
+	{
+		return detail::compsignal(x.data_, y.data_) ? false : (x.data_==y.data_ || ((x.data_|y.data_)&0x7FFF)==0);
+	}
+
+	/// Inequality comparison: different bit patterns that are not merely zeros of opposite sign.
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if operands not equal (in particular whenever detail::compsignal reports a NaN operand)
+	/// \retval false else
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator!=(half x, half y)
+	{
+		return detail::compsignal(x.data_, y.data_) ? true : (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)!=0);
+	}
+
+	/// Less-than comparison; both bit patterns are mapped to integers that sort like the values (negatives mirrored, both zeros equal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than \a y
+	/// \retval false else (in particular whenever detail::compsignal reports a NaN operand)
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator<(half x, half y)
+	{
+		return detail::compsignal(x.data_, y.data_) ? false :
+			(((x.data_&0x8000) ? (0x10000-x.data_) : (0x8000+x.data_)) < ((y.data_&0x8000) ? (0x10000-y.data_) : (0x8000+y.data_)));
+	}
+
+	/// Greater-than comparison; both bit patterns are mapped to integers that sort like the values (negatives mirrored, both zeros equal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than \a y
+	/// \retval false else (in particular whenever detail::compsignal reports a NaN operand)
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator>(half x, half y)
+	{
+		return detail::compsignal(x.data_, y.data_) ? false :
+			(((x.data_&0x8000) ? (0x10000-x.data_) : (0x8000+x.data_)) > ((y.data_&0x8000) ? (0x10000-y.data_) : (0x8000+y.data_)));
+	}
+
+	/// Less-than-or-equal comparison.
+	/// \param x left-hand operand
+	/// \param y right-hand operand
+	/// \retval true if \a x does not order after \a y (bit patterns are mapped to monotonic integer keys; both zeros share one key)
+	/// \retval false otherwise, or whenever `detail::compsignal()` returns true for the operands
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator<=(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) &&
+			((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) >= ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15));
+	}
+
+	/// Greater-than-or-equal comparison.
+	/// \param x left-hand operand
+	/// \param y right-hand operand
+	/// \retval true if \a x does not order before \a y (bit patterns are mapped to monotonic integer keys; both zeros share one key)
+	/// \retval false otherwise, or whenever `detail::compsignal()` returns true for the operands
+	/// \exception FE_INVALID if \a x or \a y is NaN
+	inline HALF_CONSTEXPR_NOERR bool operator>=(half x, half y)
+	{
+		return !detail::compsignal(x.data_, y.data_) &&
+			((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) <= ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15));
+	}
+
+	/// \}
+	/// \anchor arithmetics
+	/// \name Arithmetic operators
+	/// \{
+
+	/// Identity (unary plus).
+	/// \param arg operand
+	/// \return \a arg, returned as is
+	inline HALF_CONSTEXPR half operator+(half arg) { return arg; }
+
+	/// Negation (unary minus).
+	/// \param arg operand
+	/// \return \a arg with its sign bit (0x8000) flipped; all other bits are left untouched
+	inline HALF_CONSTEXPR half operator-(half arg) { return half(detail::binary, arg.data_^0x8000); }
+
+	/// Addition.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return sum of half expressions
+	/// \exception FE_INVALID if \a x and \a y are infinities with different signs or signaling NaNs
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator+(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)+detail::half2float<detail::internal_t>(y.data_)));	// add in the configured arithmetic type, then round back to half
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF;	// magnitudes with the sign bit stripped
+		bool sub = ((x.data_^y.data_)&0x8000) != 0;	// signs differ, so the magnitudes are effectively subtracted
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) : (absy!=0x7C00) ? x.data_ :
+										(sub && absx==0x7C00) ? detail::invalid() : y.data_);	// infinities of opposite sign are invalid
+		if(!absx)	// x is a zero
+			return absy ? y : half(detail::binary, (half::round_style==std::round_toward_neg_infinity) ? (x.data_|y.data_) : (x.data_&y.data_));	// 0 + 0: signs are OR-ed when rounding toward -inf, AND-ed otherwise
+		if(!absy)	// y is a zero
+			return x;
+		unsigned int sign = ((sub && absy>absx) ? y.data_ : x.data_) & 0x8000;	// sign of the operand with the larger magnitude
+		if(absy > absx)
+			std::swap(absx, absy);	// from here on absx holds the larger magnitude
+		int exp = (absx>>10) + (absx<=0x3FF), d = exp - (absy>>10) - (absy<=0x3FF), mx = ((absx&0x3FF)|((absx>0x3FF)<<10)) << 3, my;	// exp: exponent field of the larger operand (subnormals counted as 1); d: exponent distance; mx: significand with 3 extra low bits
+		if(d < 13)
+		{
+			my = ((absy&0x3FF)|((absy>0x3FF)<<10)) << 3;
+			my = (my>>d) | ((my&((1<<d)-1))!=0);	// align to mx; any bits shifted out are folded into the lowest (sticky) bit
+		}
+		else
+			my = 1;	// smaller operand only contributes a sticky bit
+		if(sub)
+		{
+			if(!(mx-=my))	// exact cancellation
+				return half(detail::binary, static_cast<unsigned>(half::round_style==std::round_toward_neg_infinity)<<15);	// zero, negative only when rounding toward -inf
+			for(; mx<0x2000 && exp>1; mx<<=1,--exp) ;	// renormalize after cancellation, stopping at the smallest exponent
+		}
+		else
+		{
+			mx += my;
+			int i = mx >> 14;	// 1 if the sum carried out of the significand
+			if((exp+=i) > 30)
+				return half(detail::binary, detail::overflow<half::round_style>(sign));
+			mx = (mx>>i) | (mx&i);	// shift down on carry, keeping the dropped bit sticky
+		}
+		return half(detail::binary, detail::rounded<half::round_style,false>(sign+((exp-1)<<10)+(mx>>3), (mx>>2)&1, (mx&0x3)!=0));	// bit 2 is passed as guard bit, the low two bits as sticky flag
+	#endif
+	}
+
+	/// Subtraction.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return difference of half expressions
+	/// \exception FE_INVALID if \a x and \a y are infinities with equal signs or signaling NaNs
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator-(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)-detail::half2float<detail::internal_t>(y.data_)));	// subtract in the configured arithmetic type, then round back to half
+	#else
+		return x + -y;	// reuse binary addition with the sign bit of y flipped
+	#endif
+	}
+
+	/// Multiplication.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return product of half expressions
+	/// \exception FE_INVALID if multiplying 0 with infinity or if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator*(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)*detail::half2float<detail::internal_t>(y.data_)));	// multiply in the configured arithmetic type, then round back to half
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -16;	// magnitudes with the sign bit stripped; exp accumulates the result exponent
+		unsigned int sign = (x.data_^y.data_) & 0x8000;	// product sign is the XOR of the operand signs
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										((absx==0x7C00 && !absy)||(absy==0x7C00 && !absx)) ? detail::invalid() : (sign|0x7C00));	// inf * 0 is invalid, otherwise signed infinity
+		if(!absx || !absy)
+			return half(detail::binary, sign);	// signed zero
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize a subnormal x, compensating in exp
+		for(; absy<0x400; absy<<=1,--exp) ;	// normalize a subnormal y, compensating in exp
+		detail::uint32 m = static_cast<detail::uint32>((absx&0x3FF)|0x400) * static_cast<detail::uint32>((absy&0x3FF)|0x400);	// product of the two 11-bit significands
+		int i = static_cast<int>(m >> 21), s = static_cast<int>(m & i);	// i: 1 if the product reached bit 21; s: the bit lost by the extra shift below
+		exp += (absx>>10) + (absy>>10) + i;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -11)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,20,false,false,false>(m>>i, exp, sign, s));
+	#endif
+	}
+
+	/// Division.
+	/// This operation is exact to rounding for all rounding modes.
+	/// \param x left operand
+	/// \param y right operand
+	/// \return quotient of half expressions
+	/// \exception FE_INVALID if dividing 0s or infinities with each other or if \a x or \a y is signaling NaN
+	/// \exception FE_DIVBYZERO if dividing finite value by 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half operator/(half x, half y)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::half2float<detail::internal_t>(x.data_)/detail::half2float<detail::internal_t>(y.data_)));	// divide in the configured arithmetic type, then round back to half
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = 14;	// magnitudes with the sign bit stripped; exp accumulates the result exponent
+		unsigned int sign = (x.data_^y.data_) & 0x8000;	// quotient sign is the XOR of the operand signs
+		if(absx >= 0x7C00 || absy >= 0x7C00)	// at least one operand is infinite or NaN
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) :
+										(absx==absy) ? detail::invalid() : (sign|((absx==0x7C00) ? 0x7C00 : 0)));	// inf/inf is invalid; inf/finite gives signed infinity, finite/inf signed zero
+		if(!absx)
+			return half(detail::binary, absy ? sign : detail::invalid());	// 0/nonzero is a signed zero, 0/0 is invalid
+		if(!absy)
+			return half(detail::binary, detail::pole(sign));	// nonzero finite value divided by zero
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize a subnormal dividend, compensating in exp
+		for(; absy<0x400; absy<<=1,++exp) ;	// normalize a subnormal divisor, compensating in exp
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400;	// 11-bit significands
+		int i = mx < my;	// 1 if the significand quotient is below one and needs an extra bit of shift
+		exp += (absx>>10) - (absy>>10) - i;
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -11)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		mx <<= 12 + i;
+		my <<= 1;
+		return half(detail::binary, detail::fixed2half<half::round_style,11,false,false,false>(mx/my, exp, sign, mx%my!=0));	// integer quotient; a nonzero remainder becomes the sticky flag
+	#endif
+	}
+
+	/// \}
+	/// \anchor streaming
+	/// \name Input and output
+	/// \{
+
+	/// Stream insertion.
+	///	The half is widened to a built-in floating-point type (`float`, or the type configured through HALF_ARITHMETIC_TYPE) and written with that type's own stream operator.
+	/// \param out output stream to write into
+	/// \param arg half expression to write
+	/// \return reference to output stream
+	template<typename charT,typename traits> std::basic_ostream<charT,traits>& operator<<(std::basic_ostream<charT,traits> &out, half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		typedef detail::internal_t widened_t;
+	#else
+		typedef float widened_t;
+	#endif
+		return out << detail::half2float<widened_t>(arg.data_);
+	}
+
+	/// Stream extraction.
+	///	The text is first parsed by the stream into a built-in floating-point value (double precision unless overridden with
+	/// [HALF_ARITHMETIC_TYPE](\ref HALF_ARITHMETIC_TYPE)) and that value is then rounded to half-precision using the library's
+	/// half-precision rounding mode. \a arg is only written when the extraction succeeded.
+	/// \param in input stream to read from
+	/// \param arg half to read into
+	/// \return reference to input stream
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename charT,typename traits> std::basic_istream<charT,traits>& operator>>(std::basic_istream<charT,traits> &in, half &arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		typedef detail::internal_t parsed_t;
+	#else
+		typedef double parsed_t;
+	#endif
+		parsed_t parsed;
+		if(!(in >> parsed).fail())
+			arg.data_ = detail::float2half<half::round_style>(parsed);
+		return in;
+	}
+
+	/// \}
+	/// \anchor basic
+	/// \name Basic mathematical operations
+	/// \{
+
+	/// Absolute value, obtained by clearing the sign bit (0x8000) of the bit pattern.
+	/// **See also:** Documentation for [std::fabs](https://en.cppreference.com/w/cpp/numeric/math/fabs).
+	/// \param arg operand
+	/// \return absolute value of \a arg
+	inline HALF_CONSTEXPR half fabs(half arg) { return half(detail::binary, arg.data_&0x7FFF); }
+
+	/// Absolute value, obtained by clearing the sign bit (0x8000) of the bit pattern.
+	/// **See also:** Documentation for [std::abs](https://en.cppreference.com/w/cpp/numeric/math/fabs).
+	/// \param arg operand
+	/// \return absolute value of \a arg
+	inline HALF_CONSTEXPR half abs(half arg) { return half(detail::binary, arg.data_&0x7FFF); }
+
+	/// Floating-point remainder that carries the sign of \a x.
+	/// **See also:** Documentation for [std::fmod](https://en.cppreference.com/w/cpp/numeric/math/fmod).
+	/// \param x dividend
+	/// \param y divisor
+	/// \return remainder of floating-point division; \a x itself if \a x is a zero or \a y is infinite
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half fmod(half x, half y)
+	{
+		unsigned int magx = x.data_ & 0x7FFF, magy = y.data_ & 0x7FFF;
+		if(magx > 0x7C00 || magy > 0x7C00)
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		if(magx == 0x7C00 || magy == 0)
+			return half(detail::binary, detail::invalid());
+		if(magy == 0x7C00 || magx == 0)
+			return x;
+		unsigned int signbit = x.data_ & 0x8000;
+		if(magx == magy)
+			return half(detail::binary, signbit);
+		return half(detail::binary, signbit|detail::mod<false,false>(magx, magy));
+	}
+
+	/// Floating-point remainder in the style of `std::remainder`.
+	/// **See also:** Documentation for [std::remainder](https://en.cppreference.com/w/cpp/numeric/math/remainder).
+	/// \param x dividend
+	/// \param y divisor
+	/// \return remainder of floating-point division; \a x itself if \a y is infinite
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half remainder(half x, half y)
+	{
+		unsigned int magx = x.data_ & 0x7FFF, magy = y.data_ & 0x7FFF;
+		if(magx > 0x7C00 || magy > 0x7C00)
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		if(magx == 0x7C00 || magy == 0)
+			return half(detail::binary, detail::invalid());
+		if(magy == 0x7C00)
+			return x;
+		unsigned int signbit = x.data_ & 0x8000;
+		return half(detail::binary, (magx==magy) ? signbit : (signbit^detail::mod<false,true>(magx, magy)));
+	}
+
+	/// Floating-point remainder that additionally reports part of the quotient.
+	/// **See also:** Documentation for [std::remquo](https://en.cppreference.com/w/cpp/numeric/math/remquo).
+	/// \param x dividend
+	/// \param y divisor
+	/// \param quo address to store some bits of quotient at (left untouched on the NaN and FE_INVALID paths)
+	/// \return remainder of floating-point division; \a x itself (with `*quo` set to 0) if \a y is infinite
+	/// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN
+	inline half remquo(half x, half y, int *quo)
+	{
+		unsigned int magx = x.data_ & 0x7FFF, magy = y.data_ & 0x7FFF, bits = x.data_ & 0x8000;
+		if(magx > 0x7C00 || magy > 0x7C00)
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		if(magx == 0x7C00 || magy == 0)
+			return half(detail::binary, detail::invalid());
+		if(magy == 0x7C00)
+			return *quo = 0, x;
+		int quotient = 1;
+		if(magx != magy)
+			bits ^= detail::mod<true, true>(magx, magy, &quotient);
+		return *quo = (((x.data_^y.data_)&0x8000) != 0) ? -quotient : quotient, half(detail::binary, bits);
+	}
+
+	/// Fused multiply add.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::fma](https://en.cppreference.com/w/cpp/numeric/math/fma).
+	/// \param x first operand
+	/// \param y second operand
+	/// \param z third operand
+	/// \return ( \a x * \a y ) + \a z rounded as one operation.
+	/// \exception FE_INVALID according to operator*() and operator+() unless any argument is a quiet NaN and no argument is a signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding the final addition
+	inline half fma(half x, half y, half z)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_), fz = detail::half2float<detail::internal_t>(z.data_);
+		#if HALF_ENABLE_CPP11_CMATH && FP_FAST_FMA
+			return half(detail::binary, detail::float2half<half::round_style>(std::fma(fx, fy, fz)));	// use the platform fma when it is flagged as fast
+		#else
+			return half(detail::binary, detail::float2half<half::round_style>(fx*fy+fz));	// separate multiply and add in the arithmetic type
+		#endif
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, exp = -15;	// magnitudes with the sign bit stripped; exp accumulates the result exponent
+		unsigned int sign = (x.data_^y.data_) & 0x8000;	// sign of the product x*y
+		bool sub = ((sign^z.data_)&0x8000) != 0;	// product and addend have opposite signs
+		if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00)	// at least one operand is infinite or NaN
+			return	(absx>0x7C00 || absy>0x7C00 || absz>0x7C00) ? half(detail::binary, detail::signal(x.data_, y.data_, z.data_)) :
+					(absx==0x7C00) ? half(detail::binary, (!absy || (sub && absz==0x7C00)) ? detail::invalid() : (sign|0x7C00)) :
+					(absy==0x7C00) ? half(detail::binary, (!absx || (sub && absz==0x7C00)) ? detail::invalid() : (sign|0x7C00)) : z;	// inf*0 and inf-inf are invalid; with only z infinite the result is z
+		if(!absx || !absy)	// product is a zero
+			return absz ? z : half(detail::binary, (half::round_style==std::round_toward_neg_infinity) ? (z.data_|sign) : (z.data_&sign));	// all zero: signs are OR-ed when rounding toward -inf, AND-ed otherwise
+		for(; absx<0x400; absx<<=1,--exp) ;	// normalize a subnormal x, compensating in exp
+		for(; absy<0x400; absy<<=1,--exp) ;	// normalize a subnormal y, compensating in exp
+		detail::uint32 m = static_cast<detail::uint32>((absx&0x3FF)|0x400) * static_cast<detail::uint32>((absy&0x3FF)|0x400);	// product of the two 11-bit significands
+		int i = static_cast<int>(m >> 21);	// 1 if the product reached bit 21
+		exp += (absx>>10) + (absy>>10) + i;
+		m <<= 3 - i;	// place the leading bit of the product at bit 23
+		if(absz)
+		{
+			int expz = 0;
+			for(; absz<0x400; absz<<=1,--expz) ;	// normalize a subnormal z
+			expz += absz >> 10;
+			detail::uint32 mz = static_cast<detail::uint32>((absz&0x3FF)|0x400) << 13;	// addend significand, leading bit at bit 23 as well
+			if(expz > exp || (expz == exp && mz > m))	// addend has the larger magnitude: swap so (m, exp) is the larger one
+			{
+				std::swap(m, mz);
+				std::swap(exp, expz);
+				if(sub)
+					sign = z.data_ & 0x8000;	// the result then takes the sign of z
+			}
+			int d = exp - expz;
+			mz = (d<23) ? ((mz>>d)|((mz&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1;	// align the smaller term, folding shifted-out bits into a sticky bit
+			if(sub)
+			{
+				m = m - mz;
+				if(!m)	// exact cancellation
+					return half(detail::binary, static_cast<unsigned>(half::round_style==std::round_toward_neg_infinity)<<15);	// zero, negative only when rounding toward -inf
+				for(; m<0x800000; m<<=1,--exp) ;	// renormalize after cancellation
+			}
+			else
+			{
+				m += mz;
+				i = static_cast<int>(m >> 24);	// 1 if the sum carried past bit 23
+				m = (m>>i) | (m&i);	// shift down on carry, keeping the dropped bit sticky
+				exp += i;
+			}
+		}
+		if(exp > 30)
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -10)
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,23,false,false,false>(m, exp-1, sign));
+	#endif
+	}
+
+	/// Maximum of half expressions. \a y is chosen when it is not NaN and \a x is either NaN or maps to a smaller integer key.
+	/// **See also:** Documentation for [std::fmax](https://en.cppreference.com/w/cpp/numeric/math/fmax).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return maximum of operands, ignoring quiet NaNs
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	inline HALF_CONSTEXPR_NOERR half fmax(half x, half y)
+	{
+		return half(detail::binary, (!isnan(y) && (isnan(x) || (x.data_^(0x8000|(0x8000-(x.data_>>15)))) < 
+			(y.data_^(0x8000|(0x8000-(y.data_>>15)))))) ? detail::select(y.data_, x.data_) : detail::select(x.data_, y.data_));	// the chosen operand goes first into detail::select(); unlike the comparison operators, these keys order -0 below +0
+	}
+
+	/// Minimum of half expressions. \a y is chosen when it is not NaN and \a x is either NaN or maps to a larger integer key.
+	/// **See also:** Documentation for [std::fmin](https://en.cppreference.com/w/cpp/numeric/math/fmin).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return minimum of operands, ignoring quiet NaNs
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	inline HALF_CONSTEXPR_NOERR half fmin(half x, half y)
+	{
+		return half(detail::binary, (!isnan(y) && (isnan(x) || (x.data_^(0x8000|(0x8000-(x.data_>>15)))) >
+			(y.data_^(0x8000|(0x8000-(y.data_>>15)))))) ? detail::select(y.data_, x.data_) : detail::select(x.data_, y.data_));	// the chosen operand goes first into detail::select(); unlike the comparison operators, these keys order -0 below +0
+	}
+
+	/// Positive difference; exact to rounding for all rounding modes.
+	/// **See also:** Documentation for [std::fdim](https://en.cppreference.com/w/cpp/numeric/math/fdim).
+	/// \param x first operand
+	/// \param y second operand
+	/// \return \a x - \a y if \a x orders above \a y, positive zero otherwise; NaN operands are routed through `detail::signal()`
+	/// \exception FE_... according to operator-(half,half)
+	inline half fdim(half x, half y)
+	{
+		if(isnan(x) || isnan(y))
+			return half(detail::binary, detail::signal(x.data_, y.data_));
+		if((x.data_^(0x8000|(0x8000-(x.data_>>15)))) > (y.data_^(0x8000|(0x8000-(y.data_>>15)))))
+			return x - y;
+		return half(detail::binary, 0);
+	}
+
+	/// Build a NaN whose low byte is derived from a string: starting from 0x7FFF, every character's low 8 bits are XOR-ed in.
+	/// **See also:** Documentation for [std::nan](https://en.cppreference.com/w/cpp/numeric/math/nan).
+	/// \param arg string code (must be null-terminated)
+	/// \return quiet NaN
+	inline half nanh(const char *arg)
+	{
+		unsigned int bits = 0x7FFF;
+		for(const char *cursor=arg; *cursor; ++cursor)
+			bits ^= static_cast<unsigned>(*cursor) & 0xFF;
+		return half(detail::binary, bits);
+	}
+
+	/// \}
+	/// \anchor exponential
+	/// \name Exponential functions
+	/// \{
+
+	/// Exponential function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::exp](https://en.cppreference.com/w/cpp/numeric/math/exp).
+	/// \param arg function argument
+	/// \return e raised to \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half exp(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::exp(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp;	// abs: magnitude bits; e: exponent field (subnormals counted as 1)
+		if(!abs)
+			return half(detail::binary, 0x3C00);	// exp(+-0) is exactly 1
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));	// +inf stays +inf, -inf gives 0; NaN goes through detail::signal()
+		if(abs >= 0x4C80)	// magnitude threshold from which the result is reported as overflow or underflow
+			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
+		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);	// scale the significand by 0xB8AA3B29, which is log2(e) in fixed point
+		if(e < 14)
+		{
+			exp = 0;
+			m >>= 14 - e;
+		}
+		else
+		{
+			exp = static_cast<int>(m >> (45-e));	// NOTE(review): presumably the integer part of arg*log2(e) - confirm against detail::exp2_post
+			m = (m<<(e-14)) & 0x7FFFFFFF;	// remaining low 31 bits
+		}
+		return half(detail::binary, detail::exp2_post<half::round_style>(m, exp, (arg.data_&0x8000)!=0, 0, 26));	// third argument flags a negative input
+	#endif
+	}
+
+	/// Binary exponential.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::exp2](https://en.cppreference.com/w/cpp/numeric/math/exp2).
+	/// \param arg function argument
+	/// \return 2 raised to \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half exp2(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::exp2(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		int abs = arg.data_ & 0x7FFF, e = (abs>>10) + (abs<=0x3FF), exp = (abs&0x3FF) + ((abs>0x3FF)<<10);	// abs: magnitude bits; e: exponent field (subnormals counted as 1); exp: starts out as the significand
+		if(!abs)
+			return half(detail::binary, 0x3C00);	// exp2(+-0) is exactly 1
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00&((arg.data_>>15)-1U)) : detail::signal(arg.data_));	// +inf stays +inf, -inf gives 0; NaN goes through detail::signal()
+		if(abs >= 0x4E40)	// magnitude threshold from which the result is reported as overflow or underflow
+			return half(detail::binary, (arg.data_&0x8000) ? detail::underflow<half::round_style>() : detail::overflow<half::round_style>());
+		return half(detail::binary, detail::exp2_post<half::round_style>(
+			(static_cast<detail::uint32>(exp)<<(6+e))&0x7FFFFFFF, exp>>(25-e), (arg.data_&0x8000)!=0, 0, 28));	// low 31 bits and high part of the shifted significand, plus a flag for negative input
+	#endif
+	}
+
+	/// Exponential minus one.
+	/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest`
+	/// and in <1% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::expm1](https://en.cppreference.com/w/cpp/numeric/math/expm1).
+	/// \param arg function argument
+	/// \return e raised to \a arg and subtracted by 1
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half expm1(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::expm1(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000, e = (abs>>10) + (abs<=0x3FF), exp;	// abs: magnitude bits; e: exponent field (subnormals counted as 1)
+		if(!abs)
+			return arg;	// zero is returned unchanged, sign included
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? (0x7C00+(sign>>1)) : detail::signal(arg.data_));	// +inf stays +inf, -inf gives 0xBC00 (-1); NaN goes through detail::signal()
+		if(abs >= 0x4A00)	// large magnitude: negative inputs round from 0xBBFF with guard and sticky set, positive inputs overflow
+			return half(detail::binary, (arg.data_&0x8000) ? detail::rounded<half::round_style,true>(0xBBFF, 1, 1) : detail::overflow<half::round_style>());
+		detail::uint32 m = detail::multiply64(static_cast<detail::uint32>((abs&0x3FF)+((abs>0x3FF)<<10))<<21, 0xB8AA3B29);	// scale the significand by 0xB8AA3B29, which is log2(e) in fixed point
+		if(e < 14)
+		{
+			exp = 0;
+			m >>= 14 - e;
+		}
+		else
+		{
+			exp = static_cast<unsigned int>(m >> (45-e));	// NOTE(review): presumably the integer part of |arg|*log2(e) - confirm
+			m = (m<<(e-14)) & 0x7FFFFFFF;	// remaining low 31 bits
+		}
+		m = detail::exp2(m);
+		if(sign)	// negative argument
+		{
+			int s = 0;
+			if(m > 0x80000000)
+			{
+				++exp;
+				m = detail::divide64(0x80000000, m, s);	// s is handed to divide64 and OR-ed into the low bit below
+			}
+			m = 0x80000000 - ((m>>exp)|((m&((static_cast<detail::uint32>(1)<<exp)-1))!=0)|s);	// subtract the shifted value from 0x80000000, keeping lost bits sticky
+			exp = 0;
+		}
+		else
+			m -= (exp<31) ? (0x80000000>>exp) : 1;	// NOTE(review): presumably removes the "minus one" term aligned to exp - confirm
+		for(exp+=14; m<0x80000000 && exp; m<<=1,--exp) ;	// normalize so the top bit of m is set, unless exp runs out first
+		if(exp > 29)
+			return half(detail::binary, detail::overflow<half::round_style>());
+		return half(detail::binary, detail::rounded<half::round_style,true>(sign+(exp<<10)+static_cast<unsigned int>(m>>21), static_cast<int>((m>>20)&1), (m&0xFFFFF)!=0));	// bit 20 is passed as guard bit, the bits below as sticky flag
+	#endif
+	}
+
+	/// Natural logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log](https://en.cppreference.com/w/cpp/numeric/math/log).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base e
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::log(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp accumulates the binary exponent
+		if(!abs)
+			return half(detail::binary, detail::pole(0x8000));	// log of a zero: pole with the sign bit set
+		if(arg.data_ & 0x8000)
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// negative numbers and -inf are invalid; negative NaN goes through detail::signal()
+		if(abs >= 0x7C00)
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));	// +inf is returned unchanged
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument, compensating in exp
+		exp += abs >> 10;
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(
+			detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 27)+8, exp, 17));	// NOTE(review): 0xB8AA3B2A looks like log2(e) in fixed point, used to convert from base 2 - confirm in detail::log2_post
+	#endif
+	}
+
+	/// Common logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log10](https://en.cppreference.com/w/cpp/numeric/math/log10).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base 10
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log10(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::log10(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp accumulates the binary exponent
+		if(!abs)
+			return half(detail::binary, detail::pole(0x8000));	// log of a zero: pole with the sign bit set
+		if(arg.data_ & 0x8000)
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// negative numbers and -inf are invalid; negative NaN goes through detail::signal()
+		if(abs >= 0x7C00)
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));	// +inf is returned unchanged
+		switch(abs)	// exact powers of ten get exact integer results
+		{
+			case 0x4900: return half(detail::binary, 0x3C00);	// log10(10) = 1
+			case 0x5640: return half(detail::binary, 0x4000);	// log10(100) = 2
+			case 0x63D0: return half(detail::binary, 0x4200);	// log10(1000) = 3
+			case 0x70E2: return half(detail::binary, 0x4400);	// log10(10000) = 4
+		}
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument, compensating in exp
+		exp += abs >> 10;
+		return half(detail::binary, detail::log2_post<half::round_style,0xD49A784C>(
+			detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 27)+8, exp, 16));	// NOTE(review): 0xD49A784C looks like log2(10) in fixed point, used to convert from base 2 - confirm in detail::log2_post
+	#endif
+	}
+
+	/// Binary logarithm.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::log2](https://en.cppreference.com/w/cpp/numeric/math/log2).
+	/// \param arg function argument
+	/// \return logarithm of \a arg to base 2
+	/// \exception FE_INVALID for signaling NaN or negative argument
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log2(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::log2(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15, s = 0;	// abs: magnitude bits; exp accumulates the binary exponent; s collects sticky bits
+		if(!abs)
+			return half(detail::binary, detail::pole(0x8000));	// log of a zero: pole with the sign bit set
+		if(arg.data_ & 0x8000)
+			return half(detail::binary, (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));	// negative numbers and -inf are invalid; negative NaN goes through detail::signal()
+		if(abs >= 0x7C00)
+			return (abs==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));	// +inf is returned unchanged
+		if(abs == 0x3C00)
+			return half(detail::binary, 0);	// log2(1) is +0
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument, compensating in exp
+		exp += (abs>>10);
+		if(!(abs&0x3FF))	// zero fraction bits: the argument is a power of two, so the result is the integer exp
+		{
+			unsigned int value = static_cast<unsigned>(exp<0) << 15, m = std::abs(exp) << 6;	// value: sign bit of the result; m: |exp| pre-shifted for normalization
+			for(exp=18; m<0x400; m<<=1,--exp) ;	// normalize the integer into a half significand
+			return half(detail::binary, value+(exp<<10)+m);
+		}
+		detail::uint32 ilog = exp, sign = detail::sign_mask(ilog), m = 
+			(((ilog<<27)+(detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 28)>>4))^sign) - sign;	// integer exponent plus log2 of the significand; the XOR/subtract with sign presumably yields the absolute value
+		if(!m)
+			return half(detail::binary, 0);
+		for(exp=14; m<0x8000000 && exp; m<<=1,--exp) ;	// normalize upward until bit 27 is set, unless exp runs out first
+		for(; m>0xFFFFFFF; m>>=1,++exp)	// normalize downward while m exceeds 28 bits
+			s |= m & 1;	// remember bits shifted out
+		return half(detail::binary, detail::fixed2half<half::round_style,27,false,false,true>(m, exp, sign&0x8000, s));
+	#endif
+	}
+
+	/// Natural logarithm plus one.
+	/// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest`
+	/// and in ~1% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::log1p](https://en.cppreference.com/w/cpp/numeric/math/log1p).
+	/// \param arg function argument
+	/// \return logarithm of \a arg plus 1 to base e
+	/// \exception FE_INVALID for signaling NaN or argument <-1
+	/// \exception FE_DIVBYZERO for -1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half log1p(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::log1p(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		if(arg.data_ >= 0xBC00)	// bit patterns from -1 (0xBC00) upward: -1 is a pole, values below -1 and -inf are invalid, negative NaN goes through detail::signal()
+			return half(detail::binary, (arg.data_==0xBC00) ? detail::pole(0x8000) : (arg.data_<=0xFC00) ? detail::invalid() : detail::signal(arg.data_));
+		int abs = arg.data_ & 0x7FFF, exp = -15;	// abs: magnitude bits; exp accumulates the binary exponent
+		if(!abs || abs >= 0x7C00)
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// zeros and +inf are returned unchanged
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument, compensating in exp
+		exp += abs >> 10;
+		detail::uint32 m = static_cast<detail::uint32>((abs&0x3FF)|0x400) << 20;	// significand with its leading bit at bit 30
+		if(arg.data_ & 0x8000)	// negative argument (magnitude below 1 at this point)
+		{
+			m = 0x40000000 - (m>>-exp);	// 1 - |arg| with 1 represented as 0x40000000
+			for(exp=0; m<0x40000000; m<<=1,--exp) ;	// renormalize
+		}
+		else
+		{
+			if(exp < 0)
+			{
+				m = 0x40000000 + (m>>-exp);	// 1 + arg for arg below 1
+				exp = 0;
+			}
+			else
+			{
+				m += 0x40000000 >> exp;	// arg + 1 with the 1 aligned to the argument's exponent
+				int i = static_cast<int>(m >> 31);	// 1 if the sum carried into bit 31
+				m >>= i;
+				exp += i;
+			}
+		}
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2(m), exp, 17));	// NOTE(review): 0xB8AA3B2A looks like log2(e) in fixed point - confirm in detail::log2_post
+	#endif
+	}
+
+	/// \}
+	/// \anchor power
+	/// \name Power functions
+	/// \{
+
+	/// Square root.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sqrt](https://en.cppreference.com/w/cpp/numeric/math/sqrt).
+	/// \param arg function argument
+	/// \return square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half sqrt(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 15;	// abs: magnitude bits; exp accumulates the exponent
+		if(!abs || arg.data_ >= 0x7C00)	// zero, +inf, NaN or any negative value
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ? detail::invalid() : arg.data_);	// NaN goes through detail::signal(), nonzero negatives are invalid, +-0 and +inf pass through
+		for(; abs<0x400; abs<<=1,--exp) ;	// normalize a subnormal argument, compensating in exp
+		detail::uint32 r = static_cast<detail::uint32>((abs&0x3FF)|0x400) << 10, m = detail::sqrt<20>(r, exp+=abs>>10);	// NOTE(review): r and exp are presumably updated by detail::sqrt (r read afterwards as a remainder) - confirm
+		return half(detail::binary, detail::rounded<half::round_style,false>((exp<<10)+(m&0x3FF), r>m, r!=0));	// r>m is passed as guard bit, r!=0 as sticky flag
+	#endif
+	}
+
+	/// Inverse square root.
+	/// This function is exact to rounding for all rounding modes and thus generally more accurate than directly computing
+	/// 1 / sqrt(\a arg) in half-precision, in addition to also being faster.
+	/// \param arg function argument
+	/// \return reciprocal of square root of \a arg
+	/// \exception FE_INVALID for signaling NaN and negative arguments
+	/// \exception FE_INEXACT according to rounding
+	inline half rsqrt(half arg)
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(detail::internal_t(1)/std::sqrt(detail::half2float<detail::internal_t>(arg.data_))));	// evaluate in the configured arithmetic type, then round back to half
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, bias = 0x4000;	// abs: magnitude bits; bias is added to abs after normalization
+		if(!abs || arg.data_ >= 0x7C00)	// zero, +inf, NaN or any negative value
+			return half(detail::binary,	(abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_>0x8000) ?
+										detail::invalid() : !abs ? detail::pole(arg.data_&0x8000) : 0);	// NaN: signal; nonzero negative: invalid; +-0: pole with the zero's sign; +inf: +0
+		for(; abs<0x400; abs<<=1,bias-=0x400) ;	// normalize a subnormal argument, compensating in bias
+		unsigned int frac = (abs+=bias) & 0x7FF;	// lowest exponent bit together with the fraction bits
+		if(frac == 0x400)
+			return half(detail::binary, 0x7A00-(abs>>1));	// result is formed directly from the bit pattern, without iteration
+		if((half::round_style == std::round_to_nearest && (frac == 0x3FE || frac == 0x76C)) ||
+		   (half::round_style != std::round_to_nearest && (frac == 0x15A || frac == 0x3FC || frac == 0x401 || frac == 0x402 || frac == 0x67B)))
+			return pow(arg, half(detail::binary, 0xB800));	// listed bit patterns are delegated to pow() with exponent 0xB800 (-0.5)
+		detail::uint32 f = 0x17376 - abs, mx = (abs&0x3FF) | 0x400, my = ((f>>1)&0x3FF) | 0x400, mz = my * my;	// NOTE(review): f looks like a magic-constant initial estimate - confirm
+		int expy = static_cast<int>(f>>11) - 31, expx = 32 - (abs>>10), i = static_cast<int>(mz >> 21);
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;	// NOTE(review): looks like the first of two Newton-Raphson style refinement steps - confirm
+		i = static_cast<int>((my*=mz>>10) >> 31);
+		expy += i;
+		my = (my>>(20+i)) + 1;
+		i = static_cast<int>((mz=my*my) >> 21);
+		for(mz=0x60000000-(((mz>>i)*mx)>>(expx-2*expy-i)); mz<0x40000000; mz<<=1,--expy) ;	// same form as the step above, applied to the refined estimate
+		i = static_cast<int>((my*=(mz>>10)+1) >> 31);
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,false,true>(my>>i, expy+i+14));
+	#endif
+	}
+
+	/// Cubic root.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cbrt](https://en.cppreference.com/w/cpp/numeric/math/cbrt).
+	/// \param arg function argument
+	/// \return cubic root of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT according to rounding
+	inline half cbrt(half arg) // uses std::cbrt only when HALF_ARITHMETIC_TYPE is defined and HALF_ENABLE_CPP11_CMATH is non-zero
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::cbrt(detail::half2float<detail::internal_t>(arg.data_)))); // std::cbrt in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = -15; // abs = raw bits without bit 15
+		if(!abs || abs == 0x3C00 || abs >= 0x7C00) // zero, magnitude bits 0x3C00, or abs >= 0x7C00
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; // abs > 0x7C00 goes through signal(); all other cases return arg unchanged
+		for(; abs<0x400; abs<<=1, --exp); // shift until bit 10 is set, decrementing exp per shift
+		detail::uint32 ilog = exp + (abs>>10), sign = detail::sign_mask(ilog), f, m = // (v^sign)-sign below — presumably a conditional negation if sign_mask returns all-ones for negative ilog; confirm
+			(((ilog<<27)+(detail::log2(static_cast<detail::uint32>((abs&0x3FF)|0x400)<<20, 24)>>4))^sign) - sign;
+		for(exp=2; m<0x80000000; m<<=1,--exp) ; // exp restarts at 2; shift m up until bit 31 is set
+		m = detail::multiply64(m, 0xAAAAAAAB); // 0xAAAAAAAB is about 2/3 * 2^32 — NOTE(review): presumably part of dividing the logarithm by 3; confirm against detail::multiply64
+		int i = static_cast<int>(m >> 31), s; // s is only used as the third argument of divide64 below
+		exp += i;
+		m <<= 1 - i; // shift left by one unless bit 31 was already set
+		if(exp < 0)
+		{
+			f = m >> -exp;
+			exp = 0;
+		}
+		else
+		{
+			f = (m<<exp) & 0x7FFFFFFF; // low 31 bits after the shift go to f ...
+			exp = static_cast<int>(m >> (31-exp)); // ... the bits above them become the new exp
+		}
+		m = detail::exp2(f, (half::round_style==std::round_to_nearest) ? 29 : 26); // second argument of exp2 depends on the rounding style
+		if(sign) // only when sign_mask(ilog) was non-zero
+		{
+			if(m > 0x80000000)
+			{
+				m = detail::divide64(0x80000000, m, s); // replace m by 0x80000000 divided by m
+				++exp;
+			}
+			exp = -exp;
+		}
+		return half(detail::binary, (half::round_style==std::round_to_nearest) ? // both variants pass the input's bit 15 (arg.data_&0x8000) on to fixed2half
+			detail::fixed2half<half::round_style,31,false,false,false>(m, exp+14, arg.data_&0x8000) :
+			detail::fixed2half<half::round_style,23,false,false,false>((m+0x80)>>8, exp+14, arg.data_&0x8000)); // non-nearest modes: m is reduced to (m+0x80)>>8 first
+	#endif
+	}
+
+	/// Hypotenuse function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot).
+	/// \param x first argument
+	/// \param y second argument
+	/// \return square root of sum of squares without internal over- or underflows
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root
+	inline half hypot(half x, half y) // two-argument overload
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_); // both operands converted to detail::internal_t
+		#if HALF_ENABLE_CPP11_CMATH
+			return half(detail::binary, detail::float2half<half::round_style>(std::hypot(fx, fy)));
+		#else
+			return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(fx*fx+fy*fy))); // used when HALF_ENABLE_CPP11_CMATH is zero: plain sqrt of the sum of squares
+		#endif
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, expx = 0, expy = 0; // absx/absy = raw bits without bit 15
+		if(absx >= 0x7C00 || absy >= 0x7C00)
+			return half(detail::binary,	(absx==0x7C00) ? detail::select(0x7C00, y.data_) : // a magnitude of exactly 0x7C00 wins: select(0x7C00, other operand)
+				(absy==0x7C00) ? detail::select(0x7C00, x.data_) : detail::signal(x.data_, y.data_)); // otherwise signal() on both operands
+		if(!absx) // x is zero
+			return half(detail::binary, absy ? detail::check_underflow(absy) : 0); // |y| passed through check_underflow, or bits 0 when y is zero too
+		if(!absy) // y is zero
+			return half(detail::binary, detail::check_underflow(absx));
+		if(absy > absx)
+			std::swap(absx, absy); // from here on absx >= absy
+		for(; absx<0x400; absx<<=1,--expx) ; // shift until bit 10 is set, counting shifts negatively in expx / expy
+		for(; absy<0x400; absy<<=1,--expy) ;
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400; // low 10 bits with bit 10 forced on
+		mx *= mx; // square both 11-bit values (results fit in 22 bits)
+		my *= my;
+		int ix = static_cast<int>(mx >> 21), iy = static_cast<int>(my >> 21); // bit 21 of each square (0 or 1)
+		expx = 2*(expx+(absx>>10)) - 15 + ix;
+		expy = 2*(expy+(absy>>10)) - 15 + iy;
+		mx <<= 10 - ix; // shift by 10, or by 9 when bit 21 was set
+		my <<= 10 - iy;
+		int d = expx - expy;
+		my = (d<30) ? ((my>>d)|((my&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1; // shift my right by d, OR-ing 1 into the low bit if any shifted-out bit was set; for d >= 30 my becomes 1
+		return half(detail::binary, detail::hypot_post<half::round_style>(mx+my, expx)); // sum and exponent handed to detail::hypot_post
+	#endif
+	}
+
+	/// Hypotenuse function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot).
+	/// \param x first argument
+	/// \param y second argument
+	/// \param z third argument
+	/// \return square root of sum of squares without internal over- or underflows
+	/// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root
+	inline half hypot(half x, half y, half z) // three-argument overload
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t fx = detail::half2float<detail::internal_t>(x.data_), fy = detail::half2float<detail::internal_t>(y.data_), fz = detail::half2float<detail::internal_t>(z.data_); // all operands converted to detail::internal_t
+		return half(detail::binary, detail::float2half<half::round_style>(std::sqrt(fx*fx+fy*fy+fz*fz))); // always sqrt of the sum of squares on this path (no std::hypot call)
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, expx = 0, expy = 0, expz = 0; // abs* = raw bits without bit 15
+		if(!absx) // any zero operand: delegate to the two-argument overload with the other two
+			return hypot(y, z);
+		if(!absy)
+			return hypot(x, z);
+		if(!absz)
+			return hypot(x, y);
+		if(absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00)
+			return half(detail::binary,	(absx==0x7C00) ? detail::select(0x7C00, detail::select(y.data_, z.data_)) : // a magnitude of exactly 0x7C00: select(0x7C00, select(the other two))
+										(absy==0x7C00) ? detail::select(0x7C00, detail::select(x.data_, z.data_)) :
+										(absz==0x7C00) ? detail::select(0x7C00, detail::select(x.data_, y.data_)) :
+										detail::signal(x.data_, y.data_, z.data_)); // otherwise signal() on all three operands
+		if(absz > absy) // three conditional swaps leave absx >= absy >= absz
+			std::swap(absy, absz);
+		if(absy > absx)
+			std::swap(absx, absy);
+		if(absz > absy)
+			std::swap(absy, absz);
+		for(; absx<0x400; absx<<=1,--expx) ; // shift each until bit 10 is set, counting shifts negatively in exp*
+		for(; absy<0x400; absy<<=1,--expy) ;
+		for(; absz<0x400; absz<<=1,--expz) ;
+		detail::uint32 mx = (absx&0x3FF) | 0x400, my = (absy&0x3FF) | 0x400, mz = (absz&0x3FF) | 0x400; // low 10 bits with bit 10 forced on
+		mx *= mx; // square the 11-bit values (results fit in 22 bits)
+		my *= my;
+		mz *= mz;
+		int ix = static_cast<int>(mx >> 21), iy = static_cast<int>(my >> 21), iz = static_cast<int>(mz >> 21); // bit 21 of each square (0 or 1)
+		expx = 2*(expx+(absx>>10)) - 15 + ix;
+		expy = 2*(expy+(absy>>10)) - 15 + iy;
+		expz = 2*(expz+(absz>>10)) - 15 + iz;
+		mx <<= 10 - ix; // shift by 10, or by 9 when bit 21 was set
+		my <<= 10 - iy;
+		mz <<= 10 - iz;
+		int d = expy - expz;
+		mz = (d<30) ? ((mz>>d)|((mz&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1; // shift mz right by d with shifted-out bits OR-ed into the low bit; d >= 30 collapses mz to 1
+		my += mz; // combine the two smaller squares first
+		if(my & 0x80000000) // the sum reached bit 31
+		{
+			my = (my>>1) | (my&1); // halve it, keeping the dropped low bit OR-ed in
+			if(++expy > expx) // the combined term now has the larger exponent: exchange roles with x
+			{
+				std::swap(mx, my);
+				std::swap(expx, expy);
+			}
+		}
+		d = expx - expy;
+		my = (d<30) ? ((my>>d)|((my&((static_cast<detail::uint32>(1)<<d)-1))!=0)) : 1; // same shift-with-sticky-bit step as above, now aligning my to mx
+		return half(detail::binary, detail::hypot_post<half::round_style>(mx+my, expx)); // sum and exponent handed to detail::hypot_post
+	#endif
+	}
+
+	/// Power function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.00025% of inputs.
+	///
+	/// **See also:** Documentation for [std::pow](https://en.cppreference.com/w/cpp/numeric/math/pow).
+	/// \param x base
+	/// \param y exponent
+	/// \return \a x raised to \a y
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN or if \a x is finite and negative and \a y is finite and not integral
+	/// \exception FE_DIVBYZERO if \a x is 0 and \a y is negative
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half pow(half x, half y) // native std::pow path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::pow(detail::half2float<detail::internal_t>(x.data_), detail::half2float<detail::internal_t>(y.data_)))); // std::pow in detail::internal_t, converted back using half::round_style
+	#else
+		int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -15; // absx/absy = raw bits without bit 15
+		if(!absy || x.data_ == 0x3C00) // y is zero, or x has exactly bits 0x3C00
+			return half(detail::binary, detail::select(0x3C00, (x.data_==0x3C00) ? y.data_ : x.data_)); // select(0x3C00, the other operand)
+		bool is_int = absy >= 0x6400 || (absy>=0x3C00 && !(absy&((1<<(25-(absy>>10)))-1))); // true when absy >= 0x6400, or absy >= 0x3C00 with every bit below position 25-(absy>>10) clear
+		unsigned int sign = x.data_ & (static_cast<unsigned>((absy<0x6800)&&is_int&&((absy>>(25-(absy>>10)))&1))<<15); // x's bit 15, kept only when absy < 0x6800, is_int holds and bit 25-(absy>>10) of absy is set
+		if(absx >= 0x7C00 || absy >= 0x7C00)
+			return half(detail::binary,	(absx>0x7C00 || absy>0x7C00) ? detail::signal(x.data_, y.data_) : // any magnitude above 0x7C00 goes to signal()
+										(absy==0x7C00) ? ((absx==0x3C00) ? 0x3C00 : (!absx && y.data_==0xFC00) ? detail::pole() : // absy == 0x7C00: absx == 0x3C00 gives 0x3C00; zero x with y == 0xFC00 gives pole()
+										(0x7C00&-((y.data_>>15)^(absx>0x3C00)))) : (sign|(0x7C00&((y.data_>>15)-1U)))); // else 0x7C00 or 0 depending on y's bit 15 and absx > 0x3C00; absx == 0x7C00: sign | (0x7C00 if y's bit 15 is clear, else 0)
+		if(!absx) // x is zero
+			return half(detail::binary, (y.data_&0x8000) ? detail::pole(sign) : sign); // y with bit 15 set goes to pole(sign); otherwise the result is just the sign bits
+		if((x.data_&0x8000) && !is_int) // x has bit 15 set and is_int is false
+			return half(detail::binary, detail::invalid());
+		if(x.data_ == 0xBC00)
+			return half(detail::binary, sign|0x3C00); // x == 0xBC00: result is 0x3C00 with the computed sign
+		switch(y.data_) // exponents with dedicated shortcuts
+		{
+			case 0x3800: return sqrt(x);
+			case 0x3C00: return half(detail::binary, detail::check_underflow(x.data_)); // x itself, passed through check_underflow
+			case 0x4000: return x * x;
+			case 0xBC00: return half(detail::binary, 0x3C00) / x; // half with bits 0x3C00 divided by x
+		}
+		for(; absx<0x400; absx<<=1,--exp) ; // shift absx until bit 10 is set, decrementing exp per shift
+		detail::uint32 ilog = exp + (absx>>10), msign = detail::sign_mask(ilog), f, m = // (v^msign)-msign below — presumably a conditional negation if sign_mask returns all-ones for negative ilog; confirm
+			(((ilog<<27)+((detail::log2(static_cast<detail::uint32>((absx&0x3FF)|0x400)<<20)+8)>>4))^msign) - msign;
+		for(exp=-11; m<0x80000000; m<<=1,--exp) ; // exp restarts at -11; shift m up until bit 31 is set
+		for(; absy<0x400; absy<<=1,--exp) ; // same normalization for absy, charged to the same exp
+		m = detail::multiply64(m, static_cast<detail::uint32>((absy&0x3FF)|0x400)<<21); // multiply by absy's low 10 bits with bit 10 forced on, shifted to the top of the word
+		int i = static_cast<int>(m >> 31);
+		exp += (absy>>10) + i;
+		m <<= 1 - i; // shift left by one unless bit 31 was already set
+		if(exp < 0)
+		{
+			f = m >> -exp;
+			exp = 0;
+		}
+		else
+		{
+			f = (m<<exp) & 0x7FFFFFFF; // low 31 bits after the shift go to f ...
+			exp = static_cast<int>(m >> (31-exp)); // ... the bits above them become the new exp
+		}
+		return half(detail::binary, detail::exp2_post<half::round_style>(f, exp, ((msign&1)^(y.data_>>15))!=0, sign)); // third argument: low bit of msign XOR y's bit 15; final bits come from detail::exp2_post
+	#endif
+	}
+
+	/// \}
+	/// \anchor trigonometric
+	/// \name Trigonometric functions
+	/// \{
+
+	/// Compute sine and cosine simultaneously.
+	///	This returns the same results as sin() and cos() but is faster than calling each function individually.
+	///
+	/// This function is exact to rounding for all rounding modes.
+	/// \param arg function argument
+	/// \param sin variable to take sine of \a arg
+	/// \param cos variable to take cosine of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline void sincos(half arg, half *sin, half *cos) // writes through both pointers unconditionally; no null check is performed here
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		detail::internal_t f = detail::half2float<detail::internal_t>(arg.data_); // convert once, then call std::sin and std::cos separately
+		*sin = half(detail::binary, detail::float2half<half::round_style>(std::sin(f)));
+		*cos = half(detail::binary, detail::float2half<half::round_style>(std::cos(f)));
+	#else
+		int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15, k; // abs = raw bits without bit 15; sign = bit 15; k is filled in by detail::angle_arg
+		if(abs >= 0x7C00)
+			*sin = *cos = half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_)); // abs == 0x7C00 goes to invalid(), larger goes to signal(); both outputs get the same value
+		else if(!abs) // zero argument
+		{
+			*sin = arg; // sine output is the argument itself
+			*cos = half(detail::binary, 0x3C00); // cosine output has bits 0x3C00
+		}
+		else if(abs < 0x2500) // small-argument shortcut, no series evaluation
+		{
+			*sin = half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1)); // arg's bits minus 1, passed to detail::rounded with flags (1, 1)
+			*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1)); // 0x3BFF (one below 0x3C00), passed to detail::rounded with flags (1, 1)
+		}
+		else
+		{
+			if(half::round_style != std::round_to_nearest) // hard-coded results for four specific magnitudes, only in non-nearest rounding modes
+			{
+				switch(abs)
+				{
+				case 0x48B7:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x1D07, 1, 1)); // sign bit inverted relative to arg
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0xBBFF, 1, 1));
+					return;
+				case 0x598C:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1)); // sign bit taken from arg
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x80FC, 1, 1));
+					return;
+				case 0x6A64:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x3BFE, 1, 1)); // sign bit inverted relative to arg
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x27FF, 1, 1));
+					return;
+				case 0x6D8C:
+					*sin = half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x0FE6, 1, 1)); // sign bit taken from arg
+					*cos = half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1));
+					return;
+				}
+			}
+			std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 28); // sc.first is used as the sine, sc.second as the cosine below
+			switch(k & 3) // low two bits of k choose how the pair is swapped/negated; k&3 == 0 leaves it untouched
+			{
+				case 1: sc = std::make_pair(sc.second, -sc.first); break;
+				case 2: sc = std::make_pair(-sc.first, -sc.second); break;
+				case 3: sc = std::make_pair(-sc.second, sc.first); break;
+			}
+			*sin = half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((sc.first^-static_cast<detail::uint32>(sign))+sign)); // (v ^ -sign) + sign negates v when arg's bit 15 is set
+			*cos = half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>(sc.second)); // cosine ignores arg's sign
+		}
+	#endif
+	}
+
+	/// Sine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sin](https://en.cppreference.com/w/cpp/numeric/math/sin).
+	/// \param arg function argument
+	/// \return sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half sin(half arg) // native std::sin path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::sin(detail::half2float<detail::internal_t>(arg.data_)))); // std::sin in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, k; // abs = raw bits without bit 15; k is filled in by detail::angle_arg
+		if(!abs) // zero is returned unchanged (sign bit included)
+			return arg;
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_)); // abs == 0x7C00 goes to invalid(), larger goes to signal()
+		if(abs < 0x2900) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1)); // arg's bits minus 1, passed to detail::rounded with flags (1, 1)
+		if(half::round_style != std::round_to_nearest) // hard-coded results for three magnitudes, only in non-nearest rounding modes
+			switch(abs)
+			{
+				case 0x48B7: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x1D07, 1, 1)); // sign bit inverted relative to arg
+				case 0x6A64: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x3BFE, 1, 1)); // sign bit inverted relative to arg
+				case 0x6D8C: return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x0FE6, 1, 1)); // sign bit taken from arg
+			}
+		std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 28);
+		detail::uint32 sign = -static_cast<detail::uint32>(((k>>1)&1)^(arg.data_>>15)); // all-ones when bit 1 of k differs from arg's bit 15, else 0
+		return half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((((k&1) ? sc.second : sc.first)^sign) - sign)); // odd k picks sc.second, even k picks sc.first; (v^sign)-sign negates when sign is all-ones
+	#endif
+	}
+
+	/// Cosine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cos](https://en.cppreference.com/w/cpp/numeric/math/cos).
+	/// \param arg function argument
+	/// \return cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half cos(half arg) // native std::cos path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::cos(detail::half2float<detail::internal_t>(arg.data_)))); // std::cos in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, k; // abs = raw bits without bit 15 (the argument's sign is never consulted again); k is filled in by detail::angle_arg
+		if(!abs) // zero argument yields bits 0x3C00
+			return half(detail::binary, 0x3C00);
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_)); // abs == 0x7C00 goes to invalid(), larger goes to signal()
+		if(abs < 0x2500) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x3BFF, 1, 1)); // 0x3BFF (one below 0x3C00), passed to detail::rounded with flags (1, 1)
+		if(half::round_style != std::round_to_nearest && abs == 0x598C) // single hard-coded magnitude, only in non-nearest rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x80FC, 1, 1));
+		std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 28);
+		detail::uint32 sign = -static_cast<detail::uint32>(((k>>1)^k)&1); // all-ones when bits 0 and 1 of k differ, else 0
+		return half(detail::binary, detail::fixed2half<half::round_style,30,true,true,true>((((k&1) ? sc.first : sc.second)^sign) - sign)); // odd k picks sc.first, even k picks sc.second; (v^sign)-sign negates when sign is all-ones
+	#endif
+	}
+
+	/// Tangent function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::tan](https://en.cppreference.com/w/cpp/numeric/math/tan).
+	/// \param arg function argument
+	/// \return tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN or infinity
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tan(half arg) // native std::tan path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::tan(detail::half2float<detail::internal_t>(arg.data_)))); // std::tan in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 13, k; // abs = raw bits without bit 15; exp starts at 13 and is adjusted by the normalization loops; k is filled in by detail::angle_arg
+		if(!abs) // zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? detail::invalid() : detail::signal(arg.data_)); // abs == 0x7C00 goes to invalid(), larger goes to signal()
+		if(abs < 0x2700) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1)); // arg's own bits, passed to detail::rounded with flags (0, 1)
+		if(half::round_style != std::round_to_nearest) // hard-coded results for two magnitudes, only in non-nearest rounding modes
+			switch(abs)
+			{
+				case 0x658C: return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x07E6, 1, 1)); // sign bit taken from arg
+				case 0x7330: return half(detail::binary, detail::rounded<half::round_style,true>((~arg.data_&0x8000)|0x4B62, 1, 1)); // sign bit inverted relative to arg
+			}
+		std::pair<detail::uint32,detail::uint32> sc = detail::sincos(detail::angle_arg(abs, k), 30);
+		if(k & 1) // odd k: replace (first, second) by (-second, first)
+			sc = std::make_pair(-sc.second, sc.first);
+		detail::uint32 signy = detail::sign_mask(sc.first), signx = detail::sign_mask(sc.second); // NOTE(review): sign_mask presumably returns all-ones for values with bit 31 set — confirm
+		detail::uint32 my = (sc.first^signy) - signy, mx = (sc.second^signx) - signx; // (v^mask)-mask: magnitudes of numerator (my) and denominator (mx)
+		for(; my<0x80000000; my<<=1,--exp) ; // shift my up until bit 31 is set, lowering exp
+		for(; mx<0x80000000; mx<<=1,++exp) ; // shift mx up until bit 31 is set, raising exp
+		return half(detail::binary, detail::tangent_post<half::round_style>(my, mx, exp, (signy^signx^arg.data_)&0x8000)); // result sign bit = bit 15 of signy ^ signx ^ arg's bits
+	#endif
+	}
+
+	/// Arc sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::asin](https://en.cppreference.com/w/cpp/numeric/math/asin).
+	/// \param arg function argument
+	/// \return arc sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half asin(half arg) // native std::asin path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::asin(detail::half2float<detail::internal_t>(arg.data_)))); // std::asin in detail::internal_t, converted back using half::round_style
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; // abs = raw bits without bit 15; sign = bit 15 left in place
+		if(!abs) // zero is returned unchanged
+			return arg;
+		if(abs >= 0x3C00)
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : (abs>0x3C00) ? detail::invalid() : // abs > 0x7C00 goes to signal(); other abs > 0x3C00 go to invalid()
+										detail::rounded<half::round_style,true>(sign|0x3E48, 0, 1)); // abs == 0x3C00: sign | 0x3E48 (0x3E48 decodes to 1.5703125, just below pi/2), via detail::rounded with flags (0, 1)
+		if(abs < 0x2900) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1)); // arg's own bits, passed to detail::rounded with flags (0, 1)
+		if(half::round_style != std::round_to_nearest && (abs == 0x2B44 || abs == 0x2DC3)) // two hard-coded magnitudes, only in non-nearest rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_+1, 1, 1)); // arg's bits plus 1, flags (1, 1)
+		std::pair<detail::uint32,detail::uint32> sc = detail::atan2_args(abs); // pair produced by detail::atan2_args; here first is passed as atan2's first argument
+		detail::uint32 m = detail::atan2(sc.first, sc.second, (half::round_style==std::round_to_nearest) ? 27 : 26); // third argument depends on the rounding style
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,true,true>(m, 14, sign)); // arg's sign bit is applied to the result
+	#endif
+	}
+
+	/// Arc cosine function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::acos](https://en.cppreference.com/w/cpp/numeric/math/acos).
+	/// \param arg function argument
+	/// \return arc cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half acos(half arg) // native std::acos path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::acos(detail::half2float<detail::internal_t>(arg.data_)))); // std::acos in detail::internal_t, converted back using half::round_style
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15; // abs = raw bits without bit 15; sign = bit 15 shifted down to 0/1
+		if(!abs) // zero argument
+			return half(detail::binary, detail::rounded<half::round_style,true>(0x3E48, 0, 1)); // 0x3E48 decodes to 1.5703125 (just below pi/2), via detail::rounded with flags (0, 1)
+		if(abs >= 0x3C00)
+			return half(detail::binary,	(abs>0x7C00) ? detail::signal(arg.data_) : (abs>0x3C00) ? detail::invalid() : // abs > 0x7C00 goes to signal(); other abs > 0x3C00 go to invalid()
+										sign ? detail::rounded<half::round_style,true>(0x4248, 0, 1) : 0); // abs == 0x3C00: bit 15 set gives 0x4248 (decodes to 3.140625, just below pi), bit 15 clear gives bits 0
+		std::pair<detail::uint32,detail::uint32> cs = detail::atan2_args(abs); // same helper as asin, but the pair members are passed to atan2 in swapped order
+		detail::uint32 m = detail::atan2(cs.second, cs.first, 28);
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,true,true>(sign ? (0xC90FDAA2-m) : m, 15, 0, sign)); // bit 15 set: use 0xC90FDAA2 - m, where 0xC90FDAA2 is about pi * 2^30
+	#endif
+	}
+
+	/// Arc tangent function.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::atan](https://en.cppreference.com/w/cpp/numeric/math/atan).
+	/// \param arg function argument
+	/// \return arc tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atan(half arg) // native std::atan path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::atan(detail::half2float<detail::internal_t>(arg.data_)))); // std::atan in detail::internal_t, converted back using half::round_style
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; // abs = raw bits without bit 15; sign = bit 15 left in place
+		if(!abs) // zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs==0x7C00) ? detail::rounded<half::round_style,true>(sign|0x3E48, 0, 1) : detail::signal(arg.data_)); // abs == 0x7C00: sign | 0x3E48 (0x3E48 decodes to 1.5703125, just below pi/2); larger goes to signal()
+		if(abs <= 0x2700) // small-argument shortcut (note: inclusive bound here)
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1)); // arg's bits minus 1, passed to detail::rounded with flags (1, 1)
+		int exp = (abs>>10) + (abs<=0x3FF); // exponent field, with 1 added when abs has no bits above bit 9
+		detail::uint32 my = (abs&0x3FF) | ((abs>0x3FF)<<10); // low 10 bits, with bit 10 set only when abs > 0x3FF
+		detail::uint32 m = (exp>15) ?	detail::atan2(my<<19, 0x20000000>>(exp-15), (half::round_style==std::round_to_nearest) ? 26 : 24) : // exp > 15: numerator shift is fixed, the denominator 0x20000000 is shifted down by exp-15
+										detail::atan2(my<<(exp+4), 0x20000000, (half::round_style==std::round_to_nearest) ? 30 : 28); // otherwise the numerator is shifted by exp+4 against a fixed denominator
+		return half(detail::binary, detail::fixed2half<half::round_style,30,false,true,true>(m, 14, sign)); // arg's sign bit is applied to the result
+	#endif
+	}
+
+	/// Two-argument arc tangent function.
+	/// This function may be 1 ULP off the correctly rounded exact result in ~0.005% of inputs for `std::round_to_nearest`, 
+	/// in ~0.1% of inputs for `std::round_toward_zero` and in ~0.02% of inputs for any other rounding mode.
+	///
+	/// **See also:** Documentation for [std::atan2](https://en.cppreference.com/w/cpp/numeric/math/atan2).
+	/// \param y numerator
+	/// \param x denominator
+	/// \return arc tangent value
+	/// \exception FE_INVALID if \a x or \a y is signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atan2(half y, half x) // native std::atan2 path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::atan2(detail::half2float<detail::internal_t>(y.data_), detail::half2float<detail::internal_t>(x.data_)))); // std::atan2 in detail::internal_t, converted back using half::round_style
+	#else
+		unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, signx = x.data_ >> 15, signy = y.data_ & 0x8000; // signx is 0/1, signy keeps bit 15 in place so it can be OR-ed into result bits
+		if(absx >= 0x7C00 || absy >= 0x7C00)
+		{
+			if(absx > 0x7C00 || absy > 0x7C00) // any magnitude above 0x7C00 goes to signal()
+				return half(detail::binary, detail::signal(x.data_, y.data_));
+			if(absy == 0x7C00)
+				return half(detail::binary, (absx<0x7C00) ?	detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1) : // absy == 0x7C00, absx below it: 0x3E48 (decodes to 1.5703125, just below pi/2)
+													signx ?	detail::rounded<half::round_style,true>(signy|0x40B6, 0, 1) : // both 0x7C00, x's bit 15 set: 0x40B6 (decodes to 2.35546875, just below 3*pi/4)
+															detail::rounded<half::round_style,true>(signy|0x3A48, 0, 1)); // both 0x7C00, x's bit 15 clear: 0x3A48 (decodes to 0.78515625, just below pi/4)
+			return (x.data_==0x7C00) ? half(detail::binary, signy) : half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1)); // only absx is 0x7C00: x == 0x7C00 gives a zero carrying y's sign, otherwise 0x4248 (decodes to 3.140625, just below pi)
+		}
+		if(!absy) // y is zero
+			return signx ? half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1)) : y; // x's bit 15 set: signy | 0x4248; otherwise y is returned unchanged
+		if(!absx) // x is zero (y is not)
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1));
+		int d = (absy>>10) + (absy<=0x3FF) - (absx>>10) - (absx<=0x3FF); // difference of the exponent fields, each bumped by 1 when the magnitude is <= 0x3FF
+		if(d > (signx ? 18 : 12)) // y dominates by enough: same result as the x == 0 case
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x3E48, 0, 1));
+		if(signx && d < -11) // x has bit 15 set and dominates by enough: same result as y == 0 with bit 15 of x set
+			return half(detail::binary, detail::rounded<half::round_style,true>(signy|0x4248, 0, 1));
+		if(!signx && d < ((half::round_style==std::round_toward_zero) ? -15 : -9)) // x has bit 15 clear and dominates: result is computed as an integer quotient my/mx below
+		{
+			for(; absy<0x400; absy<<=1,--d) ; // shift absy until bit 10 is set, lowering d per shift
+			detail::uint32 mx = ((absx<<1)&0x7FF) | 0x800, my = ((absy<<1)&0x7FF) | 0x800; // 12-bit values: low bits of abs*<<1 with bit 11 forced on
+			int i = my < mx;
+			d -= i;
+			if(d < -25)
+				return half(detail::binary, detail::underflow<half::round_style>(signy));
+			my <<= 11 + i; // one extra bit of shift when my < mx
+			return half(detail::binary, detail::fixed2half<half::round_style,11,false,false,true>(my/mx, d+14, signy, my%mx!=0)); // last argument flags a non-zero remainder
+		}
+		detail::uint32 m = detail::atan2(	((absy&0x3FF)|((absy>0x3FF)<<10))<<(19+((d<0) ? d : (d>0) ? 0 : -1)), // y term: shift is 19+d for d < 0, 19 for d > 0, 18 for d == 0
+											((absx&0x3FF)|((absx>0x3FF)<<10))<<(19-((d>0) ? d : (d<0) ? 0 : 1))); // x term: shift is 19-d for d > 0, 19 for d < 0, 18 for d == 0
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,true,true>(signx ? (0xC90FDAA2-m) : m, 15, signy, signx)); // x's bit 15 set: use 0xC90FDAA2 - m, where 0xC90FDAA2 is about pi * 2^30
+	#endif
+	}
+
+	/// \}
+	/// \anchor hyperbolic
+	/// \name Hyperbolic functions
+	/// \{
+
+	/// Hyperbolic sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::sinh](https://en.cppreference.com/w/cpp/numeric/math/sinh).
+	/// \param arg function argument
+	/// \return hyperbolic sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half sinh(half arg) // native std::sinh path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::sinh(detail::half2float<detail::internal_t>(arg.data_)))); // std::sinh in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, exp; // abs = raw bits without bit 15; exp is filled in by detail::hyperbolic_args
+		if(!abs || abs >= 0x7C00)
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; // abs > 0x7C00 goes to signal(); zero and abs == 0x7C00 return arg unchanged
+		if(abs <= 0x2900) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1)); // arg's own bits, passed to detail::rounded with flags (0, 1)
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, (half::round_style==std::round_to_nearest) ? 29 : 27); // third argument depends on the rounding style
+		detail::uint32 m = mm.first - mm.second; // sinh uses the difference of the pair (cosh uses the sum)
+		for(exp+=13; m<0x80000000 && exp; m<<=1,--exp) ; // add 13 to exp, then shift m up until bit 31 is set or exp reaches 0
+		unsigned int sign = arg.data_ & 0x8000;
+		if(exp > 29) // exponent beyond 29 is reported through detail::overflow with the argument's sign
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,true>(m, exp, sign));
+	#endif
+	}
+
+	/// Hyperbolic cosine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::cosh](https://en.cppreference.com/w/cpp/numeric/math/cosh).
+	/// \param arg function argument
+	/// \return hyperbolic cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half cosh(half arg) // native std::cosh path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::cosh(detail::half2float<detail::internal_t>(arg.data_)))); // std::cosh in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, exp; // abs = raw bits without bit 15 (the sign is never used again); exp is filled in by detail::hyperbolic_args
+		if(!abs) // zero argument yields bits 0x3C00
+			return half(detail::binary, 0x3C00);
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : 0x7C00); // abs > 0x7C00 goes to signal(); abs == 0x7C00 yields 0x7C00 regardless of sign
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, (half::round_style==std::round_to_nearest) ? 23 : 26); // third argument depends on the rounding style
+		detail::uint32 m = mm.first + mm.second; // cosh uses the sum of the pair (sinh uses the difference)
+		int i = static_cast<int>((~m & 0xFFFFFFFF) >> 31); // i = 1 when bit 31 of the sum is clear, else 0
+		m = (m>>i) | (m&i) | 0x80000000; // when i == 1: shift right by one keeping the dropped bit OR-ed in; bit 31 is forced on either way
+		if((exp+=13+i) > 29) // exponent beyond 29 is reported through detail::overflow
+			return half(detail::binary, detail::overflow<half::round_style>());
+		return half(detail::binary, detail::fixed2half<half::round_style,31,false,false,true>(m, exp));
+	#endif
+	}
+
+	/// Hyperbolic tangent.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::tanh](https://en.cppreference.com/w/cpp/numeric/math/tanh).
+	/// \param arg function argument
+	/// \return hyperbolic tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tanh(half arg) // native std::tanh path when HALF_ARITHMETIC_TYPE is defined, otherwise integer-only path on the raw bits
+	{
+	#ifdef HALF_ARITHMETIC_TYPE
+		return half(detail::binary, detail::float2half<half::round_style>(std::tanh(detail::half2float<detail::internal_t>(arg.data_)))); // std::tanh in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, exp; // abs = raw bits without bit 15; exp is first written by detail::hyperbolic_args, then reset to 13 below
+		if(!abs) // zero is returned unchanged
+			return arg;
+		if(abs >= 0x7C00)
+			return half(detail::binary, (abs>0x7C00) ? detail::signal(arg.data_) : (arg.data_-0x4000)); // abs > 0x7C00 goes to signal(); abs == 0x7C00 maps 0x7C00 -> 0x3C00 and 0xFC00 -> 0xBC00
+		if(abs >= 0x4500) // large-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1)); // arg's sign | 0x3BFF (one below 0x3C00), flags (1, 1)
+		if(abs < 0x2700) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1)); // arg's bits minus 1, flags (1, 1)
+		if(half::round_style != std::round_to_nearest && abs == 0x2D3F) // single hard-coded magnitude, only in non-nearest rounding modes
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-3, 0, 1));
+		std::pair<detail::uint32,detail::uint32> mm = detail::hyperbolic_args(abs, exp, 27);
+		detail::uint32 my = mm.first - mm.second - (half::round_style!=std::round_to_nearest), mx = mm.first + mm.second; // numerator = difference (minus 1 in non-nearest modes), denominator = sum
+		int i = static_cast<int>((~mx&0xFFFFFFFF) >> 31); // i = 1 when bit 31 of the sum is clear, else 0
+		for(exp=13; my<0x80000000; my<<=1,--exp) ; // exp restarts at 13; shift my up until bit 31 is set
+		mx = (mx>>i) | 0x80000000; // shift mx down by i and force bit 31 on
+		return half(detail::binary, detail::tangent_post<half::round_style>(my, mx, exp-i, arg.data_&0x8000)); // arg's sign bit is applied to the result
+	#endif
+	}
+
+	/// Hyperbolic area sine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::asinh](https://en.cppreference.com/w/cpp/numeric/math/asinh).
+	/// \param arg function argument
+	/// \return area sine value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half asinh(half arg) // uses std::asinh only when HALF_ARITHMETIC_TYPE is defined and HALF_ENABLE_CPP11_CMATH is non-zero
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::asinh(detail::half2float<detail::internal_t>(arg.data_)))); // std::asinh in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF; // raw bits without bit 15
+		if(!abs || abs >= 0x7C00)
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; // abs > 0x7C00 goes to signal(); zero and abs == 0x7C00 return arg unchanged
+		if(abs <= 0x2900) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-1, 1, 1)); // arg's bits minus 1, passed to detail::rounded with flags (1, 1)
+		if(half::round_style != std::round_to_nearest) // hard-coded results for two magnitudes, only in non-nearest rounding modes
+			switch(abs)
+			{
+				case 0x32D4: return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-13, 1, 1)); // arg's bits minus 13
+				case 0x3B5B: return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_-197, 1, 1)); // arg's bits minus 197
+			}
+		return half(detail::binary, detail::area<half::round_style,true>(arg.data_)); // general case: detail::area with its bool template argument true (acosh passes false)
+	#endif
+	}
+
+	/// Hyperbolic area cosine.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::acosh](https://en.cppreference.com/w/cpp/numeric/math/acosh).
+	/// \param arg function argument
+	/// \return area cosine value of \a arg
+	/// \exception FE_INVALID for signaling NaN or arguments <1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half acosh(half arg) // uses std::acosh only when HALF_ARITHMETIC_TYPE is defined and HALF_ENABLE_CPP11_CMATH is non-zero
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::acosh(detail::half2float<detail::internal_t>(arg.data_)))); // std::acosh in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF; // raw bits without bit 15
+		if((arg.data_&0x8000) || abs < 0x3C00) // bit 15 set, or magnitude below 0x3C00
+			return half(detail::binary, (abs<=0x7C00) ? detail::invalid() : detail::signal(arg.data_)); // abs <= 0x7C00 goes to invalid(); larger goes to signal()
+		if(abs == 0x3C00) // argument bits exactly 0x3C00 yield bits 0
+			return half(detail::binary, 0);
+		if(arg.data_ >= 0x7C00) // bit 15 is known to be clear here, so this is abs >= 0x7C00
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg; // abs > 0x7C00 goes to signal(); 0x7C00 is returned unchanged
+		return half(detail::binary, detail::area<half::round_style,false>(arg.data_)); // general case: detail::area with its bool template argument false (asinh passes true)
+	#endif
+	}
+
+	/// Hyperbolic area tangent.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::atanh](https://en.cppreference.com/w/cpp/numeric/math/atanh).
+	/// \param arg function argument
+	/// \return area tangent value of \a arg
+	/// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1
+	/// \exception FE_DIVBYZERO for +/-1
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half atanh(half arg) // uses std::atanh only when HALF_ARITHMETIC_TYPE is defined and HALF_ENABLE_CPP11_CMATH is non-zero
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::atanh(detail::half2float<detail::internal_t>(arg.data_)))); // std::atanh in detail::internal_t, converted back using half::round_style
+	#else
+		int abs = arg.data_ & 0x7FFF, exp = 0; // abs = raw bits without bit 15; exp counts the normalization shifts of mx below
+		if(!abs) // zero is returned unchanged
+			return arg;
+		if(abs >= 0x3C00)
+			return half(detail::binary, (abs==0x3C00) ? detail::pole(arg.data_&0x8000) : (abs<=0x7C00) ? detail::invalid() : detail::signal(arg.data_)); // abs == 0x3C00 goes to pole() with arg's sign; up to 0x7C00 goes to invalid(); above goes to signal()
+		if(abs < 0x2700) // small-argument shortcut
+			return half(detail::binary, detail::rounded<half::round_style,true>(arg.data_, 0, 1)); // arg's own bits, passed to detail::rounded with flags (0, 1)
+		detail::uint32 m = static_cast<detail::uint32>((abs&0x3FF)|((abs>0x3FF)<<10)) << ((abs>>10)+(abs<=0x3FF)+6), my = 0x80000000 + m, mx = 0x80000000 - m; // NOTE(review): reads as 1+|arg| (my) and 1-|arg| (mx) with 0x80000000 standing for 1 — confirm
+		for(; mx<0x80000000; mx<<=1,++exp) ; // shift mx up until bit 31 is set, counting shifts in exp
+		int i = my >= mx, s; // i selects a one-bit pre-shift of my before the division; s only receives divide64's third argument
+		return half(detail::binary, detail::log2_post<half::round_style,0xB8AA3B2A>(detail::log2( // NOTE(review): the meaning of the 0xB8AA3B2A template constant is defined by detail::log2_post, which is not visible here
+			(detail::divide64(my>>i, mx, s)+1)>>1, 27)+0x10, exp+i-1, 16, arg.data_&0x8000)); // quotient my/mx is rounded to half its width, fed to log2, and arg's sign bit is applied to the result
+	#endif
+	}
+
+	/// \}
+	/// \anchor special
+	/// \name Error and gamma functions
+	/// \{
+
+	/// Error function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs.
+	///
+	/// **See also:** Documentation for [std::erf](https://en.cppreference.com/w/cpp/numeric/math/erf).
+	/// \param arg function argument
+	/// \return error function value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half erf(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::erf(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF;	// magnitude bits with the sign bit masked off
+		if(!abs || abs >= 0x7C00)	// zero, infinity or NaN
+			return (abs>=0x7C00) ? half(detail::binary, (abs==0x7C00) ? (arg.data_-0x4000) : detail::signal(arg.data_)) : arg;	// infinity: subtracting 0x4000 turns 0x7C00 into 0x3C00 (1.0) and keeps the sign bit; NaN -> signal(); +/-0 returned unchanged
+		if(abs >= 0x4200)	// magnitude bits >= 0x4200: start from 0x3BFF (one step below 0x3C00) with the argument's sign and let rounded() decide
+			return half(detail::binary, detail::rounded<half::round_style,true>((arg.data_&0x8000)|0x3BFF, 1, 1));
+		return half(detail::binary, detail::erf<half::round_style,false>(arg.data_));
+	#endif
+	}
+
+	/// Complementary error function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs.
+	///
+	/// **See also:** Documentation for [std::erfc](https://en.cppreference.com/w/cpp/numeric/math/erfc).
+	/// \param arg function argument
+	/// \return 1 minus error function value of \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half erfc(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::erfc(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;	// magnitude bits and isolated sign bit
+		if(abs >= 0x7C00)	// infinity or NaN
+			return (abs>=0x7C00) ? half(detail::binary, (abs==0x7C00) ? (sign>>1) : detail::signal(arg.data_)) : arg;	// the inner abs>=0x7C00 test is always true in this branch; +infinity -> 0, -infinity -> 0x4000 (sign>>1), NaN -> signal()
+		if(!abs)	// +/-0 gives exactly 0x3C00 (1.0)
+			return half(detail::binary, 0x3C00);
+		if(abs >= 0x4400)	// magnitude bits >= 0x4400: positive arguments start from 0, negative ones from 0x3FFF (one step below 0x4000), then rounded() decides
+			return half(detail::binary, detail::rounded<half::round_style,true>((sign>>1)-(sign>>15), sign>>15, 1));
+		return half(detail::binary, detail::erf<half::round_style,true>(arg.data_));
+	#endif
+	}
+
+	/// Natural logarithm of gamma function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.025% of inputs.
+	///
+	/// **See also:** Documentation for [std::lgamma](https://en.cppreference.com/w/cpp/numeric/math/lgamma).
+	/// \param arg function argument
+	/// \return natural logarithm of gamma function for \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_DIVBYZERO for 0 or negative integer arguments
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half lgamma(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::lgamma(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		int abs = arg.data_ & 0x7FFF;	// magnitude bits with the sign bit masked off
+		if(abs >= 0x7C00)	// infinity of either sign -> +infinity (0x7C00); NaN -> signal()
+			return half(detail::binary, (abs==0x7C00) ? 0x7C00 : detail::signal(arg.data_));
+		if(!abs || arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs&((1<<(25-(abs>>10)))-1))))	// zero, negative with magnitude bits >= 0x6400, or negative with magnitude >= 0x3C00 (1.0) whose low (25 - exponent field) mantissa bits are all zero, i.e. no fractional part
+			return half(detail::binary, detail::pole());
+		if(arg.data_ == 0x3C00 || arg.data_ == 0x4000)	// arguments 0x3C00 (1.0) and 0x4000 yield exactly +0
+			return half(detail::binary, 0);
+		return half(detail::binary, detail::gamma<half::round_style,true>(arg.data_));
+	#endif
+	}
+
+	/// Gamma function.
+	/// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.25% of inputs.
+	///
+	/// **See also:** Documentation for [std::tgamma](https://en.cppreference.com/w/cpp/numeric/math/tgamma).
+	/// \param arg function argument
+	/// \return gamma function value of \a arg
+	/// \exception FE_INVALID for signaling NaN, negative infinity or negative integer arguments
+	/// \exception FE_DIVBYZERO for 0
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half tgamma(half arg)
+	{
+	#if defined(HALF_ARITHMETIC_TYPE) && HALF_ENABLE_CPP11_CMATH
+		return half(detail::binary, detail::float2half<half::round_style>(std::tgamma(detail::half2float<detail::internal_t>(arg.data_))));
+	#else
+		unsigned int abs = arg.data_ & 0x7FFF;	// magnitude bits with the sign bit masked off
+		if(!abs)	// +/-0: pole, with the argument's bits (sign) handed to pole()
+			return half(detail::binary, detail::pole(arg.data_));
+		if(abs >= 0x7C00)	// +infinity is returned unchanged; -infinity and NaN are passed to signal()
+			return (arg.data_==0x7C00) ? arg : half(detail::binary, detail::signal(arg.data_));
+		if(arg.data_ >= 0xE400 || (arg.data_ >= 0xBC00 && !(abs&((1<<(25-(abs>>10)))-1))))	// negative with magnitude bits >= 0x6400, or negative with magnitude >= 0x3C00 (1.0) and no fractional mantissa bits
+			return half(detail::binary, detail::invalid());
+		if(arg.data_ >= 0xCA80)	// remaining negative arguments with bits >= 0xCA80: result underflows
+			return half(detail::binary, detail::underflow<half::round_style>((1-((abs>>(25-(abs>>10)))&1))<<15));	// passes 0x8000 when the lowest integer bit of the magnitude is 0, else 0
+		if(arg.data_ <= 0x100 || (arg.data_ >= 0x4900 && arg.data_ < 0x8000))	// positive arguments with bits <= 0x100 or >= 0x4900: result overflows
+			return half(detail::binary, detail::overflow<half::round_style>());
+		if(arg.data_ == 0x3C00)	// argument 0x3C00 (1.0) maps to itself
+			return arg;
+		return half(detail::binary, detail::gamma<half::round_style,false>(arg.data_));
+	#endif
+	}
+
+	/// \}
+	/// \anchor rounding
+	/// \name Rounding
+	/// \{
+
+	/// Nearest integer not less than half value.
+	/// **See also:** Documentation for [std::ceil](https://en.cppreference.com/w/cpp/numeric/math/ceil).
+	/// \param arg half to round
+	/// \return nearest integer not less than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half ceil(half arg) { return half(detail::binary, detail::integral<std::round_toward_infinity,true,true>(arg.data_)); }
+
+	/// Nearest integer not greater than half value.
+	/// **See also:** Documentation for [std::floor](https://en.cppreference.com/w/cpp/numeric/math/floor).
+	/// \param arg half to round
+	/// \return nearest integer not greater than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half floor(half arg) { return half(detail::binary, detail::integral<std::round_toward_neg_infinity,true,true>(arg.data_)); }
+
+	/// Nearest integer not greater in magnitude than half value.
+	/// **See also:** Documentation for [std::trunc](https://en.cppreference.com/w/cpp/numeric/math/trunc).
+	/// \param arg half to round
+	/// \return nearest integer not greater in magnitude than \a arg
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half trunc(half arg) { return half(detail::binary, detail::integral<std::round_toward_zero,true,true>(arg.data_)); }
+
+	/// Nearest integer.
+	/// **See also:** Documentation for [std::round](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half round(half arg) { return half(detail::binary, detail::integral<std::round_to_nearest,false,true>(arg.data_)); }
+
+	/// Nearest integer.
+	/// **See also:** Documentation for [std::lround](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID if value is not representable as `long`
+	inline long lround(half arg) { return detail::half2int<std::round_to_nearest,false,false,long>(arg.data_); }
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::rint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_INEXACT if value had to be rounded
+	inline half rint(half arg) { return half(detail::binary, detail::integral<half::round_style,true,true>(arg.data_)); }
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::lrint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID if value is not representable as `long`
+	/// \exception FE_INEXACT if value had to be rounded
+	inline long lrint(half arg) { return detail::half2int<half::round_style,true,true,long>(arg.data_); }
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::nearbyint](https://en.cppreference.com/w/cpp/numeric/math/nearbyint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID for signaling NaN
+	inline half nearbyint(half arg) { return half(detail::binary, detail::integral<half::round_style,true,false>(arg.data_)); }
+#if HALF_ENABLE_CPP11_LONG_LONG
+	/// Nearest integer.
+	/// **See also:** Documentation for [std::llround](https://en.cppreference.com/w/cpp/numeric/math/round).
+	/// \param arg half to round
+	/// \return nearest integer, rounded away from zero in half-way cases
+	/// \exception FE_INVALID if value is not representable as `long long`
+	inline long long llround(half arg) { return detail::half2int<std::round_to_nearest,false,false,long long>(arg.data_); }
+
+	/// Nearest integer using half's internal rounding mode.
+	/// **See also:** Documentation for [std::llrint](https://en.cppreference.com/w/cpp/numeric/math/rint).
+	/// \param arg half expression to round
+	/// \return nearest integer using default rounding mode
+	/// \exception FE_INVALID if value is not representable as `long long`
+	/// \exception FE_INEXACT if value had to be rounded
+	inline long long llrint(half arg) { return detail::half2int<half::round_style,true,true,long long>(arg.data_); }
+#endif
+
+	/// \}
+	/// \anchor float
+	/// \name Floating point manipulation
+	/// \{
+
+	/// Decompress floating-point number.
+	/// **See also:** Documentation for [std::frexp](https://en.cppreference.com/w/cpp/numeric/math/frexp).
+	/// \param arg number to decompress
+	/// \param exp address to store exponent at (set to 0 for zero, infinity and NaN)
+	/// \return significand in range [0.5, 1)
+	/// \exception FE_INVALID for signaling NaN
+	inline half frexp(half arg, int *exp)
+	{
+		*exp = 0;	// default exponent, left in place for zero, infinity and NaN
+		unsigned int abs = arg.data_ & 0x7FFF;	// magnitude bits with the sign bit masked off
+		if(abs >= 0x7C00 || !abs)	// infinity, NaN or zero: nothing to decompose
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// NaN goes through signal(); infinity and zero are returned unchanged
+		for(; abs<0x400; abs<<=1,--*exp) ;	// subnormal input: shift left until bit 10 is set, lowering *exp once per shift
+		*exp += (abs>>10) - 14;	// exponent field minus 14, matching the exponent field of 14 forced into the result below
+		return half(detail::binary, (arg.data_&0x8000)|0x3800|(abs&0x3FF));	// original sign and mantissa bits under the fixed exponent pattern 0x3800
+	}
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half scalbln(half arg, long exp)
+	{
+		unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000;	// magnitude bits and isolated sign bit
+		if(abs >= 0x7C00 || !abs)	// infinity, NaN or zero: scaling has no effect
+			return (abs>0x7C00) ? half(detail::binary, detail::signal(arg.data_)) : arg;	// NaN goes through signal(); infinity and zero are returned unchanged
+		for(; abs<0x400; abs<<=1,--exp) ;	// subnormal input: shift left until bit 10 is set, lowering exp once per shift
+		exp += abs >> 10;	// exp now holds the exponent field of the result; NOTE(review): exp is a signed long, so values near LONG_MAX/LONG_MIN would overflow here or in the loop above - confirm callers never pass them
+		if(exp > 30)	// exponent field above 30: overflow with the argument's sign
+			return half(detail::binary, detail::overflow<half::round_style>(sign));
+		else if(exp < -10)	// exponent field below -10: underflow with the argument's sign
+			return half(detail::binary, detail::underflow<half::round_style>(sign));
+		else if(exp > 0)	// exponent field 1..30: assemble sign, new exponent field and unchanged mantissa bits directly
+			return half(detail::binary, sign|(exp<<10)|(abs&0x3FF));
+		unsigned int m = (abs&0x3FF) | 0x400;	// mantissa with bit 10 set explicitly, for the shifted (exp <= 0) result
+		return half(detail::binary, detail::rounded<half::round_style,false>(sign|(m>>(1-exp)), (m>>-exp)&1, (m&((1<<-exp)-1))!=0));	// shift right by 1-exp; the first bit shifted out and an "any lower bit set" flag are handed to rounded()
+	}
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half scalbn(half arg, int exp) { return scalbln(arg, exp); }
+
+	/// Multiply by power of two.
+	/// This function is exact to rounding for all rounding modes.
+	///
+	/// **See also:** Documentation for [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp).
+	/// \param arg number to modify
+	/// \param exp power of two to multiply with
+	/// \return \a arg multiplied by 2 raised to \a exp
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	inline half ldexp(half arg, int exp) { return scalbln(arg, exp); }
+
+	/// Extract integer and fractional parts.
+	/// **See also:** Documentation for [std::modf](https://en.cppreference.com/w/cpp/numeric/math/modf).
+	/// \param arg number to split
+	/// \param iptr address to store integer part at
+	/// \return fractional part
+	/// \exception FE_INVALID for signaling NaN
+	inline half modf(half arg, half *iptr)
+	{
+		unsigned int abs = arg.data_ & 0x7FFF;	// magnitude bits with the sign bit masked off
+		if(abs > 0x7C00)	// NaN: both outputs receive the result of signal()
+		{
+			arg = half(detail::binary, detail::signal(arg.data_));
+			return *iptr = arg, arg;
+		}
+		if(abs >= 0x6400)	// magnitude bits >= 0x6400 (includes infinity): integer part is arg itself, fraction is a zero carrying arg's sign
+			return *iptr = arg, half(detail::binary, arg.data_&0x8000);
+		if(abs < 0x3C00)	// magnitude below 0x3C00 (1.0): integer part is a zero carrying arg's sign, fraction is arg itself
+			return iptr->data_ = arg.data_ & 0x8000, arg;
+		unsigned int exp = abs >> 10, mask = (1<<(25-exp)) - 1, m = arg.data_ & mask;	// exp: exponent field; mask: the low (25 - exp) mantissa bits; m: those bits of arg
+		iptr->data_ = arg.data_ & ~mask;	// integer part: arg with the masked low bits cleared
+		if(!m)	// no masked bits set: fraction is a zero carrying arg's sign
+			return half(detail::binary, arg.data_&0x8000);
+		for(; m<0x400; m<<=1,--exp) ;	// shift the remaining bits left until bit 10 is set, lowering exp once per shift
+		return half(detail::binary, (arg.data_&0x8000)|(exp<<10)|(m&0x3FF));	// fraction: arg's sign, adjusted exponent field, low 10 bits of m
+	}
+
+	/// Extract exponent.
+	/// **See also:** Documentation for [std::ilogb](https://en.cppreference.com/w/cpp/numeric/math/ilogb).
+	/// \param arg number to query
+	/// \return floating-point exponent
+	/// \retval FP_ILOGB0 for zero
+	/// \retval FP_ILOGBNAN for NaN
+	/// \retval INT_MAX for infinity
+	/// \exception FE_INVALID for 0, infinite values or NaN
+	inline int ilogb(half arg)
+	{
+		int abs = arg.data_ & 0x7FFF, exp;	// abs: magnitude bits; exp: result, initialised in the loop header below
+		if(!abs || abs >= 0x7C00)	// zero, infinity or NaN: raise FE_INVALID and return the matching constant
+		{
+			detail::raise(FE_INVALID);
+			return !abs ? FP_ILOGB0 : (abs==0x7C00) ? INT_MAX : FP_ILOGBNAN;
+		}
+		for(exp=(abs>>10)-15; abs<0x200; abs<<=1,--exp) ;	// start from exponent field minus 15; while bit 9 and above are clear, shift left and lower exp
+		return exp;
+	}
+
+	/// Extract exponent.
+	/// **See also:** Documentation for [std::logb](https://en.cppreference.com/w/cpp/numeric/math/logb).
+	/// \param arg number to query
+	/// \return floating-point exponent
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_DIVBYZERO for 0
+	inline half logb(half arg)
+	{
+		int abs = arg.data_ & 0x7FFF, exp;	// abs: magnitude bits; exp: integer exponent, later reused as exponent field of the result
+		if(!abs)	// zero: pole() is called with the sign bit (0x8000) set
+			return half(detail::binary, detail::pole(0x8000));
+		if(abs >= 0x7C00)	// infinity of either sign -> +infinity (0x7C00); NaN -> signal()
+			return half(detail::binary, (abs==0x7C00) ? 0x7C00 : detail::signal(arg.data_));
+		for(exp=(abs>>10)-15; abs<0x200; abs<<=1,--exp) ;	// same exponent extraction as ilogb(): exponent field minus 15, lowered while bit 9 and above are clear
+		unsigned int value = static_cast<unsigned>(exp<0) << 15;	// result starts as a zero whose sign bit is set for negative exponents
+		if(exp)	// non-zero exponent: encode the integer |exp| as a half bit pattern
+		{
+			unsigned int m = std::abs(exp) << 6;	// |exp| pre-shifted by 6 bits
+			for(exp=18; m<0x400; m<<=1,--exp) ;	// shift left until bit 10 is set, lowering the exponent field from 18 once per shift
+			value |= (exp<<10) + m;	// m still has bit 10 set, so the addition raises the exponent field by one
+		}
+		return half(detail::binary, value);
+	}
+
+	/// Next representable value.
+	/// **See also:** Documentation for [std::nextafter](https://en.cppreference.com/w/cpp/numeric/math/nextafter).
+	/// \param from value to compute next representable value for
+	/// \param to direction towards which to compute next value
+	/// \return next representable value after \a from in direction towards \a to
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW for infinite result from finite argument
+	/// \exception FE_UNDERFLOW for subnormal result
+	inline half nextafter(half from, half to)
+	{
+		int fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF;	// magnitude bits of both operands
+		if(fabs > 0x7C00 || tabs > 0x7C00)	// either operand is NaN: result comes from signal()
+			return half(detail::binary, detail::signal(from.data_, to.data_));
+		if(from.data_ == to.data_ || !(fabs|tabs))	// identical bit patterns, or both zero (of any sign): return to
+			return to;
+		if(!fabs)	// from is zero: step to the bit pattern 1 with to's sign
+		{
+			detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT);
+			return half(detail::binary, (to.data_&0x8000)+1);
+		}
+		unsigned int out = from.data_ + (((from.data_>>15)^static_cast<unsigned>(	// adds +1 or -1 to the bit pattern: (sign bit XOR comparison result) * 2 - 1
+			(from.data_^(0x8000|(0x8000-(from.data_>>15))))<(to.data_^(0x8000|(0x8000-(to.data_>>15))))))<<1) - 1;	// the comparison is done on transformed bit patterns of from and to (NOTE(review): presumably an order-preserving key so that "<" means from < to - confirm)
+		detail::raise(FE_OVERFLOW, fabs<0x7C00 && (out&0x7C00)==0x7C00);	// finite input whose result has all exponent bits set
+		detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out&0x7C00)<0x400);	// result has a zero exponent field
+		return half(detail::binary, out);
+	}
+
+	/// Next representable value.
+	/// **See also:** Documentation for [std::nexttoward](https://en.cppreference.com/w/cpp/numeric/math/nexttoward).
+	/// \param from value to compute next representable value for
+	/// \param to direction towards which to compute next value
+	/// \return next representable value after \a from in direction towards \a to
+	/// \exception FE_INVALID for signaling NaN
+	/// \exception FE_OVERFLOW for infinite result from finite argument
+	/// \exception FE_UNDERFLOW for subnormal result
+	inline half nexttoward(half from, long double to)
+	{
+		int fabs = from.data_ & 0x7FFF;	// magnitude bits of from
+		if(fabs > 0x7C00)	// from is NaN: result comes from signal()
+			return half(detail::binary, detail::signal(from.data_));
+		long double lfrom = static_cast<long double>(from);	// widened copy of from so it can be compared against to
+		if(detail::builtin_isnan(to) || lfrom == to)	// to is NaN or equal to from: return to converted to half via float
+			return half(static_cast<float>(to));
+		if(!fabs)	// from is zero: step to the bit pattern 1 with to's sign
+		{
+			detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT);
+			return half(detail::binary, (static_cast<unsigned>(detail::builtin_signbit(to))<<15)+1);
+		}
+		unsigned int out = from.data_ + (((from.data_>>15)^static_cast<unsigned>(lfrom<to))<<1) - 1;	// adds +1 or -1 to the bit pattern: (sign bit XOR (lfrom<to)) * 2 - 1
+		detail::raise(FE_OVERFLOW, (out&0x7FFF)==0x7C00);	// result is an infinity bit pattern
+		detail::raise(FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && (out&0x7FFF)<0x400);	// result magnitude is below 0x400 (subnormal range)
+		return half(detail::binary, out);
+	}
+
+	/// Take sign.
+	/// **See also:** Documentation for [std::copysign](https://en.cppreference.com/w/cpp/numeric/math/copysign).
+	/// \param x value to change sign for
+	/// \param y value to take sign from
+	/// \return value equal to \a x in magnitude and to \a y in sign
+	inline HALF_CONSTEXPR half copysign(half x, half y) { return half(detail::binary, x.data_^((x.data_^y.data_)&0x8000)); }
+
+	/// \}
+	/// \anchor classification
+	/// \name Floating point classification
+	/// \{
+
+	/// Classify floating-point value.
+	/// **See also:** Documentation for [std::fpclassify](https://en.cppreference.com/w/cpp/numeric/math/fpclassify).
+	/// \param arg number to classify
+	/// \retval FP_ZERO for positive and negative zero
+	/// \retval FP_SUBNORMAL for subnormal numbers
+	/// \retval FP_INFINITE for positive and negative infinity
+	/// \retval FP_NAN for NaNs
+	/// \retval FP_NORMAL for all other (normal) values
+	inline HALF_CONSTEXPR int fpclassify(half arg)
+	{
+		return	!(arg.data_&0x7FFF) ? FP_ZERO :
+				((arg.data_&0x7FFF)<0x400) ? FP_SUBNORMAL :
+				((arg.data_&0x7FFF)<0x7C00) ? FP_NORMAL :
+				((arg.data_&0x7FFF)==0x7C00) ? FP_INFINITE :
+				FP_NAN;
+	}
+
+	/// Check if finite number.
+	/// **See also:** Documentation for [std::isfinite](https://en.cppreference.com/w/cpp/numeric/math/isfinite).
+	/// \param arg number to check
+	/// \retval true if neither infinity nor NaN
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isfinite(half arg) { return (arg.data_&0x7C00) != 0x7C00; }
+
+	/// Check for infinity.
+	/// **See also:** Documentation for [std::isinf](https://en.cppreference.com/w/cpp/numeric/math/isinf).
+	/// \param arg number to check
+	/// \retval true for positive or negative infinity
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isinf(half arg) { return (arg.data_&0x7FFF) == 0x7C00; }
+
+	/// Check for NaN.
+	/// **See also:** Documentation for [std::isnan](https://en.cppreference.com/w/cpp/numeric/math/isnan).
+	/// \param arg number to check
+	/// \retval true for NaNs
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isnan(half arg) { return (arg.data_&0x7FFF) > 0x7C00; }
+
+	/// Check if normal number.
+	/// **See also:** Documentation for [std::isnormal](https://en.cppreference.com/w/cpp/numeric/math/isnormal).
+	/// \param arg number to check
+	/// \retval true if normal number
+	/// \retval false if either subnormal, zero, infinity or NaN
+	inline HALF_CONSTEXPR bool isnormal(half arg) { return ((arg.data_&0x7C00)!=0) & ((arg.data_&0x7C00)!=0x7C00); }
+
+	/// Check sign.
+	/// **See also:** Documentation for [std::signbit](https://en.cppreference.com/w/cpp/numeric/math/signbit).
+	/// \param arg number to check
+	/// \retval true for negative number
+	/// \retval false for positive number
+	inline HALF_CONSTEXPR bool signbit(half arg) { return (arg.data_&0x8000) != 0; }
+
+	/// \}
+	/// \anchor compfunc
+	/// \name Comparison
+	/// \{
+
+	/// Quiet comparison for greater than.
+	/// **See also:** Documentation for [std::isgreater](https://en.cppreference.com/w/cpp/numeric/math/isgreater).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater than \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isgreater(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) > ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);
+	}
+
+	/// Quiet comparison for greater equal.
+	/// **See also:** Documentation for [std::isgreaterequal](https://en.cppreference.com/w/cpp/numeric/math/isgreaterequal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x greater equal \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isgreaterequal(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) >= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);
+	}
+
+	/// Quiet comparison for less than.
+	/// **See also:** Documentation for [std::isless](https://en.cppreference.com/w/cpp/numeric/math/isless).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less than \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isless(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) < ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);
+	}
+
+	/// Quiet comparison for less equal.
+	/// **See also:** Documentation for [std::islessequal](https://en.cppreference.com/w/cpp/numeric/math/islessequal).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if \a x less equal \a y
+	/// \retval false else
+	inline HALF_CONSTEXPR bool islessequal(half x, half y)
+	{
+		return ((x.data_^(0x8000|(0x8000-(x.data_>>15))))+(x.data_>>15)) <= ((y.data_^(0x8000|(0x8000-(y.data_>>15))))+(y.data_>>15)) && !isnan(x) && !isnan(y);
+	}
+
+	/// Quiet comparison for less or greater.
+	/// **See also:** Documentation for [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if either less or greater
+	/// \retval false else
+	inline HALF_CONSTEXPR bool islessgreater(half x, half y)
+	{
+		return x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF) && !isnan(x) && !isnan(y);
+	}
+
+	/// Quiet check if unordered.
+	/// **See also:** Documentation for [std::isunordered](https://en.cppreference.com/w/cpp/numeric/math/isunordered).
+	/// \param x first operand
+	/// \param y second operand
+	/// \retval true if unordered (one or two NaN operands)
+	/// \retval false else
+	inline HALF_CONSTEXPR bool isunordered(half x, half y) { return isnan(x) || isnan(y); }
+
+	/// \}
+	/// \anchor casting
+	/// \name Casting
+	/// \{
+
+	/// Cast to or from half-precision floating-point number.
+	/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
+	/// directly using the default rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+	///
+	/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
+	/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
+	/// error and casting between [half](\ref half_float::half)s returns the argument unmodified.
+	/// \tparam T destination type (half or built-in arithmetic type)
+	/// \tparam U source type (half or built-in arithmetic type)
+	/// \param arg value to cast
+	/// \return \a arg converted to destination type
+	/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename T,typename U> T half_cast(U arg) { return detail::half_caster<T,U>::cast(arg); }
+
+	/// Cast to or from half-precision floating-point number.
+	/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
+	/// directly using the specified rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
+	///
+	/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
+	/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
+	/// error and casting between [half](\ref half_float::half)s returns the argument unmodified.
+	/// \tparam T destination type (half or built-in arithmetic type)
+	/// \tparam R rounding mode to use.
+	/// \tparam U source type (half or built-in arithmetic type)
+	/// \param arg value to cast
+	/// \return \a arg converted to destination type
+	/// \exception FE_INVALID if \a T is integer type and result is not representable as \a T
+	/// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding
+	template<typename T,std::float_round_style R,typename U> T half_cast(U arg) { return detail::half_caster<T,U,R>::cast(arg); }
+	/// \}
+
+	/// \}
+	/// \anchor errors
+	/// \name Error handling
+	/// \{
+
+	/// Clear exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::feclearexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feclearexcept).
+	/// \param excepts OR of exceptions to clear
+	/// \retval 0 all selected flags cleared successfully
+	inline int feclearexcept(int excepts) { detail::errflags() &= ~excepts; return 0; }
+
+	/// Test exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fetestexcept](https://en.cppreference.com/w/cpp/numeric/fenv/fetestexcept).
+	/// \param excepts OR of exceptions to test
+	/// \return OR of selected exceptions if raised
+	inline int fetestexcept(int excepts) { return detail::errflags() & excepts; }
+
+	/// Raise exception flags.
+	/// This raises the specified floating point exceptions and also invokes any additional automatic exception handling as 
+	/// configured with the [HALF_ERRHANDLING_...](\ref HALF_ERRHANDLING_ERRNO) preprocessor symbols.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::feraiseexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feraiseexcept).
+	/// \param excepts OR of exceptions to raise
+	/// \retval 0 all selected exceptions raised successfully
+	inline int feraiseexcept(int excepts) { detail::errflags() |= excepts; detail::raise(excepts); return 0; }
+
+	/// Save exception flags.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
+	/// \param flagp address to store flag state at
+	/// \param excepts OR of flags to save
+	/// \retval 0 for success
+	inline int fegetexceptflag(int *flagp, int excepts) { *flagp = detail::errflags() & excepts; return 0; }
+
+	/// Restore exception flags.
+	/// This only copies the specified exception state (including unset flags) without incurring any additional exception handling.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, 
+	/// but in that case manual flag management is the only way to raise flags.
+	///
+	/// **See also:** Documentation for [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag).
+	/// \param flagp address to take flag state from
+	/// \param excepts OR of flags to restore
+	/// \retval 0 for success
+	inline int fesetexceptflag(const int *flagp, int excepts) { detail::errflags() = (detail::errflags()|(*flagp&excepts)) & (*flagp|~excepts); return 0; }
+
+	/// Throw C++ exceptions based on set exception flags.
+	/// This function manually throws a corresponding C++ exception if one of the specified flags is set,
+	/// no matter if automatic throwing (via [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID)) is enabled or not.
+	/// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled,
+	/// but in that case manual flag management is the only way to raise flags.
+	/// \param excepts OR of exceptions to test
+	/// \param msg error message to use for exception description
+	/// \throw std::domain_error if `FE_INVALID` or `FE_DIVBYZERO` is selected and set
+	/// \throw std::overflow_error if `FE_OVERFLOW` is selected and set
+	/// \throw std::underflow_error if `FE_UNDERFLOW` is selected and set
+	/// \throw std::range_error if `FE_INEXACT` is selected and set
+	inline void fethrowexcept(int excepts, const char *msg = "")
+	{
+		excepts &= detail::errflags();	// keep only the requested flags that are currently set
+		if(excepts & (FE_INVALID|FE_DIVBYZERO))	// checked first, so it wins when several selected flags are set
+			throw std::domain_error(msg);
+		if(excepts & FE_OVERFLOW)
+			throw std::overflow_error(msg);
+		if(excepts & FE_UNDERFLOW)
+			throw std::underflow_error(msg);
+		if(excepts & FE_INEXACT)	// lowest priority: only reached when none of the flags above matched
+			throw std::range_error(msg);
+	}
+	/// \}
+}
+
+
+#undef HALF_UNUSED_NOERR
+#undef HALF_CONSTEXPR
+#undef HALF_CONSTEXPR_CONST
+#undef HALF_CONSTEXPR_NOERR
+#undef HALF_NOEXCEPT
+#undef HALF_NOTHROW
+#undef HALF_THREAD_LOCAL
+#undef HALF_TWOS_COMPLEMENT_INT
+#ifdef HALF_POP_WARNINGS
+	#pragma warning(pop)
+	#undef HALF_POP_WARNINGS
+#endif
+
+#endif
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index f6f01790..240227f0 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -10,11 +10,11 @@ shared float g_scratchDepths[8][8];
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 layout (set = 1, binding = 0) uniform sampler2D depthImage;
-layout (r32f, set = 1, binding = 1) uniform image2D outDepth0;
-layout (r32f, set = 1, binding = 2) uniform image2D outDepth1;
-layout (r32f, set = 1, binding = 3) uniform image2D outDepth2;
-layout (r32f, set = 1, binding = 4) uniform image2D outDepth3;
-layout (r32f, set = 1, binding = 5) uniform image2D outDepth4;
+layout (r16f, set = 1, binding = 1) uniform image2D outDepth0;
+layout (r16f, set = 1, binding = 2) uniform image2D outDepth1;
+layout (r16f, set = 1, binding = 3) uniform image2D outDepth2;
+layout (r16f, set = 1, binding = 4) uniform image2D outDepth3;
+layout (r16f, set = 1, binding = 5) uniform image2D outDepth4;
 
 layout (push_constant) uniform PushConstants {
     vec2 ndcToViewMult;
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index 421a7cac..cf81f709 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -21,6 +21,7 @@
 #include "src/core/game_object/renderable.h"
 #include "src/core/scene/serializer.h"
 #include "src/util/file.h"
+#include "src/util/math_utils.h"
 
 namespace will_engine
 {
@@ -316,7 +317,8 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                         };
 
                         vk_helpers::saveImageR32F(*engine->resourceManager, *engine->immediate, engine->depthImage,
-                                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_DEPTH_BIT, path.string().c_str(), depthNormalize);
+                                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_DEPTH_BIT, path.string().c_str(),
+                                                  depthNormalize);
                     }
                     else {
                         fmt::print(" Failed to find/create image save path directory");
@@ -335,7 +337,8 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                             return pixel;
                         };
                         vk_helpers::savePacked32Bit(*engine->resourceManager, *engine->immediate, engine->normalRenderTarget,
-                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, path.string().c_str(), unpackFunc);
+                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, path.string().c_str(),
+                                                    unpackFunc);
                     }
                     else {
                         fmt::print(" Failed to save normal render target");
@@ -351,7 +354,8 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                             return pixel;
                         };
                         vk_helpers::savePacked32Bit(*engine->resourceManager, *engine->immediate, engine->albedoRenderTarget,
-                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, path.string().c_str(), unpackFunc);
+                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, path.string().c_str(),
+                                                    unpackFunc);
                     }
                     else {
                         fmt::print(" Failed to save albedo render target");
@@ -367,7 +371,8 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                             return pixel;
                         };
                         vk_helpers::savePacked32Bit(*engine->resourceManager, *engine->immediate, engine->pbrRenderTarget,
-                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, path.string().c_str(), unpackFunc);
+                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT, path.string().c_str(),
+                                                    unpackFunc);
                     }
                     else {
                         fmt::print(" Failed to save pbr render target");
@@ -705,8 +710,9 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                                 Texture* randomTexture = engine->assetManager->getAnyTexture();
 
                                 currentlySelectedTexture = randomTexture->getTextureResource();
-                                currentlySelectedTextureImguiId = ImGui_ImplVulkan_AddTexture(engine->resourceManager->getDefaultSamplerLinear(), currentlySelectedTexture->getTexture().imageView,
-                                                                                              VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+                                currentlySelectedTextureImguiId = ImGui_ImplVulkan_AddTexture(
+                                    engine->resourceManager->getDefaultSamplerLinear(), currentlySelectedTexture->getTexture().imageView,
+                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
                             }
                         }
 
@@ -723,7 +729,8 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                             for (Texture* texture : engine->assetManager->getAllTextures()) {
                                 bool isSelected = (currentlySelectedTexture->getId() == texture->getId());
 
-                                std::string label = fmt::format("[{}] Texture ID - {}", texture->isTextureResourceLoaded() ? "LOADED" : "NOT LOADED", texture->getId());
+                                std::string label = fmt::format("[{}] Texture ID - {}", texture->isTextureResourceLoaded() ? "LOADED" : "NOT LOADED",
+                                                                texture->getId());
 
                                 if (ImGui::Selectable(label.c_str(), isSelected)) {
                                     if (currentlySelectedTextureImguiId != VK_NULL_HANDLE) {
@@ -734,8 +741,9 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                                     }
 
                                     currentlySelectedTexture = texture->getTextureResource();
-                                    currentlySelectedTextureImguiId = ImGui_ImplVulkan_AddTexture(engine->resourceManager->getDefaultSamplerLinear(), currentlySelectedTexture->getTexture().imageView,
-                                                                                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+                                    currentlySelectedTextureImguiId = ImGui_ImplVulkan_AddTexture(
+                                        engine->resourceManager->getDefaultSamplerLinear(), currentlySelectedTexture->getTexture().imageView,
+                                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
                                 }
                             }
 
@@ -879,15 +887,17 @@ void ImguiWrapper::imguiInterface(Engine* engine)
             if (file::getOrCreateDirectory(file::imagesSavePath)) {
                 const std::filesystem::path path = file::imagesSavePath / "gtao_depth.png";
 
-                auto depthNormalize = [](const float depth) {
-                    return depth / 1000.f;
+                auto depthNormalize = [](const uint16_t depth) {
+                    // `depth` is the raw 16-bit texel read back from the R16F image; decode it with the half library.
+                    // math::halfToFloat(depth) was noted as equivalent, so only one conversion is kept (no unused local).
+                    const float decodedDepth = half_float::detail::half2float<float>(depth);
+                    return decodedDepth / 1000.f;
+                };
 
-                vk_helpers::saveImageR32F(
+                vk_helpers::saveImageR16F(
                     *engine->resourceManager,
                     *engine->immediate,
                     engine->ambientOcclusionPipeline->depthPrefilterImage,
-                    //engine->depthImage,
                     VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                     VK_IMAGE_ASPECT_COLOR_BIT,
                     path.string().c_str(),
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index fc22b271..7ee4589e 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -351,7 +351,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     //vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::clearColorImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL);
     // Depth Prefilter
-    {;
+    {
         vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipeline);
         vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
 
@@ -377,7 +377,6 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
     vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     // vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    //
     // // Ambient Occlusion
     // {
     //     vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
@@ -398,12 +397,11 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     //     const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
     //     const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
     //     vkCmdDispatch(cmd, x, y, 1);
-    //     vkCmdEndRendering(cmd);
     // }
     //
     //
     // vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    //
+
 
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index f45c6a72..60ae2cbb 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -42,7 +42,7 @@ class GroundTruthAmbientOcclusionPipeline
     VkSampler depthPrefilterSampler{VK_NULL_HANDLE};
 
     // 16 vs 32. look at cost later.
-    VkFormat depthPrefilterFormat{VK_FORMAT_R32_SFLOAT};
+    VkFormat depthPrefilterFormat{VK_FORMAT_R16_SFLOAT};
     AllocatedImage depthPrefilterImage{VK_NULL_HANDLE};
     std::array<VkImageView, DEPTH_PREFILTER_MIP_COUNT> depthPrefilterImageViews{};
 
diff --git a/src/renderer/vk_helpers.cpp b/src/renderer/vk_helpers.cpp
index 73e5e9e0..834dfb50 100644
--- a/src/renderer/vk_helpers.cpp
+++ b/src/renderer/vk_helpers.cpp
@@ -9,7 +9,7 @@
 #include <stb/stb_image.h>
 #include <stb/stb_image_write.h>
 #include "volk/volk.h"
-#include "extern/half/half/half.hpp"
+#include "extern/half/half.hpp"
 
 #include "immediate_submitter.h"
 #include "resource_manager.h"
@@ -777,47 +777,49 @@ void will_engine::vk_helpers::saveImageR32F(const ResourceManager& resourceManag
 }
 
 void will_engine::vk_helpers::saveImageR16F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout,
-    VkImageAspectFlags aspectFlag, const char* savePath, const std::function<float(half_float::half)>& valueTransform)
+    VkImageAspectFlags aspectFlag, const char* savePath, const std::function<float(uint16_t)>& valueTransform, int32_t mipLevel)
 {
-    using half_float::half;
-    const size_t dataSize = image.imageExtent.width * image.imageExtent.height * 1 * sizeof(half);
+    size_t newXSize = image.imageExtent.width / static_cast<size_t>(std::pow(2, mipLevel));
+    size_t newYSize = image.imageExtent.height / static_cast<size_t>(std::pow(2, mipLevel));
+    const size_t texelCount = newXSize * newYSize;
+    const size_t dataSize = texelCount * 1 * sizeof(uint16_t);
     AllocatedBuffer receivingBuffer = resourceManager.createReceivingBuffer(dataSize);
 
-    immediate.submit([&](VkCommandBuffer cmd) {
+    immediate.submit([&, mipLevel](VkCommandBuffer cmd) {
         VkBufferImageCopy bufferCopyRegion{};
         bufferCopyRegion.imageSubresource.aspectMask = aspectFlag;
-        bufferCopyRegion.imageSubresource.mipLevel = 0;
+        bufferCopyRegion.imageSubresource.mipLevel = mipLevel;
         bufferCopyRegion.imageSubresource.baseArrayLayer = 0;
         bufferCopyRegion.imageSubresource.layerCount = 1;
-        bufferCopyRegion.imageExtent = image.imageExtent;
+        bufferCopyRegion.imageExtent = {static_cast<uint32_t>(newXSize), static_cast<uint32_t>(newYSize), 1u};
         bufferCopyRegion.bufferOffset = 0;
         bufferCopyRegion.bufferRowLength = 0;
         bufferCopyRegion.bufferImageHeight = 0;
 
-        vk_helpers::transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, aspectFlag);
+        transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, aspectFlag);
 
         vkCmdCopyImageToBuffer(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, receivingBuffer.buffer, 1, &bufferCopyRegion);
 
-        vk_helpers::transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, aspectFlag);
+        transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, aspectFlag);
     });
 
     void* data = receivingBuffer.info.pMappedData;
-    const auto imageData = static_cast<half*>(data);
+    const auto imageData = static_cast<uint16_t*>(data);
 
-    const auto byteImageData = new uint8_t[image.imageExtent.width * image.imageExtent.height * 4];
+
+    const auto byteImageData = new uint8_t[texelCount * 4];
     const auto powEight = static_cast<float>(pow(2, 8) - 1);
-    for (size_t i = 0; i < image.imageExtent.width * image.imageExtent.height; ++i) {
-        half originalData = imageData[i];
-        float floatData = half_float::detail::half2float(originalData);
-        const float halfValue = valueTransform(originalData);
-        const auto value = static_cast<uint8_t>(halfValue * powEight);
+    for (size_t i = 0; i < texelCount; ++i) {
+        const uint16_t rvalue = imageData[i];
+        const float floatValue = valueTransform(rvalue);
+        const auto value = static_cast<uint8_t>(floatValue * powEight);
         byteImageData[i * 4 + 0] = value;
         byteImageData[i * 4 + 1] = value;
         byteImageData[i * 4 + 2] = value;
         byteImageData[i * 4 + 3] = 255;
     }
 
-    stbi_write_png(savePath, image.imageExtent.width, image.imageExtent.height, 4, byteImageData, image.imageExtent.width * 4);
+    stbi_write_png(savePath, static_cast<int>(newXSize), static_cast<int>(newYSize), 4, byteImageData, static_cast<int>(newXSize) * 4);
 
     delete[] byteImageData;
     resourceManager.destroyBuffer(receivingBuffer);
diff --git a/src/renderer/vk_helpers.h b/src/renderer/vk_helpers.h
index 3a5358ce..ad8c6455 100644
--- a/src/renderer/vk_helpers.h
+++ b/src/renderer/vk_helpers.h
@@ -12,7 +12,7 @@
 
 #include <fmt/format.h>
 #include <glm/glm.hpp>
-#include <half/half/half.hpp>
+#include <half/half.hpp>
 
 #include "vk_types.h"
 
@@ -55,9 +55,11 @@ namespace vk_helpers
     VkRenderingAttachmentInfo attachmentInfo(VkImageView view, const VkClearValue* clear,
                                              VkImageLayout layout /*= VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL*/);
 
-    VkRenderingInfo renderingInfo(VkExtent2D renderExtent, const VkRenderingAttachmentInfo* colorAttachment, const VkRenderingAttachmentInfo* depthAttachment);
+    VkRenderingInfo renderingInfo(VkExtent2D renderExtent, const VkRenderingAttachmentInfo* colorAttachment,
+                                  const VkRenderingAttachmentInfo* depthAttachment);
 
-    VkSubmitInfo2 submitInfo(const VkCommandBufferSubmitInfo* cmd, const VkSemaphoreSubmitInfo* signalSemaphoreInfo, const VkSemaphoreSubmitInfo* waitSemaphoreInfo);
+    VkSubmitInfo2 submitInfo(const VkCommandBufferSubmitInfo* cmd, const VkSemaphoreSubmitInfo* signalSemaphoreInfo,
+                             const VkSemaphoreSubmitInfo* waitSemaphoreInfo);
 
     VkPresentInfoKHR presentInfo();
 
@@ -78,7 +80,8 @@ namespace vk_helpers
      */
     VkDeviceSize getAlignedSize(VkDeviceSize value, VkDeviceSize alignment);
 
-    void clearColorImage(VkCommandBuffer cmd, VkImage image, VkImageLayout srcLayout, VkImageLayout dstLayout, VkClearColorValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f});
+    void clearColorImage(VkCommandBuffer cmd, VkImage image, VkImageLayout srcLayout, VkImageLayout dstLayout,
+                         VkClearColorValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f});
 
     void transitionImage(VkCommandBuffer cmd, VkImage image, VkImageLayout currentLayout, VkImageLayout targetLayout, VkImageAspectFlags aspectMask);
 
@@ -98,26 +101,30 @@ namespace vk_helpers
     VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo(VkShaderStageFlagBits stage, VkShaderModule shaderModule,
                                                                   const char* entry = "main");
 
-    void saveImageRGBA32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                          const char* savePath, bool overrideAlpha = true);
+    void saveImageRGBA32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                          VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath, bool overrideAlpha = true);
 
-    void saveImageRGBA16SFLOAT(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                               const char* savePath, bool overrideAlpha = true);
+    void saveImageRGBA16SFLOAT(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                               VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath, bool overrideAlpha = true);
 
-    void savePacked32Bit(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                         const char* savePath, const std::function<glm::vec4(uint32_t)>& unpackingFunction);
+    void savePacked32Bit(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                         VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath,
+                         const std::function<glm::vec4(uint32_t)>& unpackingFunction);
 
-    void savePacked64Bit(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                         const char* savePath, const std::function<glm::vec4(uint64_t)>& unpackingFunction);
+    void savePacked64Bit(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                         VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath,
+                         const std::function<glm::vec4(uint64_t)>& unpackingFunction);
 
     /**
      * Save the Allocated image as a grayscaled image. The image must be a format with only 1 channel (e.g. R32 or D32)
      */
-    void saveImageR32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                       const char* savePath, const std::function<float(float)>& valueTransform, int32_t mipLevel = 0);
+    void saveImageR32F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                       VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath,
+                       const std::function<float(float)>& valueTransform, int32_t mipLevel = 0);
 
-    void saveImageR16F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image, VkImageLayout imageLayout, VkImageAspectFlags aspectFlag,
-                       const char* savePath, const std::function<float(half_float::half)>& valueTransform);
+    void saveImageR16F(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                       VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath,
+                       const std::function<float(uint16_t)>& valueTransform, int32_t mipLevel = 0);
 
     void saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha = true);
 
diff --git a/src/util/math_utils.h b/src/util/math_utils.h
index ab4f8572..d4667f67 100644
--- a/src/util/math_utils.h
+++ b/src/util/math_utils.h
@@ -25,6 +25,32 @@ inline void decomposeMatrix(const glm::mat4& matrix, glm::vec3& position, glm::q
     );
     rotation = quat_cast(rotMat);
 }
+
+inline uint32_t as_uint(const float x) // Reads the storage of a float as a uint32_t (bit reinterpretation, not a numeric conversion).
+{
+    return *(uint32_t*) &x; // NOTE(review): pointer-cast type punning; std::memcpy / std::bit_cast is the well-defined form - assumes 32-bit float, confirm
+}
+
+inline float as_float(uint32_t x) // Reads the storage of a uint32_t as a float (bit reinterpretation, not a numeric conversion).
+{
+    return *(float*) &x; // NOTE(review): pointer-cast type punning; std::memcpy / std::bit_cast is the well-defined form - assumes 32-bit float, confirm
+}
+
+/**
+ * Decodes a 16-bit half-precision bit pattern into a float. Source: https://stackoverflow.com/a/60047308
+ * @param x raw 16-bit half value: 1 sign bit, 5 exponent bits, 10 mantissa bits
+ * @return the decoded float; exponent 31 (the inf/NaN encodings) is not special-cased and is decoded like a normal number
+ */
+inline float halfToFloat(const uint16_t x)
+{
+    // IEEE-754 16-bit floating-point format (without infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, +-5.9604645E-8, 3.311 digits
+    const uint32_t e = (x & 0x7C00) >> 10; // 5-bit biased exponent
+    const uint32_t m = (x & 0x03FF) << 13; // 10-bit mantissa, shifted up into the top of the float's 23-bit mantissa field
+    const uint32_t v = as_uint((float) m) >> 23; // evil log2 bit hack to count leading zeros in denormalized format
+    return as_float(
+        (x & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) | ((e == 0) & (m != 0)) * (
+            (v - 37) << 23 | ((m << (150 - v)) & 0x007FE000))); // sign : normalized (+112 rebiases exponent 15 -> 127) : denormalized
+}
 }
 
 #endif //MATH_UTILS_H

From 26e8586c928046ca53e8e19a3df6c6b851ef2d00 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Fri, 28 Mar 2025 21:05:06 +0700
Subject: [PATCH 15/27] Half License. GTAO main pass setup.

---
 licenses/half/ChangeLog.txt                   | 222 ++++++++++++
 licenses/half/{LICENCE => LICENSE.txt}        |   2 +-
 licenses/half/README.md                       |  13 -
 licenses/half/README.txt                      | 317 ++++++++++++++++++
 .../ground_truth/gtao_main_pass.comp          |  28 ++
 src/renderer/imgui_wrapper.cpp                |  18 +
 .../ground_truth_ambient_occlusion.cpp        |  58 ++--
 .../deferred_resolve/deferred_resolve.cpp     |  42 +--
 src/renderer/vk_helpers.cpp                   |  47 +++
 src/renderer/vk_helpers.h                     |   3 +
 10 files changed, 687 insertions(+), 63 deletions(-)
 create mode 100644 licenses/half/ChangeLog.txt
 rename licenses/half/{LICENCE => LICENSE.txt} (96%)
 delete mode 100644 licenses/half/README.md
 create mode 100644 licenses/half/README.txt

diff --git a/licenses/half/ChangeLog.txt b/licenses/half/ChangeLog.txt
new file mode 100644
index 00000000..766603c9
--- /dev/null
+++ b/licenses/half/ChangeLog.txt
@@ -0,0 +1,222 @@
+Release Notes
+=============
+
+2.2.1 release (2025-03-02):
+---------------------------
+
+- Fixed error when converting minimum negative integer value to half.
+- Added more type conversions to remove warnings on some 64-bit platforms.
+- Removed potential naming conflicts of internal functions with C standard 
+  library functions.
+
+
+2.2.0 release (2021-06-12):
+---------------------------
+
+- Added `rsqrt` function for inverse square root.
+- Improved performance of `pow` function.
+- Fixed bug that forgot to include `<immintrin.h>` for F16C intrinsics.
+
+
+2.1.0 release (2019-08-05):
+---------------------------
+
+- Added detection of IEEE floating-point exceptions to operators and functions.
+- Added configuration options for automatic exception handling.
+- Added functions for explicitly managing floating-point exception flags.
+- Improved accuracy of `pow` and `atan2` functions.
+
+
+2.0.0 release (2019-07-23):
+---------------------------
+
+- Made internal implementation independent from built-in floating point 
+  facilities for increased reliability and IEEE-conformance.
+- Changed default rounding mode to rounding to nearest.
+- Always round ties to even when rounding to nearest.
+- Extended `constexpr` support to comparison and classification functions.
+- Added support for F16C compiler intrinsics for conversions.
+- Enabled C++11 feature detection for Intel compilers.
+
+
+1.12.0 release (2017-03-06):
+----------------------------
+
+- Changed behaviour of `half_cast` to perform conversions to/from `double` 
+  and `long double` directly according to specified rounding mode, without an 
+  intermediate `float` conversion.
+- Added `noexcept` specifiers to constructors.
+- Fixed minor portability problem with `logb` and `ilogb`.
+- Tested for *VC++ 2015*.
+
+
+1.11.0 release (2013-11-16):
+----------------------------
+
+- Made tie-breaking behaviour in round to nearest configurable by 
+  `HALF_ROUND_TIES_TO_EVEN` macro.
+- Completed support for all C++11 mathematical functions even if single-
+  precision versions from `<cmath>` are unsupported.
+- Fixed inability to disable support for C++11 mathematical functions on 
+  *VC++ 2013*.
+
+
+1.10.0 release (2013-11-09):
+----------------------------
+
+- Made default rounding mode configurable by `HALF_ROUND_STYLE` macro.
+- Added support for non-IEEE single-precision implementations.
+- Added `HALF_ENABLE_CPP11_TYPE_TRAITS` preprocessor flag for checking 
+  support for C++11 type traits and TMP features.
+- Restricted `half_cast` to support built-in arithmetic types only.
+- Changed behaviour of `half_cast` to respect rounding mode when casting 
+  to/from integer types.
+
+
+1.9.2 release (2013-11-01):
+---------------------------
+
+- Tested for *gcc 4.8*.
+- Tested and fixed for *VC++ 2013*.
+- Removed unnecessary warnings in *MSVC*.
+
+
+1.9.1 release (2013-08-08):
+---------------------------
+
+- Fixed problems with older gcc and MSVC versions.
+- Small fix to non-C++11 implementations of `remainder` and `remquo`.
+
+
+1.9.0 release (2013-08-07):
+---------------------------
+
+- Changed behaviour of `nearbyint`, `rint`, `lrint` and `llrint` to use 
+  rounding mode of half-precision implementation (which is 
+  truncating/indeterminate) instead of single-precision rounding mode.
+- Added support for more C++11 mathematical functions even if single-
+  precision versions from `<cmath>` are unsupported, in particular 
+  `remainder`, `remquo` and `cbrt`.
+- Minor implementation changes.
+
+
+1.8.1 release (2013-01-22):
+---------------------------
+
+- Fixed bug resulting in multiple definitions of the `nanh` function due to 
+  a missing `inline` specification.
+
+
+1.8.0 release (2013-01-19):
+---------------------------
+
+- Added support for more C++11 mathematical functions even if single-
+  precision versions from `<cmath>` are unsupported, in particular 
+  exponential and logarithm functions, hyperbolic area functions and the 
+  hypotenuse function.
+- Made `fma` function use default implementation if single-precision version
+  from `<cmath>` is not faster and thus `FP_FAST_FMAH` to be defined always.
+- Fixed overload resolution issues when invoking certain mathematical 
+  functions by unqualified calls.
+
+
+1.7.0 release (2012-10-26):
+---------------------------
+
+- Added support for C++11 `noexcept` specifiers.
+- Changed C++11 `long long` to be supported on *VC++ 2003* and up.
+
+
+1.6.1 release (2012-09-13):
+---------------------------
+
+- Made `fma` and `fdim` functions available even if corresponding 
+  single-precision functions are not.
+
+
+1.6.0 release (2012-09-12):
+---------------------------
+
+- Added `HALF_ENABLE_CPP11_LONG_LONG` to control support for `long long` 
+  integers and corresponding mathematical functions.
+- Fixed C++98 compatibility on non-VC compilers.
+
+
+1.5.1 release (2012-08-17):
+---------------------------
+
+- Recorrected `std::numeric_limits::round_style` to always return 
+  `std::round_indeterminate`, due to overflow-handling deviating from 
+  correct round-toward-zero behaviour.
+
+
+1.5.0 release (2012-08-16):
+---------------------------
+
+- Added `half_cast` for explicitly casting between half and any type 
+  convertible to/from `float` and allowing the explicit specification of 
+  the rounding mode to use.
+
+
+1.4.0 release (2012-08-12):
+---------------------------
+
+- Added support for C++11 generalized constant expressions (`constexpr`).
+
+
+1.3.1 release (2012-08-11):
+---------------------------
+
+- Fixed requirement for `std::signbit` and `std::isnan` (even if C++11 
+  `<cmath>` functions disabled) on non-VC compilers.
+
+
+1.3.0 release (2012-08-10):
+---------------------------
+
+- Made requirement for `<cstdint>` and `static_assert` optional and thus 
+  made the library C++98-compatible.
+- Made support for C++11 features user-overridable through explicit 
+  definition of corresponding preprocessor symbols to either 0 or 1.
+- Renamed `HALF_ENABLE_HASH` to `HALF_ENABLE_CPP11_HASH` in correspondence 
+  with other C++11 preprocessor symbols.
+
+
+1.2.0 release (2012-08-07):
+---------------------------
+
+- Added proper preprocessor definitions for `HUGE_VALH` and `FP_FAST_FMAH` 
+  in correspondence with their single-precision counterparts from `<cmath>`.
+- Fixed internal preprocessor macros to be properly undefined after use.
+
+
+1.1.2 release (2012-08-07):
+---------------------------
+
+- Revised `std::numeric_limits::round_style` to return 
+  `std::round_toward_zero` if the `float` version also does and 
+  `std::round_indeterminate` otherwise.
+- Fixed `std::numeric_limits::round_error` to reflect worst-case round 
+  toward zero behaviour.
+
+
+1.1.1 release (2012-08-06):
+---------------------------
+
+- Fixed `std::numeric_limits::min` to return smallest positive normal 
+  number, instead of subnormal number.
+- Fixed `std::numeric_limits::round_style` to return 
+  `std::round_indeterminate` due to mixture of separately rounded 
+  single-precision arithmetics with truncating single-to-half conversions.
+
+
+1.1.0 release (2012-08-06):
+---------------------------
+
+- Added half-precision literals.
+
+
+1.0.0 release (2012-08-05):
+---------------------------
+
+- First release.
diff --git a/licenses/half/LICENCE b/licenses/half/LICENSE.txt
similarity index 96%
rename from licenses/half/LICENCE
rename to licenses/half/LICENSE.txt
index abee50b1..8579da1b 100644
--- a/licenses/half/LICENCE
+++ b/licenses/half/LICENSE.txt
@@ -1,6 +1,6 @@
 The MIT License
 
-Copyright (c) 2012-2017 Christian Rau
+Copyright (c) 2012-2025 Christian Rau
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/licenses/half/README.md b/licenses/half/README.md
deleted file mode 100644
index 52306d39..00000000
--- a/licenses/half/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Upstream
-
-This library was developed by CHRISTIAN RAU. Upstream repository is a https://sourceforge.net/projects/half/ .
-Only half.hpp header file has been imported from upstream.
-
-# License
-
-Original upstream's MIT License. See LICENSE file in the repository root.
-
-# Update
-
-This repository should be kept up-to-date with the upstream.
-
diff --git a/licenses/half/README.txt b/licenses/half/README.txt
new file mode 100644
index 00000000..6a97fb42
--- /dev/null
+++ b/licenses/half/README.txt
@@ -0,0 +1,317 @@
+HALF-PRECISION FLOATING-POINT LIBRARY (Version 2.2.1)
+-----------------------------------------------------
+
+This is a C++ header-only library to provide an IEEE 754 conformant 16-bit 
+half-precision floating-point type along with corresponding arithmetic 
+operators, type conversions and common mathematical functions. It aims for both 
+efficiency and ease of use, trying to accurately mimic the behaviour of the 
+built-in floating-point types at the best performance possible.
+
+
+INSTALLATION AND REQUIREMENTS
+-----------------------------
+
+Conveniently, the library consists of just a single header file containing all 
+the functionality, which can be directly included by your projects, without the 
+neccessity to build anything or link to anything.
+
+Whereas this library is fully C++98-compatible, it can profit from certain 
+C++11 features. Support for those features is checked automatically at compile 
+(or rather preprocessing) time, but can be explicitly enabled or disabled by 
+predefining the corresponding preprocessor symbols to either 1 or 0 yourself 
+before including half.hpp. This is useful when the automatic detection fails 
+(for more exotic implementations) or when a feature should be explicitly 
+disabled:
+
+  - 'long long' integer type for mathematical functions returning 'long long' 
+    results (enabled for VC++ 2003 and icc 11.1 and newer, gcc and clang, 
+    overridable with 'HALF_ENABLE_CPP11_LONG_LONG').
+
+  - Static assertions for extended compile-time checks (enabled for VC++ 2010, 
+    gcc 4.3, clang 2.9, icc 11.1 and newer, overridable with 
+    'HALF_ENABLE_CPP11_STATIC_ASSERT').
+
+  - Generalized constant expressions (enabled for VC++ 2015, gcc 4.6, clang 3.1, 
+    icc 14.0 and newer, overridable with 'HALF_ENABLE_CPP11_CONSTEXPR').
+
+  - noexcept exception specifications (enabled for VC++ 2015, gcc 4.6, 
+    clang 3.0, icc 14.0 and newer, overridable with 'HALF_ENABLE_CPP11_NOEXCEPT').
+
+  - User-defined literals for half-precision literals to work (enabled for 
+    VC++ 2015, gcc 4.7, clang 3.1, icc 15.0 and newer, overridable with 
+    'HALF_ENABLE_CPP11_USER_LITERALS').
+
+  - Thread-local storage for per-thread floating-point exception flags (enabled 
+    for VC++ 2015, gcc 4.8, clang 3.3, icc 15.0 and newer, overridable with 
+    'HALF_ENABLE_CPP11_THREAD_LOCAL').
+
+  - Type traits and template meta-programming features from <type_traits> 
+    (enabled for VC++ 2010, libstdc++ 4.3, libc++ and newer, overridable with 
+    'HALF_ENABLE_CPP11_TYPE_TRAITS').
+
+  - Special integer types from <cstdint> (enabled for VC++ 2010, libstdc++ 4.3, 
+    libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CSTDINT').
+
+  - Certain C++11 single-precision mathematical functions from <cmath> for 
+    floating-point classification during conversions from higher precision types 
+    (enabled for VC++ 2013, libstdc++ 4.3, libc++ and newer, overridable with 
+    'HALF_ENABLE_CPP11_CMATH').
+
+  - Floating-point environment control from <cfenv> for possible exception 
+    propagation to the built-in floating-point platform (enabled for VC++ 2013, 
+    libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CFENV').
+
+  - Hash functor 'std::hash' from <functional> (enabled for VC++ 2010, 
+    libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_HASH').
+
+The library has been tested successfully with Visual C++ 2005-2015, gcc 4-8 
+and clang 3-8 on 32- and 64-bit x86 systems. Please contact me if you have any 
+problems, suggestions or even just success testing it on other platforms.
+
+
+DOCUMENTATION
+-------------
+
+What follows are some general words about the usage of the library and its 
+implementation. For a complete documentation of its interface consult the 
+corresponding website http://half.sourceforge.net. You may also generate the 
+complete developer documentation from the library's only include file's doxygen 
+comments, but this is more relevant to developers rather than mere users.
+
+BASIC USAGE
+
+To make use of the library just include its only header file half.hpp, which 
+defines all half-precision functionality inside the 'half_float' namespace. The 
+actual 16-bit half-precision data type is represented by the 'half' type, which 
+uses the standard IEEE representation with 1 sign bit, 5 exponent bits and 11 
+mantissa bits (including the hidden bit) and supports all types of special 
+values, like subnormal values, infinity and NaNs. This type behaves like the 
+built-in floating-point types as much as possible, supporting the usual 
+arithmetic, comparison and streaming operators, which makes its use pretty 
+straight-forward:
+
+    using half_float::half;
+    half a(3.4), b(5);
+    half c = a * b;
+    c += 3;
+    if(c > a)
+        std::cout << c << std::endl;
+
+Additionally the 'half_float' namespace also defines half-precision versions 
+for all mathematical functions of the C++ standard library, which can be used 
+directly through ADL:
+
+    half a(-3.14159);
+    half s = sin(abs(a));
+    long l = lround(s);
+
+You may also specify explicit half-precision literals, since the library 
+provides a user-defined literal inside the 'half_float::literal' namespace, 
+which you just need to import (assuming support for C++11 user-defined literals):
+
+    using namespace half_float::literal;
+    half x = 1.0_h;
+
+Furthermore the library provides proper specializations for 
+'std::numeric_limits', defining various implementation properties, and 
+'std::hash' for hashing half-precision numbers (assuming support for C++11 
+'std::hash'). Similar to the corresponding preprocessor symbols from <cmath> 
+the library also defines the 'HUGE_VALH' constant and maybe the 'FP_FAST_FMAH' 
+symbol.
+
+CONVERSIONS AND ROUNDING
+
+The half is explicitly constructible/convertible from a single-precision float 
+argument. Thus it is also explicitly constructible/convertible from any type 
+implicitly convertible to float, but constructing it from types like double or 
+int will involve the usual warnings arising when implicitly converting those to 
+float because of the lost precision. On the one hand those warnings are 
+intentional, because converting those types to half necessarily also reduces 
+precision. But on the other hand they are raised for explicit conversions from 
+those types, when the user knows what they are doing. So if those warnings keep 
+bugging you, then you won't get around first explicitly converting to float 
+before converting to half, or use the 'half_cast' described below. In addition 
+you can also directly assign float values to halfs.
+
+In contrast to the float-to-half conversion, which reduces precision, the 
+conversion from half to float (and thus to any other type implicitly 
+convertible from float) is implicit, because all values representable with 
+half-precision are also representable with single-precision. This way the 
+half-to-float conversion behaves similar to the builtin float-to-double 
+conversion and all arithmetic expressions involving both half-precision and 
+single-precision arguments will be of single-precision type. This way you can 
+also directly use the mathematical functions of the C++ standard library, 
+though in this case you will invoke the single-precision versions which will 
+also return single-precision values, which is (even if maybe performing the 
+exact same computation, see below) not as conceptually clean when working in a 
+half-precision environment.
+
+The default rounding mode for conversions between half and more precise types 
+as well as for rounding results of arithmetic operations and mathematical 
+functions rounds to the nearest representable value. But by predefining the 
+'HALF_ROUND_STYLE' preprocessor symbol this default can be overridden with one 
+of the other standard rounding modes using their respective constants or the 
+equivalent values of 'std::float_round_style' (it can even be synchronized with 
+the built-in single-precision implementation by defining it to 
+'std::numeric_limits<float>::round_style'):
+
+  - 'std::round_indeterminate' (-1) for the fastest rounding.
+
+  - 'std::round_toward_zero' (0) for rounding toward zero.
+
+  - 'std::round_to_nearest' (1) for rounding to the nearest value (default).
+
+  - 'std::round_toward_infinity' (2) for rounding toward positive infinity.
+
+  - 'std::round_toward_neg_infinity' (3) for rounding toward negative infinity.
+
+In addition to changing the overall default rounding mode one can also use the 
+'half_cast'. This converts between half and any built-in arithmetic type using 
+a configurable rounding mode (or the default rounding mode if none is 
+specified). In addition to a configurable rounding mode, 'half_cast' has 
+another big difference to a mere 'static_cast': Any conversions are performed 
+directly using the given rounding mode, without any intermediate conversion 
+to/from 'float'. This is especially relevant for conversions to integer types, 
+which don't necessarily truncate anymore. But also for conversions from 
+'double' or 'long double' this may produce more precise results than a 
+pre-conversion to 'float' using the single-precision implementation's current 
+rounding mode would.
+
+    half a = half_cast<half>(4.2);
+    half b = half_cast<half,std::numeric_limits<float>::round_style>(4.2f);
+    assert( half_cast<int, std::round_to_nearest>( 0.7_h )     == 1 );
+    assert( half_cast<half,std::round_toward_zero>( 4097 )     == 4096.0_h );
+    assert( half_cast<half,std::round_toward_infinity>( 4097 ) == 4100.0_h );
+    assert( half_cast<half,std::round_toward_infinity>( std::numeric_limits<double>::min() ) > 0.0_h );
+
+ACCURACY AND PERFORMANCE
+
+From version 2.0 onward the library is implemented without employing the 
+underlying floating-point implementation of the system (except for conversions, 
+of course), providing an entirely self-contained half-precision implementation 
+with results independent from the system's existing single- or double-precision 
+implementation and its rounding behaviour.
+
+As to accuracy, many of the operators and functions provided by this library 
+are exact to rounding for all rounding modes, i.e. the error to the exact 
+result is at most 0.5 ULP (unit in the last place) for rounding to nearest and 
+less than 1 ULP for all other rounding modes. This holds for all the operations 
+required by the IEEE 754 standard and many more. Specifically the following 
+functions might exhibit a deviation from the correctly rounded exact result by 
+1 ULP for a select few input values: 'expm1', 'log1p', 'pow', 'atan2', 'erf', 
+'erfc', 'lgamma', 'tgamma' (for more details see the documentation of the 
+individual functions). All other functions and operators are always exact to 
+rounding or independent of the rounding mode altogether.
+
+The increased IEEE-conformance and cleanliness of this implementation comes 
+with a certain performance cost compared to doing computations and mathematical 
+functions in hardware-accelerated single-precision. On average and depending on 
+the platform, the arithmetic operators are about 75% as fast and the 
+mathematical functions about 33-50% as fast as performing the corresponding 
+operations in single-precision and converting between the inputs and outputs. 
+However, directly computing with half-precision values is a rather rare 
+use-case and usually using actual 'float' values for all computations and 
+temporaries and using 'half's only for storage is the recommended way. But 
+nevertheless the goal of this library was to provide a complete and 
+conceptually clean IEEE-conformant half-precision implementation and in the few 
+cases when you do need to compute directly in half-precision you do so for a 
+reason and want accurate results.
+
+If necessary, this internal implementation can be overridden by predefining the 
+'HALF_ARITHMETIC_TYPE' preprocessor symbol to one of the built-in 
+floating-point types ('float', 'double' or 'long double'), which will cause the 
+library to use this type for computing arithmetic operations and mathematical 
+functions (if available). However, due to using the platform's floating-point 
+implementation (and its rounding behaviour) internally, this might cause 
+results to deviate from the specified half-precision rounding mode. It will of 
+course also inhibit the automatic exception detection described below.
+
+The conversion operations between half-precision and single-precision types can 
+also make use of the F16C extension for x86 processors by using the 
+corresponding compiler intrinsics from <immintrin.h>. Support for this is 
+checked at compile-time by looking for the '__F16C__' macro which at least gcc 
+and clang define based on the target platform. It can also be enabled manually 
+by predefining the 'HALF_ENABLE_F16C_INTRINSICS' preprocessor symbol to 1, or 0 
+for explicitly disabling it. However, this will directly use the corresponding 
+intrinsics for conversion without checking if they are available at runtime 
+(possibly crashing if they are not), so make sure they are supported on the 
+target platform before enabling this.
+
+EXCEPTION HANDLING
+
+The half-precision implementation supports all 5 required floating-point 
+exceptions from the IEEE standard to indicate erroneous inputs or inexact 
+results during operations. These are represented by exception flags which 
+actually use the same values as the corresponding 'FE_...' flags defined in 
+C++11's <cfenv> header if supported, specifically:
+
+  - 'FE_INVALID' for invalid inputs to an operation.
+  - 'FE_DIVBYZERO' for finite inputs producing infinite results.
+  - 'FE_OVERFLOW' if a result is too large to represent finitely.
+  - 'FE_UNDERFLOW' for a subnormal or zero result after rounding.
+  - 'FE_INEXACT' if a result needed rounding to be representable.
+  - 'FE_ALL_EXCEPT' as a convenient OR of all possible exception flags.
+
+The internal exception flag state will start with all flags cleared and is 
+maintained per thread if C++11 thread-local storage is supported, otherwise it 
+will be maintained globally and will theoretically NOT be thread-safe (while 
+practically being as thread-safe as a simple integer variable can be). These 
+flags can be managed explicitly using the library's error handling functions, 
+which again try to mimic the built-in functions for handling floating-point 
+exceptions from <cfenv>. You can clear them with 'feclearexcept' (which is the 
+only way a flag can be cleared), test them with 'fetestexcept', explicitly 
+raise errors with 'feraiseexcept' and save and restore their state using 
+'fegetexceptflag' and 'fesetexceptflag'. You can also throw corresponding C++ 
+exceptions based on the current flag state using 'fethrowexcept'.
+
+However, any automatic exception detection and handling during half-precision 
+operations and functions is DISABLED by default, since it comes with a minor 
+performance overhead due to runtime checks, and reacting to IEEE floating-point 
+exceptions is rarely ever needed in application code. But the library fully 
+supports IEEE-conformant detection of floating-point exceptions and various 
+ways for handling them, which can be enabled by pre-defining the corresponding 
+preprocessor symbols to 1. They can be enabled individually or all at once and 
+they will be processed in the order they are listed here:
+
+  - 'HALF_ERRHANDLING_FLAGS' sets the internal exception flags described above 
+    whenever the corresponding exception occurs.
+  - 'HALF_ERRHANDLING_ERRNO' sets the value of 'errno' from <cerrno> similar to 
+    the behaviour of the built-in floating-point types when 'MATH_ERRNO' is used.
+  - 'HALF_ERRHANDLING_FENV' will propagate exceptions to the built-in 
+    floating-point implementation using 'std::feraiseexcept' if support for 
+    C++11 floating-point control is enabled. However, this does not synchronize 
+    exceptions: neither will clearing propagate nor will it work in reverse.
+  - 'HALF_ERRHANDLING_THROW_...' can be defined to a string literal which will 
+    be used as description message for a C++ exception that is thrown whenever 
+    a 'FE_...' exception occurs, similar to the behaviour of 'fethrowexcept'.
+
+If any of the above error handling is activated, non-quiet operations on 
+half-precision values will also raise a 'FE_INVALID' exception whenever 
+they encounter a signaling NaN value, in addition to transforming the value 
+into a quiet NaN. If error handling is disabled, signaling NaNs will be 
+treated like quiet NaNs (while still getting explicitly quieted if propagated 
+to the result). There can also be additional treatment of overflow and 
+underflow errors after they have been processed as above, which is ENABLED by 
+default (but of course only takes effect if any other exception handling is 
+activated) unless overridden by pre-defining the corresponding preprocessor 
+symbol to 0:
+
+  - 'HALF_ERRHANDLING_OVERFLOW_TO_INEXACT' will cause overflow errors to also 
+    raise a 'FE_INEXACT' exception.
+  - 'HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT' will cause underflow errors to also 
+    raise a 'FE_INEXACT' exception. This will also slightly change the 
+    behaviour of the underflow exception, which will ONLY be raised if the 
+    result is actually inexact due to underflow. If this is disabled, underflow 
+    exceptions will be raised for ANY (possibly exact) subnormal result.
+
+
+CREDITS AND CONTACT
+-------------------
+
+This library is developed by CHRISTIAN RAU and released under the MIT License 
+(see LICENSE.txt). If you have any questions or problems with it, feel free to 
+contact me at rauy@users.sourceforge.net.
+
+Additional credit goes to JEROEN VAN DER ZIJP for his paper on "Fast Half Float 
+Conversions", whose algorithms have been used in the library for converting 
+between half-precision and single-precision values.
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 6f9db5be..7f7f844b 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -2,12 +2,40 @@
 
 #include "scene.glsl"
 
+layout(local_size_x = 16, local_size_y = 16) in;
+
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 layout (set = 1, binding = 0) uniform sampler2D prefilteredDepth;
 layout (set = 1, binding = 1) uniform sampler2D normalBuffer;
 layout (r8, set = 1, binding = 2) uniform image2D aoOutput;
 
+layout (push_constant) uniform PushConstants {
+    vec2 ndcToViewMult;
+    vec2 ndcToViewAdd;
+
+    float depthLinearizeMult;
+    float depthLinearizeAdd;
+
+    float radius;
+    float falloff;
+    float radiusMultiplier;
+
+    float strength;
+
+    int numDirections;
+    int numSteps;
+
+    float temporalWeight;
+    float spatialFilterRadius;
+} pushConstants;
+
 void main() {
+    const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
 
+    if (screenPos.x % 2 == 0 && screenPos.y % 2 == 0){
+        imageStore(aoOutput, screenPos, vec4(1.0f));
+    } else {
+        imageStore(aoOutput, screenPos, vec4(0.5f));
+    }
 }
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index cf81f709..a7abdf74 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -909,6 +909,24 @@ void ImguiWrapper::imguiInterface(Engine* engine)
                 fmt::print(" Failed to find/create image save path directory");
             }
         }
+
+        if (ImGui::Button("Save raw ao image")) {
+            if (file::getOrCreateDirectory(file::imagesSavePath)) {
+                const std::filesystem::path path = file::imagesSavePath / "raw_ao_image.png";
+
+                vk_helpers::saveImageR8UNORM(
+                    *engine->resourceManager,
+                    *engine->immediate,
+                    engine->ambientOcclusionPipeline->ambientOcclusionImage,
+                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                    path.string().c_str(),
+                    0
+                );
+            }
+            else {
+                fmt::print(" Failed to find/create image save path directory");
+            }
+        }
     }
     ImGui::End();
 
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 7ee4589e..ca68f6e9 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -105,6 +105,8 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         VkImageUsageFlags usage{};
         usage |= VK_IMAGE_USAGE_STORAGE_BIT;
         usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
 
         VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
         ambientOcclusionImage = resourceManager.createImage(imgInfo);
@@ -360,12 +362,10 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         bindingInfos[1] = depthPrefilterDescriptorBuffer.getDescriptorBufferBindingInfo();
         vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
 
-        uint32_t index0 = 0;
-        uint32_t index1 = 1;
-        VkDeviceSize offset0 = drawInfo.sceneDataOffset;
+        constexpr std::array<uint32_t, 2> indices{0,1};
+        const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
 
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 1, &index0, &offset0);
-        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 1, 1, &index1, &ZERO_DEVICE_SIZE);
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 2, indices.data(), offsets.data());
 
         auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 8.0f));
         auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 8.0f));
@@ -376,31 +376,29 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     }
 
     vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    // vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    // // Ambient Occlusion
-    // {
-    //     vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
-    //     vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &drawInfo.pushConstants);
-    //
-    //     VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
-    //     bindingInfos[0] = drawInfo.sceneDataBinding;
-    //     bindingInfos[1] = ambientOcclusionDescriptorBuffer.getDescriptorBufferBindingInfo();
-    //     vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
-    //
-    //     constexpr VkDeviceSize zeroOffset{0};
-    //     constexpr uint32_t sceneDataIndex{0};
-    //     constexpr uint32_t descriptorIndex{1};
-    //
-    //     vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 0, 1, &sceneDataIndex, &drawInfo.sceneDataOffset);
-    //     vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 1, 1, &descriptorIndex, &zeroOffset);
-    //
-    //     const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
-    //     const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
-    //     vkCmdDispatch(cmd, x, y, 1);
-    // }
-    //
-    //
-    // vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    // Ambient Occlusion
+    {
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
+        vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
+
+        VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
+        bindingInfos[0] = drawInfo.sceneDataBinding;
+        bindingInfos[1] = ambientOcclusionDescriptorBuffer.getDescriptorBufferBindingInfo();
+        vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
+
+        constexpr std::array<uint32_t, 2> indices{0,1};
+        const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
+        // Offsets must be set against the layout of the pipeline bound above, not the depth prefilter's layout.
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipelineLayout, 0, 2, indices.data(), offsets.data());
+        // One workgroup per 16x16 tile, rounding up so partial tiles at the edges are covered.
+        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        vkCmdDispatch(cmd, x, y, 1);
+    }
+
+
+    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
 
     vkCmdEndDebugUtilsLabelEXT(cmd);
diff --git a/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp b/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp
index 9edfd8b9..dc6f3c96 100644
--- a/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp
+++ b/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp
@@ -4,12 +4,16 @@
 
 #include "deferred_resolve.h"
 
+#include <array>
+
 #include "volk/volk.h"
 #include "src/renderer/renderer_constants.h"
 #include "src/renderer/resource_manager.h"
 
-will_engine::deferred_resolve::DeferredResolvePipeline::DeferredResolvePipeline(ResourceManager& resourceManager, VkDescriptorSetLayout environmentIBLLayout,
-                                                                                VkDescriptorSetLayout cascadeUniformLayout, VkDescriptorSetLayout cascadeSamplerLayout)
+will_engine::deferred_resolve::DeferredResolvePipeline::DeferredResolvePipeline(ResourceManager& resourceManager,
+                                                                                VkDescriptorSetLayout environmentIBLLayout,
+                                                                                VkDescriptorSetLayout cascadeUniformLayout,
+                                                                                VkDescriptorSetLayout cascadeSamplerLayout)
     : resourceManager(resourceManager)
 {
     VkPushConstantRange pushConstants = {};
@@ -123,23 +127,23 @@ void will_engine::deferred_resolve::DeferredResolvePipeline::draw(VkCommandBuffe
     bindingInfos[4] = drawInfo.cascadeSamplerBinding;
     vkCmdBindDescriptorBuffersEXT(cmd, 5, bindingInfos);
 
-    constexpr VkDeviceSize zeroOffset{0};
-    constexpr uint32_t sceneDataIndex{0};
-    constexpr uint32_t renderTargetsIndex{1};
-    constexpr uint32_t environmentIndex{2};
-    constexpr uint32_t cascadedShadowMapUniformIndex{3};
-    constexpr uint32_t cascadedShadowMapSamplerIndex{4};
-
-    const VkDeviceSize sceneDataOffset{drawInfo.sceneDataOffset};
-    const VkDeviceSize environmentOffset{drawInfo.environmentIBLOffset};
-    const VkDeviceSize cascadedShadowMapUniformOffset{drawInfo.cascadeUniformOffset};
-    constexpr VkDeviceSize cascadedShadowMapSamplerOffset{0};
-
-    vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, 1, &sceneDataIndex, &sceneDataOffset);
-    vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 1, 1, &renderTargetsIndex, &zeroOffset);
-    vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 2, 1, &environmentIndex, &environmentOffset);
-    vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 3, 1, &cascadedShadowMapUniformIndex, &cascadedShadowMapUniformOffset);
-    vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 4, 1, &cascadedShadowMapSamplerIndex, &cascadedShadowMapSamplerOffset);
+    constexpr std::array<uint32_t, 5> indices{
+        0,
+        1,
+        2,
+        3,
+        4
+    };
+    const std::array offsets{
+        drawInfo.sceneDataOffset,
+        ZERO_DEVICE_SIZE,
+        drawInfo.environmentIBLOffset,
+        drawInfo.cascadeUniformOffset,
+        ZERO_DEVICE_SIZE
+    };
+
+
+    vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, 5, indices.data(), offsets.data());
 
     const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
     const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
diff --git a/src/renderer/vk_helpers.cpp b/src/renderer/vk_helpers.cpp
index 834dfb50..f593018c 100644
--- a/src/renderer/vk_helpers.cpp
+++ b/src/renderer/vk_helpers.cpp
@@ -6,6 +6,7 @@
 
 #include <filesystem>
 #include <fstream>
+#include <glm/gtc/packing.hpp>
 #include <stb/stb_image.h>
 #include <stb/stb_image_write.h>
 #include "volk/volk.h"
@@ -825,6 +826,52 @@ void will_engine::vk_helpers::saveImageR16F(const ResourceManager& resourceManag
     resourceManager.destroyBuffer(receivingBuffer);
 }
 
+// Copies one mip level of a single-channel 8-bit image into a receiving buffer and writes it to savePath as a grey
+// RGBA PNG. imageLayout is the layout the image is in on entry and is restored afterwards. Mips outside 0..31 are ignored.
+void will_engine::vk_helpers::saveImageR8UNORM(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate,
+    const AllocatedImage& image, VkImageLayout imageLayout, const char* savePath, int32_t mipLevel)
+{
+    if (mipLevel < 0 || mipLevel > 31) { return; } // shifting by a negative or >= 32 amount is undefined
+
+    // Mip extents halve per level but never drop below one texel.
+    const uint32_t shiftedWidth = image.imageExtent.width >> mipLevel;
+    const uint32_t shiftedHeight = image.imageExtent.height >> mipLevel;
+    const uint32_t width = shiftedWidth > 0 ? shiftedWidth : 1u;
+    const uint32_t height = shiftedHeight > 0 ? shiftedHeight : 1u;
+    const size_t texelCount = static_cast<size_t>(width) * height;
+
+    AllocatedBuffer receivingBuffer = resourceManager.createReceivingBuffer(texelCount * sizeof(uint8_t));
+
+    immediate.submit([&](VkCommandBuffer cmd) {
+        VkBufferImageCopy bufferCopyRegion{}; // zero-initialised: buffer offset 0, tightly packed rows
+        bufferCopyRegion.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+        bufferCopyRegion.imageSubresource.mipLevel = static_cast<uint32_t>(mipLevel);
+        bufferCopyRegion.imageSubresource.baseArrayLayer = 0;
+        bufferCopyRegion.imageSubresource.layerCount = 1;
+        bufferCopyRegion.imageExtent = {width, height, 1u};
+
+        transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+        vkCmdCopyImageToBuffer(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, receivingBuffer.buffer, 1, &bufferCopyRegion);
+        transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, VK_IMAGE_ASPECT_COLOR_BIT);
+    });
+
+    const auto* texels = static_cast<const uint8_t*>(receivingBuffer.info.pMappedData);
+
+    // Expand each R8 texel to opaque grey RGBA, the 4-channel layout handed to stbi_write_png below.
+    std::vector<uint8_t> rgba(texelCount * 4);
+    for (size_t i = 0; i < texelCount; ++i) {
+        const uint8_t value = texels[i];
+        rgba[i * 4 + 0] = value;
+        rgba[i * 4 + 1] = value;
+        rgba[i * 4 + 2] = value;
+        rgba[i * 4 + 3] = 255;
+    }
+
+    stbi_write_png(savePath, static_cast<int>(width), static_cast<int>(height), 4, rgba.data(), static_cast<int>(width) * 4);
+
+    resourceManager.destroyBuffer(receivingBuffer);
+}
+
 void will_engine::vk_helpers::saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha)
 {
     const auto byteImageData = new uint8_t[width * height * 4];
diff --git a/src/renderer/vk_helpers.h b/src/renderer/vk_helpers.h
index ad8c6455..819116e3 100644
--- a/src/renderer/vk_helpers.h
+++ b/src/renderer/vk_helpers.h
@@ -126,6 +126,9 @@ namespace vk_helpers
                        VkImageLayout imageLayout, VkImageAspectFlags aspectFlag, const char* savePath,
                        const std::function<float(uint16_t)>& valueTransform, int32_t mipLevel = 0);
 
+    void saveImageR8UNORM(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                       VkImageLayout imageLayout, const char* savePath, int32_t mipLevel = 0);
+
     void saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha = true);
 
     void saveHeightmap(const std::vector<float>& heightData, int width, int height, const std::filesystem::path& filename);

From a1e1814b5ca232de04c9e2d6de988b0492b76a60 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Fri, 28 Mar 2025 23:26:45 +0700
Subject: [PATCH 16/27] GTAO main pass scaffolding

---
 .../ground_truth/gtao_main_pass.comp          | 41 ++++++++++++++++---
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 7f7f844b..4c0f0869 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -30,12 +30,43 @@ layout (push_constant) uniform PushConstants {
     float spatialFilterRadius;
 } pushConstants;
 
+vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ)
+{
+    vec4 edgesLRTB = vec4(leftZ, rightZ, topZ, bottomZ) - vec4(centerZ);
+
+    float slopeLR = (edgesLRTB.y - edgesLRTB.x) * 0.5;
+    float slopeTB = (edgesLRTB.w - edgesLRTB.z) * 0.5;
+    vec4 edgesLRTBSlopeAdjusted = edgesLRTB + vec4(slopeLR, -slopeLR, slopeTB, -slopeTB);
+    edgesLRTB = min(abs(edgesLRTB), abs(edgesLRTBSlopeAdjusted));
+
+    return clamp((1.25 - edgesLRTB / (centerZ * 0.011)), 0, 1);;
+}
+
 void main() {
+    // todo: a debug output image that will be used to visualize every step in a separate output image shared by every stage!
     const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
+    vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
+
+    float viewSpaceZM = textureLod(prefilteredDepth, uv + vec2(0.0, 0.0) * sceneData.texelSize, 0).r;
+    float viewSpaceZL = textureLod(prefilteredDepth, uv + vec2(-1.0, 0.0) * sceneData.texelSize, 0).r;
+    float viewSpaceZR = textureLod(prefilteredDepth, uv + vec2(1.0, 0.0) * sceneData.texelSize, 0).r;
+    float viewSpaceZT = textureLod(prefilteredDepth, uv + vec2(0.0, 1.0) * sceneData.texelSize, 0).r;
+    float viewSpaceZB = textureLod(prefilteredDepth, uv + vec2(0.0, -1.0) * sceneData.texelSize, 0).r;
+
+    vec4 edges  = calculateDepthEdges(viewSpaceZM, viewSpaceZL, viewSpaceZR, viewSpaceZT, viewSpaceZB);
+    float minEdge = min(min(edges.x, edges.y), min(edges.z, edges.w));
+    imageStore(aoOutput, screenPos, vec4(minEdge));
+
+    // Get view space normal by sampling normal buffer and converting from world to view (code not relevant)
+    //vec3 viewspaceNormal = (lpfloat3)XeGTAO_CalculateNormal(edgesLRTB, CENTER, LEFT, RIGHT, TOP, BOTTOM);
+
+    // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
+    viewspaceZM = viewspaceZM * 0.99920f;
+
+    // Get the viewspace fragment position of the center pixel (i.e. reconstruct from depth, but in view space instead of world space)
+    // const vec3 pixCenterPos = XeGTAO_ComputeViewspacePosition( normalizedScreenPos, viewspaceZ, consts );
+    // const vec3 viewVec = normalize(-pixCenterPos);
 
-    if (screenPos.x % 2 == 0 && screenPos.y % 2 == 0){
-        imageStore(aoOutput, screenPos, vec4(1.0f));
-    } else {
-        imageStore(aoOutput, screenPos, vec4(0.5f));
-    }
+    // prevents normals that are facing away from the view vector - xeGTAO struggles with extreme cases, but in Vanilla it seems rare so it's disabled by default
+    // viewspaceNormal = normalize( viewspaceNormal + max( 0, -dot( viewspaceNormal, viewVec ) ) * viewVec );
 }

From b8aeb008474433b57d2dfff01dd7707081b1c9f5 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sat, 29 Mar 2025 17:09:52 +0700
Subject: [PATCH 17/27] View Space normals, position (view vec), depth. Better
 GTAO debugging.

---
 .../ground_truth/gtao_depth_prefilter.comp    | 36 +++++----
 .../ground_truth/gtao_main_pass.comp          | 66 +++++++++++-----
 shaders/deferredResolve.comp                  | 11 +--
 src/core/engine.cpp                           |  1 +
 src/renderer/imgui_wrapper.cpp                | 56 ++++++--------
 src/renderer/imgui_wrapper.h                  |  4 +-
 .../ambient_occlusion_types.h                 | 30 ++++----
 .../ground_truth_ambient_occlusion.cpp        | 77 +++++++++++++++----
 .../ground_truth_ambient_occlusion.h          |  4 +
 src/renderer/vk_helpers.cpp                   | 47 +++++++++++
 src/renderer/vk_helpers.h                     |  3 +
 11 files changed, 228 insertions(+), 107 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 240227f0..3ceb6ae3 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -15,25 +15,25 @@ layout (r16f, set = 1, binding = 2) uniform image2D outDepth1;
 layout (r16f, set = 1, binding = 3) uniform image2D outDepth2;
 layout (r16f, set = 1, binding = 4) uniform image2D outDepth3;
 layout (r16f, set = 1, binding = 5) uniform image2D outDepth4;
+layout (rgba8, set = 1, binding = 6) uniform image2D debugImage;
 
 layout (push_constant) uniform PushConstants {
-    vec2 ndcToViewMult;
-    vec2 ndcToViewAdd;
-
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
-    float radius;
-    float falloff;
-    float radiusMultiplier;
-
-    float strength;
+    float projectionParamX;
+    float projectionParamY;
 
-    int numDirections;
-    int numSteps;
+    float effectRadius;
+    float effectFalloffRange;
+    float denoiseBlurBeta;
 
-    float temporalWeight;
-    float spatialFilterRadius;
+    float radiusMultiplier;
+    float sampleDistributionPower;
+    float thinOccluderCompensation;
+    float finalValuePower;
+    float depthMipSamplingOffset;
+    float noiseIndex;
 } pushConstants;
 
 float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float depthLinearizeAdd) {
@@ -78,6 +78,10 @@ void main() {
     const uvec2 baseCoord = gl_GlobalInvocationID.xy;
     const ivec2 screenPos = ivec2(baseCoord.xy) * 2;// We process 2x2 pixels in MIP 0
 
+    if (screenPos.x >= sceneData.renderTargetSize.x || screenPos.y >= sceneData.renderTargetSize.y) {
+        return; // valid texels are [0, size - 1]; a coordinate equal to size is already out of range
+    }
+
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
     float rDepth0 = texture(depthImage, uv + vec2(0.0, 0.0) * sceneData.texelSize).r;
@@ -97,7 +101,7 @@ void main() {
 
 
     // MIP 1
-    float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+    float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.effectRadius, pushConstants.radiusMultiplier, pushConstants.effectFalloffRange);
     imageStore(outDepth1, ivec2(baseCoord), vec4(dm1));
     g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm1;
 
@@ -111,7 +115,7 @@ void main() {
         float inBL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+1];
         float inBR = g_scratchDepths[groupThreadID.x+1][groupThreadID.y+1];
 
-        float dm2 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+        float dm2 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.effectRadius, pushConstants.radiusMultiplier, pushConstants.effectFalloffRange);
         imageStore(outDepth2, ivec2(baseCoord/2u), vec4(dm2));
         g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm2;
     }
@@ -126,7 +130,7 @@ void main() {
         float inBL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+2];
         float inBR = g_scratchDepths[groupThreadID.x+2][groupThreadID.y+2];
 
-        float dm3 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+        float dm3 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.effectRadius, pushConstants.radiusMultiplier, pushConstants.effectFalloffRange);
         imageStore(outDepth3, ivec2(baseCoord/4u), vec4(dm3));
         g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm3;
     }
@@ -141,7 +145,7 @@ void main() {
         float inBL = g_scratchDepths[groupThreadID.x+0][groupThreadID.y+4];
         float inBR = g_scratchDepths[groupThreadID.x+4][groupThreadID.y+4];
 
-        float dm4 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.radius, pushConstants.radiusMultiplier, pushConstants.falloff);
+        float dm4 = depthMipFilter(inTL, inTR, inBL, inBR, pushConstants.effectRadius, pushConstants.radiusMultiplier, pushConstants.effectFalloffRange);
         imageStore(outDepth4, ivec2(baseCoord/8u), vec4(dm4));
         //g_scratchDepths[groupThreadID.x][groupThreadID.y] = dm4;
     }
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 4c0f0869..27542611 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -9,25 +9,25 @@ layout(local_size_x = 16, local_size_y = 16) in;
 layout (set = 1, binding = 0) uniform sampler2D prefilteredDepth;
 layout (set = 1, binding = 1) uniform sampler2D normalBuffer;
 layout (r8, set = 1, binding = 2) uniform image2D aoOutput;
+layout (rgba8, set = 1, binding = 3) uniform image2D debugImage;
 
 layout (push_constant) uniform PushConstants {
-    vec2 ndcToViewMult;
-    vec2 ndcToViewAdd;
-
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
-    float radius;
-    float falloff;
-    float radiusMultiplier;
-
-    float strength;
+    float projectionParamX;
+    float projectionParamY;
 
-    int numDirections;
-    int numSteps;
+    float effectRadius;
+    float effectFalloffRange;
+    float denoiseBlurBeta;
 
-    float temporalWeight;
-    float spatialFilterRadius;
+    float radiusMultiplier;
+    float sampleDistributionPower;
+    float thinOccluderCompensation;
+    float finalValuePower;
+    float depthMipSamplingOffset;
+    float noiseIndex;
 } pushConstants;
 
 vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ)
@@ -42,9 +42,19 @@ vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rig
     return clamp((1.25 - edgesLRTB / (centerZ * 0.011)), 0, 1);;
 }
 
+vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth, float xProj, float yProj) {
+    // Remap uv [0,1] to NDC [-1,1], scale xy by depth and by 1/xProj, 1/yProj; z is the negated depth.
+    vec2 invProjScale = vec2(1.0 / xProj, 1.0 / yProj);
+    return vec3((uv * 2.0 - 1.0) * viewDepth * invProjScale, -viewDepth);
+}
+
 void main() {
-    // todo: a debug output image that will be used to visualize every step in a separate output image shared by every stage!
     const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
+
+    if (screenPos.x >= sceneData.renderTargetSize.x || screenPos.y >= sceneData.renderTargetSize.y) {
+        return; // valid texels are [0, size - 1]; a coordinate equal to size is already out of range
+    }
+
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
     float viewSpaceZM = textureLod(prefilteredDepth, uv + vec2(0.0, 0.0) * sceneData.texelSize, 0).r;
@@ -55,18 +65,36 @@ void main() {
 
     vec4 edges  = calculateDepthEdges(viewSpaceZM, viewSpaceZL, viewSpaceZR, viewSpaceZT, viewSpaceZB);
     float minEdge = min(min(edges.x, edges.y), min(edges.z, edges.w));
-    imageStore(aoOutput, screenPos, vec4(minEdge));
+    // imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
 
     // Get view space normal by sampling normal buffer and converting from world to view (code not relevant)
-    //vec3 viewspaceNormal = (lpfloat3)XeGTAO_CalculateNormal(edgesLRTB, CENTER, LEFT, RIGHT, TOP, BOTTOM);
+    vec3 worldNormal = texture(normalBuffer, uv).rgb;
+    vec3 viewNormal = mat3(sceneData.view) * worldNormal;
+    // imageStore(debugImage, screenPos, vec4(viewNormal * 0.5f + 0.5f, 1.0f));
 
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
-    viewspaceZM = viewspaceZM * 0.99920f;
+    viewSpaceZM = viewSpaceZM * 0.99920f;
+
 
-    // Get the viewspace fragment position of the center pixel (i.e. reconstruct from depth, but in view space instead of world space)
-    // const vec3 pixCenterPos = XeGTAO_ComputeViewspacePosition( normalizedScreenPos, viewspaceZ, consts );
-    // const vec3 viewVec = normalize(-pixCenterPos);
+    vec3 vPos = reconstructViewSpacePosition(uv, viewSpaceZM, pushConstants.projectionParamX, pushConstants.projectionParamY);
+    vec3 viewVec = normalize(-vPos);
+    imageStore(debugImage, screenPos, vec4(viewVec, 1.0f));
 
+    // Per Intel
     // prevents normals that are facing away from the view vector - xeGTAO struggles with extreme cases, but in Vanilla it seems rare so it's disabled by default
     // viewspaceNormal = normalize( viewspaceNormal + max( 0, -dot( viewspaceNormal, viewVec ) ) * viewVec );
+
+
+    const float effectRadius = pushConstants.effectRadius * pushConstants.radiusMultiplier;
+    const float sampleDistributionPower = pushConstants.sampleDistributionPower;
+    const float thinOccluderCompensation = pushConstants.thinOccluderCompensation;
+    const float falloffRange = pushConstants.effectFalloffRange * effectRadius;
+
+    const float falloffFrom = effectRadius * (1 - pushConstants.effectFalloffRange);
+
+    // fadeout precompute: clamp(dist * falloffMul + falloffAdd, 0, 1) is 1 at falloffFrom and 0 at effectRadius
+    const float falloffMul = -1.0 / (falloffRange);
+    const float falloffAdd = falloffFrom / (falloffRange) + 1.0;
+
+    vec3 bentNormal = viewNormal;
 }
diff --git a/shaders/deferredResolve.comp b/shaders/deferredResolve.comp
index a6de0812..3d7f2f3d 100644
--- a/shaders/deferredResolve.comp
+++ b/shaders/deferredResolve.comp
@@ -50,12 +50,12 @@ layout (push_constant) uniform PushConstants {
     float farPlane;
 } pushConstants;
 
-vec3 reconstructPosition(ivec2 texCoord, vec2 texelSize, float depth) {
+vec3 reconstructPosition(vec2 uv, float ndcDepth) {
     // Get normalized device coordinates
-    vec2 ndc = (vec2(texCoord) + 0.5) * texelSize * 2.0 - 1.0;
+    vec2 ndc = uv * 2.0 - 1.0;
 
     // Reconstruct view-space position
-    vec4 positionVS = sceneData.invProjection * vec4(ndc, depth, 1.0);
+    vec4 positionVS = sceneData.invProjection * vec4(ndc, ndcDepth, 1.0);
     positionVS /= positionVS.w;
 
     // Transform to world-space
@@ -87,7 +87,7 @@ void main() {
     float depth = texture(depthBuffer, uv).r;
     vec3 normal = texture(normalRenderTarget, uv).rgb;
     vec4 pbrData = texture(pbrRenderTarget, uv);
-    vec3 position = reconstructPosition(screenPos, sceneData.texelSize, depth);
+    vec3 position = reconstructPosition(uv, depth);
 
     float roughness = pbrData.g;
     float metallic = pbrData.r;
@@ -186,7 +186,8 @@ void main() {
             imageStore(outputImage, screenPos, vec4(vec3(dot(N, L)), 1.0));
             break;
         case 9:
-        // imageStore(outputImage, screenPos, vec4(vec3(ao), 1.0f));
+            vec4 viewPos = sceneData.view * vec4(position, 1.0f);
+            imageStore(outputImage, screenPos, vec4(-viewPos.xyz / 100.0f, 1.0f));
             break;
     }
 }
\ No newline at end of file
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index fc8ca46b..2299ad81 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -568,6 +568,7 @@ void Engine::draw(float deltaTime)
     ambient_occlusion::GTAODrawInfo gtaoDrawInfo{
         camera,
         {},
+        frameNumber,
         sceneDataDescriptorBuffer.getDescriptorBufferBindingInfo(),
         sceneDataDescriptorBuffer.getDescriptorBufferSize() * currentFrameOverlap
     };
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index a7abdf74..1a5e17ae 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -207,7 +207,7 @@ void ImguiWrapper::imguiInterface(Engine* engine)
 
             if (ImGui::BeginTabItem("Pipelines")) {
                 ImGui::Text("Deferred Debug");
-                const char* deferredDebugOptions[]{"None", "Depth", "Velocity", "Albedo", "Normal", "PBR", "Shadows", "Cascade Level", "nDotL"};
+                const char* deferredDebugOptions[]{"None", "Depth", "Velocity", "Albedo", "Normal", "PBR", "Shadows", "Cascade Level", "nDotL", "AO"};
                 ImGui::Combo("Deferred Debug", &engine->deferredDebug, deferredDebugOptions, IM_ARRAYSIZE(deferredDebugOptions));
 
                 ImGui::EndTabItem();
@@ -881,51 +881,43 @@ void ImguiWrapper::imguiInterface(Engine* engine)
         constexpr uint32_t minMip = 0;
         constexpr uint32_t maxMip = 4;
 
-        ImGui::SliderScalar("GTAO level", ImGuiDataType_S32, &gtaoMip, &minMip, &maxMip);
+        if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
+            if (engine->ambientOcclusionPipeline->debugImage.image != VK_NULL_HANDLE) {
+                aoDebugTextureImguiId = ImGui_ImplVulkan_AddTexture(engine->resourceManager->getDefaultSamplerLinear(),
+                                                                    engine->ambientOcclusionPipeline->debugImage.imageView,
+                                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+            }
+        }
 
-        if (ImGui::Button("Save GTAO depth image")) {
-            if (file::getOrCreateDirectory(file::imagesSavePath)) {
-                const std::filesystem::path path = file::imagesSavePath / "gtao_depth.png";
 
-                auto depthNormalize = [](const uint16_t depth) {
-                    // Equivalent
-                    float manualDepth = math::halfToFloat(depth);
-                    float libraryDepth = half_float::detail::half2float<float>(depth);
-                    return libraryDepth / 1000.f;
-                };
+        if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
+            ImGui::Text("Issue.");
+        }
+        else {
+            float maxSize = ImGui::GetContentRegionAvail().x;
+            maxSize = glm::min(maxSize, 1024.0f);
 
-                vk_helpers::saveImageR16F(
-                    *engine->resourceManager,
-                    *engine->immediate,
-                    engine->ambientOcclusionPipeline->depthPrefilterImage,
-                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
-                    VK_IMAGE_ASPECT_COLOR_BIT,
-                    path.string().c_str(),
-                    depthNormalize,
-                    gtaoMip
-                );
-            }
-            else {
-                fmt::print(" Failed to find/create image save path directory");
-            }
+            VkExtent3D imageExtent = engine->ambientOcclusionPipeline->debugImage.imageExtent;
+            float width = std::min(maxSize, static_cast<float>(imageExtent.width));
+            float aspectRatio = static_cast<float>(imageExtent.width) / static_cast<float>(imageExtent.height);
+            float height = width / aspectRatio;
+
+            ImGui::Image(reinterpret_cast<ImTextureID>(aoDebugTextureImguiId), ImVec2(width, height));
         }
 
-        if (ImGui::Button("Save raw ao image")) {
+        if (ImGui::Button("Save GTAO Debug Image")) {
             if (file::getOrCreateDirectory(file::imagesSavePath)) {
-                const std::filesystem::path path = file::imagesSavePath / "raw_ao_image.png";
+                const std::filesystem::path path = file::imagesSavePath / "gtao_debug.png";
 
-                vk_helpers::saveImageR8UNORM(
+                vk_helpers::saveImageR8G8B8A8UNORM(
                     *engine->resourceManager,
                     *engine->immediate,
-                    engine->ambientOcclusionPipeline->ambientOcclusionImage,
+                    engine->ambientOcclusionPipeline->debugImage,
                     VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                     path.string().c_str(),
                     0
                 );
             }
-            else {
-                fmt::print(" Failed to find/create image save path directory");
-            }
         }
     }
     ImGui::End();
diff --git a/src/renderer/imgui_wrapper.h b/src/renderer/imgui_wrapper.h
index 17b6f844..c77f836d 100644
--- a/src/renderer/imgui_wrapper.h
+++ b/src/renderer/imgui_wrapper.h
@@ -80,7 +80,9 @@ class ImguiWrapper
     terrain::TerrainProperties terrainProperties{};
     std::array<uint32_t, terrain::MAX_TERRAIN_TEXTURE_COUNT> terrainTextures;
 
-    VkDescriptorSet currentlySelectedTextureImguiId = VK_NULL_HANDLE;
+    VkDescriptorSet currentlySelectedTextureImguiId{VK_NULL_HANDLE};
+
+    VkDescriptorSet aoDebugTextureImguiId{VK_NULL_HANDLE};
 };
 }
 
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 55338e44..30cb29ef 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -11,37 +11,33 @@
 namespace will_engine::ambient_occlusion
 {
 static constexpr int32_t DEPTH_PREFILTER_MIP_COUNT = 5;
+static constexpr int32_t GTAO_DENOISE_PASSES = 1;
 
 struct GTAOPushConstants
 {
-    glm::vec2 ndcToViewMult;
-    glm::vec2 ndcToViewAdd;
-
-    // Depth prefilter parameters
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
-    // Defaults follow Intel's implementation
-    float radius = 0.5f;
-    float falloff = 0.615f;
-    float radiusMultiplier = 1.457f;
-
-    // AO parameters
-    float strength;
+    float projectionParamX;
+    float projectionParamY;
 
-    // Sampling parameters
-    uint32_t numDirections;
-    uint32_t numSteps;
+    float effectRadius = 0.5f;
+    float effectFalloffRange = 0.615f;
+    float denoiseBlurBeta = (GTAO_DENOISE_PASSES == 0) ? (1e4f) : (1.2f);
 
-    // Temporal/filter parameters
-    float temporalWeight;
-    float spatialFilterRadius;
+    float radiusMultiplier = 1.457f;
+    float sampleDistributionPower = 2.0f;
+    float thinOccluderCompensation = 0.0f;
+    float finalValuePower = 2.2f;
+    float depthMipSamplingOffset = 3.30f;
+    float noiseIndex;
 };
 
 struct GTAODrawInfo
 {
     Camera* camera{nullptr};
     GTAOPushConstants pushConstants{};
+    int32_t currentFrame{};
     VkDescriptorBufferBindingInfoEXT sceneDataBinding{};
     VkDeviceSize sceneDataOffset{0};
 };
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index ca68f6e9..48ddb4db 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -10,8 +10,21 @@
 #include "src/renderer/vk_descriptors.h"
 #include "src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h"
 
-will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruthAmbientOcclusionPipeline(ResourceManager& resourceManager) : resourceManager(resourceManager)
+will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruthAmbientOcclusionPipeline(
+    ResourceManager& resourceManager) : resourceManager(resourceManager)
 {
+    // Debug
+    {
+        VkImageUsageFlags usage{};
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(debugFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+        debugImage = resourceManager.createImage(imgInfo);
+    }
+
     // Depth Pre-filtering
     {
         DescriptorLayoutBuilder layoutBuilder;
@@ -21,8 +34,10 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao depth mip 2
         layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao depth mip 3
         layoutBuilder.addBinding(5, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao depth mip 4
+        layoutBuilder.addBinding(6, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // debug image
 
-        depthPrefilterSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+        depthPrefilterSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
+                                                                            VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
 
         VkPushConstantRange pushConstants{};
         pushConstants.offset = 0;
@@ -77,8 +92,10 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // pre-filtered depth
         layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT normal buffer
         layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao output
+        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // debug image
 
-        ambientOcclusionSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+        ambientOcclusionSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
+                                                                              VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
 
         VkPushConstantRange pushConstants{};
         pushConstants.offset = 0;
@@ -156,7 +173,8 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT normal buffer
         layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // filtered ao
 
-        spatialFilteringSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+        spatialFilteringSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
+                                                                              VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
 
         VkPushConstantRange pushConstants{};
         pushConstants.offset = 0;
@@ -197,7 +215,8 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT depth buffer
         layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // final output
 
-        temporalAccumulationSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
+        temporalAccumulationSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
+                                                                                  VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
 
         VkPushConstantRange pushConstants{};
         pushConstants.offset = 0;
@@ -239,6 +258,9 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
 
 will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTruthAmbientOcclusionPipeline()
 {
+    // Debug Resources
+    resourceManager.destroyImage(debugImage);
+
     // Depth Prefilter Resources
     resourceManager.destroyDescriptorSetLayout(depthPrefilterSetLayout);
     resourceManager.destroyPipelineLayout(depthPrefilterPipelineLayout);
@@ -288,7 +310,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTrut
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(const VkImageView& depthImageView)
 {
     std::vector<DescriptorImageData> imageDescriptors{};
-    imageDescriptors.reserve(1 + DEPTH_PREFILTER_MIP_COUNT);
+    imageDescriptors.reserve(1 + DEPTH_PREFILTER_MIP_COUNT + 1);
 
     imageDescriptors.push_back(
         {
@@ -307,13 +329,19 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupD
         imageDescriptors.push_back(imageData);
     }
 
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+        {VK_NULL_HANDLE, debugImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+        false
+    });
+
     resourceManager.setupDescriptorBufferSampler(depthPrefilterDescriptorBuffer, imageDescriptors, 0);
 }
 
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView)
 {
     std::vector<DescriptorImageData> imageDescriptors{};
-    imageDescriptors.reserve(2);
+    imageDescriptors.reserve(4);
 
     imageDescriptors.push_back(
         {
@@ -333,6 +361,11 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupA
             {VK_NULL_HANDLE, ambientOcclusionImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
             false
         });
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+        {VK_NULL_HANDLE, debugImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+        false
+    });
 
     resourceManager.setupDescriptorBufferSampler(ambientOcclusionDescriptorBuffer, imageDescriptors, 0);
 }
@@ -349,8 +382,13 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     push.depthLinearizeMult = -projMatrix[3][2];
     push.depthLinearizeAdd = projMatrix[2][2];
 
+    push.projectionParamX = projMatrix[0][0];
+    push.projectionParamY = projMatrix[1][1];
+
+    push.noiseIndex = GTAO_DENOISE_PASSES > 0 ? drawInfo.currentFrame % 64 : 0;
+
+    vk_helpers::transitionImage(cmd, debugImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
-    //vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::clearColorImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL);
     // Depth Prefilter
     {
@@ -362,20 +400,21 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         bindingInfos[1] = depthPrefilterDescriptorBuffer.getDescriptorBufferBindingInfo();
         vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
 
-        constexpr std::array<uint32_t, 2> indices{0,1};
+        constexpr std::array<uint32_t, 2> indices{0, 1};
         const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
 
         vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 2, indices.data(), offsets.data());
 
-        auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 8.0f));
-        auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 8.0f));
-        // divided by 2 because depth prepass operates on 2x2 (still input4 -> output4)
-        x = x / 2;
-        y = y / 2;
+        auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+        auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        // shader only operates on 8,8 work groups, mip 0 will operate on 2x2 texels
+        // x = x / 2 + 1;
+        // y = y / 2 + 1;
         vkCmdDispatch(cmd, x, y, 1);
     }
 
-    vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, depthPrefilterImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
     // Ambient Occlusion
     {
@@ -387,7 +426,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         bindingInfos[1] = ambientOcclusionDescriptorBuffer.getDescriptorBufferBindingInfo();
         vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
 
-        constexpr std::array<uint32_t, 2> indices{0,1};
+        constexpr std::array<uint32_t, 2> indices{0, 1};
         const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
 
         vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipelineLayout, 0, 2, indices.data(), offsets.data());
@@ -398,8 +437,12 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     }
 
 
-    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                VK_IMAGE_ASPECT_COLOR_BIT);
+
 
+    vk_helpers::transitionImage(cmd, debugImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                            VK_IMAGE_ASPECT_COLOR_BIT);
 
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index 60ae2cbb..c243313f 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -83,6 +83,10 @@ class GroundTruthAmbientOcclusionPipeline
 private: // Output
     AllocatedImage ambientOcclusionOutputImage{VK_NULL_HANDLE};
 
+private: // Debug
+    VkFormat debugFormat{VK_FORMAT_R8G8B8A8_UNORM};
+    AllocatedImage debugImage{VK_NULL_HANDLE};
+
 private:
     ResourceManager& resourceManager;
 
diff --git a/src/renderer/vk_helpers.cpp b/src/renderer/vk_helpers.cpp
index f593018c..f88ed89c 100644
--- a/src/renderer/vk_helpers.cpp
+++ b/src/renderer/vk_helpers.cpp
@@ -872,6 +872,53 @@ void will_engine::vk_helpers::saveImageR8UNORM(const ResourceManager& resourceMa
     resourceManager.destroyBuffer(receivingBuffer);
 }
 
+// Reads back one mip level of an R8G8B8A8_UNORM image and writes it to `savePath` as a 4-channel PNG with alpha forced to 255.
+// `imageLayout` is the layout the image is in on entry; the image is transitioned back to it after the copy.
+// `mipLevel` is assumed to be a valid (non-negative, existing) mip index of `image`.
+void will_engine::vk_helpers::saveImageR8G8B8A8UNORM(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate,
+    const AllocatedImage& image, VkImageLayout imageLayout, const char* savePath, int32_t mipLevel)
+{
+    // A mip level is never smaller than 1 texel per axis, so clamp (matters for the short axis of non-square images).
+    const size_t mipWidth = static_cast<size_t>(image.imageExtent.width) >> mipLevel;
+    const size_t mipHeight = static_cast<size_t>(image.imageExtent.height) >> mipLevel;
+    const size_t width = mipWidth > 0 ? mipWidth : 1;
+    const size_t height = mipHeight > 0 ? mipHeight : 1;
+    const size_t texelCount = width * height;
+    const size_t dataSize = texelCount * 4 * sizeof(uint8_t);
+
+    AllocatedBuffer receivingBuffer = resourceManager.createReceivingBuffer(dataSize);
+
+    immediate.submit([&, mipLevel](VkCommandBuffer cmd) {
+        VkBufferImageCopy bufferCopyRegion{};
+        bufferCopyRegion.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+        bufferCopyRegion.imageSubresource.mipLevel = mipLevel;
+        bufferCopyRegion.imageSubresource.baseArrayLayer = 0;
+        bufferCopyRegion.imageSubresource.layerCount = 1;
+        bufferCopyRegion.imageExtent = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1u};
+        bufferCopyRegion.bufferOffset = 0;
+        bufferCopyRegion.bufferRowLength = 0; // 0 => rows are tightly packed according to imageExtent
+        bufferCopyRegion.bufferImageHeight = 0;
+
+        transitionImage(cmd, image.image, imageLayout, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
+
+        vkCmdCopyImageToBuffer(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, receivingBuffer.buffer, 1, &bufferCopyRegion);
+
+        transitionImage(cmd, image.image, VK_IMAGE_LAYOUT_GENERAL, imageLayout, VK_IMAGE_ASPECT_COLOR_BIT);
+    });
+
+    // Copy the mapped readback data into memory we own so alpha can be overwritten; the vector frees itself on every exit path.
+    const auto mappedTexels = static_cast<const uint8_t*>(receivingBuffer.info.pMappedData);
+    std::vector<uint8_t> byteImageData(mappedTexels, mappedTexels + dataSize);
+    for (size_t i = 0; i < texelCount; ++i) {
+        // overwrite alpha to 1.0f
+        byteImageData[i * 4 + 3] = 255;
+    }
+
+    stbi_write_png(savePath, static_cast<int>(width), static_cast<int>(height), 4, byteImageData.data(), static_cast<int>(width) * 4);
+
+    resourceManager.destroyBuffer(receivingBuffer);
+}
+
 void will_engine::vk_helpers::saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha)
 {
     const auto byteImageData = new uint8_t[width * height * 4];
diff --git a/src/renderer/vk_helpers.h b/src/renderer/vk_helpers.h
index 819116e3..47eedfde 100644
--- a/src/renderer/vk_helpers.h
+++ b/src/renderer/vk_helpers.h
@@ -129,6 +129,9 @@ namespace vk_helpers
     void saveImageR8UNORM(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
                        VkImageLayout imageLayout, const char* savePath, int32_t mipLevel = 0);
 
+    void saveImageR8G8B8A8UNORM(const ResourceManager& resourceManager, const ImmediateSubmitter& immediate, const AllocatedImage& image,
+                       VkImageLayout imageLayout, const char* savePath, int32_t mipLevel = 0);
+
     void saveImage(const std::vector<float>& imageData, int width, int height, std::filesystem::path filename, bool overrideAlpha = true);
 
     void saveHeightmap(const std::vector<float>& heightData, int width, int height, const std::filesystem::path& filename);

From 582d4da1fe1b7a872238787bfddc76fc3813c02a Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sat, 29 Mar 2025 20:57:39 +0700
Subject: [PATCH 18/27] GTAO main pass progress. Early exit for distant pixels.

---
 .../ground_truth/gtao_depth_prefilter.comp    |  12 +-
 .../ground_truth/gtao_main_pass.comp          | 105 ++++++++++++++++--
 shaders/deferredResolve.comp                  |   5 +-
 shaders/environment/environment.vert          |   2 +-
 src/core/camera/free_camera.cpp               |   9 +-
 src/core/engine.cpp                           |   2 +-
 .../ambient_occlusion_types.h                 |  12 +-
 .../ground_truth_ambient_occlusion.cpp        |  15 ++-
 .../post_process/post_process_types.h         |   4 +-
 9 files changed, 137 insertions(+), 29 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 3ceb6ae3..ec9bb0bc 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -18,12 +18,16 @@ layout (r16f, set = 1, binding = 5) uniform image2D outDepth4;
 layout (rgba8, set = 1, binding = 6) uniform image2D debugImage;
 
 layout (push_constant) uniform PushConstants {
+    vec2 cameraTanHalfFOV;
+
+    vec2 ndcToViewMul;
+    vec2 ndcToViewAdd;
+
+    vec2 ndcToViewMul_x_PixelSize;
+
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
-    float projectionParamX;
-    float projectionParamY;
-
     float effectRadius;
     float effectFalloffRange;
     float denoiseBlurBeta;
@@ -33,7 +37,7 @@ layout (push_constant) uniform PushConstants {
     float thinOccluderCompensation;
     float finalValuePower;
     float depthMipSamplingOffset;
-    float noiseIndex;
+    uint noiseIndex;
 } pushConstants;
 
 float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float depthLinearizeAdd) {
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 27542611..2917bfd5 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -12,12 +12,16 @@ layout (r8, set = 1, binding = 2) uniform image2D aoOutput;
 layout (rgba8, set = 1, binding = 3) uniform image2D debugImage;
 
 layout (push_constant) uniform PushConstants {
+    vec2 cameraTanHalfFOV;
+
+    vec2 ndcToViewMul;
+    vec2 ndcToViewAdd;
+
+    vec2 ndcToViewMul_x_PixelSize;
+
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
-    float projectionParamX;
-    float projectionParamY;
-
     float effectRadius;
     float effectFalloffRange;
     float denoiseBlurBeta;
@@ -27,9 +31,44 @@ layout (push_constant) uniform PushConstants {
     float thinOccluderCompensation;
     float finalValuePower;
     float depthMipSamplingOffset;
-    float noiseIndex;
+    uint noiseIndex;
 } pushConstants;
 
+#define XE_HILBERT_LEVEL 6u
+#define XE_HILBERT_WIDTH (1u << XE_HILBERT_LEVEL)
+#define XE_HILBERT_AREA (XE_HILBERT_WIDTH * XE_HILBERT_WIDTH)
+
+uint hilbertIndex(uint posX, uint posY)
+{
+    uint index = 0u;
+    for (uint curLevel = XE_HILBERT_WIDTH/2u; curLevel > 0u; curLevel /= 2u)
+    {
+        uint regionX = (posX & curLevel) > 0u ? 1u : 0u;
+        uint regionY = (posY & curLevel) > 0u ? 1u : 0u;
+        index += curLevel * curLevel * ((3u * regionX) ^ regionY);
+        if (regionY == 0u)
+        {
+            if (regionX == 1u)
+            {
+                posX = uint(XE_HILBERT_WIDTH - 1u) - posX;
+                posY = uint(XE_HILBERT_WIDTH - 1u) - posY;
+            }
+
+            uint temp = posX;
+            posX = posY;
+            posY = temp;
+        }
+    }
+    return index;
+}
+
+vec2 spatioTemporalNoise(ivec2 pixCoord, uint temporalIndex)// without TAA, temporalIndex is always 0
+{
+    uint index = hilbertIndex(uint(pixCoord.x), uint(pixCoord.y)); // spatial part: the index hilbertIndex() assigns to this pixel
+    index += 288u * (temporalIndex % 64u); // temporal part: per-frame offset; the offset repeats every 64 values of temporalIndex
+    return vec2(fract(0.5 + index * vec2(0.75487766624669276005, 0.5698402909980532659114))); // two values in [0, 1); constants look like the R2 sequence (1/g, 1/g^2) - TODO confirm
+}
+
 vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ)
 {
     vec4 edgesLRTB = vec4(leftZ, rightZ, topZ, bottomZ) - vec4(centerZ);
@@ -42,10 +81,29 @@ vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rig
     return clamp((1.25 - edgesLRTB / (centerZ * 0.011)), 0, 1);;
 }
 
-vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth, float xProj, float yProj) {
+vec3 cheapReconstructViewSpacePosition(vec2 uv, const float viewspaceDepth, vec2 ndcToViewMul, vec2 ndcToViewAdd)
+{
+    vec3 ret;
+    ret.xy = (ndcToViewMul * uv.xy + ndcToViewAdd) * viewspaceDepth;
+    ret.z = -viewspaceDepth;
+    return ret;
+}
+
+
+vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth, float depthLinearizeAdd, float depthLinearizeMult) {
+    float ndcDepth = pushConstants.depthLinearizeAdd - (pushConstants.depthLinearizeMult / viewDepth);
+    uv.y = 1 - uv.y;
     vec2 ndc = uv * 2.0 - 1.0;
-    vec2 projectionParams = vec2(1.0 / xProj, 1.0 / yProj);
-    return vec3(ndc * viewDepth * projectionParams, -viewDepth);
+    vec4 positionVS = sceneData.invProjection * vec4(ndc, ndcDepth, 1.0);
+
+    positionVS /= positionVS.w;
+    return positionVS.xyz;
+}
+
+void outputWorkingTerm(ivec2 screenPos, float visibility, vec3 bentNormal, image2D outputImage){
+    const float XE_GTAO_OCCLUSION_TERM_SCALE = 1.5f;
+    visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
+    imageStore(outputImage, screenPos, vec4(visibility + 0.5f / 255.0f));
 }
 
 void main() {
@@ -75,10 +133,14 @@ void main() {
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
     viewSpaceZM = viewSpaceZM * 0.99920f;
 
+    vec3 vPos = cheapReconstructViewSpacePosition(uv, viewSpaceZM, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
 
-    vec3 vPos = reconstructViewSpacePosition(uv, viewSpaceZM, pushConstants.projectionParamX, pushConstants.projectionParamY);
     vec3 viewVec = normalize(-vPos);
-    imageStore(debugImage, screenPos, vec4(viewVec, 1.0f));
+    //imageStore(debugImage, screenPos, vec4(viewVec, 1.0f));
+
+    // debug world pos
+    vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0)).xyz;
+    imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
 
     // Per Intel
     // prevents normals that are facing away from the view vector - xeGTAO struggles with extreme cases, but in Vanilla it seems rare so it's disabled by default
@@ -96,5 +158,30 @@ void main() {
     const float falloffMul = 1.0 / (falloffRange);
     const float falloffAdd = falloffFrom / (falloffRange) + 1.0;
 
+    float visibility = 0;
+    // set bent normal to 0 if generating and outputting to buffer for use in deferred resolve
     vec3 bentNormal = viewNormal;
+
+    // NOISE
+    vec2 noise = spatioTemporalNoise(screenPos, pushConstants.noiseIndex);
+    float noiseSlice = noise.x;
+    float noiseSample = noise.y;
+
+    const float pixelTooCloseThreshold  = 1.3;
+    const vec2 pixelDirRBViewspaceSizeAtCenterZ = viewSpaceZM.xx * pushConstants.ndcToViewMul_x_PixelSize;
+
+    float screenspaceRadius = effectRadius / pixelDirRBViewspaceSizeAtCenterZ.x;
+    visibility += clamp((10 - screenspaceRadius)/100, 0, 1) * 0.5;
+
+    if(screenspaceRadius < pixelTooCloseThreshold)
+    {
+        visibility = 1;
+        const float XE_GTAO_OCCLUSION_TERM_SCALE = 1.5f;
+        visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
+        imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
+        // if outputting bent normals, need to write `viewNormal's` value to the buffer (i.e. no change to trajectory of normal)
+        return;
+    } else {
+        imageStore(debugImage, screenPos, vec4(vec3(1.0f, 1.0f, 0.0f), 1.0f));
+    }
 }
diff --git a/shaders/deferredResolve.comp b/shaders/deferredResolve.comp
index 3d7f2f3d..e04d7ee6 100644
--- a/shaders/deferredResolve.comp
+++ b/shaders/deferredResolve.comp
@@ -186,8 +186,9 @@ void main() {
             imageStore(outputImage, screenPos, vec4(vec3(dot(N, L)), 1.0));
             break;
         case 9:
-            vec4 viewPos = sceneData.view * vec4(position, 1.0f);
-            imageStore(outputImage, screenPos, vec4(-viewPos.xyz / 100.0f, 1.0f));
+            // vec4 viewPos = sceneData.view * vec4(position, 1.0f);
+            // imageStore(outputImage, screenPos, vec4(-viewPos.xyz / 100.0f, 1.0f));
+            imageStore(outputImage, screenPos, vec4(position.xyz / 1000.0f, 1.0f));
             break;
     }
 }
\ No newline at end of file
diff --git a/shaders/environment/environment.vert b/shaders/environment/environment.vert
index 6ea6da69..8b8d9aec 100644
--- a/shaders/environment/environment.vert
+++ b/shaders/environment/environment.vert
@@ -9,7 +9,7 @@ layout (location = 2) out vec4 outPrevMvpPosition;
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 void main() {
-    const vec3 vertices[3] = vec3[3](vec3(-1, -1, 0.0002), vec3(3, -1, 0.0002), vec3(-1, 3, 0.0002));
+    const vec3 vertices[3] = vec3[3](vec3(-1, -1, 0.000001), vec3(3, -1, 0.000001), vec3(-1, 3, 0.000001));
 
 
     vec4 currClipPos = vec4(vertices[gl_VertexIndex], 1);
diff --git a/src/core/camera/free_camera.cpp b/src/core/camera/free_camera.cpp
index 8d84ac26..9f6c8c71 100644
--- a/src/core/camera/free_camera.cpp
+++ b/src/core/camera/free_camera.cpp
@@ -18,6 +18,7 @@ void will_engine::FreeCamera::update(const float deltaTime)
     }
 
     glm::vec3 velocity{0.f};
+    float verticalVelocity{0.f};
 
     if (input.isKeyDown(SDLK_D)) {
         velocity.x += 1.0f;
@@ -26,10 +27,10 @@ void will_engine::FreeCamera::update(const float deltaTime)
         velocity.x -= 1.0f;
     }
     if (input.isKeyDown(SDLK_LCTRL)) {
-        velocity.y -= 1.0f;
+        verticalVelocity -= 1.0f;
     }
     if (input.isKeyDown(SDLK_SPACE)) {
-        velocity.y += 1.0f;
+        verticalVelocity += 1.0f;
     }
     // I guess vulkan is negative Z forward?!
     if (input.isKeyDown(SDLK_W)) {
@@ -54,6 +55,7 @@ void will_engine::FreeCamera::update(const float deltaTime)
     const auto currentSpeed = static_cast<float>(glm::pow(10, scale));
 
     velocity *= deltaTime * currentSpeed;
+    verticalVelocity *= deltaTime * currentSpeed;
 
     const float yaw = glm::radians(-input.getMouseXDelta() / 10.0f);
     const float pitch = glm::radians(-input.getMouseYDelta() / 10.0f);
@@ -74,7 +76,8 @@ void will_engine::FreeCamera::update(const float deltaTime)
     transform.setRotation(newRotation);
 
     const glm::mat4 rotationMatrix = getRotationMatrixWS();
-    const auto finalVelocity = glm::vec3(rotationMatrix * glm::vec4(velocity, 0.f));
+    auto finalVelocity = glm::vec3(rotationMatrix * glm::vec4(velocity, 0.f));
+    finalVelocity += glm::vec3(0.0f, verticalVelocity, 0.0f);
     transform.translate(finalVelocity);
 
     updateViewMatrix();
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index 2299ad81..a19eee4a 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -608,7 +608,7 @@ void Engine::draw(float deltaTime)
 
     vk_helpers::transitionImage(cmd, taaResolveTarget.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, postProcessOutputBuffer.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    postProcessPipeline->draw(cmd, post_process::PostProcessType::ALL);
+    postProcessPipeline->draw(cmd, post_process::PostProcessType::None);
 
     vk_helpers::transitionImage(cmd, postProcessOutputBuffer.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, swapchainImages[swapchainImageIndex], VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 30cb29ef..6985b044 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -15,12 +15,16 @@ static constexpr int32_t GTAO_DENOISE_PASSES = 1;
 
 struct GTAOPushConstants
 {
+    glm::vec2 cameraTanHalfFOV;
+
+    glm::vec2 ndcToViewMul;
+    glm::vec2 ndcToViewAdd;
+
+    glm::vec2 ndcToViewMul_x_PixelSize;
+
     float depthLinearizeMult;
     float depthLinearizeAdd;
 
-    float projectionParamX;
-    float projectionParamY;
-
     float effectRadius = 0.5f;
     float effectFalloffRange = 0.615f;
     float denoiseBlurBeta = (GTAO_DENOISE_PASSES == 0) ? (1e4f) : (1.2f);
@@ -30,7 +34,7 @@ struct GTAOPushConstants
     float thinOccluderCompensation = 0.0f;
     float finalValuePower = 2.2f;
     float depthMipSamplingOffset = 3.30f;
-    float noiseIndex;
+    uint32_t noiseIndex;
 };
 
 struct GTAODrawInfo
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 48ddb4db..e2dcc0dd 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -381,9 +381,18 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     glm::mat4 projMatrix = drawInfo.camera->getProjMatrix();
     push.depthLinearizeMult = -projMatrix[3][2];
     push.depthLinearizeAdd = projMatrix[2][2];
+    if (push.depthLinearizeMult * push.depthLinearizeAdd < 0) {
+        push.depthLinearizeAdd = -push.depthLinearizeAdd;
+    }
+
+    float tanHalfFOVY = 1.0f / projMatrix[1][1];
+    float tanHalfFOVX = 1.0F / projMatrix[0][0];
+    push.cameraTanHalfFOV = {tanHalfFOVX, tanHalfFOVY};
+    push.ndcToViewMul = {push.cameraTanHalfFOV.x * 2.0f, push.cameraTanHalfFOV.y * -2.0f};
+    push.ndcToViewAdd = {push.cameraTanHalfFOV.x * -1.0f, push.cameraTanHalfFOV.y * 1.0f};
+    constexpr glm::vec2 texelSize = {1.0f / RENDER_EXTENT_WIDTH, 1.0f / RENDER_EXTENT_HEIGHT};
+    push.ndcToViewMul_x_PixelSize = {push.ndcToViewMul.x * texelSize.x, push.ndcToViewMul.y * texelSize.y};
 
-    push.projectionParamX = projMatrix[0][0];
-    push.projectionParamY = projMatrix[1][1];
 
     push.noiseIndex = GTAO_DENOISE_PASSES > 0 ? drawInfo.currentFrame % 64 : 0;
 
@@ -442,7 +451,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
 
     vk_helpers::transitionImage(cmd, debugImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
-                            VK_IMAGE_ASPECT_COLOR_BIT);
+                                VK_IMAGE_ASPECT_COLOR_BIT);
 
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
diff --git a/src/renderer/post_process/post_process_types.h b/src/renderer/post_process/post_process_types.h
index ad01f53a..c00a95ea 100644
--- a/src/renderer/post_process/post_process_types.h
+++ b/src/renderer/post_process/post_process_types.h
@@ -11,10 +11,10 @@ namespace post_process
 {
 enum class PostProcessType : uint32_t
 {
-    None = 0,
+    None = 0x00000000,
     Tonemapping = 1 << 0,
     Sharpening = 1 << 1,
-    ALL = Tonemapping | Sharpening
+    ALL = 0xFFFFFFFF
 };
 
 }

From b09f345b5940e6223711bcbe1e06970528c092b9 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sat, 29 Mar 2025 23:30:24 +0700
Subject: [PATCH 19/27] GTAO main pass first try.

---
 .../ground_truth/gtao_main_pass.comp          | 252 +++++++++++++++---
 src/core/engine.cpp                           |   2 +-
 src/core/engine.h                             |   2 +-
 .../ambient_occlusion_types.h                 |   2 +-
 .../lighting/shadows/shadow_constants.cpp     |   8 +-
 .../lighting/shadows/shadow_constants.h       |   4 +-
 6 files changed, 230 insertions(+), 40 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 2917bfd5..3542ac86 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -38,6 +38,38 @@ layout (push_constant) uniform PushConstants {
 #define XE_HILBERT_WIDTH (1u << XE_HILBERT_LEVEL)
 #define XE_HILBERT_AREA (XE_HILBERT_WIDTH * XE_HILBERT_WIDTH)
 
+#define XE_GTAO_PI                (3.1415926535897932384626433832795)
+#define XE_GTAO_PI_HALF             (1.5707963267948966192313216916398)
+
+#define XE_GTAO_SLICE_COUNT_LOW                 1.0f
+#define XE_GTAO_SLICE_COUNT_MEDIUM              2.0f
+#define XE_GTAO_SLICE_COUNT_HIGH                3.0f
+#define XE_GTAO_SLICE_COUNT_ULTRA               9.0f
+
+#define XE_GTAO_STEPS_PER_SLICE_COUNT_LOW       2.0f
+#define XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM    2.0f
+#define XE_GTAO_STEPS_PER_SLICE_COUNT_HIGH      3.0f
+#define XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA     3.0f
+
+#define XE_GTAO_OCCLUSION_TERM_SCALE            1.5f
+
+// http://h14s.p5r.org/2012/09/0x5f3759df.html, [Drobot2014a] Low Level Optimizations for GCN, https://blog.selfshadow.com/publications/s2016-shading-course/activision/s2016_pbs_activision_occlusion.pdf slide 63
+float XeGTAO_FastSqrt(float x)
+{
+    return uintBitsToFloat(0x1fbd1df5 + (floatBitsToUint(x) >> 1)); // approximate sqrt: shift the float's raw bits right by one and add a bias constant; no sqrt() is evaluated
+}
+
+// input [-1, 1] and output [0, PI], from https://seblagarde.wordpress.com/2014/12/01/inverse-trigonometric-functions-gpu-optimization-for-amd-gcn-architecture/
+float XeGTAO_FastACos(float inX)
+{
+    const float PI = 3.141593; // NOTE(review): duplicates XE_GTAO_PI / XE_GTAO_PI_HALF defined earlier in this shader, at lower precision
+    const float HALF_PI = 1.570796;
+    float x = abs(inX); // evaluate on |inX|; negative inputs are mirrored in the return statement
+    float res = -0.156583 * x + HALF_PI; // linear factor of the approximation
+    res *= XeGTAO_FastSqrt(1.0 - x); // uses the approximate sqrt, so the result carries its error as well
+    return (inX >= 0) ? res : PI - res; // acos(-x) = PI - acos(x)
+}
+
 uint hilbertIndex(uint posX, uint posY)
 {
     uint index = 0u;
@@ -90,7 +122,7 @@ vec3 cheapReconstructViewSpacePosition(vec2 uv, const float viewspaceDepth, vec2
 }
 
 
-vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth, float depthLinearizeAdd, float depthLinearizeMult) {
+vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth) {
     float ndcDepth = pushConstants.depthLinearizeAdd - (pushConstants.depthLinearizeMult / viewDepth);
     uv.y = 1 - uv.y;
     vec2 ndc = uv * 2.0 - 1.0;
@@ -101,11 +133,19 @@ vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth, float depthLinearize
 }
 
 void outputWorkingTerm(ivec2 screenPos, float visibility, vec3 bentNormal, image2D outputImage){
-    const float XE_GTAO_OCCLUSION_TERM_SCALE = 1.5f;
     visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
     imageStore(outputImage, screenPos, vec4(visibility + 0.5f / 255.0f));
 }
 
+mat3 adjugate(mat4 m) {
+    return mat3( // each column is the cross product of the xyz parts of two of m's first three columns; m[3] and the w components are ignored
+    cross(m[1].xyz, m[2].xyz),
+    cross(m[2].xyz, m[0].xyz),
+    cross(m[0].xyz, m[1].xyz)
+    );
+
+}
+
 void main() {
     const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
 
@@ -127,24 +167,23 @@ void main() {
 
     // Get view space normal by sampling normal buffer and converting from world to view (code not relevant)
     vec3 worldNormal = texture(normalBuffer, uv).rgb;
-    vec3 viewNormal = mat3(sceneData.view) * worldNormal;
-    // imageStore(debugImage, screenPos, vec4(viewNormal * 0.5f + 0.5f, 1.0f));
+    vec3 viewNormal = adjugate(sceneData.view) * worldNormal;
 
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
-    viewSpaceZM = viewSpaceZM * 0.99920f;
+    viewSpaceZM = viewSpaceZM * 0.998f;
 
     vec3 vPos = cheapReconstructViewSpacePosition(uv, viewSpaceZM, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
-
+    //vec3 vPos = reconstructViewSpacePosition(uv, viewSpaceZM);
     vec3 viewVec = normalize(-vPos);
-    //imageStore(debugImage, screenPos, vec4(viewVec, 1.0f));
 
     // debug world pos
-    vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0)).xyz;
-    imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
+//    vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0)).xyz;
+//    imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
+//    return;
 
     // Per Intel
     // prevents normals that are facing away from the view vector - xeGTAO struggles with extreme cases, but in Vanilla it seems rare so it's disabled by default
-    // viewspaceNormal = normalize( viewspaceNormal + max( 0, -dot( viewspaceNormal, viewVec ) ) * viewVec );
+     viewNormal = normalize(viewNormal + max(0, -dot(viewNormal, viewVec)) * viewVec);
 
 
     const float effectRadius = pushConstants.effectRadius * pushConstants.radiusMultiplier;
@@ -155,33 +194,184 @@ void main() {
     const float falloffFrom = effectRadius * (1 - pushConstants.effectFalloffRange);
 
     // fadeout precompute optimisation
-    const float falloffMul = 1.0 / (falloffRange);
+    const float falloffMul = -1.0 / (falloffRange);
     const float falloffAdd = falloffFrom / (falloffRange) + 1.0;
 
     float visibility = 0;
-    // set bent normal to 0 if generating and outputting to buffer for use in deferred resolve
-    vec3 bentNormal = viewNormal;
+    vec3 bentNormal = vec3(0.0f);
+
+    {
+        // NOISE
+        vec2 noise = spatioTemporalNoise(screenPos, pushConstants.noiseIndex);
+        float noiseSlice = noise.x;
+        float noiseSample = noise.y;
 
-    // NOISE
-    vec2 noise = spatioTemporalNoise(screenPos, pushConstants.noiseIndex);
-    float noiseSlice = noise.x;
-    float noiseSample = noise.y;
+        const float pixelTooCloseThreshold  = 1.3;
+        const vec2 pixelDirRBViewspaceSizeAtCenterZ = viewSpaceZM.xx * pushConstants.ndcToViewMul_x_PixelSize;
 
-    const float pixelTooCloseThreshold  = 1.3;
-    const vec2 pixelDirRBViewspaceSizeAtCenterZ = viewSpaceZM.xx * pushConstants.ndcToViewMul_x_PixelSize;
+        float screenspaceRadius = effectRadius / pixelDirRBViewspaceSizeAtCenterZ.x;
 
-    float screenspaceRadius = effectRadius / pixelDirRBViewspaceSizeAtCenterZ.x;
-    visibility += clamp((10 - screenspaceRadius)/100, 0, 1) * 0.5;
+        visibility += clamp((10 - screenspaceRadius)/100, 0, 1) * 0.5;
 
-    if(screenspaceRadius < pixelTooCloseThreshold)
-    {
-        visibility = 1;
-        const float XE_GTAO_OCCLUSION_TERM_SCALE = 1.5f;
-        visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
-        imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
-        // if outputting bent normals, need to write `viewNormal's` value to the buffer (i.e. no change to trajectory of normal)
-        return;
-    } else {
-        imageStore(debugImage, screenPos, vec4(vec3(1.0f, 1.0f, 0.0f), 1.0f));
+        if (screenspaceRadius < pixelTooCloseThreshold)
+        {
+            visibility = 1;
+            visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
+            imageStore(debugImage, screenPos, vec4(vec3(visibility, 0, visibility), 1.0f));
+            // todo: look at how this will return 1/1.5 instead of 1 for visibility always?
+            // todo: (bent normal) need to write `viewNormal` value to the buffer (i.e. no change to trajectory of normal)
+            return;
+        }
+
+        const float minS = pixelTooCloseThreshold / screenspaceRadius;
+
+        float sliceCount = XE_GTAO_SLICE_COUNT_ULTRA;
+        float stepsPerSlice = XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA;
+
+        for (float slice = 0; slice < sliceCount; slice++){
+            float sliceK = (slice+noiseSlice) / sliceCount;
+            // lines 5, 6 from the paper
+            float phi = sliceK * XE_GTAO_PI;
+            float cosPhi = cos(phi);
+            float sinPhi = sin(phi);
+            vec2 omega = vec2(cosPhi, sinPhi);//lpfloat2 on omega causes issues with big radii
+
+            // convert to screen units (pixels) for later use
+            omega *= screenspaceRadius;
+
+            // line 8 from the paper
+            const vec3 directionVec = vec3(cosPhi, sinPhi, 0);
+
+            // line 9 from the paper
+            const vec3 orthoDirectionVec = directionVec - (dot(directionVec, viewVec) * viewVec);
+
+            // line 10 from the paper
+            //axisVec is orthogonal to directionVec and viewVec, used to define projectedNormal
+            const vec3 axisVec = normalize(cross(orthoDirectionVec, viewVec));
+
+            // alternative line 9 from the paper
+            // float3 orthoDirectionVec = cross( viewVec, axisVec );
+
+            // line 11 from the paper
+            vec3 projectedNormalVec = viewNormal - axisVec * dot(viewNormal, axisVec);
+
+            // line 13 from the paper
+            float signNorm = sign(dot(orthoDirectionVec, projectedNormalVec));
+
+            // line 14 from the paper
+            float projectedNormalVecLength = length(projectedNormalVec);
+            float cosNorm = clamp(dot(projectedNormalVec, viewVec) / projectedNormalVecLength, 0, 1);
+
+            // line 15 from the paper
+            float n = signNorm * XeGTAO_FastACos(cosNorm);
+
+            // this is a lower weight target; not using -1 as in the original paper because it is under horizon, so a 'weight' has different meaning based on the normal
+            const float lowHorizonCos0  = cos(n+XE_GTAO_PI_HALF);
+            const float lowHorizonCos1  = cos(n-XE_GTAO_PI_HALF);
+
+            // lines 17, 18 from the paper, manually unrolled the 'side' loop
+            float horizonCos0           = lowHorizonCos0;//-1;
+            float horizonCos1           = lowHorizonCos1;//-1;
+
+            for (float step = 0; step < stepsPerSlice; step++) {
+                // R1 sequence (http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences/)
+                const float stepBaseNoise = (slice + step * stepsPerSlice) * 0.6180339887498948482;// <- this should unroll
+                float stepNoise = fract(noiseSample + stepBaseNoise);
+
+                // approx line 20 from the paper, with added noise
+                float s = (step+stepNoise) / (stepsPerSlice);// + (lpfloat2)1e-6f);
+
+                // additional distribution modifier
+                s       = pow(s, sampleDistributionPower);
+
+                // avoid sampling center pixel
+                s       += minS;
+
+                // approx lines 21-22 from the paper, unrolled
+                vec2 sampleOffset = s * omega;
+
+                float sampleOffsetLength = length(sampleOffset);
+
+                const int XE_GTAO_DEPTH_MIP_LEVELS = 5;
+                // note: when sampling, using point_point_point or point_point_linear sampler works, but linear_linear_linear will cause unwanted interpolation between neighbouring depth values on the same MIP level!
+                const float mipLevel    = clamp(log2(sampleOffsetLength) - pushConstants.depthMipSamplingOffset, 0, XE_GTAO_DEPTH_MIP_LEVELS);
+
+                // Snap to pixel center (more correct direction math, avoids artifacts due to sampling pos not matching depth texel center - messes up slope - but adds other
+                // artifacts due to them being pushed off the slice). Also use full precision for high res cases.
+                sampleOffset = round(sampleOffset) * sceneData.texelSize;
+
+                vec2 sampleScreenPos0 = uv + sampleOffset;
+                float  SZ0 = textureLod(prefilteredDepth, sampleScreenPos0, mipLevel).r;
+                vec3 samplePos0 = cheapReconstructViewSpacePosition(sampleScreenPos0, SZ0, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
+                //vec3 samplePos0 = reconstructViewSpacePosition(sampleScreenPos0, SZ0);
+
+                vec2 sampleScreenPos1 = uv - sampleOffset;
+                float  SZ1 = textureLod(prefilteredDepth, sampleScreenPos1, mipLevel).r;
+                vec3 samplePos1 = cheapReconstructViewSpacePosition(sampleScreenPos1, SZ1, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
+                //vec3 samplePos1 = reconstructViewSpacePosition(sampleScreenPos1, SZ1);
+
+                vec3 sampleDelta0     = (samplePos0 - vec3(vPos));// using lpfloat for sampleDelta causes precision issues
+                vec3 sampleDelta1     = (samplePos1 - vec3(vPos));// using lpfloat for sampleDelta causes precision issues
+                float sampleDist0     = length(sampleDelta0);
+                float sampleDist1     = length(sampleDelta1);
+
+                // approx lines 23, 24 from the paper, unrolled
+                vec3 sampleHorizonVec0 = (sampleDelta0 / sampleDist0);
+                vec3 sampleHorizonVec1 = (sampleDelta1 / sampleDist1);
+
+
+                // this is our own thickness heuristic that relies on sooner discarding samples behind the center
+                float falloffBase0    = length(vec3(sampleDelta0.x, sampleDelta0.y, sampleDelta0.z * (1+thinOccluderCompensation)));
+                float falloffBase1    = length(vec3(sampleDelta1.x, sampleDelta1.y, sampleDelta1.z * (1+thinOccluderCompensation)));
+                float weight0         = clamp(falloffBase0 * falloffMul + falloffAdd, 0, 1);
+                float weight1         = clamp(falloffBase1 * falloffMul + falloffAdd, 0, 1);
+
+                // sample horizon cos
+                float shc0 = dot(sampleHorizonVec0, viewVec);
+                float shc1 = dot(sampleHorizonVec1, viewVec);
+
+                // discard unwanted samples
+                shc0 = mix(lowHorizonCos0, shc0, weight0);// this would be more correct but too expensive: cos(lerp( acos(lowHorizonCos0), acos(shc0), weight0 ));
+                shc1 = mix(lowHorizonCos1, shc1, weight1);// this would be more correct but too expensive: cos(lerp( acos(lowHorizonCos1), acos(shc1), weight1 ));
+
+
+                // this is a version where thicknessHeuristic is completely disabled
+                horizonCos0 = max(horizonCos0, shc0);
+                horizonCos1 = max(horizonCos1, shc1);
+            }
+
+            #if 1// I can't figure out the slight overdarkening on high slopes, so I'm adding this fudge - in the training set, 0.05 is close (PSNR 21.34) to disabled (PSNR 21.45)
+            projectedNormalVecLength = mix(projectedNormalVecLength, 1, 0.05);
+            #endif
+
+            // line ~27, unrolled
+            float h0 = -XeGTAO_FastACos(horizonCos1);
+            float h1 = XeGTAO_FastACos(horizonCos0);
+            float iarc0 = (cosNorm + 2 * h0 * sin(n) - cos(2 * h0 - n)) / 4;
+            float iarc1 = (cosNorm + 2 * h1 * sin(n) - cos(2 * h1 - n)) / 4;
+            float localVisibility = projectedNormalVecLength * (iarc0+iarc1);
+            visibility += localVisibility;
+
+            // todo: uncomment and fix if outputting bent normals
+            // see "Algorithm 2 Extension that computes bent normals b."
+            // lpfloat t0 = (6*sin(h0-n)-sin(3*h0-n)+6*sin(h1-n)-sin(3*h1-n)+16*sin(n)-3*(sin(h0+n)+sin(h1+n)))/12;
+            // lpfloat t1 = (-cos(3 * h0-n)-cos(3 * h1-n) +8 * cos(n)-3 * (cos(h0+n) +cos(h1+n)))/12;
+            // lpfloat3 localBentNormal = lpfloat3(directionVec.x * (lpfloat)t0, directionVec.y * (lpfloat)t0, -lpfloat(t1) );
+            // localBentNormal = (lpfloat3)mul(XeGTAO_RotFromToMatrix(lpfloat3(0, 0, -1), viewVec), localBentNormal ) * projectedNormalVecLength;
+            // bentNormal += localBentNormal;
+        }
+
+        visibility /= sliceCount;
+        visibility = pow(visibility, pushConstants.finalValuePower);
+        visibility = max(0.03, visibility);// disallow total occlusion (which wouldn't make any sense anyhow since pixel is visible but also helps with packing bent normals)
+
+        // todo (bent normal)
+        // bentNormal = normalize(bentNormal) ;
     }
+
+    // todo (bent normal)
+    visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
+    imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
+
+    //imageStore(debugImage, screenPos, vec4(viewNormal * 0.5f + 0.5f, 1.0f));
 }
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index a19eee4a..eb3af9ea 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -608,7 +608,7 @@ void Engine::draw(float deltaTime)
 
     vk_helpers::transitionImage(cmd, taaResolveTarget.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, postProcessOutputBuffer.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
-    postProcessPipeline->draw(cmd, post_process::PostProcessType::None);
+    postProcessPipeline->draw(cmd, post_process::PostProcessType::Sharpening);
 
     vk_helpers::transitionImage(cmd, postProcessOutputBuffer.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
     vk_helpers::transitionImage(cmd, swapchainImages[swapchainImageIndex], VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_ASPECT_COLOR_BIT);
diff --git a/src/core/engine.h b/src/core/engine.h
index 547d5b5c..f3308948 100644
--- a/src/core/engine.h
+++ b/src/core/engine.h
@@ -173,7 +173,7 @@ class Engine
     bool bEnableTaa{true};
     float taaBlendValue{0.1f};
     bool bEnableDebugFrustumCullDraw{false};
-    int32_t csmPcf{5};
+    int32_t csmPcf{1};
     int32_t deferredDebug{0};
     bool bDrawTerrainLines{false};
 
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 6985b044..1b96cc2b 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -34,7 +34,7 @@ struct GTAOPushConstants
     float thinOccluderCompensation = 0.0f;
     float finalValuePower = 2.2f;
     float depthMipSamplingOffset = 3.30f;
-    uint32_t noiseIndex;
+    uint32_t noiseIndex{0};
 };
 
 struct GTAODrawInfo
diff --git a/src/renderer/lighting/shadows/shadow_constants.cpp b/src/renderer/lighting/shadows/shadow_constants.cpp
index f20008ea..353619f1 100644
--- a/src/renderer/lighting/shadows/shadow_constants.cpp
+++ b/src/renderer/lighting/shadows/shadow_constants.cpp
@@ -5,8 +5,8 @@
 #include "shadow_constants.h"
 
 float will_engine::shadows::CASCADE_BIAS[SHADOW_CASCADE_COUNT][2] = {
-    {400.0f, 7.0f},
-    {350.0f, 6.0f},
-    {300.0f, 5.0f},
-    {250.0f, 5.0f},
+    {400.0f, 1.0f},
+    {350.0f, 1.0f},
+    {300.0f, 1.0f},
+    {250.0f, 1.0f},
 };
diff --git a/src/renderer/lighting/shadows/shadow_constants.h b/src/renderer/lighting/shadows/shadow_constants.h
index 65018b5f..0d04db8b 100644
--- a/src/renderer/lighting/shadows/shadow_constants.h
+++ b/src/renderer/lighting/shadows/shadow_constants.h
@@ -9,11 +9,11 @@
 
 namespace will_engine::shadows
 {
-static constexpr float LAMBDA = 0.5f;
+static constexpr float LAMBDA = 0.8f;
 static constexpr float OVERLAP = 1.05f;
 static constexpr uint32_t SHADOW_CASCADE_COUNT = 4;
 static constexpr float CASCADE_NEAR_PLANE = 0.1f;
-static constexpr float CASCADE_FAR_PLANE = 200.0f;
+static constexpr float CASCADE_FAR_PLANE = 1000.0f;
 extern float CASCADE_BIAS[SHADOW_CASCADE_COUNT][2];
 static constexpr int32_t CASCADE_WIDTH{2048};
 static constexpr int32_t CASCADE_HEIGHT{2048};

From 51bfdaf69c0fcccf367b12da3df2f295900445fd Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sun, 30 Mar 2025 16:39:19 +0700
Subject: [PATCH 20/27] GTAO spatial filter progress.

---
 .../ground_truth/gtao_depth_prefilter.comp    |  45 +++---
 .../ground_truth/gtao_main_pass.comp          |  71 ++++++----
 .../ground_truth/gtao_spatial_filter.comp     |  38 +++++-
 shaders/include/gtao.glsl                     |  49 +++++++
 src/core/engine.cpp                           |   5 +-
 src/core/engine.h                             |   1 +
 src/renderer/imgui_wrapper.cpp                |   6 +-
 .../ambient_occlusion_types.h                 |   2 +
 .../ground_truth_ambient_occlusion.cpp        | 128 +++++++++++++++---
 .../ground_truth_ambient_occlusion.h          |  13 +-
 src/renderer/renderer_constants.h             |   1 +
 11 files changed, 272 insertions(+), 87 deletions(-)
 create mode 100644 shaders/include/gtao.glsl

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index ec9bb0bc..0439a7a7 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -2,6 +2,7 @@
 #extension GL_EXT_nonuniform_qualifier: enable
 
 #include "scene.glsl"
+#include "gtao.glsl"
 
 layout(local_size_x = 8, local_size_y = 8) in;
 
@@ -17,29 +18,6 @@ layout (r16f, set = 1, binding = 4) uniform image2D outDepth3;
 layout (r16f, set = 1, binding = 5) uniform image2D outDepth4;
 layout (rgba8, set = 1, binding = 6) uniform image2D debugImage;
 
-layout (push_constant) uniform PushConstants {
-    vec2 cameraTanHalfFOV;
-
-    vec2 ndcToViewMul;
-    vec2 ndcToViewAdd;
-
-    vec2 ndcToViewMul_x_PixelSize;
-
-    float depthLinearizeMult;
-    float depthLinearizeAdd;
-
-    float effectRadius;
-    float effectFalloffRange;
-    float denoiseBlurBeta;
-
-    float radiusMultiplier;
-    float sampleDistributionPower;
-    float thinOccluderCompensation;
-    float finalValuePower;
-    float depthMipSamplingOffset;
-    uint noiseIndex;
-} pushConstants;
-
 float screenToViewSpaceDepth(float screenDepth, float depthLinearizeMul, float depthLinearizeAdd) {
     // Optimization by XeGTAO
     // https://github.com/GameTechDev/XeGTAO/blob/a5b1686c7ea37788eeb3576b5be47f7c03db532c/Source/Rendering/Shaders/XeGTAO.hlsli#L112
@@ -88,10 +66,16 @@ void main() {
 
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    float rDepth0 = texture(depthImage, uv + vec2(0.0, 0.0) * sceneData.texelSize).r;
-    float rDepth1 = texture(depthImage, uv + vec2(1.0, 0.0) * sceneData.texelSize).r;
-    float rDepth2 = texture(depthImage, uv + vec2(0.0, 1.0) * sceneData.texelSize).r;
-    float rDepth3 = texture(depthImage, uv + vec2(1.0, 1.0) * sceneData.texelSize).r;
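+    // Single gather for the 2x2 quad (replaces the four taps kept below). NOTE(review): uv is a texel centre, so the gathered footprint may be rounding-sensitive - confirm.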
+    vec4 depths = textureGather(depthImage, uv);
+    float rDepth0 = depths.w; // top-left
+    float rDepth1 = depths.z; // top-right
+    float rDepth2 = depths.x; // bottom-left
+    float rDepth3 = depths.y; // bottom-right
+//    float rDepth0 = texture(depthImage, uv + vec2(0.0, 0.0) * sceneData.texelSize).r;
+//    float rDepth1 = texture(depthImage, uv + vec2(1.0, 0.0) * sceneData.texelSize).r;
+//    float rDepth2 = texture(depthImage, uv + vec2(0.0, 1.0) * sceneData.texelSize).r;
+//    float rDepth3 = texture(depthImage, uv + vec2(1.0, 1.0) * sceneData.texelSize).r;
 
     float depth0 = clampDepth(screenToViewSpaceDepth(rDepth0, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
     float depth1 = clampDepth(screenToViewSpaceDepth(rDepth1, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
@@ -103,6 +87,13 @@ void main() {
     imageStore(outDepth0, screenPos + ivec2(0, 1), vec4(depth2));
     imageStore(outDepth0, screenPos + ivec2(1, 1), vec4(depth3));
 
+    if (pushConstants.debug == 1){
+        imageStore(debugImage, screenPos + ivec2(0, 0), vec4(vec3(depth0 / 1000.0f), 1.0f));
+        imageStore(debugImage, screenPos + ivec2(1, 0), vec4(vec3(depth1 / 1000.0f), 1.0f));
+        imageStore(debugImage, screenPos + ivec2(0, 1), vec4(vec3(depth2 / 1000.0f), 1.0f));
+        imageStore(debugImage, screenPos + ivec2(1, 1), vec4(vec3(depth3 / 1000.0f), 1.0f));
+        return;
+    }
 
     // MIP 1
     float dm1 = depthMipFilter(depth0, depth1, depth2, depth3, pushConstants.effectRadius, pushConstants.radiusMultiplier, pushConstants.effectFalloffRange);
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 3542ac86..867b91fb 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -1,6 +1,7 @@
 #version 460
 
 #include "scene.glsl"
+#include "gtao.glsl"
 
 layout(local_size_x = 16, local_size_y = 16) in;
 
@@ -9,30 +10,9 @@ layout(local_size_x = 16, local_size_y = 16) in;
 layout (set = 1, binding = 0) uniform sampler2D prefilteredDepth;
 layout (set = 1, binding = 1) uniform sampler2D normalBuffer;
 layout (r8, set = 1, binding = 2) uniform image2D aoOutput;
-layout (rgba8, set = 1, binding = 3) uniform image2D debugImage;
+layout (r8, set = 1, binding = 3) uniform image2D edgeDataOutput;
+layout (rgba8, set = 1, binding = 4) uniform image2D debugImage;
 
-layout (push_constant) uniform PushConstants {
-    vec2 cameraTanHalfFOV;
-
-    vec2 ndcToViewMul;
-    vec2 ndcToViewAdd;
-
-    vec2 ndcToViewMul_x_PixelSize;
-
-    float depthLinearizeMult;
-    float depthLinearizeAdd;
-
-    float effectRadius;
-    float effectFalloffRange;
-    float denoiseBlurBeta;
-
-    float radiusMultiplier;
-    float sampleDistributionPower;
-    float thinOccluderCompensation;
-    float finalValuePower;
-    float depthMipSamplingOffset;
-    uint noiseIndex;
-} pushConstants;
 
 #define XE_HILBERT_LEVEL 6u
 #define XE_HILBERT_WIDTH (1u << XE_HILBERT_LEVEL)
@@ -161,14 +141,35 @@ void main() {
     float viewSpaceZT = textureLod(prefilteredDepth, uv + vec2(0.0, 1.0) * sceneData.texelSize, 0).r;
     float viewSpaceZB = textureLod(prefilteredDepth, uv + vec2(0.0, -1.0) * sceneData.texelSize, 0).r;
 
+    // Theoretically Equivalent, but visibly more noisy for some reason.
+    // vec4 leftGather = textureGather(prefilteredDepth, uv + vec2(-1.0, 0.0) * sceneData.texelSize, 0);
+    // float viewSpaceZL = leftGather.x;
+    // float viewSpaceZT = leftGather.w;
+    // float viewSpaceZM = leftGather.y;
+    // vec4 bottomGather = textureGather(prefilteredDepth, uv + vec2(0.0, -1.0) * sceneData.texelSize, 0);
+    // float viewSpaceZB = bottomGather.x;
+    // float viewSpaceZR = bottomGather.w;
+
     vec4 edges  = calculateDepthEdges(viewSpaceZM, viewSpaceZL, viewSpaceZR, viewSpaceZT, viewSpaceZB);
+    float packedEdges = XeGTAO_PackEdges(edges);
+    imageStore(edgeDataOutput, screenPos, vec4(packedEdges));
+
     float minEdge = min(min(edges.x, edges.y), min(edges.z, edges.w));
-    // imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
+
+    if (pushConstants.debug == 2){
+        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
+        return;
+    }
 
     // Get view space normal by sampling normal buffer and converting from world to view (code not relevant)
     vec3 worldNormal = texture(normalBuffer, uv).rgb;
     vec3 viewNormal = adjugate(sceneData.view) * worldNormal;
 
+    if (pushConstants.debug == 2){
+        imageStore(debugImage, screenPos, vec4(viewNormal, 1.0f));
+        return;
+    }
+
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
     viewSpaceZM = viewSpaceZM * 0.998f;
 
@@ -177,9 +178,11 @@ void main() {
     vec3 viewVec = normalize(-vPos);
 
     // debug world pos
-//    vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0)).xyz;
-//    imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
-//    return;
+    if (pushConstants.debug == 3){
+        vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0)).xyz;
+        imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
+        return;
+    }
 
     // Per Intel
     // prevents normals that are facing away from the view vector - xeGTAO struggles with extreme cases, but in Vanilla it seems rare so it's disabled by default
@@ -217,7 +220,12 @@ void main() {
         {
             visibility = 1;
             visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
-            imageStore(debugImage, screenPos, vec4(vec3(visibility, 0, visibility), 1.0f));
+            if (pushConstants.debug == 4){
+                imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
+                return;
+            }
+
+            imageStore(aoOutput, screenPos, vec4(visibility));
             // todo: look at how this will return 1/1.5 instead of 1 for visibility always?
             // todo: (bent normal) need to write `viewNormal` value to the buffer (i.e. no change to trajectory of normal)
             return;
@@ -371,7 +379,10 @@ void main() {
 
     // todo (bent normal)
     visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
-    imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
+    if (pushConstants.debug == 4){
+        imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
+        return;
+    }
 
-    //imageStore(debugImage, screenPos, vec4(viewNormal * 0.5f + 0.5f, 1.0f));
+    imageStore(aoOutput, screenPos, vec4(visibility));
 }
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
index ee6acbe2..6bba912a 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
@@ -1,14 +1,46 @@
 #version 460
 
 #include "scene.glsl"
+#include "gtao.glsl"
+
+layout(local_size_x = 16, local_size_y = 16) in;
 
 // layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
 
 layout (set = 1, binding = 0) uniform sampler2D rawAO;
-layout (set = 1, binding = 1) uniform sampler2D depthBuffer;
-layout (set = 1, binding = 2) uniform sampler2D normalBuffer;
-layout (r8, set = 1, binding = 3) uniform image2D filteredAO;
+layout (set = 1, binding = 1) uniform sampler2D edgeData;
+layout (r8, set = 1, binding = 2) uniform image2D filteredAO;
+layout (rgba8, set = 1, binding = 3) uniform image2D debugImage;
 
 void main() {
+    const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
+    // GTAO spatial filter pass (work in progress): so far it only unpacks the edge data and shows it through debugImage; filteredAO is not written yet.
+    if (screenPos.x >= sceneData.renderTargetSize.x || screenPos.y >= sceneData.renderTargetSize.y) { // valid texel indices are 0 .. size - 1, so an index equal to the size is already out of range
+        return;
+    }
+
+    vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
+
+    float blurAmount = pushConstants.denoiseBlurBeta / 5.0f;
+    float diagWeight = 0.85 * 0.5;
+
+    // each dispatch operates on 2x1 pixels (not implemented yet: screenPos currently addresses one pixel per invocation)
+
+    vec4 edges = XeGTAO_UnpackEdges(texture(edgeData, uv).r);
+    float minEdge = min(min(edges.x, edges.y), min(edges.z, edges.w));
+    // Debug view 5: smallest of the four unpacked edge weights as greyscale. NOTE(review): this branch returns first, so the second debug == 5 branch below never runs.
+    if (pushConstants.debug == 5){
+        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
+        return;
+    }
 
+    if (pushConstants.debug == 5) {
+        imageStore(debugImage, screenPos, vec4(edges.xyz, 1.0f));
+        return;
+        if (screenPos.x % 2 == 0 && screenPos.y %2 == 0){
+            imageStore(debugImage, screenPos, vec4(vec3(1.0f), 1.0f));
+        } else {
+            imageStore(debugImage, screenPos, vec4(vec3(0.5f), 1.0f));
+        }
+    }
 }
diff --git a/shaders/include/gtao.glsl b/shaders/include/gtao.glsl
new file mode 100644
index 00000000..c51b59b5
--- /dev/null
+++ b/shaders/include/gtao.glsl
@@ -0,0 +1,49 @@
+layout (push_constant) uniform PushConstants { // GTAO push constants, shared by the depth prefilter, main and spatial filter shaders that include this file
+    vec2 cameraTanHalfFOV;
+
+    vec2 ndcToViewMul;
+    vec2 ndcToViewAdd;
+
+    vec2 ndcToViewMul_x_PixelSize;
+
+    float depthLinearizeMult;
+    float depthLinearizeAdd;
+
+    float effectRadius;
+    float effectFalloffRange;
+    float denoiseBlurBeta;
+
+    float radiusMultiplier;
+    float sampleDistributionPower;
+    float thinOccluderCompensation;
+    float finalValuePower;
+    float depthMipSamplingOffset;
+    uint noiseIndex;
+
+    int debug; // debug view selector checked by the shaders: 1 = prefiltered depth, 2 = depth edges, 3 = world position, 4 = AO visibility, 5 = unpacked edges
+} pushConstants; // NOTE(review): presumably must mirror the field order of the C++ GTAOPushConstants struct - confirm when adding members
+
+
+// Edge packing/unpacking: each of the four edge weights (x, y, z, w of edgesLRTB) is quantised to 2 bits, which gives 4 gradient values (0, 0.33, 0.66, 1) for smoother transitions.
+float XeGTAO_PackEdges(vec4 edgesLRTB)
+{
+    // integer version:
+    // edgesLRTB = saturate(edgesLRTB) * 2.9.xxxx + 0.5.xxxx;
+    // return (((uint)edgesLRTB.x) << 6) + (((uint)edgesLRTB.y) << 4) + (((uint)edgesLRTB.z) << 2) + (((uint)edgesLRTB.w));
+    // (this port returns that 8-bit code divided by 255, i.e. a float in [0, 1], instead of a uint)
+    // optimized, should be same as above
+    edgesLRTB = round(clamp(edgesLRTB, 0, 1) * 2.9);
+    return dot(edgesLRTB, vec4(64.0 / 255.0, 16.0 / 255.0, 4.0 / 255.0, 1.0 / 255.0));
+}
+
+vec4 XeGTAO_UnpackEdges(float _packedVal) // inverse of XeGTAO_PackEdges: splits the packed value into four 2-bit fields and rescales each to [0, 1]
+{
+    int packedVal = int(_packedVal * 255.5f); // recover the 8-bit code; scaling by 255.5 rather than 255 presumably keeps truncation from dropping a level when the input arrives slightly low - TODO confirm
+    vec4 edgesLRTB;
+    edgesLRTB.x = float((packedVal >> 6) & 0x03) / 3.0;// there's really no need for mask (as it's an 8 bit input) but I'll leave it in so it doesn't cause any trouble in the future
+    edgesLRTB.y = float((packedVal >> 4) & 0x03) / 3.0;
+    edgesLRTB.z = float((packedVal >> 2) & 0x03) / 3.0;
+    edgesLRTB.w = float((packedVal >> 0) & 0x03) / 3.0;
+
+    return clamp(edgesLRTB, 0, 1);
+}
\ No newline at end of file
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index eb3af9ea..c0137938 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -208,6 +208,7 @@ void Engine::initRenderer()
 
     ambientOcclusionPipeline->setupDepthPrefilterDescriptorBuffer(depthImage.imageView);
     ambientOcclusionPipeline->setupAmbientOcclusionDescriptorBuffer(normalRenderTarget.imageView);
+    ambientOcclusionPipeline->setupSpatialFilteringDescriptorBuffer(depthImage.imageView, normalRenderTarget.imageView);
 
     const deferred_resolve::DeferredResolveDescriptor deferredResolveDescriptor{
         normalRenderTarget.imageView,
@@ -565,9 +566,11 @@ void Engine::draw(float deltaTime)
     vk_helpers::transitionImage(cmd, depthImage.image, VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_IMAGE_ASPECT_DEPTH_BIT);
     vk_helpers::transitionImage(cmd, drawImage.image, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
+    ambient_occlusion::GTAOPushConstants gtaoPush{};
+    gtaoPush.debug = gtaoDebug;
     ambient_occlusion::GTAODrawInfo gtaoDrawInfo{
         camera,
-        {},
+        gtaoPush,
         frameNumber,
         sceneDataDescriptorBuffer.getDescriptorBufferBindingInfo(),
         sceneDataDescriptorBuffer.getDescriptorBufferSize() * currentFrameOverlap
diff --git a/src/core/engine.h b/src/core/engine.h
index f3308948..da6d9894 100644
--- a/src/core/engine.h
+++ b/src/core/engine.h
@@ -175,6 +175,7 @@ class Engine
     bool bEnableDebugFrustumCullDraw{false};
     int32_t csmPcf{1};
     int32_t deferredDebug{0};
+    int32_t gtaoDebug{5};
     bool bDrawTerrainLines{false};
 
     void hotReloadShaders() const;
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index 1a5e17ae..603fd10d 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -877,13 +877,11 @@ void ImguiWrapper::imguiInterface(Engine* engine)
     ImGui::End();
 
     if (ImGui::Begin("Discardable Debug")) {
-        static int32_t gtaoMip;
-        constexpr uint32_t minMip = 0;
-        constexpr uint32_t maxMip = 4;
+        ImGui::InputInt("GTAO Debug", &engine->gtaoDebug);
 
         if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
             if (engine->ambientOcclusionPipeline->debugImage.image != VK_NULL_HANDLE) {
-                aoDebugTextureImguiId = ImGui_ImplVulkan_AddTexture(engine->resourceManager->getDefaultSamplerLinear(),
+                aoDebugTextureImguiId = ImGui_ImplVulkan_AddTexture(engine->resourceManager->getDefaultSamplerNearest(),
                                                                     engine->ambientOcclusionPipeline->debugImage.imageView,
                                                                     VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
             }
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 1b96cc2b..5d3f0369 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -35,6 +35,8 @@ struct GTAOPushConstants
     float finalValuePower = 2.2f;
     float depthMipSamplingOffset = 3.30f;
     uint32_t noiseIndex{0};
+
+    int32_t debug{0};
 };
 
 struct GTAODrawInfo
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index e2dcc0dd..3c7fbf0e 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -64,7 +64,6 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         VkImageUsageFlags usage{};
         usage |= VK_IMAGE_USAGE_STORAGE_BIT;
         usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
-        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
         usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
 
         VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(depthPrefilterFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
@@ -82,8 +81,17 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         VkSamplerCreateInfo samplerInfo = {.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO};
         samplerInfo.magFilter = VK_FILTER_NEAREST;
         samplerInfo.minFilter = VK_FILTER_NEAREST;
-
-        depthPrefilterSampler = resourceManager.createSampler(samplerInfo);
+        samplerInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+        samplerInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        samplerInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        samplerInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
+        samplerInfo.anisotropyEnable = VK_FALSE;
+        samplerInfo.maxAnisotropy = 1.0f;
+        samplerInfo.compareEnable = VK_FALSE;
+        samplerInfo.minLod = 0.0f;
+        samplerInfo.maxLod = 0.0f;
+
+        depthSampler = resourceManager.createSampler(samplerInfo);
     }
 
     // AO Calculation
@@ -92,7 +100,8 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // pre-filtered depth
         layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT normal buffer
         layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // ao output
-        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // debug image
+        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // edge data output
+        layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // debug image
 
         ambientOcclusionSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
                                                                               VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
@@ -122,12 +131,18 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         VkImageUsageFlags usage{};
         usage |= VK_IMAGE_USAGE_STORAGE_BIT;
         usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
-        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
-        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
 
         VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
         ambientOcclusionImage = resourceManager.createImage(imgInfo);
 
+        usage = {};
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+
+        imgInfo = vk_helpers::imageCreateInfo(edgeDataFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
+        edgeDataImage = resourceManager.createImage(imgInfo);
+
+
         // Depth Mip sampler
         {
             VkSamplerCreateInfo samplerInfo = {.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO};
@@ -143,7 +158,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
             samplerInfo.minLod = 0.0f;
             samplerInfo.maxLod = DEPTH_PREFILTER_MIP_COUNT - 1;
 
-            ambientOcclusionDepthSampler = resourceManager.createSampler(samplerInfo);
+            depthPrefilterSampler = resourceManager.createSampler(samplerInfo);
         }
 
         // Normals sampler
@@ -161,7 +176,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
             samplerInfo.minLod = 0.0f;
             samplerInfo.maxLod = 0.0f;
 
-            ambientOcclusionNormalsSampler = resourceManager.createSampler(samplerInfo);
+            normalsSampler = resourceManager.createSampler(samplerInfo);
         }
     }
 
@@ -169,9 +184,9 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
     {
         DescriptorLayoutBuilder layoutBuilder;
         layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // raw ao
-        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT depth buffer
-        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT normal buffer
-        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // filtered ao
+        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // edge data
+        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // filtered ao
+        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // debug image
 
         spatialFilteringSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
                                                                               VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
@@ -271,7 +286,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTrut
     }
 
     resourceManager.destroyImage(depthPrefilterImage);
-    resourceManager.destroySampler(depthPrefilterSampler);
+    resourceManager.destroySampler(depthSampler);
 
     resourceManager.destroyDescriptorBuffer(depthPrefilterDescriptorBuffer);
 
@@ -280,9 +295,10 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTrut
     resourceManager.destroyPipelineLayout(ambientOcclusionPipelineLayout);
     resourceManager.destroyPipeline(ambientOcclusionPipeline);
 
-    resourceManager.destroySampler(ambientOcclusionDepthSampler);
-    resourceManager.destroySampler(ambientOcclusionNormalsSampler);
+    resourceManager.destroySampler(depthPrefilterSampler);
+    resourceManager.destroySampler(normalsSampler);
     resourceManager.destroyImage(ambientOcclusionImage);
+    resourceManager.destroyImage(edgeDataImage);
 
     resourceManager.destroyDescriptorBuffer(ambientOcclusionDescriptorBuffer);
 
@@ -315,7 +331,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupD
     imageDescriptors.push_back(
         {
             VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            {resourceManager.getDefaultSamplerLinear(), depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            {depthSampler, depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
             false
         }
     );
@@ -338,7 +354,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupD
     resourceManager.setupDescriptorBufferSampler(depthPrefilterDescriptorBuffer, imageDescriptors, 0);
 }
 
-void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView)
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupAmbientOcclusionDescriptorBuffer(const VkImageView& normalsImageView)
 {
     std::vector<DescriptorImageData> imageDescriptors{};
     imageDescriptors.reserve(4);
@@ -346,13 +362,13 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupA
     imageDescriptors.push_back(
         {
             VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            {ambientOcclusionDepthSampler, depthPrefilterImage.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            {depthPrefilterSampler, depthPrefilterImage.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
             false
         });
     imageDescriptors.push_back(
         {
             VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
-            {ambientOcclusionNormalsSampler, normalsImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            {normalsSampler, normalsImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
             false
         });
     imageDescriptors.push_back(
@@ -361,6 +377,12 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupA
             {VK_NULL_HANDLE, ambientOcclusionImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
             false
         });
+    imageDescriptors.push_back(
+        {
+            VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            {VK_NULL_HANDLE, edgeDataImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+            false
+        });
     imageDescriptors.push_back({
         VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
         {VK_NULL_HANDLE, debugImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
@@ -370,6 +392,50 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupA
     resourceManager.setupDescriptorBufferSampler(ambientOcclusionDescriptorBuffer, imageDescriptors, 0);
 }
 
+// Builds the image descriptors for the spatial filtering pass and hands them to resourceManager.setupDescriptorBufferSampler
+// for spatialFilteringDescriptorBuffer. Entries are pushed in the order of bindings 0-3 of spatialFilteringSetLayout (raw ao,
+// edge data, filtered ao, debug image); presumably entry i lands in binding i - confirm against setupDescriptorBufferSampler.
+// depthImageView and normalsImageView are not read by any active entry; only the disabled entries below mention them.
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupSpatialFilteringDescriptorBuffer(const VkImageView& depthImageView,
+    const VkImageView& normalsImageView)
+{
+    std::vector<DescriptorImageData> imageDescriptors{};
+    // NOTE(review): four entries are pushed below, so this reserves one slot more than is used.
+    imageDescriptors.reserve(5);
+
+    // Raw AO: combined image sampler using depthSampler, read in the shader-read-only layout.
+    imageDescriptors.push_back(
+        {
+            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            {depthSampler, ambientOcclusionImage.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            false
+        });
+    // Disabled depth / normals inputs, kept for reference:
+    // imageDescriptors.push_back({VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, {depthSampler, depthImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, false});
+    // imageDescriptors.push_back({VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, {normalsSampler, normalsImageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, false});
+    // Edge data: combined image sampler using depthSampler, read in the shader-read-only layout.
+    imageDescriptors.push_back(
+        {
+            VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            {depthSampler, edgeDataImage.imageView, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL},
+            false
+        });
+    // Filtered AO output: storage image in the general layout, no sampler.
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+        {VK_NULL_HANDLE, spatialFilteringImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+        false
+    });
+    // Debug image: storage image in the general layout, no sampler.
+    imageDescriptors.push_back({
+        VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+        {VK_NULL_HANDLE, debugImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+        false
+    });
+
+    resourceManager.setupDescriptorBufferSampler(spatialFilteringDescriptorBuffer, imageDescriptors, 0);
+}
+
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const
 {
     VkDebugUtilsLabelEXT label{};
@@ -448,6 +514,32 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
     vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                 VK_IMAGE_ASPECT_COLOR_BIT);
+    vk_helpers::transitionImage(cmd, spatialFilteringImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
+                                VK_IMAGE_ASPECT_COLOR_BIT);
+    // Spatial Filtering
+    {
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, spatialFilteringPipeline);
+        vkCmdPushConstants(cmd, spatialFilteringPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
+
+        VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
+        bindingInfos[0] = drawInfo.sceneDataBinding;
+        bindingInfos[1] = spatialFilteringDescriptorBuffer.getDescriptorBufferBindingInfo();
+        vkCmdBindDescriptorBuffersEXT(cmd, 2, bindingInfos);
+
+        constexpr std::array<uint32_t, 2> indices{0, 1};
+        const std::array offsets{drawInfo.sceneDataOffset, ZERO_DEVICE_SIZE};
+
+        vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, spatialFilteringPipelineLayout, 0, 2, indices.data(), offsets.data());
+
+        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+        const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
+        vkCmdDispatch(cmd, x, y, 1);
+    }
+
+    vk_helpers::transitionImage(cmd, spatialFilteringImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                    VK_IMAGE_ASPECT_COLOR_BIT);
+
+
 
 
     vk_helpers::transitionImage(cmd, debugImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index c243313f..ffefe4c8 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -23,7 +23,9 @@ class GroundTruthAmbientOcclusionPipeline
 
     void setupDepthPrefilterDescriptorBuffer(const VkImageView& depthImageView);
 
-    void setupAmbientOcclusionDescriptorBuffer(VkImageView normalsImageView);
+    void setupAmbientOcclusionDescriptorBuffer(const VkImageView& normalsImageView);
+
+    void setupSpatialFilteringDescriptorBuffer(const VkImageView& depthImageView, const VkImageView& normalsImageView);
 
     void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const;
 
@@ -39,7 +41,7 @@ class GroundTruthAmbientOcclusionPipeline
     VkPipelineLayout depthPrefilterPipelineLayout{VK_NULL_HANDLE};
     VkPipeline depthPrefilterPipeline{VK_NULL_HANDLE};
 
-    VkSampler depthPrefilterSampler{VK_NULL_HANDLE};
+    VkSampler depthSampler{VK_NULL_HANDLE};
 
     // 16 vs 32. look at cost later.
     VkFormat depthPrefilterFormat{VK_FORMAT_R16_SFLOAT};
@@ -53,13 +55,16 @@ class GroundTruthAmbientOcclusionPipeline
     VkPipelineLayout ambientOcclusionPipelineLayout{VK_NULL_HANDLE};
     VkPipeline ambientOcclusionPipeline{VK_NULL_HANDLE};
 
-    VkSampler ambientOcclusionDepthSampler{VK_NULL_HANDLE};
-    VkSampler ambientOcclusionNormalsSampler{VK_NULL_HANDLE};
+    VkSampler depthPrefilterSampler{VK_NULL_HANDLE};
+    VkSampler normalsSampler{VK_NULL_HANDLE};
 
     // 8 is supposedly enough?
     VkFormat ambientOcclusionFormat{VK_FORMAT_R8_UNORM};
     AllocatedImage ambientOcclusionImage{VK_NULL_HANDLE};
 
+    VkFormat edgeDataFormat{VK_FORMAT_R8_UNORM};
+    AllocatedImage edgeDataImage{VK_NULL_HANDLE};
+
     DescriptorBufferSampler ambientOcclusionDescriptorBuffer;
 
 private: // Spatial Filtering
diff --git a/src/renderer/renderer_constants.h b/src/renderer/renderer_constants.h
index 574825e0..54db7310 100644
--- a/src/renderer/renderer_constants.h
+++ b/src/renderer/renderer_constants.h
@@ -11,6 +11,7 @@ constexpr char ENGINE_NAME[] = "Will Engine";
 constexpr bool USING_REVERSED_DEPTH_BUFFER = true;
 constexpr VkDeviceSize ZERO_DEVICE_SIZE = 0;
 constexpr VkExtent2D RENDER_EXTENTS{1920, 1080};
+//constexpr VkExtent2D RENDER_EXTENTS{3840, 2160};
 constexpr float RENDER_EXTENT_WIDTH{RENDER_EXTENTS.width};
 constexpr float RENDER_EXTENT_HEIGHT{RENDER_EXTENTS.height};
 

From c70f7f47be8e7ab8adeeef7b4582ed6f1da4b02a Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Sun, 30 Mar 2025 17:05:01 +0700
Subject: [PATCH 21/27] Fix texture gathers.

---
 .../ground_truth/gtao_depth_prefilter.comp    |  2 +-
 .../ground_truth/gtao_main_pass.comp          | 21 +++++++------------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index 0439a7a7..a096d70a 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -67,7 +67,7 @@ void main() {
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
     // todo: optimize with textureGather?
-    vec4 depths = textureGather(depthImage, uv);
+    vec4 depths = textureGatherOffset(depthImage,screenPos * sceneData.texelSize, ivec2(1,1), 0);
     float rDepth0 = depths.w; // top-left
     float rDepth1 = depths.z; // top-right
     float rDepth2 = depths.x; // bottom-left
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 867b91fb..eabc759a 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -135,20 +135,13 @@ void main() {
 
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    float viewSpaceZM = textureLod(prefilteredDepth, uv + vec2(0.0, 0.0) * sceneData.texelSize, 0).r;
-    float viewSpaceZL = textureLod(prefilteredDepth, uv + vec2(-1.0, 0.0) * sceneData.texelSize, 0).r;
-    float viewSpaceZR = textureLod(prefilteredDepth, uv + vec2(1.0, 0.0) * sceneData.texelSize, 0).r;
-    float viewSpaceZT = textureLod(prefilteredDepth, uv + vec2(0.0, 1.0) * sceneData.texelSize, 0).r;
-    float viewSpaceZB = textureLod(prefilteredDepth, uv + vec2(0.0, -1.0) * sceneData.texelSize, 0).r;
-
-    // Theoretically Equivalent, but visibly more noisy for some reason.
-    // vec4 leftGather = textureGather(prefilteredDepth, uv + vec2(-1.0, 0.0) * sceneData.texelSize, 0);
-    // float viewSpaceZL = leftGather.x;
-    // float viewSpaceZT = leftGather.w;
-    // float viewSpaceZM = leftGather.y;
-    // vec4 bottomather = textureGather(prefilteredDepth, uv + vec2(0.0, -1.0) * sceneData.texelSize, 0);
-    // float viewSpaceZB = bottomather.x;
-    // float viewSpaceZR = bottomather.w;
+    vec4 valuesUL = textureGather(prefilteredDepth, screenPos * sceneData.texelSize, 0);
+    vec4 valuesBR = textureGatherOffset(prefilteredDepth, screenPos * sceneData.texelSize, ivec2(1,1), 0);
+    float viewSpaceZM = valuesUL.y;
+    const float viewSpaceZL = valuesUL.x;
+    const float viewSpaceZR = valuesUL.z;
+    const float viewSpaceZT = valuesBR.z;
+    const float viewSpaceZB = valuesBR.x;
 
     vec4 edges  = calculateDepthEdges(viewSpaceZM, viewSpaceZL, viewSpaceZR, viewSpaceZT, viewSpaceZB);
     float packedEdges = XeGTAO_PackEdges(edges);

From b3b64c27f7254400f329d9a50d75f3261dec217b Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 31 Mar 2025 14:13:15 +0700
Subject: [PATCH 22/27] GTAO denoise pass.

---
 .../ground_truth/gtao_depth_prefilter.comp    |   2 +-
 .../ground_truth/gtao_main_pass.comp          |   6 +-
 .../ground_truth/gtao_spatial_filter.comp     | 121 +++++++++++++++---
 shaders/include/gtao.glsl                     |   3 +-
 .../ambient_occlusion_types.h                 |   1 +
 .../ground_truth_ambient_occlusion.cpp        |   3 +-
 6 files changed, 115 insertions(+), 21 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index a096d70a..f41ea7d2 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -67,7 +67,7 @@ void main() {
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
     // todo: optimize with textureGather?
-    vec4 depths = textureGatherOffset(depthImage,screenPos * sceneData.texelSize, ivec2(1,1), 0);
+    vec4 depths = textureGatherOffset(depthImage, uv, ivec2(1,1), 0);
     float rDepth0 = depths.w; // top-left
     float rDepth1 = depths.z; // top-right
     float rDepth2 = depths.x; // bottom-left
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index eabc759a..f16bf251 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -135,8 +135,10 @@ void main() {
 
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    vec4 valuesUL = textureGather(prefilteredDepth, screenPos * sceneData.texelSize, 0);
-    vec4 valuesBR = textureGatherOffset(prefilteredDepth, screenPos * sceneData.texelSize, ivec2(1,1), 0);
+    //vec4 valuesUL = textureGather(prefilteredDepth, screenPos * sceneData.texelSize, 0);
+    //vec4 valuesBR = textureGatherOffset(prefilteredDepth, screenPos * sceneData.texelSize, ivec2(1,1), 0);
+    vec4 valuesUL = textureGather(prefilteredDepth, uv, 0);
+    vec4 valuesBR = textureGatherOffset(prefilteredDepth, uv, ivec2(1,1), 0);
     float viewSpaceZM = valuesUL.y;
     const float viewSpaceZL = valuesUL.x;
     const float viewSpaceZR = valuesUL.z;
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
index 6bba912a..4dcaf681 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
@@ -12,8 +12,20 @@ layout (set = 1, binding = 1) uniform sampler2D edgeData;
 layout (r8, set = 1, binding = 2) uniform image2D filteredAO;
 layout (rgba8, set = 1, binding = 3) uniform image2D debugImage;
 
+#define XE_GTAO_OCCLUSION_TERM_SCALE            1.5f
+
+// Folds one neighbour into a running weighted average: sum gains edgeValue * ssaoValue, sumWeight gains edgeValue.
+void XeGTAO_AddSample(float ssaoValue, float edgeValue, inout float sum, inout float sumWeight)
+{
+    float weight = edgeValue;
+    sum += (weight * ssaoValue);
+    sumWeight += weight;
+}
+
 void main() {
-    const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
+    // each invocation operates on 2x1 pixels (hence the x2 scale on screenPos.x below)
+    //const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
+    const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy) * ivec2(2, 1);
 
     if (screenPos.x > sceneData.renderTargetSize.x || screenPos.y > sceneData.renderTargetSize.y) {
         return;
@@ -21,26 +33,103 @@ void main() {
 
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    float blurAmount = pushConstants.denoiseBlurBeta / 5.0f;
+    // The final denoise pass uses denoiseBlurBeta as-is; earlier passes (when several are chained) use a fifth of it.
+    float blurAmount = pushConstants.isFinalDenoisePass == 1 ? pushConstants.denoiseBlurBeta : pushConstants.denoiseBlurBeta / 5.0f;
+
     float diagWeight = 0.85 * 0.5;
 
-    // each dispatch operates on 2x1 pixels
+    float aoTerm[2];// pixel pixCoordBase and pixel pixCoordBase + int2( 1, 0 )
+    vec4 edgesC_LRTB[2];
+    float weightTL[2];
+    float weightTR[2];
+    float weightBL[2];
+    float weightBR[2];
 
-    vec4 edges = XeGTAO_UnpackEdges(texture(edgeData, uv).r);
-    float minEdge = min(min(edges.x, edges.y), min(edges.z, edges.w));
 
-    if (pushConstants.debug == 5){
-        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
-        return;
-    }
 
-    if (pushConstants.debug == 5) {
-        imageStore(debugImage, screenPos, vec4(edges.xyz, 1.0f));
-        return;
-        if (screenPos.x % 2 == 0 && screenPos.y %2 == 0){
-            imageStore(debugImage, screenPos, vec4(vec3(1.0f), 1.0f));
-        } else {
-            imageStore(debugImage, screenPos, vec4(vec3(0.5f), 1.0f));
+    vec4 edgesQ0 = textureGatherOffset(edgeData, uv, ivec2(0, 0), 0);
+    vec4 edgesQ1 = textureGatherOffset(edgeData, uv, ivec2(2, 0), 0);
+    vec4 edgesQ2 = textureGatherOffset(edgeData, uv, ivec2(1, 2), 0);
+
+    vec4 visQ0 = textureGatherOffset(rawAO, uv, ivec2(0, 0), 0);
+    vec4 visQ1 = textureGatherOffset(rawAO, uv, ivec2(2, 0), 0);
+    vec4 visQ2 = textureGatherOffset(rawAO, uv, ivec2(0, 2), 0);
+    vec4 visQ3 = textureGatherOffset(rawAO, uv, ivec2(2, 2), 0);
+
+    for (int side = 0; side < 2; side++)
+    {
+        const ivec2 sideScreenPos = ivec2(screenPos.x + side, screenPos.y);
+
+        vec4 edgesL_LRTB  = XeGTAO_UnpackEdges(side == 0 ? edgesQ0.x : edgesQ0.y);
+        vec4 edgesT_LRTB  = XeGTAO_UnpackEdges(side == 0 ? edgesQ0.z : edgesQ1.w);
+        vec4 edgesR_LRTB  = XeGTAO_UnpackEdges(side == 0 ? edgesQ1.x : edgesQ1.y);
+        vec4 edgesB_LRTB  = XeGTAO_UnpackEdges(side == 0 ? edgesQ2.w : edgesQ2.z);
+
+        edgesC_LRTB[side]     = XeGTAO_UnpackEdges(side==0 ? edgesQ0.y : edgesQ1.x);
+
+        // Edges aren't perfectly symmetrical: edge detection algorithm does not guarantee that a left edge on the right pixel will match the right edge on the left pixel (although
+        // they will match in majority of cases). This line further enforces the symmetricity, creating a slightly sharper blur. Works real nice with TAA.
+        edgesC_LRTB[side] *= vec4(edgesL_LRTB.y, edgesR_LRTB.x, edgesT_LRTB.w, edgesB_LRTB.z);
+
+        // this allows some small amount of AO leaking from neighbours if there are 3 or 4 edges; this reduces both spatial and temporal aliasing
+        const float leak_threshold = 2.5;
+        const float leak_strength = 0.5;
+        float edginess = (clamp(4.0 - leak_threshold - dot(edgesC_LRTB[side], vec4(1.0, 1.0, 1.0, 1.0)), 0.0, 1.0) / (4-leak_threshold)) * leak_strength;
+        edgesC_LRTB[side] = clamp(edgesC_LRTB[side] + edginess, 0.0, 1.0);
+
+        // for diagonals; used by first and second pass
+        weightTL[side] = diagWeight * (edgesC_LRTB[side].x * edgesL_LRTB.z + edgesC_LRTB[side].z * edgesT_LRTB.x);
+        weightTR[side] = diagWeight * (edgesC_LRTB[side].z * edgesT_LRTB.y + edgesC_LRTB[side].y * edgesR_LRTB.z);
+        weightBL[side] = diagWeight * (edgesC_LRTB[side].w * edgesB_LRTB.x + edgesC_LRTB[side].x * edgesL_LRTB.w);
+        weightBR[side] = diagWeight * (edgesC_LRTB[side].y * edgesR_LRTB.w + edgesC_LRTB[side].w * edgesB_LRTB.y);
+
+        // first pass
+        float ssaoValue     = side==0 ? visQ0[1] : visQ1[0];
+        float ssaoValueL    = side==0 ? visQ0[0] : visQ0[1];
+        float ssaoValueT    = side==0 ? visQ0[2] : visQ1[3];
+        float ssaoValueR    = side==0 ? visQ1[0] : visQ1[1];
+        float ssaoValueB    = side==0 ? visQ2[2] : visQ3[3];
+        float ssaoValueTL   = side==0 ? visQ0[3] : visQ0[2];
+        float ssaoValueBR   = side==0 ? visQ3[3] : visQ3[2];
+        float ssaoValueTR   = side==0 ? visQ1[3] : visQ1[2];
+        float ssaoValueBL   = side==0 ? visQ2[3] : visQ2[2];
+
+        float sumWeight = blurAmount;
+        float sum = ssaoValue * sumWeight;
+
+        XeGTAO_AddSample(ssaoValueL, edgesC_LRTB[side].x, sum, sumWeight);
+        XeGTAO_AddSample(ssaoValueR, edgesC_LRTB[side].y, sum, sumWeight);
+        XeGTAO_AddSample(ssaoValueT, edgesC_LRTB[side].z, sum, sumWeight);
+        XeGTAO_AddSample(ssaoValueB, edgesC_LRTB[side].w, sum, sumWeight);
+
+        XeGTAO_AddSample(ssaoValueTL, weightTL[side], sum, sumWeight);
+        XeGTAO_AddSample(ssaoValueTR, weightTR[side], sum, sumWeight);
+        XeGTAO_AddSample(ssaoValueBL, weightBL[side], sum, sumWeight);
+        XeGTAO_AddSample(ssaoValueBR, weightBR[side], sum, sumWeight);
+
+        aoTerm[side] = sum / sumWeight;
+
+        // Only the final denoise pass applies XE_GTAO_OCCLUSION_TERM_SCALE; other passes use a scale of 1.
+        float outputValue = aoTerm[side] * (pushConstants.isFinalDenoisePass == 1 ? XE_GTAO_OCCLUSION_TERM_SCALE : 1);
+
+        if (pushConstants.debug == 5){
+            imageStore(debugImage, sideScreenPos, vec4(vec3(outputValue), 1.0f));
         }
     }
+
+
+//    if (pushConstants.debug == 5){
+//        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
+//        return;
+//    }
+//
+//    if (pushConstants.debug == 5) {
+//        imageStore(debugImage, screenPos, vec4(edges.xyz, 1.0f));
+//        return;
+//        if (screenPos.x % 2 == 0 && screenPos.y %2 == 0){
+//            imageStore(debugImage, screenPos, vec4(vec3(1.0f), 1.0f));
+//        } else {
+//            imageStore(debugImage, screenPos, vec4(vec3(0.5f), 1.0f));
+//        }
+//    }
 }
diff --git a/shaders/include/gtao.glsl b/shaders/include/gtao.glsl
index c51b59b5..da3a9f5a 100644
--- a/shaders/include/gtao.glsl
+++ b/shaders/include/gtao.glsl
@@ -18,7 +18,8 @@ layout (push_constant) uniform PushConstants {
     float thinOccluderCompensation;
     float finalValuePower;
     float depthMipSamplingOffset;
-    uint noiseIndex;
+    int noiseIndex;
+    int isFinalDenoisePass;
 
     int debug;
 } pushConstants;
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 5d3f0369..d9afb53e 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -35,6 +35,7 @@ struct GTAOPushConstants
     float finalValuePower = 2.2f;
     float depthMipSamplingOffset = 3.30f;
     uint32_t noiseIndex{0};
+    int32_t isFinalDenoisePass{1};
 
     int32_t debug{0};
 };
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 3c7fbf0e..c0381ef2 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -531,7 +531,8 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
         vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, spatialFilteringPipelineLayout, 0, 2, indices.data(), offsets.data());
 
-        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / 16.0f));
+        // Each invocation of the spatial filter covers 2x1 pixels, so x needs half as many workgroups.
+        const auto x = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_WIDTH / (16.0f * 2.0f)));
         const auto y = static_cast<uint32_t>(std::ceil(RENDER_EXTENT_HEIGHT / 16.0f));
         vkCmdDispatch(cmd, x, y, 1);
     }

From 7f3e2f8130eb5da2a3a8d011975e51ce3791863c Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 31 Mar 2025 16:32:35 +0700
Subject: [PATCH 23/27] Fix mesh visibility synchronization issues.

---
 shaders/visibility_pass.comp                  | 12 +++-----
 .../visibility_pass/visibility_pass.cpp       |  3 ++
 .../visibility_pass/visibility_pass.h         |  2 ++
 src/renderer/vk_helpers.cpp                   | 28 +++++++++++++++++++
 src/renderer/vk_helpers.h                     |  3 ++
 5 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/shaders/visibility_pass.comp b/shaders/visibility_pass.comp
index 03530ad2..56aecb1b 100644
--- a/shaders/visibility_pass.comp
+++ b/shaders/visibility_pass.comp
@@ -50,8 +50,8 @@ layout(set = 1, binding = 0) uniform bufferData
 } buffers;
 
 layout (push_constant) uniform PushConstants {
-    bool enabledFrustumCull;
-    bool shadowPass;
+    int enabledFrustumCull;
+    int shadowPass;
 } push;
 
 int checkIsVisible(mat4 mat, vec3 origin, float radius)
@@ -86,12 +86,10 @@ void main()
     uint invocationId = gl_GlobalInvocationID.x;
     if (invocationId >= buffers.commandBufferCount) { return; }
 
-
-
     uint modelMatrixId = buffers.commandBuffer.commands[invocationId].firstInstance;
     Model models = buffers.modelMatrixBuffer.models[modelMatrixId];
 
-    if (push.shadowPass){
+    if (push.shadowPass == 1){
         if (models.flags.y == 0){
             buffers.commandBuffer.commands[invocationId].instanceCount = 0;
             return;
@@ -109,9 +107,7 @@ void main()
     }
 
 
-
-
-    if (!push.enabledFrustumCull){
+    if (push.enabledFrustumCull == 0){
         buffers.commandBuffer.commands[invocationId].instanceCount = 1;
         return;
     }
diff --git a/src/renderer/pipelines/visibility_pass/visibility_pass.cpp b/src/renderer/pipelines/visibility_pass/visibility_pass.cpp
index 3b24b1f6..787f4955 100644
--- a/src/renderer/pipelines/visibility_pass/visibility_pass.cpp
+++ b/src/renderer/pipelines/visibility_pass/visibility_pass.cpp
@@ -75,11 +75,14 @@ void will_engine::visibility_pass::VisibilityPassPipeline::draw(VkCommandBuffer
         vkCmdSetDescriptorBufferOffsetsEXT(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 1, 1, &addressesIndex, &addressesOffset);
 
         vkCmdDispatch(cmd, static_cast<uint32_t>(std::ceil(static_cast<float>(renderObject->getDrawIndirectCommandCount()) / 64.0f)), 1, 1);
+
+        vk_helpers::synchronizeUniform(cmd, renderObject->getIndirectBuffer(drawInfo.currentFrameOverlap), VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_WRITE_BIT, VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT);
     }
 
     vkCmdEndDebugUtilsLabelEXT(cmd);
 }
 
+
 void will_engine::visibility_pass::VisibilityPassPipeline::createPipeline()
 {
     resourceManager.destroyPipeline(pipeline);
diff --git a/src/renderer/pipelines/visibility_pass/visibility_pass.h b/src/renderer/pipelines/visibility_pass/visibility_pass.h
index 1c7bf721..cb7695d6 100644
--- a/src/renderer/pipelines/visibility_pass/visibility_pass.h
+++ b/src/renderer/pipelines/visibility_pass/visibility_pass.h
@@ -39,6 +39,8 @@ class VisibilityPassPipeline
 
     void draw(VkCommandBuffer cmd, const VisibilityPassDrawInfo& drawInfo) const;
 
+    void indirectBufferSynchronize();
+
     void reloadShaders() { createPipeline(); }
 
 private:
diff --git a/src/renderer/vk_helpers.cpp b/src/renderer/vk_helpers.cpp
index f88ed89c..19ffca52 100644
--- a/src/renderer/vk_helpers.cpp
+++ b/src/renderer/vk_helpers.cpp
@@ -306,6 +306,34 @@ void will_engine::vk_helpers::transitionImage(VkCommandBuffer cmd, VkImage image
     vkCmdPipelineBarrier2(cmd, &depInfo);
 }
 
+void will_engine::vk_helpers::synchronizeUniform(VkCommandBuffer cmd, const AllocatedBuffer& buffer, VkPipelineStageFlagBits2 srcPipelineStage, VkAccessFlagBits2 srcAccessBit , VkPipelineStageFlagBits2 dstPipelineStage, VkAccessFlagBits2 dstAccessBit)
+{   // Records one whole-buffer memory barrier on `cmd`; nothing here is uniform-specific (the visibility pass uses it on an indirect buffer).
+    VkBufferMemoryBarrier2 bufferBarrier{};
+    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2;
+    bufferBarrier.pNext = nullptr;
+    // Source side: the stage/access the caller says must happen before the barrier.
+    bufferBarrier.srcStageMask = srcPipelineStage;
+    bufferBarrier.srcAccessMask = srcAccessBit;
+    // Destination side: the stage/access the caller says must wait on the barrier.
+    bufferBarrier.dstStageMask = dstPipelineStage;
+    bufferBarrier.dstAccessMask = dstAccessBit;
+    // Both queue family indices are IGNORED (no ownership transfer); the range is the entire buffer.
+    bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    bufferBarrier.buffer = buffer.buffer;
+    bufferBarrier.offset = 0;
+    bufferBarrier.size = VK_WHOLE_SIZE;
+    // The dependency carries only this buffer barrier; no global memory or image barriers are set.
+    VkDependencyInfo depInfo{};
+    depInfo.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO;
+    depInfo.pNext = nullptr;
+    depInfo.dependencyFlags = 0;
+    depInfo.bufferMemoryBarrierCount = 1;
+    depInfo.pBufferMemoryBarriers = &bufferBarrier;
+
+    vkCmdPipelineBarrier2(cmd, &depInfo);
+}
+
 void will_engine::vk_helpers::copyImageToImage(VkCommandBuffer cmd, VkImage source, VkImage destination, VkExtent2D srcSize, VkExtent2D dstSize)
 {
     VkImageBlit2 blitRegion{.sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2, .pNext = nullptr};
diff --git a/src/renderer/vk_helpers.h b/src/renderer/vk_helpers.h
index 47eedfde..87c73eeb 100644
--- a/src/renderer/vk_helpers.h
+++ b/src/renderer/vk_helpers.h
@@ -87,6 +87,9 @@ namespace vk_helpers
 
     void transitionImage(VkCommandBuffer cmd, VkImage image, VkImageLayout currentLayout, VkImageAspectFlags aspectMask, VkImageLayout targetLayout);
 
+    void synchronizeUniform(VkCommandBuffer cmd, const AllocatedBuffer& buffer, VkPipelineStageFlagBits2 srcPipelineStage, VkAccessFlagBits2 srcAccessBit, VkPipelineStageFlagBits2
+                            dstPipelineStage, VkAccessFlagBits2 dstAccessBit);
+
     void copyImageToImage(VkCommandBuffer cmd, VkImage source, VkImage destination, VkExtent2D srcSize, VkExtent2D dstSize);
 
     void copyDepthToDepth(VkCommandBuffer cmd, VkImage source, VkImage destination, VkExtent2D srcSize, VkExtent2D dstSize);

From c48d17d558cbe47424f98770d1454bd4c7ef1404 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 31 Mar 2025 16:32:43 +0700
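Subject: [PATCH 24/27] GTAO output into deferred resolve.

Write the spatially filtered AO to an image (denoisedFinalAO) and bind it
to the deferred resolve pass as a new sampler at set 1, binding 5; the
resolve output image moves to binding 6 and debug view 9 now shows AO.
Remove the temporal accumulation pipeline, its images and descriptors.
The GTAO main pass now reconstructs view-space positions through
invProjection, and MeshRendererComponent marks itself dirty when the
"Visible" or "Cast Shadows" checkboxes change.
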
---
 .../ground_truth/gtao_main_pass.comp          | 26 +++--
 .../ground_truth/gtao_spatial_filter.comp     | 30 +++---
 shaders/deferredResolve.comp                  |  8 +-
 src/core/engine.cpp                           |  1 +
 src/core/engine.h                             |  2 +-
 .../components/mesh_renderer_component.cpp    |  6 ++
 .../ground_truth_ambient_occlusion.cpp        | 94 +------------------
 .../ground_truth_ambient_occlusion.h          | 20 ++--
 .../deferred_resolve/deferred_resolve.cpp     |  6 ++
 .../deferred_resolve/deferred_resolve.h       |  1 +
 src/renderer/resource_manager.cpp             | 13 +--
 11 files changed, 65 insertions(+), 142 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index f16bf251..8b75a447 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -93,23 +93,22 @@ vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rig
     return clamp((1.25 - edgesLRTB / (centerZ * 0.011)), 0, 1);;
 }
 
-vec3 cheapReconstructViewSpacePosition(vec2 uv, const float viewspaceDepth, vec2 ndcToViewMul, vec2 ndcToViewAdd)
+vec3 cheapReconstructViewSpacePosition(vec2 uv, float viewspaceDepth)
 {
     vec3 ret;
-    ret.xy = (ndcToViewMul * uv.xy + ndcToViewAdd) * viewspaceDepth;
+    ret.xy = (pushConstants.ndcToViewMul * uv.xy + pushConstants.ndcToViewAdd) * viewspaceDepth;
     ret.z = -viewspaceDepth;
     return ret;
 }
 
 
-vec3 reconstructViewSpacePosition(vec2 uv, float viewDepth) {
+vec4 reconstructViewSpacePosition(vec2 uv, float viewDepth) {
     float ndcDepth = pushConstants.depthLinearizeAdd - (pushConstants.depthLinearizeMult / viewDepth);
-    uv.y = 1 - uv.y;
     vec2 ndc = uv * 2.0 - 1.0;
     vec4 positionVS = sceneData.invProjection * vec4(ndc, ndcDepth, 1.0);
 
     positionVS /= positionVS.w;
-    return positionVS.xyz;
+    return positionVS;
 }
 
 void outputWorkingTerm(ivec2 screenPos, float visibility, vec3 bentNormal, image2D outputImage){
@@ -135,8 +134,6 @@ void main() {
 
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    //vec4 valuesUL = textureGather(prefilteredDepth, screenPos * sceneData.texelSize, 0);
-    //vec4 valuesBR = textureGatherOffset(prefilteredDepth, screenPos * sceneData.texelSize, ivec2(1,1), 0);
     vec4 valuesUL = textureGather(prefilteredDepth, uv, 0);
     vec4 valuesBR = textureGatherOffset(prefilteredDepth, uv, ivec2(1,1), 0);
     float viewSpaceZM = valuesUL.y;
@@ -168,13 +165,14 @@ void main() {
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
     viewSpaceZM = viewSpaceZM * 0.998f;
 
-    vec3 vPos = cheapReconstructViewSpacePosition(uv, viewSpaceZM, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
-    //vec3 vPos = reconstructViewSpacePosition(uv, viewSpaceZM);
+    //vec3 vPos = cheapReconstructViewSpacePosition(uv, viewSpaceZM);
+    vec4 vPosAlt = reconstructViewSpacePosition(uv, viewSpaceZM);
+    vec3 vPos = vPosAlt.xyz;
     vec3 viewVec = normalize(-vPos);
 
     // debug world pos
     if (pushConstants.debug == 3){
-        vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0)).xyz;
+        vec3 worldPos = (sceneData.invView * vPosAlt).xyz;
         imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
         return;
     }
@@ -305,13 +303,13 @@ void main() {
 
                 vec2 sampleScreenPos0 = uv + sampleOffset;
                 float  SZ0 = textureLod(prefilteredDepth, sampleScreenPos0, mipLevel).r;
-                vec3 samplePos0 = cheapReconstructViewSpacePosition(sampleScreenPos0, SZ0, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
-                //vec3 samplePos0 = reconstructViewSpacePosition(sampleScreenPos0, SZ0);
+                //vec3 samplePos0 = cheapReconstructViewSpacePosition(sampleScreenPos0, SZ0);
+                vec3 samplePos0 = reconstructViewSpacePosition(sampleScreenPos0, SZ0).xyz;
 
                 vec2 sampleScreenPos1 = uv - sampleOffset;
                 float  SZ1 = textureLod(prefilteredDepth, sampleScreenPos1, mipLevel).r;
-                vec3 samplePos1 = cheapReconstructViewSpacePosition(sampleScreenPos1, SZ1, pushConstants.ndcToViewMul, pushConstants.ndcToViewAdd);
-                //vec3 samplePos1 = reconstructViewSpacePosition(sampleScreenPos1, SZ1);
+                //vec3 samplePos1 = cheapReconstructViewSpacePosition(sampleScreenPos1, SZ1);
+                vec3 samplePos1 = reconstructViewSpacePosition(sampleScreenPos1, SZ1).xyz;
 
                 vec3 sampleDelta0     = (samplePos0 - vec3(vPos));// using lpfloat for sampleDelta causes precision issues
                 vec3 sampleDelta1     = (samplePos1 - vec3(vPos));// using lpfloat for sampleDelta causes precision issues
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
index 4dcaf681..7ffe6fb6 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
@@ -115,21 +115,23 @@ void main() {
         if (pushConstants.debug == 5){
             imageStore(debugImage, sideScreenPos, vec4(vec3(outputValue), 1.0f));
         }
+
+        imageStore(filteredAO, sideScreenPos, vec4(outputValue));
     }
 
 
-//    if (pushConstants.debug == 5){
-//        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
-//        return;
-//    }
-//
-//    if (pushConstants.debug == 5) {
-//        imageStore(debugImage, screenPos, vec4(edges.xyz, 1.0f));
-//        return;
-//        if (screenPos.x % 2 == 0 && screenPos.y %2 == 0){
-//            imageStore(debugImage, screenPos, vec4(vec3(1.0f), 1.0f));
-//        } else {
-//            imageStore(debugImage, screenPos, vec4(vec3(0.5f), 1.0f));
-//        }
-//    }
+    //    if (pushConstants.debug == 5){
+    //        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
+    //        return;
+    //    }
+    //
+    //    if (pushConstants.debug == 5) {
+    //        imageStore(debugImage, screenPos, vec4(edges.xyz, 1.0f));
+    //        return;
+    //        if (screenPos.x % 2 == 0 && screenPos.y %2 == 0){
+    //            imageStore(debugImage, screenPos, vec4(vec3(1.0f), 1.0f));
+    //        } else {
+    //            imageStore(debugImage, screenPos, vec4(vec3(0.5f), 1.0f));
+    //        }
+    //    }
 }
diff --git a/shaders/deferredResolve.comp b/shaders/deferredResolve.comp
index e04d7ee6..e393bb23 100644
--- a/shaders/deferredResolve.comp
+++ b/shaders/deferredResolve.comp
@@ -16,8 +16,9 @@ layout (set = 1, binding = 1) uniform sampler2D albedoRenderTarget;
 layout (set = 1, binding = 2) uniform sampler2D pbrRenderTarget;
 layout (set = 1, binding = 3) uniform sampler2D depthBuffer;
 layout (set = 1, binding = 4) uniform sampler2D velocityBuffer; // velocity buffer is not actually used in this deferred resolve
+layout (set = 1, binding = 5) uniform sampler2D aoBuffer;
 
-layout (rgba16f, set = 1, binding = 5) uniform image2D outputImage;
+layout (rgba16f, set = 1, binding = 6) uniform image2D outputImage;
 
 layout (set = 2, binding = 0) uniform samplerCube environmentDiffuseAndSpecular;
 layout (set = 2, binding = 1) uniform sampler2D lut;
@@ -186,9 +187,8 @@ void main() {
             imageStore(outputImage, screenPos, vec4(vec3(dot(N, L)), 1.0));
             break;
         case 9:
-            // vec4 viewPos = sceneData.view * vec4(position, 1.0f);
-            // imageStore(outputImage, screenPos, vec4(-viewPos.xyz / 100.0f, 1.0f));
-            imageStore(outputImage, screenPos, vec4(position.xyz / 1000.0f, 1.0f));
+            float ao = texture(aoBuffer, uv).r;
+            imageStore(outputImage, screenPos, vec4(vec3(ao), 1.0f));
             break;
     }
 }
\ No newline at end of file
diff --git a/src/core/engine.cpp b/src/core/engine.cpp
index c0137938..65288151 100644
--- a/src/core/engine.cpp
+++ b/src/core/engine.cpp
@@ -216,6 +216,7 @@ void Engine::initRenderer()
         pbrRenderTarget.imageView,
         depthImage.imageView,
         velocityRenderTarget.imageView,
+        ambientOcclusionPipeline->getAmbientOcclusionRenderTarget().imageView,
         drawImage.imageView,
         resourceManager->getDefaultSamplerLinear()
     };
diff --git a/src/core/engine.h b/src/core/engine.h
index da6d9894..299ad7a3 100644
--- a/src/core/engine.h
+++ b/src/core/engine.h
@@ -175,7 +175,7 @@ class Engine
     bool bEnableDebugFrustumCullDraw{false};
     int32_t csmPcf{1};
     int32_t deferredDebug{0};
-    int32_t gtaoDebug{5};
+    int32_t gtaoDebug{4};
     bool bDrawTerrainLines{false};
 
     void hotReloadShaders() const;
diff --git a/src/core/game_object/components/mesh_renderer_component.cpp b/src/core/game_object/components/mesh_renderer_component.cpp
index a4d89616..997a3dde 100644
--- a/src/core/game_object/components/mesh_renderer_component.cpp
+++ b/src/core/game_object/components/mesh_renderer_component.cpp
@@ -100,8 +100,14 @@ void MeshRendererComponent::updateRenderImgui()
             }
             ImGui::Separator();
 
+            const bool originalVis = bIsVisible;
+            const bool originalShadow = bIsShadowCaster;
             ImGui::Checkbox("Visible", &bIsVisible);
             ImGui::Checkbox("Cast Shadows", &bIsShadowCaster);
+
+            if (bIsVisible != originalVis || bIsShadowCaster != originalShadow) {
+                dirty();
+            }
         }
     }
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index c0381ef2..26ad79fd 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -218,56 +218,7 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::GroundTruth
         usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
 
         VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
-        spatialFilteringImage = resourceManager.createImage(imgInfo);
-    }
-
-    // Temporal Accumulation
-    {
-        DescriptorLayoutBuilder layoutBuilder;
-        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // filtered ao
-        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // final output history
-        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT velocity buffer
-        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // MRT depth buffer
-        layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // final output
-
-        temporalAccumulationSetLayout = resourceManager.createDescriptorSetLayout(layoutBuilder, VK_SHADER_STAGE_COMPUTE_BIT,
-                                                                                  VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
-
-        VkPushConstantRange pushConstants{};
-        pushConstants.offset = 0;
-        pushConstants.size = sizeof(GTAOPushConstants);
-        pushConstants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
-
-        VkDescriptorSetLayout setLayouts[2];
-        setLayouts[0] = resourceManager.getSceneDataLayout();
-        setLayouts[1] = temporalAccumulationSetLayout;
-
-        VkPipelineLayoutCreateInfo layoutInfo{};
-        layoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
-        layoutInfo.pNext = nullptr;
-        layoutInfo.pSetLayouts = setLayouts;
-        layoutInfo.setLayoutCount = 2;
-        layoutInfo.pPushConstantRanges = &pushConstants;
-        layoutInfo.pushConstantRangeCount = 1;
-
-        temporalAccumulationPipelineLayout = resourceManager.createPipelineLayout(layoutInfo);
-        createTemporalAccumulationPipeline();
-
-        temporalAccumulationDescriptorBuffer = resourceManager.createDescriptorBufferSampler(temporalAccumulationSetLayout, 1);
-
-
-        VkImageUsageFlags usage{};
-        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
-        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
-        VkImageCreateInfo imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
-        historyOutputImage = resourceManager.createImage(imgInfo);
-
-        usage = {};
-        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
-        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
-        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
-        imgInfo = vk_helpers::imageCreateInfo(ambientOcclusionFormat, usage, {RENDER_EXTENTS.width, RENDER_EXTENTS.height, 1});
-        ambientOcclusionOutputImage = resourceManager.createImage(imgInfo);
+        denoisedFinalAO = resourceManager.createImage(imgInfo);
     }
 }
 
@@ -307,20 +258,9 @@ will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::~GroundTrut
     resourceManager.destroyPipelineLayout(spatialFilteringPipelineLayout);
     resourceManager.destroyPipeline(spatialFilteringPipeline);
 
-    resourceManager.destroyImage(spatialFilteringImage);
+    resourceManager.destroyImage(denoisedFinalAO);
 
     resourceManager.destroyDescriptorBuffer(spatialFilteringDescriptorBuffer);
-
-
-    // Temporal Accumulation Resources
-    resourceManager.destroyDescriptorSetLayout(temporalAccumulationSetLayout);
-    resourceManager.destroyPipelineLayout(temporalAccumulationPipelineLayout);
-    resourceManager.destroyPipeline(temporalAccumulationPipeline);
-
-    resourceManager.destroyImage(historyOutputImage);
-    resourceManager.destroyImage(ambientOcclusionOutputImage);
-
-    resourceManager.destroyDescriptorBuffer(temporalAccumulationDescriptorBuffer);
 }
 
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupDepthPrefilterDescriptorBuffer(const VkImageView& depthImageView)
@@ -424,7 +364,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupS
         });
     imageDescriptors.push_back({
         VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-        {VK_NULL_HANDLE, spatialFilteringImage.imageView, VK_IMAGE_LAYOUT_GENERAL},
+        {VK_NULL_HANDLE, denoisedFinalAO.imageView, VK_IMAGE_LAYOUT_GENERAL},
         false
     });
     imageDescriptors.push_back({
@@ -514,7 +454,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
 
     vk_helpers::transitionImage(cmd, ambientOcclusionImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                 VK_IMAGE_ASPECT_COLOR_BIT);
-    vk_helpers::transitionImage(cmd, spatialFilteringImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
+    vk_helpers::transitionImage(cmd, denoisedFinalAO.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
                                 VK_IMAGE_ASPECT_COLOR_BIT);
     // Spatial Filtering
     {
@@ -537,7 +477,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
         vkCmdDispatch(cmd, x, y, 1);
     }
 
-    vk_helpers::transitionImage(cmd, spatialFilteringImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+    vk_helpers::transitionImage(cmd, denoisedFinalAO.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                     VK_IMAGE_ASPECT_COLOR_BIT);
 
 
@@ -554,7 +494,6 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::reload
     createDepthPrefilterPipeline();
     createAmbientOcclusionPipeline();
     createSpatialFilteringPipeline();
-    createTemporalAccumulationPipeline();
 }
 
 void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createDepthPrefilterPipeline()
@@ -625,26 +564,3 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::create
     spatialFilteringPipeline = resourceManager.createComputePipeline(pipelineInfo);
     resourceManager.destroyShaderModule(computeShader);
 }
-
-void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::createTemporalAccumulationPipeline()
-{
-    resourceManager.destroyPipeline(temporalAccumulationPipeline);
-    VkShaderModule computeShader = resourceManager.createShaderModule("shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp");
-
-    VkPipelineShaderStageCreateInfo stageInfo{};
-    stageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
-    stageInfo.pNext = nullptr;
-    stageInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT;
-    stageInfo.module = computeShader;
-    stageInfo.pName = "main";
-
-    VkComputePipelineCreateInfo pipelineInfo{};
-    pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
-    pipelineInfo.pNext = nullptr;
-    pipelineInfo.layout = temporalAccumulationPipelineLayout;
-    pipelineInfo.stage = stageInfo;
-    pipelineInfo.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT;
-
-    temporalAccumulationPipeline = resourceManager.createComputePipeline(pipelineInfo);
-    resourceManager.destroyShaderModule(computeShader);
-}
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index ffefe4c8..5964b1f3 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -30,11 +30,15 @@ class GroundTruthAmbientOcclusionPipeline
     void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const;
 
     void reloadShaders();
+
+    AllocatedImage getAmbientOcclusionRenderTarget() const { return denoisedFinalAO; }
+
 private:
     void createDepthPrefilterPipeline();
+
     void createAmbientOcclusionPipeline();
+
     void createSpatialFilteringPipeline();
-    void createTemporalAccumulationPipeline();
 
 private: // Depth Pre-filter
     VkDescriptorSetLayout depthPrefilterSetLayout{VK_NULL_HANDLE};
@@ -72,22 +76,10 @@ class GroundTruthAmbientOcclusionPipeline
     VkPipelineLayout spatialFilteringPipelineLayout{VK_NULL_HANDLE};
     VkPipeline spatialFilteringPipeline{VK_NULL_HANDLE};
 
-    AllocatedImage spatialFilteringImage{VK_NULL_HANDLE};
+    AllocatedImage denoisedFinalAO{VK_NULL_HANDLE};
 
     DescriptorBufferSampler spatialFilteringDescriptorBuffer;
 
-private: // Temporal Accumulation
-    VkDescriptorSetLayout temporalAccumulationSetLayout{VK_NULL_HANDLE};
-    VkPipelineLayout temporalAccumulationPipelineLayout{VK_NULL_HANDLE};
-    VkPipeline temporalAccumulationPipeline{VK_NULL_HANDLE};
-
-    AllocatedImage historyOutputImage{VK_NULL_HANDLE};
-
-    DescriptorBufferSampler temporalAccumulationDescriptorBuffer;
-
-private: // Output
-    AllocatedImage ambientOcclusionOutputImage{VK_NULL_HANDLE};
-
 private: // Debug
     VkFormat debugFormat{VK_FORMAT_R8G8B8A8_UNORM};
     AllocatedImage debugImage{VK_NULL_HANDLE};
diff --git a/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp b/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp
index dc6f3c96..faea01fe 100644
--- a/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp
+++ b/src/renderer/pipelines/deferred_resolve/deferred_resolve.cpp
@@ -80,6 +80,11 @@ void will_engine::deferred_resolve::DeferredResolvePipeline::setupDescriptorBuff
     velocityTarget.imageView = drawInfo.velocityTarget;
     velocityTarget.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
 
+    VkDescriptorImageInfo aoTarget = {};
+    aoTarget.sampler = drawInfo.sampler;
+    aoTarget.imageView = drawInfo.aoTarget;
+    aoTarget.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
     VkDescriptorImageInfo drawImageTarget = {};
     drawImageTarget.imageView = drawInfo.outputTarget;
     drawImageTarget.imageLayout = VK_IMAGE_LAYOUT_GENERAL;
@@ -89,6 +94,7 @@ void will_engine::deferred_resolve::DeferredResolvePipeline::setupDescriptorBuff
     renderTargetDescriptors.push_back({VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, pbrDataTarget, false});
     renderTargetDescriptors.push_back({VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, depthImageTarget, false});
     renderTargetDescriptors.push_back({VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, velocityTarget, false});
+    renderTargetDescriptors.push_back({VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, aoTarget, false});
     renderTargetDescriptors.push_back({VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, drawImageTarget, false});
 
     resourceManager.setupDescriptorBufferSampler(resolveDescriptorBuffer, renderTargetDescriptors, 0);
diff --git a/src/renderer/pipelines/deferred_resolve/deferred_resolve.h b/src/renderer/pipelines/deferred_resolve/deferred_resolve.h
index ce817a49..ada68554 100644
--- a/src/renderer/pipelines/deferred_resolve/deferred_resolve.h
+++ b/src/renderer/pipelines/deferred_resolve/deferred_resolve.h
@@ -19,6 +19,7 @@ struct DeferredResolveDescriptor
     VkImageView pbrTarget;
     VkImageView depthTarget;
     VkImageView velocityTarget;
+    VkImageView aoTarget;
     VkImageView outputTarget;
 
     VkSampler sampler;
diff --git a/src/renderer/resource_manager.cpp b/src/renderer/resource_manager.cpp
index de3c8bfc..47a072d1 100644
--- a/src/renderer/resource_manager.cpp
+++ b/src/renderer/resource_manager.cpp
@@ -113,12 +113,13 @@ will_engine::ResourceManager::ResourceManager(const VulkanContext& context, Imme
     // Render Targets
     {
         DescriptorLayoutBuilder layoutBuilder;
-        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
-        layoutBuilder.addBinding(5, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
+        layoutBuilder.addBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // Normals
+        layoutBuilder.addBinding(1, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // Albedo
+        layoutBuilder.addBinding(2, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // PBR
+        layoutBuilder.addBinding(3, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // Depth
+        layoutBuilder.addBinding(4, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // Velocity
+        layoutBuilder.addBinding(5, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); // AO
+        layoutBuilder.addBinding(6, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE); // Output
 
         renderTargetsLayout = layoutBuilder.build(context.device, VK_SHADER_STAGE_COMPUTE_BIT, nullptr, VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT);
     }

From 5cbe33915427bd5981ff26602565f3c7f171ac3c Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 31 Mar 2025 17:28:54 +0700
Subject: [PATCH 25/27] Clean up minor GTAO texture sampling issues.

---
 .../ground_truth/gtao_depth_prefilter.comp      | 14 +++++---------
 .../ground_truth/gtao_main_pass.comp            | 17 ++++-------------
 .../ground_truth/gtao_spatial_filter.comp       | 17 ++++++++---------
 shaders/deferredResolve.comp                    |  7 ++++---
 4 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index f41ea7d2..ab682687 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -67,15 +67,11 @@ void main() {
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
     // todo: optimize with textureGather?
-    vec4 depths = textureGatherOffset(depthImage, uv, ivec2(1,1), 0);
-    float rDepth0 = depths.w; // top-left
-    float rDepth1 = depths.z; // top-right
-    float rDepth2 = depths.x; // bottom-left
-    float rDepth3 = depths.y; // bottom-right
-//    float rDepth0 = texture(depthImage, uv + vec2(0.0, 0.0) * sceneData.texelSize).r;
-//    float rDepth1 = texture(depthImage, uv + vec2(1.0, 0.0) * sceneData.texelSize).r;
-//    float rDepth2 = texture(depthImage, uv + vec2(0.0, 1.0) * sceneData.texelSize).r;
-//    float rDepth3 = texture(depthImage, uv + vec2(1.0, 1.0) * sceneData.texelSize).r;
+    vec4 depths = textureGatherOffset(depthImage, vec2(screenPos) * sceneData.texelSize, ivec2(1,1), 0);
+    float rDepth0 = depths.w;
+    float rDepth1 = depths.z;
+    float rDepth2 = depths.x;
+    float rDepth3 = depths.y;
 
     float depth0 = clampDepth(screenToViewSpaceDepth(rDepth0, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
     float depth1 = clampDepth(screenToViewSpaceDepth(rDepth1, pushConstants.depthLinearizeMult, pushConstants.depthLinearizeAdd));
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index 8b75a447..d721fb5c 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -134,8 +134,9 @@ void main() {
 
     vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
 
-    vec4 valuesUL = textureGather(prefilteredDepth, uv, 0);
-    vec4 valuesBR = textureGatherOffset(prefilteredDepth, uv, ivec2(1,1), 0);
+    vec2 gatherCenter = vec2(screenPos) * sceneData.texelSize;
+    vec4 valuesUL = textureGather(prefilteredDepth, gatherCenter, 0);
+    vec4 valuesBR = textureGatherOffset(prefilteredDepth, gatherCenter, ivec2(1, 1), 0);
     float viewSpaceZM = valuesUL.y;
     const float viewSpaceZL = valuesUL.x;
     const float viewSpaceZR = valuesUL.z;
@@ -146,12 +147,6 @@ void main() {
     float packedEdges = XeGTAO_PackEdges(edges);
     imageStore(edgeDataOutput, screenPos, vec4(packedEdges));
 
-    float minEdge = min(min(edges.x, edges.y), min(edges.z, edges.w));
-
-    if (pushConstants.debug == 2){
-        imageStore(debugImage, screenPos, vec4(vec3(minEdge), 1.0f));
-        return;
-    }
 
     // Get view space normal by sampling normal buffer and converting from world to view (code not relevant)
     vec3 worldNormal = texture(normalBuffer, uv).rgb;
@@ -159,7 +154,6 @@ void main() {
 
     if (pushConstants.debug == 2){
         imageStore(debugImage, screenPos, vec4(viewNormal, 1.0f));
-        return;
     }
 
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
@@ -174,12 +168,11 @@ void main() {
     if (pushConstants.debug == 3){
         vec3 worldPos = (sceneData.invView * vPosAlt).xyz;
         imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
-        return;
     }
 
     // Per Intel
     // prevents normals that are facing away from the view vector - xeGTAO struggles with extreme cases, but in Vanilla it seems rare so it's disabled by default
-     viewNormal = normalize(viewNormal + max(0, -dot(viewNormal, viewVec)) * viewVec);
+    viewNormal = normalize(viewNormal + max(0, -dot(viewNormal, viewVec)) * viewVec);
 
 
     const float effectRadius = pushConstants.effectRadius * pushConstants.radiusMultiplier;
@@ -215,7 +208,6 @@ void main() {
             visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
             if (pushConstants.debug == 4){
                 imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
-                return;
             }
 
             imageStore(aoOutput, screenPos, vec4(visibility));
@@ -374,7 +366,6 @@ void main() {
     visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
     if (pushConstants.debug == 4){
         imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
-        return;
     }
 
     imageStore(aoOutput, screenPos, vec4(visibility));
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
index 7ffe6fb6..63fc4cef 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
@@ -31,8 +31,6 @@ void main() {
         return;
     }
 
-    vec2 uv = (vec2(screenPos) + 0.5) * sceneData.texelSize;
-
     // if final apply use below (see, using multiple denoise passes)
     float blurAmount = pushConstants.isFinalDenoisePass == 1 ? pushConstants.denoiseBlurBeta : pushConstants.denoiseBlurBeta / 5.0f;
 
@@ -46,15 +44,16 @@ void main() {
     float weightBR[2];
 
 
+    vec2 gatherCenter = vec2(screenPos) * sceneData.texelSize;
 
-    vec4 edgesQ0 = textureGatherOffset(edgeData, uv, ivec2(0, 0), 0);
-    vec4 edgesQ1 = textureGatherOffset(edgeData, uv, ivec2(2, 0), 0);
-    vec4 edgesQ2 = textureGatherOffset(edgeData, uv, ivec2(1, 2), 0);
+    vec4 edgesQ0 = textureGatherOffset(edgeData, gatherCenter, ivec2(0, 0), 0);
+    vec4 edgesQ1 = textureGatherOffset(edgeData, gatherCenter, ivec2(2, 0), 0);
+    vec4 edgesQ2 = textureGatherOffset(edgeData, gatherCenter, ivec2(1, 2), 0);
 
-    vec4 visQ0 = textureGatherOffset(rawAO, uv, ivec2(0, 0), 0);
-    vec4 visQ1 = textureGatherOffset(rawAO, uv, ivec2(2, 0), 0);
-    vec4 visQ2 = textureGatherOffset(rawAO, uv, ivec2(0, 2), 0);
-    vec4 visQ3 = textureGatherOffset(rawAO, uv, ivec2(2, 2), 0);
+    vec4 visQ0 = textureGatherOffset(rawAO, gatherCenter, ivec2(0, 0), 0);
+    vec4 visQ1 = textureGatherOffset(rawAO, gatherCenter, ivec2(2, 0), 0);
+    vec4 visQ2 = textureGatherOffset(rawAO, gatherCenter, ivec2(0, 2), 0);
+    vec4 visQ3 = textureGatherOffset(rawAO, gatherCenter, ivec2(2, 2), 0);
 
     for (int side = 0; side < 2; side++)
     {
diff --git a/shaders/deferredResolve.comp b/shaders/deferredResolve.comp
index e393bb23..3a26434f 100644
--- a/shaders/deferredResolve.comp
+++ b/shaders/deferredResolve.comp
@@ -138,10 +138,11 @@ void main() {
 
     vec3 reflectionSpecular = SpecularReflection(environmentDiffuseAndSpecular, lut, V, N, roughness, F) * indirectAttenuation;
 
-    vec3 ambient = (kD * reflectionDiffuse + reflectionSpecular);
+    //vec3 ambient = (kD * reflectionDiffuse + reflectionSpecular);
+    float ao = texture(aoBuffer, uv).r;
+    vec3  ambient = (kD * reflectionDiffuse + reflectionSpecular) * ao;
 
-
-    vec3 finalColor = (diffuse + specular) * vec3(1.0f) * nDotL * shadowFactor;
+    vec3 finalColor = (diffuse + specular) * nDotL * shadowFactor;
     finalColor += ambient;
 
     imageStore(outputImage, screenPos, vec4(finalColor, albedo.w));

From 2b9d1c7b373c09a12b10c3222effbce03c47a4b1 Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 31 Mar 2025 18:50:15 +0700
Subject: [PATCH 26/27] GTAO debug improvements.

---
 .../ground_truth/gtao_depth_prefilter.comp    |   5 +-
 .../ground_truth/gtao_main_pass.comp          |  45 ++--
 .../ground_truth/gtao_spatial_filter.comp     |   6 +
 .../gtao_temporal_accumulation.comp           |  15 --
 shaders/include/gtao.glsl                     |   3 +
 src/renderer/imgui_wrapper.cpp                | 214 ++++++++++++++----
 src/renderer/imgui_wrapper.h                  |   1 +
 .../ambient_occlusion_types.h                 |  24 +-
 .../ground_truth_ambient_occlusion.cpp        |  31 ++-
 .../ground_truth_ambient_occlusion.h          |   4 +-
 .../visibility_pass/visibility_pass.h         |   2 -
 11 files changed, 232 insertions(+), 118 deletions(-)
 delete mode 100644 shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
index ab682687..0283c028 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_depth_prefilter.comp
@@ -54,6 +54,10 @@ float depthMipFilter(float depth0, float depth1, float depth2, float depth3, flo
 }
 
 void main() {
+    if (pushConstants.debug == -1){
+        return;
+    }
+
     ivec2 groupThreadID = ivec2(gl_LocalInvocationID.xy);
 
     // MIP 0
@@ -88,7 +92,6 @@ void main() {
         imageStore(debugImage, screenPos + ivec2(1, 0), vec4(vec3(depth1 / 1000.0f), 1.0f));
         imageStore(debugImage, screenPos + ivec2(0, 1), vec4(vec3(depth2 / 1000.0f), 1.0f));
         imageStore(debugImage, screenPos + ivec2(1, 1), vec4(vec3(depth3 / 1000.0f), 1.0f));
-        return;
     }
 
     // MIP 1
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index d721fb5c..fa3d99e7 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -21,16 +21,6 @@ layout (rgba8, set = 1, binding = 4) uniform image2D debugImage;
 #define XE_GTAO_PI                (3.1415926535897932384626433832795)
 #define XE_GTAO_PI_HALF             (1.5707963267948966192313216916398)
 
-#define XE_GTAO_SLICE_COUNT_LOW                 1.0f
-#define XE_GTAO_SLICE_COUNT_MEDIUM              2.0f
-#define XE_GTAO_SLICE_COUNT_HIGH                3.0f
-#define XE_GTAO_SLICE_COUNT_ULTRA               9.0f
-
-#define XE_GTAO_STEPS_PER_SLICE_COUNT_LOW       2.0f
-#define XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM    2.0f
-#define XE_GTAO_STEPS_PER_SLICE_COUNT_HIGH      3.0f
-#define XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA     3.0f
-
 #define XE_GTAO_OCCLUSION_TERM_SCALE            1.5f
 
 // http://h14s.p5r.org/2012/09/0x5f3759df.html, [Drobot2014a] Low Level Optimizations for GCN, https://blog.selfshadow.com/publications/s2016-shading-course/activision/s2016_pbs_activision_occlusion.pdf slide 63
@@ -50,7 +40,7 @@ float XeGTAO_FastACos(float inX)
     return (inX >= 0) ? res : PI - res;
 }
 
-uint hilbertIndex(uint posX, uint posY)
+uint XE_GTAO_HilbertIndex(uint posX, uint posY)
 {
     uint index = 0u;
     for (uint curLevel = XE_HILBERT_WIDTH/2u; curLevel > 0u; curLevel /= 2u)
@@ -74,14 +64,14 @@ uint hilbertIndex(uint posX, uint posY)
     return index;
 }
 
-vec2 spatioTemporalNoise(ivec2 pixCoord, uint temporalIndex)// without TAA, temporalIndex is always 0
+vec2 XE_GTAO_SpatioTemporalNoise(ivec2 pixCoord, uint temporalIndex)// without TAA, temporalIndex is always 0
 {
-    uint index = hilbertIndex(uint(pixCoord.x), uint(pixCoord.y));
+    uint index = XE_GTAO_HilbertIndex(uint(pixCoord.x), uint(pixCoord.y));
     index += 288u * (temporalIndex % 64u);
     return vec2(fract(0.5 + index * vec2(0.75487766624669276005, 0.5698402909980532659114)));
 }
 
-vec4 calculateDepthEdges(const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ)
+vec4 XE_GTAO_CalculateDepthEdges(const float centerZ, const float leftZ, const float rightZ, const float topZ, const float bottomZ)
 {
     vec4 edgesLRTB = vec4(leftZ, rightZ, topZ, bottomZ) - vec4(centerZ);
 
@@ -97,6 +87,7 @@ vec3 cheapReconstructViewSpacePosition(vec2 uv, float viewspaceDepth)
 {
     vec3 ret;
     ret.xy = (pushConstants.ndcToViewMul * uv.xy + pushConstants.ndcToViewAdd) * viewspaceDepth;
+    //ret.y = -ret.y;
     ret.z = -viewspaceDepth;
     return ret;
 }
@@ -111,11 +102,6 @@ vec4 reconstructViewSpacePosition(vec2 uv, float viewDepth) {
     return positionVS;
 }
 
-void outputWorkingTerm(ivec2 screenPos, float visibility, vec3 bentNormal, image2D outputImage){
-    visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
-    imageStore(outputImage, screenPos, vec4(visibility + 0.5f / 255.0f));
-}
-
 mat3 adjugate(mat4 m) {
     return mat3(
     cross(m[1].xyz, m[2].xyz),
@@ -126,6 +112,10 @@ mat3 adjugate(mat4 m) {
 }
 
 void main() {
+    if (pushConstants.debug == -1){
+        return;
+    }
+
     const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
 
     if (screenPos.x > sceneData.renderTargetSize.x || screenPos.y > sceneData.renderTargetSize.y) {
@@ -143,7 +133,7 @@ void main() {
     const float viewSpaceZT = valuesBR.z;
     const float viewSpaceZB = valuesBR.x;
 
-    vec4 edges  = calculateDepthEdges(viewSpaceZM, viewSpaceZL, viewSpaceZR, viewSpaceZT, viewSpaceZB);
+    vec4 edges  = XE_GTAO_CalculateDepthEdges(viewSpaceZM, viewSpaceZL, viewSpaceZR, viewSpaceZT, viewSpaceZB);
     float packedEdges = XeGTAO_PackEdges(edges);
     imageStore(edgeDataOutput, screenPos, vec4(packedEdges));
 
@@ -191,7 +181,7 @@ void main() {
 
     {
         // NOISE
-        vec2 noise = spatioTemporalNoise(screenPos, pushConstants.noiseIndex);
+        vec2 noise = XE_GTAO_SpatioTemporalNoise(screenPos, pushConstants.noiseIndex);
         float noiseSlice = noise.x;
         float noiseSample = noise.y;
 
@@ -211,15 +201,14 @@ void main() {
             }
 
             imageStore(aoOutput, screenPos, vec4(visibility));
-            // todo: look at how this will return 1/1.5 instead of 1 for visibility always?
-            // todo: (bent normal) need to write `viewNormal` value to the buffer (i.e. no change to trajectory of normal)
+            // (Bent Normals) If bent normals are output, write the unmodified `viewNormal` to the buffer here (i.e. the normal is not bent)
             return;
         }
 
         const float minS = pixelTooCloseThreshold / screenspaceRadius;
 
-        float sliceCount = XE_GTAO_SLICE_COUNT_ULTRA;
-        float stepsPerSlice = XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA;
+        float sliceCount = pushConstants.sliceCount;
+        float stepsPerSlice = pushConstants.stepsPerSliceCount;
 
         for (float slice = 0; slice < sliceCount; slice++){
             float sliceK = (slice+noiseSlice) / sliceCount;
@@ -345,7 +334,7 @@ void main() {
             float localVisibility = projectedNormalVecLength * (iarc0+iarc1);
             visibility += localVisibility;
 
-            // todo: uncomment and fix if outputting bent normals
+            // (Bent Normals)
             // see "Algorithm 2 Extension that computes bent normals b."
             // lpfloat t0 = (6*sin(h0-n)-sin(3*h0-n)+6*sin(h1-n)-sin(3*h1-n)+16*sin(n)-3*(sin(h0+n)+sin(h1+n)))/12;
             // lpfloat t1 = (-cos(3 * h0-n)-cos(3 * h1-n) +8 * cos(n)-3 * (cos(h0+n) +cos(h1+n)))/12;
@@ -358,11 +347,11 @@ void main() {
         visibility = pow(visibility, pushConstants.finalValuePower);
         visibility = max(0.03, visibility);// disallow total occlusion (which wouldn't make any sense anyhow since pixel is visible but also helps with packing bent normals)
 
-        // todo (bent normal)
+        // (Bent Normals)
         // bentNormal = normalize(bentNormal) ;
     }
 
-    // todo (bent normal)
+    // (Bent Normals)
     visibility = clamp(visibility / XE_GTAO_OCCLUSION_TERM_SCALE, 0, 1);
     if (pushConstants.debug == 4){
         imageStore(debugImage, screenPos, vec4(vec3(visibility), 1.0f));
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
index 63fc4cef..f077594f 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_spatial_filter.comp
@@ -27,6 +27,12 @@ void main() {
     //const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy);
     const ivec2 screenPos = ivec2(gl_GlobalInvocationID.xy) * ivec2(2, 1);
 
+    if (pushConstants.debug == -1){
+        imageStore(filteredAO, screenPos, vec4(1.0f));
+        imageStore(filteredAO, screenPos + ivec2(1, 0), vec4(1.0f));
+        return;
+    }
+
     if (screenPos.x > sceneData.renderTargetSize.x || screenPos.y > sceneData.renderTargetSize.y) {
         return;
     }
diff --git a/shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp b/shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp
deleted file mode 100644
index bf32d9ad..00000000
--- a/shaders/ambient_occlusion/ground_truth/gtao_temporal_accumulation.comp
+++ /dev/null
@@ -1,15 +0,0 @@
-#version 460
-
-#include "scene.glsl"
-
-// layout (std140, set = 0, binding = 0) uniform SceneData - scene.glsl
-
-layout (set = 1, binding = 0) uniform sampler2D filteredAO;
-layout (set = 1, binding = 1) uniform sampler2D historyOutputAO;
-layout (set = 1, binding = 2) uniform sampler2D depthBuffer;
-layout (set = 1, binding = 2) uniform sampler2D velocityBuffer;
-layout (r8, set = 1, binding = 2) uniform image2D outputAO;
-
-void main() {
-
-}
diff --git a/shaders/include/gtao.glsl b/shaders/include/gtao.glsl
index da3a9f5a..5fd64f10 100644
--- a/shaders/include/gtao.glsl
+++ b/shaders/include/gtao.glsl
@@ -21,6 +21,9 @@ layout (push_constant) uniform PushConstants {
     int noiseIndex;
     int isFinalDenoisePass;
 
+    float sliceCount;
+    float stepsPerSliceCount;
+
     int debug;
 } pushConstants;
 
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index 603fd10d..aea583b3 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -192,30 +192,187 @@ void ImguiWrapper::imguiInterface(Engine* engine)
 
     if (ImGui::Begin("Renderer")) {
         if (ImGui::BeginTabBar("RendererTabs")) {
-            if (ImGui::BeginTabItem("Shaders")) {
-                ImGui::Text("Shaders");
+            if (ImGui::BeginTabItem("Debugging")) {
                 ImGui::SetNextItemWidth(75.0f);
                 if (ImGui::Button("Hot-Reload Shaders")) {
                     engine->hotReloadShaders();
                 }
                 ImGui::Separator();
+
                 ImGui::Text("Temporal Anti-Aliasing");
                 ImGui::Checkbox("Enable TAA", &engine->bEnableTaa);
                 ImGui::DragFloat("Taa Blend Value", &engine->taaBlendValue, 0.01, 0.1f, 0.5f);
-                ImGui::EndTabItem();
-            }
+                ImGui::Separator();
 
-            if (ImGui::BeginTabItem("Pipelines")) {
                 ImGui::Text("Deferred Debug");
                 const char* deferredDebugOptions[]{"None", "Depth", "Velocity", "Albedo", "Normal", "PBR", "Shadows", "Cascade Level", "nDotL", "AO"};
                 ImGui::Combo("Deferred Debug", &engine->deferredDebug, deferredDebugOptions, IM_ARRAYSIZE(deferredDebugOptions));
+                ImGui::Separator();
 
+                ImGui::Text("Frustum Cull Debug Draw");
+                ImGui::Checkbox("Enable Frustum Cull Debug Draw", &engine->bEnableDebugFrustumCullDraw);
                 ImGui::EndTabItem();
             }
 
-            if (ImGui::BeginTabItem("Debug View")) {
-                ImGui::Text("Frustum Cull Debug Draw");
-                ImGui::Checkbox("Enable Frustum Cull Debug Draw", &engine->bEnableDebugFrustumCullDraw);
+            if (ImGui::BeginTabItem("Ambient Occlusion")) {
+                ambient_occlusion::GTAOPushConstants& gtao = engine->ambientOcclusionPipeline->gtaoPush;
+                if (ImGui::CollapsingHeader("GTAO Settings")) {
+                    ImGui::Text("Effect Parameters");
+                    ImGui::Separator();
+                    ImGui::SliderFloat("Effect Radius", &gtao.effectRadius, 0.1f, 2.0f);
+                    ImGui::SliderFloat("Effect Falloff Range", &gtao.effectFalloffRange, 0.0f, 1.0f);
+
+                    ImGui::Spacing();
+                    ImGui::Text("Denoise Parameters");
+                    ImGui::Separator();
+                    float blurBeta = gtao.denoiseBlurBeta;
+                    if (ImGui::SliderFloat("Denoise Blur Beta", &blurBeta, 0.0f, 5.0f)) {
+                        if (ambient_occlusion::GTAO_DENOISE_PASSES != 0) {
+                            gtao.denoiseBlurBeta = blurBeta;
+                        }
+                    }
+                    ImGui::Checkbox("Final Denoise Pass", (bool*) &gtao.isFinalDenoisePass);
+
+                    ImGui::Spacing();
+                    ImGui::Text("Sampling Parameters");
+                    ImGui::Separator();
+                    ImGui::SliderFloat("Radius Multiplier", &gtao.radiusMultiplier, 0.1f, 3.0f);
+                    ImGui::SliderFloat("Sample Distribution Power", &gtao.sampleDistributionPower, 1.0f, 4.0f);
+                    ImGui::SliderFloat("Thin Occluder Compensation", &gtao.thinOccluderCompensation, 0.0f, 1.0f);
+                    ImGui::SliderFloat("Final Value Power", &gtao.finalValuePower, 1.0f, 4.0f);
+                    ImGui::SliderFloat("Depth Mip Sampling Offset", &gtao.depthMipSamplingOffset, 0.0f, 5.0f);
+
+                    ImGui::Spacing();
+                    ImGui::Text("Sample Count");
+                    ImGui::Separator();
+                    const char* qualityPresets[] = {"Low", "Medium", "High", "Ultra"};
+                    int slicePreset = 0;
+
+                    if (gtao.sliceCount == ambient_occlusion::XE_GTAO_SLICE_COUNT_LOW) slicePreset = 0;
+                    else if (gtao.sliceCount == ambient_occlusion::XE_GTAO_SLICE_COUNT_MEDIUM) slicePreset = 1;
+                    else if (gtao.sliceCount == ambient_occlusion::XE_GTAO_SLICE_COUNT_HIGH) slicePreset = 2;
+                    else slicePreset = 3;
+
+                    if (ImGui::Combo("Slice Count Preset", &slicePreset, qualityPresets, IM_ARRAYSIZE(qualityPresets))) {
+                        switch (slicePreset) {
+                            case 0: gtao.sliceCount = ambient_occlusion::XE_GTAO_SLICE_COUNT_LOW;
+                                break;
+                            case 1: gtao.sliceCount = ambient_occlusion::XE_GTAO_SLICE_COUNT_MEDIUM;
+                                break;
+                            case 2: gtao.sliceCount = ambient_occlusion::XE_GTAO_SLICE_COUNT_HIGH;
+                                break;
+                            case 3: gtao.sliceCount = ambient_occlusion::XE_GTAO_SLICE_COUNT_ULTRA;
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+
+                    int stepsPreset = 0;
+                    if (gtao.stepsPerSliceCount == ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM) stepsPreset = 1;
+                    else if (gtao.stepsPerSliceCount == ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_LOW) stepsPreset = 0;
+                    else if (gtao.stepsPerSliceCount == ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA) stepsPreset = 3;
+                    else stepsPreset = 2;
+
+                    if (ImGui::Combo("Steps Per Slice Preset", &stepsPreset, qualityPresets, IM_ARRAYSIZE(qualityPresets))) {
+                        switch (stepsPreset) {
+                            case 0: gtao.stepsPerSliceCount = ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_LOW;
+                                break;
+                            case 1: gtao.stepsPerSliceCount = ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM;
+                                break;
+                            case 2: gtao.stepsPerSliceCount = ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_HIGH;
+                                break;
+                            case 3: gtao.stepsPerSliceCount = ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA;
+                                break;
+                            default:
+                                break;
+                        }
+                    }
+
+                    ImGui::Spacing();
+                    ImGui::Text("Other Parameters");
+                    ImGui::Separator();
+                    ImGui::InputInt("Debug Mode", &gtao.debug);
+
+                    ImGui::Spacing();
+                    if (ImGui::Button("Reset to Defaults")) {
+                        gtao.effectRadius = 0.5f;
+                        gtao.effectFalloffRange = 0.615f;
+                        gtao.denoiseBlurBeta = (ambient_occlusion::GTAO_DENOISE_PASSES == 0) ? (1e4f) : (1.2f);
+                        gtao.radiusMultiplier = 1.457f;
+                        gtao.sampleDistributionPower = 2.0f;
+                        gtao.thinOccluderCompensation = 0.0f;
+                        gtao.finalValuePower = 2.2f;
+                        gtao.depthMipSamplingOffset = 3.30f;
+                        gtao.noiseIndex = 0;
+                        gtao.isFinalDenoisePass = 1;
+                        gtao.sliceCount = ambient_occlusion::XE_GTAO_SLICE_COUNT_ULTRA;
+                        gtao.stepsPerSliceCount = ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA;
+                        gtao.debug = 0;
+                    }
+                }
+
+                if (ImGui::CollapsingHeader("GTAO Debug Preview")) {
+                    ImGui::Checkbox("Show GTAO Debug Preview", &showGtaoDebugPreview);
+
+                    if (showGtaoDebugPreview) {
+                        if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
+                            if (engine->ambientOcclusionPipeline->debugImage.image != VK_NULL_HANDLE) {
+                                aoDebugTextureImguiId = ImGui_ImplVulkan_AddTexture(
+                                    engine->resourceManager->getDefaultSamplerNearest(),
+                                    engine->ambientOcclusionPipeline->debugImage.imageView,
+                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+                            }
+                        }
+
+                        ImGui::BeginChild("GTAODebugPreview", ImVec2(0, 0), false, ImGuiWindowFlags_None);
+
+                        if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
+                            ImGui::TextColored(ImVec4(1.0f, 0.3f, 0.3f, 1.0f), "Debug texture not available.");
+                        }
+                        else {
+                            // Calculate best fit size
+                            float maxSize = ImGui::GetContentRegionAvail().x;
+                            maxSize = glm::min(maxSize, 1024.0f);
+
+                            VkExtent3D imageExtent = engine->ambientOcclusionPipeline->debugImage.imageExtent;
+                            float width = std::min(maxSize, static_cast<float>(imageExtent.width));
+                            float aspectRatio = static_cast<float>(imageExtent.width) / static_cast<float>(imageExtent.height);
+                            float height = width / aspectRatio;
+
+                            ImGui::Image(reinterpret_cast<ImTextureID>(aoDebugTextureImguiId), ImVec2(width, height));
+
+                            ImGui::SameLine();
+                            if (ImGui::Button("Save GTAO Debug Image")) {
+                                if (file::getOrCreateDirectory(file::imagesSavePath)) {
+                                    const std::filesystem::path path = file::imagesSavePath / "gtao_debug.png";
+
+                                    vk_helpers::saveImageR8G8B8A8UNORM(
+                                        *engine->resourceManager,
+                                        *engine->immediate,
+                                        engine->ambientOcclusionPipeline->debugImage,
+                                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                        path.string().c_str(),
+                                        0
+                                    );
+
+                                    ImGui::OpenPopup("SaveConfirmation");
+                                }
+                            }
+
+                            if (ImGui::BeginPopupModal("SaveConfirmation", NULL, ImGuiWindowFlags_AlwaysAutoResize)) {
+                                ImGui::Text("Image saved to %s/gtao_debug.png", file::imagesSavePath.string().c_str());
+                                if (ImGui::Button("OK", ImVec2(120, 0))) {
+                                    ImGui::CloseCurrentPopup();
+                                }
+                                ImGui::EndPopup();
+                            }
+                        }
+
+                        ImGui::EndChild();
+                    }
+                }
+
                 ImGui::EndTabItem();
             }
 
@@ -877,46 +1034,7 @@ void ImguiWrapper::imguiInterface(Engine* engine)
     ImGui::End();
 
     if (ImGui::Begin("Discardable Debug")) {
-        ImGui::InputInt("GTAO Debug", &engine->gtaoDebug);
-
-        if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
-            if (engine->ambientOcclusionPipeline->debugImage.image != VK_NULL_HANDLE) {
-                aoDebugTextureImguiId = ImGui_ImplVulkan_AddTexture(engine->resourceManager->getDefaultSamplerNearest(),
-                                                                    engine->ambientOcclusionPipeline->debugImage.imageView,
-                                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
-            }
-        }
-
-
-        if (aoDebugTextureImguiId == VK_NULL_HANDLE) {
-            ImGui::Text("Issue.");
-        }
-        else {
-            float maxSize = ImGui::GetContentRegionAvail().x;
-            maxSize = glm::min(maxSize, 1024.0f);
-
-            VkExtent3D imageExtent = engine->ambientOcclusionPipeline->debugImage.imageExtent;
-            float width = std::min(maxSize, static_cast<float>(imageExtent.width));
-            float aspectRatio = static_cast<float>(imageExtent.width) / static_cast<float>(imageExtent.height);
-            float height = width / aspectRatio;
-
-            ImGui::Image(reinterpret_cast<ImTextureID>(aoDebugTextureImguiId), ImVec2(width, height));
-        }
-
-        if (ImGui::Button("Save GTAO Debug Image")) {
-            if (file::getOrCreateDirectory(file::imagesSavePath)) {
-                const std::filesystem::path path = file::imagesSavePath / "gtao_debug.png";
-
-                vk_helpers::saveImageR8G8B8A8UNORM(
-                    *engine->resourceManager,
-                    *engine->immediate,
-                    engine->ambientOcclusionPipeline->debugImage,
-                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
-                    path.string().c_str(),
-                    0
-                );
-            }
-        }
+        ImGui::Text("Empty");
     }
     ImGui::End();
 
diff --git a/src/renderer/imgui_wrapper.h b/src/renderer/imgui_wrapper.h
index c77f836d..701dc596 100644
--- a/src/renderer/imgui_wrapper.h
+++ b/src/renderer/imgui_wrapper.h
@@ -82,6 +82,7 @@ class ImguiWrapper
 
     VkDescriptorSet currentlySelectedTextureImguiId{VK_NULL_HANDLE};
 
+    bool showGtaoDebugPreview = false;
     VkDescriptorSet aoDebugTextureImguiId{VK_NULL_HANDLE};
 };
 }
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index d9afb53e..5139ad51 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -13,17 +13,26 @@ namespace will_engine::ambient_occlusion
 static constexpr int32_t DEPTH_PREFILTER_MIP_COUNT = 5;
 static constexpr int32_t GTAO_DENOISE_PASSES = 1;
 
+static constexpr float XE_GTAO_SLICE_COUNT_LOW = 1.0f;
+static constexpr float XE_GTAO_SLICE_COUNT_MEDIUM = 2.0f;
+static constexpr float XE_GTAO_SLICE_COUNT_HIGH = 3.0f;
+static constexpr float XE_GTAO_SLICE_COUNT_ULTRA = 9.0f;
+static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_LOW = 2.0f;
+static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM = 2.0f;
+static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_HIGH = 3.0f;
+static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA = 3.0f;
+
 struct GTAOPushConstants
 {
-    glm::vec2 cameraTanHalfFOV;
+    glm::vec2 cameraTanHalfFOV{};
 
-    glm::vec2 ndcToViewMul;
-    glm::vec2 ndcToViewAdd;
+    glm::vec2 ndcToViewMul{};
+    glm::vec2 ndcToViewAdd{};
 
-    glm::vec2 ndcToViewMul_x_PixelSize;
+    glm::vec2 ndcToViewMul_x_PixelSize{};
 
-    float depthLinearizeMult;
-    float depthLinearizeAdd;
+    float depthLinearizeMult{0.0f};
+    float depthLinearizeAdd{0.0f};
 
     float effectRadius = 0.5f;
     float effectFalloffRange = 0.615f;
@@ -37,6 +46,9 @@ struct GTAOPushConstants
     uint32_t noiseIndex{0};
     int32_t isFinalDenoisePass{1};
 
+    float sliceCount{XE_GTAO_SLICE_COUNT_ULTRA};
+    float stepsPerSliceCount{XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA};
+
     int32_t debug{0};
 };
 
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
index 26ad79fd..1f8ddba1 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.cpp
@@ -376,31 +376,30 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::setupS
     resourceManager.setupDescriptorBufferSampler(spatialFilteringDescriptorBuffer, imageDescriptors, 0);
 }
 
-void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const
+void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo)
 {
     VkDebugUtilsLabelEXT label{};
     label.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
     label.pLabelName = "GT Ambient Occlusion";
     vkCmdBeginDebugUtilsLabelEXT(cmd, &label);
 
-    GTAOPushConstants push = drawInfo.pushConstants;
     glm::mat4 projMatrix = drawInfo.camera->getProjMatrix();
-    push.depthLinearizeMult = -projMatrix[3][2];
-    push.depthLinearizeAdd = projMatrix[2][2];
-    if (push.depthLinearizeMult * push.depthLinearizeAdd < 0) {
-        push.depthLinearizeAdd = -push.depthLinearizeAdd;
+    gtaoPush.depthLinearizeMult = -projMatrix[3][2];
+    gtaoPush.depthLinearizeAdd = projMatrix[2][2];
+    if (gtaoPush.depthLinearizeMult * gtaoPush.depthLinearizeAdd < 0) {
+        gtaoPush.depthLinearizeAdd = -gtaoPush.depthLinearizeAdd;
     }
 
     float tanHalfFOVY = 1.0f / projMatrix[1][1];
     float tanHalfFOVX = 1.0F / projMatrix[0][0];
-    push.cameraTanHalfFOV = {tanHalfFOVX, tanHalfFOVY};
-    push.ndcToViewMul = {push.cameraTanHalfFOV.x * 2.0f, push.cameraTanHalfFOV.y * -2.0f};
-    push.ndcToViewAdd = {push.cameraTanHalfFOV.x * -1.0f, push.cameraTanHalfFOV.y * 1.0f};
+    gtaoPush.cameraTanHalfFOV = {tanHalfFOVX, tanHalfFOVY};
+    gtaoPush.ndcToViewMul = {gtaoPush.cameraTanHalfFOV.x * 2.0f, gtaoPush.cameraTanHalfFOV.y * -2.0f};
+    gtaoPush.ndcToViewAdd = {gtaoPush.cameraTanHalfFOV.x * -1.0f, gtaoPush.cameraTanHalfFOV.y * 1.0f};
     constexpr glm::vec2 texelSize = {1.0f / RENDER_EXTENT_WIDTH, 1.0f / RENDER_EXTENT_HEIGHT};
-    push.ndcToViewMul_x_PixelSize = {push.ndcToViewMul.x * texelSize.x, push.ndcToViewMul.y * texelSize.y};
+    gtaoPush.ndcToViewMul_x_PixelSize = {gtaoPush.ndcToViewMul.x * texelSize.x, gtaoPush.ndcToViewMul.y * texelSize.y};
 
 
-    push.noiseIndex = GTAO_DENOISE_PASSES > 0 ? drawInfo.currentFrame % 64 : 0;
+    gtaoPush.noiseIndex = GTAO_DENOISE_PASSES > 0 ? drawInfo.currentFrame % 64 : 0;
 
     vk_helpers::transitionImage(cmd, debugImage.image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_ASPECT_COLOR_BIT);
 
@@ -408,7 +407,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     // Depth Prefilter
     {
         vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, depthPrefilterPipeline);
-        vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
+        vkCmdPushConstants(cmd, depthPrefilterPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &gtaoPush);
 
         VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
         bindingInfos[0] = drawInfo.sceneDataBinding;
@@ -434,7 +433,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     // Ambient Occlusion
     {
         vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ambientOcclusionPipeline);
-        vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
+        vkCmdPushConstants(cmd, ambientOcclusionPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &gtaoPush);
 
         VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
         bindingInfos[0] = drawInfo.sceneDataBinding;
@@ -459,7 +458,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     // Spatial Filtering
     {
         vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, spatialFilteringPipeline);
-        vkCmdPushConstants(cmd, spatialFilteringPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &push);
+        vkCmdPushConstants(cmd, spatialFilteringPipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(GTAOPushConstants), &gtaoPush);
 
         VkDescriptorBufferBindingInfoEXT bindingInfos[2] = {};
         bindingInfos[0] = drawInfo.sceneDataBinding;
@@ -478,9 +477,7 @@ void will_engine::ambient_occlusion::GroundTruthAmbientOcclusionPipeline::draw(V
     }
 
     vk_helpers::transitionImage(cmd, denoisedFinalAO.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
-                                    VK_IMAGE_ASPECT_COLOR_BIT);
-
-
+                                VK_IMAGE_ASPECT_COLOR_BIT);
 
 
     vk_helpers::transitionImage(cmd, debugImage.image, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
diff --git a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
index 5964b1f3..f5a78a74 100644
--- a/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
+++ b/src/renderer/lighting/ambient_occlusion/ground_truth/ground_truth_ambient_occlusion.h
@@ -27,7 +27,7 @@ class GroundTruthAmbientOcclusionPipeline
 
     void setupSpatialFilteringDescriptorBuffer(const VkImageView& depthImageView, const VkImageView& normalsImageView);
 
-    void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo) const;
+    void draw(VkCommandBuffer cmd, const GTAODrawInfo& drawInfo);
 
     void reloadShaders();
 
@@ -87,6 +87,8 @@ class GroundTruthAmbientOcclusionPipeline
 private:
     ResourceManager& resourceManager;
 
+    GTAOPushConstants gtaoPush{};
+
     friend void ImguiWrapper::imguiInterface(Engine* engine);
 };
 }
diff --git a/src/renderer/pipelines/visibility_pass/visibility_pass.h b/src/renderer/pipelines/visibility_pass/visibility_pass.h
index cb7695d6..1c7bf721 100644
--- a/src/renderer/pipelines/visibility_pass/visibility_pass.h
+++ b/src/renderer/pipelines/visibility_pass/visibility_pass.h
@@ -39,8 +39,6 @@ class VisibilityPassPipeline
 
     void draw(VkCommandBuffer cmd, const VisibilityPassDrawInfo& drawInfo) const;
 
-    void indirectBufferSynchronize();
-
     void reloadShaders() { createPipeline(); }
 
 private:

From 339415ac06aee640b65486cb64517574ce2745ec Mon Sep 17 00:00:00 2001
From: Williscool13 <twtw40@gmail.com>
Date: Mon, 31 Mar 2025 19:11:44 +0700
Subject: [PATCH 27/27] Final GTAO version.

---
 .../ground_truth/gtao_main_pass.comp              | 14 +++++---------
 src/renderer/imgui_wrapper.cpp                    | 14 +-------------
 .../ambient_occlusion/ambient_occlusion_types.h   | 15 ++++++++++++---
 src/renderer/renderer_constants.h                 |  4 ++--
 4 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
index fa3d99e7..6aff7c77 100644
--- a/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
+++ b/shaders/ambient_occlusion/ground_truth/gtao_main_pass.comp
@@ -87,7 +87,7 @@ vec3 cheapReconstructViewSpacePosition(vec2 uv, float viewspaceDepth)
 {
     vec3 ret;
     ret.xy = (pushConstants.ndcToViewMul * uv.xy + pushConstants.ndcToViewAdd) * viewspaceDepth;
-    //ret.y = -ret.y;
+    ret.y = -ret.y; // Negate Y after the ndcToViewMul/ndcToViewAdd mapping. NOTE(review): presumably needed so this path matches the Y orientation of reconstructViewSpacePosition, which it replaces in main() - confirm
     ret.z = -viewspaceDepth;
     return ret;
 }
@@ -149,14 +149,12 @@ void main() {
     // Per Intel: Move center pixel slightly towards camera to avoid imprecision artifacts due to depth buffer imprecision; offset depends on depth texture format used
     viewSpaceZM = viewSpaceZM * 0.998f;
 
-    //vec3 vPos = cheapReconstructViewSpacePosition(uv, viewSpaceZM);
-    vec4 vPosAlt = reconstructViewSpacePosition(uv, viewSpaceZM);
-    vec3 vPos = vPosAlt.xyz;
+    vec3 vPos = cheapReconstructViewSpacePosition(uv, viewSpaceZM);
     vec3 viewVec = normalize(-vPos);
 
     // debug world pos
     if (pushConstants.debug == 3){
-        vec3 worldPos = (sceneData.invView * vPosAlt).xyz;
+        vec3 worldPos = (sceneData.invView * vec4(vPos, 1.0f)).xyz;
         imageStore(debugImage, screenPos, vec4(worldPos / 1000.0f, 1.0f));
     }
 
@@ -284,13 +282,11 @@ void main() {
 
                 vec2 sampleScreenPos0 = uv + sampleOffset;
                 float  SZ0 = textureLod(prefilteredDepth, sampleScreenPos0, mipLevel).r;
-                //vec3 samplePos0 = cheapReconstructViewSpacePosition(sampleScreenPos0, SZ0);
-                vec3 samplePos0 = reconstructViewSpacePosition(sampleScreenPos0, SZ0).xyz;
+                vec3 samplePos0 = cheapReconstructViewSpacePosition(sampleScreenPos0, SZ0);
 
                 vec2 sampleScreenPos1 = uv - sampleOffset;
                 float  SZ1 = textureLod(prefilteredDepth, sampleScreenPos1, mipLevel).r;
-                //vec3 samplePos1 = cheapReconstructViewSpacePosition(sampleScreenPos1, SZ1);
-                vec3 samplePos1 = reconstructViewSpacePosition(sampleScreenPos1, SZ1).xyz;
+                vec3 samplePos1 = cheapReconstructViewSpacePosition(sampleScreenPos1, SZ1);
 
                 vec3 sampleDelta0     = (samplePos0 - vec3(vPos));// using lpfloat for sampleDelta causes precision issues
                 vec3 sampleDelta1     = (samplePos1 - vec3(vPos));// using lpfloat for sampleDelta causes precision issues
diff --git a/src/renderer/imgui_wrapper.cpp b/src/renderer/imgui_wrapper.cpp
index aea583b3..b8605098 100644
--- a/src/renderer/imgui_wrapper.cpp
+++ b/src/renderer/imgui_wrapper.cpp
@@ -296,19 +296,7 @@ void ImguiWrapper::imguiInterface(Engine* engine)
 
                     ImGui::Spacing();
                     if (ImGui::Button("Reset to Defaults")) {
-                        gtao.effectRadius = 0.5f;
-                        gtao.effectFalloffRange = 0.615f;
-                        gtao.denoiseBlurBeta = (ambient_occlusion::GTAO_DENOISE_PASSES == 0) ? (1e4f) : (1.2f);
-                        gtao.radiusMultiplier = 1.457f;
-                        gtao.sampleDistributionPower = 2.0f;
-                        gtao.thinOccluderCompensation = 0.0f;
-                        gtao.finalValuePower = 2.2f;
-                        gtao.depthMipSamplingOffset = 3.30f;
-                        gtao.noiseIndex = 0;
-                        gtao.isFinalDenoisePass = 1;
-                        gtao.sliceCount = ambient_occlusion::XE_GTAO_SLICE_COUNT_ULTRA;
-                        gtao.stepsPerSliceCount = ambient_occlusion::XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA;
-                        gtao.debug = 0;
+                        gtao = {};
                     }
                 }
 
diff --git a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
index 5139ad51..18a2904d 100644
--- a/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
+++ b/src/renderer/lighting/ambient_occlusion/ambient_occlusion_types.h
@@ -22,6 +22,15 @@ static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM = 2.0f;
 static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_HIGH = 3.0f;
 static constexpr float XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA = 3.0f;
 
+// Default GTAO quality preset: ULTRA when NDEBUG is defined, MEDIUM otherwise. Typed constants keep these names scoped like the XE_GTAO_* presets.
+#ifdef NDEBUG
+static constexpr float DEFAULT_GTAO_SLICE_COUNT = XE_GTAO_SLICE_COUNT_ULTRA;
+static constexpr float DEFAULT_GTAO_STEPS_PER_SLICE_COUNT = XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA;
+#else
+static constexpr float DEFAULT_GTAO_SLICE_COUNT = XE_GTAO_SLICE_COUNT_MEDIUM;
+static constexpr float DEFAULT_GTAO_STEPS_PER_SLICE_COUNT = XE_GTAO_STEPS_PER_SLICE_COUNT_MEDIUM;
+#endif
+
 struct GTAOPushConstants
 {
     glm::vec2 cameraTanHalfFOV{};
@@ -46,10 +55,10 @@ struct GTAOPushConstants
     uint32_t noiseIndex{0};
     int32_t isFinalDenoisePass{1};
 
-    float sliceCount{XE_GTAO_SLICE_COUNT_ULTRA};
-    float stepsPerSliceCount{XE_GTAO_STEPS_PER_SLICE_COUNT_ULTRA};
+    float sliceCount{DEFAULT_GTAO_SLICE_COUNT}; // Build-dependent default: ULTRA preset when NDEBUG is defined, MEDIUM otherwise
+    float stepsPerSliceCount{DEFAULT_GTAO_STEPS_PER_SLICE_COUNT}; // Same build-dependent selection as sliceCount
 
-    int32_t debug{0};
+    int32_t debug{4}; // NOTE(review): default was 0 before this change, and the ImGui "Reset to Defaults" button now appears to restore member defaults via `gtao = {}` - confirm 4 is the intended default debug mode
 };
 
 struct GTAODrawInfo
diff --git a/src/renderer/renderer_constants.h b/src/renderer/renderer_constants.h
index 54db7310..b383e60b 100644
--- a/src/renderer/renderer_constants.h
+++ b/src/renderer/renderer_constants.h
@@ -10,8 +10,8 @@ constexpr int32_t FRAME_OVERLAP = 2;
 constexpr char ENGINE_NAME[] = "Will Engine";
 constexpr bool USING_REVERSED_DEPTH_BUFFER = true;
 constexpr VkDeviceSize ZERO_DEVICE_SIZE = 0;
-constexpr VkExtent2D RENDER_EXTENTS{1920, 1080};
-//constexpr VkExtent2D RENDER_EXTENTS{3840, 2160};
+//constexpr VkExtent2D RENDER_EXTENTS{1920, 1080};
+constexpr VkExtent2D RENDER_EXTENTS{3840, 2160};
 constexpr float RENDER_EXTENT_WIDTH{RENDER_EXTENTS.width};
 constexpr float RENDER_EXTENT_HEIGHT{RENDER_EXTENTS.height};