From 6befba73eac8e1c85eee965e1d4000beb6bea1b6 Mon Sep 17 00:00:00 2001
From: Kevin Wheatley <kevin.wheatley@framestore.com>
Date: Fri, 21 Feb 2025 00:11:17 +0000
Subject: [PATCH 01/11] ACES 2.0 Output Transform performance optimisation
 (#2119)

* Extend ocioperf to take config file parameter on CLI

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Extend ocioconvert to take config on command line

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Extract tonescale_fwd function

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Extract inverse tonescale function

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Combine c and Z variables in J calculation exponent
replace 100.0 entries when referring to the scale of J
Extract calculation of nonlinear compression into functions

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Split RGB<->JMh function into two parts to expose opponent intermediate values

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Use function to compute matrix multiply for LMS calculations

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove unused member variable from JMhParams structure

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Combine chromatic adaptation weights into LMS matrix (and inverse) - CHANGES PIXEL OUTPUT

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Use matrix form for transforming cone responses to Aab

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Normalise the F_L parameter

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove ra and ba related variables to avoid them being out of sync with opponent calculation

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Make A<->J conversion function generic

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Deduplicate Y<->J conversions

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Factor JMh scaling parameters into Aab matrices

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* factor out references to PI, 360 and 180 constants
Avoid looking up cusp twice during inverse
Whilst searching for the cusp we have already constrained the search so we do not need to clamp

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Add functions to explain some of the calculations

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Further clarify when 100 means reference luminance
Migrate rescaling into tonescale s_2 parameter
Rename model_gamma to reflect it is actually the inverse

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* migrate init steps performed within other init functions to the top level to avoid repeat init of precomputed values.

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* extract some of the fixed values that only depend on the hue to reduce recomputation during inverse gamut mapping

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Avoid double lookup for reachMaxM value by resolving once the hue is known.

also reduces size of object on stack by not passing the whole table.

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Push wrapping of hues to the boundary,
mark up conversion points from external inputs etc

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Store gamma values as reciprocals
move more magic constants into const variables
factor some of the complex expressions into function (temporarily makes things slower)

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Add some missing includes to headers

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* minor cleanup to use std::array instead of plain array for test samples

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Inline reach boundary finding
restructure find_gamut_boundary_intersection to highlight common patterns.

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Extract gamut mapper compression function
rework get_focus_gain to directly compute the slope_gain
Share calculation of analytical threshold

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Rework gamut mapper to compress absolute M then only recalculate J

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Precalculate maximum search range for cusp lookup

next steps would be to factor hue into separate table to improve cache hits followed by redistribution to more uniform hues which should narrow search range

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Experiment with reusing slope calculations in gamut mapper
presmooth cusp values

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Add a collection of TODO's

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Restore function mapping table index to hue

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Minor tweaks to tonescale inverse clamp

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove duplicate table whilst calculating upper hull gamma

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Add some additional sample points for the upper hull gamma finder

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Slight tidy up of gamma fitting code

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Experiment with alternate smin implementation

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove unused function and tidy up comments

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Extract hue search into separate function

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Extract hues into separate table, merge gamma values into their place (gamma values now sampled on cusp hue intervals). Removes extra texture from GPU path.

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Simplify upper hull gamma hue lookup to avoid unneeded lerping as we are sampling the table entries directly

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Split out tonescale function, minor tweaks to Aab->JMh

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Build tables more uniformly, needs some clean up and lots of testing

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Speed up reach corner finding by switching to testing against the Achromatic rather than J limit

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Speed up hull gamma finding by computing values which depend only on the test points and not the gamma values themselves

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Adjust GPU hue lookup to take advantage of more uniform distribution

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Fix GLSL compatibility with hue lookup
Remove compiler warnings for unused parameters

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Attempt to simplify table generation code

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Explicitly allow GCC to perform additional optimisations - Needs some discussion

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Add extra entries to reach table to avoid needing to clamp to range during pixel processing

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* GPU move reach Max M sampling to avoid looking it up multiple times per pixel

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove smoothing from GPU path, it is baked into the cusp

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Fix bug with reach lookup

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Try only wrap hues on input to the shaders

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* rework GPU gamut compressor to follow the same algorithm as CPU. Not 100% the same GPU still recalculates some values

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Rework solve_J_intersect to have fewer div instructions

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Adjust GPU code to better align with CPU code's structure, some additional precomputation is now applied during shader generation

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Precompute more scaling factors into matrices and nonlinear functions

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Experiment with unsigned integers for array access

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Bypass one J-> A conversion by saving the Aab computed earlier

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Test intrinsics for compression Norm calculation

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Attempt to calculate sin/cos only once per pixel.
Some minor micro optimisations.
Further alignment of GPU with CPU code,
Test values need evaluating
Some GPU results are different - TBD

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove unused parameters

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Try tree vectoriser for gcc

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Add Vectorise option for MSVC

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove unused function

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Constexpr std::max is only available from C++ 14; for now avoid the call to it

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Try to fix intrinsic based errors on some build configurations

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Another C++ 14 usage fix

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

* Remove check for CLANG left over from testing

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>

---------

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>
---
 src/OpenColorIO/CMakeLists.txt                |   10 +
 .../ops/fixedfunction/ACES2/ColorLib.h        |    1 +
 .../ops/fixedfunction/ACES2/Common.h          |  149 +-
 .../ops/fixedfunction/ACES2/MatrixLib.h       |    1 +
 .../ops/fixedfunction/ACES2/Transform.cpp     | 1478 +++++++++++------
 .../ops/fixedfunction/ACES2/Transform.h       |   28 +-
 .../ops/fixedfunction/FixedFunctionOpCPU.cpp  |   94 +-
 .../ops/fixedfunction/FixedFunctionOpGPU.cpp  |  891 +++++-----
 src/apps/ocioconvert/main.cpp                 |   68 +-
 src/apps/ocioperf/main.cpp                    |   71 +-
 tests/cpu/CMakeLists.txt                      |    1 +
 tests/gpu/FixedFunctionOp_test.cpp            |    3 +
 12 files changed, 1722 insertions(+), 1073 deletions(-)

diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 7d2894da6b..857058696c 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -220,6 +220,16 @@ if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON))
     set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS})
     set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS})
     set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX512.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS})
+    set_property(SOURCE ops/fixedfunction/ACES2/Transform.cpp APPEND PROPERTY COMPILE_OPTIONS  ${OCIO_SSE2_ARGS} ${OCIO_AVX_ARGS})
+endif()
+
+# TODO: What to do for other compilers?
+if(USE_GCC)
+    set_property(SOURCE ops/fixedfunction/ACES2/Transform.cpp APPEND PROPERTY COMPILE_OPTIONS
+                -fno-math-errno -fno-signed-zeros -fno-trapping-math -fno-signaling-nans -ffinite-math-only -freciprocal-math -ftree-vectorize)
+endif()
+if(MSVC)
+    set_property(SOURCE ops/fixedfunction/ACES2/Transform.cpp APPEND PROPERTY COMPILE_OPTIONS /Qvec)
 endif()
 
 configure_file(CPUInfoConfig.h.in CPUInfoConfig.h)
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h b/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h
index f10ddf3d56..17d5022ee5 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h
@@ -7,6 +7,7 @@
 #include "transforms/builtins/ColorMatrixHelpers.h"
 #include "MatrixLib.h"
 
+#include <cmath>
 
 namespace OCIO_NAMESPACE
 {
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Common.h b/src/OpenColorIO/ops/fixedfunction/ACES2/Common.h
index aeb1524071..18f0123636 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/Common.h
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Common.h
@@ -7,44 +7,98 @@
 #include "MatrixLib.h"
 #include "ColorLib.h"
 
+#include <cmath>
 
 namespace OCIO_NAMESPACE
 {
 
 namespace ACES2
 {
+constexpr float PI = 3.14159265358979f;
 
-constexpr int TABLE_SIZE = 360;
-constexpr int TABLE_ADDITION_ENTRIES = 2;
-constexpr int TABLE_TOTAL_SIZE = TABLE_SIZE + TABLE_ADDITION_ENTRIES;
-constexpr int GAMUT_TABLE_BASE_INDEX = 1;
+constexpr float hue_limit = 360.0f;
+//constexpr float hue_limit = 2.0f * PI;
+inline float _wrap_to_hue_limit(float y)
+{
+    if ( y < 0.f)
+    {
+        y = y + hue_limit;
+    }
+    return y;
+}
+
+inline float wrap_to_hue_limit(float hue)
+{
+    float y = std::fmod(hue, hue_limit);
+    return _wrap_to_hue_limit(y);
+}
+inline constexpr float to_degrees(const float v) { return v; }
+inline float from_degrees(const float v) { return wrap_to_hue_limit(v); }
+inline constexpr float to_radians(const float v) { return PI * v / 180.0f; };
+inline float _from_radians(const float v) { return _wrap_to_hue_limit(180.0f * v / PI); }; // v needs to be wrapped already
+inline float from_radians(const float v) { return wrap_to_hue_limit(180.0f * v / PI); };
+/*
+inline constexpr float to_degrees(const float v) { return 180.0f * v / PI; }
+inline float from_degrees(const float v) { return wrap_to_hue_limit(PI * v / 180.0f); }
+inline constexpr float to_radians(const float v) { return v; }
+inline float _from_radians(const float v) { return _wrap_to_hue_limit(v); };
+inline float from_radians(const float v) { return wrap_to_hue_limit(v); };
+*/
+
+struct TableBase
+{
+    static constexpr unsigned int _TABLE_ADDITION_ENTRIES = 2;
+    static constexpr unsigned int base_index = 1;
+    static constexpr unsigned int nominal_size = 360;
+    static constexpr unsigned int total_size = nominal_size + _TABLE_ADDITION_ENTRIES;
+
+    static constexpr unsigned int lower_wrap_index = 0;
+    static constexpr unsigned int upper_wrap_index = base_index + nominal_size;
+    static constexpr unsigned int first_nominal_index = base_index;
+    static constexpr unsigned int last_nominal_index = upper_wrap_index - 1;
+
+    inline float base_hue_for_position(unsigned int i_lo) const
+    {
+        if (hue_limit == float(nominal_size)) // TODO C++ 17 if constexpr
+            return float(i_lo);
+
+        const float result = i_lo * hue_limit / nominal_size;
+        return result;
+    }
+
+    inline unsigned int hue_position_in_uniform_table(float wrapped_hue) const 
+    {
+        if (hue_limit == float(nominal_size)) // TODO C++ 17 if constexpr
+            return static_cast<unsigned int>(wrapped_hue);
+        else
+            return static_cast<unsigned int>(wrapped_hue / hue_limit * float(nominal_size)); // TODO: can we use the 'lost' fraction for the lerps?
+    }
+
+   inline unsigned int nominal_hue_position_in_uniform_table(float wrapped_hue) const 
+    {
+        return first_nominal_index + hue_position_in_uniform_table(wrapped_hue);
+    }
+};
 
-struct Table3D
+struct Table3D : public TableBase, std::array<float[3], TableBase::total_size>
 {
-    static constexpr int base_index = GAMUT_TABLE_BASE_INDEX;
-    static constexpr int size = TABLE_SIZE;
-    static constexpr int total_size = TABLE_TOTAL_SIZE;
-    float table[TABLE_TOTAL_SIZE][3];
 };
 
-struct Table1D
+struct Table1D : public TableBase, std::array<float, TableBase::total_size>
 {
-    static constexpr int base_index = GAMUT_TABLE_BASE_INDEX;
-    static constexpr int size = TABLE_SIZE;
-    static constexpr int total_size = TABLE_TOTAL_SIZE;
-    float table[TABLE_TOTAL_SIZE];
 };
 
 struct JMhParams
 {
-    float F_L;
-    float z;
-    float A_w;
+    m33f MATRIX_RGB_to_CAM16_c;
+    m33f MATRIX_CAM16_c_to_RGB;
+    m33f MATRIX_cone_response_to_Aab;
+    m33f MATRIX_Aab_to_cone_response;
+    float F_L_n;    // F_L normalised
+    float cz;
+    float inv_cz;   // 1/cz
     float A_w_J;
-    f3 XYZ_w;
-    f3 D_RGB;
-    m33f MATRIX_RGB_to_CAM16;
-    m33f MATRIX_CAM16_to_RGB;
+    float inv_A_w_J; // 1/A_w_J
 };
 
 struct ToneScaleParams
@@ -57,41 +111,63 @@ struct ToneScaleParams
     float s_2;
     float u_2;
     float m_2;
+    float forward_limit;
+    float inverse_limit;
+    float log_peak;
 };
 
-struct ChromaCompressParams
+struct SharedCompressionParameters
 {
     float limit_J_max;
-    float model_gamma;
+    float model_gamma_inv;
+    Table1D reach_m_table;
+};
+
+struct ResolvedSharedCompressionParameters
+{
+    float limit_J_max;
+    float model_gamma_inv;
+    float reachMaxM;
+};
+
+struct ChromaCompressParams
+{
     float sat;
     float sat_thr;
     float compr;
-    Table1D reach_m_table;
     float chroma_compress_scale;
     static constexpr float cusp_mid_blend = 1.3f;
 };
 
+struct HueDependantGamutParams
+{
+    float gamma_bottom_inv;
+    f2 JMcusp;
+    float gamma_top_inv;
+    float focusJ;
+    float analytical_threshold;
+};
 struct GamutCompressParams
 {
-    float limit_J_max;
     float mid_J;
-    float model_gamma;
     float focus_dist;
-    float lower_hull_gamma;
-    Table1D reach_m_table;
+    float lower_hull_gamma_inv;
+    std::array<int, 2> hue_linearity_search_range;
+    Table1D hue_table;;
     Table3D gamut_cusp_table;
-    Table1D upper_hull_gamma_table;
 };
 
 // CAM
 constexpr float reference_luminance = 100.f;
 constexpr float L_A = 100.f;
 constexpr float Y_b = 20.f;
-constexpr float ac_resp = 1.f;
-constexpr float ra = 2.f * ac_resp;
-constexpr float ba = 0.05f + (2.f - ra);
 constexpr f3 surround = {0.9f, 0.59f, 0.9f}; // Dim surround
 
+constexpr float J_scale = 100.0f;
+constexpr float cam_nl_Y_reference = 100.0f;
+constexpr float cam_nl_offset = 0.2713f * cam_nl_Y_reference;
+constexpr float cam_nl_scale = 4.0f * cam_nl_Y_reference;
+
 // Chroma compression
 constexpr float chroma_compress = 2.4f;
 constexpr float chroma_compress_fact = 3.3f;
@@ -100,11 +176,11 @@ constexpr float chroma_expand_fact = 0.69f;
 constexpr float chroma_expand_thr = 0.5f;
 
 // Gamut compression
-constexpr float smooth_cusps = 0.12f;
+constexpr float smooth_cusps =  0.12f; // C++ 14 required for constexpr std::max(0.000001f, 0.12f);
 constexpr float smooth_m = 0.27f;
 constexpr float cusp_mid_blend = 1.3f;
 constexpr float focus_gain_blend = 0.3f;
-constexpr float focus_adjust_gain = 0.55f;
+constexpr float focus_adjust_gain_inv = 1.0f / 0.55f;
 constexpr float focus_distance = 1.35f;
 constexpr float focus_distance_scaling = 1.75f;
 constexpr float compression_threshold = 0.75f;
@@ -125,6 +201,11 @@ constexpr float gammaMaximum = 5.0f;
 constexpr float gammaSearchStep = 0.4f;
 constexpr float gammaAccuracy = 1e-5f;
 
+constexpr int cuspCornerCount = 6;
+constexpr int totalCornerCount = cuspCornerCount + 2;
+constexpr int max_sorted_corners = 2 * cuspCornerCount;
+constexpr float reach_cusp_tolerance = 1e-3f;
+constexpr float display_cusp_tolerance = 1e-7f;
 
 } // namespace ACES2
 
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/MatrixLib.h b/src/OpenColorIO/ops/fixedfunction/ACES2/MatrixLib.h
index 27a24df0c9..d552742ccb 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/MatrixLib.h
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/MatrixLib.h
@@ -6,6 +6,7 @@
 
 #include "ops/matrix/MatrixOpData.h"
 
+#include <array>
 
 namespace OCIO_NAMESPACE
 {
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
index 80b8ba5c72..14461ce8fa 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
@@ -3,6 +3,13 @@
 
 #include "Transform.h"
 
+#include <array>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "SSE2.h"
+#include "AVX.h"
 
 namespace OCIO_NAMESPACE
 {
@@ -13,48 +20,47 @@ namespace ACES2
 //
 // Table lookups
 //
-
-float wrap_to_360(float hue)
+inline f3 lerp(const f3& lower, const f3& upper, const float t)
 {
-    float y = std::fmod(hue, 360.f);
-    if ( y < 0.f)
-    {
-        y = y + 360.f;
-    }
-    return y;
+    return {
+        lerpf(lower[0], upper[0], t),
+        lerpf(lower[1], upper[1], t),
+        lerpf(lower[2], upper[2], t)
+    };
 }
 
-float base_hue_for_position(int i_lo, int table_size) 
+inline f3 lerp(const float lower[3], const float upper[3], const float t)
 {
-    const float result = i_lo * 360.f / table_size;
-    return result;
+    return {
+        lerpf(lower[0], upper[0], t),
+        lerpf(lower[1], upper[1], t),
+        lerpf(lower[2], upper[2], t)
+    };
 }
 
-int hue_position_in_uniform_table(float hue, int table_size)
+inline unsigned int midpoint(unsigned int a, unsigned int b)
 {
-    const float wrapped_hue = wrap_to_360(hue);
-    return int(wrapped_hue / 360.f * (float) table_size);
+    return (a + b) / 2;
 }
 
-int next_position_in_table(int entry, int table_size)
+inline float midpoint(float a, float b)
 {
-    return (entry + 1) % table_size;
+    return (a + b) / 2.f;
 }
 
-int clamp_to_table_bounds(int entry, int table_size)
+unsigned int lookup_hue_interval(float h, const Table1D &hues, const std::array<int, 2> & hue_linearity_search_range)
 {
-    return std::min(table_size - 1, std::max(0, entry));
-}
+    // Search the given Table for the interval containing the desired hue
+    // Returns the upper index of the interval
 
-f2 cusp_from_table(float h, const Table3D &gt)
-{
-    int i_lo = 0;
-    int i_hi = gt.base_index + gt.size; // allowed as we have an extra entry in the table
-    int i = clamp_to_table_bounds(hue_position_in_uniform_table(h, gt.size) + gt.base_index, gt.total_size);
+    // We can narrow the search range based on the hues being almost uniform
+    unsigned int i = hues.nominal_hue_position_in_uniform_table(h);
+    unsigned int i_lo = std::max(int(hues.lower_wrap_index), int(i) + hue_linearity_search_range[0]); // Should be nominal not lower_wrap?
+    unsigned int i_hi = std::min(int(hues.upper_wrap_index), int(i) + hue_linearity_search_range[1]);
 
     while (i_lo + 1 < i_hi)
     {
-        if (h > gt.table[i][2])
+        if (h > hues[i])
         {
             i_lo = i;
         }
@@ -62,165 +68,265 @@ f2 cusp_from_table(float h, const Table3D &gt)
         {
             i_hi = i;
         }
-        i = clamp_to_table_bounds((i_lo + i_hi) / 2, gt.total_size);
+        i = midpoint(i_lo, i_hi);
     }
 
-    i_hi = std::max(1, i_hi);
+    i_hi = std::max(1U, i_hi); // TODO: should not be needed if we initialise with the correct lo and hi
 
-    const f3 lo {
-        gt.table[i_hi-1][0],
-        gt.table[i_hi-1][1],
-        gt.table[i_hi-1][2]
-    };
-
-    const f3 hi {
-        gt.table[i_hi][0],
-        gt.table[i_hi][1],
-        gt.table[i_hi][2]
-    };
-
-    const float t = (h - lo[2]) / (hi[2] - lo[2]);
-    const float cuspJ = lerpf(lo[0], hi[0], t);
-    const float cuspM = lerpf(lo[1], hi[1], t);
-
-    return { cuspJ, cuspM };
+    return i_hi;
 }
 
-float reach_m_from_table(float h, const ACES2::Table1D &gt)
+inline float interpolation_weight(float h, float h_lo, float h_hi)
 {
-    const int i_lo = clamp_to_table_bounds(hue_position_in_uniform_table(h, gt.size), gt.total_size);
-    const int i_hi = clamp_to_table_bounds(next_position_in_table(i_lo, gt.size), gt.total_size);
-
-    const float t = (h - i_lo) / (i_hi - i_lo);
-    return lerpf(gt.table[i_lo], gt.table[i_hi], t);
+    return (h - h_lo) / (h_hi - h_lo);
 }
 
-float hue_dependent_upper_hull_gamma(float h, const ACES2::Table1D &gt)
+inline float interpolation_weight(float h, unsigned int h_lo, unsigned int h_hi)
 {
-    const int i_lo = clamp_to_table_bounds(hue_position_in_uniform_table(h, gt.size) + gt.base_index, gt.total_size);
-    const int i_hi = clamp_to_table_bounds(next_position_in_table(i_lo, gt.size), gt.total_size);
+    return (h - h_lo) / (h_hi - h_lo);
+}
 
-    const float base_hue = (float) (i_lo - gt.base_index);
+inline f3 cusp_from_table(unsigned int i_hi, float t, const Table3D &gt)
+{
+    return lerp(gt[i_hi-1], gt[i_hi], t);
+}
 
-    const float t = wrap_to_360(h) - base_hue;
+float reach_m_from_table(float h, const ACES2::Table1D &rt)
+{
+    const unsigned int base = rt.hue_position_in_uniform_table(h);
+    const float t = h - base; // NOTE assumes uniform 1 degree 360 spacing
+    const unsigned int i_lo = base + rt.first_nominal_index; 
+    const unsigned int i_hi = i_lo + 1; // NOTE assumes uniform 1 degree 360 spacing 
 
-    return lerpf(gt.table[i_lo], gt.table[i_hi], t);
+    return lerpf(rt[i_lo], rt[i_hi], t);
 }
 
 //
 // CAM
 //
 
-// Post adaptation non linear response compression
-float panlrc_forward(float v, float F_L)
+inline float _post_adaptation_cone_response_compression_fwd(float Rc)
+{
+    const float F_L_Y = powf(Rc, 0.42f);
+    const float Ra    = (F_L_Y) / (cam_nl_offset + F_L_Y);
+    return Ra;
+}
+
+inline float _post_adaptation_cone_response_compression_inv(float Ra)
 {
-    const float F_L_v = powf(F_L * std::abs(v) / reference_luminance, 0.42f);
+    const float F_L_Y = (cam_nl_offset * Ra) / (1.0f - Ra); // TODO: what happens when Ra >= 1.0
+    const float Rc    = powf(F_L_Y, 1.f / 0.42f);
+    return Rc;
+}
+
+
+float post_adaptation_cone_response_compression_fwd(float v)
+{
+    const float abs_v = std::abs(v);
+    const float Ra =  _post_adaptation_cone_response_compression_fwd(abs_v);
     // Note that std::copysign(1.f, 0.f) returns 1 but the CTL copysign(1.,0.) returns 0.
     // TODO: Should we change the behaviour?
-    return (400.f * std::copysign(1.f, v) * F_L_v) / (27.13f + F_L_v);
+    return std::copysign(Ra, v);
 }
 
-float panlrc_inverse(float v, float F_L)
+float post_adaptation_cone_response_compression_inv(float v)
 {
-    return std::copysign(1.f, v) * reference_luminance / F_L * powf((27.13f * std::abs(v) / (400.f - std::abs(v))), 1.f / 0.42f);
+    const float abs_v = std::abs(v);
+    const float Rc    = _post_adaptation_cone_response_compression_inv(abs_v);
+    return std::copysign(Rc, v);
 }
 
-// Optimization used during initialization
-float Y_to_J(float Y, const JMhParams &params)
+inline float Achromatic_n_to_J(float A, const float cz)
 {
-    float F_L_Y = powf(params.F_L * std::abs(Y) / reference_luminance, 0.42f);
-    return std::copysign(1.f, Y) * reference_luminance * powf(((400.f * F_L_Y) / (27.13f + F_L_Y)) / params.A_w_J, surround[1] * params.z);
+    return J_scale * powf(A, cz);
 }
 
-f3 RGB_to_JMh(const f3 &RGB, const JMhParams &p)
+inline float J_to_Achromatic_n(float J, const float inv_cz)
 {
-    const float red = RGB[0];
-    const float grn = RGB[1];
-    const float blu = RGB[2];
+    return powf(J * (1.0f / J_scale), inv_cz);
+}
 
-    const float red_m = red * p.MATRIX_RGB_to_CAM16[0] + grn * p.MATRIX_RGB_to_CAM16[1] + blu * p.MATRIX_RGB_to_CAM16[2];
-    const float grn_m = red * p.MATRIX_RGB_to_CAM16[3] + grn * p.MATRIX_RGB_to_CAM16[4] + blu * p.MATRIX_RGB_to_CAM16[5];
-    const float blu_m = red * p.MATRIX_RGB_to_CAM16[6] + grn * p.MATRIX_RGB_to_CAM16[7] + blu * p.MATRIX_RGB_to_CAM16[8];
+// Optimization for achromatic values
 
-    const float red_a =  panlrc_forward(red_m * p.D_RGB[0], p.F_L);
-    const float grn_a =  panlrc_forward(grn_m * p.D_RGB[1], p.F_L);
-    const float blu_a =  panlrc_forward(blu_m * p.D_RGB[2], p.F_L);
+inline float _A_to_Y(float A, const JMhParams &p)
+{
+    const float Ra = p.A_w_J * A;
+    const float Y  = _post_adaptation_cone_response_compression_inv(Ra) / p.F_L_n;
+    return Y;
+}
 
-    const float A = 2.f * red_a + grn_a + 0.05f * blu_a;
-    const float a = red_a - 12.f * grn_a / 11.f + blu_a / 11.f;
-    const float b = (red_a + grn_a - 2.f * blu_a) / 9.f;
+inline float _J_to_Y(float abs_J, const JMhParams &p)
+{
+    return _A_to_Y(J_to_Achromatic_n(abs_J, p.inv_cz), p);
+}
 
-    const float J = 100.f * powf(A / p.A_w, surround[1] * p.z);
+inline float _Y_to_J(float abs_Y, const JMhParams &p)
+{
+    const float Ra = _post_adaptation_cone_response_compression_fwd(abs_Y * p.F_L_n);
+    const float J  = Achromatic_n_to_J(Ra * p.inv_A_w_J, p.cz);
+    return J;
+}
+
+float Y_to_J(float Y, const JMhParams &p)
+{
+    const float abs_Y = std::abs(Y);
+    const float J     = _Y_to_J(abs_Y, p);
+    return std::copysign(J, Y);
+}
+
+f3 RGB_to_Aab(const f3 &RGB, const JMhParams &p)
+{
+    const f3 rgb_m = mult_f3_f33(RGB, p.MATRIX_RGB_to_CAM16_c);
 
-    const float M = J == 0.f ? 0.f : 43.f * surround[2] * sqrt(a * a + b * b);
+    const f3 rgb_a = {
+        post_adaptation_cone_response_compression_fwd(rgb_m[0]),
+        post_adaptation_cone_response_compression_fwd(rgb_m[1]),
+        post_adaptation_cone_response_compression_fwd(rgb_m[2])
+    };
 
-    const float PI = 3.14159265358979f;
-    const float h_rad = std::atan2(b, a);
-    float h = std::fmod(h_rad * 180.f / PI, 360.f);
-    if (h < 0.f)
+    const f3 Aab = mult_f3_f33(rgb_a, p.MATRIX_cone_response_to_Aab);
+    return Aab;
+}
+
+f3 Aab_to_JMh(const f3 &Aab, const JMhParams &p)
+{
+    if (Aab[0] <= 0.f)
     {
-        h += 360.f;
+        return {0.f, 0.f, 0.f};
     }
+    const float J = Achromatic_n_to_J(Aab[0], p.cz);
+    const float M = sqrt(Aab[1] * Aab[1] + Aab[2] * Aab[2]);
+    const float h_rad = std::atan2(Aab[2], Aab[1]);
+    float h = _from_radians(h_rad); // Call to unwrapped hue version due to atan2 limits
 
     return {J, M, h};
 }
 
-f3 JMh_to_RGB(const f3 &JMh, const JMhParams &p)
+f3 RGB_to_JMh(const f3 &RGB, const JMhParams &p)
+{
+    const f3 Aab = RGB_to_Aab(RGB, p);
+    const f3 JMh = Aab_to_JMh(Aab, p);
+    return JMh;
+}
+
+f3 JMh_to_Aab(const f3 &JMh, const float &cos_hr, const float &sin_hr, const JMhParams &p)
 {
     const float J = JMh[0];
     const float M = JMh[1];
-    const float h = JMh[2];
 
-    const float PI = 3.14159265358979f;
-    const float h_rad = h * PI / 180.f;
+    const float A = J_to_Achromatic_n(J, p.inv_cz);
+    const float a = M * cos_hr;
+    const float b = M * sin_hr;
+    return {A, a, b};
+}
 
-    const float scale = M / (43.f * surround[2]);
-    const float A = p.A_w * powf(J / 100.f, 1.f / (surround[1] * p.z));
-    const float a = scale * cos(h_rad);
-    const float b = scale * sin(h_rad);
+f3 JMh_to_Aab(const f3 &JMh, const JMhParams &p)
+{
+    const float h = JMh[2];
+    const float h_rad = to_radians(h);
+    const float cos_hr = cos(h_rad);
+    const float sin_hr = sin(h_rad);
 
-    const float red_a = (460.f * A + 451.f * a + 288.f * b) / 1403.f;
-    const float grn_a = (460.f * A - 891.f * a - 261.f * b) / 1403.f;
-    const float blu_a = (460.f * A - 220.f * a - 6300.f * b) / 1403.f;
+    return JMh_to_Aab(JMh, cos_hr, sin_hr, p);
+}
 
-    float red_m = panlrc_inverse(red_a, p.F_L) / p.D_RGB[0];
-    float grn_m = panlrc_inverse(grn_a, p.F_L) / p.D_RGB[1];
-    float blu_m = panlrc_inverse(blu_a, p.F_L) / p.D_RGB[2];
+f3 Aab_to_RGB(const f3 &Aab, const JMhParams &p) // Aab -> RGB in three steps: matrix multiply, per-channel inverse compression, matrix multiply
+{
+    const f3 rgb_a = mult_f3_f33(Aab, p.MATRIX_Aab_to_cone_response); // Aab back to compressed cone responses
 
-    const float red = red_m * p.MATRIX_CAM16_to_RGB[0] + grn_m * p.MATRIX_CAM16_to_RGB[1] + blu_m * p.MATRIX_CAM16_to_RGB[2];
-    const float grn = red_m * p.MATRIX_CAM16_to_RGB[3] + grn_m * p.MATRIX_CAM16_to_RGB[4] + blu_m * p.MATRIX_CAM16_to_RGB[5];
-    const float blu = red_m * p.MATRIX_CAM16_to_RGB[6] + grn_m * p.MATRIX_CAM16_to_RGB[7] + blu_m * p.MATRIX_CAM16_to_RGB[8];
+    const f3 rgb_m = { // Undo the non-linear compression independently on each channel
+        post_adaptation_cone_response_compression_inv(rgb_a[0]),
+        post_adaptation_cone_response_compression_inv(rgb_a[1]),
+        post_adaptation_cone_response_compression_inv(rgb_a[2])
+    };
+
+    const f3 rgb = mult_f3_f33(rgb_m, p.MATRIX_CAM16_c_to_RGB); // No separate D_RGB division is applied here, unlike the removed code
+    return rgb;
+}
 
-    return {red, grn, blu};
+f3 JMh_to_RGB(const f3 &JMh, const JMhParams &p) // Full inverse conversion, chaining JMh_to_Aab and Aab_to_RGB
+{
+    const f3 Aab = JMh_to_Aab(JMh, p); // Uses the overload that evaluates cos/sin of the hue itself
+    const f3 rgb = Aab_to_RGB(Aab, p);
+    return rgb;
 }
 
 //
 // Tonescale / Chroma compress
 //
 
-float chroma_compress_norm(float h, float chroma_compress_scale)
-{
-    const float PI = 3.14159265358979f;
-    const float h_rad = h / 180.f * PI;
-    const float a = cos(h_rad);
-    const float b = sin(h_rad);
-    const float cos_hr2 = a * a - b * b;
-    const float sin_hr2 = 2.0f * a * b;
-    const float cos_hr3 = 4.0f * a * a * a - 3.0f * a;
-    const float sin_hr3 = 3.0f * b - 4.0f * b * b * b;
+//TODO: move to header
+// https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
+// https://gcc.godbolt.org/#compilers:!((compiler:g6,options:%27-xc+-O3+-Wall+-fverbose-asm+-march%3Dhaswell+-mno-avx%27,sourcez:MQSwdgxgNgrgJgUwAQB4QFt3gC4CdwB0AFgHwBQoksiqAztnDseWQPStLZEi1I9JQQAa2QBZAIa5x2AOS0ANEgBGMbJwDuCcSLicA9kgh70ABxBRk2A0oTZsCXIb2ICbDtHFgA5khhgAZnq42H7SCFAAnkiIECCIvFa%2BtMjoegBu4ia0rLREMP4muuD0Wrp6/kipaURQWWT%2BUHrSSES0MOgA%2BlkdtMkAjAAUHR3ofQBMABxIaQCUSADeSEvLK6tr62vs00gAvEgA2kgAIkgAwkgAPkgAQkgAgkgAumQrw6OTSLn5y3sjnV/%2BBoILq0AZpRTgpAdUSiDoAZQAEgBVABiKIAMgBRAZjRQAZkUAAZFH0ZjMANxLLaHc4nK63B7PV4jcZTNroXhLX6YDriOBwEFgxQAimbDjs3h7Q5HADU51OMrpNxlDzuMtuTOWAI2XKhPKqCBqgoBwvatFFqy2B2W5yWStl8sVTypHAsai4yCMpnMDiQ4jSejifsq6WUUVwCBgtHAPgBLy1Zp1uz1nT5At6Awlwry/gp8aWEZCuDArx5EDS2F6HX8eLGmbNeYAvmR6o1mq12iCesk8UMWR9Zgt81D%2B2yc8m/iN0oa4DATILZpT1lslLgmnAIOJ6Ehwgh0AgwJWkAS%2BvokLjCcO3qzPonuan%2BQvs/k8yttet71O0oaoMac6aOVFLZuC8IgWnEKAKgAWhIAQ9HUcDIOHCUNk/NNu3rDln1zclh0LGBiw2Sdy0rWhq1rTDzVw5sWy2Wh0AgixHCMGhowALwQRQVDUKMYEYqJaEaTRHD8CxeicRAoPY5B%2BHEJATDXJQLHQVsmjUDtOm6Ig0z7d4pkHeYr1HW8OQnHltMfLIhWmV8E1Mj8Uw6CyBSsrMJVsgtbAIkt1mIisqxrOt3OoltQH8RAKmGO4ADUAA1hlU9t2TGABWAA2Lt/QAD101K0psodmT06YhN1P48o6Td6CyCqslZMEPJHYrqhAUCzPQCqECyvBxAgbB/FZJ8kFJSlgNasDWSM5qb3a3lLNBNIhIhECiAtZYtjTTgiGQUrJpWfDCI0rtegQXs0lZNakw2sAijAQQwEsbbPh7aYHGjPQwEUdRuAgMD%2BD0ExsAwCCkECRwYti4ctgGMADHULd7Fu%2BhcBgPqQA%2BhQ/Ruv0oCgJByi2vc/QjQmkAAFluLAwAwdoZjIZtgAPRh/BbNx2HZrY4D0GAlIQMguZ5iwWnZLp02SOtr0mXQ0jgOZFiTBWtjSZNDluK5GSmm8/AisqeW1hB/HABAXNBS6FY2TmPpkNR1CCXAIi%2B5APsiJJkDTOEjkUB7v0cKxUe4bxSa8SQlHELxkCUEAj3UKOwM8JAUQABT9flNY%2BAFsFMT8DSNKz9f8RRiIRwoF1li0tg9En%2BFhkNqlqOA04mXR30/KrsG6QpMxzTOTEag6fL8ysBX8NKyaGHl0NoOAwTgbCyRC2iOY5vhD1rMgcGFzsEDMWtuwQCXRxAJAsrligQHCg2RwhhKitZI/uFH3XOj8ExeqEbgOm3kBR4GLLFBPpcOoth4iggDBwnhdCwzAFBeIeBUZA2/PcOKAhbC8CjJ8f0bswz2CQDHLg3M1APWNjGYMVQKDhGSI3e%2B38yZPx6DmIEn8d51j/lCGE8JkRoixAMPoRJ8SKDGPPchN1z5UJMo/NCj4v4UQfmTf%2BjVJYTGobWOhAIgSNCYSAPoaVMKj0LuwxEqIMTYl4UgYkx4BHzxWFsOE8MTCk1KtgW2O5lIHkrGI9kKjJECmkUFdoeiWggFrH3LyhFB7RkGtGCinjBGAOsRwOEcJMRjFrg3eJ/dHJdR6n1TRMioBEjN
jYpJcidzfhLPgmAbUo5OG9GJM8VQkb2D5PjCoclBD2CkHjEwWTcC6FwLWeQWVMDyEvDRDeR1fHdh7D0ISPQGK4xYggXSd9j4zEMisLKs1nK5JYfI3CGytnoV8b/PZeFQklnCVoyYPQgksKbKzReKdMiI1BmudASAJBSFkLwCGLQ7AmAAFzsHoG/acuAGjwQIF6VgABHGACB6DozANkPotY%2BgAE4%2BhkzSqwIg8EoJWCku0KCHV0pQTxfgNiH1sD8VYKisYDKyZ9EJMAeljLmWJXUiLboDEvnLI%2BCfQqywjDIrUIokyABFPiug9hZX2cKjGYrjKNCOHxPGex2RSr5PKl0gTVUgz2AMJAUFFAmuPniJAMpj4AHZ/4pKtVlfKdMVgiu3OK7g%2Br1WORzr%2BVy7QtWz0ldKxqdF2ieuTEas1ZqsqngdRah1KVLU2v/oSJNWV7XH1oQ6p1w5XVKuKuycN3jBQqrVYoD1aqQ2ugMIa41pr63H1TQ6jNWUs3HxzS6xVTUbyNGTIWtVOqkBjQjXW0d0bY3mrTYmh11qkDOoVaK7tHxuCzTURYY0Yay0mU9USGNVaTIjqjQ2rKTbj4TvTWm%2BNmap1pvyjOuduau3ivZLNSeoJGjlpAOXDg6EsjRANlob5hgoCeC8HIT4DChYAyBlgDiuAzlFguWWfyZFAqYXuUAA%3D%3D)),filterAsm:(binary:!t,commentOnly:!t,directives:!t,intel:!t,labels:!t),version:3
+// Alternate https://github.com/vectorclass/version2/blob/master/vectorf128.h#L1048
+
+#if OCIO_USE_SSE2 || OCIO_USE_AVX // Horizontal sum of the four float lanes; hsum256_ps_avx also finishes with this helper
+inline float hsum_ps_sse1(__m128 v) {                              // v    = [   D   C |   B   A ]
+    __m128 shuf   = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // shuf = [   C   D |   A   B ]
+    __m128 sums   = _mm_add_ps(v, shuf);                           // sums = [ D+C C+D | B+A A+B ]
+    shuf          = _mm_movehl_ps(shuf, sums);                     // shuf = [   C   D | D+C C+D ]
+    sums          = _mm_add_ss(sums, shuf);                        // sums = [ D+C C+D | B+A A+B+C+D ]
+    return    _mm_cvtss_f32(sums);                                 // A+B+C+D
+}
+#endif
+#if OCIO_USE_AVX // Value test rather than #ifdef, so OCIO_USE_AVX defined as 0 disables this code like the other #if guards in this file
+inline float hsum256_ps_avx(__m256 v) {         // v     = [   H   G |   F   E |   D   C |   B   A ]
+    __m128 vlow  = _mm256_castps256_ps128(v);   // vlow  = [   D   C |   B   A ]
+    __m128 vhigh = _mm256_extractf128_ps(v, 1); // vhigh = [   H   G |   F   E ]
+    __m128 v128  = _mm_add_ps(vlow, vhigh);     // v128  = [ H+D G+C | F+B E+A ]
+    return hsum_ps_sse1(v128);                  // Reduce the remaining four lanes to a scalar
+}
+#endif
 
-    const float M = 11.34072f * a +
-              16.46899f * cos_hr2 +
-               7.88380f * cos_hr3 +
-              14.66441f * b +
-              -6.37224f * sin_hr2 +
-               9.19364f * sin_hr3 +
-              77.12896f;
 
-    return M * chroma_compress_scale;
+float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_scale) // Weighted sum of cos/sin of h, 2h, 3h plus a constant, scaled by chroma_compress_scale; takes cos(h)/sin(h) precomputed
+{
+    const float cos_hr2 = 2.0f * cos_hr1 * cos_hr1 - 1.0f; // Multiple-angle identities avoid further trig calls
+    const float sin_hr2 = 2.0f * cos_hr1 * sin_hr1;
+    const float cos_hr3 = 4.0f * cos_hr1 * cos_hr1 * cos_hr1 - 3.0f * cos_hr1;
+    const float sin_hr3 = 3.0f * sin_hr1 - 4.0f * sin_hr1 * sin_hr1 * sin_hr1;
+
+    constexpr unsigned int AVX_ALIGNMENT = 32;
+    alignas(AVX_ALIGNMENT) const float trig_angles_hr[8] = { // Lane 3 is padding (0.0); lane 7 is 1.0 so weights[7] acts as the constant term
+        cos_hr1, cos_hr2, cos_hr3, 0.0f,
+        sin_hr1, sin_hr2, sin_hr3, 1.0f
+    };
+    alignas(AVX_ALIGNMENT) static constexpr float weights[8] = { // TODO: investigate reordering of the entries so we are summing equal magnitude values first?
+        11.34072f, 16.46899f,  7.88380f,  0.0f,     // cos(h), cos(2h), cos(3h), padding - same coefficients as the scalar formula this replaces
+        14.66441f, -6.37224f,  9.19364f, 77.12896f  // sin(h), sin(2h), sin(3h), constant term
+    };
+    // TODO: benchmark this across multiple platforms to justify the multiple code paths.
+#if OCIO_USE_SSE2
+#if OCIO_USE_AVX
+    __m256 trigs        = _mm256_load_ps(trig_angles_hr);
+    __m256 trig_weights = _mm256_load_ps(weights);
+    __m256 t1           = _mm256_mul_ps(trigs, trig_weights);
+    const float M       = hsum256_ps_avx(t1);
+#else
+    __m128 cosines        = _mm_load_ps(trig_angles_hr);
+    __m128 sines          = _mm_load_ps(&trig_angles_hr[4]);
+    __m128 cosine_weights = _mm_load_ps(weights);
+    __m128 sine_weights   = _mm_load_ps(&weights[4]);
+
+    __m128 t1     = _mm_mul_ps(cosines, cosine_weights);
+    __m128 t2     = _mm_mul_ps(sines, sine_weights);
+    __m128 t3     = _mm_add_ps(t1, t2); // TODO use fmadd_ps_sse2 from Lut1DOpCPU_SSE2.cpp ?
+    const float M = hsum_ps_sse1(t3);
+#endif
+#else
+    const float M = weights[0] * trig_angles_hr[0] + // Scalar fallback: same terms, skipping the padding lane 3
+                    weights[1] * trig_angles_hr[1] +
+                    weights[2] * trig_angles_hr[2] +
+                    weights[4] * trig_angles_hr[4] +
+                    weights[5] * trig_angles_hr[5] +
+                    weights[6] * trig_angles_hr[6] +
+                    weights[7];
+#endif
+
+    return M * chroma_compress_scale; // TODO: is it worth prescaling the above weights?
 }
 
-float toe_fwd( float x, float limit, float k1_in, float k2_in)
+inline float toe_fwd( float x, float limit, float k1_in, float k2_in)
 {
     if (x > limit)
     {
@@ -230,10 +336,13 @@ float toe_fwd( float x, float limit, float k1_in, float k2_in)
     const float k2 = std::max(k2_in, 0.001f);
     const float k1 = sqrt(k1_in * k1_in + k2 * k2);
     const float k3 = (limit + k1) / (limit + k2);
-    return 0.5f * (k3 * x - k1 + sqrt((k3 * x - k1) * (k3 * x - k1) + 4.f * k2 * k3 * x));
+
+    const float minus_b = k3 * x - k1;
+    const float minus_ac = k2 * k3 * x; // a is 1.0
+    return 0.5f * (minus_b + sqrt(minus_b * minus_b + 4.f * minus_ac)); // a is 1.0, minus_b squared == b^2
 }
 
-float toe_inv( float x, float limit, float k1_in, float k2_in)
+inline float toe_inv( float x, float limit, float k1_in, float k2_in)
 {
     if (x > limit)
     {
@@ -246,33 +355,74 @@ float toe_inv( float x, float limit, float k1_in, float k2_in)
     return (x * x + k1 * x) / (k3 * (x + k2));
 }
 
-f3 tonescale_chroma_compress_fwd(const f3 &JMh, const JMhParams &p, const ToneScaleParams &pt, const ChromaCompressParams &pc)
+template <bool inverse>
+inline float aces_tonescale(const float Y_in, const ToneScaleParams &pt) // Tone curve applied to a luminance value; the template flag selects the forward (false) or inverse (true) direction
 {
-    const float J = JMh[0];
-    const float M = JMh[1];
-    const float h = JMh[2];
+    if (inverse) // Template constant, so the choice of branch is fixed for each instantiation
+    {
+        const float Y_ts_norm = Y_in / reference_luminance; // TODO
+        const float Z = std::max(0.f, std::min(pt.inverse_limit, Y_ts_norm)); // TODO: could eliminate max in the context of the full tonescale
+        const float f = (Z + sqrt(Z * (4.f * pt.t_1 + Z))) / 2.f;
+        const float Y = pt.s_2 / (powf((pt.m_2 / f), (1.f / pt.g)) - 1.f); // TODO: investigate precomputing reciprocal m_2  and a negative power? may swap a division for a multiply? powf(pt.m_2_recip * f, (-1.f / pt.g))
+        return Y;
+    }
+
+    const float f    = pt.m_2 * powf(Y_in / (Y_in + pt.s_2), pt.g);
+    const float Y_ts = std::max(0.f, f * f / (f + pt.t_1)) * pt.n_r;  // max prevents -ve values being output also handles division by zero possibility // TODO: can we eliminate the n_r scaling? should it be reference_luminance
+    return Y_ts;
+}
 
+template <bool inverse>
+inline float tonescale(const float J, const JMhParams &p, const ToneScaleParams &pt) // TODO: consider computing tonescale from and to A rather than J to avoid extra pow() calls
+{
     // Tonescale applied in Y (convert to and from J)
-    const float A = p.A_w_J * powf(std::abs(J) / 100.f, 1.f / (surround[1] * p.z));
-    const float Y = std::copysign(1.f, J) * 100.f / p.F_L * powf((27.13f * A) / (400.f - A), 1.f / 0.42f) / 100.f;
+    const float J_abs = std::abs(J); // Work on the magnitude; the sign is restored on return
+    const float Y_in  = _J_to_Y(J_abs, p);
+    const float Y_out = aces_tonescale<inverse>(Y_in, pt);
+    const float J_out = _Y_to_J(Y_out, p);
+    return std::copysign(J_out, J); // Result carries the sign of the input J
+}
 
-    const float f = pt.m_2 * powf(std::max(0.f, Y) / (Y + pt.s_2), pt.g);
-    const float Y_ts = std::max(0.f, f * f / (f + pt.t_1)) * pt.n_r;
+float tonescale_fwd(const float J, const JMhParams &p, const ToneScaleParams &pt) // Non-template entry point for the forward direction
+{
+    return tonescale<false>(J, p, pt);
+}
+
+float tonescale_inv(const float J, const JMhParams &p, const ToneScaleParams &pt) // Non-template entry point for the inverse direction
+{
+    return tonescale<true>(J, p, pt);
+}
 
-    const float F_L_Y = powf(p.F_L * std::abs(Y_ts) / 100.f, 0.42f);
-    const float J_ts = std::copysign(1.f, Y_ts) * 100.f * powf(((400.f * F_L_Y) / (27.13f + F_L_Y)) / p.A_w_J, surround[1] * p.z);
 
-    // ChromaCompress
+template <bool inverse>
+inline float tonescale_A_to_J(const float A, const JMhParams &p, const ToneScaleParams &pt) // TODO: consider computing tonescale from and to A rather than J to avoid extra pow() calls
+{
+    const float Y_in  = _A_to_Y(A, p); // Starts from the achromatic value A instead of J; A is passed as-is (no abs, unlike tonescale)
+    const float Y_out = aces_tonescale<inverse>(Y_in, pt);
+    const float J_out = _Y_to_J(Y_out, p);
+    return std::copysign(J_out, A); // Result carries the sign of the input A
+}
+
+float tonescale_A_to_J_fwd(const float A, const JMhParams &p, const ToneScaleParams &pt) // Non-template entry point: forward tonescale taking A and returning J
+{
+    return tonescale_A_to_J<false>(A, p, pt);
+}
+
+f3 chroma_compress_fwd(const f3 &JMh, const float J_ts, const float Mnorm, const ResolvedSharedCompressionParameters &pr, const ChromaCompressParams &pc)
+{
+    const float J = JMh[0];
+    const float M = JMh[1];
+    const float h = JMh[2];
+
     float M_cp = M;
 
     if (M != 0.0)
     {
-        const float nJ = J_ts / pc.limit_J_max;
+        const float nJ = J_ts / pr.limit_J_max;
         const float snJ = std::max(0.f, 1.f - nJ);
-        const float Mnorm = chroma_compress_norm(h, pc.chroma_compress_scale);
-        const float limit = powf(nJ, pc.model_gamma) * reach_m_from_table(h, pc.reach_m_table) / Mnorm;
+        const float limit = powf(nJ, pr.model_gamma_inv) * pr.reachMaxM / Mnorm;
 
-        M_cp = M * powf(J_ts / J, pc.model_gamma);
+        M_cp = M * powf(J_ts / J, pr.model_gamma_inv);
         M_cp = M_cp / Mnorm;
         M_cp = limit - toe_fwd(limit - M_cp, limit - 0.001f, snJ * pc.sat, sqrt(nJ * nJ + pc.sat_thr));
         M_cp = toe_fwd(M_cp, limit, nJ * pc.compr, snJ);
@@ -282,49 +432,45 @@ f3 tonescale_chroma_compress_fwd(const f3 &JMh, const JMhParams &p, const ToneSc
     return {J_ts, M_cp, h};
 }
 
-f3 tonescale_chroma_compress_inv(const f3 &JMh, const JMhParams &p, const ToneScaleParams &pt, const ChromaCompressParams &pc)
+f3 chroma_compress_inv(const f3 &JMh, const float J, const float Mnorm, const ResolvedSharedCompressionParameters &pr, const ChromaCompressParams &pc) // Inverse chroma compression only: J (the un-tonescaled lightness) and Mnorm are supplied by the caller; returns {J, M, h}
 {
     const float J_ts = JMh[0];
     const float M_cp = JMh[1];
     const float h    = JMh[2];
-
-    // Inverse Tonescale applied in Y (convert to and from J)
-    const float A = p.A_w_J * powf(std::abs(J_ts) / 100.f, 1.f / (surround[1] * p.z));
-    const float Y_ts = std::copysign(1.f, J_ts) * 100.f / p.F_L * powf((27.13f * A) / (400.f - A), 1.f / 0.42f) / 100.f;
-
-    const float Z = std::max(0.f, std::min(pt.n / (pt.u_2 * pt.n_r), Y_ts));
-    const float ht = (Z + sqrt(Z * (4.f * pt.t_1 + Z))) / 2.f;
-    const float Y = pt.s_2 / (powf((pt.m_2 / ht), (1.f / pt.g)) - 1.f) * pt.n_r;
-
-    const float F_L_Y = powf(p.F_L * std::abs(Y) / 100.f, 0.42f);
-    const float J = std::copysign(1.f, Y) * 100.f * powf(((400.f * F_L_Y) / (27.13f + F_L_Y)) / p.A_w_J, surround[1] * p.z);
-
-    // Inverse ChromaCompress
     float M = M_cp;
 
     if (M_cp != 0.0)
     {
-        const float nJ = J_ts / pc.limit_J_max;
+        const float nJ = J_ts / pr.limit_J_max; // Tonescaled J as a fraction of limit_J_max
         const float snJ = std::max(0.f, 1.f - nJ);
-        const float Mnorm = chroma_compress_norm(h, pc.chroma_compress_scale);
-        const float limit = powf(nJ, pc.model_gamma) * reach_m_from_table(h, pc.reach_m_table) / Mnorm;
+        const float limit = powf(nJ, pr.model_gamma_inv) * pr.reachMaxM / Mnorm; // Reach M is read from pr.reachMaxM instead of a table lookup here
 
         M = M_cp / Mnorm;
         M = toe_inv(M, limit, nJ * pc.compr, snJ);
         M = limit - toe_inv(limit - M, limit - 0.001f, snJ * pc.sat, sqrt(nJ * nJ + pc.sat_thr));
         M = M * Mnorm;
-        M = M * powf(J_ts / J, -pc.model_gamma);
+        M = M * powf(J_ts / J, -pr.model_gamma_inv); // Negated exponent relative to the forward direction
     }
 
     return {J, M, h};
 }
 
-JMhParams init_JMhParams(const Primaries &P)
+inline float model_gamma(void) // Takes no arguments; reads only surround, Y_b and reference_luminance
 {
-    JMhParams p;
+    // c * z nonlinearity (c is surround[1], z is 1.48 + sqrt(Y_b / reference_luminance))
+    return surround[1] * (1.48f + sqrt(Y_b / reference_luminance));
+}
+
+JMhParams init_JMhParams(const Primaries &prims)
+{
+    const m33f base_cone_response_to_Aab = {
+        2.0f, 1.0f, 1.0f / 20.0f,
+        1.0f, -12.0f / 11.0f, 1.0f / 11.0f,
+        1.0f / 9.0f, 1.0f / 9.0f, -2.0f / 9.0f
+    };
 
     const m33f MATRIX_16 = XYZtoRGB_f33(CAM16::primaries);
-    const m33f RGB_to_XYZ = RGBtoXYZ_f33(P);
+    const m33f RGB_to_XYZ = RGBtoXYZ_f33(prims);
     const f3 XYZ_w = mult_f3_f33(f3_from_f(reference_luminance), RGB_to_XYZ);
 
     const float Y_W = XYZ_w[1];
@@ -332,16 +478,18 @@ JMhParams init_JMhParams(const Primaries &P)
     const f3 RGB_w = mult_f3_f33(XYZ_w, MATRIX_16);
 
     // Viewing condition dependent parameters
-    const float K = 1.f / (5.f * L_A + 1.f);
-    const float K4 = powf(K, 4.f);
-    const float N = Y_b / Y_W;
+    constexpr float K = 1.f / (5.f * L_A + 1.f);
+    constexpr float K4 = K * K * K * K;
     const float F_L = 0.2f * K4 * (5.f * L_A) + 0.1f * powf((1.f - K4), 2.f) * powf(5.f * L_A, 1.f/3.f);
-    const float z = 1.48f + sqrt(N);
+
+    const float F_L_n = F_L / reference_luminance;
+    const float cz    = model_gamma();
+    const float inv_cz = 1.0f / cz;
 
     const f3 D_RGB = {
-        Y_W / RGB_w[0],
-        Y_W / RGB_w[1],
-        Y_W / RGB_w[2]
+        F_L_n * Y_W / RGB_w[0],
+        F_L_n * Y_W / RGB_w[1],
+        F_L_n * Y_W / RGB_w[2]
     };
 
     const f3 RGB_WC {
@@ -351,76 +499,397 @@ JMhParams init_JMhParams(const Primaries &P)
     };
 
     const f3 RGB_AW = {
-        panlrc_forward(RGB_WC[0], F_L),
-        panlrc_forward(RGB_WC[1], F_L),
-        panlrc_forward(RGB_WC[2], F_L)
+        post_adaptation_cone_response_compression_fwd(RGB_WC[0]),
+        post_adaptation_cone_response_compression_fwd(RGB_WC[1]),
+        post_adaptation_cone_response_compression_fwd(RGB_WC[2])
+    };
+
+    const m33f cone_response_to_Aab = mult_f33_f33(scale_f33(Identity_M33, f3_from_f(cam_nl_scale)), base_cone_response_to_Aab); // TODO: this scale maybe ill conditioning the matrix
+    const float A_w   = cone_response_to_Aab[0] * RGB_AW[0] + cone_response_to_Aab[1] * RGB_AW[1] + cone_response_to_Aab[2] * RGB_AW[2];
+    const float A_w_J = _post_adaptation_cone_response_compression_fwd(F_L);
+    const float inv_A_w_J = 1.0f / A_w_J;
+
+    // TODO: evaluate the condition number of the below matrices and consider extracting a power of 2 scale out of them may improve noise behaviour
+    //       power of 2 to make cost of extra scaling limited vs generalised multiply/divide ?
+
+    // Note we are prescaling the CAM16 LMS responses to directly provide for chromatic adaptation.
+    const m33f MATRIX_RGB_to_CAM16 = mult_f33_f33(RGBtoRGB_f33(prims, CAM16::primaries), scale_f33(Identity_M33, f3_from_f(reference_luminance)));
+    const m33f MATRIX_RGB_to_CAM16_c = mult_f33_f33(scale_f33(Identity_M33, D_RGB), MATRIX_RGB_to_CAM16);
+    const m33f MATRIX_CAM16_c_to_RGB = invert_f33(MATRIX_RGB_to_CAM16_c);
+
+    const m33f MATRIX_cone_response_to_Aab = {
+        cone_response_to_Aab[0] / A_w,                cone_response_to_Aab[1] / A_w,                cone_response_to_Aab[2] / A_w,
+        cone_response_to_Aab[3] * 43.f * surround[2], cone_response_to_Aab[4] * 43.f * surround[2], cone_response_to_Aab[5] * 43.f * surround[2],
+        cone_response_to_Aab[6] * 43.f * surround[2], cone_response_to_Aab[7] * 43.f * surround[2], cone_response_to_Aab[8] * 43.f * surround[2],
+    };
+    const m33f MATRIX_Aab_to_cone_response = invert_f33(MATRIX_cone_response_to_Aab);
+
+    JMhParams p = {
+        MATRIX_RGB_to_CAM16_c,
+        MATRIX_CAM16_c_to_RGB,
+        MATRIX_cone_response_to_Aab,
+        MATRIX_Aab_to_cone_response,
+        F_L_n,
+        cz,
+        inv_cz,
+        A_w_J,
+        inv_A_w_J
     };
+   return p;
+}
 
-    const float A_w = ra * RGB_AW[0] + RGB_AW[1] + ba * RGB_AW[2];
+inline f3 generate_unit_cube_cusp_corners(const unsigned int corner) // Returns the corner-th RGB corner; every component is 0.0 or 1.0
+{
+    // Generation order R, Y, G, C, B, M to ensure hues rotate in correct order
+    return {float(static_cast<unsigned int>(((corner + 1) % cuspCornerCount) < 3)), // Each channel is 1 for three consecutive corners of the cycle; the channels differ only in phase
+            float(static_cast<unsigned int>(((corner + 5) % cuspCornerCount) < 3)),
+            float(static_cast<unsigned int>(((corner + 3) % cuspCornerCount) < 3))
+            };
+}
 
-    const float F_L_W = powf(F_L, 0.42f);
-    const float A_w_J   = (400.f * F_L_W) / (27.13f + F_L_W);
+void build_limiting_cusp_corners_tables(std::array<f3, totalCornerCount>& RGB_corners, std::array<f3, totalCornerCount>& JMh_corners,
+                               const JMhParams &params, const float peakLuminance)
+{
+    // We calculate the RGB and JMh values for the limiting gamut cusp corners
+    // They are then arranged into a cycle with the lowest hue (JMh[2]) at [1] to allow for hue wrapping
+    std::array<f3, cuspCornerCount> temp_RGB_corners;
+    std::array<f3, cuspCornerCount> temp_JMh_corners;
+    unsigned int min_index = 0;
+    for (unsigned int i = 0; i != cuspCornerCount; ++i)
+    {
+      temp_RGB_corners[i] = mult_f_f3(peakLuminance / reference_luminance, generate_unit_cube_cusp_corners(i));
+      temp_JMh_corners[i] = RGB_to_JMh(temp_RGB_corners[i], params);
+      if (temp_JMh_corners[i][2] < temp_JMh_corners[min_index][2]) // Track the corner with the smallest hue
+        min_index = i;
+    }
 
-    p.XYZ_w = XYZ_w;
-    p.F_L = F_L;
-    p.z = z;
-    p.D_RGB = D_RGB;
-    p.A_w = A_w;
-    p.A_w_J = A_w_J;
+    // Rotate entries placing lowest at [1] (not [0])
+    for (unsigned int i = 0; i != cuspCornerCount; ++i)
+    {
+      RGB_corners[i + 1] = temp_RGB_corners[(i + min_index) % cuspCornerCount];
+      JMh_corners[i + 1] = temp_JMh_corners[(i + min_index) % cuspCornerCount];
+    }
 
-    p.MATRIX_RGB_to_CAM16 = mult_f33_f33(RGBtoRGB_f33(P, CAM16::primaries), scale_f33(Identity_M33, f3_from_f(100.f)));
-    p.MATRIX_CAM16_to_RGB = invert_f33(p.MATRIX_RGB_to_CAM16);
+    // Copy end elements to create a cycle
+    RGB_corners[0]                   = RGB_corners[cuspCornerCount];
+    RGB_corners[cuspCornerCount + 1] = RGB_corners[1];
+    JMh_corners[0]                   = JMh_corners[cuspCornerCount];
+    JMh_corners[cuspCornerCount + 1] = JMh_corners[1];
 
-    return p;
+    // Wrap the hues, to maintain monotonicity these entries will fall outside [0.0, hue_limit)
+    JMh_corners[0][2]                   = JMh_corners[0][2] - hue_limit;
+    JMh_corners[cuspCornerCount + 1][2] = JMh_corners[cuspCornerCount + 1][2] + hue_limit;
 }
 
-Table3D make_gamut_table(const Primaries &P, float peakLuminance)
+void find_reach_corners_table(std::array<f3, totalCornerCount>& JMh_corners, const JMhParams &params, const float limitJ, const float maximum_source)
 {
-    const JMhParams params = init_JMhParams(P);
+    // We need to find the value of JMh that corresponds to limitJ for each corner
+    // This is done by scaling the unit corners converting to JMh until the J value is near the limitJ
+    // As an optimisation we use the equivalent Achromatic value to search for the J value and avoid the
+    // non-linear transform during the search
+    // Strictly speaking we should only need to find the R, G and  B "corners" as the reach is unbounded and
+    // as such does not form a cube, but is formed by the transformed 3 lower planes of the cube and
+    // the plane at J = limitJ
+    std::array<f3, cuspCornerCount> temp_JMh_corners;
+    const float limitA = J_to_Achromatic_n(limitJ, params.inv_cz);
+
+    unsigned int min_index = 0;
+    for (unsigned int i = 0; i != cuspCornerCount; ++i)
+    {
+        const f3 rgb_vector = generate_unit_cube_cusp_corners(i);
 
+        float lower = 0.0f; // Bisection bracket on the scale factor applied to rgb_vector
+        float upper = maximum_source;
+        while ((upper - lower) > reach_cusp_tolerance)
+        {
+            float       test        = midpoint(lower, upper);
+            const f3    test_corner = mult_f_f3(test, rgb_vector);
+            const float A           = RGB_to_Aab(test_corner, params)[0];
+            if (A < limitA)
+            {
+                lower = test;
+            }
+            else
+            {
+                upper = test;
+            }
+            if (A == limitA) // Exact hit: upper was just set to test, so stop early
+                break;
+        }
+        temp_JMh_corners[i] = RGB_to_JMh(mult_f_f3(upper, rgb_vector), params); // Uses the upper end of the final bracket
+
+        if (temp_JMh_corners[i][2] < temp_JMh_corners[min_index][2]) // Track the corner with the smallest hue
+            min_index = i;
+    }
+
+    // Rotate entries placing lowest at [1] (not [0]) // TODO: could use std::rotate_copy or even the ranges vs in C++20
+    for (unsigned int i = 0; i != cuspCornerCount; ++i)
+    {
+      JMh_corners[i + 1] = temp_JMh_corners[(i + min_index) % cuspCornerCount];
+    }
+
+    // Copy end elements to create a cycle
+    JMh_corners[0]                   = JMh_corners[cuspCornerCount];
+    JMh_corners[cuspCornerCount + 1] = JMh_corners[1];
+
+    // Wrap the hues, to maintain monotonicity these entries will fall outside [0.0, hue_limit)
+    JMh_corners[0][2]                   = JMh_corners[0][2] - hue_limit;
+    JMh_corners[cuspCornerCount + 1][2] = JMh_corners[cuspCornerCount + 1][2] + hue_limit;
+}
+
+unsigned int extract_sorted_cube_hues(std::array<float, max_sorted_corners>& sorted_hues,
+                             const std::array<f3, totalCornerCount>& reach_JMh, const std::array<f3, totalCornerCount>& display_JMh)
+{
+    // Basic merge of 2 sorted arrays, extracting the unique hues.
+    // Return the count of the unique hues
+    //TODO: use STL for this and similar functions
+    unsigned int idx         = 0;
+    unsigned int reach_idx   = 1; // Merging starts at [1] and covers indices 1..cuspCornerCount of each table
+    unsigned int display_idx = 1;
+    while ((reach_idx < (cuspCornerCount + 1)) || (display_idx < (cuspCornerCount + 1)))
+    {
+        const float reach_hue   = reach_JMh[reach_idx][2]; // NOTE(review): an exhausted list is read at [cuspCornerCount + 1]; this relies on that entry holding a hue above every nominal hue - confirm
+        const float display_hue = display_JMh[display_idx][2];
+        if (reach_hue == display_hue)
+        {
+            sorted_hues[idx] = reach_hue;
+            ++reach_idx;
+            ++display_idx; // When equal consume both
+        }
+        else
+        {
+            if (reach_hue < display_hue)
+            {
+                sorted_hues[idx] = reach_hue;
+                ++reach_idx;
+            }
+            else
+            {
+                sorted_hues[idx] = display_hue;
+                ++display_idx;
+            }
+      }
+      ++idx;
+    }
+    return idx;
+}
+
+void build_hue_sample_interval(const unsigned int samples, const float lower, const float upper, Table1D &hue_table, const unsigned int base) // Writes `samples` evenly spaced hues into hue_table[base .. base + samples - 1]
+{
+    const float delta = (upper - lower) / float(samples);
+    for (unsigned int i = 0; i != samples; ++i)
+    {
+        hue_table[base + i] = lower + float(i) * delta; // First sample is exactly `lower`; `upper` itself is never written
+    }
+}
+
+void build_hue_table(Table1D &hue_table, const std::array<float, max_sorted_corners>& sorted_hues, const unsigned int unique_hues) // Fills hue_table so that each of the first unique_hues entries of sorted_hues starts a sampled interval, then sets the two wrap entries
+{
+    const float ideal_spacing = hue_table.nominal_size / hue_limit;
+    std::array<unsigned int, 2 * cuspCornerCount + 2> samples_count = {}; // Holds the clamped nominal index assigned to each hue, not a per-interval count
+    unsigned int last_idx  = std::numeric_limits<unsigned int>::max();
+    unsigned int min_index = sorted_hues[0] == 0.0f ? 0 : 1; // Ensure we can always sample at 0.0 hue
+    for (unsigned int hue_idx = 0; hue_idx != unique_hues; ++hue_idx)
+    {
+        // BUG: "hue_table.nominal_size - 1" will fail if we have multiple hues mapping near the top of the table
+        unsigned int nominal_idx = std::min(std::max(static_cast<unsigned int>(std::round(sorted_hues[hue_idx] * ideal_spacing)), min_index), hue_table.nominal_size - 1);
+        if (last_idx == nominal_idx)
+        {
+            // Last two hues should sample at same index, need to adjust them
+            // Adjust previous sample down if we can
+            if (hue_idx > 1 && samples_count[hue_idx - 2] != (samples_count[hue_idx - 1] - 1))
+            {
+                samples_count[hue_idx - 1] = samples_count[hue_idx - 1] - 1;
+            }
+            else
+            {
+                nominal_idx = nominal_idx + 1;
+            }
+        }
+        samples_count[hue_idx] = std::min(nominal_idx, hue_table.nominal_size - 1U);
+        last_idx = min_index = nominal_idx;
+    }
+
+    unsigned int total_samples = 0;
+    // Special cases for ends
+    unsigned int i = 0;
+    build_hue_sample_interval(samples_count[i], 0.0f, sorted_hues[i], hue_table, total_samples + 1); // Writes start at index total_samples + 1, i.e. index 1 here
+    total_samples += samples_count[i];
+    for (++i; i != unique_hues; ++i)
+    {
+        const unsigned int samples = samples_count[i] - samples_count[i - 1]; // Difference of consecutive indices gives the samples for this interval
+        build_hue_sample_interval(samples, sorted_hues[i - 1], sorted_hues[i], hue_table, total_samples + 1);
+        total_samples += samples;
+    }
+    // BUG: could break if we are unlucky with samples all being used up by this point
+    build_hue_sample_interval(hue_table.nominal_size - total_samples, sorted_hues[i - 1], hue_limit, hue_table, total_samples + 1);
+
+    hue_table[hue_table.lower_wrap_index] = hue_table[hue_table.last_nominal_index] - hue_limit;
+    hue_table[hue_table.upper_wrap_index] = hue_table[hue_table.first_nominal_index] + hue_limit;
+}
+
+std::array<float, 2> find_display_cusp_for_hue(float hue, const std::array<f3, totalCornerCount>& RGB_corners, const std::array<f3, totalCornerCount>& JMh_corners,
+                                               const JMhParams &params, std::array<float, 2> & previous)
+{
+    // This works by finding the required line segment between two of the RGB cusp corners, then binary searching
+    // along the line calculating the JMh of points along the line till we find the required value.
+    // All values on the line segments are valid cusp locations.
+
+    unsigned int upper_corner = 1;
+    for (unsigned int i = upper_corner; i != totalCornerCount; ++i) // TODO: binary search?
+    {
+        if (JMh_corners[i][2] > hue) // First corner whose hue exceeds the requested hue
+        {
+            upper_corner = i;
+            break;
+        }
+    }
+    const unsigned int lower_corner = upper_corner - 1;
+
+    // hue should now be within [lower_corner, upper_corner), handle exact match
+    if (JMh_corners[lower_corner][2] == hue)
+    {
+        return {JMh_corners[lower_corner][0], JMh_corners[lower_corner][1]}; // Early return leaves `previous` untouched
+    }
+
+    // search by lerping between RGB corners for the hue
+    const f3 cusp_lower = RGB_corners[lower_corner];
+    const f3 cusp_upper = RGB_corners[upper_corner];
+    f3       sample;
+
+    float sample_t;
+    float lower_t = (upper_corner == previous[0]) ? previous[1] : 0.0f; // If we are still on the same segment start from where we left off
+    float upper_t = 1.0f;
+
+    f3 JMh;
+
+    // There is an edge case where we need to search towards the range when across the [0.0f, hue_limit) boundary
+    // each edge needs the directions swapped. This is handled by comparing against the appropriate corner to make
+    // sure we are still in the expected range between the lower and upper corner hue limits
+    while ((upper_t - lower_t) > display_cusp_tolerance)
+    {
+        sample_t = midpoint(lower_t, upper_t);
+        sample   = lerp(cusp_lower, cusp_upper, sample_t);
+        JMh      = RGB_to_JMh(sample, params);
+        if (JMh[2] < JMh_corners[lower_corner][2])
+        {
+            upper_t = sample_t;
+        }
+        else if (JMh[2] >= JMh_corners[upper_corner][2])
+        {
+            lower_t = sample_t;
+        }
+        else if (JMh[2] > hue)
+        {
+            upper_t = sample_t;
+        }
+        else
+        {
+            lower_t = sample_t;
+        }
+    }
+
+    // Use the midpoint of the final interval for the actual samples
+    sample_t = midpoint(lower_t, upper_t);
+    sample   = lerp(cusp_lower, cusp_upper, sample_t);
+    JMh      = RGB_to_JMh(sample, params);
+
+    previous[0] = float(upper_corner); // Remember segment and position so the next call can resume from here
+    previous[1] = sample_t;
+
+    return {JMh[0], JMh[1]};
+}
+
+Table3D build_cusp_table(const Table1D& hue_table, const std::array<f3, totalCornerCount>& RGB_corners, const std::array<f3, totalCornerCount>& JMh_corners, const JMhParams &params) // Builds {J, M, hue} cusp entries for every nominal hue in hue_table, plus the two wrap entries
+{
+    std::array<float, 2> previous = {0.0f, 0.0f}; // Search state carried between calls to find_display_cusp_for_hue
+    Table3D output_table;
+    for (unsigned int i = output_table.first_nominal_index; i != output_table.upper_wrap_index; ++i)
+    {
+      const float hue = hue_table[i];
+      const std::array<float, 2> JM = find_display_cusp_for_hue(hue, RGB_corners, JMh_corners, params, previous);
+      output_table[i][0] = JM[0];
+      output_table[i][1] = JM[1] * (1.f + smooth_m * smooth_cusps); // Cusp M is scaled by (1 + smooth_m * smooth_cusps)
+      output_table[i][2] = hue;
+    }
+
+    // Copy extra entries to ease the code to handle hues wrapping around
+    output_table[output_table.lower_wrap_index][0] = output_table[output_table.last_nominal_index][0];
+    output_table[output_table.lower_wrap_index][1] = output_table[output_table.last_nominal_index][1];
+    output_table[output_table.lower_wrap_index][2] = hue_table[hue_table.lower_wrap_index];
+    output_table[output_table.upper_wrap_index][0] = output_table[output_table.first_nominal_index][0];
+    output_table[output_table.upper_wrap_index][1] = output_table[output_table.first_nominal_index][1];
+    output_table[output_table.upper_wrap_index][2] = hue_table[hue_table.upper_wrap_index];
+    return output_table;
+}
+
+Table3D make_uniform_hue_gamut_table(const JMhParams &reach_params, const JMhParams &params, float peakLuminance, float forward_limit, const SharedCompressionParameters& sp, Table1D& hue_table)
+{
+    // The principle here is to sample the hues as uniformly as possible, whilst ensuring we sample the corners
+    // of the limiting gamut and the reach primaries at limit J Max
+    //
+    // The corners are calculated, then their hues are extracted and merged to form a unique sorted hue list.
+    // hue_table (passed in by non-const reference) is built from that list; those hues are then used to
+    // compute the JMh of the limiting gamut cusp, which is returned as the cusp table.
+
+    std::array<f3, totalCornerCount> reach_JMh_corners;
+    std::array<f3, totalCornerCount> limiting_RGB_corners;
+    std::array<f3, totalCornerCount> limiting_JMh_corners;
+    std::array<float, max_sorted_corners> sorted_hues;
+
+    find_reach_corners_table(reach_JMh_corners, reach_params, sp.limit_J_max, forward_limit);
+    build_limiting_cusp_corners_tables(limiting_RGB_corners, limiting_JMh_corners, params, peakLuminance);
+    const unsigned int unique_hues = extract_sorted_cube_hues(sorted_hues, reach_JMh_corners, limiting_JMh_corners);
+    build_hue_table(hue_table, sorted_hues, unique_hues);
+    return build_cusp_table(hue_table, limiting_RGB_corners, limiting_JMh_corners, params);
+}
+
+Table3D make_gamut_table(const JMhParams &params, float peakLuminance, Table1D& hue_table)
+{
     Table3D gamutCuspTableUnsorted{};
-    for (int i = 0; i < gamutCuspTableUnsorted.size; i++)
+    unsigned int minhIndex = 0;
+    for (unsigned int i = 0; i < gamutCuspTableUnsorted.nominal_size; i++)
     {
-        const float hNorm = (float) i / gamutCuspTableUnsorted.size;
+        const float hNorm = float(i) / float(gamutCuspTableUnsorted.nominal_size);
         const f3 HSV = {hNorm, 1., 1.};
         const f3 RGB = HSV_to_RGB(HSV);
         const f3 scaledRGB = mult_f_f3(peakLuminance / reference_luminance, RGB);
         const f3 JMh = RGB_to_JMh(scaledRGB, params);
 
-        gamutCuspTableUnsorted.table[i][0] = JMh[0];
-        gamutCuspTableUnsorted.table[i][1] = JMh[1];
-        gamutCuspTableUnsorted.table[i][2] = JMh[2];
-    }
-
-    int minhIndex = 0;
-    for (int i = 0; i < gamutCuspTableUnsorted.size; i++)
-    {
-        if ( gamutCuspTableUnsorted.table[i][2] < gamutCuspTableUnsorted.table[minhIndex][2])
+        gamutCuspTableUnsorted[i][0] = JMh[0];
+        gamutCuspTableUnsorted[i][1] = JMh[1]  * (1.f + smooth_m * smooth_cusps);
+        gamutCuspTableUnsorted[i][2] = JMh[2];
+        if ( gamutCuspTableUnsorted[i][2] < gamutCuspTableUnsorted[minhIndex][2])
             minhIndex = i;
     }
 
     Table3D gamutCuspTable{};
-    for (int i = 0; i < gamutCuspTableUnsorted.size; i++)
+    for (unsigned int i = 0; i < gamutCuspTableUnsorted.nominal_size; i++)
     {
-        gamutCuspTable.table[i + gamutCuspTable.base_index][0] = gamutCuspTableUnsorted.table[(minhIndex+i) % gamutCuspTableUnsorted.size][0];
-        gamutCuspTable.table[i + gamutCuspTable.base_index][1] = gamutCuspTableUnsorted.table[(minhIndex+i) % gamutCuspTableUnsorted.size][1];
-        gamutCuspTable.table[i + gamutCuspTable.base_index][2] = gamutCuspTableUnsorted.table[(minhIndex+i) % gamutCuspTableUnsorted.size][2];
+        gamutCuspTable[i + gamutCuspTable.base_index][0] = gamutCuspTableUnsorted[(minhIndex+i) % gamutCuspTableUnsorted.nominal_size][0];
+        gamutCuspTable[i + gamutCuspTable.base_index][1] = gamutCuspTableUnsorted[(minhIndex+i) % gamutCuspTableUnsorted.nominal_size][1];
+        gamutCuspTable[i + gamutCuspTable.base_index][2] = gamutCuspTableUnsorted[(minhIndex+i) % gamutCuspTableUnsorted.nominal_size][2];
     }
 
     // Copy last populated entry to first empty spot
-    gamutCuspTable.table[0][0] = gamutCuspTable.table[gamutCuspTable.base_index + gamutCuspTable.size-1][0];
-    gamutCuspTable.table[0][1] = gamutCuspTable.table[gamutCuspTable.base_index + gamutCuspTable.size-1][1];
-    gamutCuspTable.table[0][2] = gamutCuspTable.table[gamutCuspTable.base_index + gamutCuspTable.size-1][2];
+    gamutCuspTable[gamutCuspTable.lower_wrap_index][0] = gamutCuspTable[gamutCuspTable.last_nominal_index][0];
+    gamutCuspTable[gamutCuspTable.lower_wrap_index][1] = gamutCuspTable[gamutCuspTable.last_nominal_index][1];
+    gamutCuspTable[gamutCuspTable.lower_wrap_index][2] = gamutCuspTable[gamutCuspTable.last_nominal_index][2];
 
     // Copy first populated entry to last empty spot
-    gamutCuspTable.table[gamutCuspTable.base_index + gamutCuspTable.size][0] = gamutCuspTable.table[gamutCuspTable.base_index][0];
-    gamutCuspTable.table[gamutCuspTable.base_index + gamutCuspTable.size][1] = gamutCuspTable.table[gamutCuspTable.base_index][1];
-    gamutCuspTable.table[gamutCuspTable.base_index + gamutCuspTable.size][2] = gamutCuspTable.table[gamutCuspTable.base_index][2];
+    gamutCuspTable[gamutCuspTable.upper_wrap_index][0] = gamutCuspTable[gamutCuspTable.first_nominal_index][0];
+    gamutCuspTable[gamutCuspTable.upper_wrap_index][1] = gamutCuspTable[gamutCuspTable.first_nominal_index][1];
+    gamutCuspTable[gamutCuspTable.upper_wrap_index][2] = gamutCuspTable[gamutCuspTable.first_nominal_index][2];
 
-    // Wrap the hues, to maintain monotonicity. These entries will fall outside [0.0, 360.0]
-    gamutCuspTable.table[0][2] = gamutCuspTable.table[0][2] - 360.f;
-    gamutCuspTable.table[gamutCuspTable.size+1][2] = gamutCuspTable.table[gamutCuspTable.size+1][2] + 360.f;
+    // Wrap the hues, to maintain monotonicity. These entries will fall outside [0.0, hue_limit)
+    gamutCuspTable[gamutCuspTable.lower_wrap_index][2] -= hue_limit;
+    gamutCuspTable[gamutCuspTable.upper_wrap_index][2] += hue_limit;
 
+    // Extract hue table
+    for (unsigned int i = 0; i < gamutCuspTable.total_size; i++)
+    {
+        hue_table[i] = gamutCuspTable[i][2];
+    }
     return gamutCuspTable;
 }
 
@@ -429,22 +898,20 @@ bool any_below_zero(const f3 &rgb)
     return (rgb[0] < 0. || rgb[1] < 0. || rgb[2] < 0.);
 }
 
-Table1D make_reach_m_table(const Primaries &P, float peakLuminance)
+Table1D make_reach_m_table(const JMhParams &params, const float limit_J_max)
 {
-    const JMhParams params = init_JMhParams(P);
-    const float limit_J_max = Y_to_J(peakLuminance, params);
-
     Table1D gamutReachTable{};
 
-    for (int i = 0; i < gamutReachTable.size; i++) {
-        const float hue = (float) i;
+    for (unsigned int i = 0; i < gamutReachTable.nominal_size; i++) {
+        const float hue = gamutReachTable.base_hue_for_position(i);
 
-        const float search_range = 50.f;
-        float low = 0.;
+        constexpr float search_range = 50.f;
+        constexpr float search_maximum = 1300.f; // TODO: magic limit
+        float low = 0.f;
         float high = low + search_range;
         bool outside = false;
 
-        while ((outside != true) & (high < 1300.f))
+        while ((outside != true) & (high < search_maximum))
         {
             const f3 searchJMh = {limit_J_max, high, hue};
             const f3 newLimitRGB = JMh_to_RGB(searchJMh, params);
@@ -456,7 +923,7 @@ Table1D make_reach_m_table(const Primaries &P, float peakLuminance)
             }
         }
 
-        while (high-low > 1e-2)
+        while (high-low > 1e-2) // TODO:tolerance as constexpr
         {
             const float sampleM = (high + low) / 2.f;
             const f3 searchJMh = {limit_J_max, sampleM, hue};
@@ -472,265 +939,293 @@ Table1D make_reach_m_table(const Primaries &P, float peakLuminance)
             }
         }
 
-        gamutReachTable.table[i] = high;
+        gamutReachTable[i + gamutReachTable.base_index] = high;
     }
+    gamutReachTable[gamutReachTable.lower_wrap_index] = gamutReachTable[gamutReachTable.last_nominal_index];
+    gamutReachTable[gamutReachTable.upper_wrap_index] = gamutReachTable[gamutReachTable.first_nominal_index];
 
     return gamutReachTable;
 }
 
-bool outside_hull(const f3 &rgb)
+inline bool outside_hull(const f3 &rgb, float maxRGBtestVal)
 {
     // limit value, once we cross this value, we are outside of the top gamut shell
-    const float maxRGBtestVal = 1.0;
     return rgb[0] > maxRGBtestVal || rgb[1] > maxRGBtestVal || rgb[2] > maxRGBtestVal;
 }
 
-float get_focus_gain(float J, float cuspJ, float limit_J_max)
+inline float get_focus_gain(float J, float analytical_threshold, float limit_J_max, float focus_dist)
 {
-    const float thr = lerpf(cuspJ, limit_J_max, focus_gain_blend);
-
-    if (J > thr)
-    {
-        // Approximate inverse required above threshold
-        float gain = (limit_J_max - thr) / std::max(0.0001f, (limit_J_max - std::min(limit_J_max, J)));
-        return powf(log10(gain), 1.f / focus_adjust_gain) + 1.f;
-    }
-    else
+    float gain = limit_J_max * focus_dist;
+    if (J > analytical_threshold)
     {
-        // Analytic inverse possible below cusp
-        return 1.f;
+        // Approximate inverse required above threshold due to the introduction of J in the calculation
+        float gain_adjustment = log10((limit_J_max - analytical_threshold) / std::max(0.0001f, limit_J_max - J));
+        //gain = powf(gain, focus_adjust_gain_inv) + 1.f;
+        gain_adjustment = gain_adjustment * gain_adjustment + 1.f;
+        gain = gain * gain_adjustment;
     }
+    return gain;
 }
 
-float solve_J_intersect(float J, float M, float focusJ, float maxJ, float slope_gain)
+float solve_J_intersect(float J, float M, float focusJ, float maxJ, float slope_gain) // TODO: eval placing maxJ as last parameter?
 {
-    const float a = M / (focusJ * slope_gain);
-    float b = 0.f;
-    float c = 0.f;
-    float intersectJ = 0.f;
-
-    if (J < focusJ)
-    {
-        b = 1.f - M / slope_gain;
-    }
-    else
-    {
-        b = - (1.f + M / slope_gain + maxJ * M / (focusJ * slope_gain));
-    }
-
-    if (J < focusJ)
-    {
-        c = -J;
-    }
-    else
-    {
-        c = maxJ * M / slope_gain + J;
-    }
-
-    const float root = sqrt(b * b - 4.f * a * c);
+    const float M_scaled = M / slope_gain;
+    const float a = M_scaled / focusJ;
 
     if (J < focusJ)
     {
-        intersectJ = 2.f * c / (-b - root);
+        const float b = 1.f - M_scaled;
+        const float c = -J;
+        const float det = b * b - 4.f * a * c;
+        const float root = sqrt(det);
+        return -2.f * c / (b + root);
     }
     else
     {
-        intersectJ = 2.f * c / (-b + root);
+        const float b = -(1.f + M_scaled + maxJ * a);
+        const float c = maxJ * M_scaled + J;
+        const float det = b * b - 4.f * a * c;
+        const float root = sqrt(det);
+        return -2.f * c / (b - root);
     }
-
-    return intersectJ;
 }
 
-float smin(float a, float b, float s)
+// Smooth minimum about the scaled reference, based upon a cubic polynomial
+inline float smin_scaled(float a, float b, float scale_reference)
 {
-    const float h = std::max(s - std::abs(a - b), 0.f) / s;
-    return std::min(a, b) - h * h * h * s * (1.f / 6.f);
+    const float s_scaled = smooth_cusps * scale_reference;
+    const float h = std::max(s_scaled - std::abs(a - b), 0.0f) / s_scaled;
+    return std::min(a, b) - h * h * h * s_scaled * (1.f / 6.f);
 }
 
-f3 find_gamut_boundary_intersection(const f3 &JMh_s, const f2 &JM_cusp_in, float J_focus, float J_max, float slope_gain, float gamma_top, float gamma_bottom)
+inline float compute_compression_vector_slope(const float intersectJ, const float focusJ, const float limitJmax, const float slope_gain)
 {
-    const float s = std::max(0.000001f, smooth_cusps);
-    const f2 JM_cusp = {
-        JM_cusp_in[0],
-        JM_cusp_in[1] * (1.f + smooth_m * s)
-    };
+    const float direction_scaler = (intersectJ < focusJ) ? intersectJ : (limitJmax - intersectJ); // TODO < vs <=
+    return direction_scaler * (intersectJ - focusJ) / (focusJ * slope_gain);
+}
 
-    const float J_intersect_source = solve_J_intersect(JMh_s[0], JMh_s[1], J_focus, J_max, slope_gain);
-    const float J_intersect_cusp = solve_J_intersect(JM_cusp[0], JM_cusp[1], J_focus, J_max, slope_gain);
+inline float estimate_line_and_boundary_intersection_M(const float J_axis_intersect, const float slope, const float inv_gamma,
+                                                       const float J_max, const float M_max, const float J_intersection_reference)
+{
+    // Line defined by     J = slope * x + J_axis_intersect
+    // Boundary defined by J = J_max * (x / M_max) ^ (1/inv_gamma)
+    // Approximate as we do not want to iteratively solve intersection of a straight line and an exponential
 
-    float slope = 0.f;
-    if (J_intersect_source < J_focus)
-    {
-        slope = J_intersect_source * (J_intersect_source - J_focus) / (J_focus * slope_gain);
-    }
-    else
-    {
-        slope = (J_max - J_intersect_source) * (J_intersect_source - J_focus) / (J_focus * slope_gain);
-    }
+    // We calculate a shifted intersection from the original intersection using the inverse of the exponential
+    // and the provided reference
+    const float normalised_J         = J_axis_intersect / J_intersection_reference;
+    const float shifted_intersection = J_intersection_reference * powf(normalised_J, inv_gamma);
 
-    const float M_boundary_lower = J_intersect_cusp * powf(J_intersect_source / J_intersect_cusp, 1.f / gamma_bottom) / (JM_cusp[0] / JM_cusp[1] - slope);
-    const float M_boundary_upper = JM_cusp[1] * (J_max - J_intersect_cusp) * powf((J_max - J_intersect_source) / (J_max - J_intersect_cusp), 1.f / gamma_top) / (slope * JM_cusp[1] + J_max - JM_cusp[0]);
-    const float M_boundary = JM_cusp[1] * smin(M_boundary_lower / JM_cusp[1], M_boundary_upper / JM_cusp[1], s);
-    const float J_boundary = J_intersect_source + slope * M_boundary;
+    // Now we find the M intersection of two lines
+    // line from origin to J,M Max       l1(x) = J/M * x
+    // line from J Intersect' with slope l2(x) = slope * x + Intersect'
 
-    return {J_boundary, M_boundary, J_intersect_source};
+    //return shifted_intersection / ((J_max / M_max) - slope);
+    return shifted_intersection * M_max / (J_max - slope * M_max);
 }
 
-f3 get_reach_boundary(
-    float J,
-    float M,
-    float h,
-    const f2 &JMcusp,
-    float focusJ,
-    float limit_J_max,
-    float model_gamma,
-    float focus_dist,
-    const ACES2::Table1D & reach_m_table
-)
+float find_gamut_boundary_intersection(const f2 &JM_cusp, float J_max, float gamma_top_inv, float gamma_bottom_inv, const float J_intersect_source, const float slope, const float J_intersect_cusp)
 {
-    const float reachMaxM = reach_m_from_table(h, reach_m_table);
-
-    const float slope_gain = limit_J_max * focus_dist * get_focus_gain(J, JMcusp[0], limit_J_max);
-
-    const float intersectJ = solve_J_intersect(J, M, focusJ, limit_J_max, slope_gain);
+    const float M_boundary_lower = estimate_line_and_boundary_intersection_M(J_intersect_source, slope, gamma_bottom_inv, JM_cusp[0], JM_cusp[1], J_intersect_cusp);
+
+    // The upper hull is flipped and thus 'zeroed' at J_max
+    // Also note we negate the slope
+    const float f_J_intersect_cusp   = J_max - J_intersect_cusp;
+    const float f_J_intersect_source = J_max - J_intersect_source;
+    const float f_JM_cusp_J          = J_max - JM_cusp[0];
+    const float M_boundary_upper =
+      estimate_line_and_boundary_intersection_M(f_J_intersect_source, -slope, gamma_top_inv, f_JM_cusp_J, JM_cusp[1], f_J_intersect_cusp);
+
+    // Smooth minimum between the two calculated values for the M component
+    const float M_boundary = smin_scaled(M_boundary_lower, M_boundary_upper, JM_cusp[1]);
+    return M_boundary;
+}
 
-    float slope;
-    if (intersectJ < focusJ)
-    {
-        slope = intersectJ * (intersectJ - focusJ) / (focusJ * slope_gain);
-    }
-    else
+template <bool invert>
+inline float reinhard_remap(const float scale, const float nd)
+{
+    if (invert)
     {
-        slope = (limit_J_max - intersectJ) * (intersectJ - focusJ) / (focusJ * slope_gain);
+        if (nd >= 1.0f) // TODO: given remap_M already tests against proportion do we need this asymptote test
+            return scale;
+        else
+            return scale * -(nd / (nd - 1.0f));
     }
-
-    const float boundary = limit_J_max * powf(intersectJ / limit_J_max, model_gamma) * reachMaxM / (limit_J_max - slope * reachMaxM);
-    return {J, boundary, h};
+    return scale * nd / (1.0f + nd);
 }
 
-float compression_function(
-    float v,
-    float thr,
-    float lim,
-    bool invert)
+template <bool invert>
+inline float remap_M(const float M, const float gamut_boundary_M, const float reach_boundary_M)
 {
-    float s = (lim - thr) * (1.f - thr) / (lim - 1.f);
-    float nd = (v - thr) / s;
-
-    float vCompressed;
-
-    if (invert) {
-        if (v < thr || lim <= 1.0001f || v > thr + s) {
-            vCompressed = v;
-        } else {
-            vCompressed = thr + s * (-nd / (nd - 1.f));
-        }
-    } else {
-        if (v < thr || lim <= 1.0001f) {
-            vCompressed = v;
-        } else {
-            vCompressed = thr + s * nd / (1.f + nd);
-        }
-    }
-
-    return vCompressed;
+    const float boundary_ratio = gamut_boundary_M / reach_boundary_M;
+    const float proportion = std::max(boundary_ratio, compression_threshold);
+    const float threshold  = proportion * gamut_boundary_M;
+
+    if (M <= threshold || proportion >= 1.0f)
+        return M;
+
+    // Translate to place threshold at zero
+    const float m_offset     = M - threshold;
+    const float gamut_offset = gamut_boundary_M - threshold;
+    const float reach_offset = reach_boundary_M - threshold;
+
+    const float scale = reach_offset / ((reach_offset / gamut_offset) - 1.0f);
+    const float nd = m_offset / scale;
+ 
+    // shift back to absolute
+    return threshold + reinhard_remap<invert>(scale, nd);
 }
 
-f3 compressGamut(const f3 &JMh, float Jx, const ACES2::GamutCompressParams& p, bool invert)
+template <bool invert>
+f3 compressGamut(const f3 &JMh, float Jx, const ACES2::ResolvedSharedCompressionParameters& sr, const ACES2::GamutCompressParams& p, const HueDependantGamutParams hdp)
 {
     const float J = JMh[0];
     const float M = JMh[1];
     const float h = JMh[2];
+    
+    const float slope_gain = get_focus_gain(Jx, hdp.analytical_threshold, sr.limit_J_max, p.focus_dist);
+    const float J_intersect_source = solve_J_intersect(J, M, hdp.focusJ, sr.limit_J_max, slope_gain);
+    const float gamut_slope = compute_compression_vector_slope(J_intersect_source, hdp.focusJ, sr.limit_J_max, slope_gain);
+
+    const float J_intersect_cusp = solve_J_intersect(hdp.JMcusp[0], hdp.JMcusp[1], hdp.focusJ, sr.limit_J_max, slope_gain);
+    const float gamut_boundary_M = find_gamut_boundary_intersection(hdp.JMcusp, sr.limit_J_max, hdp.gamma_top_inv, hdp.gamma_bottom_inv, J_intersect_source, gamut_slope, J_intersect_cusp);
 
-    if (M < 0.0001f || J > p.limit_J_max)
+    if (gamut_boundary_M <= 0.0f) // TODO: when/why does this happen?
     {
         return {J, 0.f, h};
     }
-    else
-    {
-        const f2 project_from = {J, M};
-        const f2 JMcusp = cusp_from_table(h, p.gamut_cusp_table);
-        const float focusJ = lerpf(JMcusp[0], p.mid_J, std::min(1.f, cusp_mid_blend - (JMcusp[0] / p.limit_J_max)));
-        const float slope_gain = p.limit_J_max * p.focus_dist * get_focus_gain(Jx, JMcusp[0], p.limit_J_max);
 
-        const float gamma_top = hue_dependent_upper_hull_gamma(h, p.upper_hull_gamma_table);
-        const float gamma_bottom = p.lower_hull_gamma;
+    const float reachBoundaryM   = estimate_line_and_boundary_intersection_M(J_intersect_source, gamut_slope, sr.model_gamma_inv, sr.limit_J_max, sr.reachMaxM, sr.limit_J_max);
 
-        const f3 boundaryReturn = find_gamut_boundary_intersection({J, M, h}, JMcusp, focusJ, p.limit_J_max, slope_gain, gamma_top, gamma_bottom);
-        const f2 JMboundary = {boundaryReturn[0], boundaryReturn[1]};
-        const f2 project_to = {boundaryReturn[2], 0.f};
+    const float remapped_M = remap_M<invert>(M, gamut_boundary_M, reachBoundaryM);
 
-        if (JMboundary[1] <= 0.0f)
-        {
-            return {J, 0.f, h};
-        }
-
-        const f3 reachBoundary = get_reach_boundary(JMboundary[0], JMboundary[1], h, JMcusp, focusJ, p.limit_J_max, p.model_gamma, p.focus_dist, p.reach_m_table);
-
-        const float difference = std::max(1.0001f, reachBoundary[1] / JMboundary[1]);
-        const float threshold = std::max(compression_threshold, 1.f / difference);
-
-        float v = project_from[1] / JMboundary[1];
-        v = compression_function(v, threshold, difference, invert);
-
-        const f2 JMcompressed {
-            project_to[0] + v * (JMboundary[0] - project_to[0]),
-            project_to[1] + v * (JMboundary[1] - project_to[1])
-        };
+    const f3 JMcompressed {
+        J_intersect_source + remapped_M * gamut_slope,
+        remapped_M,
+        h
+    };
+    return JMcompressed;
+}
 
-        return {JMcompressed[0], JMcompressed[1], h};
-    }
+inline float compute_focusJ(float cusp_J, float mid_J, float limit_J_max)
+{
+    return lerpf(cusp_J, mid_J, std::min(1.f, cusp_mid_blend - (cusp_J / limit_J_max)));
 }
 
-f3 gamut_compress_fwd(const f3 &JMh, const GamutCompressParams &p)
+HueDependantGamutParams init_HueDependantGamutParams(const float hue, const ResolvedSharedCompressionParameters &sr, const GamutCompressParams &p)
 {
-    return compressGamut(JMh, JMh[0], p, false);
+    HueDependantGamutParams hdp;
+    hdp.gamma_bottom_inv = p.lower_hull_gamma_inv;
+
+    const unsigned int i_hi = lookup_hue_interval(hue, p.hue_table, p.hue_linearity_search_range);
+    const float t = interpolation_weight(hue, p.hue_table[i_hi-1], p.hue_table[i_hi]);
+    const f3 cusp = cusp_from_table(i_hi, t, p.gamut_cusp_table);
+
+    hdp.JMcusp = { cusp[0], cusp[1] };
+    hdp.gamma_top_inv = { cusp[2] };
+    hdp.focusJ = compute_focusJ(hdp.JMcusp[0], p.mid_J, sr.limit_J_max);
+    hdp.analytical_threshold = lerpf(hdp.JMcusp[0], sr.limit_J_max, focus_gain_blend);
+    return hdp;
 }
 
-f3 gamut_compress_inv(const f3 &JMh, const GamutCompressParams &p)
+f3 gamut_compress_fwd(const f3 &JMh, const ResolvedSharedCompressionParameters &sr, const GamutCompressParams &p)
 {
-    const f2 JMcusp = cusp_from_table(JMh[2], p.gamut_cusp_table);
-    float Jx = JMh[0];
+    const float J = JMh[0];
+    const float M = JMh[1];
+    const float h = JMh[2];
 
-    f3 unCompressedJMh;
+    if (J <= 0.0f) // Limit to +ve J values // TODO test this is needed
+    {
+        return {0.0f, 0.f, h};
+    }
+    if (M <= 0.0f || J > sr.limit_J_max) // We compress M only so avoid mapping zero
+                                         // Above the expected maximum we explicitly map to 0 M
+    {
+        return {J, 0.f, h};
+    }
+    const HueDependantGamutParams hdp = init_HueDependantGamutParams(h, sr, p);
+    
+    return compressGamut<false>(JMh, JMh[0], sr, p, hdp);
+}
 
-    // Analytic inverse below threshold
-    if (Jx <= lerpf(JMcusp[0], p.limit_J_max, focus_gain_blend))
+f3 gamut_compress_inv(const f3 &JMh, const ResolvedSharedCompressionParameters &sr, const GamutCompressParams &p)
+{
+    const float J = JMh[0];
+    const float M = JMh[1];
+    const float h = JMh[2];
+
+    if (J <= 0.0f) // Limit to +ve J values // TODO test this is needed
     {
-        unCompressedJMh = compressGamut(JMh, Jx, p, true);
+        return {0.0f, 0.f, h};
     }
-    // Approximation above threshold
-    else
+    if (M <= 0.0f || J > sr.limit_J_max) // We compress M only so avoid mapping zero
+                                         // Above the expected maximum we explicitly map to 0 M
     {
-        Jx = compressGamut(JMh, Jx, p, true)[0];
-        unCompressedJMh = compressGamut(JMh, Jx, p, true);
+        return {J, 0.f, h};
     }
+    const HueDependantGamutParams hdp = init_HueDependantGamutParams(h, sr, p);
 
-    return unCompressedJMh;
+    float Jx = J;
+    if (Jx > hdp.analytical_threshold)
+    {
+        // Approximation above threshold
+        Jx = compressGamut<true>(JMh, Jx, sr, p, hdp)[0];
+    }
+    return compressGamut<true>(JMh, Jx, sr, p, hdp);
+}
+
+static constexpr unsigned int gamma_test_count = 5;
+struct testData {
+    f3 testJMh;               // {J, M, h} of the test sample
+    float J_intersect_source; // solve_J_intersect() result for the test sample
+    float slope;              // compute_compression_vector_slope() result for J_intersect_source
+    float J_intersect_cusp;   // solve_J_intersect() result for the cusp, using the sample's slope gain
+};
+
+std::array<testData, gamma_test_count> generate_gamma_test_data(const f2 &JMcusp, float hue, float limit_J_max, float mid_J, float focus_dist)
+{
+    const std::array<float, gamma_test_count> testPositions = {0.01f, 0.1f, 0.5f, 0.8f, 0.99f};
+    std::array<testData, gamma_test_count> data;
+    const float analytical_threshold = lerpf(JMcusp[0], limit_J_max, focus_gain_blend);
+    const float focusJ = compute_focusJ(JMcusp[0], mid_J, limit_J_max);
+
+    // Plain indexed loop over the test positions (lerp from cusp J towards limit_J_max): algorithms may copy
+    // their function objects, so a running index held inside a mutable generator lambda is not guaranteed to persist
+    for (unsigned int testIndex = 0; testIndex != gamma_test_count; ++testIndex)
+    {
+        const float testJ = lerpf(JMcusp[0], limit_J_max, testPositions[testIndex]);
+        const float slope_gain = get_focus_gain(testJ, analytical_threshold, limit_J_max, focus_dist);
+        const float J_intersect_source = solve_J_intersect(testJ, JMcusp[1], focusJ, limit_J_max, slope_gain);
+        data[testIndex] = {
+            { testJ, JMcusp[1], hue },
+            J_intersect_source,
+            compute_compression_vector_slope(J_intersect_source, focusJ, limit_J_max, slope_gain),
+            solve_J_intersect(JMcusp[0], JMcusp[1], focusJ, limit_J_max, slope_gain)
+        };
+    }
+    return data;
+}
 
 bool evaluate_gamma_fit(
     const f2 &JMcusp,
-    const f3 testJMh[3],
-    float topGamma,
+    const std::array<testData, gamma_test_count> &data, // by reference: invoked repeatedly by the gamma search
+    float topGamma_inv,
     float peakLuminance,
     float limit_J_max,
-    float mid_J,
-    float focus_dist,
-    float lower_hull_gamma,
+    float lower_hull_gamma_inv,
     const JMhParams &limitJMhParams)
 {
-    const float focusJ = lerpf(JMcusp[0], mid_J, std::min(1.f, cusp_mid_blend - (JMcusp[0] / limit_J_max)));
-
-    for (size_t testIndex = 0; testIndex < 3; testIndex++)
+    const float luminance_limit = peakLuminance / reference_luminance;
+    for (const auto &test_data : data)
     {
-        const float slope_gain = limit_J_max * focus_dist * get_focus_gain(testJMh[testIndex][0], JMcusp[0], limit_J_max);
-        const f3 approxLimit = find_gamut_boundary_intersection(testJMh[testIndex], JMcusp, focusJ, limit_J_max, slope_gain, topGamma, lower_hull_gamma);
-        const f3 approximate_JMh = {approxLimit[0], approxLimit[1], testJMh[testIndex][2]};
+        const float approxLimit_M = find_gamut_boundary_intersection(JMcusp, limit_J_max, topGamma_inv, lower_hull_gamma_inv,
+                                                    test_data.J_intersect_source, test_data.slope, test_data.J_intersect_cusp);
+        const float approxLimit_J = test_data.J_intersect_source + test_data.slope * approxLimit_M;
+        // Convert the approximated boundary point to RGB; the fit fails as soon as one test point is not outside the hull
+        const f3 approximate_JMh = {approxLimit_J, approxLimit_M, test_data.testJMh[2]};
         const f3 newLimitRGB = JMh_to_RGB(approximate_JMh, limitJMhParams);
-        const f3 newLimitRGBScaled = mult_f_f3(reference_luminance / peakLuminance, newLimitRGB);
 
-        if (!outside_hull(newLimitRGBScaled))
+        if (!outside_hull(newLimitRGB, luminance_limit))
         {
             return false;
         }
@@ -739,47 +1234,38 @@ bool evaluate_gamma_fit(
     return true;
 }
 
-Table1D make_upper_hull_gamma(
-    const Table3D &gamutCuspTable,
+void make_upper_hull_gamma(
+    const Table1D &hue_table,
+    Table3D &gamutCuspTable,
     float peakLuminance,
     float limit_J_max,
     float mid_J,
     float focus_dist,
-    float lower_hull_gamma,
+    float lower_hull_gamma_inv,
     const JMhParams &limitJMhParams)
 {
-    const int test_count = 3;
-    const float testPositions[test_count] = {0.01f, 0.5f, 0.99f};
-
-    Table1D gammaTable{};
-    Table1D gamutTopGamma{};
-
-    for (int i = 0; i < gammaTable.size; i++)
+    for (unsigned int i = gamutCuspTable.first_nominal_index; i != gamutCuspTable.upper_wrap_index; ++i)
     {
-        gammaTable.table[i] = -1.f;
+        const float hue = hue_table[i];
+        const f2 JMcusp = { gamutCuspTable[i][0], gamutCuspTable[i][1] };
 
-        const float hue = (float) i;
-        const f2 JMcusp = cusp_from_table(hue, gamutCuspTable);
-
-        f3 testJMh[test_count]{};
-        for (int testIndex = 0; testIndex < test_count; testIndex++)
-        {
-            const float testJ = JMcusp[0] + ((limit_J_max - JMcusp[0]) * testPositions[testIndex]);
-            testJMh[testIndex] = {
-                testJ,
-                JMcusp[1],
-                hue
-            };
-        }
+        std::array<testData, gamma_test_count> data = generate_gamma_test_data(JMcusp, hue, limit_J_max, mid_J, focus_dist);
 
+        // TODO: calculate best fitting inverse gamma directly rather than reciprocating it in the loop
+        // TODO: adjacent found gamma values typically fall close to each other could initialise the search range
+        //       with values near to speed up searching. Note that discrepancies do occur at/near corners of gamut
         const float search_range = gammaSearchStep;
         float low = gammaMinimum;
         float high = low + search_range;
         bool outside = false;
 
-        while (!(outside) && (high < 5.f))
+        // Non-escaping local helper: capture by reference so data and limitJMhParams are not copied into the closure per hue
+        auto gamma_fit_predicate = [&](float gamma) {
+            return evaluate_gamma_fit(JMcusp, data, 1.0f / gamma, peakLuminance, limit_J_max, lower_hull_gamma_inv, limitJMhParams);
+        };
+        while (!(outside) && (high < gammaMaximum))
         {
-            const bool gammaFound = evaluate_gamma_fit(JMcusp, testJMh, high, peakLuminance, limit_J_max, mid_J, focus_dist, lower_hull_gamma, limitJMhParams);
+            const bool gammaFound = gamma_fit_predicate(high);
             if (!gammaFound)
             {
                 low = high;
@@ -794,30 +1280,23 @@ Table1D make_upper_hull_gamma(
         float testGamma = -1.f;
         while ( (high-low) > gammaAccuracy)
         {
-            testGamma = (high + low) / 2.f;
-            const bool gammaFound = evaluate_gamma_fit(JMcusp, testJMh, testGamma, peakLuminance, limit_J_max, mid_J, focus_dist, lower_hull_gamma, limitJMhParams);
+            testGamma = midpoint(high, low);
+            const bool gammaFound = gamma_fit_predicate(testGamma);
             if (gammaFound)
             {
                 high = testGamma;
-                gammaTable.table[i] = high;
             }
             else
             {
                 low = testGamma;
             }
         }
-
-        // Duplicate gamma value to array, leaving empty entries at first and last position
-        gamutTopGamma.table[i+gamutTopGamma.base_index] = gammaTable.table[i];
+        gamutCuspTable[i][2] = 1.0f / high;
     }
 
-    // Copy last populated entry to first empty spot
-    gamutTopGamma.table[0] = gammaTable.table[gammaTable.size-1];
-
-    // Copy first populated entry to last empty spot
-    gamutTopGamma.table[gamutTopGamma.total_size-1] = gammaTable.table[0];
-
-    return gamutTopGamma;
+    // Copy last populated entries to empty spot 'wrapping' entries
+    gamutCuspTable[gamutCuspTable.lower_wrap_index][2] = gamutCuspTable[gamutCuspTable.last_nominal_index][2];
+    gamutCuspTable[gamutCuspTable.upper_wrap_index][2] = gamutCuspTable[gamutCuspTable.first_nominal_index][2];
 }
 
 // Tonescale pre-calculations
@@ -836,6 +1315,8 @@ ToneScaleParams init_ToneScaleParams(float peakLuminance)
     const float r_hit_max = 896.f;   // scene-referred value "hitting the roof"
 
     // Calculate output constants
+    // TODO: factor these calculations into well named functions
+    // TODO: ensure correct use of n_r vs n vs reference_luminance vs 100.f etc
     const float r_hit = r_hit_min + (r_hit_max - r_hit_min) * (log(n/n_r)/log(10000.f/100.f));
     const float m_0 = (n / n_r);
     const float m_1 = 0.5f * (m_0 + sqrt(m_0 * (m_0 + 4.f * t_1)));
@@ -846,9 +1327,12 @@ ToneScaleParams init_ToneScaleParams(float peakLuminance)
     const float g_ip = 0.5f * (c_t + sqrt(c_t * (c_t + 4.f * t_1)));
     const float g_ipp2 = -(m_1 * powf((g_ip/m),(1.f/g))) / (powf(g_ip/m , 1.f/g)-1.f);
     const float w_2 = c / g_ipp2;
-    const float s_2 = w_2 * m_1;
+    const float s_2 = w_2 * m_1 * reference_luminance;
     const float u_2 = powf((r_hit/m_1)/((r_hit/m_1) + w_2), g);
     const float m_2 = m_1 / u_2;
+    const float inverse_limit = n / (u_2 * n_r);
+    const float forward_limit = 8.0f * r_hit;
+    const float log_peak = log10( n / n_r);
 
     ToneScaleParams TonescaleParams = {
         n,
@@ -858,70 +1342,100 @@ ToneScaleParams init_ToneScaleParams(float peakLuminance)
         c_t,
         s_2,
         u_2,
-        m_2
+        m_2,
+        forward_limit,
+        inverse_limit,
+        log_peak
     };
 
     return TonescaleParams;
 }
 
-ChromaCompressParams init_ChromaCompressParams(float peakLuminance)
+SharedCompressionParameters init_SharedCompressionParams(float peakLuminance, const JMhParams &inputJMhParams, const JMhParams &reachParams)
 {
-    const ToneScaleParams tsParams = init_ToneScaleParams(peakLuminance);
-    const JMhParams inputJMhParams = init_JMhParams(ACES_AP0::primaries);
+    const float limit_J_max = Y_to_J(peakLuminance, inputJMhParams);
+    const float model_gamma_inv = 1.f / model_gamma();
 
-    float limit_J_max = Y_to_J(peakLuminance, inputJMhParams);
+    SharedCompressionParameters params = {
+        limit_J_max,
+        model_gamma_inv,
+        make_reach_m_table(reachParams, limit_J_max)
+    };
+    return params;
+}
+
+ResolvedSharedCompressionParameters resolve_CompressionParams(float hue, const SharedCompressionParameters &p)
+{
+    ResolvedSharedCompressionParameters params = {
+        p.limit_J_max,
+        p.model_gamma_inv,
+        reach_m_from_table(hue, p.reach_m_table)
+    };
+    return params;
+}
 
+ChromaCompressParams init_ChromaCompressParams(float peakLuminance, const ToneScaleParams &tsParams)
+{
     // Calculated chroma compress variables
-    const float log_peak = log10( tsParams.n / tsParams.n_r);
-    const float compr = chroma_compress + (chroma_compress * chroma_compress_fact) * log_peak;
-    const float sat = std::max(0.2f, chroma_expand - (chroma_expand * chroma_expand_fact) * log_peak);
+    // TODO: name these magic constants
+    const float compr = chroma_compress + (chroma_compress * chroma_compress_fact) * tsParams.log_peak;
+    const float sat = std::max(0.2f, chroma_expand - (chroma_expand * chroma_expand_fact) * tsParams.log_peak);
     const float sat_thr = chroma_expand_thr / tsParams.n;
-    const float model_gamma = 1.f / (surround[1] * (1.48f + sqrt(Y_b / L_A)));
-
-    ChromaCompressParams params{};
-    params.limit_J_max = limit_J_max;
-    params.model_gamma = model_gamma;
-    params.sat = sat;
-    params.sat_thr = sat_thr;
-    params.compr = compr;
-    params.chroma_compress_scale = powf(0.03379f * peakLuminance, 0.30596f) - 0.45135f;
-    params.reach_m_table = make_reach_m_table(ACES_AP1::primaries, peakLuminance);
+    const float chroma_compress_scale =  powf(0.03379f * peakLuminance, 0.30596f) - 0.45135f;
+
+    ChromaCompressParams params = {
+        sat,
+        sat_thr,
+        compr,
+        chroma_compress_scale
+    };
     return params;
 }
 
-GamutCompressParams init_GamutCompressParams(float peakLuminance, const Primaries &limitingPrimaries)
+std::array<int, 2> determine_hue_linearity_search_range(const Table3D &gamutCuspTable)
 {
-    const ToneScaleParams tsParams = init_ToneScaleParams(peakLuminance);
-    const JMhParams inputJMhParams = init_JMhParams(ACES_AP0::primaries);
+    // This function searches through the hues looking for the largest deviations from a linear distribution.
+    // We can then use this to initialise the binary search range to something smaller than the full table,
+    // reducing the number of table probes per hue lookup from ~ceil(log2(table size)) to ~ceil(log2(range))
+    // during image rendering.
+
+    // TODO: Padding values are a quick hack to ensure the range encloses the needed range; it is left as an exercise
+    // for the reader to fully reason whether these values could be smaller. This is probably best done the closer the
+    // hue distribution comes to linear, as the overhead becomes a potentially greater issue
+    constexpr int lower_padding = 0;
+    constexpr int upper_padding = 1;
+    std::array<int, 2> hue_linearity_search_range = {lower_padding, upper_padding};
+    for (unsigned int i = gamutCuspTable.first_nominal_index; i != gamutCuspTable.upper_wrap_index; ++i)
+    {
+        const unsigned int pos = gamutCuspTable.nominal_hue_position_in_uniform_table(gamutCuspTable[i][2]); // TODO: compute from hue table less cache pressure?
+        const int delta = int(i) - int(pos);
+        hue_linearity_search_range[0] = std::min(hue_linearity_search_range[0], delta + lower_padding);
+        hue_linearity_search_range[1] = std::max(hue_linearity_search_range[1], delta + upper_padding);
+
+        //std::cout << i << " " << pos << " " << delta << " " << gamutCuspTable[i][0] << " " << gamutCuspTable[i][1] << " " << gamutCuspTable[i][2] << "\n";
+    }
+    //std::cout << "search range " << hue_linearity_search_range[0] << " " << hue_linearity_search_range[1] << "\n";
+    return hue_linearity_search_range;
+}
 
-    float limit_J_max = Y_to_J(peakLuminance, inputJMhParams);
-    float mid_J = Y_to_J(tsParams.c_t * 100.f, inputJMhParams);
+GamutCompressParams init_GamutCompressParams(float peakLuminance, const JMhParams &inputJMhParams, const JMhParams &limitJMhParams,
+                                             const ToneScaleParams &tsParams, const SharedCompressionParameters &shParams, const JMhParams &reachParams)
+{
+    float mid_J = Y_to_J(tsParams.c_t * reference_luminance, inputJMhParams);
 
     // Calculated chroma compress variables
-    const float log_peak = log10( tsParams.n / tsParams.n_r);
-    const float model_gamma = 1.f / (surround[1] * (1.48f + sqrt(Y_b / L_A)));
-    const float focus_dist = focus_distance + focus_distance * focus_distance_scaling * log_peak;
-    const float lower_hull_gamma =  1.14f + 0.07f * log_peak;
-
-    const JMhParams limitJMhParams = init_JMhParams(limitingPrimaries);
+    const float focus_dist = focus_distance + focus_distance * focus_distance_scaling * tsParams.log_peak;
+    const float lower_hull_gamma_inv =  1.0f / (1.14f + 0.07f * tsParams.log_peak); // TODO: name these magic constants
 
-    GamutCompressParams params{};
-    params.limit_J_max = limit_J_max;
+    GamutCompressParams params;
     params.mid_J = mid_J;
-    params.model_gamma = model_gamma;
     params.focus_dist = focus_dist;
-    params.lower_hull_gamma = lower_hull_gamma;
-    params.reach_m_table = make_reach_m_table(ACES_AP1::primaries, peakLuminance);
-    params.gamut_cusp_table = make_gamut_table(limitingPrimaries, peakLuminance);
-    params.upper_hull_gamma_table = make_upper_hull_gamma(
-        params.gamut_cusp_table,
-        peakLuminance,
-        limit_J_max,
-        mid_J,
-        focus_dist,
-        lower_hull_gamma,
-        limitJMhParams);
-
+    params.lower_hull_gamma_inv = lower_hull_gamma_inv;
+    params.gamut_cusp_table = make_uniform_hue_gamut_table(reachParams, limitJMhParams, peakLuminance, tsParams.forward_limit, shParams, params.hue_table);
+    //params.gamut_cusp_table = make_gamut_table(limitJMhParams, peakLuminance, params.hue_table);
+    params.hue_linearity_search_range = determine_hue_linearity_search_range(params.gamut_cusp_table);
+    make_upper_hull_gamma(params.hue_table, params.gamut_cusp_table, peakLuminance, shParams.limit_J_max, mid_J, focus_dist,
+                          lower_hull_gamma_inv, limitJMhParams); //TODO: mess of parameters
     return params;
 }
 
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.h b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.h
index ff3f969e5a..11adc13fdb 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.h
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.h
@@ -14,17 +14,31 @@ namespace ACES2
 
 JMhParams init_JMhParams(const Primaries &P);
 ToneScaleParams init_ToneScaleParams(float peakLuminance);
-ChromaCompressParams init_ChromaCompressParams(float peakLuminance);
-GamutCompressParams init_GamutCompressParams(float peakLuminance, const Primaries &P);
-
+SharedCompressionParameters init_SharedCompressionParams(float peakLuminance, const JMhParams &inputJMhParams, const JMhParams &reachParams);
+ResolvedSharedCompressionParameters resolve_CompressionParams(float hue, const SharedCompressionParameters &p);
+ChromaCompressParams init_ChromaCompressParams(float peakLuminance, const ToneScaleParams &tsParams);
+GamutCompressParams init_GamutCompressParams(float peakLuminance, const JMhParams &inputJMhParams, const JMhParams &limitJMhParams,
+                                             const ToneScaleParams &tsParams, const SharedCompressionParameters &shParams, const JMhParams &reachParams);
+
+f3 RGB_to_Aab(const f3 &RGB, const JMhParams &p);
+f3 Aab_to_JMh(const f3 &Aab, const JMhParams &p);
 f3 RGB_to_JMh(const f3 &RGB, const JMhParams &p);
+f3 JMh_to_Aab(const f3 &JMh, const JMhParams &p);
+f3 JMh_to_Aab(const f3 &JMh, const float &cos_hr, const float &sin_hr, const JMhParams &p);
+f3 Aab_to_RGB(const f3 &Aab, const JMhParams &p);
 f3 JMh_to_RGB(const f3 &JMh, const JMhParams &p);
 
-f3 tonescale_chroma_compress_fwd(const f3 &JMh, const JMhParams &p, const ToneScaleParams &pt, const ChromaCompressParams &pc);
-f3 tonescale_chroma_compress_inv(const f3 &JMh, const JMhParams &p, const ToneScaleParams &pt, const ChromaCompressParams &pc);
+float tonescale_fwd(const float J, const JMhParams &p, const ToneScaleParams &pt);
+float tonescale_inv(const float J, const JMhParams &p, const ToneScaleParams &pt);
+float tonescale_A_to_J_fwd(const float A, const JMhParams &p, const ToneScaleParams &pt);
+
+float chroma_compress_norm(float cos_hr, float sin_hr, float chroma_compress_scale);
+
+f3 chroma_compress_fwd(const f3 &JMh, const float J_ts, const float Mnorm, const ResolvedSharedCompressionParameters &ps, const ChromaCompressParams &pc);
+f3 chroma_compress_inv(const f3 &JMh, const float J, const float Mnorm, const ResolvedSharedCompressionParameters &ps, const ChromaCompressParams &pc);
 
-f3 gamut_compress_fwd(const f3 &JMh, const GamutCompressParams &p);
-f3 gamut_compress_inv(const f3 &JMh, const GamutCompressParams &p);
+f3 gamut_compress_fwd(const f3 &JMh, const ResolvedSharedCompressionParameters &ps, const GamutCompressParams &p);
+f3 gamut_compress_inv(const f3 &JMh, const ResolvedSharedCompressionParameters &ps, const GamutCompressParams &p);
 
 
 } // namespace ACES2
diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp
index e9e9774338..feb387db5f 100644
--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp
@@ -147,6 +147,7 @@ class Renderer_ACES_OutputTransform20 : public OpCPU
     ACES2::JMhParams m_pIn;
     ACES2::JMhParams m_pOut;
     ACES2::ToneScaleParams m_t;
+    ACES2::SharedCompressionParameters m_s;
     ACES2::ChromaCompressParams m_c;
     ACES2::GamutCompressParams m_g;
 };
@@ -184,6 +185,7 @@ class Renderer_ACES_TONESCALE_COMPRESS_20 : public OpCPU
     bool m_fwd;
     ACES2::JMhParams m_p;
     ACES2::ToneScaleParams m_t;
+    ACES2::SharedCompressionParameters m_s;
     ACES2::ChromaCompressParams m_c;
 };
 
@@ -201,6 +203,7 @@ class Renderer_ACES_GAMUT_COMPRESS_20 : public OpCPU
 
 protected:
     bool m_fwd;
+    ACES2::SharedCompressionParameters m_s;
     ACES2::GamutCompressParams m_g;
 };
 
@@ -1023,8 +1026,10 @@ Renderer_ACES_OutputTransform20::Renderer_ACES_OutputTransform20(ConstFixedFunct
     m_pIn = ACES2::init_JMhParams(ACES_AP0::primaries);
     m_pOut = ACES2::init_JMhParams(lim_primaries);
     m_t = ACES2::init_ToneScaleParams(peak_luminance);
-    m_c = ACES2::init_ChromaCompressParams(peak_luminance);
-    m_g = ACES2::init_GamutCompressParams(peak_luminance, lim_primaries);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    m_s = ACES2::init_SharedCompressionParams(peak_luminance, m_pIn, reachGamut);
+    m_c = ACES2::init_ChromaCompressParams(peak_luminance, m_t);
+    m_g = ACES2::init_GamutCompressParams(peak_luminance, m_pIn, m_pOut, m_t, m_s, reachGamut);
 }
 
 void Renderer_ACES_OutputTransform20::apply(const void * inImg, void * outImg, long numPixels) const
@@ -1047,10 +1052,21 @@ void Renderer_ACES_OutputTransform20::fwd(const void * inImg, void * outImg, lon
     for(long idx=0; idx<numPixels; ++idx)
     {
         const ACES2::f3 RGBIn {in[0], in[1], in[2]};
-        const ACES2::f3 JMh           = ACES2::RGB_to_JMh(RGBIn, m_pIn);
-        const ACES2::f3 tonemappedJMh = ACES2::tonescale_chroma_compress_fwd(JMh, m_pIn, m_t, m_c);
-        const ACES2::f3 compressedJMh = ACES2::gamut_compress_fwd(tonemappedJMh, m_g);
-        const ACES2::f3 RGBOut        = ACES2::JMh_to_RGB(compressedJMh, m_pOut);
+        const ACES2::f3 Aab           = ACES2::RGB_to_Aab(RGBIn, m_pIn);
+        const ACES2::f3 JMh           = ACES2::Aab_to_JMh(Aab, m_pIn);
+
+        const ACES2::ResolvedSharedCompressionParameters rp = resolve_CompressionParams(JMh[2], m_s);
+        const float h_rad = ACES2::to_radians(JMh[2]);
+        const float cos_hr1 = std::cos(h_rad);
+        const float sin_hr1 = std::sin(h_rad);
+        const float Mnorm = ACES2::chroma_compress_norm(cos_hr1, sin_hr1, m_c.chroma_compress_scale);
+
+        const float J_ts = ACES2::tonescale_A_to_J_fwd(Aab[0], m_pIn, m_t);
+        const ACES2::f3 tonemappedJMh = ACES2::chroma_compress_fwd(JMh, J_ts, Mnorm, rp, m_c);
+        const ACES2::f3 compressedJMh = ACES2::gamut_compress_fwd(tonemappedJMh, rp, m_g);
+
+        const ACES2::f3 Aabout        = ACES2::JMh_to_Aab(compressedJMh, cos_hr1, sin_hr1, m_pOut);
+        const ACES2::f3 RGBOut        = ACES2::Aab_to_RGB(Aabout, m_pOut);
 
         out[0] = RGBOut[0];
         out[1] = RGBOut[1];
@@ -1071,10 +1087,20 @@ void Renderer_ACES_OutputTransform20::inv(const void * inImg, void * outImg, lon
     {
         const ACES2::f3 RGBout {in[0], in[1], in[2]};
         const ACES2::f3 compressedJMh = ACES2::RGB_to_JMh(RGBout, m_pOut);
-        const ACES2::f3 tonemappedJMh = ACES2::gamut_compress_inv(compressedJMh, m_g);
-        const ACES2::f3 JMh           = ACES2::tonescale_chroma_compress_inv(tonemappedJMh, m_pIn, m_t, m_c);
-        const ACES2::f3 RGBin         = ACES2::JMh_to_RGB(JMh, m_pIn);
 
+        const ACES2::ResolvedSharedCompressionParameters rp = resolve_CompressionParams(compressedJMh[2], m_s);
+        const float h_rad = ACES2::to_radians(compressedJMh[2]);
+        const float cos_hr1 = std::cos(h_rad);
+        const float sin_hr1 = std::sin(h_rad);
+        const float Mnorm = ACES2::chroma_compress_norm(cos_hr1, sin_hr1, m_c.chroma_compress_scale);
+
+        const ACES2::f3 tonemappedJMh = ACES2::gamut_compress_inv(compressedJMh, rp, m_g);
+        const float J         = ACES2::tonescale_inv(tonemappedJMh[0], m_pIn, m_t);
+        const ACES2::f3 JMh   = ACES2::chroma_compress_inv(tonemappedJMh, J, Mnorm, rp, m_c);
+    
+        const ACES2::f3 Aab   = ACES2::JMh_to_Aab(JMh, cos_hr1, sin_hr1, m_pIn);
+        const ACES2::f3 RGBin = ACES2::Aab_to_RGB(Aab, m_pIn);
+    
         out[0] = RGBin[0];
         out[1] = RGBin[1];
         out[2] = RGBin[2];
@@ -1132,7 +1158,7 @@ void Renderer_ACES_RGB_TO_JMh_20::fwd(const void * inImg, void * outImg, long nu
 
         out[0] = JMh[0];
         out[1] = JMh[1];
-        out[2] = JMh[2];
+        out[2] = ACES2::to_degrees(JMh[2]);
         out[3] = in[3];
 
         in  += 4;
@@ -1147,7 +1173,8 @@ void Renderer_ACES_RGB_TO_JMh_20::inv(const void * inImg, void * outImg, long nu
 
     for(long idx=0; idx<numPixels; ++idx)
     {
-        const ACES2::f3 RGB = ACES2::JMh_to_RGB({in[0], in[1], in[2]}, m_p);
+        const float normalised_hue = ACES2::from_degrees(in[2]);
+        const ACES2::f3 RGB = ACES2::JMh_to_RGB({in[0], in[1], normalised_hue}, m_p);
 
         out[0] = RGB[0];
         out[1] = RGB[1];
@@ -1168,7 +1195,9 @@ Renderer_ACES_TONESCALE_COMPRESS_20::Renderer_ACES_TONESCALE_COMPRESS_20(ConstFi
 
     m_p = ACES2::init_JMhParams(ACES_AP0::primaries);
     m_t = ACES2::init_ToneScaleParams(peak_luminance);
-    m_c = ACES2::init_ChromaCompressParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    m_s = ACES2::init_SharedCompressionParams(peak_luminance, m_p, reachGamut);
+    m_c = ACES2::init_ChromaCompressParams(peak_luminance, m_t);
 }
 
 void Renderer_ACES_TONESCALE_COMPRESS_20::apply(const void * inImg, void * outImg, long numPixels) const
@@ -1190,11 +1219,18 @@ void Renderer_ACES_TONESCALE_COMPRESS_20::fwd(const void * inImg, void * outImg,
 
     for(long idx=0; idx<numPixels; ++idx)
     {
-        const ACES2::f3 JMh = ACES2::tonescale_chroma_compress_fwd({in[0], in[1], in[2]}, m_p, m_t, m_c);
+        const float normalised_hue = ACES2::from_degrees(in[2]);
+        const float h_rad = ACES2::to_radians(normalised_hue);
+        const float cos_hr1 = cos(h_rad);
+        const float sin_hr1 = sin(h_rad);
+        const float Mnorm = ACES2::chroma_compress_norm(cos_hr1, sin_hr1, m_c.chroma_compress_scale);
+        const ACES2::ResolvedSharedCompressionParameters rp = resolve_CompressionParams(normalised_hue, m_s);
+        const float J_ts = ACES2::tonescale_fwd(in[0], m_p, m_t);
+        const ACES2::f3 JMh = ACES2::chroma_compress_fwd({in[0], in[1], normalised_hue}, J_ts, Mnorm, rp, m_c);
 
         out[0] = JMh[0];
         out[1] = JMh[1];
-        out[2] = JMh[2];
+        out[2] = ACES2::to_degrees(JMh[2]);
         out[3] = in[3];
 
         in  += 4;
@@ -1209,11 +1245,18 @@ void Renderer_ACES_TONESCALE_COMPRESS_20::inv(const void * inImg, void * outImg,
 
     for(long idx=0; idx<numPixels; ++idx)
     {
-        const ACES2::f3 JMh = ACES2::tonescale_chroma_compress_inv({in[0], in[1], in[2]}, m_p, m_t, m_c);
+        const float normalised_hue = ACES2::from_degrees(in[2]);
+        const float h_rad = ACES2::to_radians(normalised_hue);
+        const float cos_hr1 = cos(h_rad);
+        const float sin_hr1 = sin(h_rad);
+        const float Mnorm = ACES2::chroma_compress_norm(cos_hr1, sin_hr1, m_c.chroma_compress_scale);
+        const ACES2::ResolvedSharedCompressionParameters rp = resolve_CompressionParams(normalised_hue, m_s);
+        const float J = ACES2::tonescale_inv(in[0], m_p, m_t);
+        const ACES2::f3 JMh = ACES2::chroma_compress_inv({in[0], in[1],  normalised_hue}, J, Mnorm, rp, m_c);
 
         out[0] = JMh[0];
         out[1] = JMh[1];
-        out[2] = JMh[2];
+        out[2] = ACES2::to_degrees(JMh[2]);
         out[3] = in[3];
 
         in  += 4;
@@ -1244,7 +1287,12 @@ Renderer_ACES_GAMUT_COMPRESS_20::Renderer_ACES_GAMUT_COMPRESS_20(ConstFixedFunct
         {white_x, white_y}
     };
 
-    m_g = ACES2::init_GamutCompressParams(peakLuminance, limitingPrimaries);
+    const ACES2::JMhParams pIn = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::JMhParams pLim = ACES2::init_JMhParams(limitingPrimaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peakLuminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    m_s = ACES2::init_SharedCompressionParams(peakLuminance, pIn, reachGamut);
+    m_g = ACES2::init_GamutCompressParams(peakLuminance, pIn, pLim, t, m_s, reachGamut);
 }
 
 void Renderer_ACES_GAMUT_COMPRESS_20::apply(const void * inImg, void * outImg, long numPixels) const
@@ -1266,11 +1314,13 @@ void Renderer_ACES_GAMUT_COMPRESS_20::fwd(const void * inImg, void * outImg, lon
 
     for(long idx=0; idx<numPixels; ++idx)
     {
-        const ACES2::f3 JMh = ACES2::gamut_compress_fwd({in[0], in[1], in[2]}, m_g);
+        const float normalised_hue = ACES2::from_degrees(in[2]);
+        const ACES2::ResolvedSharedCompressionParameters rp = resolve_CompressionParams(normalised_hue, m_s);
+        const ACES2::f3 JMh = ACES2::gamut_compress_fwd({in[0], in[1], normalised_hue}, rp, m_g);
 
         out[0] = JMh[0];
         out[1] = JMh[1];
-        out[2] = JMh[2];
+        out[2] = ACES2::to_degrees(JMh[2]);
         out[3] = in[3];
 
         in  += 4;
@@ -1285,11 +1335,13 @@ void Renderer_ACES_GAMUT_COMPRESS_20::inv(const void * inImg, void * outImg, lon
 
     for(long idx=0; idx<numPixels; ++idx)
     {
-        const ACES2::f3 JMh = ACES2::gamut_compress_inv({in[0], in[1], in[2]}, m_g);
+        const float normalised_hue = ACES2::from_degrees(in[2]);
+        const ACES2::ResolvedSharedCompressionParameters rp = resolve_CompressionParams(normalised_hue, m_s);
+        const ACES2::f3 JMh = ACES2::gamut_compress_inv({in[0], in[1], normalised_hue}, rp, m_g);
 
         out[0] = JMh[0];
         out[1] = JMh[1];
-        out[2] = JMh[2];
+        out[2] = ACES2::to_degrees(JMh[2]);
         out[3] = in[3];
 
         in  += 4;
diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
index 1b4ba870dd..ea6dced0c0 100644
--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
@@ -349,57 +349,143 @@ void Add_GamutComp_13_Inv_Shader(GpuShaderText & ss,
     );
 }
 
-void _Add_RGB_to_JMh_Shader(
+void _Add_WrapHueChannel_Shader(
+    GpuShaderCreatorRcPtr & shaderCreator,
+    GpuShaderText & ss)
+{
+    const std::string pxl(shaderCreator->getPixelName());
+    ss.newLine() << ss.floatDecl("hwrap") << " = " << pxl << ".b;";
+    ss.newLine() << "hwrap = hwrap - floor(hwrap / 360.0) * 360.0;";
+    ss.newLine() << "hwrap = (hwrap < 0.0) ? hwrap + 360.0 : hwrap;";
+    ss.newLine() << pxl << ".b = hwrap;";
+}
+
+void _Add_SinCos_Shader(
+    GpuShaderCreatorRcPtr & shaderCreator,
+    GpuShaderText & ss)
+{
+    const std::string pxl(shaderCreator->getPixelName());
+    ss.newLine() << ss.floatDecl("h_rad") << " = " << pxl << ".b * " << 3.14159265358979f / 180.0f << ";";
+    ss.newLine() << ss.floatDecl("cos_hr") << " = cos(h_rad);";
+    ss.newLine() << ss.floatDecl("sin_hr") << " = sin(h_rad);";
+}
+
+
+void _Add_RGB_to_Aab_Shader(
     GpuShaderCreatorRcPtr & shaderCreator,
     GpuShaderText & ss,
     const ACES2::JMhParams & p)
 {
     const std::string pxl(shaderCreator->getPixelName());
 
-    ss.newLine() << ss.float3Decl("lms") << " = " << ss.mat3fMul(&p.MATRIX_RGB_to_CAM16[0], pxl + ".rgb") << ";";
-    ss.newLine() << "lms = " << "lms * " << ss.float3Const(p.D_RGB[0], p.D_RGB[1], p.D_RGB[2]) << ";";
+    ss.newLine() << "{";
+    ss.indent();
 
-    ss.newLine() << ss.float3Decl("F_L_v") << " = pow(" << p.F_L << " * abs(lms) / 100.0, " << ss.float3Const(0.42f) << ");";
-    ss.newLine() << ss.float3Decl("rgb_a") << " = (400.0 * sign(lms) * F_L_v) / (27.13 + F_L_v);";
+    ss.newLine() << ss.float3Decl("lms") << " = " << ss.mat3fMul(&p.MATRIX_RGB_to_CAM16_c[0], pxl + ".rgb") << ";";
 
-    ss.newLine() << ss.floatDecl("A") << " = 2.0 * rgb_a.r + rgb_a.g + 0.05 * rgb_a.b;";
-    ss.newLine() << ss.floatDecl("a") << " = rgb_a.r - 12.0 * rgb_a.g / 11.0 + rgb_a.b / 11.0;";
-    ss.newLine() << ss.floatDecl("b") << " = (rgb_a.r + rgb_a.g - 2.0 * rgb_a.b) / 9.0;";
+    ss.newLine() << ss.float3Decl("F_L_v") << " = pow(abs(lms), " << ss.float3Const(0.42f) << ");";
+    ss.newLine() << ss.float3Decl("rgb_a") << " = (sign(lms) * F_L_v) / ( " << ACES2::cam_nl_offset << " + F_L_v);";
 
-    ss.newLine() << ss.floatDecl("J") << " = 100.0 * pow(A / " << p.A_w << ", " << ACES2::surround[1] << " * " << p.z << ");";
+    ss.newLine() << "Aab = " << ss.mat3fMul(&p.MATRIX_cone_response_to_Aab[0], "rgb_a.rgb") << ";";
 
-    ss.newLine() << ss.floatDecl("M") << " = (J == 0.0) ? 0.0 : 43.0 * " << ACES2::surround[2] << " * sqrt(a * a + b * b);";
+    ss.dedent();
+    ss.newLine() << "}";
+}
 
-    ss.newLine() << ss.floatDecl("h") << " = (a == 0.0) ? 0.0 : " << ss.atan2("b", "a") << " * 180.0 / 3.14159265358979;";
+void _Add_Aab_to_JMh_Shader(
+    GpuShaderCreatorRcPtr & shaderCreator,
+    GpuShaderText & ss,
+    const ACES2::JMhParams & p)
+{
+    const std::string pxl(shaderCreator->getPixelName());
+
+    ss.newLine() << "{";
+    ss.indent();
+
+    ss.newLine() << ss.floatDecl("J") << " = " << ACES2::J_scale << " * pow(Aab.r, " << p.cz << ");";
+
+    ss.newLine() << ss.floatDecl("M") << " = (J == 0.0) ? 0.0 : sqrt(Aab.g * Aab.g + Aab.b * Aab.b);";
+
+    ss.newLine() << ss.floatDecl("h") << " = (Aab.g == 0.0) ? 0.0 : " << ss.atan2("Aab.b", "Aab.g") << " * " << 180.0 / 3.14159265358979 << ";";
     ss.newLine() << "h = h - floor(h / 360.0) * 360.0;";
     ss.newLine() << "h = (h < 0.0) ? h + 360.0 : h;";
 
-    ss.newLine() << pxl << ".rgb = " << ss.float3Const("J", "M", "h") << ";";
+    ss.newLine() << "JMh.rgb = " << ss.float3Const("J", "M", "h") << ";";
+
+    ss.dedent();
+    ss.newLine() << "}";
 }
 
-void _Add_JMh_to_RGB_Shader(
+void _Add_RGB_to_JMh_Shader(
     GpuShaderCreatorRcPtr & shaderCreator,
     GpuShaderText & ss,
     const ACES2::JMhParams & p)
 {
     const std::string pxl(shaderCreator->getPixelName());
 
-    ss.newLine() << ss.floatDecl("h") << " = " << pxl << ".b * 3.14159265358979 / 180.0;";
+    ss.newLine() << ss.float3Decl("JMh") << ";";    // TODO: leaky abstraction should really be explicit functions
+    ss.newLine() << ss.float3Decl("Aab") << ";";    // TODO: leaky abstraction should really be explicit functions
 
-    ss.newLine() << ss.floatDecl("scale") << " = " << pxl << ".g / (43.0 * " << ACES2::surround[2] << ");";
-    ss.newLine() << ss.floatDecl("A") << " = " << p.A_w << " * pow(" << pxl << ".r / 100.0, 1.0 / (" << ACES2::surround[1] << " * " << p.z << "));";
-    ss.newLine() << ss.floatDecl("a") << " = scale * cos(h);";
-    ss.newLine() << ss.floatDecl("b") << " = scale * sin(h);";
+    ss.newLine() << "{";
+    ss.indent();
 
-    ss.newLine() << ss.float3Decl("rgb_a") << ";";
-    ss.newLine() << "rgb_a.r = (460.0 * A + 451.0 * a + 288.0 *b) / 1403.0;";
-    ss.newLine() << "rgb_a.g = (460.0 * A - 891.0 * a - 261.0 *b) / 1403.0;";
-    ss.newLine() << "rgb_a.b = (460.0 * A - 220.0 * a - 6300.0 *b) / 1403.0;";
+    _Add_RGB_to_Aab_Shader(shaderCreator, ss, p);
+    _Add_Aab_to_JMh_Shader(shaderCreator, ss, p);
+    
+    ss.newLine() << pxl << ".rgb = JMh;";
 
-    ss.newLine() << ss.float3Decl("lms") << " = sign(rgb_a) * 100.0 / " << p.F_L << " * pow(27.13 * abs(rgb_a) / (400.0 - abs(rgb_a)), " << ss.float3Const(1.f / 0.42f) << ");";
-    ss.newLine() << "lms = " << "lms / " << ss.float3Const(p.D_RGB[0], p.D_RGB[1], p.D_RGB[2]) << ";";
+    ss.dedent();
+    ss.newLine() << "}";
+}
 
-    ss.newLine() << pxl << ".rgb = " << ss.mat3fMul(&p.MATRIX_CAM16_to_RGB[0], "lms") << ";";
+
+void _Add_JMh_to_Aab_Shader(
+    GpuShaderCreatorRcPtr & shaderCreator,
+    GpuShaderText & ss,
+    const ACES2::JMhParams & p)
+{
+    const std::string pxl(shaderCreator->getPixelName());
+
+    ss.newLine() << "{";
+    ss.indent();
+
+
+    ss.newLine() << "Aab.r = pow(JMh.r * " << 1.0f / ACES2::J_scale << ", " << p.inv_cz << ");";
+    ss.newLine() << "Aab.g = JMh.g * cos_hr;";
+    ss.newLine() << "Aab.b = JMh.g * sin_hr;";
+    
+    ss.dedent();
+    ss.newLine() << "}";
+}
+
+void _Add_Aab_to_RGB_Shader(
+    GpuShaderText & ss,
+    const ACES2::JMhParams & p)
+{
+    ss.newLine() << "{";
+    ss.indent();
+
+    ss.newLine() << ss.float3Decl("rgb_a") << " = " << ss.mat3fMul(&p.MATRIX_Aab_to_cone_response[0], "Aab.rgb") << ";";
+    ss.newLine() << ss.float3Decl("lms") << " = sign(rgb_a) * pow( " << ACES2::cam_nl_offset << " * abs(rgb_a) / (1.0f - abs(rgb_a)), " << ss.float3Const(1.f / 0.42f) << ");";
+    ss.newLine() << "JMh.rgb = " << ss.mat3fMul(&p.MATRIX_CAM16_c_to_RGB[0], "lms") << ";";
+
+    ss.dedent();
+    ss.newLine() << "}";
+}
+
+void _Add_JMh_to_RGB_Shader(
+    GpuShaderCreatorRcPtr & shaderCreator,
+    GpuShaderText & ss,
+    const ACES2::JMhParams & p)
+{
+    const std::string pxl(shaderCreator->getPixelName());
+
+    ss.newLine() << ss.float3Decl("JMh") << " = " << pxl << ".rgb;";
+    ss.newLine() << ss.float3Decl("Aab") << ";";
+    _Add_JMh_to_Aab_Shader(shaderCreator, ss, p);
+    _Add_Aab_to_RGB_Shader(ss, p);
+
+    ss.newLine() << pxl << ".rgb = JMh;";
 }
 
 std::string _Add_Reach_table(
@@ -430,12 +516,12 @@ std::string _Add_Reach_table(
     shaderCreator->addTexture(
         name.c_str(),
         GpuShaderText::getSamplerName(name).c_str(),
-        table.size,
+        table.total_size,
         1,
         GpuShaderCreator::TEXTURE_RED_CHANNEL,
         dimensions,
         INTERP_NEAREST,
-        &(table.table[0]));
+        &(table[0]));
 
 
     if (dimensions == GpuShaderDesc::TEXTURE_1D)
@@ -458,26 +544,22 @@ std::string _Add_Reach_table(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("hwrap") << " = h;";
-    ss.newLine() << "hwrap = hwrap - floor(hwrap / 360.0) * 360.0;";
-    ss.newLine() << "hwrap = (hwrap < 0.0) ? hwrap + 360.0 : hwrap;";
-
-    ss.newLine() << ss.floatDecl("i_lo") << " = floor(hwrap);";
-    ss.newLine() << ss.floatDecl("i_hi") << " = (i_lo + 1);";
-    ss.newLine() << "i_hi = i_hi - floor(i_hi / 360.0) * 360.0;";
+    ss.newLine() << ss.floatDecl("i_base") << " = floor(h);";
+    ss.newLine() << ss.floatDecl("i_lo") << " = i_base + " << table.base_index << ";";
+    ss.newLine() << ss.floatDecl("i_hi") << " = i_lo + 1;";
 
     if (dimensions == GpuShaderDesc::TEXTURE_1D)
     {
-        ss.newLine() << ss.floatDecl("lo") << " = " << ss.sampleTex1D(name, "(i_lo + 0.5) / 360.0") << ".r;";
-        ss.newLine() << ss.floatDecl("hi") << " = " << ss.sampleTex1D(name, "(i_hi + 0.5) / 360.0") << ".r;";
+        ss.newLine() << ss.floatDecl("lo") << " = " << ss.sampleTex1D(name, "(i_lo + 0.5) / " + std::to_string(table.total_size)) << ".r;";
+        ss.newLine() << ss.floatDecl("hi") << " = " << ss.sampleTex1D(name, "(i_hi + 0.5) / " + std::to_string(table.total_size)) << ".r;";
     }
     else
     {
-        ss.newLine() << ss.floatDecl("lo") << " = " << ss.sampleTex2D(name, ss.float2Const("(i_lo + 0.5) / 360.0", "0.0")) << ".r;";
-        ss.newLine() << ss.floatDecl("hi") << " = " << ss.sampleTex2D(name, ss.float2Const("(i_hi + 0.5) / 360.0", "0.5")) << ".r;";
+        ss.newLine() << ss.floatDecl("lo") << " = " << ss.sampleTex2D(name, ss.float2Const("(i_lo + 0.5) / " + std::to_string(table.total_size), "0.0")) << ".r;";
+        ss.newLine() << ss.floatDecl("hi") << " = " << ss.sampleTex2D(name, ss.float2Const("(i_hi + 0.5) / " + std::to_string(table.total_size), "0.5")) << ".r;";
     }
 
-    ss.newLine() << ss.floatDecl("t") << " = (h - i_lo) / (i_hi - i_lo);";
+    ss.newLine() << ss.floatDecl("t") << " = h - i_base;"; // Hardcoded single degree spacing
     ss.newLine() << "return " << ss.lerp("lo", "hi", "t") << ";";
 
     ss.dedent();
@@ -532,14 +614,98 @@ std::string _Add_Toe_func(
     return name;
 }
 
+// Emits a helper shader function named <prefix>_tonescale_{fwd|inv}<resourceIndex>
+// taking and returning a float J: the tonescale when !invert, its inverse when invert.
+// The function is added to the creator's helper shader code; its name is returned.
+std::string _Add_Tonescale_func(
+    GpuShaderCreatorRcPtr & shaderCreator, unsigned resourceIndex, bool invert,
+    const ACES2::JMhParams & p, const ACES2::ToneScaleParams & t)
+{
+    // Reserve name
+    std::ostringstream resName;
+    resName << shaderCreator->getResourcePrefix()
+            << std::string("_")
+            << std::string("tonescale")
+            << (invert ? std::string("_inv") : std::string("_fwd"))
+            << resourceIndex;
+
+    // Note: Remove potentially problematic double underscores from GLSL resource names.
+    std::string name(resName.str());
+    StringUtils::ReplaceInPlace(name, "__", "_");
+
+    GpuShaderText ss(shaderCreator->getLanguage());
+
+    ss.newLine() << ss.floatKeyword() << " " << name << "(float J)";
+    ss.newLine() << "{";
+    ss.indent();
+    // Tonescale applied in Y (convert to and from J). abs(J) discards the sign of J.
+    // Literals in the emitted text carry no 'f' suffix (not valid in GLSL ES 1.00).
+    ss.newLine() << ss.floatDecl("A") << " = " << p.A_w_J << " * pow(abs(J) * " << 1.0f / ACES2::J_scale << ", " << p.inv_cz << ");";
+    ss.newLine() << ss.floatDecl("Y") << " = pow(( " << ACES2::cam_nl_offset << " * A) / (1.0 - A), " << 1.0 / 0.42 << ");";
+
+    if (invert)
+    {
+        // Inverse Tonescale applied in Y (convert to and from J)
+        ss.newLine() << ss.floatDecl("Y_i") << " = Y / " << double(p.F_L_n) * ACES2::reference_luminance << ";";
+        // Clamp Y_i to [0, t.inverse_limit] before inverting the curve.
+        ss.newLine() << ss.floatDecl("Z") << " = max(0.0, min(" << t.inverse_limit << ", Y_i));";
+        ss.newLine() << ss.floatDecl("ht") << " = 0.5 * (Z + sqrt(Z * (" << 4.0 * t.t_1 << " + Z)));";
+        ss.newLine() << ss.floatDecl("Yo") << " = " << double(p.F_L_n) * t.s_2 << " / (pow((" << t.m_2 << " / ht), (" << 1.0 / t.g << ")) - 1.0);";
+
+        ss.newLine() << ss.floatDecl("F_L_Y") << " = pow(abs(Yo), 0.42);";
+    }
+    else
+    {
+        // Forward Tonescale applied in Y (convert to and from J)
+        ss.newLine() << ss.floatDecl("f") << " = " << t.m_2  << " * pow(Y / (Y + " << double(t.s_2) * p.F_L_n << "), " << t.g << ");";
+        ss.newLine() << ss.floatDecl("Y_ts") << " = max(0.0, f * f / (f + " << t.t_1 << "));";
+        ss.newLine() << ss.floatDecl("F_L_Y") << " = pow(" << double(p.F_L_n) * ACES2::reference_luminance << " * Y_ts, 0.42);";
+    }
+    // Both branches leave F_L_Y declared; convert it back to J.
+    ss.newLine() << ss.floatDecl("J_ts") << " = " << ACES2::J_scale << " * pow((F_L_Y / ( " << ACES2::cam_nl_offset << " + F_L_Y)) * " << p.inv_A_w_J << ", " << p.cz << ");";
+    ss.newLine() << "return J_ts;";
+
+    ss.dedent();
+    ss.newLine() << "}";
+
+    shaderCreator->addToHelperShaderCode(ss.string().c_str());
+
+    return name;
+}
+
+void _Add_ChromaCompressionNorm_Shader(
+    GpuShaderText & ss,                    // shader text the statements are appended to
+    const ACES2::ChromaCompressParams & c) // only c.chroma_compress_scale is read here
+{
+    // Emit shader code declaring float Mnorm; cos_hr / sin_hr are referenced but not declared here, so the enclosing shader code must provide them.
+    ss.newLine() << ss.floatDecl("Mnorm") << ";";
+    ss.newLine() << "{";
+    ss.indent();
+    // Double- and triple-angle identities derive the 2x and 3x angle terms from cos_hr / sin_hr.
+    ss.newLine() << ss.floatDecl("cos_hr2") << " = cos_hr * cos_hr - sin_hr * sin_hr;";
+    ss.newLine() << ss.floatDecl("sin_hr2") << " = 2.0 * cos_hr * sin_hr;";
+    ss.newLine() << ss.floatDecl("cos_hr3") << " = 4.0 * cos_hr * cos_hr * cos_hr - 3.0 * cos_hr;";
+    ss.newLine() << ss.floatDecl("sin_hr3") << " = 3.0 * sin_hr - 4.0 * sin_hr * sin_hr * sin_hr;";
+    ss.newLine() << ss.float3Decl("cosines") << " = " <<  ss.float3Const("cos_hr", "cos_hr2", "cos_hr3") <<";";
+    ss.newLine() << ss.float3Decl("cosine_weights") << " = " <<  ss.float3Const(11.34072 * c.chroma_compress_scale,
+                                                                                16.46899 * c.chroma_compress_scale,
+                                                                                7.88380 * c.chroma_compress_scale) <<";";
+    ss.newLine() << ss.float3Decl("sines") << " = " <<  ss.float3Const("sin_hr", "sin_hr2", "sin_hr3") <<";";
+    ss.newLine() << ss.float3Decl("sine_weights") << " = " <<  ss.float3Const(14.66441 * c.chroma_compress_scale,
+                                                                              -6.37224 * c.chroma_compress_scale,
+                                                                               9.19364 * c.chroma_compress_scale) <<";";
+    ss.newLine() << "Mnorm = dot(cosines, cosine_weights) + dot(sines, sine_weights) + " << 77.12896 * c.chroma_compress_scale<< ";";
+    // The emitted braces scope the temporaries above; Mnorm was declared before the opening brace and stays visible.
+    ss.dedent();
+    ss.newLine() << "}";
+}
+
 void _Add_Tonescale_Compress_Fwd_Shader(
     GpuShaderCreatorRcPtr & shaderCreator,
     GpuShaderText & ss,
     unsigned resourceIndex,
-    const ACES2::JMhParams & p,
-    const ACES2::ToneScaleParams & t,
-    const ACES2::ChromaCompressParams & c,
-    const std::string & reachName)
+    const ACES2::SharedCompressionParameters & s,
+    const ACES2::ChromaCompressParams & c)
 {
     std::string toeName = _Add_Toe_func(shaderCreator, resourceIndex, false);
 
@@ -549,15 +715,6 @@ void _Add_Tonescale_Compress_Fwd_Shader(
     ss.newLine() << ss.floatDecl("M") << " = " << pxl << ".g;";
     ss.newLine() << ss.floatDecl("h") << " = " << pxl << ".b;";
 
-    // Tonescale applied in Y (convert to and from J)
-    ss.newLine() << ss.floatDecl("A") << " = " << p.A_w_J << " * pow(abs(J) / 100.0, 1.0 / (" << ACES2::surround[1] << " * " << p.z << "));";
-    ss.newLine() << ss.floatDecl("Y") << " = sign(J) * 100.0 / " << p.F_L << " * pow((27.13 * A) / (400.0 - A), 1.0 / 0.42) / 100.0;";
-
-    ss.newLine() << ss.floatDecl("f") << " = " << t.m_2  << " * pow(max(0.0, Y) / (Y + " << t.s_2 << "), " << t.g << ");";
-    ss.newLine() << ss.floatDecl("Y_ts") << " = max(0.0, f * f / (f + " << t.t_1 << ")) * " << t.n_r << ";";
-
-    ss.newLine() << ss.floatDecl("F_L_Y") << " = pow(" << p.F_L << " * abs(Y_ts) / 100.0, 0.42);";
-    ss.newLine() << ss.floatDecl("J_ts") << " = sign(Y_ts) * 100.0 * pow(((400.0 * F_L_Y) / (27.13 + F_L_Y)) / " << p.A_w_J << ", " << ACES2::surround[1] << " * " << p.z << ");";
 
     // ChromaCompress
     ss.newLine() << ss.floatDecl("M_cp") << " = M;";
@@ -566,31 +723,13 @@ void _Add_Tonescale_Compress_Fwd_Shader(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("nJ") << " = J_ts / " << c.limit_J_max << ";";
+    ss.newLine() << ss.floatDecl("nJ") << " = J_ts / " << s.limit_J_max << ";";
     ss.newLine() << ss.floatDecl("snJ") << " = max(0.0, 1.0 - nJ);";
 
-    // Mnorm
-    ss.newLine() << ss.floatDecl("Mnorm") << ";";
-    ss.newLine() << "{";
-    ss.indent();
+    _Add_ChromaCompressionNorm_Shader(ss, c);
 
-    ss.newLine() << ss.floatDecl("PI") << " = 3.14159265358979;";
-    ss.newLine() << ss.floatDecl("h_rad") << " = h / 180.0 * PI;";
-    ss.newLine() << ss.floatDecl("a") << " = cos(h_rad);";
-    ss.newLine() << ss.floatDecl("b") << " = sin(h_rad);";
-    ss.newLine() << ss.floatDecl("cos_hr2") << " = a * a - b * b;";
-    ss.newLine() << ss.floatDecl("sin_hr2") << " = 2.0 * a * b;";
-    ss.newLine() << ss.floatDecl("cos_hr3") << " = 4.0 * a * a * a - 3.0 * a;";
-    ss.newLine() << ss.floatDecl("sin_hr3") << " = 3.0 * b - 4.0 * b * b * b;";
-    ss.newLine() << ss.floatDecl("M") << " = 11.34072 * a + 16.46899 * cos_hr2 + 7.88380 * cos_hr3 + 14.66441 * b + -6.37224 * sin_hr2 + 9.19364 * sin_hr3 + 77.12896;";
-    ss.newLine() << "Mnorm = M * " << c.chroma_compress_scale << ";";
-
-    ss.dedent();
-    ss.newLine() << "}";
-
-    ss.newLine() << ss.floatDecl("reachM") << " = " << reachName << "_sample(h);";
-    ss.newLine() << ss.floatDecl("limit") << " = pow(nJ, " << c.model_gamma << ") * reachM / Mnorm;";
-    ss.newLine() << "M_cp = M * pow(J_ts / J, " << c.model_gamma << ");";
+    ss.newLine() << ss.floatDecl("limit") << " = pow(nJ, " << s.model_gamma_inv << ") * reachMaxM / Mnorm;";
+    ss.newLine() << "M_cp = M * pow(J_ts / J, " << s.model_gamma_inv << ");";
     ss.newLine() << "M_cp = M_cp / Mnorm;";
 
     ss.newLine() << "M_cp = limit - " << toeName << "(limit - M_cp, limit - 0.001, snJ * " << c.sat << ", sqrt(nJ * nJ + " << c.sat_thr << "));";
@@ -607,30 +746,17 @@ void _Add_Tonescale_Compress_Inv_Shader(
     GpuShaderCreatorRcPtr & shaderCreator,
     GpuShaderText & ss,
     unsigned resourceIndex,
-    const ACES2::JMhParams & p,
-    const ACES2::ToneScaleParams & t,
-    const ACES2::ChromaCompressParams & c,
-    const std::string & reachName)
+    const ACES2::SharedCompressionParameters & s,
+    const ACES2::ChromaCompressParams & c)
 {
     std::string toeName = _Add_Toe_func(shaderCreator, resourceIndex, true);
 
     const std::string pxl(shaderCreator->getPixelName());
-
+    
     ss.newLine() << ss.floatDecl("J_ts") << " = " << pxl << ".r;";
     ss.newLine() << ss.floatDecl("M_cp") << " = " << pxl << ".g;";
     ss.newLine() << ss.floatDecl("h") << " = " << pxl << ".b;";
 
-    // Inverse Tonescale applied in Y (convert to and from J)
-    ss.newLine() << ss.floatDecl("A") << " = " << p.A_w_J << " * pow(abs(J_ts) / 100.0, 1.0 / (" << ACES2::surround[1] << " * " << p.z << "));";
-    ss.newLine() << ss.floatDecl("Y_ts") << " = sign(J_ts) * 100.0 / " << p.F_L << " * pow((27.13 * A) / (400.0 - A), 1.0 / 0.42) / 100.0;";
-
-    ss.newLine() << ss.floatDecl("Z") << " = max(0.0, min(" << t.n << " / (" << t.u_2 * t.n_r << "), Y_ts));";
-    ss.newLine() << ss.floatDecl("ht") << " = (Z + sqrt(Z * (4.0 * " << t.t_1 << " + Z))) / 2.0;";
-    ss.newLine() << ss.floatDecl("Y") << " = " << t.s_2 << " / (pow((" << t.m_2 << " / ht), (1.0 / " << t.g << ")) - 1.0);";
-
-    ss.newLine() << ss.floatDecl("F_L_Y") << " = pow(" << p.F_L << " * abs(Y * 100.0) / 100.0, 0.42);";
-    ss.newLine() << ss.floatDecl("J") << " = sign(Y) * 100.0 * pow(((400.0 * F_L_Y) / (27.13 + F_L_Y)) / " << p.A_w_J << ", " << ACES2::surround[1] << " * " << p.z << ");";
-
     // ChromaCompress
     ss.newLine() << ss.floatDecl("M") << " = M_cp;";
 
@@ -638,36 +764,18 @@ void _Add_Tonescale_Compress_Inv_Shader(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("nJ") << " = J_ts / " << c.limit_J_max << ";";
+    ss.newLine() << ss.floatDecl("nJ") << " = J_ts / " << s.limit_J_max << ";";
     ss.newLine() << ss.floatDecl("snJ") << " = max(0.0, 1.0 - nJ);";
 
-    // Mnorm
-    ss.newLine() << ss.floatDecl("Mnorm") << ";";
-    ss.newLine() << "{";
-    ss.indent();
+    _Add_ChromaCompressionNorm_Shader(ss, c);
 
-    ss.newLine() << ss.floatDecl("PI") << " = 3.14159265358979;";
-    ss.newLine() << ss.floatDecl("h_rad") << " = h / 180.0 * PI;";
-    ss.newLine() << ss.floatDecl("a") << " = cos(h_rad);";
-    ss.newLine() << ss.floatDecl("b") << " = sin(h_rad);";
-    ss.newLine() << ss.floatDecl("cos_hr2") << " = a * a - b * b;";
-    ss.newLine() << ss.floatDecl("sin_hr2") << " = 2.0 * a * b;";
-    ss.newLine() << ss.floatDecl("cos_hr3") << " = 4.0 * a * a * a - 3.0 * a;";
-    ss.newLine() << ss.floatDecl("sin_hr3") << " = 3.0 * b - 4.0 * b * b * b;";
-    ss.newLine() << ss.floatDecl("M") << " = 11.34072 * a + 16.46899 * cos_hr2 + 7.88380 * cos_hr3 + 14.66441 * b + -6.37224 * sin_hr2 + 9.19364 * sin_hr3 + 77.12896;";
-    ss.newLine() << "Mnorm = M * " << c.chroma_compress_scale << ";";
-
-    ss.dedent();
-    ss.newLine() << "}";
-
-    ss.newLine() << ss.floatDecl("reachM") << " = " << reachName << "_sample(h);";
-    ss.newLine() << ss.floatDecl("limit") << " = pow(nJ, " << c.model_gamma << ") * reachM / Mnorm;";
+    ss.newLine() << ss.floatDecl("limit") << " = pow(nJ, " << s.model_gamma_inv << ") * reachMaxM / Mnorm;";
 
     ss.newLine() << "M = M_cp / Mnorm;";
     ss.newLine() << "M = " << toeName << "(M, limit, nJ * " << c.compr << ", snJ);";
     ss.newLine() << "M = limit - " << toeName << "(limit - M, limit - 0.001, snJ * " << c.sat << ", sqrt(nJ * nJ + " << c.sat_thr << "));";
     ss.newLine() << "M = M * Mnorm;";
-    ss.newLine() << "M = M * pow(J_ts / J, " << -c.model_gamma << ");";
+    ss.newLine() << "M = M * pow(J_ts / J, " << -s.model_gamma_inv << ");";
 
     ss.dedent();
     ss.newLine() << "}";
@@ -708,7 +816,7 @@ std::string _Add_Cusp_table(
         GpuShaderCreator::TEXTURE_RGB_CHANNEL,
         dimensions,
         INTERP_NEAREST,
-        &(g.gamut_cusp_table.table[0][0]));
+        &(g.gamut_cusp_table[0][0]));
 
     if (dimensions == GpuShaderDesc::TEXTURE_1D)
     {
@@ -727,24 +835,20 @@ std::string _Add_Cusp_table(
     GpuShaderText ss(shaderCreator->getLanguage());
 
     const std::string hues_array_name = name + "_hues_array";
-    std::vector<float> hues_array(g.gamut_cusp_table.total_size);
-    for (int i = 0; i < g.gamut_cusp_table.total_size; ++i)
-    {
-        hues_array[i] = g.gamut_cusp_table.table[i][2];
-    }
-    ss.declareFloatArrayConst(hues_array_name, (int) hues_array.size(), hues_array.data());
+    ss.declareFloatArrayConst(hues_array_name, (int) g.hue_table.total_size, g.hue_table.data());
 
-    ss.newLine() << ss.float2Keyword() << " " << name << "_sample(float h)";
+    ss.newLine() << ss.float3Keyword() << " " << name << "_sample(float h)";
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("i_lo") << " = 0;";
-    ss.newLine() << ss.floatDecl("i_hi") << " = " << g.gamut_cusp_table.base_index + g.gamut_cusp_table.size << ";";
+    ss.newLine() << ss.intDecl("i") << " = " << ss.intKeyword() << "(h) + " << g.gamut_cusp_table.base_index << ";";
 
-    ss.newLine() << ss.floatDecl("hwrap") << " = h;";
-    ss.newLine() << "hwrap = hwrap - floor(hwrap / 360.0) * 360.0;";
-    ss.newLine() << "hwrap = (hwrap < 0.0) ? hwrap + 360.0 : hwrap;";
-    ss.newLine() << ss.intDecl("i") << " = " << ss.intKeyword() << "(hwrap + " << g.gamut_cusp_table.base_index << ");";
+    ss.newLine() << ss.intDecl("i_lo") << " = " << ss.intKeyword() << "(max("
+                 << ss.floatKeyword() << "(" << g.gamut_cusp_table.lower_wrap_index << "), "
+                 << ss.floatKeyword() << "(i + " << g.hue_linearity_search_range[0] << ")));";
+    ss.newLine() << ss.intDecl("i_hi") << " = " << ss.intKeyword() << "(min("
+                 << ss.floatKeyword() << "(" << g.gamut_cusp_table.upper_wrap_index << "), "
+                 << ss.floatKeyword() << "(i + " << g.hue_linearity_search_range[1] << ")));";
 
     ss.newLine() << "while (i_lo + 1 < i_hi)";
     ss.newLine() << "{";
@@ -764,7 +868,7 @@ std::string _Add_Cusp_table(
     ss.newLine() << "i_hi = i;";
     ss.dedent();
     ss.newLine() << "}";
-    ss.newLine() << "i = " << ss.intKeyword() << "((i_lo + i_hi) / 2.0);";
+    ss.newLine() << "i = (i_lo + i_hi) / 2;";
 
     ss.dedent();
     ss.newLine() << "}";
@@ -780,98 +884,8 @@ std::string _Add_Cusp_table(
         ss.newLine() << ss.float3Decl("hi") << " = " << ss.sampleTex2D(name, ss.float2Const(std::string("(i_hi + 0.5) / ") + std::to_string(g.gamut_cusp_table.total_size), "0.5")) << ".rgb;";
     }
 
-    ss.newLine() << ss.floatDecl("t") << " = (h - lo.b) / (hi.b - lo.b);";
-    ss.newLine() << ss.floatDecl("cuspJ") << " = " << ss.lerp("lo.r", "hi.r", "t") << ";";
-    ss.newLine() << ss.floatDecl("cuspM") << " = " << ss.lerp("lo.g", "hi.g", "t") << ";";
-
-    ss.newLine() << "return " << ss.float2Const("cuspJ", "cuspM") << ";";
-
-    ss.dedent();
-    ss.newLine() << "}";
-
-    shaderCreator->addToHelperShaderCode(ss.string().c_str());
-
-    return name;
-}
-
-std::string _Add_Gamma_table(
-    GpuShaderCreatorRcPtr & shaderCreator,
-    unsigned resourceIndex,
-    const ACES2::GamutCompressParams & g)
-{
-    // Reserve name
-    std::ostringstream resName;
-    resName << shaderCreator->getResourcePrefix()
-            << std::string("_")
-            << std::string("upper_hull_gamma_table_")
-            << resourceIndex;
-
-    // Note: Remove potentially problematic double underscores from GLSL resource names.
-    std::string name(resName.str());
-    StringUtils::ReplaceInPlace(name, "__", "_");
-
-    // Register texture
-    GpuShaderDesc::TextureDimensions dimensions = GpuShaderDesc::TEXTURE_1D;
-    if (shaderCreator->getLanguage() == GPU_LANGUAGE_GLSL_ES_1_0
-        || shaderCreator->getLanguage() == GPU_LANGUAGE_GLSL_ES_3_0
-        || !shaderCreator->getAllowTexture1D())
-    {
-        dimensions = GpuShaderDesc::TEXTURE_2D;
-    }
-
-    shaderCreator->addTexture(
-        name.c_str(),
-        GpuShaderText::getSamplerName(name).c_str(),
-        g.gamut_cusp_table.total_size,
-        1,
-        GpuShaderCreator::TEXTURE_RED_CHANNEL,
-        dimensions,
-        INTERP_NEAREST,
-        &(g.upper_hull_gamma_table.table[0]));
-
-
-    if (dimensions == GpuShaderDesc::TEXTURE_1D)
-    {
-        GpuShaderText ss(shaderCreator->getLanguage());
-        ss.declareTex1D(name);
-        shaderCreator->addToDeclareShaderCode(ss.string().c_str());
-    }
-    else
-    {
-        GpuShaderText ss(shaderCreator->getLanguage());
-        ss.declareTex2D(name);
-        shaderCreator->addToDeclareShaderCode(ss.string().c_str());
-    }
-
-    // Sampler function
-    GpuShaderText ss(shaderCreator->getLanguage());
-
-    ss.newLine() << ss.floatKeyword() << " " << name << "_sample(float h)";
-    ss.newLine() << "{";
-    ss.indent();
-
-    ss.newLine() << ss.floatDecl("hwrap") << " = h;";
-    ss.newLine() << "hwrap = hwrap - floor(hwrap / 360.0) * 360.0;";
-    ss.newLine() << "hwrap = (hwrap < 0.0) ? hwrap + 360.0 : hwrap;";
-
-    ss.newLine() << ss.floatDecl("i_lo") << " = floor(hwrap) + " << g.upper_hull_gamma_table.base_index << ";";
-    ss.newLine() << ss.floatDecl("i_hi") << " = (i_lo + 1);";
-    ss.newLine() << "i_hi = i_hi - floor(i_hi / 360.0) * 360.0;";
-
-    ss.newLine() << ss.floatDecl("base_hue") << " = i_lo - " << g.upper_hull_gamma_table.base_index << ";";
-
-    if (dimensions == GpuShaderDesc::TEXTURE_1D)
-    {
-        ss.newLine() << ss.floatDecl("lo") << " = " << ss.sampleTex1D(name, std::string("(i_lo + 0.5) / ") + std::to_string(g.upper_hull_gamma_table.total_size)) << ".r;";
-        ss.newLine() << ss.floatDecl("hi") << " = " << ss.sampleTex1D(name, std::string("(i_hi + 0.5) / ") + std::to_string(g.upper_hull_gamma_table.total_size)) << ".r;";
-    }
-    else
-    {
-        ss.newLine() << ss.floatDecl("lo") << " = " << ss.sampleTex2D(name, ss.float2Const(std::string("(i_lo + 0.5) / ") + std::to_string(g.upper_hull_gamma_table.total_size), "0.5")) << ".r;";
-        ss.newLine() << ss.floatDecl("hi") << " = " << ss.sampleTex2D(name, ss.float2Const(std::string("(i_hi + 0.5) / ") + std::to_string(g.upper_hull_gamma_table.total_size), "0.5")) << ".r;";
-    }
-
-    ss.newLine() << ss.floatDecl("t") << " = hwrap - base_hue;";
+    ss.newLine() << ss.floatDecl("t") << " = (h - " << hues_array_name << "[i_hi - 1]) / "
+                                                "(" << hues_array_name << "[i_hi]" << " - " << hues_array_name << "[i_hi - 1]);";
     ss.newLine() << "return " << ss.lerp("lo", "hi", "t") << ";";
 
     ss.dedent();
@@ -885,7 +899,7 @@ std::string _Add_Gamma_table(
 std::string _Add_Focus_Gain_func(
     GpuShaderCreatorRcPtr & shaderCreator,
     unsigned resourceIndex,
-    const ACES2::GamutCompressParams & g)
+    const ACES2::SharedCompressionParameters & s)
 {
     // Reserve name
     std::ostringstream resName;
@@ -904,13 +918,15 @@ std::string _Add_Focus_Gain_func(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("thr") << " = " << ss.lerp("cuspJ", std::to_string(g.limit_J_max), std::to_string(ACES2::focus_gain_blend)) << ";";
+    ss.newLine() << ss.floatDecl("thr") << " = " << ss.lerp("cuspJ", std::to_string(s.limit_J_max), std::to_string(ACES2::focus_gain_blend)) << ";";
 
-    ss.newLine() << "if (J > thr)";
+    ss.newLine() << "if (J > thr)"; // TODO threshold
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << ss.floatDecl("gain") << " = ( " << g.limit_J_max << " - thr) / max(0.0001, (" << g.limit_J_max << " - min(" << g.limit_J_max << ", J)));";
-    ss.newLine() << "return pow(log(gain)/log(10.0), 1.0 / " << ACES2::focus_adjust_gain << ") + 1.0;";
+    ss.newLine() << ss.floatDecl("gain") << " = ( " << s.limit_J_max << " - thr) / max(0.0001, " << s.limit_J_max << " - J);";
+    ss.newLine() << "gain = log(gain)/log(10.0);";  // TODO log10(gain) but not all shading languages have log10() would log2(gain)/log2(10) be better? perhaps delegate to GpuShaderText?
+    //ss.newLine() << "return pow(gain, " << ACES2::focus_adjust_gain_inv << ") + 1.0;"; // TODO; remove once agreed on change
+    ss.newLine() << "return gain * gain + 1.0;";
     ss.dedent();
     ss.newLine() << "}";
     ss.newLine() << "else";
@@ -931,7 +947,7 @@ std::string _Add_Focus_Gain_func(
 std::string _Add_Solve_J_Intersect_func(
     GpuShaderCreatorRcPtr & shaderCreator,
     unsigned resourceIndex,
-    const ACES2::GamutCompressParams & g)
+    const ACES2::SharedCompressionParameters & s)
 {
     // Reserve name
     std::ostringstream resName;
@@ -950,54 +966,30 @@ std::string _Add_Solve_J_Intersect_func(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("a") << " = " << "M / (focusJ * slope_gain);";
-    ss.newLine() << ss.floatDecl("b") << " = 0.0;";
-    ss.newLine() << ss.floatDecl("c") << " = 0.0;";
-    ss.newLine() << ss.floatDecl("intersectJ") << " = 0.0;";
-
-    ss.newLine() << "if (J < focusJ)";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "b = 1.0 - M / slope_gain;";
-    ss.dedent();
-    ss.newLine() << "}";
-    ss.newLine() << "else";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "b = - (1.0 + M / slope_gain + " << g.limit_J_max << " * M / (focusJ * slope_gain));";
-    ss.dedent();
-    ss.newLine() << "}";
-
-    ss.newLine() << "if (J < focusJ)";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "c = -J;";
-    ss.dedent();
-    ss.newLine() << "}";
-    ss.newLine() << "else";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "c = " << g.limit_J_max << " * M / slope_gain + J;";
-    ss.dedent();
-    ss.newLine() << "}";
-
-    ss.newLine() << ss.floatDecl("root") << " = sqrt(b * b - 4.0 * a * c);";
+    ss.newLine() << ss.floatDecl("M_scaled") << " = M / slope_gain;";
+    ss.newLine() << ss.floatDecl("a") << " = M_scaled / focusJ;";
 
     ss.newLine() << "if (J < focusJ)";
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << "intersectJ = 2.0 * c / (-b - root);";
+    ss.newLine() << ss.floatDecl("b") << " = 1.0 - M_scaled;";
+    ss.newLine() << ss.floatDecl("c") << " = -J;";
+    ss.newLine() << ss.floatDecl("det") << " =  b * b - 4.f * a * c;";
+    ss.newLine() << ss.floatDecl("root") << " =  sqrt(det);";
+    ss.newLine() << "return -2.0 * c / (b + root);";
     ss.dedent();
     ss.newLine() << "}";
     ss.newLine() << "else";
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << "intersectJ = 2.0 * c / (-b + root);";
+    ss.newLine() << ss.floatDecl("b") << " = - (1.0 + M_scaled + " << s.limit_J_max << " * a);";
+    ss.newLine() << ss.floatDecl("c") << " = " << s.limit_J_max << " * M_scaled + J;";
+    ss.newLine() << ss.floatDecl("det") << " =  b * b - 4.f * a * c;";
+    ss.newLine() << ss.floatDecl("root") << " =  sqrt(det);";
+    ss.newLine() << "return -2.0 * c / (b - root);";
     ss.dedent();
     ss.newLine() << "}";
 
-    ss.newLine() << "return intersectJ;";
-
     ss.dedent();
     ss.newLine() << "}";
 
@@ -1009,8 +1001,7 @@ std::string _Add_Solve_J_Intersect_func(
 std::string _Add_Find_Gamut_Boundary_Intersection_func(
     GpuShaderCreatorRcPtr & shaderCreator,
     unsigned resourceIndex,
-    const ACES2::GamutCompressParams & g,
-    const std::string & solveJIntersectName)
+    const ACES2::SharedCompressionParameters & s)
 {
     // Reserve name
     std::ostringstream resName;
@@ -1025,49 +1016,28 @@ std::string _Add_Find_Gamut_Boundary_Intersection_func(
 
     GpuShaderText ss(shaderCreator->getLanguage());
 
-    ss.newLine() << ss.float3Keyword() << " " << name << "(" << ss.float3Keyword() << " JMh_s, " << ss.float2Keyword() << " JM_cusp_in, float J_focus, float slope_gain, float gamma_top, float gamma_bottom)";
+    ss.newLine() << ss.floatKeyword() << " " << name << "(" << ss.float2Keyword()
+                 << " JM_cusp, float gamma_top_inv, float gamma_bottom_inv, float J_intersect_source, float J_intersect_cusp, float slope)";
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.float2Decl("JM_cusp") << " = " << ss.float2Const("JM_cusp_in.r", std::string("JM_cusp_in.g * (1.0 + ") + std::to_string(ACES2::smooth_m) + " * " + std::to_string(ACES2::smooth_cusps) + ")") << ";";
-
-    ss.newLine() << ss.floatDecl("J_intersect_source") << " = " << solveJIntersectName << "(JMh_s.r, JMh_s.g, J_focus, slope_gain);";
-    ss.newLine() << ss.floatDecl("J_intersect_cusp") << " = " << solveJIntersectName << "(JM_cusp.r, JM_cusp.g, J_focus, slope_gain);";
-
-    ss.newLine() << ss.floatDecl("slope") << " = 0.0;";
-    ss.newLine() << "if (J_intersect_source < J_focus)";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "slope = J_intersect_source * (J_intersect_source - J_focus) / (J_focus * slope_gain);";
-    ss.dedent();
-    ss.newLine() << "}";
-    ss.newLine() << "else";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "slope = (" << g.limit_J_max << " - J_intersect_source) * (J_intersect_source - J_focus) / (J_focus * slope_gain);";
-    ss.dedent();
-    ss.newLine() << "}";
-
-    ss.newLine() << ss.floatDecl("M_boundary_lower ") << " = J_intersect_cusp * pow(J_intersect_source / J_intersect_cusp, 1.0 / gamma_bottom) / (JM_cusp.r / JM_cusp.g - slope);";
-    ss.newLine() << ss.floatDecl("M_boundary_upper") << " = JM_cusp.g * (" << g.limit_J_max << " - J_intersect_cusp) * pow((" << g.limit_J_max << " - J_intersect_source) / (" << g.limit_J_max << " - J_intersect_cusp), 1.0 / gamma_top) / (slope * JM_cusp.g + " << g.limit_J_max << " - JM_cusp.r);";
+    ss.newLine() << ss.floatDecl("M_boundary_lower") << " = J_intersect_cusp * pow(J_intersect_source / J_intersect_cusp, gamma_bottom_inv) / (JM_cusp.r / JM_cusp.g - slope);";
+    ss.newLine() << ss.floatDecl("M_boundary_upper") << " = JM_cusp.g * (" << s.limit_J_max << " - J_intersect_cusp) * pow((" << s.limit_J_max << " - J_intersect_source) / (" << s.limit_J_max << " - J_intersect_cusp), gamma_top_inv) / (slope * JM_cusp.g + " << s.limit_J_max << " - JM_cusp.r);";
 
     ss.newLine() << ss.floatDecl("smin") << " = 0.0;";
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << ss.floatDecl("a") << " = M_boundary_lower / JM_cusp.g;";
-    ss.newLine() << ss.floatDecl("b") << " = M_boundary_upper / JM_cusp.g;";
-    ss.newLine() << ss.floatDecl("s") << " = " << ACES2::smooth_cusps << ";";
+    ss.newLine() << ss.floatDecl("a") << " = M_boundary_lower;";
+    ss.newLine() << ss.floatDecl("b") << " = M_boundary_upper;";
+    ss.newLine() << ss.floatDecl("s") << " = " << ACES2::smooth_cusps << " * JM_cusp.g;";
 
     ss.newLine() << ss.floatDecl("h") << " = max(s - abs(a - b), 0.0) / s;";
-    ss.newLine() << "smin = min(a, b) - h * h * h * s * (1.0 / 6.0);";
+    ss.newLine() << "smin = min(a, b) - h * h * h * s * " << (1.0 / 6.0) << ";";
 
     ss.dedent();
     ss.newLine() << "}";
 
-    ss.newLine() << ss.floatDecl("M_boundary") << " = JM_cusp.g * smin;";
-    ss.newLine() << ss.floatDecl("J_boundary") << "= J_intersect_source + slope * M_boundary;";
-
-    ss.newLine() << "return " << ss.float3Const("J_boundary", "M_boundary", "J_intersect_source") << ";";
+    ss.newLine() << "return smin;";
 
     ss.dedent();
     ss.newLine() << "}";
@@ -1077,19 +1047,17 @@ std::string _Add_Find_Gamut_Boundary_Intersection_func(
     return name;
 }
 
-std::string _Add_Reach_Boundary_func(
+std::string _Add_Compression_func(
     GpuShaderCreatorRcPtr & shaderCreator,
     unsigned resourceIndex,
-    const ACES2::GamutCompressParams & g,
-    const std::string & reachName,
-    const std::string & getFocusGainName,
-    const std::string & solveJIntersectName)
+    bool invert)
 {
     // Reserve name
     std::ostringstream resName;
     resName << shaderCreator->getResourcePrefix()
             << std::string("_")
-            << std::string("get_reach_boundary")
+            << std::string("remap_M")
+            << (invert ? std::string("_inv") : std::string("_fwd"))
             << resourceIndex;
 
     // Note: Remove potentially problematic double underscores from GLSL resource names.
@@ -1098,102 +1066,47 @@ std::string _Add_Reach_Boundary_func(
 
     GpuShaderText ss(shaderCreator->getLanguage());
 
-    ss.newLine() << ss.float3Keyword() << " " << name << "(float J, float M, float h, " << ss.float2Keyword() << " JMcusp, float focusJ)";
+    ss.newLine() << ss.floatKeyword() << " " << name << "(float M, float gamut_boundary_M, float reach_boundary_M)";
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(h);";
-    ss.newLine() << ss.floatDecl("slope_gain") << " = " << g.limit_J_max << " * " << g.focus_dist << " * " << getFocusGainName << "(J, JMcusp.r);";
-    ss.newLine() << ss.floatDecl("intersectJ") << " = " << solveJIntersectName << "(J, M, focusJ, slope_gain);";
+    ss.newLine() << ss.floatDecl("boundary_ratio") << " = gamut_boundary_M / reach_boundary_M;";
+    ss.newLine() << ss.floatDecl("proportion") << " = max(boundary_ratio, " << ACES2::compression_threshold << ");";
+    ss.newLine() << ss.floatDecl("threshold") << " = proportion * gamut_boundary_M;";
 
-    ss.newLine() << ss.floatDecl("slope") << " = 0.0;";
-    ss.newLine() << "if (intersectJ < focusJ)";
-    ss.newLine() << "{";
-    ss.indent();
-    ss.newLine() << "slope = intersectJ * (intersectJ - focusJ) / (focusJ * slope_gain);";
-    ss.dedent();
-    ss.newLine() << "}";
-    ss.newLine() << "else";
+    ss.newLine() << "if (proportion >= 1.0f || M <= threshold)";
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << "slope = (" << g.limit_J_max << " - intersectJ) * (intersectJ - focusJ) / (focusJ * slope_gain);";
+    ss.newLine() << "return M;";
     ss.dedent();
     ss.newLine() << "}";
+    ss.newLine() << ss.floatDecl("m_offset") << " = M - threshold;";
+    ss.newLine() << ss.floatDecl("gamut_offset") << " = gamut_boundary_M - threshold;";
+    ss.newLine() << ss.floatDecl("reach_offset") << " = reach_boundary_M - threshold;";
 
-    ss.newLine() << ss.floatDecl("boundary") << " = " << g.limit_J_max << " * pow(intersectJ / " << g.limit_J_max << ", " << g.model_gamma << ") * reachMaxM / (" << g.limit_J_max << " - slope * reachMaxM);";
-
-    ss.newLine() << "return " << ss.float3Const("J", "boundary", "h") << ";";
-
-    ss.dedent();
-    ss.newLine() << "}";
-
-    shaderCreator->addToHelperShaderCode(ss.string().c_str());
-
-    return name;
-}
-
-std::string _Add_Compression_func(
-    GpuShaderCreatorRcPtr & shaderCreator,
-    unsigned resourceIndex,
-    bool invert)
-{
-    // Reserve name
-    std::ostringstream resName;
-    resName << shaderCreator->getResourcePrefix()
-            << std::string("_")
-            << std::string("compression")
-            << (invert ? std::string("_inv") : std::string("_fwd"))
-            << resourceIndex;
-
-    // Note: Remove potentially problematic double underscores from GLSL resource names.
-    std::string name(resName.str());
-    StringUtils::ReplaceInPlace(name, "__", "_");
-
-    GpuShaderText ss(shaderCreator->getLanguage());
-
-    ss.newLine() << ss.floatKeyword() << " " << name << "(float v, float thr, float lim)";
-    ss.newLine() << "{";
-    ss.indent();
-
-    ss.newLine() << ss.floatDecl("s") << " = (lim - thr) * (1.0 - thr) / (lim - 1.0);";
-    ss.newLine() << ss.floatDecl("nd") << " = (v - thr) / s;";
-
-
-    ss.newLine() << ss.floatDecl("vCompressed") << " = 0.0;";
+    ss.newLine() << ss.floatDecl("scale") << " = reach_offset / ((reach_offset / gamut_offset) - 1.0f);";
+    ss.newLine() << ss.floatDecl("nd") << " = m_offset / scale;";
 
     if (invert)
     {
-        ss.newLine() << "if (v < thr || lim <= 1.0001 || v > thr + s)";
+        ss.newLine() << "if (nd >= 1.0f)"; // TODO: could be done branchless?
         ss.newLine() << "{";
         ss.indent();
-        ss.newLine() << "vCompressed = v;";
+        ss.newLine() << "return threshold + scale;";
         ss.dedent();
         ss.newLine() << "}";
         ss.newLine() << "else";
         ss.newLine() << "{";
         ss.indent();
-        ss.newLine() << "vCompressed = thr + s * (-nd / (nd - 1));";
+        ss.newLine() << "return threshold + scale * -(nd / (nd - 1.0f));";
         ss.dedent();
         ss.newLine() << "}";
     }
     else
     {
-        ss.newLine() << "if (v < thr || lim <= 1.0001)";
-        ss.newLine() << "{";
-        ss.indent();
-        ss.newLine() << "vCompressed = v;";
-        ss.dedent();
-        ss.newLine() << "}";
-        ss.newLine() << "else";
-        ss.newLine() << "{";
-        ss.indent();
-        ss.newLine() << "vCompressed = thr + s * nd / (1.0 + nd);";
-        ss.dedent();
-        ss.newLine() << "}";
+        ss.newLine() << "return threshold + scale * nd / (1.0f + nd);";
     }
 
-    ss.newLine() << "return vCompressed;";
-
     ss.dedent();
     ss.newLine() << "}";
 
@@ -1205,13 +1118,12 @@ std::string _Add_Compression_func(
 std::string _Add_Compress_Gamut_func(
     GpuShaderCreatorRcPtr & shaderCreator,
     unsigned resourceIndex,
+    const ACES2::SharedCompressionParameters & s,
     const ACES2::GamutCompressParams & g,
-    const std::string & cuspName,
     const std::string & getFocusGainName,
-    const std::string & gammaName,
     const std::string & findGamutBoundaryIntersectionName,
-    const std::string & getReachBoundaryName,
-    const std::string & compressionName)
+    const std::string & compressionName,
+    const std::string & solveJIntersectName)
 {
     // Reserve name
     std::ostringstream resName;
@@ -1226,7 +1138,7 @@ std::string _Add_Compress_Gamut_func(
 
     GpuShaderText ss(shaderCreator->getLanguage());
 
-    ss.newLine() << ss.float3Keyword() << " " << name << "(" << ss.float3Keyword() << " JMh, float Jx)";
+    ss.newLine() << ss.float3Keyword() << " " << name << "(" << ss.float3Keyword() << " JMh, float Jx, " << ss.float3Keyword() << " JMGcusp, float reachMaxM)";
     ss.newLine() << "{";
     ss.indent();
 
@@ -1234,7 +1146,7 @@ std::string _Add_Compress_Gamut_func(
     ss.newLine() << ss.floatDecl("M") << " = JMh.g;";
     ss.newLine() << ss.floatDecl("h") << " = JMh.b;";
 
-    ss.newLine() << "if (M < 0.0001 || J > " << g.limit_J_max << ")";
+    ss.newLine() << "if (M <= 0.0 || J > " << s.limit_J_max << ")";
     ss.newLine() << "{";
     ss.indent();
     ss.newLine() << "return " << ss.float3Const("J", "0.0", "h") << ";";
@@ -1244,40 +1156,36 @@ std::string _Add_Compress_Gamut_func(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.float2Decl("project_from") << " = " << ss.float2Const("J", "M") << ";";
-    ss.newLine() << ss.float2Decl("JMcusp") << " = " << cuspName << "_sample(h);";
+    ss.newLine() << ss.float2Decl("JMcusp") << " = JMGcusp.rg;";
 
-    ss.newLine() << ss.floatDecl("focusJ") << " = " << ss.lerp("JMcusp.r", std::to_string(g.mid_J), std::string("min(1.0, ") + std::to_string(ACES2::cusp_mid_blend) + " - (JMcusp.r / " + std::to_string(g.limit_J_max)) << "));";
-    ss.newLine() << ss.floatDecl("slope_gain") << " = " << g.limit_J_max << " * " << g.focus_dist << " * " << getFocusGainName << "(Jx, JMcusp.r);";
+    ss.newLine() << ss.floatDecl("focusJ") << " = " << ss.lerp("JMcusp.r", std::to_string(g.mid_J), std::string("min(1.0, ") + std::to_string(ACES2::cusp_mid_blend) + " - (JMcusp.r / " + std::to_string(s.limit_J_max)) << "));";
+    ss.newLine() << ss.floatDecl("slope_gain") << " = " << s.limit_J_max * g.focus_dist << " * " << getFocusGainName << "(Jx, JMcusp.r);";
+    ss.newLine() << ss.floatDecl("J_intersect_source") << " = " << solveJIntersectName << "(JMh.r, JMh.g, focusJ, slope_gain);";
+    ss.newLine() << ss.floatDecl("gamut_slope") << " = (J_intersect_source < focusJ) ? J_intersect_source : (" << s.limit_J_max << " - J_intersect_source);";
+    ss.newLine() << "gamut_slope = gamut_slope * (J_intersect_source - focusJ) / (focusJ * slope_gain);";
 
-    ss.newLine() << ss.floatDecl("gamma_top") << " = " << gammaName << "_sample(h);";
-    ss.newLine() << ss.floatDecl("gamma_bottom") << " = " << g.lower_hull_gamma << ";";
+    ss.newLine() << ss.floatDecl("gamma_top_inv") << " = JMGcusp.b;";
+    ss.newLine() << ss.floatDecl("gamma_bottom_inv") << " = " << g.lower_hull_gamma_inv << ";"; // TODO move to where it is used
 
-    ss.newLine() << ss.float3Decl("boundaryReturn") << " = " << findGamutBoundaryIntersectionName << "(" << ss.float3Const("J", "M", "h") << ", JMcusp, focusJ, slope_gain, gamma_top, gamma_bottom);";
-    ss.newLine() << ss.float2Decl("JMboundary") << " = " << ss.float2Const("boundaryReturn.r", "boundaryReturn.g") << ";";
-    ss.newLine() << ss.float2Decl("project_to") << " = " << ss.float2Const("boundaryReturn.b", "0.0") << ";";
+    ss.newLine() << ss.floatDecl("J_intersect_cusp") << " = " << solveJIntersectName << "(JMcusp.r, JMcusp.g, focusJ, slope_gain);";
+    ss.newLine() << ss.floatDecl("gamutBoundaryM") << " = " << findGamutBoundaryIntersectionName
+                 << "(JMcusp, gamma_top_inv, gamma_bottom_inv, J_intersect_source, J_intersect_cusp, gamut_slope);";
 
-    ss.newLine() << "if (JMboundary.g <= 0.0)";
+    ss.newLine() << "if (gamutBoundaryM <= 0.0)";
     ss.newLine() << "{";
     ss.indent();
     ss.newLine() << "return " << ss.float3Const("J", "0.0", "h") << ";";
     ss.dedent();
     ss.newLine() << "}";
 
-    ss.newLine() << ss.float3Decl("reachBoundary") << " = " << getReachBoundaryName << "(JMboundary.r, JMboundary.g, h, JMcusp, focusJ);";
+    ss.newLine() << ss.floatDecl("reachBoundaryM") << " = " << s.limit_J_max << " * pow(J_intersect_source / " << s.limit_J_max << ",  " << s.model_gamma_inv << ");";
+    ss.newLine() << "reachBoundaryM = reachBoundaryM / ((" << s.limit_J_max << " / reachMaxM) - gamut_slope);";
 
-    ss.newLine() << ss.floatDecl("difference") << " = max(1.0001, reachBoundary.g / JMboundary.g);";
-    ss.newLine() << ss.floatDecl("threshold") << " = max(" << ACES2::compression_threshold << ", 1.0 / difference);";
+    ss.newLine() << ss.floatDecl("remapped_M") << " = " << compressionName << "(M, gamutBoundaryM, reachBoundaryM);";
+    ss.newLine() << ss.floatDecl("remapped_J") << " = J_intersect_source + remapped_M * gamut_slope;";
 
-    ss.newLine() << ss.floatDecl("v") << " = project_from.g / JMboundary.g;";
-    ss.newLine() << "v = " << compressionName << "(v, threshold, difference);";
 
-    ss.newLine() << ss.float2Decl("JMcompressed") << " = " << ss.float2Const(
-        "project_to.r + v * (JMboundary.r - project_to.r)",
-        "project_to.g + v * (JMboundary.g - project_to.g)"
-    ) << ";";
-
-    ss.newLine() << "return " << ss.float3Const("JMcompressed.r", "JMcompressed.g", "h") << ";";
+    ss.newLine() << "return " << ss.float3Const("remapped_J", "remapped_M", "h") << ";";
 
     ss.dedent();
     ss.newLine() << "}";
@@ -1294,58 +1202,56 @@ void _Add_Gamut_Compress_Fwd_Shader(
     GpuShaderCreatorRcPtr & shaderCreator,
     GpuShaderText & ss,
     unsigned int resourceIndex,
-    const ACES2::GamutCompressParams & g,
-    const std::string & reachName)
+    const ACES2::SharedCompressionParameters & s,
+    const ACES2::GamutCompressParams & g)
 {
     std::string cuspName = _Add_Cusp_table(shaderCreator, resourceIndex, g);
-    std::string gammaName = _Add_Gamma_table(shaderCreator, resourceIndex, g);
-    std::string getFocusGainName = _Add_Focus_Gain_func(shaderCreator, resourceIndex, g);
-    std::string solveJIntersectName = _Add_Solve_J_Intersect_func(shaderCreator, resourceIndex, g);
-    std::string findGamutBoundaryIntersectionName = _Add_Find_Gamut_Boundary_Intersection_func(shaderCreator, resourceIndex, g, solveJIntersectName);
-    std::string getReachBoundaryName = _Add_Reach_Boundary_func(shaderCreator, resourceIndex, g, reachName, getFocusGainName, solveJIntersectName);
+    std::string getFocusGainName = _Add_Focus_Gain_func(shaderCreator, resourceIndex, s);
+    std::string solveJIntersectName = _Add_Solve_J_Intersect_func(shaderCreator, resourceIndex, s);
+    std::string findGamutBoundaryIntersectionName = _Add_Find_Gamut_Boundary_Intersection_func(shaderCreator, resourceIndex, s);
     std::string compressionName = _Add_Compression_func(shaderCreator, resourceIndex, false);
-    std::string gamutCompressName = _Add_Compress_Gamut_func(shaderCreator, resourceIndex, g, cuspName, getFocusGainName, gammaName, findGamutBoundaryIntersectionName, getReachBoundaryName, compressionName);
+    std::string gamutCompressName = _Add_Compress_Gamut_func(shaderCreator, resourceIndex, s, g, getFocusGainName, findGamutBoundaryIntersectionName, compressionName, solveJIntersectName);
 
     const std::string pxl(shaderCreator->getPixelName());
 
-    ss.newLine() << pxl << ".rgb = " << gamutCompressName << "(" << pxl << ".rgb, " << pxl << ".r);";
+    ss.newLine() << ss.float3Decl("JMGcusp") << " = " << cuspName << "_sample(" << pxl << ".b);"; // cusp table lookup keyed on the pixel's .b channel
+    ss.newLine() << pxl << ".rgb = " << gamutCompressName << "(" << pxl << ".rgb, " << pxl << ".r, JMGcusp, reachMaxM);"; // reachMaxM is not declared here: the emitted code relies on the caller having declared it
 }
 
 void _Add_Gamut_Compress_Inv_Shader(
     GpuShaderCreatorRcPtr & shaderCreator,
     GpuShaderText & ss,
     unsigned int resourceIndex,
-    const ACES2::GamutCompressParams & g,
-    const std::string & reachName)
+    const ACES2::SharedCompressionParameters & s,
+    const ACES2::GamutCompressParams & g)
 {
     std::string cuspName = _Add_Cusp_table(shaderCreator, resourceIndex, g);
-    std::string gammaName = _Add_Gamma_table(shaderCreator, resourceIndex, g);
-    std::string getFocusGainName = _Add_Focus_Gain_func(shaderCreator, resourceIndex, g);
-    std::string solveJIntersectName = _Add_Solve_J_Intersect_func(shaderCreator, resourceIndex, g);
-    std::string findGamutBoundaryIntersectionName = _Add_Find_Gamut_Boundary_Intersection_func(shaderCreator, resourceIndex, g, solveJIntersectName);
-    std::string getReachBoundaryName = _Add_Reach_Boundary_func(shaderCreator, resourceIndex, g, reachName, getFocusGainName, solveJIntersectName);
+    std::string getFocusGainName = _Add_Focus_Gain_func(shaderCreator, resourceIndex, s);
+    std::string solveJIntersectName = _Add_Solve_J_Intersect_func(shaderCreator, resourceIndex, s);
+    std::string findGamutBoundaryIntersectionName = _Add_Find_Gamut_Boundary_Intersection_func(shaderCreator, resourceIndex, s);
     std::string compressionName = _Add_Compression_func(shaderCreator, resourceIndex, true);
-    std::string gamutCompressName = _Add_Compress_Gamut_func(shaderCreator, resourceIndex, g, cuspName, getFocusGainName, gammaName, findGamutBoundaryIntersectionName, getReachBoundaryName, compressionName);
+    std::string gamutCompressName = _Add_Compress_Gamut_func(shaderCreator, resourceIndex, s, g, getFocusGainName, findGamutBoundaryIntersectionName, compressionName, solveJIntersectName);
 
     const std::string pxl(shaderCreator->getPixelName());
 
-    ss.newLine() << ss.float2Decl("JMcusp") << " = " << cuspName << "_sample(" << pxl << ".b);";
+    ss.newLine() << ss.float3Decl("JMGcusp") << " = " << cuspName << "_sample(" << pxl << ".b);";
+
     ss.newLine() << ss.floatDecl("Jx") << " = " << pxl << ".r;";
     ss.newLine() << ss.float3Decl("unCompressedJMh") << ";";
 
     // Analytic inverse below threshold
-    ss.newLine() << "if (Jx <= " << ss.lerp("JMcusp.r", std::to_string(g.limit_J_max), std::to_string(ACES2::focus_gain_blend)) << ")";
+    ss.newLine() << "if (Jx <= " << ss.lerp("JMGcusp.r", std::to_string(s.limit_J_max), std::to_string(ACES2::focus_gain_blend)) << ")"; // threshold: lerp of the cusp's .r and limit_J_max by focus_gain_blend
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << "unCompressedJMh = " << gamutCompressName << "(" << pxl << ".rgb, Jx);";
+    ss.newLine() << "unCompressedJMh = " << gamutCompressName << "(" << pxl << ".rgb, Jx, JMGcusp, reachMaxM);"; // single evaluation, Jx still holds the pixel's .r
     ss.dedent();
     ss.newLine() << "}";
     // Approximation above threshold
     ss.newLine() << "else";
     ss.newLine() << "{";
     ss.indent();
-    ss.newLine() << "Jx = " << gamutCompressName << "(" << pxl << ".rgb, Jx).r;";
-    ss.newLine() << "unCompressedJMh = " << gamutCompressName << "(" << pxl << ".rgb, Jx);";
+    ss.newLine() << "Jx = " << gamutCompressName << "(" << pxl << ".rgb, Jx, JMGcusp, reachMaxM).r;"; // pass 1: keep only the .r result as the new Jx
+    ss.newLine() << "unCompressedJMh = " << gamutCompressName << "(" << pxl << ".rgb, Jx, JMGcusp, reachMaxM);"; // pass 2: evaluate the same pixel again with that Jx
     ss.dedent();
     ss.newLine() << "}";
 
@@ -1368,8 +1274,6 @@ void Add_ACES_OutputTransform_Fwd_Shader(
     const float white_x = (float) params[7];
     const float white_y = (float) params[8];
 
-    const Primaries in_primaries = ACES_AP0::primaries;
-
     const Primaries lim_primaries = {
         {red_x  , red_y  },
         {green_x, green_y},
@@ -1377,31 +1281,41 @@ void Add_ACES_OutputTransform_Fwd_Shader(
         {white_x, white_y}
     };
 
-    ACES2::JMhParams pIn = ACES2::init_JMhParams(in_primaries);
-    ACES2::JMhParams pLim = ACES2::init_JMhParams(lim_primaries);
-    ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
-    ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance);
-    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, lim_primaries);
+    const ACES2::JMhParams pIn = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::JMhParams pLim = ACES2::init_JMhParams(lim_primaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    const ACES2::SharedCompressionParameters s = ACES2::init_SharedCompressionParams(peak_luminance, pIn, reachGamut);
+    const ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance, t);
+    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, pIn, pLim, t, s, reachGamut);
 
     unsigned resourceIndex = shaderCreator->getNextResourceIndex();
 
-    std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, g.reach_m_table);
+    const std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, s.reach_m_table);
+    const std::string tonescaleName_Fwd = _Add_Tonescale_func(shaderCreator, resourceIndex, false, pIn, t);
+    const std::string pxl(shaderCreator->getPixelName());
 
     ss.newLine() << "";
     ss.newLine() << "// Add RGB to JMh";
     ss.newLine() << "";
-    ss.newLine() << "{";
-    ss.indent();
-        _Add_RGB_to_JMh_Shader(shaderCreator, ss, pIn);
-    ss.dedent();
-    ss.newLine() << "}";
+    _Add_RGB_to_JMh_Shader(shaderCreator, ss, pIn);
+    _Add_SinCos_Shader(shaderCreator, ss);
+
 
     ss.newLine() << "";
     ss.newLine() << "// Add ToneScale and ChromaCompress (fwd)";
     ss.newLine() << "";
+
+    ss.newLine() << ss.floatDecl("J_ts") << " = " << tonescaleName_Fwd << "(" << pxl << ".r);";
+
+    ss.newLine() << "// Sample tables (fwd)";
+    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(" << pxl << ".b);";
+
+    ss.newLine() << "";
+
     ss.newLine() << "{";
     ss.indent();
-        _Add_Tonescale_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, pIn, t, c, reachName);
+        _Add_Tonescale_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, s, c);
     ss.dedent();
     ss.newLine() << "}";
 
@@ -1410,7 +1324,7 @@ void Add_ACES_OutputTransform_Fwd_Shader(
     ss.newLine() << "";
     ss.newLine() << "{";
     ss.indent();
-        _Add_Gamut_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, g, reachName);
+        _Add_Gamut_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, s, g);
     ss.dedent();
     ss.newLine() << "}";
 
@@ -1441,8 +1355,6 @@ void Add_ACES_OutputTransform_Inv_Shader(
     const float white_x = (float) params[7];
     const float white_y = (float) params[8];
 
-    const Primaries in_primaries = ACES_AP0::primaries;
-
     const Primaries lim_primaries = {
         {red_x  , red_y  },
         {green_x, green_y},
@@ -1450,40 +1362,44 @@ void Add_ACES_OutputTransform_Inv_Shader(
         {white_x, white_y}
     };
 
-    ACES2::JMhParams pIn = ACES2::init_JMhParams(in_primaries);
-    ACES2::JMhParams pLim = ACES2::init_JMhParams(lim_primaries);
-    ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
-    ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance);
-    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, lim_primaries);
+    const ACES2::JMhParams pIn = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::JMhParams pLim = ACES2::init_JMhParams(lim_primaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    const ACES2::SharedCompressionParameters s = ACES2::init_SharedCompressionParams(peak_luminance, pIn, reachGamut);
+    const ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance, t);
+    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, pIn, pLim, t, s, reachGamut);
 
     unsigned resourceIndex = shaderCreator->getNextResourceIndex();
-
-    std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, c.reach_m_table);
+    const std::string pxl(shaderCreator->getPixelName());
+
+    const std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, s.reach_m_table); // const, like the forward transform's declarations
+    const std::string tonescaleName_Inv = _Add_Tonescale_func(shaderCreator, resourceIndex, true, pIn, t); // 'true' selects the inverse tonescale
 
     ss.newLine() << "";
     ss.newLine() << "// Add RGB to JMh";
     ss.newLine() << "";
-    ss.newLine() << "{";
-    ss.indent();
-        _Add_RGB_to_JMh_Shader(shaderCreator, ss, pLim);
-    ss.dedent();
-    ss.newLine() << "}";
+    _Add_RGB_to_JMh_Shader(shaderCreator, ss, pLim);
+    _Add_SinCos_Shader(shaderCreator, ss);
 
+
+    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(" << pxl << ".b);";
     ss.newLine() << "";
     ss.newLine() << "// Add GamutCompress (inv)";
     ss.newLine() << "";
     ss.newLine() << "{";
     ss.indent();
-        _Add_Gamut_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, g, reachName);
+        _Add_Gamut_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, g);
     ss.dedent();
     ss.newLine() << "}";
 
     ss.newLine() << "";
     ss.newLine() << "// Add ToneScale and ChromaCompress (inv)";
     ss.newLine() << "";
+    ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".r);"; // tonescale acts on J (.r), as in the standalone inverse op; .b is the hue
     ss.newLine() << "{";
     ss.indent();
-        _Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, pIn, t, c, reachName);
+        _Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, c);
     ss.dedent();
     ss.newLine() << "}";
 
@@ -1544,7 +1460,8 @@ void Add_JMh_to_RGB_Shader(
     };
 
     ACES2::JMhParams p = ACES2::init_JMhParams(primaries);
-
+    _Add_WrapHueChannel_Shader(shaderCreator, ss);
+    _Add_SinCos_Shader(shaderCreator, ss);
     _Add_JMh_to_RGB_Shader(shaderCreator, ss, p);
 }
 
@@ -1555,15 +1472,25 @@ void Add_Tonescale_Compress_Fwd_Shader(
 {
     const float peak_luminance = (float) params[0];
 
-    ACES2::JMhParams p = ACES2::init_JMhParams(ACES_AP0::primaries);
-    ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
-    ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance);
+    const ACES2::JMhParams p = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    const ACES2::SharedCompressionParameters s = ACES2::init_SharedCompressionParams(peak_luminance, p, reachGamut);
+    const ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance, t);
 
     unsigned resourceIndex = shaderCreator->getNextResourceIndex();
+    const std::string pxl(shaderCreator->getPixelName());
 
-    std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, c.reach_m_table);
+    const std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, s.reach_m_table);
+    const std::string tonescaleName_Fwd = _Add_Tonescale_func(shaderCreator, resourceIndex, false, p, t);
 
-    _Add_Tonescale_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, p, t, c, reachName);
+    _Add_WrapHueChannel_Shader(shaderCreator, ss);
+    _Add_SinCos_Shader(shaderCreator, ss);
+
+    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(" << pxl << ".b);";
+    ss.newLine() << ss.floatDecl("J_ts") << " = " << tonescaleName_Fwd << "(" << pxl << ".r);";
+
+    _Add_Tonescale_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, s, c);
 }
 
 void Add_Tonescale_Compress_Inv_Shader(
@@ -1573,15 +1500,25 @@ void Add_Tonescale_Compress_Inv_Shader(
 {
     const float peak_luminance = (float) params[0];
 
-    ACES2::JMhParams p = ACES2::init_JMhParams(ACES_AP0::primaries);
-    ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
-    ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance);
+    const ACES2::JMhParams p = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    const ACES2::SharedCompressionParameters s = ACES2::init_SharedCompressionParams(peak_luminance, p, reachGamut);
+    const ACES2::ChromaCompressParams c = ACES2::init_ChromaCompressParams(peak_luminance, t);
 
     unsigned resourceIndex = shaderCreator->getNextResourceIndex();
+    const std::string pxl(shaderCreator->getPixelName());
 
-    std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, c.reach_m_table);
+    const std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, s.reach_m_table);
+    std::string tonescaleName_Inv = _Add_Tonescale_func(shaderCreator, resourceIndex, true, p, t);
 
-    _Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, p, t, c, reachName);
+    _Add_WrapHueChannel_Shader(shaderCreator, ss);
+    _Add_SinCos_Shader(shaderCreator, ss);
+
+    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(" << pxl << ".b);";
+    ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".r);";
+
+    _Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, c);
 }
 
 void Add_Gamut_Compress_Fwd_Shader(
@@ -1607,13 +1544,24 @@ void Add_Gamut_Compress_Fwd_Shader(
         {white_x, white_y}
     };
 
-    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, primaries);
+    const ACES2::JMhParams pIn = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::JMhParams pLim = ACES2::init_JMhParams(primaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    const ACES2::SharedCompressionParameters s = ACES2::init_SharedCompressionParams(peak_luminance, pIn, reachGamut);
+    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, pIn, pLim, t, s, reachGamut); 
 
     unsigned resourceIndex = shaderCreator->getNextResourceIndex();
+    const std::string pxl(shaderCreator->getPixelName());
 
-    std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, g.reach_m_table);
+    const std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, s.reach_m_table);
 
-    _Add_Gamut_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, g, reachName);
+    _Add_WrapHueChannel_Shader(shaderCreator, ss);
+    _Add_SinCos_Shader(shaderCreator, ss);
+
+    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(" << pxl << ".b);";
+
+    _Add_Gamut_Compress_Fwd_Shader(shaderCreator, ss, resourceIndex, s, g);
 }
 
 void Add_Gamut_Compress_Inv_Shader(
@@ -1639,13 +1587,24 @@ void Add_Gamut_Compress_Inv_Shader(
         {white_x, white_y}
     };
 
-    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, primaries);
+    const ACES2::JMhParams pIn = ACES2::init_JMhParams(ACES_AP0::primaries);
+    const ACES2::JMhParams pLim = ACES2::init_JMhParams(primaries);
+    const ACES2::ToneScaleParams t = ACES2::init_ToneScaleParams(peak_luminance);
+    const ACES2::JMhParams reachGamut = ACES2::init_JMhParams(ACES_AP1::primaries);
+    const ACES2::SharedCompressionParameters s = ACES2::init_SharedCompressionParams(peak_luminance, pIn, reachGamut);
+    const ACES2::GamutCompressParams g = ACES2::init_GamutCompressParams(peak_luminance, pIn, pLim, t, s, reachGamut);
 
     unsigned resourceIndex = shaderCreator->getNextResourceIndex();
+    const std::string pxl(shaderCreator->getPixelName());
+
+    const std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, s.reach_m_table);
+
+    _Add_WrapHueChannel_Shader(shaderCreator, ss);
+    _Add_SinCos_Shader(shaderCreator, ss);
 
-    std::string reachName = _Add_Reach_table(shaderCreator, resourceIndex, g.reach_m_table);
+    ss.newLine() << ss.floatDecl("reachMaxM") << " = " << reachName << "_sample(" << pxl << ".b);";
 
-    _Add_Gamut_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, g, reachName);
+    _Add_Gamut_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, g);
 }
 
 void Add_Surround_10_Fwd_Shader(GpuShaderCreatorRcPtr & shaderCreator, GpuShaderText & ss, float gamma)
diff --git a/src/apps/ocioconvert/main.cpp b/src/apps/ocioconvert/main.cpp
index 60305e06ec..e8071c76fa 100644
--- a/src/apps/ocioconvert/main.cpp
+++ b/src/apps/ocioconvert/main.cpp
@@ -50,6 +50,7 @@ int main(int argc, const char **argv)
     std::vector<std::string> stringAttrs;
 
     std::string outputDepth;
+    std::string inputconfig;
 
     bool usegpu                 = false;
     bool usegpuLegacy           = false;
@@ -85,6 +86,7 @@ int main(int argc, const char **argv)
                "--h",                   &help,                  "Display the help and exit",
                "--help",                &help,                  "Display the help and exit",
                "-v" ,                   &verbose,               "Display general information",
+              "--iconfig %s",           &inputconfig,           "Input .ocio configuration file (default: $OCIO)",
                "<SEPARATOR>", "\nOpenImageIO or OpenEXR options:",
                "--bitdepth %s",         &outputDepth,  "Output image bitdepth",
                "--float-attribute %L",  &floatAttrs,   "\"name=float\" pair defining OIIO float attribute "
@@ -266,34 +268,51 @@ int main(int argc, const char **argv)
         outputimage     = args[2].c_str();
     }
 
+    // Load the current config.
+    OCIO::ConstConfigRcPtr config;
+    
+    try
+    {
+        if (useLut)
+        {
+            config = OCIO::Config::CreateRaw(); // LUT mode: no config file is read
+        }
+        else if (!inputconfig.empty() )
+        {
+            config = OCIO::Config::CreateFromFile(inputconfig.c_str()); // explicit --iconfig path
+        }
+        else
+        {
+            const char * env = OCIO::GetEnvVariable("OCIO");
+            if (env && *env)
+                inputconfig = env; // keep the name for the messages printed below
+            config = OCIO::GetCurrentConfig();
+        }
+    }
+    catch (const OCIO::Exception & e)
+    {
+        std::cerr << "ERROR loading config file: " << e.what() << std::endl; // errors go to stderr, like the handler below
+        exit(1);
+    }
+    catch (...)
+    {
+
+        std::cerr << "ERROR loading config file: '" << inputconfig << "'" << std::endl;
+        exit(1);
+    }
+
     if (verbose)
     {
         std::cout << std::endl;
         std::cout << OCIO::ImageIO::GetVersion() << std::endl;
         std::cout << "OCIO Version: " << OCIO::GetVersion() << std::endl;
-        const char * env = OCIO::GetEnvVariable("OCIO");
-        if (env && *env && !useLut)
+        if (!useLut)
         {
-            try
-            {
-                std::cout << std::endl;
-                std::cout << "OCIO Config. file:    '" << env << "'" << std::endl;
-                OCIO::ConstConfigRcPtr config = OCIO::GetCurrentConfig();
-                std::cout << "OCIO Config. version: " << config->getMajorVersion() << "." 
-                                                      << config->getMinorVersion() << std::endl;
-                std::cout << "OCIO search_path:     " << config->getSearchPath() << std::endl;
-            }
-            catch (const OCIO::Exception & e)
-            {
-                std::cout << "ERROR loading config file: " << e.what() << std::endl;
-                exit(1);
-            }
-            catch (...)
-            {
-
-                std::cerr << "ERROR loading config file: '" << env << "'" << std::endl;
-                exit(1);
-            }
+            std::cout << std::endl;
+            std::cout << "OCIO Config. file:    '" << inputconfig << "'" << std::endl;
+            std::cout << "OCIO Config. version: " << config->getMajorVersion() << "." 
+                                                  << config->getMinorVersion() << std::endl;
+            std::cout << "OCIO search_path:     " << config->getSearchPath() << std::endl;
         }
     }
 
@@ -388,10 +407,6 @@ int main(int argc, const char **argv)
     // Process the image.
     try
     {
-        // Load the current config.
-        OCIO::ConstConfigRcPtr config
-            = useLut ? OCIO::Config::CreateRaw() : OCIO::GetCurrentConfig();
-
         // Get the processor.
         OCIO::ConstProcessorRcPtr processor;
 
@@ -637,7 +652,6 @@ int main(int argc, const char **argv)
     {
         if (useDisplayView)
         {
-            OCIO::ConstConfigRcPtr config = OCIO::GetCurrentConfig();
             outputcolorspace = config->getDisplayViewColorSpaceName(display, view);
         }
 
diff --git a/src/apps/ocioperf/main.cpp b/src/apps/ocioperf/main.cpp
index a3af75a587..4c7837494f 100644
--- a/src/apps/ocioperf/main.cpp
+++ b/src/apps/ocioperf/main.cpp
@@ -24,7 +24,6 @@ class CustomMeasure
 
     explicit CustomMeasure(const char * explanation)
         :   m_explanations(explanation)
-        ,   m_iterations(1)
     {
         resume();
     }
@@ -183,6 +182,10 @@ int main(int argc, const char **argv)
     bool useDisplayview = false;
     bool useInvertview  = false;
 
+    std::string inputconfig;
+    OCIO::ConstConfigRcPtr srcConfig;
+
+
     ArgParse ap;
     ap.options("ocioperf -- apply and measure a color transformation processing\n\n"
                "usage: ocioperf [options] --transform /path/to/file.clf\n\n",
@@ -206,6 +209,8 @@ int main(int argc, const char **argv)
                                             "(Deprecated) Provide the input and (display, view) pair to apply on the image",
                "--invertview %s %s %s",     &display, &view, &outColorSpace,
                                             "Provide the (display, view) pair and output color space to apply on the image",
+               "--iconfig %s",              &inputconfig,
+                                            "Input .ocio configuration file (default: $OCIO)",
                "--iter %d",                 &iterations, "Provide the number of iterations on the processing. Default is 50",
                "--bitdepths %s %s",         &inBitDepthStr, &outBitDepthStr,
                                             "Provide input and output bit-depths (i.e. ui16, f32). Default is f32",
@@ -232,25 +237,6 @@ int main(int argc, const char **argv)
     {
         std::cout << std::endl;
         std::cout << "OCIO Version: " << OCIO::GetVersion() << std::endl;
-        const char * env = OCIO::GetEnvVariable("OCIO");
-        if(env && *env)
-        {
-            try
-            {
-                std::cout << std::endl;
-                std::cout << "OCIO Config. file:    '" << env << "'" << std::endl;
-                OCIO::ConstConfigRcPtr config = OCIO::GetCurrentConfig();
-                std::cout << "OCIO Config. version: " << config->getMajorVersion() << "." 
-                                                      << config->getMinorVersion() << std::endl;
-                std::cout << "OCIO search_path:     " << config->getSearchPath() << std::endl;
-            }
-            catch(...)
-            {
-
-                std::cerr << "ERROR: Error loading the config file: '" << env << "'";
-                return 1;
-            }
-        }
     }
 
     if (!transformFile.empty())
@@ -258,7 +244,7 @@ int main(int argc, const char **argv)
         std::cout << std::endl;
         std::cout << "Processing using '" << transformFile << "'" << std::endl << std::endl;
     }
-
+    
     std::cout << std::endl << std::endl;
     std::cout << "Processing statistics:" << std::endl << std::endl;
 
@@ -296,25 +282,38 @@ int main(int argc, const char **argv)
         // Checking for an input colorspace or input (display, view) pair.
         else if (!inColorSpace.empty() || (!display.empty() && !view.empty()))
         {
+            if(!inputconfig.empty()) // An explicit --iconfig path takes precedence over $OCIO.
+            {
+                std::cout << std::endl;
+                std::cout << "Loading " << inputconfig << std::endl;
+                srcConfig = OCIO::Config::CreateFromFile(inputconfig.c_str());
+            }
+            else if(OCIO::GetEnvVariable("OCIO") && *OCIO::GetEnvVariable("OCIO")) // A set-but-empty $OCIO does not name a config.
+            {
+                std::cout << std::endl;
+                std::cout << "Loading $OCIO " << OCIO::GetEnvVariable("OCIO") << std::endl;
+                srcConfig = OCIO::Config::CreateFromEnv();
+            }
+            else // Neither source is available: fail with an explicit message.
+            {
+                throw OCIO::Exception("You must specify an input OCIO configuration (either with --iconfig or $OCIO).\n");
+            }
+
             if (verbose)
             {
-                const char * env = OCIO::GetEnvVariable("OCIO");
-                if (env && *env)
-                {
-                    std::cout << std::endl;
-                    const std::string inputStr = !inColorSpace.empty() ?  inColorSpace : "(" + display + ", " + view + ")";
-                    const std::string outputStr = !outColorSpace.empty() ?  outColorSpace : "(" + display + ", " + view + ")";
-                    std::cout << "Processing from '" 
-                              << inputStr << "' to '"
-                              << outputStr << "'" << std::endl;
-                }
-                else
-                {
-                    throw OCIO::Exception("Missing the ${OCIO} env. variable.");
-                }
+                std::cout << std::endl;
+                std::cout << "OCIO Config. version: " << srcConfig->getMajorVersion() << "." 
+                                                      << srcConfig->getMinorVersion() << std::endl;
+                std::cout << "OCIO search_path:     " << srcConfig->getSearchPath() << std::endl;
+                std::cout << std::endl;
+                const std::string inputStr = !inColorSpace.empty() ?  inColorSpace : "(" + display + ", " + view + ")";
+                const std::string outputStr = !outColorSpace.empty() ?  outColorSpace : "(" + display + ", " + view + ")";
+                std::cout << "Processing from '" 
+                          << inputStr << "' to '"
+                          << outputStr << "'" << std::endl;
             }
 
-            OCIO::ConfigRcPtr config  = OCIO::Config::CreateFromEnv()->createEditableCopy();
+            OCIO::ConfigRcPtr config  = srcConfig->createEditableCopy();
             config->setProcessorCacheFlags(nocache ? OCIO::PROCESSOR_CACHE_OFF 
                                                    : OCIO::PROCESSOR_CACHE_DEFAULT);
 
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 66c32cb8a7..7dd684659d 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -343,6 +343,7 @@ if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON))
     set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS})
     set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS})
     set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS})
+    set_property(SOURCE "${CMAKE_SOURCE_DIR}/ops/fixedfunction/ACES2/Transform.cpp" APPEND PROPERTY COMPILE_OPTIONS  ${OCIO_SSE2_ARGS} ${OCIO_AVX_ARGS})
     set_property(SOURCE "SSE2_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS})
     set_property(SOURCE "AVX_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS})
     set_property(SOURCE "AVX2_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS})
diff --git a/tests/gpu/FixedFunctionOp_test.cpp b/tests/gpu/FixedFunctionOp_test.cpp
index a7e77bc2d2..2c5d7cade4 100644
--- a/tests/gpu/FixedFunctionOp_test.cpp
+++ b/tests/gpu/FixedFunctionOp_test.cpp
@@ -458,6 +458,9 @@ OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_output_transform_invfwd)
     grp->appendTransform(func_inv);
     grp->appendTransform(func_fwd);
 
+    test.setTestNaN(false); // TODO: Partially running the output transform without the clamp so do not test Nan or Inf values
+    test.setTestInfinity(false);
+
     test.setProcessor(grp);
 
     test.setErrorThreshold(7e-4f);

From 8f8497fcfb0aef3d16b752027b2f978ea9185ba8 Mon Sep 17 00:00:00 2001
From: "cuneyt.ozdas" <cuneyt.ozdas@autodesk.com>
Date: Thu, 6 Mar 2025 17:28:00 -0800
Subject: [PATCH 02/11] Update ACES2 CPU non-SIMD path (#2122)

* - Commenting out the ACES2 SIMD implementation for now to focus on validity of the scalar math. For SIMD we need to implement run-time switching logic too.
- Slight improvements to the unit tests so that we print out the computed error metric as well as the actual and expected values. Helps to see the magnitude of the error.
- FixedFunctionOpCPU and BuiltinTransform tests now produce error lines with the same structure & syntax, including the computed error.
- Updated the expected values for ACES2 tests with the values the new optimized code produces, this makes all of the CPU tests pass now.
- For ACES2 ops and builtin transforms, the error threshold is increased to 1e-4
- Added a few temporary code snippets that dump the currently produced results, making it easier to update the golden values if needed again.

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>

* - Fixing Linux build

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>

* Making Linux build happy is never easy.

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>

---------

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>
---
 src/OpenColorIO/CMakeLists.txt                |  10 -
 .../ops/fixedfunction/ACES2/Transform.cpp     |  10 +
 tests/cpu/CMakeLists.txt                      |   1 -
 tests/cpu/UnitTestUtils.h                     |  14 +-
 .../FixedFunctionOpCPU_tests.cpp              | 219 +++++++++-------
 .../cpu/transforms/BuiltinTransform_tests.cpp | 237 ++++++++++--------
 6 files changed, 283 insertions(+), 208 deletions(-)

diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt
index 857058696c..7d2894da6b 100755
--- a/src/OpenColorIO/CMakeLists.txt
+++ b/src/OpenColorIO/CMakeLists.txt
@@ -220,16 +220,6 @@ if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON))
     set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS})
     set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS})
     set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX512.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS})
-    set_property(SOURCE ops/fixedfunction/ACES2/Transform.cpp APPEND PROPERTY COMPILE_OPTIONS  ${OCIO_SSE2_ARGS} ${OCIO_AVX_ARGS})
-endif()
-
-# TODO: What to do for other compilers?
-if(USE_GCC)
-    set_property(SOURCE ops/fixedfunction/ACES2/Transform.cpp APPEND PROPERTY COMPILE_OPTIONS
-                -fno-math-errno -fno-signed-zeros -fno-trapping-math -fno-signaling-nans -ffinite-math-only -freciprocal-math -ftree-vectorize)
-endif()
-if(MSVC)
-    set_property(SOURCE ops/fixedfunction/ACES2/Transform.cpp APPEND PROPERTY COMPILE_OPTIONS /Qvec)
 endif()
 
 configure_file(CPUInfoConfig.h.in CPUInfoConfig.h)
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
index 14461ce8fa..f72526f961 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
@@ -8,8 +8,18 @@
 #include <cmath>
 #include <limits>
 
+// Disabling SIMD for now to do scalar math verification.
+// FIXME: Re-enable SIMD and verify SIMD math is correct.
+#undef OCIO_USE_SSE2
+#undef OCIO_USE_AVX
+
+#if OCIO_USE_SSE2
 #include "SSE2.h"
+#endif
+
+#if OCIO_USE_AVX
 #include "AVX.h"
+#endif
 
 namespace OCIO_NAMESPACE
 {
diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt
index 7dd684659d..66c32cb8a7 100755
--- a/tests/cpu/CMakeLists.txt
+++ b/tests/cpu/CMakeLists.txt
@@ -343,7 +343,6 @@ if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON))
     set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS})
     set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS})
     set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS})
-    set_property(SOURCE "${CMAKE_SOURCE_DIR}/ops/fixedfunction/ACES2/Transform.cpp" APPEND PROPERTY COMPILE_OPTIONS  ${OCIO_SSE2_ARGS} ${OCIO_AVX_ARGS})
     set_property(SOURCE "SSE2_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS})
     set_property(SOURCE "AVX_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS})
     set_property(SOURCE "AVX2_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS})
diff --git a/tests/cpu/UnitTestUtils.h b/tests/cpu/UnitTestUtils.h
index bbdac35549..315cf258ee 100644
--- a/tests/cpu/UnitTestUtils.h
+++ b/tests/cpu/UnitTestUtils.h
@@ -79,7 +79,8 @@ template<typename T>
 inline bool EqualWithSafeRelError(T value,
                                   T expected,
                                   T eps,
-                                  T minExpected)
+                                  T minExpected,
+                                  T* computedError = nullptr)
 {
     // If value and expected are infinity, return true.
     if (value == expected) return true;
@@ -88,9 +89,14 @@ inline bool EqualWithSafeRelError(T value,
         ((expected < minExpected) ? minExpected : expected) :
         ((-expected < minExpected) ? minExpected : -expected);
 
-    return (
-        ((value > expected) ? value - expected : expected - value)
-        / div) <= eps;
+    T err = ((value > expected) ? value - expected : expected - value) / div;
+
+    if (computedError) 
+    {
+      *computedError = err;
+    }
+    
+    return (err <= eps);
 }
 
 // C++20 introduces new strongly typed, UTF-8 based, char8_t and u8string types
diff --git a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
index 1aa7dbc463..bd8536f33a 100644
--- a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
+++ b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
@@ -33,17 +33,22 @@ void ApplyFixedFunction(float * input_32f,
         // Using rel error with a large minExpected value of 1 will transition
         // from absolute error for expected values < 1 and
         // relative error for values > 1.
+        float computedError = 0.0f;
         const bool equalRel = OCIO::EqualWithSafeRelError(input_32f[idx],
                                                           expected_32f[idx],
                                                           errorThreshold,
-                                                          1.0f);
+                                                          1.0f,
+                                                          &computedError);
         if (!equalRel)
         {
             std::ostringstream errorMsg;
             errorMsg.precision(14);
             errorMsg << "Index: " << idx;
             errorMsg << " - Values: " << input_32f[idx] << " expected: " << expected_32f[idx];
-            errorMsg << " - Threshold: " << errorThreshold;
+            errorMsg << " - Error: " << computedError << " ("
+                     << std::setprecision(3) << computedError / errorThreshold;
+            errorMsg << "x of Threshold: "
+                     << std::setprecision(6) << errorThreshold << ")";
             OCIO_CHECK_ASSERT_MESSAGE_FROM(0, errorMsg.str(), lineNo);
         }
     }
@@ -462,45 +467,45 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)
 
     const float expected_32f[num_samples*4] = {
         // ACEScg primaries and secondaries scaled by 4
-        4.965774059f, -0.032864563f, 0.041625995f, 1.0f,
-        3.969441891f, 3.825784922f, -0.056133576f, 1.0f,
-        -0.075329021f, 3.688980103f, 0.270296901f, 1.0f,
-        -0.095423937f, 3.650517225f, 3.459972620f, 1.0f,
-        -0.028930068f, 0.196428135f, 2.796343565f, 1.0f,
-        4.900805950f, -0.064376131f, 3.838256121f, 1.0f,
+        4.966040611f, -0.032990534f, 0.041587759f, 1.0f,
+        3.969455719f, 3.825795889f, -0.056159109f, 1.0f,
+        -0.075445443f, 3.689064741f, 0.270243138f, 1.0f,
+        -0.095422804f, 3.650515079f, 3.459970713f, 1.0f,
+        -0.029242843f, 0.196083903f, 2.797096968f, 1.0f,
+        4.900825977f, -0.064415932f, 3.838272572f, 1.0f,
         // OCIO test values
-        0.096890204f, -0.001135312f, 0.018971510f, 0.5f,
-        0.809614301f, 0.479856580f, 0.814239502f, 1.0f,
-        0.107420206f, 0.920529068f, 0.726378500f, 0.0f,
+        0.096831776f, -0.001114858f, 0.018976377f, 0.5f,
+        0.811647296f, 0.478211939f, 0.816507518f, 1.0f,
+        0.110244252f, 0.919241786f, 0.726084292f, 0.0f,
         // ColorChecker24 (SMPTE 2065-1 2021)
-        0.115475260f, 0.050812904f, 0.030212952f, 1.0f,
-        0.484879673f, 0.301042974f, 0.226768956f, 1.0f,
-        0.098463766f, 0.160814658f, 0.277010560f, 1.0f,
-        0.071130365f, 0.107334383f, 0.035097566f, 1.0f,
-        0.207111493f, 0.198474824f, 0.375326216f, 1.0f,
-        0.195447892f, 0.481111974f, 0.393299013f, 1.0f,
-        0.571913838f, 0.196872935f, 0.041634772f, 1.0f,
-        0.045791931f, 0.069875360f, 0.291233480f, 1.0f,
-        0.424848706f, 0.083199009f, 0.102153838f, 1.0f,
-        0.059589427f, 0.022219172f, 0.091246888f, 1.0f,
-        0.360365510f, 0.478741467f, 0.086726837f, 1.0f,
-        0.695662081f, 0.371994525f, 0.068298168f, 1.0f,
-        0.011806309f, 0.021665439f, 0.199594811f, 1.0f,
-        0.076526314f, 0.256237417f, 0.060564656f, 1.0f,
-        0.300064564f, 0.023416257f, 0.030360471f, 1.0f,
-        0.805484772f, 0.596903503f, 0.082996152f, 1.0f,
-        0.388385952f, 0.079899102f, 0.245819211f, 1.0f,
-        0.010952532f, 0.196105912f, 0.307181358f, 1.0f,
-        0.921019495f, 0.921707213f, 0.912856042f, 1.0f,
-        0.590192318f, 0.588423848f, 0.587825358f, 1.0f,
-        0.337743521f, 0.337685764f, 0.338155121f, 1.0f,
-        0.169265985f, 0.169178501f, 0.169557109f, 1.0f,
-        0.058346048f, 0.059387825f, 0.060296260f, 1.0f,
-        0.012581184f, 0.012947139f, 0.013654195f, 1.0f,
+        0.115581684f, 0.050785132f, 0.030158322f, 1.0f,
+        0.482630610f, 0.301559567f, 0.228200614f, 1.0f,
+        0.097509719f, 0.160682827f, 0.278755993f, 1.0f,
+        0.071118668f, 0.107350536f, 0.035066456f, 1.0f,
+        0.206827119f, 0.198065758f, 0.376981646f, 1.0f,
+        0.197157621f, 0.480333209f, 0.393290222f, 1.0f,
+        0.570664287f, 0.197219044f, 0.042163782f, 1.0f,
+        0.045591675f, 0.069720201f, 0.292005479f, 1.0f,
+        0.425108939f, 0.083108872f, 0.102091998f, 1.0f,
+        0.059560396f, 0.022268835f, 0.091132581f, 1.0f,
+        0.360384226f, 0.478674322f, 0.086890966f, 1.0f,
+        0.691989481f, 0.372686356f, 0.070826821f, 1.0f,
+        0.012042155f, 0.021904279f, 0.198501319f, 1.0f,
+        0.076645926f, 0.256147027f, 0.060666814f, 1.0f,
+        0.300039411f, 0.023424838f, 0.030365985f, 1.0f,
+        0.803476214f, 0.596933603f, 0.085341305f, 1.0f,
+        0.388712883f, 0.079724148f, 0.245922253f, 1.0f,
+        0.011061139f, 0.196086824f, 0.307065904f, 1.0f,
+        0.921007156f, 0.921683431f, 0.912948132f, 1.0f,
+        0.590166390f, 0.588430583f, 0.587841213f, 1.0f,
+        0.337742388f, 0.337684810f, 0.338159621f, 1.0f,
+        0.169266224f, 0.169178173f, 0.169558540f, 1.0f,
+        0.058399219f, 0.059382606f, 0.060239695f, 1.0f,
+        0.012618840f, 0.012950940f, 0.013591323f, 1.0f,
         // Spectrally non-selective 18 % reflecting diffuser
-        0.145115465f, 0.145115525f, 0.145115510f, 1.0f,
+        0.145115077f, 0.145115703f, 0.145115480f, 1.0f,
         // Perfect reflecting diffuser
-        1.041565657f, 1.041566014f, 1.041565657f, 1.0f,
+        1.041565537f, 1.041566610f, 1.041566253f, 1.0f,
     };
 
     OCIO::FixedFunctionOpData::Params params = {
@@ -518,6 +523,17 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)
                        1e-5f,
                        __LINE__);
 
+#if DUMP_RESULTS // Debug aid: define DUMP_RESULTS to print input2_32f as float literals, four values per line.
+    std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
+    for (unsigned i = 0; i < num_samples; ++i)
+    {
+        std::cout   << input2_32f[i * 4 + 0] << "f, "
+                    << input2_32f[i * 4 + 1] << "f, "
+                    << input2_32f[i * 4 + 2] << "f, "
+                    << input2_32f[i * 4 + 3] << "f,\n";
+    }
+#endif // DUMP_RESULTS
+
     OCIO::ConstFixedFunctionOpDataRcPtr funcData2
         = std::make_shared<OCIO::FixedFunctionOpData>(OCIO::FixedFunctionOpData::ACES_OUTPUT_TRANSFORM_20_INV,
                                                       params);
@@ -778,35 +794,35 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)
 
     const float expected_32f[num_samples*4] = {
         // ACEScg primaries and secondaries scaled by 4
-        110.702453613f, 211.251770020f, 25.025110245f, 1.0f,
-        168.016815186f, 129.796249390f, 106.183448792f, 1.0f,
-        140.814849854f, 193.459213257f, 147.056488037f, 1.0f,
-        156.429519653f, 110.938514709f, 192.204727173f, 1.0f,
-        80.456542969f, 98.490524292f, 268.442108154f, 1.0f,
-        135.172195435f, 175.559280396f, 341.715240479f, 1.0f,
+        110.702453613f, 211.242279053f, 25.025110245f, 1.0f,
+        168.016815186f, 129.795593262f, 106.183448792f, 1.0f,
+        140.814849854f, 193.450653076f, 147.056488037f, 1.0f,
+        156.429504395f, 110.935348511f, 192.204727173f, 1.0f,
+        80.456558228f, 98.743263245f, 268.442108154f, 1.0f,
+        135.172225952f, 175.572814941f, 341.715240479f, 1.0f,
         // OCIO test values
-        18.187314987f, 33.819175720f, 4.173158169f, 0.5f,
-        80.413116455f, 21.309329987f, 332.159759521f, 1.0f,
-        83.447891235f, 37.852291107f, 182.925750732f, 0.0f,
+        18.187316895f, 33.767055511f, 4.173158169f, 0.5f,
+        80.413101196f, 21.547714233f, 332.159759521f, 1.0f,
+        83.447883606f, 37.597621918f, 182.925750732f, 0.0f,
         // ColorChecker24 (SMPTE 2065-1 2021)
-        27.411964417f, 13.382769585f, 38.146659851f, 1.0f,
-        59.987670898f, 14.391894341f, 39.841842651f, 1.0f,
-        43.298923492f, 12.199877739f, 249.107116699f, 1.0f,
-        31.489658356f, 14.075142860f, 128.878036499f, 1.0f,
-        50.749198914f, 12.731814384f, 285.658966064f, 1.0f,
-        64.728637695f, 18.593795776f, 179.324264526f, 1.0f,
-        53.399448395f, 37.394428253f, 50.924011230f, 1.0f,
-        34.719596863f, 21.616765976f, 271.008331299f, 1.0f,
-        43.910713196f, 36.788166046f, 13.975610733f, 1.0f,
-        23.196525574f, 15.118354797f, 317.544281006f, 1.0f,
-        63.348674774f, 33.283493042f, 119.145133972f, 1.0f,
-        64.908889771f, 35.371044159f, 70.842193604f, 1.0f,
-        24.876911163f, 23.143159866f, 273.228973389f, 1.0f,
-        44.203376770f, 28.918329239f, 144.154159546f, 1.0f,
-        32.824356079f, 43.447875977f, 17.892261505f, 1.0f,
-        75.830871582f, 39.872474670f, 90.752044678f, 1.0f,
-        45.823116302f, 34.652069092f, 348.832092285f, 1.0f,
-        43.597240448f, 23.079078674f, 218.454376221f, 1.0f,
+        27.411968231f, 13.410449982f, 38.146659851f, 1.0f,
+        59.987659454f, 14.175936699f, 39.841842651f, 1.0f,
+        43.298923492f, 12.367712021f, 249.107116699f, 1.0f,
+        31.489654541f, 14.086299896f, 128.878036499f, 1.0f,
+        50.749198914f, 12.862657547f, 285.658966064f, 1.0f,
+        64.728637695f, 18.433788300f, 179.324264526f, 1.0f,
+        53.399444580f, 37.239288330f, 50.924011230f, 1.0f,
+        34.719596863f, 21.685737610f, 271.008331299f, 1.0f,
+        43.910709381f, 36.826980591f, 13.975610733f, 1.0f,
+        23.196529388f, 15.087531090f, 317.544281006f, 1.0f,
+        63.348682404f, 33.255519867f, 119.145133972f, 1.0f,
+        64.908874512f, 34.922687531f, 70.842193604f, 1.0f,
+        24.876913071f, 23.019479752f, 273.228973389f, 1.0f,
+        44.203376770f, 28.884298325f, 144.154159546f, 1.0f,
+        32.824359894f, 43.442367554f, 17.892261505f, 1.0f,
+        75.830871582f, 39.538505554f, 90.752044678f, 1.0f,
+        45.823120117f, 34.710170746f, 348.832092285f, 1.0f,
+        43.597236633f, 23.048465729f, 218.454376221f, 1.0f,
     };
 
     OCIO::FixedFunctionOpData::Params params = {1000.f};
@@ -819,6 +835,17 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)
                        1e-5f,
                        __LINE__);
 
+#if DUMP_RESULTS
+    std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
+    for (unsigned i = 0; i < num_samples; ++i)
+    {
+        std::cout   << input2_32f[i * 4 + 0] << "f, "
+                    << input2_32f[i * 4 + 1] << "f, "
+                    << input2_32f[i * 4 + 2] << "f, "
+                    << input2_32f[i * 4 + 3] << "f,\n";
+    }
+#endif
+
     OCIO::ConstFixedFunctionOpDataRcPtr funcData2
         = std::make_shared<OCIO::FixedFunctionOpData>(OCIO::FixedFunctionOpData::ACES_TONESCALE_COMPRESS_20_INV,
                                                       params);
@@ -871,35 +898,35 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_gamut_map_20)
 
     const float expected_32f[num_samples*4] = {
         // ACEScg primaries and secondaries scaled by 4
-        107.831291199f, 174.252944946f, 25.025119781f, 1.0f,
-        168.028198242f, 118.224960327f, 106.183464050f, 1.0f,
-        140.030105591f, 127.177192688f, 147.056488037f, 1.0f,
-        156.512435913f, 73.218856812f, 192.204727173f, 1.0f,
-        79.378631592f, 72.613555908f, 268.442108154f, 1.0f,
-        133.827835083f, 149.929809570f, 341.715240479f, 1.0f,
+        107.829742432f, 174.270156860f, 25.025110245f,  1.0f,
+        168.028274536f, 118.227561951f, 106.183448792f, 1.0f,
+        140.030166626f, 127.184478760f, 147.056488037f, 1.0f,
+        156.512435913f, 73.219184875f,  192.204727173f, 1.0f,
+        79.378555298f,  72.608604431f,  268.442108154f, 1.0f,
+        133.827941895f, 149.930618286f, 341.715240479f, 1.0f,
         // OCIO test values
-        18.194000244f, 33.312938690f, 4.173166752f, 0.5f,
-        80.413116455f, 21.309329987f, 332.159759521f, 1.0f,
-        83.467437744f, 37.305160522f, 182.925750732f, 0.0f,
+        18.193992615f,  33.313068390f,  4.173158169f,   0.5f,
+        80.413116455f,  21.309329987f,  332.159759521f, 1.0f,
+        83.467445374f,  37.305030823f,  182.925750732f, 0.0f,
         // ColorChecker24 (SMPTE 2065-1 2021)
-        27.411962509f, 13.382793427f, 38.146591187f, 1.0f,
-        59.987670898f, 14.391893387f, 39.841842651f, 1.0f,
-        43.298923492f, 12.199877739f, 249.107116699f, 1.0f,
-        31.489658356f, 14.075142860f, 128.878036499f, 1.0f,
-        50.749198914f, 12.731814384f, 285.658966064f, 1.0f,
-        64.728637695f, 18.593795776f, 179.324264526f, 1.0f,
-        53.399448395f, 37.394428253f, 50.924011230f, 1.0f,
-        34.719596863f, 21.616765976f, 271.008331299f, 1.0f,
-        43.910709381f, 36.788166046f, 13.975610733f, 1.0f,
-        23.196525574f, 15.118361473f, 317.544250488f, 1.0f,
-        63.348674774f, 33.283493042f, 119.145133972f, 1.0f,
-        64.908889771f, 35.371044159f, 70.842193604f, 1.0f,
-        24.876916885f, 23.143167496f, 273.229034424f, 1.0f,
-        44.203376770f, 28.918329239f, 144.154159546f, 1.0f,
-        32.824352264f, 43.447864532f, 17.892255783f, 1.0f,
-        75.830871582f, 39.872474670f, 90.752044678f, 1.0f,
-        45.823104858f, 34.652038574f, 348.832092285f, 1.0f,
-        43.635551453f, 21.629474640f, 218.454376221f, 1.0f,
+        27.411962509f,  13.382769585f,  38.146659851f,  1.0f,
+        59.987674713f,  14.391894341f,  39.841842651f,  1.0f,
+        43.298919678f,  12.199877739f,  249.107116699f, 1.0f,
+        31.489658356f,  14.075142860f,  128.878036499f, 1.0f,
+        50.749198914f,  12.731814384f,  285.658966064f, 1.0f,
+        64.728637695f,  18.593795776f,  179.324264526f, 1.0f,
+        53.399448395f,  37.394428253f,  50.924011230f,  1.0f,
+        34.719596863f,  21.616765976f,  271.008331299f, 1.0f,
+        43.910713196f,  36.788166046f,  13.975610733f,  1.0f,
+        23.196525574f,  15.118354797f,  317.544281006f, 1.0f,
+        63.348674774f,  33.283493042f,  119.145133972f, 1.0f,
+        64.908882141f,  35.371044159f,  70.842193604f,  1.0f,
+        24.876911163f,  23.143159866f,  273.228973389f, 1.0f,
+        44.203376770f,  28.918329239f,  144.154159546f, 1.0f,
+        32.824356079f,  43.447875977f,  17.892261505f,  1.0f,
+        75.830871582f,  39.872474670f,  90.752044678f,  1.0f,
+        45.823112488f,  34.652069092f,  348.832092285f, 1.0f,
+        43.635547638f,  21.629518509f,  218.454376221f, 1.0f,
     };
 
     OCIO::FixedFunctionOpData::Params params = {
@@ -916,6 +943,16 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_gamut_map_20)
                        funcData,
                        1e-5f,
                        __LINE__);
+#if DUMP_RESULTS
+    std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
+    for (unsigned i = 0; i < num_samples; ++i)
+    {
+        std::cout   << input2_32f[i * 4 + 0] << "f, "
+                    << input2_32f[i * 4 + 1] << "f, "
+                    << input2_32f[i * 4 + 2] << "f, "
+                    << input2_32f[i * 4 + 3] << "f,\n";
+    }
+#endif
 
     OCIO::ConstFixedFunctionOpDataRcPtr funcData2
         = std::make_shared<OCIO::FixedFunctionOpData>(OCIO::FixedFunctionOpData::ACES_GAMUT_COMPRESS_20_INV,
diff --git a/tests/cpu/transforms/BuiltinTransform_tests.cpp b/tests/cpu/transforms/BuiltinTransform_tests.cpp
index e46bd74fe6..5ffd0ce06b 100644
--- a/tests/cpu/transforms/BuiltinTransform_tests.cpp
+++ b/tests/cpu/transforms/BuiltinTransform_tests.cpp
@@ -124,7 +124,8 @@ void ValidateValues(const char * prefixMsg, T in, T out, T errorThreshold, int l
     // Using rel error with a large minExpected value of 1 will transition
     // from absolute error for expected values < 1 and
     // relative error for values > 1.
-    if (!OCIO::EqualWithSafeRelError(in, out, errorThreshold, T(1.)))
+    T computedError{};
+    if (!OCIO::EqualWithSafeRelError(in, out, errorThreshold, T(1.), &computedError))
     {
         std::ostringstream errorMsg;
         errorMsg.precision(std::numeric_limits<T>::max_digits10);
@@ -132,7 +133,11 @@ void ValidateValues(const char * prefixMsg, T in, T out, T errorThreshold, int l
         {
             errorMsg << prefixMsg << ": ";
         }
-        errorMsg << "value = " << in << " but expected = " << out;
+        errorMsg << " - Values: " << in << " expected: " << out;
+        errorMsg << " - Error: " << computedError << " ("
+                 << std::setprecision(3) << computedError / errorThreshold;
+        errorMsg << "x of Threshold: " << std::setprecision(6) << errorThreshold
+                 << ")";
         OCIO_CHECK_ASSERT_MESSAGE_FROM(0, errorMsg.str(), lineNo);
     }
 }
@@ -328,7 +333,7 @@ namespace
 using Values = std::vector<float>;
 using AllValues = std::map<std::string, std::tuple<float, Values, Values>>;
 
-void ValidateBuiltinTransform(const char * style, const Values & in, const Values & out, float errorThreshold, int lineNo)
+void ValidateBuiltinTransform(const char * style, const Values & in, const Values & out, float errorThreshold, int lineNo, Values &results)
 {
     OCIO::BuiltinTransformRcPtr builtin = OCIO::BuiltinTransform::Create();
     OCIO_CHECK_NO_THROW_FROM(builtin->setStyle(style), lineNo);
@@ -348,16 +353,16 @@ void ValidateBuiltinTransform(const char * style, const Values & in, const Value
 
     OCIO::PackedImageDesc inDesc((void *)&in[0], long(in.size() / 3), 1, 3);
 
-    Values vals(in.size(), -1.0f);
-    OCIO::PackedImageDesc outDesc((void *)&vals[0], long(vals.size() / 3), 1, 3);
+    results = Values(in.size(), -1.0f);
+    OCIO::PackedImageDesc outDesc((void *)&results[0], long(results.size() / 3), 1, 3);
 
     OCIO_CHECK_NO_THROW_FROM(cpu->apply(inDesc, outDesc), lineNo);
 
     for (size_t idx = 0; idx < out.size(); ++idx)
     {
         std::ostringstream oss;
-        oss << style << ": for index = " << idx << " with threshold = " << errorThreshold;
-        ValidateValues(oss.str().c_str(), vals[idx], out[idx], errorThreshold, lineNo);
+        oss << style << ": for index = " << idx;
+        ValidateValues(oss.str().c_str(), results[idx], out[idx], errorThreshold, lineNo);
     }
 }
 
@@ -458,100 +463,99 @@ AllValues UnitTestValues
         { 1.0e-6f,
         { 0.5f, 0.4f, 0.3f }, { 0.22214814f,     0.21179835f,     0.15639816f } } },
 
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.26260212f, 0.25207470f, 0.20617338f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.26260212f, 0.25207472f, 0.20617332f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.16253406f, 0.15513624f, 0.12449740f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.20592399f, 0.19440515f, 0.15028581f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.41039306f, 0.38813826f, 0.30191866f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.46536570f, 0.43852836f, 0.33688113f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.51225936f, 0.48264506f, 0.37060050f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.55653524f, 0.51967940f, 0.38678724f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.41039258f, 0.38813800f, 0.30191845f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.46536540f, 0.43852820f, 0.33688095f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.51225930f, 0.48264477f, 0.37060022f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.55653550f, 0.51967950f, 0.38678730f } } },
-
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC709-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25147703f, 0.24029444f, 0.18221131f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25373828f, 0.24245512f, 0.18384966f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25712878f, 0.24569483f, 0.18630630f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25373834f, 0.24245517f, 0.18384990f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-XYZ-E_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.26332240f, 0.25161302f, 0.19079340f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.15705064f, 0.14920068f, 0.11100890f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D60-in-XYZ-E_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.20469205f, 0.19229384f, 0.13782679f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.39655724f, 0.37322620f, 0.26917280f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.44968104f, 0.42165324f, 0.30032730f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.49499428f, 0.46407104f, 0.33038715f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-P3-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.53778990f, 0.49960223f, 0.34477120f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.40185606f, 0.37821326f, 0.27276948f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.45568960f, 0.42728746f, 0.30434040f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.50160843f, 0.47027197f, 0.33480182f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.54497580f, 0.50627790f, 0.34937808f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.40185624f, 0.37821335f, 0.27276933f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.45568994f, 0.42728750f, 0.30434027f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.50160870f, 0.47027220f, 0.33480210f } } },
-    { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020-D60-in-REC2020-D65_2.0",
-        { 1.0e-6f,
-        { 0.5f, 0.4f, 0.3f }, { 0.54497580f, 0.50627780f, 0.34937814f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.26216024f, 0.25204185f, 0.20722267f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.26216024f, 0.25204197f, 0.20722273f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.16215305f, 0.15510881f, 0.12539130f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.20546328f, 0.19437392f, 0.15134005f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.40945035f, 0.38807374f, 0.30408621f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.46440443f, 0.43846416f, 0.33907151f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.51138031f, 0.48258632f, 0.37260318f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.55570847f, 0.51962745f, 0.38863316f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.40945050f, 0.38807371f, 0.30408630f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.46440458f, 0.43846416f, 0.33907148f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.51138067f, 0.48258659f, 0.37260354f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.55570871f, 0.51962751f, 0.38863319f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC709-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.25105247f, 0.24027000f, 0.18314247f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.25330988f, 0.24243048f, 0.18478924f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.25669453f, 0.24566977f, 0.18725836f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.25331008f, 0.24243063f, 0.18478930f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-XYZ-E_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.26287800f, 0.25158763f, 0.19176911f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.15668137f, 0.14918011f, 0.11180915f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D60-in-XYZ-E_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.20423311f, 0.19227016f, 0.13879779f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.39564425f, 0.37317851f, 0.27111340f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.44875023f, 0.42160600f, 0.30228844f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.49414319f, 0.46402797f, 0.33218032f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-P3-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.53698927f, 0.49956405f, 0.34642375f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.40093076f, 0.37816489f, 0.27473599f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.45474637f, 0.42723945f, 0.30632755f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.50074583f, 0.47022825f, 0.33661887f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.54416442f, 0.50623918f, 0.35105261f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.40093106f, 0.37816498f, 0.27473608f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.45474690f, 0.42723969f, 0.30632779f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.50074631f, 0.47022864f, 0.33661926f } } },
+     { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020-D60-in-REC2020-D65_2.0",
+        { 1.0e-4f,
+        { 0.5f, 0.4f, 0.3f }, { 0.54416472f, 0.50623930f, 0.35105285f } } },
 
     { "APPLE_LOG_to_ACES2065-1",
         { 1.0e-6f,
@@ -669,6 +673,10 @@ OCIO_ADD_TEST(Builtins, validate)
 {
     OCIO::ConstBuiltinTransformRegistryRcPtr reg = OCIO::BuiltinTransformRegistry::Get();
 
+#if DUMP_RESULTS
+    std::ostringstream ss;
+#endif
+
     for (size_t index = 0; index < reg->getNumBuiltins(); ++index)
     {
         const char * name = reg->getBuiltinStyle(index);
@@ -696,10 +704,35 @@ OCIO_ADD_TEST(Builtins, validate)
         }
         else
         {
-            ValidateBuiltinTransform(name, std::get<1>(values), std::get<2>(values), std::get<0>(values), __LINE__);
+            Values results;
+            ValidateBuiltinTransform(name, std::get<1>(values), std::get<2>(values), std::get<0>(values), __LINE__, results);
+
+#if DUMP_RESULTS
+            ss << std::defaultfloat;
+            ss << "    { \"" << name << "\",\n";
+            ss << "        { " << std::get<0>(values) << "f,\n        { ";
+            int comma = 0;
+            for (auto val : std::get<1>(values)) 
+            {
+                ss << (comma++ ? ", " : "") << val << "f";
+                comma = true;
+            }
+            ss << " },\n        { ";
+            ss << std::setprecision(8) << std::fixed;
+            comma = 0;
+            for (auto val : results) 
+            {
+                ss << (comma++ ? ", " : "") << val << "f";
+            }
+            ss << " } } },\n ";
+#endif
         }
     }
 
+#if DUMP_RESULTS
+    std::cout << ss.str() << std::endl;
+#endif
+
     // The above checks if a test values is missing, but not if there are test values
     // that don't have an associated built-in.
     OCIO_CHECK_EQUAL(UnitTestValues.size(), reg->getNumBuiltins());

From 35a9fe4fe78c45452d2c6472f1e533dc4a8962f2 Mon Sep 17 00:00:00 2001
From: "cuneyt.ozdas" <cuneyt.ozdas@autodesk.com>
Date: Thu, 6 Mar 2025 17:28:35 -0800
Subject: [PATCH 03/11] Address GPU unit test failures (#2123)

* - Weights for cos(3h) and sin(h) in chroma_compress_norm() look wrong. Fixing the weights makes the GPU tests pass now (except for the inverse output transform, which seems to have a separate issue).
- If the new weights are correct, I'll need to update the CPU test target values too.

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>

* - Updating the expected values in the CPU tests

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>

* - The remaining GPU test failures were caused by a simple typo where we were passing h instead of J to ocio_tonescale_inv() function. With the fix all the unit tests are happy now.

- Since we decided not to include any SIMD implementation in this version, I removed the conditional code paths and left the current SSE & AVX implementations commented out for future guidance.

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>

---------

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>
---
 .../ops/fixedfunction/ACES2/Transform.cpp     |  27 ++--
 .../ops/fixedfunction/FixedFunctionOpGPU.cpp  |   8 +-
 .../FixedFunctionOpCPU_tests.cpp              | 124 +++++++++---------
 .../cpu/transforms/BuiltinTransform_tests.cpp |  94 ++++++++-----
 4 files changed, 141 insertions(+), 112 deletions(-)

diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
index f72526f961..68161def88 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
@@ -8,19 +8,6 @@
 #include <cmath>
 #include <limits>
 
-// Disabling SIMD for now to do scalar math verification.
-// FIXME: Re-enable SIMD and verify SIMD math is correct.
-#undef OCIO_USE_SSE2
-#undef OCIO_USE_AVX
-
-#if OCIO_USE_SSE2
-#include "SSE2.h"
-#endif
-
-#if OCIO_USE_AVX
-#include "AVX.h"
-#endif
-
 namespace OCIO_NAMESPACE
 {
 
@@ -265,6 +252,7 @@ f3 JMh_to_RGB(const f3 &JMh, const JMhParams &p)
 // Tonescale / Chroma compress
 //
 
+/*
 //TODO: move to header
 // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
 // https://gcc.godbolt.org/#compilers:!((compiler:g6,options:%27-xc+-O3+-Wall+-fverbose-asm+-march%3Dhaswell+-mno-avx%27,sourcez:MQSwdgxgNgrgJgUwAQB4QFt3gC4CdwB0AFgHwBQoksiqAztnDseWQPStLZEi1I9JQQAa2QBZAIa5x2AOS0ANEgBGMbJwDuCcSLicA9kgh70ABxBRk2A0oTZsCXIb2ICbDtHFgA5khhgAZnq42H7SCFAAnkiIECCIvFa%2BtMjoegBu4ia0rLREMP4muuD0Wrp6/kipaURQWWT%2BUHrSSES0MOgA%2BlkdtMkAjAAUHR3ofQBMABxIaQCUSADeSEvLK6tr62vs00gAvEgA2kgAIkgAwkgAPkgAQkgAgkgAumQrw6OTSLn5y3sjnV/%2BBoILq0AZpRTgpAdUSiDoAZQAEgBVABiKIAMgBRAZjRQAZkUAAZFH0ZjMANxLLaHc4nK63B7PV4jcZTNroXhLX6YDriOBwEFgxQAimbDjs3h7Q5HADU51OMrpNxlDzuMtuTOWAI2XKhPKqCBqgoBwvatFFqy2B2W5yWStl8sVTypHAsai4yCMpnMDiQ4jSejifsq6WUUVwCBgtHAPgBLy1Zp1uz1nT5At6Awlwry/gp8aWEZCuDArx5EDS2F6HX8eLGmbNeYAvmR6o1mq12iCesk8UMWR9Zgt81D%2B2yc8m/iN0oa4DATILZpT1lslLgmnAIOJ6Ehwgh0AgwJWkAS%2BvokLjCcO3qzPonuan%2BQvs/k8yttet71O0oaoMac6aOVFLZuC8IgWnEKAKgAWhIAQ9HUcDIOHCUNk/NNu3rDln1zclh0LGBiw2Sdy0rWhq1rTDzVw5sWy2Wh0AgixHCMGhowALwQRQVDUKMYEYqJaEaTRHD8CxeicRAoPY5B%2BHEJATDXJQLHQVsmjUDtOm6Ig0z7d4pkHeYr1HW8OQnHltMfLIhWmV8E1Mj8Uw6CyBSsrMJVsgtbAIkt1mIisqxrOt3OoltQH8RAKmGO4ADUAA1hlU9t2TGABWAA2Lt/QAD101K0psodmT06YhN1P48o6Td6CyCqslZMEPJHYrqhAUCzPQCqECyvBxAgbB/FZJ8kFJSlgNasDWSM5qb3a3lLNBNIhIhECiAtZYtjTTgiGQUrJpWfDCI0rtegQXs0lZNakw2sAijAQQwEsbbPh7aYHGjPQwEUdRuAgMD%2BD0ExsAwCCkECRwYti4ctgGMADHULd7Fu%2BhcBgPqQA%2BhQ/Ruv0oCgJByi2vc/QjQmkAAFluLAwAwdoZjIZtgAPRh/BbNx2HZrY4D0GAlIQMguZ5iwWnZLp02SOtr0mXQ0jgOZFiTBWtjSZNDluK5GSmm8/AisqeW1hB/HABAXNBS6FY2TmPpkNR1CCXAIi%2B5APsiJJkDTOEjkUB7v0cKxUe4bxSa8SQlHELxkCUEAj3UKOwM8JAUQABT9flNY%2BAFsFMT8DSNKz9f8RRiIRwoF1li0tg9En%2BFhkNqlqOA04mXR30/KrsG6QpMxzTOTEag6fL8ysBX8NKyaGHl0NoOAwTgbCyRC2iOY5vhD1rMgcGFzsEDMWtuwQCXRxAJAsrligQHCg2RwhhKitZI/uFH3XOj8ExeqEbgOm3kBR4GLLFBPpcOoth4iggDBwnhdCwzAFBeIeBUZA2/PcOKAhbC8CjJ8f0bswz2CQDHLg3M1APWNjGYMVQKDhGSI3e%2B38yZPx6DmIEn8d51j/lCGE8JkRoixAMPoRJ8SKDGPPchN1z5UJMo/NCj4v4UQfmTf%2BjVJYTGobWOhAIgSNCYSAPoaVMKj0LuwxEqIMTYl4UgYkx4BHzxWFsOE8MTCk1KtgW2O5lIHkrGI9kKjJECmkUFdoeiWggFrH3LyhFB7RkGtGCinjBGAOsRwOEcJMRjFrg3eJ/dHJdR6n1TRMioBEjN
jYpJcidzfhLPgmAbUo5OG9GJM8VQkb2D5PjCoclBD2CkHjEwWTcC6FwLWeQWVMDyEvDRDeR1fHdh7D0ISPQGK4xYggXSd9j4zEMisLKs1nK5JYfI3CGytnoV8b/PZeFQklnCVoyYPQgksKbKzReKdMiI1BmudASAJBSFkLwCGLQ7AmAAFzsHoG/acuAGjwQIF6VgABHGACB6DozANkPotY%2BgAE4%2BhkzSqwIg8EoJWCku0KCHV0pQTxfgNiH1sD8VYKisYDKyZ9EJMAeljLmWJXUiLboDEvnLI%2BCfQqywjDIrUIokyABFPiug9hZX2cKjGYrjKNCOHxPGex2RSr5PKl0gTVUgz2AMJAUFFAmuPniJAMpj4AHZ/4pKtVlfKdMVgiu3OK7g%2Br1WORzr%2BVy7QtWz0ldKxqdF2ieuTEas1ZqsqngdRah1KVLU2v/oSJNWV7XH1oQ6p1w5XVKuKuycN3jBQqrVYoD1aqQ2ugMIa41pr63H1TQ6jNWUs3HxzS6xVTUbyNGTIWtVOqkBjQjXW0d0bY3mrTYmh11qkDOoVaK7tHxuCzTURYY0Yay0mU9USGNVaTIjqjQ2rKTbj4TvTWm%2BNmap1pvyjOuduau3ivZLNSeoJGjlpAOXDg6EsjRANlob5hgoCeC8HIT4DChYAyBlgDiuAzlFguWWfyZFAqYXuUAA%3D%3D)),filterAsm:(binary:!t,commentOnly:!t,directives:!t,intel:!t,labels:!t),version:3
@@ -279,6 +267,7 @@ inline float hsum_ps_sse1(__m128 v) {                              // v    = [
     return    _mm_cvtss_f32(sums);                                 // A+B+C+D
 }
 #endif
+
 #ifdef OCIO_USE_AVX
 inline float hsum256_ps_avx(__m256 v) {         // v     = [   H   G |   F   E |   D   C |   B   A ]
     __m128 vlow  = _mm256_castps256_ps128(v);   // vlow  = [   D   C |   B   A ]
@@ -287,7 +276,8 @@ inline float hsum256_ps_avx(__m256 v) {         // v     = [   H   G |   F   E |
     return hsum_ps_sse1(v128);
 }
 #endif
-
+#endif
+*/
 
 float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_scale)
 {
@@ -302,9 +292,11 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s
         sin_hr1, sin_hr2, sin_hr3, 1.0f
     };
     alignas(AVX_ALIGNMENT) static constexpr float weights[8] = { // TODO: investigate reordering of the entries so we are summing equal magnitude values first?
-        11.34072f, 16.46899f, 14.66441f,  0.0f,
-        4.66441f,  -6.37224f,  9.19364f, 77.12896f
+        11.34072f, 16.46899f, 7.88380f, 0.0f,
+        14.66441f, -6.37224f, 9.19364f, 77.12896f
     };
+
+    /*
     // TODO: benchmark this across multiple platforms to justify the multiple code paths.
 #if OCIO_USE_SSE2
 #if OCIO_USE_AVX
@@ -324,6 +316,7 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s
     const float M = hsum_ps_sse1(t3);
 #endif
 #else
+    */
     const float M = weights[0] * trig_angles_hr[0] +
                     weights[1] * trig_angles_hr[1] +
                     weights[2] * trig_angles_hr[2] +
@@ -331,7 +324,9 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s
                     weights[5] * trig_angles_hr[5] +
                     weights[6] * trig_angles_hr[6] +
                     weights[7];
+    /*
 #endif
+    */
 
     return M * chroma_compress_scale; // TODO: is it worth prescaling the above weights?
 }
diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
index ea6dced0c0..c08fd74613 100644
--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
@@ -663,6 +663,7 @@ std::string _Add_Tonescale_func(
     }
 
     ss.newLine() << ss.floatDecl("J_ts") << " = " << ACES2::J_scale << " * pow((F_L_Y / ( " << ACES2::cam_nl_offset << " + F_L_Y)) * " << p.inv_A_w_J << ", " << p.cz << ");";
+    // TODO: copysign is missing here. /coz
     ss.newLine() << "return J_ts;";
 
     ss.dedent();
@@ -682,14 +683,15 @@ void _Add_ChromaCompressionNorm_Shader(
     ss.newLine() << "{";
     ss.indent();
 
-    ss.newLine() << ss.floatDecl("cos_hr2") << " = cos_hr * cos_hr - sin_hr * sin_hr;";
+    // TODO: optimization: can bake weights into terms and convert dotprods to addition. /coz
+    ss.newLine() << ss.floatDecl("cos_hr2") << " = 2.0 * cos_hr * cos_hr - 1.0;";
     ss.newLine() << ss.floatDecl("sin_hr2") << " = 2.0 * cos_hr * sin_hr;";
     ss.newLine() << ss.floatDecl("cos_hr3") << " = 4.0 * cos_hr * cos_hr * cos_hr - 3.0 * cos_hr;";
     ss.newLine() << ss.floatDecl("sin_hr3") << " = 3.0 * sin_hr - 4.0 * sin_hr * sin_hr * sin_hr;";
     ss.newLine() << ss.float3Decl("cosines") << " = " <<  ss.float3Const("cos_hr", "cos_hr2", "cos_hr3") <<";";
     ss.newLine() << ss.float3Decl("cosine_weights") << " = " <<  ss.float3Const(11.34072 * c.chroma_compress_scale,
                                                                                 16.46899 * c.chroma_compress_scale,
-                                                                                7.88380 * c.chroma_compress_scale) <<";";
+                                                                                 7.88380 * c.chroma_compress_scale) <<";";
     ss.newLine() << ss.float3Decl("sines") << " = " <<  ss.float3Const("sin_hr", "sin_hr2", "sin_hr3") <<";";
     ss.newLine() << ss.float3Decl("sine_weights") << " = " <<  ss.float3Const(14.66441 * c.chroma_compress_scale,
                                                                               -6.37224 * c.chroma_compress_scale,
@@ -1396,7 +1398,7 @@ void Add_ACES_OutputTransform_Inv_Shader(
     ss.newLine() << "";
     ss.newLine() << "// Add ToneScale and ChromaCompress (inv)";
     ss.newLine() << "";
-    ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".b);";
+    ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".r);";
     ss.newLine() << "{";
     ss.indent();
         _Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, c);
diff --git a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
index bd8536f33a..aee030f53e 100644
--- a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
+++ b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
@@ -467,41 +467,41 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)
 
     const float expected_32f[num_samples*4] = {
         // ACEScg primaries and secondaries scaled by 4
-        4.966040611f, -0.032990534f, 0.041587759f, 1.0f,
-        3.969455719f, 3.825795889f, -0.056159109f, 1.0f,
-        -0.075445443f, 3.689064741f, 0.270243138f, 1.0f,
-        -0.095422804f, 3.650515079f, 3.459970713f, 1.0f,
-        -0.029242843f, 0.196083903f, 2.797096968f, 1.0f,
-        4.900825977f, -0.064415932f, 3.838272572f, 1.0f,
+        4.966013432f, -0.033002287f, 0.041583523f, 1.0f,
+        3.969460726f, 3.825797558f, -0.056160748f, 1.0f,
+        -0.075460039f, 3.689072609f, 0.270235062f, 1.0f,
+        -0.095436633f, 3.650521517f, 3.459975719f, 1.0f,
+        -0.028881177f, 0.196473420f, 2.796123743f, 1.0f,
+        4.900828362f, -0.064385533f, 3.838270903f, 1.0f,
         // OCIO test values
-        0.096831776f, -0.001114858f, 0.018976377f, 0.5f,
-        0.811647296f, 0.478211939f, 0.816507518f, 1.0f,
-        0.110244252f, 0.919241786f, 0.726084292f, 0.0f,
+        0.096890487f, -0.001135427f, 0.018971475f, 0.5f,
+        0.809613585f, 0.479857147f, 0.814239979f, 1.0f,
+        0.107417941f, 0.920530438f, 0.726379037f, 0.0f,
         // ColorChecker24 (SMPTE 2065-1 2021)
-        0.115581684f, 0.050785132f, 0.030158322f, 1.0f,
-        0.482630610f, 0.301559567f, 0.228200614f, 1.0f,
-        0.097509719f, 0.160682827f, 0.278755993f, 1.0f,
-        0.071118668f, 0.107350536f, 0.035066456f, 1.0f,
-        0.206827119f, 0.198065758f, 0.376981646f, 1.0f,
-        0.197157621f, 0.480333209f, 0.393290222f, 1.0f,
-        0.570664287f, 0.197219044f, 0.042163782f, 1.0f,
-        0.045591675f, 0.069720201f, 0.292005479f, 1.0f,
-        0.425108939f, 0.083108872f, 0.102091998f, 1.0f,
-        0.059560396f, 0.022268835f, 0.091132581f, 1.0f,
-        0.360384226f, 0.478674322f, 0.086890966f, 1.0f,
-        0.691989481f, 0.372686356f, 0.070826821f, 1.0f,
-        0.012042155f, 0.021904279f, 0.198501319f, 1.0f,
-        0.076645926f, 0.256147027f, 0.060666814f, 1.0f,
-        0.300039411f, 0.023424838f, 0.030365985f, 1.0f,
-        0.803476214f, 0.596933603f, 0.085341305f, 1.0f,
-        0.388712883f, 0.079724148f, 0.245922253f, 1.0f,
-        0.011061139f, 0.196086824f, 0.307065904f, 1.0f,
-        0.921007156f, 0.921683431f, 0.912948132f, 1.0f,
-        0.590166390f, 0.588430583f, 0.587841213f, 1.0f,
-        0.337742388f, 0.337684810f, 0.338159621f, 1.0f,
-        0.169266224f, 0.169178173f, 0.169558540f, 1.0f,
-        0.058399219f, 0.059382606f, 0.060239695f, 1.0f,
-        0.012618840f, 0.012950940f, 0.013591323f, 1.0f,
+        0.115475342f, 0.050812997f, 0.030212998f, 1.0f,
+        0.484880149f, 0.301042914f, 0.226769030f, 1.0f,
+        0.098463453f, 0.160814837f, 0.277010798f, 1.0f,
+        0.071130276f, 0.107334509f, 0.035097614f, 1.0f,
+        0.207111374f, 0.198474824f, 0.375326097f, 1.0f,
+        0.195447117f, 0.481112540f, 0.393299103f, 1.0f,
+        0.571913302f, 0.196873263f, 0.041634843f, 1.0f,
+        0.045791976f, 0.069875412f, 0.291233569f, 1.0f,
+        0.424848884f, 0.083199054f, 0.102153927f, 1.0f,
+        0.059589352f, 0.022219239f, 0.091246955f, 1.0f,
+        0.360364884f, 0.478741497f, 0.086726815f, 1.0f,
+        0.695661962f, 0.371994466f, 0.068298057f, 1.0f,
+        0.011806240f, 0.021665439f, 0.199594870f, 1.0f,
+        0.076526135f, 0.256237596f, 0.060564563f, 1.0f,
+        0.300064713f, 0.023416281f, 0.030360531f, 1.0f,
+        0.805483222f, 0.596904039f, 0.082996234f, 1.0f,
+        0.388385385f, 0.079899333f, 0.245818958f, 1.0f,
+        0.010951802f, 0.196106046f, 0.307181537f, 1.0f,
+        0.921020269f, 0.921707630f, 0.912857533f, 1.0f,
+        0.590191603f, 0.588424563f, 0.587825298f, 1.0f,
+        0.337743223f, 0.337686002f, 0.338155240f, 1.0f,
+        0.169266403f, 0.169178575f, 0.169557154f, 1.0f,
+        0.058346011f, 0.059387885f, 0.060296256f, 1.0f,
+        0.012581199f, 0.012947144f, 0.013654212f, 1.0f,
         // Spectrally non-selective 18 % reflecting diffuser
         0.145115077f, 0.145115703f, 0.145115480f, 1.0f,
         // Perfect reflecting diffuser
@@ -524,7 +524,7 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)
                        __LINE__);
 
 #if DUMP_RESULT
-    std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
+    std::cout << "aces_output_transform_20 results: \n" << std::setprecision(9) << std::fixed;
     for (unsigned i = 0; i < num_samples; ++i) 
     {
         std::cout   << input2_32f[i * 4 + 0] << "f, "
@@ -794,35 +794,35 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)
 
     const float expected_32f[num_samples*4] = {
         // ACEScg primaries and secondaries scaled by 4
-        110.702453613f, 211.242279053f, 25.025110245f, 1.0f,
-        168.016815186f, 129.795593262f, 106.183448792f, 1.0f,
-        140.814849854f, 193.450653076f, 147.056488037f, 1.0f,
-        156.429504395f, 110.935348511f, 192.204727173f, 1.0f,
-        80.456558228f, 98.743263245f, 268.442108154f, 1.0f,
-        135.172225952f, 175.572814941f, 341.715240479f, 1.0f,
+        110.702453613f, 211.251770020f, 25.025110245f,  1.0f,
+        168.016815186f, 129.796249390f, 106.183448792f, 1.0f,
+        140.814849854f, 193.459197998f, 147.056488037f, 1.0f,
+        156.429504395f, 110.938423157f, 192.204727173f, 1.0f,
+        80.456558228f, 98.490531921f, 268.442108154f,   1.0f,
+        135.172225952f, 175.559326172f, 341.715240479f, 1.0f,
         // OCIO test values
-        18.187316895f, 33.767055511f, 4.173158169f, 0.5f,
-        80.413101196f, 21.547714233f, 332.159759521f, 1.0f,
-        83.447883606f, 37.597621918f, 182.925750732f, 0.0f,
+        18.187316895f, 33.819190979f, 4.173158169f,   0.5f,
+        80.413101196f, 21.309329987f, 332.159759521f, 1.0f,
+        83.447883606f, 37.852523804f, 182.925750732f, 0.0f,
         // ColorChecker24 (SMPTE 2065-1 2021)
-        27.411968231f, 13.410449982f, 38.146659851f, 1.0f,
-        59.987659454f, 14.175936699f, 39.841842651f, 1.0f,
-        43.298923492f, 12.367712021f, 249.107116699f, 1.0f,
-        31.489654541f, 14.086299896f, 128.878036499f, 1.0f,
-        50.749198914f, 12.862657547f, 285.658966064f, 1.0f,
-        64.728637695f, 18.433788300f, 179.324264526f, 1.0f,
-        53.399444580f, 37.239288330f, 50.924011230f, 1.0f,
-        34.719596863f, 21.685737610f, 271.008331299f, 1.0f,
-        43.910709381f, 36.826980591f, 13.975610733f, 1.0f,
-        23.196529388f, 15.087531090f, 317.544281006f, 1.0f,
-        63.348682404f, 33.255519867f, 119.145133972f, 1.0f,
-        64.908874512f, 34.922687531f, 70.842193604f, 1.0f,
-        24.876913071f, 23.019479752f, 273.228973389f, 1.0f,
-        44.203376770f, 28.884298325f, 144.154159546f, 1.0f,
-        32.824359894f, 43.442367554f, 17.892261505f, 1.0f,
-        75.830871582f, 39.538505554f, 90.752044678f, 1.0f,
-        45.823120117f, 34.710170746f, 348.832092285f, 1.0f,
-        43.597236633f, 23.048465729f, 218.454376221f, 1.0f,
+        27.411968231f, 13.382784843f, 38.146659851f,  1.0f,
+        59.987659454f, 14.391894341f, 39.841842651f,  1.0f,
+        43.298923492f, 12.199877739f, 249.107116699f, 1.0f,
+        31.489654541f, 14.075141907f, 128.878036499f, 1.0f,
+        50.749198914f, 12.731806755f, 285.658966064f, 1.0f,
+        64.728637695f, 18.593791962f, 179.324264526f, 1.0f,
+        53.399444580f, 37.394416809f, 50.924011230f,  1.0f,
+        34.719596863f, 21.616765976f, 271.008331299f, 1.0f,
+        43.910709381f, 36.788166046f, 13.975610733f,  1.0f,
+        23.196529388f, 15.118354797f, 317.544281006f, 1.0f,
+        63.348682404f, 33.283519745f, 119.145133972f, 1.0f,
+        64.908874512f, 35.371063232f, 70.842193604f,  1.0f,
+        24.876913071f, 23.143159866f, 273.228973389f, 1.0f,
+        44.203376770f, 28.918329239f, 144.154159546f, 1.0f,
+        32.824359894f, 43.447853088f, 17.892261505f,  1.0f,
+        75.830871582f, 39.872489929f, 90.752044678f,  1.0f,
+        45.823120117f, 34.652057648f, 348.832092285f, 1.0f,
+        43.597236633f, 23.079071045f, 218.454376221f, 1.0f,
     };
 
     OCIO::FixedFunctionOpData::Params params = {1000.f};
@@ -836,7 +836,7 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)
                        __LINE__);
 
 #if DUMP_RESULTS
-    std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
+    std::cout << "aces_tonescale_compress_20 results: \n" << std::setprecision(9) << std::fixed;
     for (unsigned i = 0; i < num_samples; ++i)
     {
         std::cout   << input2_32f[i * 4 + 0] << "f, "
diff --git a/tests/cpu/transforms/BuiltinTransform_tests.cpp b/tests/cpu/transforms/BuiltinTransform_tests.cpp
index 5ffd0ce06b..8ffc3dd3c3 100644
--- a/tests/cpu/transforms/BuiltinTransform_tests.cpp
+++ b/tests/cpu/transforms/BuiltinTransform_tests.cpp
@@ -17,6 +17,7 @@
 #include "UnitTestUtils.h"
 
 namespace OCIO = OCIO_NAMESPACE;
+#define DUMP_RESULTS 0
 
 
 OCIO_ADD_TEST(BuiltinTransform, creation)
@@ -465,97 +466,128 @@ AllValues UnitTestValues
 
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.26216024f, 0.25204185f, 0.20722267f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.26260215f, 0.25207460f, 0.20617345f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.26216024f, 0.25204197f, 0.20722273f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.26260215f, 0.25207475f, 0.20617352f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.16215305f, 0.15510881f, 0.12539130f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.16253395f, 0.15513620f, 0.12449738f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.20546328f, 0.19437392f, 0.15134005f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.20592400f, 0.19440512f, 0.15028587f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.40945035f, 0.38807374f, 0.30408621f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.41039270f, 0.38813815f, 0.30191854f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.46440443f, 0.43846416f, 0.33907151f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.46536559f, 0.43852845f, 0.33688101f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.51138031f, 0.48258632f, 0.37260318f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.51225948f, 0.48264498f, 0.37060043f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.55570847f, 0.51962745f, 0.38863316f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.55653530f, 0.51967967f, 0.38678783f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.40945050f, 0.38807371f, 0.30408630f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.41039288f, 0.38813818f, 0.30191860f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.46440458f, 0.43846416f, 0.33907148f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.46536580f, 0.43852842f, 0.33688098f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.51138067f, 0.48258659f, 0.37260354f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.51225960f, 0.48264492f, 0.37060046f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.55570871f, 0.51962751f, 0.38863319f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.55653548f, 0.51967967f, 0.38678783f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC709-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25105247f, 0.24027000f, 0.18314247f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.25147712f, 0.24029461f, 0.18221153f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25330988f, 0.24243048f, 0.18478924f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.25373834f, 0.24245527f, 0.18384993f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25669453f, 0.24566977f, 0.18725836f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.25712875f, 0.24569492f, 0.18630651f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.25331008f, 0.24243063f, 0.18478930f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.25373828f, 0.24245520f, 0.18384989f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-XYZ-E_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.26287800f, 0.25158763f, 0.19176911f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.26332238f, 0.25161314f, 0.19079420f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.15668137f, 0.14918011f, 0.11180915f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.15705051f, 0.14920059f, 0.11100878f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D60-in-XYZ-E_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.20423311f, 0.19227016f, 0.13879779f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.20469207f, 0.19229385f, 0.13782671f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.39564425f, 0.37317851f, 0.27111340f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.39655733f, 0.37322620f, 0.26917258f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.44875023f, 0.42160600f, 0.30228844f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.44968122f, 0.42165339f, 0.30032712f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.49414319f, 0.46402797f, 0.33218032f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.49499470f, 0.46407115f, 0.33038712f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-P3-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.53698927f, 0.49956405f, 0.34642375f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.53778988f, 0.49960214f, 0.34477147f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.40093076f, 0.37816489f, 0.27473599f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.40185603f, 0.37821317f, 0.27276924f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.45474637f, 0.42723945f, 0.30632755f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.45568976f, 0.42728746f, 0.30434006f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.50074583f, 0.47022825f, 0.33661887f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.50160873f, 0.47027206f, 0.33480173f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.54416442f, 0.50623918f, 0.35105261f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.54497570f, 0.50627774f, 0.34937829f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.40093106f, 0.37816498f, 0.27473608f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.40185642f, 0.37821338f, 0.27276939f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.45474690f, 0.42723969f, 0.30632779f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.45569009f, 0.42728764f, 0.30434042f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.50074631f, 0.47022864f, 0.33661926f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.50160891f, 0.47027206f, 0.33480188f } } },
      { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020-D60-in-REC2020-D65_2.0",
         { 1.0e-4f,
-        { 0.5f, 0.4f, 0.3f }, { 0.54416472f, 0.50623930f, 0.35105285f } } },
+        { 0.5f, 0.4f, 0.3f },
+        { 0.54497600f, 0.50627792f, 0.34937853f } } },
 
     { "APPLE_LOG_to_ACES2065-1",
         { 1.0e-6f,

From c7832ae8b445161ad3ee24b0a185b360d76a6403 Mon Sep 17 00:00:00 2001
From: "cuneyt.ozdas" <cuneyt.ozdas@autodesk.com>
Date: Thu, 6 Mar 2025 17:29:24 -0800
Subject: [PATCH 04/11] Remove unused code for old gamut table calculations
 (#2124)

Signed-off-by: Kevin Wheatley <kevin.wheatley@framestore.com>
Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>
---
 .../ops/fixedfunction/ACES2/ColorLib.h        | 24 ---------
 .../ops/fixedfunction/ACES2/Transform.cpp     | 50 -------------------
 2 files changed, 74 deletions(-)

diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h b/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h
index 17d5022ee5..426e31f271 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/ColorLib.h
@@ -15,30 +15,6 @@ namespace OCIO_NAMESPACE
 namespace ACES2
 {
 
-inline f3 HSV_to_RGB(const f3 &HSV)
-{
-    const float C = HSV[2] * HSV[1];
-    const float X = C * (1.f - std::abs(std::fmod(HSV[0] * 6.f, 2.f) - 1.f));
-    const float m = HSV[2] - C;
-
-    f3 RGB{};
-    if (HSV[0] < 1.f/6.f) {
-        RGB = {C, X, 0.f};
-    } else if (HSV[0] < 2./6.) {
-        RGB = {X, C, 0.f};
-    } else if (HSV[0] < 3./6.) {
-        RGB = {0.f, C, X};
-    } else if (HSV[0] < 4./6.) {
-        RGB = {0.f, X, C};
-    } else if (HSV[0] < 5./6.) {
-        RGB = {X, 0.f, C};
-    } else {
-        RGB = {C, 0.f, X};
-    }
-    RGB = add_f_f3(m, RGB);
-    return RGB;
-}
-
 inline m33f RGBtoXYZ_f33(const Primaries &C)
 {
     return m33_from_ocio_matrix_array(
diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
index 68161def88..3e23f59a27 100644
--- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp
@@ -849,55 +849,6 @@ Table3D make_uniform_hue_gamut_table(const JMhParams &reach_params, const JMhPar
     return build_cusp_table(hue_table, limiting_RGB_corners, limiting_JMh_corners, params);
 }
 
-Table3D make_gamut_table(const JMhParams &params, float peakLuminance, Table1D& hue_table)
-{
-    Table3D gamutCuspTableUnsorted{};
-    unsigned int minhIndex = 0;
-    for (unsigned int i = 0; i < gamutCuspTableUnsorted.nominal_size; i++)
-    {
-        const float hNorm = float(i) / float(gamutCuspTableUnsorted.nominal_size);
-        const f3 HSV = {hNorm, 1., 1.};
-        const f3 RGB = HSV_to_RGB(HSV);
-        const f3 scaledRGB = mult_f_f3(peakLuminance / reference_luminance, RGB);
-        const f3 JMh = RGB_to_JMh(scaledRGB, params);
-
-        gamutCuspTableUnsorted[i][0] = JMh[0];
-        gamutCuspTableUnsorted[i][1] = JMh[1]  * (1.f + smooth_m * smooth_cusps);
-        gamutCuspTableUnsorted[i][2] = JMh[2];
-        if ( gamutCuspTableUnsorted[i][2] < gamutCuspTableUnsorted[minhIndex][2])
-            minhIndex = i;
-    }
-
-    Table3D gamutCuspTable{};
-    for (unsigned int i = 0; i < gamutCuspTableUnsorted.nominal_size; i++)
-    {
-        gamutCuspTable[i + gamutCuspTable.base_index][0] = gamutCuspTableUnsorted[(minhIndex+i) % gamutCuspTableUnsorted.nominal_size][0];
-        gamutCuspTable[i + gamutCuspTable.base_index][1] = gamutCuspTableUnsorted[(minhIndex+i) % gamutCuspTableUnsorted.nominal_size][1];
-        gamutCuspTable[i + gamutCuspTable.base_index][2] = gamutCuspTableUnsorted[(minhIndex+i) % gamutCuspTableUnsorted.nominal_size][2];
-    }
-
-    // Copy last populated entry to first empty spot
-    gamutCuspTable[gamutCuspTable.lower_wrap_index][0] = gamutCuspTable[gamutCuspTable.last_nominal_index][0];
-    gamutCuspTable[gamutCuspTable.lower_wrap_index][1] = gamutCuspTable[gamutCuspTable.last_nominal_index][1];
-    gamutCuspTable[gamutCuspTable.lower_wrap_index][2] = gamutCuspTable[gamutCuspTable.last_nominal_index][2];
-
-    // Copy first populated entry to last empty spot
-    gamutCuspTable[gamutCuspTable.upper_wrap_index][0] = gamutCuspTable[gamutCuspTable.first_nominal_index][0];
-    gamutCuspTable[gamutCuspTable.upper_wrap_index][1] = gamutCuspTable[gamutCuspTable.first_nominal_index][1];
-    gamutCuspTable[gamutCuspTable.upper_wrap_index][2] = gamutCuspTable[gamutCuspTable.first_nominal_index][2];
-
-    // Wrap the hues, to maintain monotonicity. These entries will fall outside [0.0, hue_limit)
-    gamutCuspTable[gamutCuspTable.lower_wrap_index][2] -= hue_limit;
-    gamutCuspTable[gamutCuspTable.upper_wrap_index][2] += hue_limit;
-
-    // Extract hue table
-    for (unsigned int i = 0; i < gamutCuspTable.total_size; i++)
-    {
-        hue_table[i] = gamutCuspTable[i][2];
-    }
-    return gamutCuspTable;
-}
-
 bool any_below_zero(const f3 &rgb)
 {
     return (rgb[0] < 0. || rgb[1] < 0. || rgb[2] < 0.);
@@ -1437,7 +1388,6 @@ GamutCompressParams init_GamutCompressParams(float peakLuminance, const JMhParam
     params.focus_dist = focus_dist;
     params.lower_hull_gamma_inv = lower_hull_gamma_inv;
     params.gamut_cusp_table = make_uniform_hue_gamut_table(reachParams, limitJMhParams, peakLuminance, tsParams.forward_limit, shParams, params.hue_table);
-    //params.gamut_cusp_table = make_gamut_table(limitJMhParams, peakLuminance, params.hue_table);
     params.hue_linearity_search_range = determine_hue_linearity_search_range(params.gamut_cusp_table);
     make_upper_hull_gamma(params.hue_table, params.gamut_cusp_table, peakLuminance, shParams.limit_J_max, mid_J, focus_dist,
                           lower_hull_gamma_inv, limitJMhParams); //TODO: mess of parameters

From 9d0af3f1fce4bd8715fdd31ebff0c842f8965b52 Mon Sep 17 00:00:00 2001
From: "cuneyt.ozdas" <cuneyt.ozdas@autodesk.com>
Date: Thu, 6 Mar 2025 20:03:13 -0800
Subject: [PATCH 05/11] Minor code cleanup

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>
---
 .../ops/fixedfunction/FixedFunctionOpGPU.cpp  |  1 -
 tests/cpu/UnitTestUtils.h                     |  2 +-
 .../FixedFunctionOpCPU_tests.cpp              | 21 -------------
 .../cpu/transforms/BuiltinTransform_tests.cpp | 30 -------------------
 4 files changed, 1 insertion(+), 53 deletions(-)

diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
index c08fd74613..da769a104c 100644
--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
@@ -663,7 +663,6 @@ std::string _Add_Tonescale_func(
     }
 
     ss.newLine() << ss.floatDecl("J_ts") << " = " << ACES2::J_scale << " * pow((F_L_Y / ( " << ACES2::cam_nl_offset << " + F_L_Y)) * " << p.inv_A_w_J << ", " << p.cz << ");";
-    // TODO: copysign is missing here. /coz
     ss.newLine() << "return J_ts;";
 
     ss.dedent();
diff --git a/tests/cpu/UnitTestUtils.h b/tests/cpu/UnitTestUtils.h
index 315cf258ee..37b584e88c 100644
--- a/tests/cpu/UnitTestUtils.h
+++ b/tests/cpu/UnitTestUtils.h
@@ -93,7 +93,7 @@ inline bool EqualWithSafeRelError(T value,
 
     if (computedError) 
     {
-      *computedError = err;
+        *computedError = err;
     }
     
     return (err <= eps);
diff --git a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
index aee030f53e..9139c516df 100644
--- a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
+++ b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
@@ -835,17 +835,6 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20)
                        1e-5f,
                        __LINE__);
 
-#if DUMP_RESULTS
-    std::cout << "aces_tonescale_compress_20 results: \n" << std::setprecision(9) << std::fixed;
-    for (unsigned i = 0; i < num_samples; ++i)
-    {
-        std::cout   << input2_32f[i * 4 + 0] << "f, "
-                    << input2_32f[i * 4 + 1] << "f, "
-                    << input2_32f[i * 4 + 2] << "f, "
-                    << input2_32f[i * 4 + 3] << "f,\n";
-    }
-#endif
-
     OCIO::ConstFixedFunctionOpDataRcPtr funcData2
         = std::make_shared<OCIO::FixedFunctionOpData>(OCIO::FixedFunctionOpData::ACES_TONESCALE_COMPRESS_20_INV,
                                                       params);
@@ -943,16 +932,6 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_gamut_map_20)
                        funcData,
                        1e-5f,
                        __LINE__);
-#if DUMP_RESULTS
-    std::cout << "Results: \n" << std::setprecision(9) << std::fixed;
-    for (unsigned i = 0; i < num_samples; ++i)
-    {
-        std::cout   << input2_32f[i * 4 + 0] << "f, "
-                    << input2_32f[i * 4 + 1] << "f, "
-                    << input2_32f[i * 4 + 2] << "f, "
-                    << input2_32f[i * 4 + 3] << "f,\n";
-    }
-#endif
 
     OCIO::ConstFixedFunctionOpDataRcPtr funcData2
         = std::make_shared<OCIO::FixedFunctionOpData>(OCIO::FixedFunctionOpData::ACES_GAMUT_COMPRESS_20_INV,
diff --git a/tests/cpu/transforms/BuiltinTransform_tests.cpp b/tests/cpu/transforms/BuiltinTransform_tests.cpp
index 8ffc3dd3c3..b623028ad7 100644
--- a/tests/cpu/transforms/BuiltinTransform_tests.cpp
+++ b/tests/cpu/transforms/BuiltinTransform_tests.cpp
@@ -17,8 +17,6 @@
 #include "UnitTestUtils.h"
 
 namespace OCIO = OCIO_NAMESPACE;
-#define DUMP_RESULTS 0
-
 
 OCIO_ADD_TEST(BuiltinTransform, creation)
 {
@@ -705,10 +703,6 @@ OCIO_ADD_TEST(Builtins, validate)
 {
     OCIO::ConstBuiltinTransformRegistryRcPtr reg = OCIO::BuiltinTransformRegistry::Get();
 
-#if DUMP_RESULTS
-    std::ostringstream ss;
-#endif
-
     for (size_t index = 0; index < reg->getNumBuiltins(); ++index)
     {
         const char * name = reg->getBuiltinStyle(index);
@@ -738,33 +732,9 @@ OCIO_ADD_TEST(Builtins, validate)
         {
             Values results;
             ValidateBuiltinTransform(name, std::get<1>(values), std::get<2>(values), std::get<0>(values), __LINE__, results);
-
-#if DUMP_RESULTS
-            ss << std::defaultfloat;
-            ss << "    { \"" << name << "\",\n";
-            ss << "        { " << std::get<0>(values) << "f,\n        { ";
-            int comma = 0;
-            for (auto val : std::get<1>(values)) 
-            {
-                ss << (comma++ ? ", " : "") << val << "f";
-                comma = true;
-            }
-            ss << " },\n        { ";
-            ss << std::setprecision(8) << std::fixed;
-            comma = 0;
-            for (auto val : results) 
-            {
-                ss << (comma++ ? ", " : "") << val << "f";
-            }
-            ss << " } } },\n ";
-#endif
         }
     }
 
-#if DUMP_RESULTS
-    std::cout << ss.str() << std::endl;
-#endif
-
     // The above checks if a test values is missing, but not if there are test values
     // that don't have an associated built-in.
     OCIO_CHECK_EQUAL(UnitTestValues.size(), reg->getNumBuiltins());

From b7574776626c37bd6471ac953d1aeac9944d7ad4 Mon Sep 17 00:00:00 2001
From: "cuneyt.ozdas" <cuneyt.ozdas@autodesk.com>
Date: Sun, 9 Mar 2025 16:35:46 -0700
Subject: [PATCH 06/11] Adding negative A trap on Aab_to_JMh_Shader() per code
 review

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>
---
 .../ops/fixedfunction/FixedFunctionOpGPU.cpp         | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
index da769a104c..04b2c1d07a 100644
--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
@@ -402,6 +402,16 @@ void _Add_Aab_to_JMh_Shader(
     ss.newLine() << "{";
     ss.indent();
 
+    ss.newLine() << "if (Aab.r <= 0.0)";
+    ss.newLine() << "{";
+    ss.indent();
+    ss.newLine() << "JMh.rgb = " << ss.float3Const(0.0) << ";";
+    ss.dedent();
+    ss.newLine() << "}";
+
+    ss.newLine() << "else";
+    ss.newLine() << "{";
+    ss.indent();
     ss.newLine() << ss.floatDecl("J") << " = " << ACES2::J_scale << " * pow(Aab.r, " << p.cz << ");";
 
     ss.newLine() << ss.floatDecl("M") << " = (J == 0.0) ? 0.0 : sqrt(Aab.g * Aab.g + Aab.b * Aab.b);";
@@ -411,6 +421,8 @@ void _Add_Aab_to_JMh_Shader(
     ss.newLine() << "h = (h < 0.0) ? h + 360.0 : h;";
 
     ss.newLine() << "JMh.rgb = " << ss.float3Const("J", "M", "h") << ";";
+    ss.dedent();
+    ss.newLine() << "}";
 
     ss.dedent();
     ss.newLine() << "}";

From aaf39f2b6d7b64c5b5059130d7f8b73e4e3f8013 Mon Sep 17 00:00:00 2001
From: "cuneyt.ozdas" <cuneyt.ozdas@autodesk.com>
Date: Mon, 10 Mar 2025 11:55:43 -0700
Subject: [PATCH 07/11] Adding copysign to tonescale to make it aligned with
 the CPU implementation.

It's possible that on GPU we may never receive negative J due to prior guarding, but for now aligning with the CPU to be on the safer side.

Signed-off-by: cuneyt.ozdas <cuneyt.ozdas@autodesk.com>
---
 src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
index 04b2c1d07a..ae65eb3920 100644
--- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
+++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp
@@ -652,6 +652,8 @@ std::string _Add_Tonescale_func(
     ss.indent();
 
     // Tonescale applied in Y (convert to and from J)
+    // TODO: Investigate if we can receive negative J here at all. 
+    // If not, abs(J) here and the sign(J) at the return may not be needed at all.
     ss.newLine() << ss.floatDecl("A") << " = " << p.A_w_J << " * pow(abs(J) * " << 1.0f / ACES2::J_scale << ", " << p.inv_cz << ");";
     ss.newLine() << ss.floatDecl("Y") << " = pow(( " << ACES2::cam_nl_offset << " * A) / (1.0f - A), " << 1.0 / 0.42 << ");";
 
@@ -675,7 +677,7 @@ std::string _Add_Tonescale_func(
     }
 
     ss.newLine() << ss.floatDecl("J_ts") << " = " << ACES2::J_scale << " * pow((F_L_Y / ( " << ACES2::cam_nl_offset << " + F_L_Y)) * " << p.inv_A_w_J << ", " << p.cz << ");";
-    ss.newLine() << "return J_ts;";
+    ss.newLine() << "return sign(J) * J_ts;";
 
     ss.dedent();
     ss.newLine() << "}";

From 606b52f9d1cfda8afb6c3657293668065cff1ca1 Mon Sep 17 00:00:00 2001
From: Doug Walker <doug.walker@autodesk.com>
Date: Fri, 14 Mar 2025 01:40:02 -0400
Subject: [PATCH 08/11] Add built-in transform round-trip test

Signed-off-by: Doug Walker <doug.walker@autodesk.com>
---
 .../FixedFunctionOpCPU_tests.cpp              |   7 +
 .../cpu/transforms/BuiltinTransform_tests.cpp | 151 ++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
index 9139c516df..008264582e 100644
--- a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
+++ b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp
@@ -544,6 +544,13 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20)
                        __LINE__);
 }
 
+// NB: The ACES 2 FixedFunction takes linear ACES2065-1 values and produces linear RGB values
+// in the encoding gamut. The relatively large tolerance on the following round-trip tests doesn't
+// fully test accuracy of saturated values. See additional tests in BuiltinTransform_tests.cpp
+// that do a similar round-trip but use gamma-corrected code values and therefore provide
+// a more thorough test of colors where one or more channels are near zero, which is an area
+// that is more challenging for the algorithm to invert.
+
 OCIO_ADD_TEST(FixedFunctionOpCPU, aces_ot_20_rec709_100n_rt)
 {
     const int lut_size = 8;
diff --git a/tests/cpu/transforms/BuiltinTransform_tests.cpp b/tests/cpu/transforms/BuiltinTransform_tests.cpp
index b623028ad7..76746b9d1d 100644
--- a/tests/cpu/transforms/BuiltinTransform_tests.cpp
+++ b/tests/cpu/transforms/BuiltinTransform_tests.cpp
@@ -8,6 +8,7 @@
 
 #include "transforms/BuiltinTransform.cpp"
 
+#include "ops/lut3d/Lut3DOp.h"
 #include "ops/matrix/MatrixOp.h"
 #include "Platform.h"
 #include "transforms/builtins/ACES.h"
@@ -739,3 +740,153 @@ OCIO_ADD_TEST(Builtins, validate)
     // that don't have an associated built-in.
     OCIO_CHECK_EQUAL(UnitTestValues.size(), reg->getNumBuiltins());
 }
+
+namespace
+{
+
+// Round-trip check: sends a grid of display code values through the inverse display and
+// inverse view transforms and back through the forward ones, then compares with the input.
+// 'scale' multiplies every grid value. 'errorThreshold' is the tolerance handed to
+// EqualWithSafeRelError, except for samples whose RGBA buffer index (a multiple of 4) is
+// listed in 'difficultItems', which use 'difficultThreshold'. Failures report 'lineNo'.
+void ValidateDisplayViewRoundTrip(const char * display_style, const char * view_style,
+                                  float scale, float errorThreshold,
+                                  const std::vector<int> & difficultItems, float difficultThreshold,
+                                  int lineNo)
+{
+    // Built-in transform for the display.
+    OCIO::BuiltinTransformRcPtr display_builtin = OCIO::BuiltinTransform::Create();
+    OCIO_CHECK_NO_THROW_FROM(display_builtin->setStyle(display_style), lineNo);
+    OCIO_CHECK_NO_THROW_FROM(display_builtin->validate(), lineNo);
+    auto display_builtin_inv = display_builtin->createEditableCopy();
+    display_builtin_inv->setDirection(OCIO::TRANSFORM_DIR_INVERSE);
+
+    // Built-in transform for the view.
+    OCIO::BuiltinTransformRcPtr view_builtin = OCIO::BuiltinTransform::Create();
+    OCIO_CHECK_NO_THROW_FROM(view_builtin->setStyle(view_style), lineNo);
+    OCIO_CHECK_NO_THROW_FROM(view_builtin->validate(), lineNo);
+    auto view_builtin_inv = view_builtin->createEditableCopy();
+    view_builtin_inv->setDirection(OCIO::TRANSFORM_DIR_INVERSE);
+
+    // Assemble inverse and forward transform into a group transform that goes from
+    // display code values to ACES and back to code values.
+    OCIO::GroupTransformRcPtr group = OCIO::GroupTransform::Create();
+    group->appendTransform(display_builtin_inv);
+    group->appendTransform(view_builtin_inv);
+    group->appendTransform(view_builtin);
+    group->appendTransform(display_builtin);
+
+    // Create a Processor.
+    OCIO::ConstConfigRcPtr config = OCIO::Config::CreateRaw();
+    OCIO::ConstProcessorRcPtr proc;
+    OCIO_CHECK_NO_THROW_FROM(proc = config->getProcessor(group), lineNo);
+    OCIO_REQUIRE_ASSERT(proc);
+
+    // Create a CPUProcessor.
+    // Use optimization none to avoid replacing inv/fwd pairs and avoid fast pow for the display.
+    OCIO::ConstCPUProcessorRcPtr cpu;
+    OCIO_CHECK_NO_THROW_FROM(cpu = proc->getOptimizedCPUProcessor(OCIO::OPTIMIZATION_NONE), lineNo);
+    OCIO_REQUIRE_ASSERT(cpu);
+
+    // Create a 7 x 7 x 7 grid of RGBA values.
+    const unsigned lut_size = 7;
+    const unsigned num_channels = 4;
+    const unsigned num_samples = lut_size * lut_size * lut_size;
+    std::vector<float> input_32f(num_samples * num_channels, 0.f);
+    std::vector<float> output_32f(num_samples * num_channels, 0.f);
+
+    GenerateIdentityLut3D(input_32f.data(), lut_size, num_channels, OCIO::LUT3DORDER_FAST_RED);
+
+    // Scale the grid of points, which is necessary when testing the ST-2084/PQ displays
+    // since the transforms are only designed to process up to a maximum luminance level.
+    for(unsigned idx=0; idx<(num_samples*num_channels); ++idx)
+    {
+        input_32f[idx] *= scale;
+    }
+
+    // Process the values.
+    OCIO::PackedImageDesc inDesc((void *)&input_32f[0], num_samples, 1, num_channels);
+    OCIO::PackedImageDesc outDesc((void *)&output_32f[0], num_samples, 1, num_channels);
+    OCIO_CHECK_NO_THROW_FROM(cpu->apply(inDesc, outDesc), lineNo);
+
+    // Check if values are within tolerance.
+    for(unsigned idx=0; idx<(num_samples*num_channels); idx+=num_channels)
+    {
+        // Initialize each error; the helper is not known here to always write its out-param.
+        float computedErrorR = 0.0f, computedErrorG = 0.0f, computedErrorB = 0.0f;
+
+        const bool isDifficult = std::find(difficultItems.begin(), difficultItems.end(),
+                                           static_cast<int>(idx)) != difficultItems.end();
+        const float tol = isDifficult ? difficultThreshold : errorThreshold;
+
+        const bool equalRelR = OCIO::EqualWithSafeRelError(output_32f[idx], input_32f[idx],
+                                                           tol, 1.0f, &computedErrorR);
+        const bool equalRelG = OCIO::EqualWithSafeRelError(output_32f[idx+1], input_32f[idx+1],
+                                                           tol, 1.0f, &computedErrorG);
+        const bool equalRelB = OCIO::EqualWithSafeRelError(output_32f[idx+2], input_32f[idx+2],
+                                                           tol, 1.0f, &computedErrorB);
+        if (!equalRelR || !equalRelG || !equalRelB)
+        {
+            std::ostringstream errorMsg;
+            errorMsg.precision(10);
+            errorMsg << "Index: " << idx << " - Tol.: " << tol << "\n - Expected: " << std::fixed;
+            errorMsg << input_32f[idx] << ", " << input_32f[idx+1] << ", " << input_32f[idx+2];
+            errorMsg << "\n - Actual:   " << output_32f[idx] << ", " << output_32f[idx+1] << ", " << output_32f[idx+2];
+            errorMsg << "\n - Error:    " << computedErrorR << ", " << computedErrorG << ", " << computedErrorB;
+            OCIO_CHECK_ASSERT_MESSAGE_FROM(0, errorMsg.str(), lineNo);
+        }
+    }
+}
+
+} // anon.
+
+OCIO_ADD_TEST(Builtins, aces2_displayview_roundtrip)
+{
+    // Perform a round-trip test from display code-values to ACES and back to code values.
+    // This uses a 7 x 7 x 7 grid of RGB values.
+
+    ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_REC.1886-REC.709",
+                                 "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0",
+                                 1.0f,          // scale factor
+                                 0.003f,        // tolerance
+                                 {}, 0.f,
+                                 __LINE__);
+
+    ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_DisplayP3",
+                                 "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D65_2.0",
+                                 1.0f,          // scale factor
+                                 0.001f,        // tolerance
+                                 {}, 0.f,
+                                 __LINE__);
+
+    ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_ST2084-P3-D65",
+                                 "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D65_2.0",
+                                 // Need to lower the max value from 1000 to 990 nits.
+                                 0.7507f,       // scale factor = 990 nits
+                                 0.005f,        // main tolerance
+                                 {196, 392},    // difficult values
+                                 0.03f,         // tolerance for difficult values
+                                 __LINE__);
+
+    ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_ST2084-P3-D65",
+                                 "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0",
+                                 // Need to lower the max value from 4000 to 3860 nits.
+                                 0.8987f,       // scale factor = 3860 nits
+                                 0.006f,        // main tolerance
+                                 {196, 392, 396, 588, 592, 1196, 1200, 1260, 1288}, // difficult values
+                                 0.2f,         // tolerance for difficult values
+                                 __LINE__);
+
+    // TODO: Too many Rec.2100 values fail to invert accurately for these to be validated easily.
+//     ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_REC.2100-PQ",
+//                                  "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020_2.0",
+//                                  0.7507f,       // scale factor = 990 nits
+//                                  5e-3f, {}, 0.f, // tolerance, no difficult values
+//                                  __LINE__);
+// 
+//     ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_REC.2100-PQ",
+//                                  "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020_2.0",
+//                                  0.8987f,       // scale factor = 3860 nits
+//                                  5e-3f, {}, 0.f, // tolerance, no difficult values
+//                                  __LINE__);
+}

From d60cc2e0bfb04409b7e0cb8c4f701565418d62e8 Mon Sep 17 00:00:00 2001
From: Doug Walker <doug.walker@autodesk.com>
Date: Fri, 14 Mar 2025 02:01:53 -0400
Subject: [PATCH 09/11] Loosen tolerance for other machines

Signed-off-by: Doug Walker <doug.walker@autodesk.com>
---
 tests/cpu/transforms/BuiltinTransform_tests.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/cpu/transforms/BuiltinTransform_tests.cpp b/tests/cpu/transforms/BuiltinTransform_tests.cpp
index 76746b9d1d..29d252cfb9 100644
--- a/tests/cpu/transforms/BuiltinTransform_tests.cpp
+++ b/tests/cpu/transforms/BuiltinTransform_tests.cpp
@@ -848,7 +848,7 @@ OCIO_ADD_TEST(Builtins, aces2_displayview_roundtrip)
     ValidateDisplayViewRoundTrip("DISPLAY - CIE-XYZ-D65_to_REC.1886-REC.709",
                                  "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0",
                                  1.0f,          // scale factor
-                                 0.003f,        // tolerance
+                                 0.004f,        // tolerance
                                  {}, 0.f,
                                  __LINE__);
 
@@ -864,7 +864,7 @@ OCIO_ADD_TEST(Builtins, aces2_displayview_roundtrip)
                                  // Need to lower the max value from 1000 to 990 nits.
                                  0.7507f,       // scale factor = 990 nits
                                  0.005f,        // main tolerance
-                                 {196, 392},    // difficult values
+                                 {168, 196, 364, 392, 1344},    // difficult values
                                  0.03f,         // tolerance for difficult values
                                  __LINE__);
 
@@ -872,8 +872,8 @@ OCIO_ADD_TEST(Builtins, aces2_displayview_roundtrip)
                                  "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0",
                                  // Need to lower the max value from 4000 to 3860 nits.
                                  0.8987f,       // scale factor = 3860 nits
-                                 0.006f,        // main tolerance
-                                 {196, 392, 396, 588, 592, 1196, 1200, 1260, 1288}, // difficult values
+                                 0.007f,        // main tolerance
+                                 {168, 196, 392, 396, 588, 592, 952, 1148, 1196, 1200, 1260, 1288},
                                  0.2f,         // tolerance for difficult values
                                  __LINE__);
 

From cfc3be9b68f517a40a1d5137cade3454199c362d Mon Sep 17 00:00:00 2001
From: Doug Walker <doug.walker@autodesk.com>
Date: Fri, 14 Mar 2025 16:44:21 -0400
Subject: [PATCH 10/11] Add GPU round-trip tests

Signed-off-by: Doug Walker <doug.walker@autodesk.com>
---
 tests/gpu/FixedFunctionOp_test.cpp | 153 ++++++++++++++++++++++++++++-
 1 file changed, 152 insertions(+), 1 deletion(-)

diff --git a/tests/gpu/FixedFunctionOp_test.cpp b/tests/gpu/FixedFunctionOp_test.cpp
index 2c5d7cade4..bcc6a69cd3 100644
--- a/tests/gpu/FixedFunctionOp_test.cpp
+++ b/tests/gpu/FixedFunctionOp_test.cpp
@@ -434,7 +434,11 @@ OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_output_transform_inv)
 
 OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_output_transform_invfwd)
 {
-     const double data_inv[9] = {
+    // Test that there are no shader resource (textures, functions, etc.) conflicts between
+    // the 2 different inverse and forward transforms.  This is not a round-trip test;
+    // it checks that the forward and inverse may exist in the same shader with no issue.
+
+    const double data_inv[9] = {
         // Peak luminance
         100.f,
         // REC709 gamut
@@ -463,9 +467,156 @@ OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_output_transform_invfwd)
 
     test.setProcessor(grp);
 
+    // This is not expected to be an identity, but it should be the same between CPU & GPU.
     test.setErrorThreshold(7e-4f);
 }
 
+namespace
+{
+// Builds the group display-inverse -> view-inverse -> view-forward -> display-forward from two built-in style names.
+OCIO::GroupTransformRcPtr BuildRoundTripTransform(const char * display_style, const char * view_style)
+{
+    // Built-in transform for the display, plus an editable copy switched to the inverse direction.
+    OCIO::BuiltinTransformRcPtr display_builtin = OCIO::BuiltinTransform::Create();
+    display_builtin->setStyle(display_style);
+    display_builtin->validate();  // NOTE(review): presumably rejects an unknown style name - confirm.
+    auto display_builtin_inv = display_builtin->createEditableCopy();
+    display_builtin_inv->setDirection(OCIO::TRANSFORM_DIR_INVERSE);
+
+    // Built-in transform for the view, plus an editable copy switched to the inverse direction.
+    OCIO::BuiltinTransformRcPtr view_builtin = OCIO::BuiltinTransform::Create();
+    view_builtin->setStyle(view_style);
+    view_builtin->validate();  // NOTE(review): presumably rejects an unknown style name - confirm.
+    auto view_builtin_inv = view_builtin->createEditableCopy();
+    view_builtin_inv->setDirection(OCIO::TRANSFORM_DIR_INVERSE);
+
+    // Assemble inverse and forward transform into a group transform that goes from
+    // display code values to ACES and back to code values.
+    OCIO::GroupTransformRcPtr group = OCIO::GroupTransform::Create();
+    group->appendTransform(display_builtin_inv);  // 1. display, inverse
+    group->appendTransform(view_builtin_inv);     // 2. view, inverse
+    group->appendTransform(view_builtin);         // 3. view, forward
+    group->appendTransform(display_builtin);      // 4. display, forward
+
+    return group;
+}
+// Fills values.m_inputValues with an edgeLen^3 grid of samples whose first three channels span [0, scale].
+void GenerateIdentityLut3D(OCIOGPUTest::CustomValues & values, int edgeLen, int numChannels, float scale)
+{
+    int num_samples = edgeLen * edgeLen * edgeLen;
+    std::vector<float> img(num_samples * numChannels, 0.f);  // zero-filled: channels past the third stay 0
+
+    float c = 1.0f / ((float)edgeLen - 1.0f);  // grid step; NOTE(review): infinite when edgeLen == 1
+    for (int i = 0; i < edgeLen*edgeLen*edgeLen; i++)  // same count as num_samples
+    {
+        // NOTE(review): the writes at offsets +0..+2 assume numChannels >= 3.
+        img[numChannels*i + 0] = scale * (float)(i%edgeLen) * c;                        // varies fastest
+        img[numChannels*i + 1] = scale * (float)((i / edgeLen) % edgeLen) * c;
+        img[numChannels*i + 2] = scale * (float)((i / edgeLen / edgeLen) % edgeLen) * c;  // varies slowest
+    }
+    values.m_inputValues = img;  // copies the buffer into the caller's custom values
+}
+
+} // anon.
+
+// The following group of tests exercises the round trip from display code values to ACES
+// and back to code values. The round-trip is not perfect (see BuiltinTransform_tests.cpp)
+// but the tests here simply check if the CPU and GPU are giving the same result.
+
+OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_rec709_rndtrip)
+{
+    const char * display_style = "DISPLAY - CIE-XYZ-D65_to_REC.1886-REC.709";
+    const char * view_style = "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0";
+    auto group = BuildRoundTripTransform(display_style, view_style);  // display inv -> view inv -> view -> display
+
+    // The test harness gets a processor from the transform with the default optimization
+    // level. However, the forward/inverse does not optimize out due to the clamp to AP1
+    // in-between the FixedFunctions.
+    test.setProcessor(group);
+
+    // Set up a grid of RGBA custom values.
+    const int lut_size = 17;      // 17^3 = 4913 grid samples
+    const int num_channels = 4;   // the generator only writes the first three; the fourth stays 0
+    OCIOGPUTest::CustomValues values;
+    GenerateIdentityLut3D(values, lut_size, num_channels, 1.0f);  // scale 1.0: samples span [0, 1]
+
+    test.setCustomValues(values);
+
+    test.setErrorThreshold(0.004f);  // NOTE(review): presumably the allowed CPU vs. GPU difference - confirm.
+}
+
+OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_displayp3_rndtrip)
+{
+    const char * display_style = "DISPLAY - CIE-XYZ-D65_to_DisplayP3";
+    const char * view_style = "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D65_2.0";
+    auto group = BuildRoundTripTransform(display_style, view_style);  // display inv -> view inv -> view -> display
+    test.setProcessor(group);
+
+    const int lut_size = 17;      // 17^3 = 4913 grid samples
+    const int num_channels = 4;   // the generator only writes the first three; the fourth stays 0
+    OCIOGPUTest::CustomValues values;
+    GenerateIdentityLut3D(values, lut_size, num_channels, 1.0f);  // scale 1.0: samples span [0, 1]
+
+    test.setCustomValues(values);
+
+    test.setErrorThreshold(0.001f);  // NOTE(review): presumably the allowed CPU vs. GPU difference - confirm.
+}
+
+OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_1000nit_p3_rndtrip)
+{
+    const char * display_style = "DISPLAY - CIE-XYZ-D65_to_ST2084-P3-D65";
+    const char * view_style = "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D65_2.0";
+    auto group = BuildRoundTripTransform(display_style, view_style);  // display inv -> view inv -> view -> display
+    test.setProcessor(group);
+
+    const int lut_size = 17;      // 17^3 = 4913 grid samples
+    const int num_channels = 4;   // the generator only writes the first three; the fourth stays 0
+    OCIOGPUTest::CustomValues values;
+    GenerateIdentityLut3D(values, lut_size, num_channels, 0.75183f);  // scale to 1000 nits
+
+    test.setCustomValues(values);
+
+    // TODO: Investigate why this is not closer.
+    // Setting the CPUProcessor to OPTIMIZATION_NONE helps slightly, but is not the main
+    // cause of the error.
+    test.setErrorThreshold(0.01f);
+}
+
+OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_4000nit_p3_rndtrip)
+{
+    const char * display_style = "DISPLAY - CIE-XYZ-D65_to_ST2084-P3-D65";
+    const char * view_style = "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0";
+    auto group = BuildRoundTripTransform(display_style, view_style);  // display inv -> view inv -> view -> display
+    test.setProcessor(group);
+
+    const int lut_size = 17;      // 17^3 = 4913 grid samples
+    const int num_channels = 4;   // the generator only writes the first three; the fourth stays 0
+    OCIOGPUTest::CustomValues values;
+    GenerateIdentityLut3D(values, lut_size, num_channels, 0.90257f);  // scale to 4000 nits
+
+    test.setCustomValues(values);
+
+    // TODO: Investigate why this is not closer.
+    test.setErrorThreshold(0.018f);  // NOTE(review): presumably the allowed CPU vs. GPU difference - confirm.
+}
+
+OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_4000nit_rec2020_rndtrip)
+{
+    const char * display_style = "DISPLAY - CIE-XYZ-D65_to_REC.2100-PQ";
+    const char * view_style = "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020_2.0";
+    auto group = BuildRoundTripTransform(display_style, view_style);  // display inv -> view inv -> view -> display
+    test.setProcessor(group);
+
+    const int lut_size = 17;      // 17^3 = 4913 grid samples
+    const int num_channels = 4;   // the generator only writes the first three; the fourth stays 0
+    OCIOGPUTest::CustomValues values;
+    GenerateIdentityLut3D(values, lut_size, num_channels, 0.90257f);  // scale to 4000 nits
+
+    test.setCustomValues(values);
+
+    // TODO: Investigate why this is not closer.
+    test.setErrorThreshold(0.025f);
+}
+
 OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_rgb_to_jmh_fwd)
 {
     // ACES AP0

From c57b4f3d56ef1b29e6c6e869747d009efcf6c3c4 Mon Sep 17 00:00:00 2001
From: Doug Walker <doug.walker@autodesk.com>
Date: Fri, 14 Mar 2025 18:03:02 -0400
Subject: [PATCH 11/11] Loosen tolerances for other GPUs

Signed-off-by: Doug Walker <doug.walker@autodesk.com>
---
 tests/gpu/FixedFunctionOp_test.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/gpu/FixedFunctionOp_test.cpp b/tests/gpu/FixedFunctionOp_test.cpp
index bcc6a69cd3..523ff66ef1 100644
--- a/tests/gpu/FixedFunctionOp_test.cpp
+++ b/tests/gpu/FixedFunctionOp_test.cpp
@@ -578,7 +578,7 @@ OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_1000nit_p3_rndtrip)
     // TODO: Investigate why this is not closer.
     // Setting the CPUProcessor to OPTIMIZATION_NONE helps slightly, but is not the main
     // cause of the error.
-    test.setErrorThreshold(0.01f);
+    test.setErrorThreshold(0.012f);
 }
 
 OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_4000nit_p3_rndtrip)
@@ -614,7 +614,7 @@ OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_4000nit_rec2020_rndtrip)
     test.setCustomValues(values);
 
     // TODO: Investigate why this is not closer.
-    test.setErrorThreshold(0.025f);
+    test.setErrorThreshold(0.03f);
 }
 
 OCIO_ADD_GPU_TEST(FixedFunction, style_aces2_rgb_to_jmh_fwd)