From 48096234a3467fa21e311d9df45904044406bad5 Mon Sep 17 00:00:00 2001 From: "cuneyt.ozdas" Date: Wed, 26 Feb 2025 11:07:00 -0800 Subject: [PATCH 1/3] - Weights for cos(3h) and sin(h) in chroma_compress_norm() look wrong. Fixing the weights makes the GPU tests pass now (except for the inverse output transform, which seems to have a separate issue). - If the new weights are correct, I'll need to update the CPU test target values too. Signed-off-by: cuneyt.ozdas --- src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp | 4 ++-- src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp index f72526f961..aaab6d1bb1 100644 --- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp +++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp @@ -302,8 +302,8 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s sin_hr1, sin_hr2, sin_hr3, 1.0f }; alignas(AVX_ALIGNMENT) static constexpr float weights[8] = { // TODO: investigate reordering of the entries so we are summing equal magnitude values first? - 11.34072f, 16.46899f, 14.66441f, 0.0f, - 4.66441f, -6.37224f, 9.19364f, 77.12896f + 11.34072f, 16.46899f, 7.88380f, 0.0f, + 14.66441f, -6.37224f, 9.19364f, 77.12896f }; // TODO: benchmark this across multiple platforms to justify the multiple code paths. 
#if OCIO_USE_SSE2 diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp index ea6dced0c0..7836e58381 100644 --- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp +++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp @@ -663,6 +663,7 @@ std::string _Add_Tonescale_func( } ss.newLine() << ss.floatDecl("J_ts") << " = " << ACES2::J_scale << " * pow((F_L_Y / ( " << ACES2::cam_nl_offset << " + F_L_Y)) * " << p.inv_A_w_J << ", " << p.cz << ");"; + // TODO: copysign is missing here. /coz ss.newLine() << "return J_ts;"; ss.dedent(); @@ -682,14 +683,15 @@ void _Add_ChromaCompressionNorm_Shader( ss.newLine() << "{"; ss.indent(); - ss.newLine() << ss.floatDecl("cos_hr2") << " = cos_hr * cos_hr - sin_hr * sin_hr;"; + // TODO: optimization: can bake weights into terms and convert dotprods to addition. /coz + ss.newLine() << ss.floatDecl("cos_hr2") << " = 2.0 * cos_hr * cos_hr - 1.0;"; ss.newLine() << ss.floatDecl("sin_hr2") << " = 2.0 * cos_hr * sin_hr;"; ss.newLine() << ss.floatDecl("cos_hr3") << " = 4.0 * cos_hr * cos_hr * cos_hr - 3.0 * cos_hr;"; ss.newLine() << ss.floatDecl("sin_hr3") << " = 3.0 * sin_hr - 4.0 * sin_hr * sin_hr * sin_hr;"; ss.newLine() << ss.float3Decl("cosines") << " = " << ss.float3Const("cos_hr", "cos_hr2", "cos_hr3") <<";"; ss.newLine() << ss.float3Decl("cosine_weights") << " = " << ss.float3Const(11.34072 * c.chroma_compress_scale, 16.46899 * c.chroma_compress_scale, - 7.88380 * c.chroma_compress_scale) <<";"; + 7.88380 * c.chroma_compress_scale) <<";"; ss.newLine() << ss.float3Decl("sines") << " = " << ss.float3Const("sin_hr", "sin_hr2", "sin_hr3") <<";"; ss.newLine() << ss.float3Decl("sine_weights") << " = " << ss.float3Const(14.66441 * c.chroma_compress_scale, -6.37224 * c.chroma_compress_scale, From 7607e9966a2e060cab26fc792066ccf9d933b540 Mon Sep 17 00:00:00 2001 From: "cuneyt.ozdas" Date: Wed, 26 Feb 2025 13:44:51 -0800 Subject: [PATCH 2/3] - Updating the 
expected values in the CPU tests Signed-off-by: cuneyt.ozdas --- .../FixedFunctionOpCPU_tests.cpp | 124 +++++++++--------- .../cpu/transforms/BuiltinTransform_tests.cpp | 94 ++++++++----- 2 files changed, 125 insertions(+), 93 deletions(-) diff --git a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp index bd8536f33a..aee030f53e 100644 --- a/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp +++ b/tests/cpu/ops/fixedfunction/FixedFunctionOpCPU_tests.cpp @@ -467,41 +467,41 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20) const float expected_32f[num_samples*4] = { // ACEScg primaries and secondaries scaled by 4 - 4.966040611f, -0.032990534f, 0.041587759f, 1.0f, - 3.969455719f, 3.825795889f, -0.056159109f, 1.0f, - -0.075445443f, 3.689064741f, 0.270243138f, 1.0f, - -0.095422804f, 3.650515079f, 3.459970713f, 1.0f, - -0.029242843f, 0.196083903f, 2.797096968f, 1.0f, - 4.900825977f, -0.064415932f, 3.838272572f, 1.0f, + 4.966013432f, -0.033002287f, 0.041583523f, 1.0f, + 3.969460726f, 3.825797558f, -0.056160748f, 1.0f, + -0.075460039f, 3.689072609f, 0.270235062f, 1.0f, + -0.095436633f, 3.650521517f, 3.459975719f, 1.0f, + -0.028881177f, 0.196473420f, 2.796123743f, 1.0f, + 4.900828362f, -0.064385533f, 3.838270903f, 1.0f, // OCIO test values - 0.096831776f, -0.001114858f, 0.018976377f, 0.5f, - 0.811647296f, 0.478211939f, 0.816507518f, 1.0f, - 0.110244252f, 0.919241786f, 0.726084292f, 0.0f, + 0.096890487f, -0.001135427f, 0.018971475f, 0.5f, + 0.809613585f, 0.479857147f, 0.814239979f, 1.0f, + 0.107417941f, 0.920530438f, 0.726379037f, 0.0f, // ColorChecker24 (SMPTE 2065-1 2021) - 0.115581684f, 0.050785132f, 0.030158322f, 1.0f, - 0.482630610f, 0.301559567f, 0.228200614f, 1.0f, - 0.097509719f, 0.160682827f, 0.278755993f, 1.0f, - 0.071118668f, 0.107350536f, 0.035066456f, 1.0f, - 0.206827119f, 0.198065758f, 0.376981646f, 1.0f, - 0.197157621f, 0.480333209f, 0.393290222f, 1.0f, - 0.570664287f, 
0.197219044f, 0.042163782f, 1.0f, - 0.045591675f, 0.069720201f, 0.292005479f, 1.0f, - 0.425108939f, 0.083108872f, 0.102091998f, 1.0f, - 0.059560396f, 0.022268835f, 0.091132581f, 1.0f, - 0.360384226f, 0.478674322f, 0.086890966f, 1.0f, - 0.691989481f, 0.372686356f, 0.070826821f, 1.0f, - 0.012042155f, 0.021904279f, 0.198501319f, 1.0f, - 0.076645926f, 0.256147027f, 0.060666814f, 1.0f, - 0.300039411f, 0.023424838f, 0.030365985f, 1.0f, - 0.803476214f, 0.596933603f, 0.085341305f, 1.0f, - 0.388712883f, 0.079724148f, 0.245922253f, 1.0f, - 0.011061139f, 0.196086824f, 0.307065904f, 1.0f, - 0.921007156f, 0.921683431f, 0.912948132f, 1.0f, - 0.590166390f, 0.588430583f, 0.587841213f, 1.0f, - 0.337742388f, 0.337684810f, 0.338159621f, 1.0f, - 0.169266224f, 0.169178173f, 0.169558540f, 1.0f, - 0.058399219f, 0.059382606f, 0.060239695f, 1.0f, - 0.012618840f, 0.012950940f, 0.013591323f, 1.0f, + 0.115475342f, 0.050812997f, 0.030212998f, 1.0f, + 0.484880149f, 0.301042914f, 0.226769030f, 1.0f, + 0.098463453f, 0.160814837f, 0.277010798f, 1.0f, + 0.071130276f, 0.107334509f, 0.035097614f, 1.0f, + 0.207111374f, 0.198474824f, 0.375326097f, 1.0f, + 0.195447117f, 0.481112540f, 0.393299103f, 1.0f, + 0.571913302f, 0.196873263f, 0.041634843f, 1.0f, + 0.045791976f, 0.069875412f, 0.291233569f, 1.0f, + 0.424848884f, 0.083199054f, 0.102153927f, 1.0f, + 0.059589352f, 0.022219239f, 0.091246955f, 1.0f, + 0.360364884f, 0.478741497f, 0.086726815f, 1.0f, + 0.695661962f, 0.371994466f, 0.068298057f, 1.0f, + 0.011806240f, 0.021665439f, 0.199594870f, 1.0f, + 0.076526135f, 0.256237596f, 0.060564563f, 1.0f, + 0.300064713f, 0.023416281f, 0.030360531f, 1.0f, + 0.805483222f, 0.596904039f, 0.082996234f, 1.0f, + 0.388385385f, 0.079899333f, 0.245818958f, 1.0f, + 0.010951802f, 0.196106046f, 0.307181537f, 1.0f, + 0.921020269f, 0.921707630f, 0.912857533f, 1.0f, + 0.590191603f, 0.588424563f, 0.587825298f, 1.0f, + 0.337743223f, 0.337686002f, 0.338155240f, 1.0f, + 0.169266403f, 0.169178575f, 0.169557154f, 1.0f, + 0.058346011f, 
0.059387885f, 0.060296256f, 1.0f, + 0.012581199f, 0.012947144f, 0.013654212f, 1.0f, // Spectrally non-selective 18 % reflecting diffuser 0.145115077f, 0.145115703f, 0.145115480f, 1.0f, // Perfect reflecting diffuser @@ -524,7 +524,7 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_output_transform_20) __LINE__); #if DUMP_RESULT - std::cout << "Results: \n" << std::setprecision(9) << std::fixed; + std::cout << "aces_output_transform_20 results: \n" << std::setprecision(9) << std::fixed; for (unsigned i = 0; i < num_samples; ++i) { std::cout << input2_32f[i * 4 + 0] << "f, " @@ -794,35 +794,35 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20) const float expected_32f[num_samples*4] = { // ACEScg primaries and secondaries scaled by 4 - 110.702453613f, 211.242279053f, 25.025110245f, 1.0f, - 168.016815186f, 129.795593262f, 106.183448792f, 1.0f, - 140.814849854f, 193.450653076f, 147.056488037f, 1.0f, - 156.429504395f, 110.935348511f, 192.204727173f, 1.0f, - 80.456558228f, 98.743263245f, 268.442108154f, 1.0f, - 135.172225952f, 175.572814941f, 341.715240479f, 1.0f, + 110.702453613f, 211.251770020f, 25.025110245f, 1.0f, + 168.016815186f, 129.796249390f, 106.183448792f, 1.0f, + 140.814849854f, 193.459197998f, 147.056488037f, 1.0f, + 156.429504395f, 110.938423157f, 192.204727173f, 1.0f, + 80.456558228f, 98.490531921f, 268.442108154f, 1.0f, + 135.172225952f, 175.559326172f, 341.715240479f, 1.0f, // OCIO test values - 18.187316895f, 33.767055511f, 4.173158169f, 0.5f, - 80.413101196f, 21.547714233f, 332.159759521f, 1.0f, - 83.447883606f, 37.597621918f, 182.925750732f, 0.0f, + 18.187316895f, 33.819190979f, 4.173158169f, 0.5f, + 80.413101196f, 21.309329987f, 332.159759521f, 1.0f, + 83.447883606f, 37.852523804f, 182.925750732f, 0.0f, // ColorChecker24 (SMPTE 2065-1 2021) - 27.411968231f, 13.410449982f, 38.146659851f, 1.0f, - 59.987659454f, 14.175936699f, 39.841842651f, 1.0f, - 43.298923492f, 12.367712021f, 249.107116699f, 1.0f, - 31.489654541f, 14.086299896f, 
128.878036499f, 1.0f, - 50.749198914f, 12.862657547f, 285.658966064f, 1.0f, - 64.728637695f, 18.433788300f, 179.324264526f, 1.0f, - 53.399444580f, 37.239288330f, 50.924011230f, 1.0f, - 34.719596863f, 21.685737610f, 271.008331299f, 1.0f, - 43.910709381f, 36.826980591f, 13.975610733f, 1.0f, - 23.196529388f, 15.087531090f, 317.544281006f, 1.0f, - 63.348682404f, 33.255519867f, 119.145133972f, 1.0f, - 64.908874512f, 34.922687531f, 70.842193604f, 1.0f, - 24.876913071f, 23.019479752f, 273.228973389f, 1.0f, - 44.203376770f, 28.884298325f, 144.154159546f, 1.0f, - 32.824359894f, 43.442367554f, 17.892261505f, 1.0f, - 75.830871582f, 39.538505554f, 90.752044678f, 1.0f, - 45.823120117f, 34.710170746f, 348.832092285f, 1.0f, - 43.597236633f, 23.048465729f, 218.454376221f, 1.0f, + 27.411968231f, 13.382784843f, 38.146659851f, 1.0f, + 59.987659454f, 14.391894341f, 39.841842651f, 1.0f, + 43.298923492f, 12.199877739f, 249.107116699f, 1.0f, + 31.489654541f, 14.075141907f, 128.878036499f, 1.0f, + 50.749198914f, 12.731806755f, 285.658966064f, 1.0f, + 64.728637695f, 18.593791962f, 179.324264526f, 1.0f, + 53.399444580f, 37.394416809f, 50.924011230f, 1.0f, + 34.719596863f, 21.616765976f, 271.008331299f, 1.0f, + 43.910709381f, 36.788166046f, 13.975610733f, 1.0f, + 23.196529388f, 15.118354797f, 317.544281006f, 1.0f, + 63.348682404f, 33.283519745f, 119.145133972f, 1.0f, + 64.908874512f, 35.371063232f, 70.842193604f, 1.0f, + 24.876913071f, 23.143159866f, 273.228973389f, 1.0f, + 44.203376770f, 28.918329239f, 144.154159546f, 1.0f, + 32.824359894f, 43.447853088f, 17.892261505f, 1.0f, + 75.830871582f, 39.872489929f, 90.752044678f, 1.0f, + 45.823120117f, 34.652057648f, 348.832092285f, 1.0f, + 43.597236633f, 23.079071045f, 218.454376221f, 1.0f, }; OCIO::FixedFunctionOpData::Params params = {1000.f}; @@ -836,7 +836,7 @@ OCIO_ADD_TEST(FixedFunctionOpCPU, aces_tonescale_compress_20) __LINE__); #if DUMP_RESULTS - std::cout << "Results: \n" << std::setprecision(9) << std::fixed; + std::cout << 
"aces_tonescale_compress_20 results: \n" << std::setprecision(9) << std::fixed; for (unsigned i = 0; i < num_samples; ++i) { std::cout << input2_32f[i * 4 + 0] << "f, " diff --git a/tests/cpu/transforms/BuiltinTransform_tests.cpp b/tests/cpu/transforms/BuiltinTransform_tests.cpp index 5ffd0ce06b..8ffc3dd3c3 100644 --- a/tests/cpu/transforms/BuiltinTransform_tests.cpp +++ b/tests/cpu/transforms/BuiltinTransform_tests.cpp @@ -17,6 +17,7 @@ #include "UnitTestUtils.h" namespace OCIO = OCIO_NAMESPACE; +#define DUMP_RESULTS 0 OCIO_ADD_TEST(BuiltinTransform, creation) @@ -465,97 +466,128 @@ AllValues UnitTestValues { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.26216024f, 0.25204185f, 0.20722267f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.26260215f, 0.25207460f, 0.20617345f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.26216024f, 0.25204197f, 0.20722273f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.26260215f, 0.25207475f, 0.20617352f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.16215305f, 0.15510881f, 0.12539130f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.16253395f, 0.15513620f, 0.12449738f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.20546328f, 0.19437392f, 0.15134005f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.20592400f, 0.19440512f, 0.15028587f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.40945035f, 0.38807374f, 0.30408621f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.41039270f, 0.38813815f, 0.30191854f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.46440443f, 0.43846416f, 0.33907151f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.46536559f, 0.43852845f, 0.33688101f } } }, { "ACES-OUTPUT - 
ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.51138031f, 0.48258632f, 0.37260318f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.51225948f, 0.48264498f, 0.37060043f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.55570847f, 0.51962745f, 0.38863316f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.55653530f, 0.51967967f, 0.38678783f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.40945050f, 0.38807371f, 0.30408630f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.41039288f, 0.38813818f, 0.30191860f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.46440458f, 0.43846416f, 0.33907148f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.46536580f, 0.43852842f, 0.33688098f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.51138067f, 0.48258659f, 0.37260354f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.51225960f, 0.48264492f, 0.37060046f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.55570871f, 0.51962751f, 0.38863319f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.55653548f, 0.51967967f, 0.38678783f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC709-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.25105247f, 0.24027000f, 0.18314247f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.25147712f, 0.24029461f, 0.18221153f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.25330988f, 0.24243048f, 0.18478924f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.25373834f, 0.24245527f, 0.18384993f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-REC709-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.25669453f, 0.24566977f, 0.18725836f } } }, + { 0.5f, 
0.4f, 0.3f }, + { 0.25712875f, 0.24569492f, 0.18630651f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.25331008f, 0.24243063f, 0.18478930f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.25373828f, 0.24245520f, 0.18384989f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - SDR-100nit-P3-D60-in-XYZ-E_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.26287800f, 0.25158763f, 0.19176911f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.26332238f, 0.25161314f, 0.19079420f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-108nit-P3-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.15668137f, 0.14918011f, 0.11180915f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.15705051f, 0.14920059f, 0.11100878f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-300nit-P3-D60-in-XYZ-E_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.20423311f, 0.19227016f, 0.13879779f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.20469207f, 0.19229385f, 0.13782671f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-P3-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.39564425f, 0.37317851f, 0.27111340f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.39655733f, 0.37322620f, 0.26917258f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.44875023f, 0.42160600f, 0.30228844f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.44968122f, 0.42165339f, 0.30032712f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.49414319f, 0.46402797f, 0.33218032f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.49499470f, 0.46407115f, 0.33038712f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-P3-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.53698927f, 0.49956405f, 0.34642375f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.53778988f, 0.49960214f, 0.34477147f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - 
HDR-500nit-P3-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.40093076f, 0.37816489f, 0.27473599f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.40185603f, 0.37821317f, 0.27276924f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-P3-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.45474637f, 0.42723945f, 0.30632755f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.45568976f, 0.42728746f, 0.30434006f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-P3-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.50074583f, 0.47022825f, 0.33661887f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.50160873f, 0.47027206f, 0.33480173f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-P3-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.54416442f, 0.50623918f, 0.35105261f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.54497570f, 0.50627774f, 0.34937829f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-500nit-REC2020-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.40093106f, 0.37816498f, 0.27473608f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.40185642f, 0.37821338f, 0.27276939f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-1000nit-REC2020-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.45474690f, 0.42723969f, 0.30632779f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.45569009f, 0.42728764f, 0.30434042f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-2000nit-REC2020-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.50074631f, 0.47022864f, 0.33661926f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.50160891f, 0.47027206f, 0.33480188f } } }, { "ACES-OUTPUT - ACES2065-1_to_CIE-XYZ-D65 - HDR-4000nit-REC2020-D60-in-REC2020-D65_2.0", { 1.0e-4f, - { 0.5f, 0.4f, 0.3f }, { 0.54416472f, 0.50623930f, 0.35105285f } } }, + { 0.5f, 0.4f, 0.3f }, + { 0.54497600f, 0.50627792f, 0.34937853f } } }, { "APPLE_LOG_to_ACES2065-1", { 1.0e-6f, From 089ee33d4f6ba89d992031b5b007a903d6399b81 Mon Sep 17 00:00:00 
2001 From: "cuneyt.ozdas" Date: Fri, 28 Feb 2025 10:16:52 -0800 Subject: [PATCH 3/3] - The remaining GPU test failures were caused by a simple typo where we were passing h instead of J to the ocio_tonescale_inv() function. With the fix all the unit tests are happy now. - Since we decided not to include any SIMD implementation in this version, I removed the conditional code paths and left the current SSE & AVX implementations commented out for future guidance. Signed-off-by: cuneyt.ozdas --- .../ops/fixedfunction/ACES2/Transform.cpp | 23 ++++++++----------- .../ops/fixedfunction/FixedFunctionOpGPU.cpp | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp index aaab6d1bb1..68161def88 100644 --- a/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp +++ b/src/OpenColorIO/ops/fixedfunction/ACES2/Transform.cpp @@ -8,19 +8,6 @@ #include #include -// Disabling SIMD for now to do scalar math verification. -// FIXME: Re-enable SIMD and verify SIMD math is correct. 
-#undef OCIO_USE_SSE2 -#undef OCIO_USE_AVX - -#if OCIO_USE_SSE2 -#include "SSE2.h" -#endif - -#if OCIO_USE_AVX -#include "AVX.h" -#endif - namespace OCIO_NAMESPACE { @@ -265,6 +252,7 @@ f3 JMh_to_RGB(const f3 &JMh, const JMhParams &p) // Tonescale / Chroma compress // +/* //TODO: move to header // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026 // https://gcc.godbolt.org/#compilers:!((compiler:g6,options:%27-xc+-O3+-Wall+-fverbose-asm+-march%3Dhaswell+-mno-avx%27,sourcez:MQSwdgxgNgrgJgUwAQB4QFt3gC4CdwB0AFgHwBQoksiqAztnDseWQPStLZEi1I9JQQAa2QBZAIa5x2AOS0ANEgBGMbJwDuCcSLicA9kgh70ABxBRk2A0oTZsCXIb2ICbDtHFgA5khhgAZnq42H7SCFAAnkiIECCIvFa%2BtMjoegBu4ia0rLREMP4muuD0Wrp6/kipaURQWWT%2BUHrSSES0MOgA%2BlkdtMkAjAAUHR3ofQBMABxIaQCUSADeSEvLK6tr62vs00gAvEgA2kgAIkgAwkgAPkgAQkgAgkgAumQrw6OTSLn5y3sjnV/%2BBoILq0AZpRTgpAdUSiDoAZQAEgBVABiKIAMgBRAZjRQAZkUAAZFH0ZjMANxLLaHc4nK63B7PV4jcZTNroXhLX6YDriOBwEFgxQAimbDjs3h7Q5HADU51OMrpNxlDzuMtuTOWAI2XKhPKqCBqgoBwvatFFqy2B2W5yWStl8sVTypHAsai4yCMpnMDiQ4jSejifsq6WUUVwCBgtHAPgBLy1Zp1uz1nT5At6Awlwry/gp8aWEZCuDArx5EDS2F6HX8eLGmbNeYAvmR6o1mq12iCesk8UMWR9Zgt81D%2B2yc8m/iN0oa4DATILZpT1lslLgmnAIOJ6Ehwgh0AgwJWkAS%2BvokLjCcO3qzPonuan%2BQvs/k8yttet71O0oaoMac6aOVFLZuC8IgWnEKAKgAWhIAQ9HUcDIOHCUNk/NNu3rDln1zclh0LGBiw2Sdy0rWhq1rTDzVw5sWy2Wh0AgixHCMGhowALwQRQVDUKMYEYqJaEaTRHD8CxeicRAoPY5B%2BHEJATDXJQLHQVsmjUDtOm6Ig0z7d4pkHeYr1HW8OQnHltMfLIhWmV8E1Mj8Uw6CyBSsrMJVsgtbAIkt1mIisqxrOt3OoltQH8RAKmGO4ADUAA1hlU9t2TGABWAA2Lt/QAD101K0psodmT06YhN1P48o6Td6CyCqslZMEPJHYrqhAUCzPQCqECyvBxAgbB/FZJ8kFJSlgNasDWSM5qb3a3lLNBNIhIhECiAtZYtjTTgiGQUrJpWfDCI0rtegQXs0lZNakw2sAijAQQwEsbbPh7aYHGjPQwEUdRuAgMD%2BD0ExsAwCCkECRwYti4ctgGMADHULd7Fu%2BhcBgPqQA%2BhQ/Ruv0oCgJByi2vc/QjQmkAAFluLAwAwdoZjIZtgAPRh/BbNx2HZrY4D0GAlIQMguZ5iwWnZLp02SOtr0mXQ0jgOZFiTBWtjSZNDluK5GSmm8/AisqeW1hB/HABAXNBS6FY2TmPpkNR1CCXAIi%2B5APsiJJkDTOEjkUB7v0cKxUe4bxSa8SQlHELxkCUEAj3UKOwM8JAUQABT9flNY%2BAFsFMT8DSNKz9f8RRiIRwoF1li0tg9En%2BFhkNqlqOA04mXR30/K
rsG6QpMxzTOTEag6fL8ysBX8NKyaGHl0NoOAwTgbCyRC2iOY5vhD1rMgcGFzsEDMWtuwQCXRxAJAsrligQHCg2RwhhKitZI/uFH3XOj8ExeqEbgOm3kBR4GLLFBPpcOoth4iggDBwnhdCwzAFBeIeBUZA2/PcOKAhbC8CjJ8f0bswz2CQDHLg3M1APWNjGYMVQKDhGSI3e%2B38yZPx6DmIEn8d51j/lCGE8JkRoixAMPoRJ8SKDGPPchN1z5UJMo/NCj4v4UQfmTf%2BjVJYTGobWOhAIgSNCYSAPoaVMKj0LuwxEqIMTYl4UgYkx4BHzxWFsOE8MTCk1KtgW2O5lIHkrGI9kKjJECmkUFdoeiWggFrH3LyhFB7RkGtGCinjBGAOsRwOEcJMRjFrg3eJ/dHJdR6n1TRMioBEjNjYpJcidzfhLPgmAbUo5OG9GJM8VQkb2D5PjCoclBD2CkHjEwWTcC6FwLWeQWVMDyEvDRDeR1fHdh7D0ISPQGK4xYggXSd9j4zEMisLKs1nK5JYfI3CGytnoV8b/PZeFQklnCVoyYPQgksKbKzReKdMiI1BmudASAJBSFkLwCGLQ7AmAAFzsHoG/acuAGjwQIF6VgABHGACB6DozANkPotY%2BgAE4%2BhkzSqwIg8EoJWCku0KCHV0pQTxfgNiH1sD8VYKisYDKyZ9EJMAeljLmWJXUiLboDEvnLI%2BCfQqywjDIrUIokyABFPiug9hZX2cKjGYrjKNCOHxPGex2RSr5PKl0gTVUgz2AMJAUFFAmuPniJAMpj4AHZ/4pKtVlfKdMVgiu3OK7g%2Br1WORzr%2BVy7QtWz0ldKxqdF2ieuTEas1ZqsqngdRah1KVLU2v/oSJNWV7XH1oQ6p1w5XVKuKuycN3jBQqrVYoD1aqQ2ugMIa41pr63H1TQ6jNWUs3HxzS6xVTUbyNGTIWtVOqkBjQjXW0d0bY3mrTYmh11qkDOoVaK7tHxuCzTURYY0Yay0mU9USGNVaTIjqjQ2rKTbj4TvTWm%2BNmap1pvyjOuduau3ivZLNSeoJGjlpAOXDg6EsjRANlob5hgoCeC8HIT4DChYAyBlgDiuAzlFguWWfyZFAqYXuUAA%3D%3D)),filterAsm:(binary:!t,commentOnly:!t,directives:!t,intel:!t,labels:!t),version:3 @@ -279,6 +267,7 @@ inline float hsum_ps_sse1(__m128 v) { // v = [ return _mm_cvtss_f32(sums); // A+B+C+D } #endif + #ifdef OCIO_USE_AVX inline float hsum256_ps_avx(__m256 v) { // v = [ H G | F E | D C | B A ] __m128 vlow = _mm256_castps256_ps128(v); // vlow = [ D C | B A ] @@ -287,7 +276,8 @@ inline float hsum256_ps_avx(__m256 v) { // v = [ H G | F E | return hsum_ps_sse1(v128); } #endif - +#endif +*/ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_scale) { @@ -305,6 +295,8 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s 11.34072f, 16.46899f, 7.88380f, 0.0f, 14.66441f, -6.37224f, 9.19364f, 77.12896f }; + + /* // TODO: benchmark this across multiple platforms to justify the multiple code paths. 
#if OCIO_USE_SSE2 #if OCIO_USE_AVX @@ -324,6 +316,7 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s const float M = hsum_ps_sse1(t3); #endif #else + */ const float M = weights[0] * trig_angles_hr[0] + weights[1] * trig_angles_hr[1] + weights[2] * trig_angles_hr[2] + @@ -331,7 +324,9 @@ float chroma_compress_norm(float cos_hr1, float sin_hr1, float chroma_compress_s weights[5] * trig_angles_hr[5] + weights[6] * trig_angles_hr[6] + weights[7]; + /* #endif + */ return M * chroma_compress_scale; // TODO: is it worth prescaling the above weights? } diff --git a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp index 7836e58381..c08fd74613 100644 --- a/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp +++ b/src/OpenColorIO/ops/fixedfunction/FixedFunctionOpGPU.cpp @@ -1398,7 +1398,7 @@ void Add_ACES_OutputTransform_Inv_Shader( ss.newLine() << ""; ss.newLine() << "// Add ToneScale and ChromaCompress (inv)"; ss.newLine() << ""; - ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".b);"; + ss.newLine() << ss.floatDecl("J") << " = " << tonescaleName_Inv << "(" << pxl << ".r);"; ss.newLine() << "{"; ss.indent(); _Add_Tonescale_Compress_Inv_Shader(shaderCreator, ss, resourceIndex, s, c);