diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index bf98506e1a..0fcf7e2c37 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -204,7 +204,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SSE=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ @@ -345,7 +345,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SSE=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ @@ -493,7 +493,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SSE=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 932874f81c..d6353e8b03 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,14 @@ if(APPLE AND NOT DEFINED CMAKE_OSX_DEPLOYMENT_TARGET) endif() +############################################################################### +# By default, build the library, tests, tools, and Python binding as universal binaries for macOS. + +if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL "")) + set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE) +endif() + + ############################################################################### # Project definition. 
@@ -173,11 +181,13 @@ endif() ############################################################################### # Optimization / internal linking preferences - -option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON) +# TODO Remove OCIO_USE_SSE once it is fully deprecated. +option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)" ON) +# TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated. +mark_as_advanced(OCIO_USE_SSE) +option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE}) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF) - ############################################################################### # GPU configuration message(STATUS "") @@ -185,6 +195,53 @@ message(STATUS "Checking for GPU configuration...") include(CheckSupportGL) +############################################################################### +# Check for ARM neon + +if(OCIO_USE_SIMD) + include(CheckSupportARMNeon) +endif() + + +############################################################################### +# Add sse2neon to the build if ARM NEON intrinsics are supported. + +if(HAVE_NEON AND OCIO_USE_SIMD) + # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is + # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake. + + # Sse2neon is not treated like an imported target. The logic to find sse2neon is here because + # a find module is not suitable for sse2neon's use case. + find_path(sse2neon_INCLUDE_DIR + NAMES + sse2neon.h + HINTS + ${sse2neon_ROOT} + PATH_SUFFIXES + sse2neon + include + sse2neon/include + ) + + # As per instructions on sse2neon's GitHub page, the following compiler flags should be used: + # "-march=armv8-a+fp+simd+crypto+crc". 
These flags are required for some ARM platforms that do + # not enable floating point calculations or SIMD instructions by default. However, for ARM64 + # (Apple ARM platform) and x86_64 platforms, these features are already enabled by default. + # Therefore, no additional compiler flags are needed. + if (NOT sse2neon_INCLUDE_DIR) + include(Installsse2neon) + else() + # Any changes to the following lines must be replicated in Installsse2neon.cmake as well. + # Create a target for sse2neon (non-imported) + add_library(sse2neon INTERFACE) + # Add the include directories to the target. + target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") + # Ignore the warnings coming from sse2neon.h as they are false positives. + target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) + endif() +endif() + + ############################################################################### # Define compilation and link flags diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index 788ae7e16b..fc58c5ec6f 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -277,7 +277,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR) - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding) - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins) -- ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations) +- ``-DOCIO_USE_SSE=ON`` (Deprecated -- please use OCIO_USE_SIMD) +- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON) - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests) - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests) - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering) @@ -285,6 +286,16 @@ Here are the 
most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation) - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation) +On the MacOS under the ARM architecture, the default is to make a universal build +(natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option +may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. + +When doing a universal build, note that the OCIO dependencies must be built as universal libraries +too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if +any of your installed libraries are not universal. The easiest way to address this is to set +OCIO_INSTALL_EXT_PACKAGES=ALL in order to let OCIO build everything. Alternatively, you may set +CMAKE_OSX_ARCHITECTURES to just the platform you are targeting. + Several command-line tools (such as ``ocioconvert``) require reading or writing image files. If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO and therefore you will be limited to using OpenEXR files with these tools rather than the diff --git a/share/cmake/macros/CheckForOpenEXRCompatibility.cmake b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake new file mode 100644 index 0000000000..bd79eb96e1 --- /dev/null +++ b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. +# +# Check for compatibility between OpenEXR and OpenImageIO since OCIO requires OpenEXR 3+. +# + +message(STATUS "Checking if the OpenImageIO found is built with OpenEXR 3+...") + +find_path (OpenImageIO_INCLUDE_DIR + NAMES + OpenImageIO/imageio.h + HINTS + ${OpenImageIO_ROOT} + # Assuming that OpenImageIO was installed normally, go back a few folders down + # to get the equivalent of OpenImageIO_ROOT. 
+ ${OpenImageIO_DIR}/../../.. + PATH_SUFFIXES + OpenImageIO/include + include +) + +if (NOT OpenImageIO_INCLUDE_DIR) + message(STATUS "${ColorWarning}Could not find OpenImageIO header to evaluate the OpenEXR version.") + message(STATUS "Please provide the OpenImageIO_DIR variable.") + message(STATUS "If your OpenImageIO's files are located in different root directory, \ +please provide the OpenImageIO_ROOT where the include files are located.${ColorReset}") +endif() + +# Try to figure out version number +set (OIIO_VERSION_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/oiioversion.h") +if (EXISTS "${OIIO_VERSION_HEADER}") + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MAJOR .*$") + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MAJOR ${TMP}) + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MINOR .*$") + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MINOR ${TMP}) + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_PATCH .*$") + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_PATCH ${TMP}) + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_TWEAK .*$") + if (TMP) + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_TWEAK ${TMP}) + else () + set (OpenImageIO_VERSION_TWEAK 0) + endif () + set (OpenImageIO_VERSION "${OpenImageIO_VERSION_MAJOR}.${OpenImageIO_VERSION_MINOR}.${OpenImageIO_VERSION_PATCH}.${OpenImageIO_VERSION_TWEAK}") +endif () + +set (OIIO_IMATH_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/Imath.h") +if (EXISTS "${OIIO_IMATH_HEADER}") + file(STRINGS "${OIIO_IMATH_HEADER}" TMP REGEX "^#define OIIO_USING_IMATH .*$") + string(REGEX MATCHALL "[0-9]" OIIO_IMATH_VERSION ${TMP}) + if (OIIO_IMATH_VERSION LESS 3) + message(STATUS "Skipping OpenImageIO built against OpenEXR 2, please use version 3 or greater.") + else() + set(is_OpenEXR_VERSION_valid TRUE) + endif() +endif() + +# clean up variables +unset(OpenImageIO_INCLUDE_DIR) 
+unset(OIIO_VERSION_HEADER) +unset(OIIO_VERSION_MAJOR) +unset(OIIO_VERSION_MINOR) +unset(OIIO_VERSION_PATCH) +unset(OIIO_VERSION_TWEAK) +unset(OIIO_IMATH_HEADER) +unset(OIIO_IMATH_VERSION) \ No newline at end of file diff --git a/share/cmake/macros/ocio_check_dependency_version.cmake b/share/cmake/macros/ocio_check_dependency_version.cmake deleted file mode 100644 index 2c2b741192..0000000000 --- a/share/cmake/macros/ocio_check_dependency_version.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright Contributors to the OpenColorIO Project. - -################################################################################################### -# ocio_check_dependency_version try to find the specified dependency and validate the version. -# -# Note that a function is used here to scoped-in any variables set by find_package. We do not want -# those variables to be propagated to the caller of the function. -# -# Argument: -# dep_name is the name of the dependency (package). Please note that dep_name is case sensitive. -# -################################################################################################### - -function (ocio_check_dependency_version dep_name output) - cmake_parse_arguments( - # prefix - Must be different than the one used in ocio_handle_dependency.cmake. 
- ocio_cdv - # options - "" - # one value keywords - "MIN_VERSION" - # multi value keywords - "" - # args - ${ARGN}) - - if (dep_name) - find_package(${dep_name} ${ocio_cdv_UNPARSED_ARGUMENTS}) - if (ocio_cdv_MIN_VERSION AND ${dep_name}_VERSION) - if (${${dep_name}_VERSION} VERSION_GREATER_EQUAL ocio_cdv_MIN_VERSION) - set(${output} TRUE) - else() - set(${output} FALSE) - endif() - endif() - endif() -endfunction() \ No newline at end of file diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake index 870b039bfc..48bc82de07 100644 --- a/share/cmake/modules/FindExtPackages.cmake +++ b/share/cmake/modules/FindExtPackages.cmake @@ -197,13 +197,10 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS) # Supported from OIIO 2.4+. Setting this for lower versions doesn't affect anything. set(OPENIMAGEIO_CONFIG_DO_NOT_FIND_IMATH 1) - include(ocio_check_dependency_version) - # Since OpenImageIO will try to find OpenEXR through its OpenImageIOConfig.cmake file, - # let's try to find OpenEXR first and if the version is too old, OCIO will not try to find - # OpenImageIO. - ocio_check_dependency_version( OpenEXR "is_OpenEXR_VERSION_valid" - MIN_VERSION ${OpenEXR_MININUM_VERSION} - CONFIG) + set(is_OpenEXR_VERSION_valid FALSE) + # Check for compatibility between OpenEXR and OpenImageIO. + # Will set is_OpenEXR_VERSION_valid to TRUE if valid. + include(CheckForOpenEXRCompatibility) # Do not try to find OpenImageIO if the version of OpenEXR is too old. 
if (is_OpenEXR_VERSION_valid) @@ -227,8 +224,6 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS) MIN_VERSION ${OIIO_VERSION} RECOMMENDED_VERSION ${OIIO_RECOMMENDED_VERSION} PROMOTE_TARGET OpenImageIO::OpenImageIO) - else() - message(WARNING "Skipping OpenImageIO because the OpenEXR found by OpenImageIO is too old (under ${OpenEXR_MININUM_VERSION})") endif() endif() diff --git a/share/cmake/modules/install/InstallOpenEXR.cmake b/share/cmake/modules/install/InstallOpenEXR.cmake index 419fb4aa59..ffbd3a1f55 100644 --- a/share/cmake/modules/install/InstallOpenEXR.cmake +++ b/share/cmake/modules/install/InstallOpenEXR.cmake @@ -201,6 +201,7 @@ if(_OpenEXR_TARGET_CREATE) IMPORTED_LOCATION ${IlmThread_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;OpenEXR::IlmThreadConfig;OpenEXR::Iex;Threads::Threads" + STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols" ) set_target_properties(OpenEXR::IlmThreadConfig PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR};${OpenEXR_INCLUDE_DIR}/OpenEXR" @@ -217,6 +218,7 @@ if(_OpenEXR_TARGET_CREATE) IMPORTED_LOCATION ${OpenEXRCore_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;ZLIB::ZLIB;\$" + STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols" ) set_target_properties(OpenEXR::OpenEXRUtil PROPERTIES IMPORTED_LOCATION ${OpenEXRUtil_LIBRARY} diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake new file mode 100644 index 0000000000..5f0f810ca1 --- /dev/null +++ b/share/cmake/modules/install/Installsse2neon.cmake @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. 
+# +# Install sse2neon (header-only version) +# https://github.com/DLTcollab/sse2neon +# +# +# Global targets defined by this module: +# sse2neon +############################################################################### + +# Download sse2neon using FetchContent and make it available at configure time. + +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon") +FetchContent_Declare(sse2neon + GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git + GIT_TAG v1.6.0 +) + +# FetchContent_MakeAvailable is not available until CMake 3.14+. +# Using FetchContent_GetProperties and FetchContent_Populate instead. +FetchContent_GetProperties(sse2neon) + +if(NOT sse2neon_POPULATED) + FetchContent_Populate(sse2neon) + + set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}") + file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon") + + # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSE2.cmake and to create sse2neon + # target for OCIO. + set(sse2neon_INCLUDE_DIR "${sse2neon_SOURCE_DIR}") + + # Any changes to the following lines must be replicated in ./CMakeLists.txt as well. + # Create a target for sse2neon (non-imported) + add_library(sse2neon INTERFACE) + # Add the include directories to the target. + target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") + # Ignore the warnings coming from sse2neon.h as they are false positives. + target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake new file mode 100644 index 0000000000..5d17854757 --- /dev/null +++ b/share/cmake/utils/CheckSupportARMNeon.cmake @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. 
+ +# Checks for ARM NEON availability + +include(CheckCXXSourceCompiles) + +set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + +if(APPLE) + set(CMAKE_OSX_ARCHITECTURES "arm64") +endif() + +set(source_code " +#include +int main() +{ + float32x4_t v = vdupq_n_f32(0); + return 0; +}") + +check_cxx_source_compiles ("${source_code}" HAVE_NEON) + +set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + +unset(_cmake_osx_architectures_orig) +mark_as_advanced(HAVE_NEON) diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake index f30bbb763c..07fecbd7a5 100644 --- a/share/cmake/utils/CheckSupportSSE2.cmake +++ b/share/cmake/utils/CheckSupportSSE2.cmake @@ -3,7 +3,9 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") +set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8) # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger @@ -16,8 +18,10 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8) endif() endif() -check_cxx_source_compiles (" - #include + +macro(check_sse2_availability _check_sse2_header_ _check_output_var_name_) + set(_SSE2_TEST_SOURCE_CODE " + ${_check_sse2_header_} int main () { __m128d a, b; @@ -26,10 +30,44 @@ check_cxx_source_compiles (" b = _mm_add_pd (a,a); _mm_storeu_pd (vals,b); return (0); - }" - HAVE_SSE2) + }") + + check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" ${_check_output_var_name_}) + mark_as_advanced(${_check_output_var_name_}) +endmacro() + +if(NOT HAVE_NEON) + check_sse2_availability("#include " HAVE_SSE2) +elseif(APPLE AND HAVE_NEON) + # Test for both supported architectures + # x86_64 and arm64 + set(ARCHITECTURES_LIST "arm64;x86_64") + + message(STATUS "Checking SSE2 support using SSE2NEON library for arm64 and x86_64 architectures") + 
foreach (current_arch IN LISTS ARCHITECTURES_LIST) + + set (CMAKE_OSX_ARCHITECTURES "${current_arch}") + + if(current_arch STREQUAL arm64) + set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR}) + set(_sse2_header_ "#include ") + set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON") + elseif(current_arch STREQUAL x86_64) + set(_sse2_header_ "#include ") + set(_output_var_name_ "HAVE_SSE2") + endif() + + check_sse2_availability("${_sse2_header_}" ${_output_var_name_}) + + endforeach() +endif() + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}") +set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + +unset(_cmake_required_flags_orig) +unset(_cmake_required_includes_orig) +unset(_cmake_osx_architectures_orig) -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}") -unset(_cmake_required_flags_old) -mark_as_advanced(HAVE_SSE2) diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index b11722f847..5eefc087cd 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -8,6 +8,21 @@ set(PLATFORM_COMPILE_OPTIONS "") set(PLATFORM_LINK_OPTIONS "") +############################################################################### +# Define if SSE2 can be used. + +if(OCIO_USE_SIMD) + include(CheckSupportSSE2) +endif() + +if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON) + message(STATUS "Disabling SSE optimizations, as the target doesn't support them") + set(OCIO_USE_SIMD OFF) +endif() + +############################################################################### +# Compile flags + if(USE_MSVC) set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/DUSE_MSVC") @@ -40,7 +55,6 @@ elseif(USE_CLANG) # Use of 'register' specifier must be removed for C++17 support. 
set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-register") - elseif(USE_GCC) set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_GCC") @@ -90,19 +104,6 @@ set_unless_defined(CMAKE_CXX_VISIBILITY_PRESET hidden) set_unless_defined(CMAKE_VISIBILITY_INLINES_HIDDEN YES) -############################################################################### -# Define if SSE2 can be used. - -message(STATUS "") -message(STATUS "Checking for SSE2 support...") -include(CheckSupportSSE2) - -if(NOT HAVE_SSE2) - message(STATUS "Disabling SSE optimizations, as the target doesn't support them") - set(OCIO_USE_SSE OFF) -endif(NOT HAVE_SSE2) - - ############################################################################### # Define RPATH. diff --git a/share/dev/windows/ocio.bat b/share/dev/windows/ocio.bat index 7f24bc279b..a4762a97d9 100644 --- a/share/dev/windows/ocio.bat +++ b/share/dev/windows/ocio.bat @@ -206,7 +206,7 @@ if !DO_CONFIGURE!==1 ( -DOCIO_BUILD_TESTS=ON^ -DOCIO_BUILD_GPU_TESTS=ON^ -DOCIO_BUILD_DOCS=OFF^ - -DOCIO_USE_SSE=ON^ + -DOCIO_USE_SIMD=ON^ -DOCIO_WARNING_AS_ERROR=ON^ -DOCIO_BUILD_JAVA=OFF^ "!OCIO_PATH!" 
diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index 6c2693db2f..608cbac63a 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -292,6 +292,10 @@ target_link_libraries(OpenColorIO MINIZIP::minizip-ng ) +if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON) + target_link_libraries(OpenColorIO PRIVATE $) +endif() + if(APPLE) target_link_libraries(OpenColorIO PRIVATE @@ -322,11 +326,20 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX)) set_property(TARGET OpenColorIO PROPERTY POSITION_INDEPENDENT_CODE ON) endif() -if(OCIO_USE_SSE) - target_compile_definitions(OpenColorIO - PRIVATE - USE_SSE - ) +if(OCIO_USE_SIMD) + if(HAVE_SSE2) + target_compile_definitions(OpenColorIO + PRIVATE + USE_SSE + ) + endif() + + if(HAVE_SSE2_WITH_SSE2NEON) + target_compile_definitions(OpenColorIO + PRIVATE + USE_SSE2_WITH_SSE2NEON + ) + endif() endif() if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS) diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index a903120ec7..05eae7136f 100644 --- a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -6,12 +6,19 @@ #define INCLUDED_OCIO_SSE_H -#ifdef USE_SSE - - -#include -#include - +#if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON) + +// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). +#if !defined(__aarch64__) + #if defined(USE_SSE) + #include + #endif +#elif defined(__aarch64__) + // ARM architecture A64 (ARM64) + #if defined(USE_SSE2_WITH_SSE2NEON) + #include + #endif +#endif #include @@ -20,6 +27,34 @@ namespace OCIO_NAMESPACE { +// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since +// it is redefining two of the functions from sse2neon. + +#if defined(__aarch64__) + #if defined(USE_SSE2_WITH_SSE2NEON) + // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to + // NaN handling. 
This doesn't seem to be significantly slower than the default sse2neon behavior. + + // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were + // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the + // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the + // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as + // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument + // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in + // the first argument continues to be filtered out. + static inline __m128 _mm_max_ps(__m128 a, __m128 b) + { + return vreinterpretq_m128_f32( + vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + } + static inline __m128 _mm_min_ps(__m128 a, __m128 b) + { + return vreinterpretq_m128_f32( + vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + } + #endif +#endif + // Macros for alignment declarations #define OCIO_SIMD_BYTES 16 #if defined( _MSC_VER ) @@ -45,7 +80,7 @@ static const __m128i EBIAS = _mm_set1_epi32(EXP_BIAS); static const __m128 EONE = _mm_set1_ps(1.0f); static const __m128 EZERO = _mm_set1_ps(0.0f); static const __m128 ENEG126 = _mm_set1_ps(-126.0f); -static const __m128 EPOS127 = _mm_set1_ps(127.0f); +static const __m128 EPOS128 = _mm_set1_ps(128.0f); static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits::infinity()); @@ -72,10 +107,10 @@ inline __m128 isNegativeSpecial(const __m128 x) return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(x), SIGN_SHIFT)); } -// Select function in SSE version 2 +// Bit-wise select function in SSE version 2 // -// Return the parameter arg_false when the parameter mask is 0x0, -// or the parameter arg_true when the mask is 0xffffffff. 
+// Return the parameter arg_false bit where the parameter mask is 0x0, +// return the parameter arg_true bit where the mask is 1. // // Algorithm Explanation: // @@ -105,7 +140,11 @@ inline __m128 isNegativeSpecial(const __m128 x) // inline __m128 sseSelect(const __m128& mask, const __m128& arg_true, const __m128& arg_false) { - return _mm_xor_ps( arg_false, _mm_and_ps( mask, _mm_xor_ps( arg_true, arg_false ) ) ); + return _mm_xor_ps( // bit-wise XOR of arg_false, (...) + arg_false, + _mm_and_ps( // bit-wise AND of mask, (...) + mask, + _mm_xor_ps( arg_true, arg_false ) ) ); // bit-wise XOR of arg_true, arg_false } // Coefficients of Chebyshev (minimax) degree 5 polynomial @@ -125,6 +164,10 @@ static const __m128 PNEXP2 = _mm_set1_ps((float)2.414427569091865207710e-1); static const __m128 PNEXP1 = _mm_set1_ps((float)6.930038344665415134202e-1); static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644); +// Note: The above polynomials have been chosen to achieve a precision of +// approximately 15 bits of mantissa. + + // log2 function in SSE version 2 // // The function log2() is evaluated by performing argument @@ -132,12 +175,14 @@ static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644); // over a restricted range. 
inline __m128 sseLog2(__m128 x) { - // y = log2( x ) = log2( 2^exposant * mantissa ) - // = exposant + log2( mantissa ) + // y = log2( x ) = log2( 2^exponent * mantissa ) + // = exponent + log2( mantissa ) __m128 mantissa - = _mm_or_ps( - _mm_andnot_ps(_mm_castsi128_ps(EMASK), x), EONE); + = _mm_or_ps( // OR with EONE + _mm_andnot_ps( // NOT(EMASK) AND x + _mm_castsi128_ps(EMASK), x), // reinterpret cast int to float + EONE); __m128 log2 = _mm_add_ps( @@ -161,14 +206,15 @@ inline __m128 sseLog2(__m128 x) PNLOG0); __m128i exponent - = _mm_sub_epi32( - _mm_srli_epi32( - _mm_and_si128(_mm_castps_si128(x), + = _mm_sub_epi32( // subtract EBIAS + _mm_srli_epi32( // right-shift by EXP_SHIFT + _mm_and_si128(_mm_castps_si128(x), // bit-wise AND with EMASK EMASK), EXP_SHIFT), EBIAS); - log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(exponent)); + log2 = _mm_add_ps(log2, + _mm_cvtepi32_ps(exponent)); // convert exponent to float return log2; } @@ -187,24 +233,30 @@ inline __m128 sseExp2(__m128 x) // Compute the largest integer not greater than x, i.e., floor(x) // Note: cvttps_epi32 simply cast the float value to int. That means cvttps_epi32(-2.7) = -2 // rather than -3, hence for negative numbers we need to add -1. This ensures that "fraction" - // is always in the range [0, 1). + // is always in the range [0, 1). Note that _mm_castps_si128(0xFFFFFFFF) is -1. + // If x is outside the INT_MIN to INT_MAX range, _mm_cvttps_epi32 will return 0x80000000 + // (i.e. INT_MIN, just the sign bit set), which Intel calls the "integer indefinite" value. + // When 1 is subtracted from INT_MIN, it gives INT_MAX. So floor_x is wrong for values + // outside [INT_MIN, INT_MAX] but it's ignored thanks to the checks at the bottom. + // It's also wrong for x=NaN, but again it's ok since the polynomial returns NaN and + // hence the output is NaN, regardless of floor_x. 
__m128i floor_x - = _mm_add_epi32( - _mm_cvttps_epi32(x), - _mm_castps_si128( - _mm_cmpnle_ps(EZERO, x))); + = _mm_add_epi32( // add a pair of integer arguments + _mm_cvttps_epi32(x), // convert float to int via truncation + _mm_castps_si128( // reinterpret cast float to int + _mm_cmpnle_ps(EZERO, x))); // NOT( EZERO <= x ) ? 0xFFFFFFFF : 0 // Compute exp2(floor_x) by moving floor_x to the exponent bits of the floating-point number. __m128 zf - = _mm_castsi128_ps( - _mm_slli_epi32( - _mm_add_epi32(floor_x, EBIAS), + = _mm_castsi128_ps( // reinterpret cast int to float + _mm_slli_epi32( // left shift by EXP_SHIFT + _mm_add_epi32(floor_x, EBIAS), // add a pair of integer arguments EXP_SHIFT)); - __m128 iexp = _mm_cvtepi32_ps(floor_x); - __m128 fraction = _mm_sub_ps(x, iexp); + __m128 iexp = _mm_cvtepi32_ps(floor_x); // convert floor_x to float + __m128 fraction = _mm_sub_ps(x, iexp); // x - iexp - // Compute exp2(fraction) using a polynomial approximation + // Compute exp2(fraction) using a polynomial approximation. __m128 mexp = _mm_add_ps( _mm_mul_ps( @@ -222,19 +274,26 @@ inline __m128 sseExp2(__m128 x) fraction), PNEXP0); - __m128 exp2 = _mm_mul_ps(zf, mexp); + __m128 exp2 = _mm_mul_ps(zf, mexp); // zf * mexp // Handle underflow: // If the (unbiased) exponent of zf is less than -126, the result is smaller than // the smallest representable floating-point number and an underflow computation is // potentially happening. When this happens, force the result to zero. - exp2 = _mm_andnot_ps(_mm_cmplt_ps(iexp, ENEG126), exp2); + // Note that as described above, floor_x is inaccurate, so the test here uses x. + exp2 = _mm_andnot_ps( // NOT(...) AND exp2 + _mm_cmplt_ps(x, ENEG126), // iexp < ENEG126 ? 0xFFFFFFFF : 0 + exp2); // Handle overflow: // If the (unbiased) exponent of zf is greater than 127, the result is larger than // the largest representable floating-point number and an overflow computation is // potentially happening. 
When this happens, force the result to positive infinity. - exp2 = sseSelect(_mm_cmpgt_ps(iexp, EPOS127), EPOSINF, exp2); + // Note that as described above, floor_x is inaccurate, so the test here uses x. + exp2 = sseSelect( // (...) is a mask to select EPOSINF, exp2 + _mm_cmpge_ps(x, EPOS128), // iexp > EPOS128 ? 0xFFFFFFFF : 0 + EPOSINF, + exp2); return exp2; } @@ -593,7 +652,7 @@ inline void sseSinCos(const float x, float& sin_x, float& cos_x) } // namespace OCIO_NAMESPACE -#endif +#endif // USE_SSE -#endif +#endif // INCLUDED_OCIO_SSE_H diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index 431d570f4e..edbb21ad2c 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -26,6 +26,10 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) xxHash ) + if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON) + target_link_libraries(${TEST_BINARY} PRIVATE sse2neon) + endif() + if(APPLE) # Frameworks needed to access the ICC monitor profile. target_link_libraries(${TEST_BINARY} @@ -43,12 +47,23 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) "${CMAKE_BINARY_DIR}/generated_include" ) endif(PRIVATE_INCLUDES) - if(OCIO_USE_SSE) - target_compile_definitions(${TEST_BINARY} - PRIVATE - USE_SSE - ) - endif(OCIO_USE_SSE) + + if(OCIO_USE_SIMD) + if (HAVE_SSE2) + target_compile_definitions(${TEST_BINARY} + PRIVATE + USE_SSE + ) + endif() + + if(HAVE_SSE2_WITH_SSE2NEON) + target_compile_definitions(${TEST_BINARY} + PRIVATE + USE_SSE2_WITH_SSE2NEON + ) + endif() + endif(OCIO_USE_SIMD) + if(WIN32) # A windows application linking to eXpat static libraries must # have the global macro XML_STATIC defined @@ -66,6 +81,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) ) endif() endif(WIN32) + set_target_properties(${TEST_BINARY} PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}" diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp index 79649a23ae..06e98fe0da 100644 
--- a/tests/cpu/ops/log/LogOpCPU_tests.cpp +++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp @@ -9,7 +9,6 @@ namespace OCIO = OCIO_NAMESPACE; - constexpr float qnan = std::numeric_limits<float>::quiet_NaN(); constexpr float inf = std::numeric_limits<float>::infinity(); @@ -23,6 +22,7 @@ void TestLog(float logBase) 0.f, 0.f, 0.f, inf, -inf, -inf, -inf, 0.f, 0.f, 0.f, 0.f, -inf }; + float rgba[32] = {}; OCIO::ConstLogOpDataRcPtr logOp = std::make_shared( @@ -52,16 +52,25 @@ void TestLog(float logBase) expected = logf(std::max(minValue, (float)expected)) / logf(logBase); } + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_CLOSE(result, expected, error); } const float resMin = logf(minValue) / logf(logBase); + + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_CLOSE(rgba[8], resMin, error); OCIO_CHECK_EQUAL(rgba[11], 0.0f); + + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan}. OCIO_CHECK_CLOSE(rgba[12], resMin, error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); + // SSE implementation of sseLog2 & sseExp2 do not behave like CPU. // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below. + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. #ifdef USE_SSE if (logBase == 10.0f) { @@ -75,10 +84,16 @@ void TestLog(float logBase) OCIO_CHECK_EQUAL(rgba[16], inf); #endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], resMin, error); OCIO_CHECK_EQUAL(rgba[23], inf); + + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_CLOSE(rgba[24], resMin, error); OCIO_CHECK_EQUAL(rgba[27], 0.0f); + + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}.
OCIO_CHECK_CLOSE(rgba[28], resMin, error); OCIO_CHECK_EQUAL(rgba[31], -inf); } @@ -127,30 +142,33 @@ void TestAntiLog(float logBase) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison. + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f)); } -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[8], inf); -#else + + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8])); -#endif OCIO_CHECK_EQUAL(rgba[11], 0.0f); + + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan}. OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0 -#else + + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. OCIO_CHECK_EQUAL(rgba[16], inf); -#endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol); OCIO_CHECK_EQUAL(rgba[23], inf); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[24], inf); -#else + + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_EQUAL(rgba[24], 0.0f); -#endif OCIO_CHECK_EQUAL(rgba[27], 0.0f); + + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol); OCIO_CHECK_EQUAL(rgba[31], -inf); } @@ -263,39 +281,35 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison. + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ...
} OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f)); } const float res0 = ComputeLog2LinEval(0.0f, redP); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[8], inf); -#else + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8])); -#endif - OCIO_CHECK_EQUAL(rgba[11], 0.0f); + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan}. OCIO_CHECK_CLOSE(rgba[12], res0, rtol); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); -#ifdef USE_SSE - OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol); -#else + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. OCIO_CHECK_EQUAL(rgba[16], inf); -#endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], res0, rtol); OCIO_CHECK_EQUAL(rgba[23], inf); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[24], inf); -#else + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol); -#endif OCIO_CHECK_EQUAL(rgba[27], 0.0f); + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. OCIO_CHECK_CLOSE(rgba[28], res0, rtol); OCIO_CHECK_EQUAL(rgba[31], -inf); } @@ -399,18 +413,24 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_CLOSE(result, expected, error); } const float res0 = ComputeLin2LogEval(0.0f, redP); const float resMin = ComputeLin2LogEval(-100.0f, redP); + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_CLOSE(rgba[8], resMin, error); OCIO_CHECK_EQUAL(rgba[11], 0.0f); + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan}.
OCIO_CHECK_CLOSE(rgba[12], res0, error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error); #else @@ -418,17 +438,20 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test) #endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], res0, error); OCIO_CHECK_EQUAL(rgba[23], inf); + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_CLOSE(rgba[24], resMin, error); OCIO_CHECK_EQUAL(rgba[27], 0.0f); + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. OCIO_CHECK_CLOSE(rgba[28], res0, error); OCIO_CHECK_EQUAL(rgba[31], -inf); } -OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) +OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) { constexpr int numPixels = 3; constexpr int numValues = 4 * numPixels; @@ -460,18 +483,21 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) const float error = 1e-7f; #endif // USE_SSE + // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba[0], -0.168771237955f, error); OCIO_CHECK_CLOSE(rgba[1], -0.048771237955f, error); OCIO_CHECK_CLOSE(rgba[2], -0.036771237955f, error); + + // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error); #else OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error); #endif // USE_SSE - OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error); + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. 
#ifdef USE_SSE OCIO_CHECK_EQUAL(rgba[8], -inf); OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error); @@ -492,9 +518,12 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO::ConstOpCPURcPtr pRendererNoLS = OCIO::GetLogRenderer(lognols, true); pRendererNoLS->apply(rgbaImage, rgba_nols, numPixels); + // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba_nols[0], -0.325512374199f, error); OCIO_CHECK_CLOSE(rgba_nols[1], -0.127141806077f, error); OCIO_CHECK_CLOSE(rgba_nols[2], -0.107304749265f, error); + + // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error); @@ -502,8 +531,9 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error); #endif // USE_SSE OCIO_CHECK_CLOSE(rgba_nols[6], 0.68141615509f, error); - OCIO_CHECK_EQUAL(rgba_nols[8], -inf); + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. + OCIO_CHECK_EQUAL(rgba_nols[8], -inf); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10])); @@ -527,12 +557,18 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) #else const float error2 = 1e-7f; #endif // USE_SSE + + // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba_nobreak[0], -24.6f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[1], -0.264385618977f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[2], -0.20700938942f, error2); + + // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. OCIO_CHECK_CLOSE(rgba_nobreak[4], 0.028548034423f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[5], 0.170878935551f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[6], 0.68141615509, error2); + + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. 
OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2); @@ -542,17 +578,11 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO_CHECK_CLOSE(rgba_nobreak[10], -24.6f, error2); } -OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) +OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) { // Inverse of previous test. - const float rgbaImage[12] = { -0.168771237955f, - -0.048771237955f, - -0.036771237955f, - 0.f, - 0.047228762045f, - 0.170878935551f, - 0.68141615509f, - 0.f, + const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f, + 0.047228762045f, 0.170878935551f, 0.68141615509f, 0.f, -inf, inf, qnan, 0.0f }; float rgba[12] = {}; @@ -571,18 +601,22 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) #else const float error = 1e-7f; #endif // USE_SSE + + // Evaluating output for input rgbaImage[0-2] = + // { -0.168771237955f, -0.048771237955f, -0.036771237955f, ... }. OCIO_CHECK_CLOSE(rgba[0], -0.1f, error); OCIO_CHECK_CLOSE(rgba[1], 0.0f, error); OCIO_CHECK_CLOSE(rgba[2], 0.01f, error); + + // Evaluating output for input rgbaImage[4-6] = + // { 0.047228762045f, 0.170878935551f, 0.68141615509f, ... }. OCIO_CHECK_CLOSE(rgba[4], 0.08f, error); OCIO_CHECK_CLOSE(rgba[5], 0.16f, error); OCIO_CHECK_CLOSE(rgba[6], 1.16f, 10.0f * error); + + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. 
OCIO_CHECK_EQUAL(rgba[8], -inf); -#ifdef USE_SSE - OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0 -#else OCIO_CHECK_EQUAL(rgba[9], inf); -#endif OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10])); } diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index 2245fbf1d5..4a19e6b5d6 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -26,12 +26,12 @@ set(SOURCES add_executable(test_gpu_exec ${SOURCES}) -if(OCIO_USE_SSE) +if(OCIO_USE_SIMD AND HAVE_SSE2) # Define USE_SSE only when SSE2 is actually available, as in tests/cpu. target_compile_definitions(test_gpu_exec PRIVATE USE_SSE ) -endif(OCIO_USE_SSE) +endif() set_target_properties(test_gpu_exec PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt index 3316720d47..3f327ff295 100644 --- a/tests/osl/CMakeLists.txt +++ b/tests/osl/CMakeLists.txt @@ -18,12 +18,12 @@ set(SOURCES add_executable(test_osl_exec ${SOURCES}) -if(OCIO_USE_SSE) +if(OCIO_USE_SIMD AND HAVE_SSE2) # Define USE_SSE only when SSE2 is actually available, as in tests/cpu. target_compile_definitions(test_osl_exec PRIVATE USE_SSE ) -endif(OCIO_USE_SSE) +endif() set_target_properties(test_osl_exec PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}"