From f2991ae951578a24ae0ebf29e4efb487a20fb796 Mon Sep 17 00:00:00 2001 From: "Gogar, Sunny L" Date: Tue, 14 Jul 2020 15:27:09 -0700 Subject: [PATCH 1/3] Initial commit for iso3dfd_dpcpp code sample Signed-off-by: Gogar, Sunny L --- .../iso3dfd_dpcpp/CMakeLists.txt | 8 + .../StructuredGrids/iso3dfd_dpcpp/License.txt | 7 + .../StructuredGrids/iso3dfd_dpcpp/README.md | 145 ++++++ .../iso3dfd_dpcpp/include/device_selector.hpp | 47 ++ .../iso3dfd_dpcpp/include/iso3dfd.h | 51 +++ .../iso3dfd_dpcpp/iso3dfd_dpcpp.sln | 25 ++ .../iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj | 157 +++++++ .../iso3dfd_dpcpp.vcxproj.filters | 36 ++ .../iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user | 11 + .../StructuredGrids/iso3dfd_dpcpp/sample.json | 30 ++ .../iso3dfd_dpcpp/src/CMakeLists.txt | 27 ++ .../iso3dfd_dpcpp/src/iso3dfd.cpp | 343 ++++++++++++++ .../iso3dfd_dpcpp/src/iso3dfd_kernels.cpp | 419 ++++++++++++++++++ .../iso3dfd_dpcpp/src/utils.cpp | 165 +++++++ 14 files changed, 1471 insertions(+) create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h create mode 100755 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln create mode 100755 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj create mode 100755 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters create mode 100755 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user create mode 100755 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp create mode 100644 DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt new file mode 100644 index 0000000000..ab666d05d1 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required (VERSION 3.0) +project (ISO3DFD_DPCPP) +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt new file mode 100644 index 0000000000..da5f7c1888 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt @@ -0,0 +1,7 @@ +Copyright 2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md new file mode 100644 index 0000000000..516f9c1ba6 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md @@ -0,0 +1,145 @@ +# `ISO3DFD DPC++` Sample + +The ISO3DFD sample refers to Three-Dimensional Finite-Difference Wave Propagation in Isotropic Media. It is a three-dimensional stencil to simulate a wave propagating in a 3D isotropic medium and shows some of the more common challenges when targeting SYCL devices (GPU/CPU) in more complex applications. + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux* Ubuntu* 18.04; Windows 10 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler beta; +| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Time to complete | 15 minutes + +Performance number tabulation [if applicable -- **NO for beta**] + +| iso3dfd sample | Performance data +|:--- |:--- +| Scalar baseline -O2 | 1.0 +| SYCL | 2x speedup + + +## Purpose + +ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic isotropic wave equation which can be used as a proxy for propogating a seismic wave. Kernels in this sample are implemented as 16th order in space, with symmetric coefficients, and 2nd order in time scheme without boundary conditions.. Using Data Parallel C++, the sample can explicitly run on the GPU and/or CPU to propagate a seismic wave which is a compute intensive task. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a compatible GPU is not detected. By default, the output will print the device name where the DPC++ code ran along with the grid computation metrics - flops and effective throughput. For validating results, a serial version of the application will be run on CPU and results will be compared to the DPC++ version. + + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes includes the use of the following : +* DPC++ local buffers and accessors (declare local memory buffers and accessors to be accessed and managed by each DPC++ workgroup) +* Code for Shared Local Memory (SLM) optimizations +* DPC++ kernels (including parallel_for function and nd-range<3> objects) +* DPC++ queues (including custom device selector and exception handlers) + + +## License + +This code sample is licensed under MIT license. + + +## Building the `ISO3DFD` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system". + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: +1. Build the program using the following `cmake` commands. +``` +$ mkdir build +$ cd build +$ cmake .. +$ make -j +``` + +> Note: by default, executable is build with kernel with direct global memory usage. You can build the kernel with shared local memory (SLM) buffers with the following: +``` +cmake -DSHARED_KERNEL=1 .. +make -j +``` + +2. Run the program : + ``` + make run + ``` +> Note: for selecting CPU as a SYCL device use `make run_cpu` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System Using Visual Studio* Version 2017 or Newer +``` +* Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + +* Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" + Run - MSBuild mandelbrot.sln /t:Rebuild /p:Configuration="Release" +``` + +## Running the Sample +``` +make run +``` + +### Application Parameters +You can modify the ISO3DFD parameters from the command line. + * Configurable Application Parameters + + Usage: src/iso3dfd.exe n1 n2 n3 b1 b2 b3 Iterations [omp|sycl] [gpu|cpu] + + n1 n2 n3 : Grid sizes for the stencil + b1 b2 b3 OR : cache block sizes for cpu openmp version. + b1 b2 : Thread block sizes in X and Y dimension for SYCL version. + and b3 : size of slice of work in Z dimension for SYCL version. + Iterations : No. of timesteps. + [omp|sycl] : Optional: Run the OpenMP or the SYCL variant. Default is to use both for validation + [gpu|cpu] : Optional: Device to run the SYCL version Default is to use the GPU if available, if not fallback to CPU + +### Example of Output +``` +Grid Sizes: 256 256 256 +Memory Usage: 230 MB + ***** Running C++ Serial variant ***** +Initializing ... +-------------------------------------- +time : 2.92984 secs +throughput : 57.2632 Mpts/s +flops : 3.49306 GFlops +bytes : 0.687159 GBytes/s + +-------------------------------------- + +-------------------------------------- + ***** Running SYCL variant ***** +Initializing ... + Running on Intel(R) Gen9 + The Device Max Work Group Size is : 256 + The Device Max EUCount is : 48 + The blockSize x is : 32 + The blockSize y is : 8 + Using Global Memory Kernel +-------------------------------------- +time : 0.597494 secs +throughput : 280.793 Mpts/s +flops : 17.1284 GFlops +bytes : 3.36952 GBytes/s + +-------------------------------------- + +-------------------------------------- +Final wavefields from SYCL device and CPU are equivalent: Success +-------------------------------------- +``` + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp new file mode 100644 index 0000000000..dcef0afa0e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp @@ -0,0 +1,47 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef DEVICESELECTOR_HPP +#define DEVICESELECTOR_HPP + +#include +#include +#include +#include "CL/sycl.hpp" + +// This is the class provided to SYCL runtime by the application to decide +// on which device to run, or whether to run at all. +// When selecting a device, SYCL runtime first takes (1) a selector provided by +// the program or a default one and (2) the set of all available devices. Then +// it passes each device to the '()' operator of the selector. Device, for +// which '()' returned the highest number, is selected. If a negative number +// was returned for all devices, then the selection process will cause an +// exception. +class MyDeviceSelector : public sycl::device_selector { + public: + MyDeviceSelector(const std::string &p) : pattern(p) { + // std::cout << "Looking for \"" << p << "\" devices" << std::endl; + } + + // This is the function which gives a "rating" to devices. + virtual int operator()(const sycl::device &device) const override { + // The template parameter to device.get_info can be a variety of properties + // defined by the SYCL spec's sycl::info:: enum. Properties may have + // different types. Here we query name which is a string. + const std::string name = device.get_info(); + // std::cout << "Trying device: " << name << "..." << std::endl; + // std::cout << " Vendor: " << + // device.get_info() << std::endl; + + // Device with pattern in the name is prioritized: + return (name.find(pattern) != std::string::npos) ? 100 : 1; + } + + private: + std::string pattern; +}; + +#endif diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h new file mode 100644 index 0000000000..50c65cd6f6 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h @@ -0,0 +1,51 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +using namespace sycl; + +#include +#include +#include +#include +#include +/* + * Parameters to define coefficients + * kHalfLength: Radius of the stencil + * Sample source code is tested for kHalfLength=8 resulting in + * 16th order Stencil finite difference kernel + */ +constexpr float dt = 0.002f; +constexpr float dxyz = 50.0f; +constexpr unsigned int kHalfLength = 8; + +/* + * Padding to test and eliminate shared local memory bank conflicts for + * the shared local memory(slm) version of the kernel executing on GPU + */ +constexpr unsigned int kPad = 0; + +bool Iso3dfdDevice(sycl::queue &q, float *ptr_next, float *ptr_prev, + float *ptr_vel, float *ptr_coeff, size_t n1, size_t n2, + size_t n3, size_t n1_block, size_t n2_block, + size_t n3_block, size_t end_z, unsigned int num_iterations); + +void PrintTargetInfo(sycl::queue &q, unsigned int dim_x, unsigned int dim_y); + +void Usage(const std::string &program_name); + +void PrintStats(double time, size_t n1, size_t n2, size_t n3, + unsigned int num_iterations); + +bool WithinEpsilon(float *output, float *reference, const size_t dim_x, + const size_t dim_y, const size_t dim_z, + const unsigned int radius, const int zadjust, + const float delta); + +bool CheckGridDimension(size_t n1, size_t n2, size_t n3, unsigned int dim_x, + unsigned int dim_y, unsigned int block_z); + +bool CheckBlockDimension(sycl::queue &q, unsigned int dim_x, unsigned int dim_y); diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln new file mode 100755 index 0000000000..33e315d59e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.27130.2010 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "iso3dfd_dpcpp", "iso3dfd_dpcpp.vcxproj", "{07DA0A96-CA76-4446-9586-99A145B9A9C8}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Debug|x64.ActiveCfg = Debug|x64 + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Debug|x64.Build.0 = Debug|x64 + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Release|x64.ActiveCfg = Release|x64 + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {3F515120-AC09-42A9-97D3-A26B1251EC9D} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj new file mode 100755 index 0000000000..17aba91fe9 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj @@ -0,0 +1,157 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {07da0a96-ca76-4446-9586-99a145b9a9c8} + Win32Proj + iso3dfd_dpcpp + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + + + + + + + + + + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters new file mode 100755 index 0000000000..e314961ca6 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters @@ -0,0 +1,36 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user new file mode 100755 index 0000000000..5675a6f273 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user @@ -0,0 +1,11 @@ + + + + 256 256 256 32 8 64 100 sycl gpu + WindowsLocalDebugger + + + 256 256 256 32 8 64 100 sycl gpu + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json new file mode 100755 index 0000000000..695fea9e80 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json @@ -0,0 +1,30 @@ +{ + "guid": "3A7DA713-6083-4CA3-B66E-A3DF21744EB4", + "name": "iso3dfd_dpcpp", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "A finite difference stencil kernel for solving 3D acoustic isotropic wave equation", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild iso3dfd_dpcpp.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "iso3dfd.exe 256 256 256 32 8 64 10 gpu" + ] + }] + + } +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt new file mode 100644 index 0000000000..4801b32e96 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt @@ -0,0 +1,27 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 --std=c++17") +include_directories("../include/") + +OPTION(SHARED_KERNEL "Use SLM Kernel Version - Only for GPU" OFF) +if(SHARED_KERNEL) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SHARED") +endif(SHARED_KERNEL) + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + +add_executable (iso3dfd.exe iso3dfd.cpp iso3dfd_kernels.cpp utils.cpp) +target_link_libraries(iso3dfd.exe OpenCL sycl) +if(WIN32) + add_custom_target (run iso3dfd.exe 256 256 256 32 8 64 10 gpu) + add_custom_target (run_cpu iso3dfd.exe 256 256 256 256 1 1 10 cpu) +else() + add_custom_target (run iso3dfd.exe 256 256 256 32 8 64 10 gpu) + add_custom_target (run_cpu iso3dfd.exe 256 256 256 256 1 1 10 cpu) +endif() diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp new file mode 100644 index 0000000000..e2c4a687d8 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp @@ -0,0 +1,343 @@ +//============================================================== +// Copyright � 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +// ISO3DFD: Intel� oneAPI DPC++ Language Basics Using 3D-Finite-Difference-Wave +// Propagation +// +// ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic +// isotropic wave equation. Kernels in this sample are implemented as 16th order +// in space, 2nd order in time scheme without boundary conditions. Using Data +// Parallel C++, the sample can explicitly run on the GPU and/or CPU to +// calculate a result. If successful, the output will print the device name +// where the DPC++ code ran along with the grid computation metrics - flops +// and effective throughput +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Custom device selector +// DPC++ Buffers and accessors (communicate data between the host and the +// device) +// DPC++ Kernels (including parallel_for function and nd-range<3> +// objects) +// Shared Local Memory (SLM) optimizations (DPC++) +// DPC++ Basic synchronization (barrier function) +// +#include "iso3dfd.h" +#include +#include "device_selector.hpp" +#include "dpc_common.hpp" + +/* + * Host-Code + * Function used for initialization + */ +void Initialize(float* ptr_prev, float* ptr_next, float* ptr_vel, size_t n1, + size_t n2, size_t n3) { + std::cout << "Initializing ... \n"; + size_t dim2 = n2 * n1; + + for (size_t i = 0; i < n3; i++) { + for (size_t j = 0; j < n2; j++) { + size_t offset = i * dim2 + j * n1; +#pragma omp simd + for (int k = 0; k < n1; k++) { + ptr_prev[offset + k] = 0.0f; + ptr_next[offset + k] = 0.0f; + ptr_vel[offset + k] = + 2250000.0f * dt * dt; // Integration of the v*v and dt*dt + } + } + } + // Add a source to initial wavefield as an initial condition + float val = 1.f; + for (int s = 5; s >= 0; s--) { + for (int i = n3 / 2 - s; i < n3 / 2 + s; i++) { + for (int j = n2 / 4 - s; j < n2 / 4 + s; j++) { + size_t offset = i * dim2 + j * n1; + for (int k = n1 / 4 - s; k < n1 / 4 + s; k++) { + ptr_prev[offset + k] = val; + } + } + } + val *= 10; + } +} + +/* + * Host-Code + * OpenMP implementation for single iteration of iso3dfd kernel. + * This function is used as reference implementation for verification and + * also to compare performance of OpenMP and DPC++ on CPU + * Additional Details: + * https://software.intel.com/en-us/articles/eight-optimizations-for-3-dimensional-finite-difference-3dfd-code-with-an-isotropic-iso + */ +void Iso3dfdIteration(float* ptr_next_base, float* ptr_prev_base, + float* ptr_vel_base, float* coeff, const size_t n1, + const size_t n2, const size_t n3, const size_t n1_block, + const size_t n2_block, const size_t n3_block) { + size_t dimn1n2 = n1 * n2; + size_t n3End = n3 - kHalfLength; + size_t n2End = n2 - kHalfLength; + size_t n1End = n1 - kHalfLength; + +#pragma omp parallel default(shared) +#pragma omp for schedule(static) collapse(3) + for (size_t bz = kHalfLength; bz < n3End; + bz += n3_block) { // start of cache blocking + for (size_t by = kHalfLength; by < n2End; by += n2_block) { + for (size_t bx = kHalfLength; bx < n1End; bx += n1_block) { + int izEnd = std::min(bz + n3_block, n3End); + int iyEnd = std::min(by + n2_block, n2End); + int ixEnd = std::min(n1_block, n1End - bx); + for (size_t iz = bz; iz < izEnd; iz++) { // start of inner iterations + for (size_t iy = by; iy < iyEnd; iy++) { + float* ptr_next = ptr_next_base + iz * dimn1n2 + iy * n1 + bx; + float* ptr_prev = ptr_prev_base + iz * dimn1n2 + iy * n1 + bx; + float* ptr_vel = ptr_vel_base + iz * dimn1n2 + iy * n1 + bx; +#pragma omp simd + for (size_t ix = 0; ix < ixEnd; ix++) { + float value = 0.0; + value += ptr_prev[ix] * coeff[0]; +#pragma unroll(kHalfLength) + for (unsigned int ir = 1; ir <= kHalfLength; ir++) { + value += coeff[ir] * + ((ptr_prev[ix + ir] + ptr_prev[ix - ir]) + + (ptr_prev[ix + ir * n1] + ptr_prev[ix - ir * n1]) + + (ptr_prev[ix + ir * dimn1n2] + + ptr_prev[ix - ir * dimn1n2])); + } + ptr_next[ix] = + 2.0f * ptr_prev[ix] - ptr_next[ix] + value * ptr_vel[ix]; + } + } + } // end of inner iterations + } + } + } // end of cache blocking +} + +/* + * Host-Code + * Driver function for ISO3DFD OpenMP code + * Uses ptr_next and ptr_prev as ping-pong buffers to achieve + * accelerated wave propogation + */ +void Iso3dfd(float* ptr_next, float* ptr_prev, float* ptr_vel, float* coeff, + const size_t n1, const size_t n2, const size_t n3, + const unsigned int nreps, const size_t n1_block, + const size_t n2_block, const size_t n3_block) { + for (unsigned int it = 0; it < nreps; it += 1) { + Iso3dfdIteration(ptr_next, ptr_prev, ptr_vel, coeff, n1, n2, n3, n1_block, + n2_block, n3_block); + + // here's where boundary conditions and halo exchanges happen + // Swap previous & next between iterations + it++; + if (it < nreps) + Iso3dfdIteration(ptr_prev, ptr_next, ptr_vel, coeff, n1, n2, n3, n1_block, + n2_block, n3_block); + } // time loop +} + +/* + * Host-Code + * Main function to drive the sample application + */ +int main(int argc, char* argv[]) { + // Arrays used to update the wavefield + float* prev_base; + float* next_base; + // Array to store wave velocity + float* vel_base; + // Array to store results for comparison + float* temp; + + bool sycl = true; + bool omp = true; + bool error = false; + bool is_gpu = true; + + size_t n1, n2, n3; + size_t n1_block, n2_block, n3_block; + unsigned int num_iterations; + + // Read Input Parameters + try { + n1 = std::stoi(argv[1]) + (2 * kHalfLength); + n2 = std::stoi(argv[2]) + (2 * kHalfLength); + n3 = std::stoi(argv[3]) + (2 * kHalfLength); + n1_block = std::stoi(argv[4]); + n2_block = std::stoi(argv[5]); + n3_block = std::stoi(argv[6]); + num_iterations = std::stoi(argv[7]); + } + + catch (...) { + Usage(argv[0]); + return 1; + } + + // Read optional arguments to select version and device + for (auto arg = 8; arg < argc; arg++) { + std::string arg_value = argv[arg]; + transform(arg_value.begin(), arg_value.end(), arg_value.begin(), ::tolower); + + if (arg_value == "omp") { + omp = true; + sycl = false; + } else if (arg_value == "sycl") { + omp = false; + sycl = true; + } else if (arg_value == "gpu") { + is_gpu = true; + } else if (arg_value == "cpu") { + is_gpu = false; + } else { + Usage(argv[0]); + return 1; + } + } + + // Validate input sizes for the grid and block dimensions + if (CheckGridDimension(n1 - 2 * kHalfLength, n2 - 2 * kHalfLength, + n3 - 2 * kHalfLength, n1_block, n2_block, n3_block)) { + Usage(argv[0]); + return 1; + } + + // Compute the total size of grid + size_t nsize = n1 * n2 * n3; + + prev_base = new float[nsize]; + next_base = new float[nsize]; + vel_base = new float[nsize]; + + // Compute coefficients to be used in wavefield update + float coeff[kHalfLength + 1] = {-3.0548446, +1.7777778, -3.1111111e-1, + +7.572087e-2, -1.76767677e-2, +3.480962e-3, + -5.180005e-4, +5.074287e-5, -2.42812e-6}; + + // Apply the DX DY and DZ to coefficients + coeff[0] = (3.0f * coeff[0]) / (dxyz * dxyz); + for (int i = 1; i <= kHalfLength; i++) { + coeff[i] = coeff[i] / (dxyz * dxyz); + } + + std::cout << "Grid Sizes: " << n1 - 2 * kHalfLength << " " + << n2 - 2 * kHalfLength << " " << n3 - 2 * kHalfLength << "\n"; + std::cout << "Memory Usage: " << ((3 * nsize * sizeof(float)) / (1024 * 1024)) + << " MB\n"; + + // Check if running OpenMP OR Serial version on CPU + if (omp) { +#if defined(_OPENMP) + std::cout << " ***** Running OpenMP variant *****\n"; +#else + std::cout << " ***** Running C++ Serial variant *****\n"; +#endif + + // Initialize arrays and introduce initial conditions (source) + Initialize(prev_base, next_base, vel_base, n1, n2, n3); + + // Start timer + dpc_common::TimeInterval t_ser; + // Invoke the driver function to perform 3D wave propogation + // using OpenMP/Serial version + Iso3dfd(next_base, prev_base, vel_base, coeff, n1, n2, n3, num_iterations, + n1_block, n2_block, n3_block); + + // End timer + PrintStats(t_ser.Elapsed() * 1e3, n1, n2, n3, num_iterations); + } + + // Check if running both OpenMP/Serial and DPC++ version + // Keeping a copy of output buffer from OpenMP version + // for comparison + if (omp && sycl) { + temp = new float[nsize]; + if (num_iterations % 2) + memcpy(temp, next_base, nsize * sizeof(float)); + else + memcpy(temp, prev_base, nsize * sizeof(float)); + } + + // Check if running DPC++/SYCL version + if (sycl) { + std::cout << " ***** Running SYCL variant *****\n"; + // Initialize arrays and introduce initial conditions (source) + Initialize(prev_base, next_base, vel_base, n1, n2, n3); + + // Initializing a string pattern to allow a custom device selector + // pick a SYCL device as per user's preference and available devices + // Default value of pattern is set to CPU + std::string pattern("CPU"); + std::string pattern_gpu("Gen"); + + // Replacing the pattern string to Gen if running on a GPU + if (is_gpu) { + pattern.replace(0, 3, pattern_gpu); + } + + // Create a custom device selector using DPC++ device selector class + MyDeviceSelector device_sel(pattern); + + // Create a device queue using DPC++ class queue with a custom + // device selector + queue q(device_sel, dpc_common::exception_handler); + + // Validate if the block sizes selected are + // within range for the selected SYCL device + if (CheckBlockDimension(q, n1_block, n2_block)) { + Usage(argv[0]); + return 1; + } + + // Start timer + dpc_common::TimeInterval t_dpc; + + // Invoke the driver function to perform 3D wave propogation + // using DPC++ version on the selected SYCL device + Iso3dfdDevice(q, next_base, prev_base, vel_base, coeff, n1, n2, n3, + n1_block, n2_block, n3_block, n3 - kHalfLength, + num_iterations); + // Wait for the commands to complete. Enforce synchronization on the command + // queue + q.wait_and_throw(); + + // End timer + PrintStats(t_dpc.Elapsed() * 1e3, n1, n2, n3, num_iterations); + } + + // If running both OpenMP/Serial and DPC++ version + // Comparing results + if (omp && sycl) { + if (num_iterations % 2) { + error = WithinEpsilon(next_base, temp, n1, n2, n3, kHalfLength, 0, 0.1f); + } else { + error = WithinEpsilon(prev_base, temp, n1, n2, n3, kHalfLength, 0, 0.1f); + } + if (error) { + std::cout << "Final wavefields from SYCL device and CPU are not " + << "equivalent: Fail\n"; + } else { + std::cout << "Final wavefields from SYCL device and CPU are equivalent:" + << " Success\n"; + } + std::cout << "--------------------------------------\n"; + delete[] temp; + } + + delete[] prev_base; + delete[] next_base; + delete[] vel_base; + + return error ? 1 : 0; +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp new file mode 100644 index 0000000000..1b7bdec23e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp @@ -0,0 +1,419 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// ISO3DFD: Intel® oneAPI DPC++ Language Basics Using 3D-Finite-Difference-Wave +// Propagation +// +// ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic +// isotropic wave equation which can be used as a proxy for propogating a +// seismic wave. Kernels in this sample are implemented as 16th order in space, +// with symmetric coefficients, and 2nd order in time scheme without boundary +// conditions.. Using Data Parallel C++, the sample can explicitly run on the +// GPU and/or CPU to propagate a seismic wave which is a compute intensive task. +// If successful, the output will print the device name +// where the DPC++ code ran along with the grid computation metrics - flops +// and effective throughput. +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Custom device selector +// DPC++ Buffers and accessors (communicate data between the host and the +// device) +// DPC++ Kernels (including parallel_for function and nd-range<3> +// objects) +// Shared Local Memory (SLM) optimizations (DPC++) +// DPC++ Basic synchronization (barrier function) +// +#include "iso3dfd.h" + +/* + * Device-Code - Optimized for GPU + * SYCL implementation for single iteration of iso3dfd kernel + * using shared local memory optimizations + * + * ND-Range kernel is used to spawn work-items in x, y dimension + * Each work-item then traverses in the z-dimension + * + * z-dimension slicing can be used to vary the total number + * global work-items. + * + * SLM Padding can be used to eliminate SLM bank conflicts if + * there are any + */ +void Iso3dfdIterationSLM(sycl::nd_item<3> it, float *next, float *prev, + float *vel, const float *coeff, float *tab, size_t nx, + size_t nxy, size_t bx, size_t by, size_t z_offset, + int full_end_z) { + // Compute local-id for each work-item + auto id0 = it.get_local_id(2); + auto id1 = it.get_local_id(1); + + // Compute the position in local memory each work-item + // will fetch data from global memory into shared + // local memory + auto stride = it.get_local_range(2) + 2 * kHalfLength + kPad; + auto identifiant = (id0 + kHalfLength) + (id1 + kHalfLength) * stride; + + // We compute the start and the end position in the grid + // for each work-item. + // Each work-items local value gid is updated to track the + // current cell/grid point it is working with. + // This position is calculated with the help of slice-ID and number of + // grid points each work-item will process. + // Offset of kHalfLength is also used to account for HALO + auto begin_z = it.get_global_id(0) * z_offset + kHalfLength; + auto end_z = begin_z + z_offset; + if (end_z > full_end_z) end_z = full_end_z; + + auto gid = (it.get_global_id(2) + bx) + ((it.get_global_id(1) + by) * nx) + + (begin_z * nxy); + + // front and back temporary arrays are used to ensure + // the grid values in z-dimension are read once, shifted in + // these array and re-used multiple times before being discarded + // + // This is an optimization technique to enable data-reuse and + // improve overall FLOPS to BYTES read ratio + float front[kHalfLength + 1]; + float back[kHalfLength]; + float c[kHalfLength + 1]; + + for (auto iter = 0; iter < kHalfLength; iter++) { + front[iter] = prev[gid + iter * nxy]; + } + c[0] = coeff[0]; + + for (auto iter = 1; iter <= kHalfLength; iter++) { + back[iter - 1] = prev[gid - iter * nxy]; + c[iter] = coeff[iter]; + } + + // Shared Local Memory (SLM) optimizations (DPC++) + // Set some flags to indicate if the current work-item + // should read from global memory to shared local memory buffer + // or not + auto items_x = it.get_local_range(2); + auto items_y = it.get_local_range(1); + + bool copy_halo_y = false, copy_halo_x = false; + if (id1 < kHalfLength) copy_halo_y = true; + if (id0 < kHalfLength) copy_halo_x = true; + + for (auto i = begin_z; i < end_z; i++) { + // Shared Local Memory (SLM) optimizations (DPC++) + // If work-item is flagged to read into SLM buffer + if (copy_halo_y) { + tab[identifiant - kHalfLength * stride] = prev[gid - kHalfLength * nx]; + tab[identifiant + items_y * stride] = prev[gid + items_y * nx]; + } + if (copy_halo_x) { + tab[identifiant - kHalfLength] = prev[gid - kHalfLength]; + tab[identifiant + items_x] = prev[gid + items_x]; + } + tab[identifiant] = front[0]; + + // DPC++ Basic synchronization (barrier function) + // Force synchronization within a work-group + // using barrier function to ensure + // all the work-items have completed reading into the SLM buffer + it.barrier(access::fence_space::local_space); + + // Only one new data-point read from global memory + // in z-dimension (depth) + front[kHalfLength] = prev[gid + kHalfLength * nxy]; + + // Stencil code to update grid point at position given by global id (gid) + // New time step for grid point is computed based on the values of the + // the immediate neighbors - horizontal, vertical and depth + // directions(kHalfLength number of points in each direction), + // as well as the value of grid point at a previous time step + // + // Neighbors in the depth (z-dimension) are read out of + // front and back arrays + // Neighbors in the horizontal and vertical (x, y dimension) are + // read from the SLM buffers + float value = c[0] * front[0]; +#pragma unroll(kHalfLength) + for (auto iter = 1; iter <= kHalfLength; iter++) { + value += c[iter] * + (front[iter] + back[iter - 1] + tab[identifiant + iter] + + tab[identifiant - iter] + tab[identifiant + iter * stride] + + tab[identifiant - iter * stride]); + } + next[gid] = 2.0f * front[0] - next[gid] + value * vel[gid]; + + // Update the gid to advance in the z-dimension + gid += nxy; + + // Input data in front and back are shifted to discard the + // oldest value and read one new value. + for (auto iter = kHalfLength - 1; iter > 0; iter--) { + back[iter] = back[iter - 1]; + } + back[0] = front[0]; + + for (auto iter = 0; iter < kHalfLength; iter++) { + front[iter] = front[iter + 1]; + } + + // DPC++ Basic synchronization (barrier function) + // Force synchronization within a work-group + // using barrier function to ensure that SLM buffers + // are not overwritten by next set of work-items + // (highly unlikely but not impossible) + it.barrier(access::fence_space::local_space); + } +} + +/* + * Device-Code - Optimized for GPU, CPU + * SYCL implementation for single iteration of iso3dfd kernel + * without using any shared local memory optimizations + * + * + * ND-Range kernel is used to spawn work-items in x, y dimension + * Each work-item can then traverse in the z-dimension + * + * z-dimension slicing can be used to vary the total number + * global work-items. + * + */ +void Iso3dfdIterationGlobal(sycl::nd_item<3> it, float *next, float *prev, + float *vel, const float *coeff, int nx, int nxy, + int bx, int by, int z_offset, int full_end_z) { + // We compute the start and the end position in the grid + // for each work-item. + // Each work-items local value gid is updated to track the + // current cell/grid point it is working with. + // This position is calculated with the help of slice-ID and number of + // grid points each work-item will process. + // Offset of kHalfLength is also used to account for HALO + auto begin_z = it.get_global_id(0) * z_offset + kHalfLength; + auto end_z = begin_z + z_offset; + if (end_z > full_end_z) end_z = full_end_z; + + auto gid = (it.get_global_id(2) + bx) + ((it.get_global_id(1) + by) * nx) + + (begin_z * nxy); + + // front and back temporary arrays are used to ensure + // the grid values in z-dimension are read once, shifted in + // these array and re-used multiple times before being discarded + // + // This is an optimization technique to enable data-reuse and + // improve overall FLOPS to BYTES read ratio + float front[kHalfLength + 1]; + float back[kHalfLength]; + float c[kHalfLength + 1]; + + for (auto iter = 0; iter <= kHalfLength; iter++) { + front[iter] = prev[gid + iter * nxy]; + } + c[0] = coeff[0]; + for (auto iter = 1; iter <= kHalfLength; iter++) { + c[iter] = coeff[iter]; + back[iter - 1] = prev[gid - iter * nxy]; + } + + // Stencil code to update grid point at position given by global id (gid) + // New time step for grid point is computed based on the values of the + // the immediate neighbors - horizontal, vertical and depth + // directions(kHalfLength number of points in each direction), + // as well as the value of grid point at a previous time step + + float value = c[0] * front[0]; +#pragma unroll(kHalfLength) + for (auto iter = 1; iter <= kHalfLength; iter++) { + value += c[iter] * + (front[iter] + back[iter - 1] + prev[gid + iter] + + prev[gid - iter] + prev[gid + iter * nx] + prev[gid - iter * nx]); + } + next[gid] = 2.0f * front[0] - next[gid] + value * vel[gid]; + + // Update the gid and position in z-dimension and check if there + // is more work to do + gid += nxy; + begin_z++; + + while (begin_z < end_z) { + // Input data in front and back are shifted to discard the + // oldest value and read one new value. + for (auto iter = kHalfLength - 1; iter > 0; iter--) { + back[iter] = back[iter - 1]; + } + back[0] = front[0]; + + for (auto iter = 0; iter < kHalfLength; iter++) { + front[iter] = front[iter + 1]; + } + + // Only one new data-point read from global memory + // in z-dimension (depth) + front[kHalfLength] = prev[gid + kHalfLength * nxy]; + + // Stencil code to update grid point at position given by global id (gid) + float value = c[0] * front[0]; +#pragma unroll(kHalfLength) + for (auto iter = 1; iter <= kHalfLength; iter++) { + value += c[iter] * (front[iter] + back[iter - 1] + prev[gid + iter] + + prev[gid - iter] + prev[gid + iter * nx] + + prev[gid - iter * nx]); + } + + next[gid] = 2.0f * front[0] - next[gid] + value * vel[gid]; + + gid += nxy; + begin_z++; + } +} + +/* + * Host-side SYCL Code + * + * Driver function for ISO3DFD SYCL code + * Uses ptr_next and ptr_prev as ping-pong buffers to achieve + * accelerated wave propogation + * + * This function uses SYCL buffers to facilitate host to device + * buffer copies + * + */ + +bool Iso3dfdDevice(sycl::queue &q, float *ptr_next, float *ptr_prev, + float *ptr_vel, float *ptr_coeff, size_t n1, size_t n2, + size_t n3, size_t n1_block, size_t n2_block, size_t n3_block, + size_t end_z, unsigned int nIterations) { + auto nx = n1; + auto nxy = n1 * n2; + + auto bx = kHalfLength; + auto by = kHalfLength; + + // Display information about the selected device + PrintTargetInfo(q, n1_block, n2_block); + + auto sizeTotal = nxy * n3; + + { // Begin buffer scope + // Create buffers using DPC++ class buffer + buffer b_ptr_next(ptr_next, sizeTotal); + buffer b_ptr_prev(ptr_prev, sizeTotal); + buffer b_ptr_vel(ptr_vel, sizeTotal); + buffer b_ptr_coeff(ptr_coeff, kHalfLength + 1); + + // Iterate over time steps + for (auto i = 0; i < nIterations; i += 1) { + // Submit command group for execution + q.submit([&](auto &h) { + // Create accessors + auto next = b_ptr_next.get_access(h); + auto prev = b_ptr_prev.get_access(h); + auto vel = b_ptr_vel.get_access(h); + auto coeff = b_ptr_coeff.get_access(h); + + // Define local and global range + + // Define local ND range of work-items + // Size of each DPC++ work-group selected here is a product of + // n2_block and n1_block which can be controlled by the input + // command line arguments + auto local_nd_range = range(1, n2_block, n1_block); + + // Define global ND range of work-items + // Size of total number of work-items is selected based on the + // total grid size in first and second dimensions (XY-plane) + // + // Each of the work-item then works on computing + // one or more grid points. This value can be controlled by the + // input command line argument n3_block + // + // Effectively this implementation enables slicing of the full + // grid into smaller grid slices which can be computed in parallel + // to allow auto-scaling of the total number of work-items + // spawned to achieve full occupancy for small or larger accelerator + // devices + auto global_nd_range = + range((n3 - 2 * kHalfLength) / n3_block, (n2 - 2 * kHalfLength), + (n1 - 2 * kHalfLength)); + +#ifdef USE_SHARED + // Using 3D-stencil kernel with Shared Local Memory (SLM) + // optimizations (DPC++) to improve effective FLOPS to BYTES + // ratio. By default, SLM code path is disabled in this + // code sample. + // SLM code path can be enabled by recompiling the DPC++ source + // as follows: + // cmake -DSHARED_KERNEL=1 .. + // make -j`nproc` + + // Define a range for SLM Buffer + // Padding can be used to avoid SLM bank conflicts + // By default padding is disabled in the sample code + auto local_range = range((n1_block + (2 * kHalfLength) + kPad) * + (n2_block + (2 * kHalfLength))); + + // Create an accessor for SLM buffer + accessor tab( + local_range, h); + + // Send a DPC++ kernel (lambda) for parallel execution + // The function that executes a single iteration is called + // "Iso3dfdIterationSLM" + // alternating the 'next' and 'prev' parameters which effectively + // swaps their content at every iteration. + if (i % 2 == 0) + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationSLM(it, next.get_pointer(), prev.get_pointer(), + vel.get_pointer(), coeff.get_pointer(), + tab.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); + else + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationSLM(it, prev.get_pointer(), next.get_pointer(), + vel.get_pointer(), coeff.get_pointer(), + tab.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); + +#else + + // Use Global Memory version of the 3D-Stencil kernel. + // This code path is enabled by default + + // Send a DPC++ kernel (lambda) for parallel execution + // The function that executes a single iteration is called + // "Iso3dfdIterationGlobal" + // alternating the 'next' and 'prev' parameters which effectively + // swaps their content at every iteration. + if (i % 2 == 0) + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationGlobal(it, next.get_pointer(), + prev.get_pointer(), vel.get_pointer(), + coeff.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); + else + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationGlobal(it, prev.get_pointer(), + next.get_pointer(), vel.get_pointer(), + coeff.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); +#endif + }); + } + } // end buffer scope + return true; +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp new file mode 100644 index 0000000000..680fca2674 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp @@ -0,0 +1,165 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include "iso3dfd.h" + +/* + * Host-Code + * Utility function to validate grid and block dimensions + */ +bool CheckGridDimension(size_t n1, size_t n2, size_t n3, unsigned int dim_x, + unsigned int dim_y, unsigned int block_z) { + if (n1 % dim_x) { + std::cout << " ERROR: Invalid Grid Size: n1 should be multiple of DIMX - " + << dim_x << "\n"; + return true; + } + if (n2 % dim_y) { + std::cout << " ERROR: Invalid Grid Size: n2 should be multiple of DIMY - " + << dim_y << "\n"; + return true; + } + if (n3 % block_z) { + std::cout << " ERROR: Invalid Grid Size: n3 should be multiple of BLOCKZ - " + << block_z << "\n"; + return true; + } + + return false; +} + +/* + * Host-Code + * Utility function to validate block sizes + */ +bool CheckBlockDimension(sycl::queue& q, unsigned int dim_x, + unsigned int dim_y) { + auto device = q.get_device(); + auto max_block_size = + device.get_info(); + + if ((max_block_size > 1) && (dim_x * dim_y > max_block_size)) { + std::cout << "ERROR: Invalid block sizes: n1_Tblock * n2_Tblock should be " + "less than or equal to " + << max_block_size << "\n"; + return true; + } + + return false; +} + +/* + * Host-Code + * Utility function to print device info + */ +void PrintTargetInfo(sycl::queue& q, unsigned int dim_x, unsigned int dim_y) { + auto device = q.get_device(); + auto max_block_size = + device.get_info(); + + auto max_exec_unit_count = + device.get_info(); + + std::cout << " Running on " << device.get_info() + << "\n"; + std::cout << " The Device Max Work Group Size is : " << max_block_size + << "\n"; + std::cout << " The Device Max EUCount is : " << max_exec_unit_count << "\n"; + std::cout << " The blockSize x is : " << dim_x << "\n"; + std::cout << " The blockSize y is : " << dim_y << "\n"; +#ifdef USE_SHARED + std::cout << " Using Shared Local Memory Kernel\n"; +#else + std::cout << " Using Global Memory Kernel\n"; + +#endif +} + +/* + * Host-Code + * Utility function to get input arguments + */ +void Usage(const std::string& programName) { + std::cout << " Incorrect parameters \n"; + std::cout << " Usage: "; + std::cout << programName + << " n1 n2 n3 b1 b2 b3 Iterations [omp|sycl] [gpu|cpu] \n\n"; + std::cout << " n1 n2 n3 : Grid sizes for the stencil \n"; + std::cout << " b1 b2 b3 : cache block sizes for cpu openmp version.\n"; + std::cout << " Iterations : No. of timesteps. \n"; + std::cout << " [omp|sycl] : Optional: Run the OpenMP or the SYCL variant." + << " Default is to use both for validation \n"; + std::cout + << " [gpu|cpu] : Optional: Device to run the SYCL version" + << " Default is to use the GPU if available, if not fallback to CPU \n\n"; +} + +/* + * Host-Code + * Utility function to print stats + */ +void PrintStats(double time, size_t n1, size_t n2, size_t n3, + unsigned int nIterations) { + float throughput_mpoints = 0.0f, mflops = 0.0f, normalized_time = 0.0f; + double mbytes = 0.0f; + + normalized_time = (double)time / nIterations; + throughput_mpoints = ((n1 - 2 * kHalfLength) * (n2 - 2 * kHalfLength) * + (n3 - 2 * kHalfLength)) / + (normalized_time * 1e3f); + mflops = (7.0f * kHalfLength + 5.0f) * throughput_mpoints; + mbytes = 12.0f * throughput_mpoints; + + std::cout << "--------------------------------------\n"; + std::cout << "time : " << time / 1e3f << " secs\n"; + std::cout << "throughput : " << throughput_mpoints << " Mpts/s\n"; + std::cout << "flops : " << mflops / 1e3f << " GFlops\n"; + std::cout << "bytes : " << mbytes / 1e3f << " GBytes/s\n"; + std::cout << "\n--------------------------------------\n"; + std::cout << "\n--------------------------------------\n"; +} + +/* + * Host-Code + * Utility function to calculate L2-norm between resulting buffer and reference + * buffer + */ +bool WithinEpsilon(float* output, float* reference, const size_t dim_x, + const size_t dim_y, const size_t dim_z, + const unsigned int radius, const int zadjust = 0, + const float delta = 0.01f) { + std::ofstream error_file; + error_file.open("error_diff.txt"); + + bool error = false; + double norm2 = 0; + + for (size_t iz = 0; iz < dim_z; iz++) { + for (size_t iy = 0; iy < dim_y; iy++) { + for (size_t ix = 0; ix < dim_x; ix++) { + if (ix >= radius && ix < (dim_x - radius) && iy >= radius && + iy < (dim_y - radius) && iz >= radius && + iz < (dim_z - radius + zadjust)) { + float difference = fabsf(*reference - *output); + norm2 += difference * difference; + if (difference > delta) { + error = true; + error_file << " ERROR: " << ix << ", " << iy << ", " << iz << " " + << *output << " instead of " << *reference + << " (|e|=" << difference << ")\n"; + } + } + ++output; + ++reference; + } + } + } + + error_file.close(); + norm2 = sqrt(norm2); + if (error) std::cout << "error (Euclidean norm): " << norm2 << "\n"; + return error; +} From 31df9067ae117b6e960f01e325a925ad07ba670f Mon Sep 17 00:00:00 2001 From: slgogar <33332238+slgogar@users.noreply.github.com> Date: Wed, 15 Jul 2020 12:42:50 -0700 Subject: [PATCH 2/3] Update License.txt --- .../DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt index da5f7c1888..148940418d 100644 --- a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt @@ -1,4 +1,4 @@ -Copyright 2019 Intel Corporation +Copyright 2020 Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: From b9032e4694c5aa9a2181fa9a0bc1e78a0bbe4ae9 Mon Sep 17 00:00:00 2001 From: slgogar <33332238+slgogar@users.noreply.github.com> Date: Wed, 15 Jul 2020 14:10:42 -0700 Subject: [PATCH 3/3] Update sample.json --- .../DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json index 695fea9e80..9d6ed588ab 100755 --- a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json @@ -22,7 +22,7 @@ "steps": [ "MSBuild iso3dfd_dpcpp.sln /t:Rebuild /p:Configuration=\"Release\"", "cd x64/Release", - "iso3dfd.exe 256 256 256 32 8 64 10 gpu" + "iso3dfd_dpcpp.exe 256 256 256 32 8 64 10 gpu" ] }]