diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/CMakeLists.txt
new file mode 100755
index 0000000000..1256ecbe67
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/CMakeLists.txt
@@ -0,0 +1,20 @@
+if(UNIX)
+ # Direct CMake to use dpcpp rather than the default C++ compiler/linker
+ set(CMAKE_CXX_COMPILER dpcpp)
+else() # Windows
+ # Force CMake to use dpcpp rather than the default C++ compiler/linker
+ # (needed on Windows only)
+ include (CMakeForceCompiler)
+ CMAKE_FORCE_CXX_COMPILER (dpcpp IntelDPCPP)
+ include (Platform/Windows-Clang)
+endif()
+
+cmake_minimum_required (VERSION 3.4)
+
+project(MergeSort CXX)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/License.txt
new file mode 100755
index 0000000000..7c8b8a36c6
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/License.txt
@@ -0,0 +1,23 @@
+Copyright Intel Corporation
+
+SPDX-License-Identifier: MIT
+https://opensource.org/licenses/MIT
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/README.md
new file mode 100755
index 0000000000..fad96c8809
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/README.md
@@ -0,0 +1,199 @@
+# Merge Sort
+This DPC++ reference design demonstrates a highly parameterizable merge sort algorithm on an FPGA.
+
+***Documentation***:
+* [DPC++ FPGA Code Samples Guide](https://software.intel.com/content/www/us/en/develop/articles/explore-dpcpp-through-intel-fpga-code-samples.html) helps you to navigate the samples and build your knowledge of DPC++ for FPGA.
+* [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) is the reference manual for targeting FPGAs through DPC++.
+* [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04/20.04, RHEL*/CentOS* 8, SUSE* 15; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX); Intel Xeon® CPU E5-1650 v2 @ 3.50GHz (host machine)
+| Software | Intel® oneAPI DPC++ Compiler Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How to use the spatial compute of the FPGA to create a merge sort design that takes advantage of thread- and SIMD-level parallelism.
+| Time to complete | 1 hour
+
+
+
+**Performance**
+The performance data below was gathered using the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) sorting `2^24=16777216` elements using 1-16 merge units and the best throughput across 5 seeds.
+
+TODO: Update this.
+
+| Merge Units | Execution time (ms) | Throughput (Melements/s) |
+| :---------- | :-----------------: | :----------------------: |
+| 1 | 1476 | 11 |
+| 2 | 569.8 | 28 |
+| 4 | 195.2 | 82 |
+| 8 | 99.9 | 160 |
+| 16 | 69.9 | 228 |
+
+## Purpose
+This FPGA reference design demonstrates a highly parameterizable merge sort design that utilizes the spatial computing of the FPGA. The basic merge sort algorithm is described [here](https://en.wikipedia.org/wiki/Merge_sort). See the [Additional Design Information Section](#additional-design-information) for more information on how the merge sort algorithm was implemented on the FPGA.
+
+## License
+Code samples are licensed under the MIT license. See
+[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
+
+## Building the Reference Design
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Code Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the type of compute node and whether to run in batch or interactive mode. Compiles to FPGA are only supported on fpga_compile nodes. Executing programs on FPGA hardware is only supported on fpga_runtime nodes of the appropriate type, such as fpga_runtime:arria10 or fpga_runtime:stratix10. Neither compiling nor executing programs on FPGA hardware are supported on the login nodes. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/documentation/base-toolkit/](https://devcloud.intel.com/oneapi/documentation/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+1. Install the design into a directory `build` from the design directory by running `cmake`:
+
+ ```
+ mkdir build
+ cd build
+ ```
+
+ If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+
+ ```
+ cmake ..
+ ```
+
+ If instead you are compiling for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following targets are provided, and they match the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device).
+
+ ```
+ make fpga_emu
+ ```
+
+ * Generate HTML performance report. Find the report in the `merge_sort_report.prj/reports/` directory at `report.html`.
+
+ ```
+ make report
+ ```
+
+ * Compile for FPGA hardware (longer compile time, targets FPGA device).
+
+ ```
+ make fpga
+ ```
+
+3. (Optional) As the above hardware compile may take several hours to complete, FPGA precompiled binaries (compatible with Linux* Ubuntu* 18.04) can be downloaded here.
+
+### On a Windows* System
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake -G "NMake Makefiles" ..
+ ```
+ Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command:
+ ```
+ cmake -G "NMake Makefiles" .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ nmake fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ nmake report
+ ```
+ * An FPGA hardware target is not provided on Windows*.
+
+*Note:* The Intel® PAC with Intel Arria® 10 GX FPGA and Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) do not yet support Windows*. Compiling to FPGA hardware on Windows* requires a third-party or custom Board Support Package (BSP) with Windows* support.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this Reference Design in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU).
+ ```
+ ./merge_sort.fpga_emu (Linux)
+ merge_sort.fpga_emu.exe (Windows)
+ ```
+
+2. Run the sample on the FPGA device.
+ ```
+ ./merge_sort.fpga (Linux)
+ ```
+
+### Example of Output
+You should see output similar to the following in the console:
+```
+Running sort 17 times for an input size of 16777216 using 8 4-way merge units
+Streaming data from device memory
+Execution time: 69.9848 ms
+Throughput: 228.621 Melements/s
+PASSED
+```
+NOTE: When running on the FPGA emulator, the *Execution time* and *Throughput* do not reflect the design's actual hardware performance.
+
+
+## Additional Design Information
+### Source Code Breakdown
+The following source files can be found in the `src/` sub-directory.
+
+| File | Description
+|:--- |:---
+|`main.cpp` | Contains the `main()` function and the top-level interfaces.
+|`merge_sort.hpp` | The function to submit all of the merge sort kernels (`SortingNetwork`, `Produce`, `Merge`, and `Consume`).
+|`consume.hpp` | The `Consume` kernel for the merge unit. This kernel reads from an input pipe and writes out to either a different output pipe, or to device memory.
+|`impu_math.hpp` | Metaprogramming math helper functions (*impu* = Intel Metaprogramming Utilities)
+|`merge.hpp` | The `Merge` kernel for the merge unit and the merge tree. This kernel streams in two sorted lists, merges them into a single sorted list of double the size, and streams the data out a pipe.
+|`pipe_array.hpp` | Header file containing the definition of an array of pipes.
+|`pipe_array_internal.hpp` | Helper for pipe_array.hpp.
+|`produce.hpp` | The `Produce` kernel for the merge unit. This kernel reads from input pipes or performs strided reads from device memory and writes the data to an output pipe.
+|`sorting_networks.hpp` | Contains all of the code relevant to sorting networks, including the `SortingNetwork` kernel, as well as the `BitonicSortingNetwork` and `MergeSortNetwork` helper functions.
+|`unrolled_loop.hpp` | A templated-based loop unroller that unrolls loops in the compiler front end.
+
+### Merge Sort Details
+This section will describe how the merge sort design is structured and how it takes advantage of the spatial compute of the FPGA.
+
+The figure below shows the conceptual view of the merge sort design to the user. The user streams data into a SYCL pipe (`InPipe`) and, after some delay, the elements are streamed out of a SYCL pipe (`OutPipe`), in sorted order. The number of elements that the merge sort design is capable of sorting is a runtime parameter, but it must be a power of 2. However, this restriction can be worked around by padding the input stream with min/max elements, depending on the direction of the sort (smallest-to-largest vs largest-to-smallest). This technique is demonstrated in this design (see the `fpga_sort` function in *main.cpp*).
+
+
+
+The basis of the merge sort design is what we call a *merge unit*, which is shown in the figure below. A single merge unit streams in two sorted lists of size `count` in parallel and merges them into a single sorted list of size `2*count`. The lists are streamed in from device memory (e.g., DDR or HBM) by two `Produce` kernels. The `Consume` kernel can stream data out to either a SYCL pipe or to device memory.
+
+
+
+A single merge unit requires `lg(N)` iterations to sort `N` elements. This requires the host to enqueue `lg(N)` iterations of the merge unit kernels that merge sublists of size {`1`, `2`, `4`, ...} into larger lists of size {`2`, `4`, `8`, ...}, respectively. This results in a timeline that looks like the figure below.
+
+
+
+To achieve SIMD-level (**S**ingle **I**nstruction **M**ultiple **D**ata) parallelism, we enhance the merge unit to merge `k` elements per cycle. The figure below illustrates how this is done. In the following discussion, we will assume that we are sorting from smallest-to-largest, but the logic is very similar for sorting largest-to-smallest and is easily configurable at compile time in this design.
+
+The merge unit looks at the two inputs of size `k` coming from the `ProduceA` and `ProduceB` kernels (in the figure below, `k=4`) and compares the first elements of each set; remember, these set of `k` elements are already sorted, so we are comparing the smallest elements of the set. Whichever set of elements has the *smaller of the smallest elements* is chosen and combined with `k` other elements from the `feedback` path. These `2*k` elements go through a merge sort network that sorts them in a single cycle. After the `2*k` elements are sorted, the smallest `k` elements are sent to the output (to the `Consume` kernel) and the largest `k` elements are fed back into the sorting network (the `feedback` path in the figure below), and the process repeats. This allows the merge unit to process `k` elements per cycle in the steady state. Note that `k` must be a power of 2.
+
+More information on this design can be found in this paper by [R. Kobayashi and K. Kise](https://www.researchgate.net/publication/316604001_A_High_Performance_FPGA-Based_Sorting_Accelerator_with_a_Data_Compression_Mechanism).
+
+
+
+To achieve thread-level parallelism, the merge sort design accepts a template parameter, `units`, which allows one to instantiate multiple instances of the merge unit, as shown in the figure below. Before the merge units start processing data, the incoming data coming from the input pipe is sent through a bitonic sorting network and written to the temporary buffer partitions in device memory. This sorting network sorts `k` elements per cycle in the steady state. Choosing the number of merge units is an area-performance tradeoff (note: the number of instantiated merge units must be a power of 2). Each merge unit sorts an `N/units`-sized partition of the input data in parallel.
+
+
+
+After the merge units sort their `N/units`-sized partition, the partitions of each unit must be reduced into a single sorted list. There are two options to do this: (1) reuse the merge units to perform `lg(units)` more iterations to sort the partitions, or (2) create a merge tree to reduce the partitions into a single sorted list. Option (1) saves area at the expense of performance, since it has to perform additional sorting iterations. Option (2), which we choose for this design, improves performance by creating a merge tree to reduce the final partitions into a single sorted list. The `Merge` kernels in the merge tree (shown in the figure above) use the same kernel code that is used in the `Merge` kernel of the merge unit, which means they too can merge `k` elements per cycle. Once the merge units perform their last iteration, they output to a pipe (instead of writing to device memory) that feeds the merge tree.
+
+### Performance disclaimers
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](https://www.intel.com/benchmarks).
+
+Performance results are based on testing as of May 2021 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](https://www.intel.com).
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/basic_runtime_graph.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/basic_runtime_graph.png
new file mode 100644
index 0000000000..8b4c8ce172
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/basic_runtime_graph.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/k-way_merge_unit.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/k-way_merge_unit.png
new file mode 100755
index 0000000000..0343abfb77
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/k-way_merge_unit.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_sort.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_sort.sln
new file mode 100755
index 0000000000..4e67133f38
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_sort.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "merge_sort", "merge_sort.vcxproj", "{ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.ActiveCfg = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.Build.0 = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.ActiveCfg = Release|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {97D1BD74-AAAB-4835-8F00-37A58B70871A}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_sort.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_sort.vcxproj
new file mode 100755
index 0000000000..b2c3792ce5
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_sort.vcxproj
@@ -0,0 +1,176 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {acde6b7a-6f9a-428e-b040-cedc5b1e2c79}
+ Win32Proj
+ qrd
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ -Xsclock=360MHz;-Xsfp-relaxed;-fno-fast-math;-Xsparallel=2
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+ -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2
+
+
+
+
+
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_unit.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_unit.png
new file mode 100644
index 0000000000..22869a56a4
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/merge_unit.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/parallel_tree_bitonic_k-way.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/parallel_tree_bitonic_k-way.png
new file mode 100755
index 0000000000..5e756ef0e9
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/parallel_tree_bitonic_k-way.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/sample.json
new file mode 100755
index 0000000000..94d2a2c5f1
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/sample.json
@@ -0,0 +1,55 @@
+{
+ "guid": "AF215180-ECB8-4414-8049-D89E4882576D",
+ "name": "Merge Sort",
+ "categories": ["Toolkit/oneAPI Direct Programming/DPC++ FPGA/Reference Designs"],
+ "description": "A Reference design demonstrating merge sort on an Intel® FPGA",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./merge_sort.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake -G \"NMake Makefiles\" ..",
+ "nmake fpga_emu",
+ "merge_sort.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake -G \"NMake Makefiles\" ..",
+ "nmake report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/sort_api.png b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/sort_api.png
new file mode 100644
index 0000000000..ee36fb930b
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/sort_api.png differ
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/CMakeLists.txt
new file mode 100644
index 0000000000..caf32fbbc3
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/CMakeLists.txt
@@ -0,0 +1,109 @@
+# To see a Makefile equivalent of this build system:
+# https://github.com/oneapi-src/oneAPI-samples/blob/master/DirectProgramming/DPC++/ProjectTemplates/makefile-fpga
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME merge_sort)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# FPGA board selection
+if(NOT DEFINED FPGA_BOARD)
+ set(FPGA_BOARD "intel_a10gx_pac:pac_a10")
+ message(STATUS "FPGA_BOARD was not specified.\
+ \nConfiguring the design to run on the default FPGA board ${FPGA_BOARD} (Intel(R) PAC with Intel Arria(R) 10 GX FPGA). \
+ \nPlease refer to the README for information on board selection.")
+else()
+ message(STATUS "Configuring the design to run on FPGA board ${FPGA_BOARD}")
+endif()
+
+# This is a Windows-specific flag that enables error handling in host code
+if(WIN32)
+ set(WIN_FLAG "/EHsc")
+endif()
+
+# check if the BSP has USM host allocations or manually enable using host allocations
+# e.g. cmake .. -DUSE_USM_HOST_ALLOCATIONS=1
+if(FPGA_BOARD MATCHES ".usm.*" OR DEFINED USE_USM_HOST_ALLOCATIONS)
+ set(ENABLE_USM "-DUSM_HOST_ALLOCATIONS")
+ message(STATUS "USM host allocations are enabled")
+endif()
+
+# Allow the user to enable hardware profiling
+# Profiling can be enabled when running cmake by adding the flag -DPROFILE_HW=1
+# e.g. cmake .. -DPROFILE_HW=1
+if(PROFILE_HW)
+ set(PROFILE_FLAG "-Xsprofile")
+endif()
+
+# Select the number of merge units to use. Must be a power of 2.
+# e.g. cmake .. -DNUM_MERGE_UNITS=16
+if(MERGE_UNITS)
+ set(MERGE_UNITS_FLAG "-DMERGE_UNITS=${MERGE_UNITS}")
+ message(STATUS "Number of merge units explicitly set to ${MERGE_UNITS}")
+endif()
+
+# Select the number of elements each merge unit can sort in one cycle. Must be a power of 2.
+# e.g. cmake .. -DSORT_WIDTH=16
+if(SORT_WIDTH)
+ set(SORT_WIDTH_FLAG "-DSORT_WIDTH=${SORT_WIDTH}")
+ message(STATUS "Sort width explicitly set to ${SORT_WIDTH}")
+endif()
+
+# Choose the random seed for the hardware compile
+# e.g. cmake .. -DSEED=7
+if(NOT DEFINED SEED)
+ # the default seed
+ set(SEED 3)
+else()
+ message(STATUS "Seed explicitly set to ${SEED}")
+endif()
+
+# A DPC++ ahead-of-time (AoT) compile processes the device code in two stages.
+# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V).
+# 2. The "link" stage invokes the compiler's FPGA backend before linking.
+# For this reason, FPGA backend flags must be passed as link flags in CMake.
+set(EMULATOR_COMPILE_FLAGS "-Wall ${WIN_FLAG} -fintelfpga ${ENABLE_USM} ${MERGE_UNITS_FLAG} ${SORT_WIDTH_FLAG} -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga ${ENABLE_USM} ${MERGE_UNITS_FLAG} ${SORT_WIDTH_FLAG}")
+set(HARDWARE_COMPILE_FLAGS "-Wall ${WIN_FLAG} -fintelfpga ${ENABLE_USM} ${MERGE_UNITS_FLAG} ${SORT_WIDTH_FLAG}")
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware ${PROFILE_FLAG} -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${FPGA_BOARD} ${ENABLE_USM} ${MERGE_UNITS_FLAG} ${SORT_WIDTH_FLAG} ${USER_HARDWARE_FLAGS}")
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+
+###############################################################################
+### FPGA Emulator
+###############################################################################
+# To compile in a single command:
+# dpcpp -fintelfpga -DFPGA_EMULATOR merge_sort.cpp -o merge_sort.fpga_emu
+# CMake executes:
+# [compile] dpcpp -fintelfpga -DFPGA_EMULATOR -o merge_sort.cpp.o -c merge_sort.cpp
+# [link] dpcpp -fintelfpga merge_sort.cpp.o -o merge_sort.fpga_emu
+add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS}")
+set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS "${EMULATOR_LINK_FLAGS}")
+add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+
+###############################################################################
+### Generate Report
+###############################################################################
+# To compile manually:
+# dpcpp -fintelfpga -Xshardware -Xsparallel=2 -Xsseed= -Xsboard= -fsycl-link=early merge_sort.cpp -o merge_sort_report.a
+set(FPGA_EARLY_IMAGE ${TARGET_NAME}_report.a)
+# The compile output is not an executable, but an intermediate compilation result unique to DPC++.
+add_executable(${FPGA_EARLY_IMAGE} ${SOURCE_FILE})
+add_custom_target(report DEPENDS ${FPGA_EARLY_IMAGE})
+set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}")
+set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -fsycl-link=early")
+# fsycl-link=early stops the compiler after RTL generation, before invoking Quartus
+
+###############################################################################
+### FPGA Hardware
+###############################################################################
+# To compile in a single command:
+# dpcpp -fintelfpga -Xshardware -Xsseed=1 -Xsboard= merge_sort.cpp -o merge_sort.fpga
+# CMake executes:
+# [compile] dpcpp -fintelfpga -c -o merge_sort.cpp.o -c merge_sort.cpp
+# [link] dpcpp -fintelfpga -Xshardware -Xsparallel=2 -Xsseed= -Xsboard= merge_sort.cpp.o -o merge_sort.fpga
+add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}")
+set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_TARGET}")
+# The -reuse-exe flag enables rapid recompilation of host-only code changes.
+# See DPC++FPGA/GettingStarted/fast_recompile for details.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/consume.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/consume.hpp
new file mode 100644
index 0000000000..86a84e0e7d
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/consume.hpp
@@ -0,0 +1,47 @@
+#ifndef __CONSUME_HPP__
+#define __CONSUME_HPP__
+
+#include
+#include
+
+using namespace sycl;
+
+//
+// Streams in 'k_width' elements of data per cycle from a SYCL pipe and either
+// writes it to memory (to_pipe==false) or writes it to a pipe (to_pipe==true)
+//
+template
+event Consume(queue& q, ValueT* out_ptr, IndexT total_count, IndexT offset,
+ bool to_pipe) {
+ // the number of loop iterations required to consume all of the data
+ const IndexT iterations = total_count / k_width;
+
+ return q.submit([&](handler& h) {
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ // Pointer to the output data.
+ // Creating a device_ptr tells the compiler that this pointer is in
+ // device memory, not host memory, and avoids creating extra connections
+ // to host memory
+ device_ptr out(out_ptr);
+
+ for (IndexT i = 0; i < iterations; i++) {
+ // get the data from the pipe
+ auto data = InPipe::read();
+
+ // write to either the output pipe, or to device memory
+ if (to_pipe) {
+ OutPipe::write(data);
+ } else {
+ // write the 'k_width' elements to device memory
+ #pragma unroll
+ for (unsigned char j = 0; j < k_width; j++) {
+ out[offset + i * k_width + j] = data[j];
+ }
+ }
+ }
+ });
+ });
+}
+
+#endif /* __CONSUME_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/impu_math.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/impu_math.hpp
new file mode 100644
index 0000000000..3c709515cf
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/impu_math.hpp
@@ -0,0 +1,61 @@
+#ifndef __IMPU_MATH__
+#define __IMPU_MATH__
+
+namespace impu {
+namespace math {
+
+ // returns 2^n (1 shifted left by n)
+ template
+ constexpr T Pow2(T n) {
+ static_assert(std::is_integral::value);
+ static_assert(std::is_unsigned::value);
+ return T(1) << n;
+ }
+
+ // returns whether 'n' is a power of 2
+ template
+ constexpr bool IsPow2(T n) {
+ static_assert(std::is_integral::value);
+ static_assert(std::is_unsigned::value);
+ return (n != 0) && ((n & (n - 1)) == 0);
+ }
+
+ // returns log2(n) rounding down
+ template
+ constexpr T Log2(T n) {
+ static_assert(std::is_integral_v);
+ if (n < 2) {
+ return T(0);
+ } else {
+ T ret = 0;
+ while (n >= 2) {
+ ret++;
+ n /= 2;
+ }
+ return ret;
+ }
+ }
+
+ // returns log2(n) rounded up
+ template
+ static constexpr T CeilLog2(T n) {
+ return ((n == 1) ? T(0) : Log2(n - 1) + T(1));
+ }
+
+ // return 'n' rounded up to the nearest power of 2
+ template
+ constexpr T RoundUpPow2(T n) {
+ static_assert(std::is_integral::value);
+ static_assert(std::is_unsigned::value);
+ if (n == 0) {
+ return 2;
+ } else if (IsPow2(n)) {
+ return n;
+ } else {
+ return T(1) << (Log2(n) + 1);
+ }
+ }
+} // namespace math
+} // namespace impu
+
+#endif /* __IMPU_MATH__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/main.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/main.cpp
new file mode 100644
index 0000000000..ff50fc304d
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/main.cpp
@@ -0,0 +1,388 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+// dpc_common.hpp can be found in the dev-utilities include folder.
+// e.g., $ONEAPI_ROOT/dev-utilities/include/dpc_common.hpp
+#include "dpc_common.hpp"
+
+// include the merge sort kernel header
+#include "merge_sort.hpp"
+
+using namespace sycl;
+using namespace std::chrono;
+
+// Determines whether we will use USM host or device allocations to move data
+// between host and the device.
+// This can be set on the command line by defining the preprocessor macro
+// 'USM_HOST_ALLOCATIONS' using the flag: '-DUSM_HOST_ALLOCATIONS'
+#if defined(USM_HOST_ALLOCATIONS)
+constexpr bool kUseUSMHostAllocation = true;
+#else
+constexpr bool kUseUSMHostAllocation = false;
+#endif
+
+// The number of merge units, which must be a power of 2.
+// This can be set by defining the preprocessor macro 'MERGE_UNITS'
+// otherwise the default value below is used.
+#ifndef MERGE_UNITS
+#define MERGE_UNITS 8
+#endif
+constexpr size_t kMergeUnits = MERGE_UNITS;
+static_assert(kMergeUnits > 0);
+static_assert(impu::math::IsPow2(kMergeUnits));
+
+// The width of the sort, which must be a power of 2
+// This can be set by defining the preprocessor macro 'SORT_WIDTH'
+// otherwise the default value below is used.
+#ifndef SORT_WIDTH
+#define SORT_WIDTH 4
+#endif
+constexpr size_t kSortWidth = SORT_WIDTH;
+static_assert(kSortWidth >= 1);
+static_assert(impu::math::IsPow2(kSortWidth));
+
+////////////////////////////////////////////////////////////////////////////////
+// Forward declare functions used in this file by main()
+template
+double fpga_sort(queue &q, ValueT *in_vec, ValueT *out_vec, IndexT count);
+
+template
+bool validate(T *val, T *ref, unsigned int count);
+////////////////////////////////////////////////////////////////////////////////
+
+
+int main(int argc, char *argv[]) {
+ // the type to sort, needs a compare function!
+ using ValueT = int;
+
+ // the type used to index in the sorter
+ // below we do a runtime check to make sure this type has enough bits to
+ // count all the elements to be sorted.
+ using IndexT = unsigned int;
+
+ /////////////////////////////////////////////////////////////
+ // reading and validating the command line arguments
+ // defaults
+ bool passed = true;
+#ifdef FPGA_EMULATOR
+ IndexT count = 128;
+ int runs = 1;
+#else
+ IndexT count = 1 << 24;
+ int runs = 17;
+#endif
+ int seed = 777;
+
+ // get the size of the input as the first command line argument
+ if (argc > 1) {
+ count = atoi(argv[1]);
+ }
+
+ // get the number of runs as the second command line argument
+ if (argc > 2) {
+ runs = atoi(argv[2]);
+ }
+
+ // get the random number generator seed as the third command line argument
+ if (argc > 3) {
+ seed = atoi(argv[3]);
+ }
+
+ // enforce at least two runs
+ runs = std::max((int)2, runs);
+
+ // check args
+ if (count <= kMergeUnits) {
+ std::cerr << "ERROR: 'count' must be greater than number of merge units\n";
+ std::terminate();
+ } else if (count > std::numeric_limits::max()) {
+ std::cerr << "ERROR: the index type (IndexT) does not have enough bits to "
+ << "count to 'count'\n";
+ std::terminate();
+ } else if ((count % kSortWidth) != 0) {
+ std::cerr << "ERROR: 'count' must be a multiple of the sorter width\n";
+ std::terminate();
+ }
+ /////////////////////////////////////////////////////////////
+
+ // the device selector
+#ifdef FPGA_EMULATOR
+ INTEL::fpga_emulator_selector selector;
+#else
+ INTEL::fpga_selector selector;
+#endif
+
+ // create the device queue
+ queue q(selector, dpc_common::exception_handler);
+
+ // make sure the device supports USM device allocations
+ auto d = q.get_device();
+ if (!d.get_info()) {
+ std::cerr << "ERROR: The selected device does not support USM device"
+ << " allocations\n";
+ std::terminate();
+ }
+
+ // make sure the device support USM host allocations if we chose to use them
+ if (!d.get_info() &&
+ kUseUSMHostAllocation) {
+ std::cerr << "ERROR: The selected device does not support USM host"
+ << " allocations\n";
+ std::terminate();
+ }
+
+ // the input, output, and reference data
+ std::vector in_vec(count), out_vec(count), ref(count);
+
+ // generate some random input data
+ srand(seed);
+ std::generate(in_vec.begin(), in_vec.end(), [] { return rand() % 100; });
+
+ // copy the input to the output reference and compute the expected result
+ std::copy(in_vec.begin(), in_vec.end(), ref.begin());
+ std::sort(ref.begin(), ref.end());
+
+ // allocate the input and output data either in USM host or device allocations
+ ValueT *in, *out;
+ if constexpr (kUseUSMHostAllocation) {
+ // using USM host allocations
+ if ((in = malloc_host(count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'in' using "
+ << "malloc_host\n";
+ std::terminate();
+ }
+ if ((out = malloc_host(count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'out' using "
+ << "malloc_host\n";
+ std::terminate();
+ }
+
+ // Copy the input to USM memory and reset the output.
+ // This is NOT efficient since, in the case of USM host allocations,
+ // we could have simply generated the input data into the host allocation
+ // and avoided this copy. However, it makes the code cleaner to assume the
+ // input is always in 'in_vec' and this portion of the code is not part of
+ // the performance timing.
+ std::copy(in_vec.begin(), in_vec.end(), in);
+ std::fill(out, out + count, ValueT(0));
+ } else {
+ // using device allocations
+ if ((in = malloc_device(count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'in' using "
+ << "malloc_device\n";
+ std::terminate();
+ }
+ if ((out = malloc_device(count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate space for 'out' using "
+ << "malloc_device\n";
+ std::terminate();
+ }
+
+ // copy the input to the device memory and wait for the copy to finish
+ q.memcpy(in, in_vec.data(), count * sizeof(ValueT)).wait();
+ }
+
+ // track timing information, in ms
+ std::vector time(runs);
+
+ try {
+ std::cout << "Running sort " << runs << " times for an "
+ << "input size of " << count << " using " << kMergeUnits
+ << " " << kSortWidth << "-way merge units\n";
+ std::cout << "Streaming data from "
+ << (kUseUSMHostAllocation ? "host" : "device") << " memory\n";
+
+ // the pointer type for the kernel depends on whether data is coming from
+ // USM host or device allocations
+ using KernelPtrType =
+ typename std::conditional_t,
+ device_ptr>;
+
+ // run the sort multiple times to increase the accuracy of the timing
+ for (int i = 0; i < runs; i++) {
+ // run the sort
+ time[i] = fpga_sort(q, in, out, count);
+
+ // Copy the output to 'out_vec'. In the case where we are using USM host
+ // allocations this is unnecessary since we could simply deference
+ // 'out'. However, it makes the following code cleaner since the output
+ // is always in 'out_vec' and this copy is not part of the performance
+ // timing.
+ q.memcpy(out_vec.data(), out, count * sizeof(ValueT)).wait();
+
+ // validate the output
+ passed &= validate(out_vec.data(), ref.data(), count);
+ }
+ } catch (exception const &e) {
+ std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+ std::terminate();
+ }
+
+ // free the memory allocated with malloc_host or malloc_device
+ sycl::free(in, q);
+ sycl::free(out, q);
+
+ // print the performance results
+ if (passed) {
+ // NOTE: when run in emulation, these results do not accurately represent
+ // the performance of the kernels in actual FPGA hardware
+ double avg_time_ms =
+ std::accumulate(time.begin() + 1, time.end(), 0.0) / (runs - 1);
+
+ IndexT input_count_mega = count * 1e-6;
+
+ std::cout << "Execution time: " << avg_time_ms << " ms\n";
+ std::cout << "Throughput: " << (input_count_mega / (avg_time_ms * 1e-3))
+ << " Melements/s\n";
+
+ std::cout << "PASSED\n";
+ return 0;
+ } else {
+ std::cout << "FAILED\n";
+ return 0;
+ }
+}
+
+// forward declare the kernel and pipe IDs to reduce name mangling
+class InputKernelID;
+class OuputKernelID;
+class SortInPipeID;
+class SortOutPipeID;
+
+//
+// perform the actual sort on the FPGA.
+//
+template
+double fpga_sort(queue &q, ValueT *in_ptr, ValueT *out_ptr, IndexT count) {
+ // the input and output pipe for the sorter
+ using SortInPipe =
+ sycl::INTEL::pipe>;
+ using SortOutPipe =
+ sycl::INTEL::pipe>;
+
+ // the sorter must sort a power of 2, so round up the requested count
+ // to the nearest power of 2; we will pad the input to make sure the
+ // output is still correct
+ const IndexT sorter_count = impu::math::RoundUpPow2(count);
+
+ // allocate some memory for the merge sort to use as temporary storage
+ ValueT *buf_0, *buf_1;
+ if ((buf_0 = malloc_device(sorter_count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate memory for 'buf_0'\n";
+ std::terminate();
+ }
+ if ((buf_1 = malloc_device(sorter_count, q)) == nullptr) {
+ std::cerr << "ERROR: could not allocate memory for 'buf_1'\n";
+ std::terminate();
+ }
+
+ // This is the element we will pad the input with. In the case of this design,
+ // we are sorting from smallest to largest and we want the last elements out
+ // to be this element, so pad with MAX. If you are sorting from largest to
+ // smallest, make this the MIN element. If you are sorting custom types
+ // which are not supported by std::numeric_limits, then you will have to set
+ // this padding element differently.
+ const auto padding_element = std::numeric_limits::max();
+
+ // We are sorting kSortWidth elements per cycle, so we will have
+ // sorter_count/kSortWidth pipe reads/writes from/to the sorter
+ const IndexT total_pipe_accesses = sorter_count / kSortWidth;
+
+ // launch the kernel that provides data into the sorter
+ auto input_kernel_event = q.submit([&](handler &h) {
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ // read from the input pointer and write it to the sorter's input pipe
+ KernelPtrType in(in_ptr);
+
+ for (IndexT i = 0; i < total_pipe_accesses; i++) {
+ // read data from device memory
+ bool in_range = i < sorter_count;
+
+ // build the input pipe data
+ sycl::vec data;
+ #pragma unroll
+ for (unsigned char j = 0; j < kSortWidth; j++) {
+ data[j] = in_range ? in[i * kSortWidth + j] : padding_element;
+ }
+
+ // write it into the sorter
+ SortInPipe::write(data);
+ }
+ });
+ });
+
+ // launch the kernel that reads out data from the sorter
+ auto output_kernel_event = q.submit([&](handler &h) {
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ // read from the sorter's output pipe and write to the output pointer
+ KernelPtrType out(out_ptr);
+
+ for (IndexT i = 0; i < total_pipe_accesses; i++) {
+ // read data from the sorter
+ auto data = SortOutPipe::read();
+
+ // sorter_count is a multiple of kSortWidth
+ bool in_range = i < sorter_count;
+
+ // only write out to device memory if the index is in range
+ if (in_range) {
+ // write output to device memory
+ #pragma unroll
+ for (unsigned char j = 0; j < kSortWidth; j++) {
+ out[i * kSortWidth + j] = data[j];
+ }
+ }
+ }
+ });
+ });
+
+ // launch the merge sort kernels
+ auto merge_sort_events =
+ SubmitMergeSort(q, sorter_count, buf_0, buf_1);
+
+ // wait for the input and output kernels to finish
+ auto start = high_resolution_clock::now();
+ input_kernel_event.wait();
+ output_kernel_event.wait();
+ auto end = high_resolution_clock::now();
+
+ // wait for the merge sort kernels to finish
+ for (auto &e : merge_sort_events) {
+ e.wait();
+ }
+
+ // free the memory allocated for the merge sort temporary buffers
+ sycl::free(buf_0, q);
+ sycl::free(buf_1, q);
+
+ // return the duration of the sort in milliseconds, excluding memory transfers
+ duration diff = end - start;
+ return diff.count();
+}
+
//
// Simple function to check if two regions of memory contain the same values.
//
// val:   pointer to the values under test
// ref:   pointer to the reference (expected) values
// count: number of elements to compare (0 trivially passes)
//
// Returns true iff val[i] == ref[i] for all i in [0, count); on the first
// mismatch, prints the offending index and values and returns false.
//
template <typename T>
bool validate(T *val, T *ref, unsigned int count) {
  for (unsigned int i = 0; i < count; i++) {
    if (val[i] != ref[i]) {
      std::cout << "ERROR: mismatch at entry " << i << "\n";
      std::cout << "\t" << val[i] << " != " << ref[i]
                << " (val[i] != ref[i])\n";
      return false;
    }
  }

  return true;
}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/merge.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/merge.hpp
new file mode 100644
index 0000000000..a32d1dfc4c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/merge.hpp
@@ -0,0 +1,164 @@
+#ifndef __MERGE_HPP__
+#define __MERGE_HPP__
+
+#include
+#include
+
+#include "sorting_networks.hpp"
+#include "impu_math.hpp"
+
+using namespace sycl;
+
+//
+// Streams in two sorted list of size 'in_count`, 'k_width' elements at a time,
+// from both InPipeA and InPipeB and merges them into a single sorted list of
+// size 'in_count*2' to OutPipe. This merges two sorted lists of size in_count
+// at a rate of 'k_width' elements per cycle.
+//
+template
+event Merge(queue& q, IndexT total_count, IndexT in_count,
+ CompareFunc compare) {
+ // sanity check on k_width
+ static_assert(k_width >= 1);
+ static_assert(impu::math::IsPow2(k_width));
+
+ // merging two lists of size 'in_count' into a single output list of
+ // double the size
+ const IndexT out_count = in_count * 2;
+
+ return q.submit([&](handler& h) {
+ h.single_task([=] {
+ // the two input and feedback buffers
+ sycl::vec a, b, network_feedback;
+
+ bool drain_a = false;
+ bool drain_b = false;
+ bool a_valid = false;
+ bool b_valid = false;
+
+ // track the number of elements we have read from each input pipe
+ // for each sublist (counts up to 'in_count')
+ IndexT read_from_a = 0;
+ IndexT read_from_b = 0;
+
+ // create a small 2 element shift register to track whether we have
+ // read the last inputs from the input pipes
+ bool read_from_a_is_last = false; // (0 == in_count)
+ bool read_from_b_is_last = false; // (0 == in_count)
+ bool next_read_from_a_is_last = (k_width == in_count);
+ bool next_read_from_b_is_last = (k_width == in_count);
+
+ // track the number of elements we have written to the output pipe
+ // for each sublist (counts up to 'out_count')
+ IndexT written_out_inner = 0;
+
+ // track the number of elements we have written to the output pipe
+ // in total (counts up to 'total_count')
+ IndexT written_out = 0;
+
+ // this flag indicates that the chosen buffer (from Pipe A or B) is the
+ // first buffer from either sublist. This indicates that no output will
+ // be produced and instead we will just populate the feedback buffer
+ bool first_in_buffer = true;
+
+ // the main processing loop
+ [[intel::initiation_interval(1)]]
+ while (written_out != total_count) {
+ // read 'k_width' elements from Pipe A
+ if (!a_valid && !drain_b) {
+ a = InPipeA::read();
+ a_valid = true;
+ read_from_a_is_last = next_read_from_a_is_last;
+ next_read_from_a_is_last = (read_from_a == in_count-2*k_width);
+ read_from_a += k_width;
+ }
+
+ // read 'k_width' elements from Pipe B
+ if (!b_valid && !drain_a) {
+ b = InPipeB::read();
+ b_valid = true;
+ read_from_b_is_last = next_read_from_b_is_last;
+ next_read_from_b_is_last = (read_from_b == in_count-2*k_width);
+ read_from_b += k_width;
+ }
+
+ // determine which of the two inputs to feed into the merge sort network
+ bool choose_a = ((compare(a[0], b[0]) || drain_a) && !drain_b);
+ auto chosen_data_in = choose_a ? a : b;
+
+ // create input for merge sort network sorter network
+ sycl::vec merge_sort_network_data;
+ #pragma unroll
+ for (unsigned char i = 0; i < k_width; i++) {
+ // populate the k_width*2 sized input for the merge sort network
+ // from the chosen input data and the feedback data
+ merge_sort_network_data[2 * i] = chosen_data_in[i];
+ merge_sort_network_data[2 * i + 1] = network_feedback[i];
+ }
+
+ // merge sort network, which sorts 'merge_sort_network_data' in-place
+ MergeSortNetwork(merge_sort_network_data, compare);
+
+ if (first_in_buffer) {
+ // the first buffer read for a sublist doesn't create any output,
+ // it just creates feedback
+ #pragma unroll
+ for (unsigned char i = 0; i < k_width; i++) {
+ network_feedback[i] = chosen_data_in[i];
+ }
+ drain_a = drain_a | (read_from_b_is_last && !choose_a);
+ drain_b = drain_b | (read_from_a_is_last && choose_a);
+ a_valid = !choose_a;
+ b_valid = choose_a;
+ first_in_buffer = false;
+ } else {
+ sycl::vec out_data;
+ if (written_out_inner == out_count - k_width) {
+ // on the last iteration for a set of sublists, the feedback
+ // is the only data left that is valid, so it goes to the output
+ out_data = network_feedback;
+ } else {
+ // grab the output and feedback data from the merge sort network
+ #pragma unroll
+ for (unsigned char i = 0; i < k_width; i++) {
+ out_data[i] = merge_sort_network_data[i];
+ network_feedback[i] = merge_sort_network_data[k_width + i];
+ }
+ }
+
+ // write the output data to the output pipe
+ OutPipe::write(out_data);
+ written_out += k_width;
+
+ // check if switching to a new set of 'in_count' sorted sublists
+ if (written_out_inner == out_count - k_width) {
+ // switching, so reset all internal counters and flags
+ drain_a = false;
+ drain_b = false;
+ a_valid = false;
+ b_valid = false;
+ read_from_a = 0;
+ read_from_b = 0;
+ read_from_a_is_last = false; // (0 == in_count)
+ read_from_b_is_last = false; // (0 == in_count)
+ next_read_from_a_is_last = (k_width == in_count);
+ next_read_from_b_is_last = (k_width == in_count);
+ written_out_inner = 0;
+ first_in_buffer = true;
+ } else {
+ // not switching, so update counters and flags
+ written_out_inner += k_width;
+ drain_a = drain_a | (read_from_b_is_last && !choose_a);
+ drain_b = drain_b | (read_from_a_is_last && choose_a);
+ a_valid = !choose_a;
+ b_valid = choose_a;
+ }
+ }
+ }
+ });
+ });
+}
+
+#endif /* __MERGE_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/merge_sort.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/merge_sort.hpp
new file mode 100644
index 0000000000..2b82ae4f67
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/merge_sort.hpp
@@ -0,0 +1,394 @@
+#ifndef __MERGESORT_HPP__
+#define __MERGESORT_HPP__
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "consume.hpp"
+#include "merge.hpp"
+#include "produce.hpp"
+#include "sorting_networks.hpp"
+#include "unrolled_loop.hpp"
+#include "pipe_array.hpp"
+#include "impu_math.hpp"
+
+using namespace sycl;
+
///////////////////////////////////////////////////////////////
// Convenient default comparators
// Comparator functor implementing 'a < b' (ascending sort); this is the
// default comparator used by the SubmitMergeSort convenience overload.
struct LessThan {
  template <class T>
  bool operator()(T const& a, T const& b) const {
    return a < b;
  }
};
+
// Comparator functor implementing 'a > b' (descending sort).
struct GreaterThan {
  template <class T>
  bool operator()(T const& a, T const& b) const {
    return a > b;
  }
};
///////////////////////////////////////////////////////////////
+
///////////////////////////////////////////////////////////////
// Forward declare kernel and pipe IDs to reduce name mangling
// Kernel and pipe ID classes for the merge units; each merge unit 'u'
// gets its own instantiation of each kernel ID.
template <size_t u>
class ProduceAKernelID;
template <size_t u>
class ProduceBKernelID;
template <size_t u>
class MergeKernelID;
template <size_t u>
class ConsumeKernelID;

class SortNetworkID;

class APipeID;
class BPipeID;
class MergePipeID;
class InternalOutPipeID;

// Kernel and pipe ID classes for the merge tree; one kernel per
// (level, unit) position in the tree.
template <size_t level, size_t u>
class MergeTreeMergeKernelID;

class InternalMergeTreePipeID;
///////////////////////////////////////////////////////////////
+
+//
+// Submits all of the merge sort kernels necessary to sort 'count' elements.
+// Returns all of the events for the caller to wait on.
+// NOTE: there is no need to worry about returing a std::vector by value here;
+// C++ return-value-optimization (RVO) will take care of it!
+//
+template
+std::vector SubmitMergeSort(queue& q, size_t count, ValueT* buf_0,
+ ValueT* buf_1, Compare comp) {
+ // sanity check the number of merge units and the width of the sorter
+ static_assert(units >= 1);
+ static_assert(impu::math::IsPow2(units));
+ static_assert(k_width >= 1);
+ static_assert(impu::math::IsPow2(k_width));
+
+ // sanity check on IndexT
+ static_assert(std::is_integral_v);
+
+ // ensure we have a valid compare function
+ static_assert(
+ std::is_invocable_r_v,
+ "The 'Compare' function type must be invocable (i.e. operator()) with two"
+ "'ValueT' arguments and returning a boolean");
+
+ // A depth of 0 allows the compiler to pick the depth for each pipe, which
+ // allows it to balance the depth of the pipeline.
+ constexpr size_t kDefaultPipeDepth = 0;
+
+ // the type that is passed around the pipes
+ using PipeType = sycl::vec;
+
+ // the pipes connecting the different kernels of each merge unit
+ // one set of pipes for each 'units' merge units
+ using APipes =
+ impu::pipe::PipeArray;
+ using BPipes =
+ impu::pipe::PipeArray;
+ using MergePipes =
+ impu::pipe::PipeArray;
+ using InternalOutPipes =
+ impu::pipe::PipeArray;
+
+ //////////////////////////////////////////////////////////////////////////////
+ // These defines make the latter code cleaner
+ #define SubmitSortNetworkKernel \
+ SortNetworkKernel
+ #define SubmitProduceA \
+ Produce, ValueT, IndexT, APipe, k_width>
+ #define SubmitProduceB \
+ Produce, ValueT, IndexT, BPipe, k_width>
+ #define SubmitMerge \
+ Merge, ValueT, IndexT, APipe, BPipe, MergePipe, k_width>
+ #define SubmitConsume \
+ Consume, ValueT, IndexT, MergePipe, InternalOutPipe, \
+ k_width>
+ #define SubmitMTMerge \
+ Merge, ValueT, IndexT, MTAPipe, \
+ MTBPipe, MTOutPipe, k_width>
+ //////////////////////////////////////////////////////////////////////////////
+
+ // depth of the merge tree to reduce the sorted partitions of each merge unit
+ constexpr size_t kReductionLevels = impu::math::Log2(units);
+
+ // validate 'count'
+ if (count == 0) {
+ std::cerr << "ERROR: 'count' must be greater than 0\n";
+ std::terminate();
+ } else if (!impu::math::IsPow2(count)) {
+ std::cerr << "ERROR: 'count' must be a power of 2\n";
+ std::terminate();
+ } else if (count < 4 * units) {
+ std::cerr << "ERROR: 'count' must be at least 4x greater than "
+ << "the number of merge units (" << units << ")\n";
+ std::terminate();
+ } else if (count > std::numeric_limits::max()) {
+ std::cerr << "ERROR: the index type does not have enough bits to count to "
+ << "'count'\n";
+ std::terminate();
+ } else if ((count / units) <= k_width) {
+ std::cerr << "ERROR: 'count/units' (elements per merge unit) "
+ << "must be greater than k_width\n";
+ std::terminate();
+ }
+
+ // validate the input buffers
+ if (buf_0 == nullptr) {
+ std::cerr << "ERROR: 'buf_0' is nullptr\n";
+ std::terminate();
+ }
+ if (buf_1 == nullptr) {
+ std::cerr << "ERROR: 'buf_1' is nullptr\n";
+ std::terminate();
+ }
+
+ // double buffering is more convenient with an array of pointers,
+ // so create one from the two buffers passed in by the caller
+ ValueT* buf[2] = {buf_0, buf_1};
+
+ // using double buffering, so track the current buffer and have a simple
+ // lamda to compute the next buffer index
+ unsigned buf_idx = 0;
+ auto next_buf_idx = [](unsigned buf_idx) { return buf_idx ^ 0x1; };
+
+ // the number of elements each merge unit will sort
+ const IndexT count_per_unit = count / units;
+
+ // each producer will produce half of the data for each merge unit
+ const IndexT half_count_per_unit = count_per_unit / 2;
+
+ // the number of sorting iterations each merge unit will perform
+ // NOTE: we subtract log2(k_width) because the bitonic sorting network
+ // performs the first log2(k_width) iterations of the sort while streaming
+ // the input data from the input pipe into device memory.
+ const IndexT iterations =
+ impu::math::Log2(count_per_unit) - impu::math::Log2(k_width);
+
+ // store the various merge unit and merge tree kernel events
+ std::array, units> produce_a_events, produce_b_events,
+ merge_events, consume_events;
+ std::array, kReductionLevels> mt_merge_events;
+ for (size_t i = 0; i < units; i++) {
+ produce_a_events[i].resize(iterations);
+ produce_b_events[i].resize(iterations);
+ merge_events[i].resize(iterations);
+ consume_events[i].resize(iterations);
+ }
+
+ // launch the sorting network kernel that performs the first log2(k_width)
+ // iterations of the sort. For example, if k_width=4, the sorting network
+ // sorts 4 elements per cycle, in the steady state. This means we need
+ // log2(4)=2 less iterations of the merge sort since we start with sorted
+ // sublists of size 4.
+ auto sort_network_event =
+ SubmitSortNetworkKernel(q, buf[buf_idx], count, comp);
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Launching all of the merge unit kernels
+ // start with inputs of size 'k_width' since the data from the input pipe
+ // was sent through a sorting network that sorted sublists of size 'k_width'.
+ IndexT in_count = k_width;
+
+ // perform the sort iterations for each merge unit
+ for (size_t i = 0; i < iterations; i++) {
+ // The Consume kernels will write to a pipe on the last iteration
+ bool consumer_to_pipe = (i == (iterations - 1));
+
+ // launch the merge unit kernels for this iteration of the sort using
+ // a front-end meta-programming unroller
+ impu::UnrolledLoop([&](auto u) {
+ // the intra merge unit pipes
+ using APipe = typename APipes::template PipeAt;
+ using BPipe = typename BPipes::template PipeAt;
+ using MergePipe = typename MergePipes::template PipeAt;
+
+ // if there is only 1 merge unit, there will be no merge tree, so the
+ // single merge unit's output pipe will be the entire sort's output pipe
+ using InternalOutPipe =
+ std::conditional_t<(units == 1), OutPipe,
+ typename InternalOutPipes::template PipeAt>;
+
+ // build the dependency event vector
+ std::vector wait_events;
+ if (i == 0) {
+ // on the first iteration, wait for sorting network kernel to be done so
+ // that all of the data is in the temp buffers in device memory
+ wait_events.push_back(sort_network_event);
+ } else {
+ // on all iterations (except the first), Produce kernels for the
+ // current iteration must wait for the Consume kernels to be done
+ // writing to device memory from the previous iteration.
+ // This is coarse grain synchronization between the Produce and Consume
+ // kernels of each merge unit.
+ wait_events.push_back(consume_events[u][i - 1]);
+ }
+
+ // the temporary device buffers reside in a single device allocation,
+ // so compute the offset into the buffer for each merge unit.
+ const size_t unit_buf_offset = count_per_unit * u;
+
+ // get device pointers for this merge unit's Produce and Consume kernels
+ ValueT* in_buf = buf[buf_idx];
+ ValueT* out_buf = buf[next_buf_idx(buf_idx)];
+
+ ////////////////////////////////////////////////////////////////////////
+ // Enqueue the merge unit kernels
+ // Produce A
+ produce_a_events[u][i] =
+ SubmitProduceA(q, in_buf, half_count_per_unit, in_count,
+ unit_buf_offset, wait_events);
+
+ // Produce B
+ produce_b_events[u][i] =
+ SubmitProduceB(q, in_buf, half_count_per_unit, in_count,
+ unit_buf_offset + half_count_per_unit, wait_events);
+
+ // Merge
+ merge_events[u][i] = SubmitMerge(q, count_per_unit, in_count, comp);
+
+ // Consume
+ consume_events[u][i] = SubmitConsume(q, out_buf, count_per_unit,
+ unit_buf_offset, consumer_to_pipe);
+ ////////////////////////////////////////////////////////////////////////
+ });
+ ////////////////////////////////////////////////////////////////////////
+
+ // swap buffers
+ buf_idx = next_buf_idx(buf_idx);
+
+ // increase the input size
+ in_count *= 2;
+ }
+ ////////////////////////////////////////////////////////////////////////////
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Launching all of the merge tree kernels
+
+ // the merge tree pipe array
+ // NOTE: we actually only need 2^(kReductionLevels)-2 total pipes,
+ // but we have created a 2D pipe array with kReductionLevels*units
+ // pipes. The 2D pipe array makes the metaprogramming much easier and the
+ // front-end compiler will not use the extra pipes and therefore they
+ // will NOT be instantiated in hardware
+ using InternalMTPipes =
+ impu::pipe::PipeArray;
+
+ // create the merge tree connected by pipes to merge the sorted output
+ // of each merge unit into a single sorted output. The output of the last
+ // level of the merge tree will stream out of 'OutPipe'.
+ // NOTE: if units==1, then there is no merge tree!
+ impu::UnrolledLoop([&](auto level) {
+ // each level of the merge tree reduces the number of sorted partitions
+ // by a factor of 2.
+ // level 0 has 'units' merge kernels, level 1 has 'units/2', and so on...
+ // See README.md for a good illustration.
+ constexpr size_t kLevelMergeUnits = units / ((1 << level) * 2);
+
+ impu::UnrolledLoop([&](auto merge_unit) {
+ // When level == 0, we know we will use 'MTAPipeFromMergeUnit' and
+ // 'MTBPipeFromMergeUnit' below. However, we cannot access
+ // PipeAt<-1, ...> without a compiler error. So, we will set the previous
+ // level to 0, knowing that we will NOT use 'MTAPipeFromMergeTree' nor
+ // 'MTBPipeFromMergeTree' in the case that level == 0.
+ constexpr size_t prev_level = (level == 0) ? 0 : level - 1;
+
+ // 'PipeA' for this merge kernel in the merge tree.
+ // If the merge tree level is 0, the pipe is from a merge unit,
+ // otherwise it is from the previous level of the merge tree.
+ using MTAPipeFromMergeUnit =
+ typename InternalOutPipes::template PipeAt;
+ using MTAPipeFromMergeTree =
+ typename InternalMTPipes::template PipeAt;
+ using MTAPipe =
+ typename std::conditional_t<(level == 0), MTAPipeFromMergeUnit,
+ MTAPipeFromMergeTree>;
+
+ // 'PipeB' for this merge kernel in the merge tree.
+ // If the merge tree level is 0, the pipe is from a merge unit,
+ // otherwise it is from the previous level of the merge tree.
+ using MTBPipeFromMergeUnit =
+ typename InternalOutPipes::template PipeAt;
+ using MTBPipeFromMergeTree =
+ typename InternalMTPipes::template PipeAt;
+ using MTBPipe =
+ typename std::conditional_t<(level == 0), MTBPipeFromMergeUnit,
+ MTBPipeFromMergeTree>;
+
+ // 'OutPipe' for this merge kernel in the merge tree.
+ // If this is the last level, then the output pipe is the output pipe
+ // of the entire sorter, otherwise it is going to another level of the
+ // merge tree.
+ using MTOutPipeToMT =
+ typename InternalMTPipes::template PipeAt;
+ using MTOutPipe =
+ typename std::conditional_t<(level == (kReductionLevels - 1)),
+ OutPipe, MTOutPipeToMT>;
+
+ // Launch the merge kernel
+ const auto e = SubmitMTMerge(q, in_count * 2, in_count, comp);
+ mt_merge_events[level].push_back(e);
+ });
+
+ // increase the input size
+ in_count *= 2;
+ });
+ ////////////////////////////////////////////////////////////////////////////
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Combine all kernel events into a single return vector
+ std::vector ret;
+
+ // add event from the sorting network stage
+ ret.push_back(sort_network_event);
+
+ // add each merge unit's sorting events
+ for (size_t u = 0; u < units; u++) {
+ ret.insert(ret.end(), produce_a_events[u].begin(),
+ produce_a_events[u].end());
+ ret.insert(ret.end(), produce_b_events[u].begin(),
+ produce_b_events[u].end());
+ ret.insert(ret.end(), merge_events[u].begin(), merge_events[u].end());
+ ret.insert(ret.end(), consume_events[u].begin(), consume_events[u].end());
+ }
+
+ // add the merge tree kernel events
+ for (size_t level = 0; level < kReductionLevels; level++) {
+ ret.insert(ret.end(), mt_merge_events[level].begin(),
+ mt_merge_events[level].end());
+ }
+
+ return ret;
+ ////////////////////////////////////////////////////////////////////////////
+}
+
+//
+// A convenient function that defaults the sorter's comparator to 'LessThan'
+// (i.e., operator<)
+//
+template
+std::vector SubmitMergeSort(queue& q, IndexT count, ValueT* buf_0,
+ ValueT* buf_1) {
+ return SubmitMergeSort(
+ q, count, buf_0, buf_1, LessThan());
+}
+
+#endif /* __MERGESORT_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/pipe_array.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/pipe_array.hpp
new file mode 100644
index 0000000000..4523715936
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/pipe_array.hpp
@@ -0,0 +1,95 @@
+#ifndef __PIPE_ARRAY_HPP__
+#define __PIPE_ARRAY_HPP__
+
+#include
+#include
+#include
+
+#include "pipe_array_internal.hpp"
+
+namespace impu {
+namespace pipe {
+
+template
+struct PipeArray {
+ PipeArray() = delete; // ensure we cannot create an instance
+
+ template
+ struct StructId; // the ID of each pipe in the array
+
+ // VerifyIndices checks that we only access pipe indicies that are in range
+ template
+ struct VerifyIndices {
+ static_assert(sizeof...(idxs) == sizeof...(dims),
+ "Indexing into a PipeArray requires as many indices as "
+ "dimensions of the PipeArray.");
+ static_assert(detail::VerifierDimLayer::template VerifierIdxLayer<
+ idxs...>::IsValid(),
+ "Index out of bounds");
+ using VerifiedPipe =
+ cl::sycl::INTEL::pipe, BaseTy, min_depth>;
+ };
+
+ // helpers for accessing the dimensions of the pipe array
+ // usage:
+ // MyPipeArray::GetNumDims() - number of dimensions in this pipe array
+ // MyPipeArray::GetDimSize<3>() - size of dimension 3 in this pipe array
+ static constexpr size_t GetNumDims() { return (sizeof...(dims)); }
+ template
+ static constexpr size_t GetDimSize() {
+ return std::get(dims...);
+ }
+
+ // PipeAt is used to reference a pipe at a particular index
+ template
+ using PipeAt = typename VerifyIndices::VerifiedPipe;
+
+ // functor to implement blocking write to all pipes in the array
+ template
+ struct BlockingWriteFunc {
+ void operator()(const BaseTy &data, bool &success) const {
+ PipeAt::write(data);
+ }
+ };
+ // functor to impllement non-blocking write to all pipes in the array
+ template
+ struct NonBlockingWriteFunc {
+ void operator()(const BaseTy &data, bool &success) const {
+ PipeAt::write(data, success);
+ }
+ };
+ // helper function for implementing write() call to all pipes in the array
+ template class WriteFunc,
+ typename... IndexSequences>
+ static void write_currying_helper(const BaseTy &data, bool &success,
+ IndexSequences...) {
+ detail::write_currying,
+ IndexSequences...>()(data, success);
+ }
+
+ // blocking write
+ // write the same data to all pipes in the array using blocking writes
+ static void write(const BaseTy &data) {
+ bool success; // temporary variable, ignored in BlockingWriteFunc
+ write_currying_helper(
+ data, success, std::make_index_sequence()...);
+ }
+
+ // non-blocking write
+ // write the same data to all pipes in the array using non-blocking writes
+ static void write(const BaseTy &data, bool &success) {
+ write_currying_helper(
+ data, success, std::make_index_sequence()...);
+ }
+
+}; // end of struct PipeArray
+
+} // namespace pipe
+} // namespace impu
+
+#endif /* __PIPE_ARRAY_HPP__ */
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/pipe_array_internal.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/pipe_array_internal.hpp
new file mode 100644
index 0000000000..a28df0c653
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/pipe_array_internal.hpp
@@ -0,0 +1,58 @@
+#ifndef __PIPE_ARRAY_INTERNAL_HPP__
+#define __PIPE_ARRAY_INTERNAL_HPP__
+
+namespace impu {
+namespace pipe {
+namespace detail {
+
// Templated classes for verifying dimensions when accessing elements in the
// pipe array. Each layer peels off one (dimension, index) pair and checks
// idx < dim, recursing until a single dimension remains.
template <std::size_t dim1, std::size_t... dims>
struct VerifierDimLayer {
  template <std::size_t idx1, std::size_t... idxs>
  struct VerifierIdxLayer {
    // true iff every index is strictly less than its matching dimension
    static constexpr bool IsValid() {
      return idx1 < dim1 &&
             (VerifierDimLayer<dims...>::template VerifierIdxLayer<
                 idxs...>::IsValid());
    }
  };
};
// base case: one remaining dimension/index pair
template <std::size_t dim>
struct VerifierDimLayer<dim> {
  template <std::size_t idx>
  struct VerifierIdxLayer {
    static constexpr bool IsValid() { return idx < dim; }
  };
};
+
// Templated classes to perform 'currying' write to all pipes in the array.
// write_currying peels off one std::index_sequence (one array dimension) at a
// time, accumulating chosen indices in 'PartialSequence'; at the base case the
// complete index pack is handed to WriteFunc, which performs the actual write.
// Primary template, dummy
template <template <std::size_t...> class WriteFunc, typename BaseTy,
          typename PartialSequence, typename... RemainingSequences>
struct write_currying {};
// Induction case: for each index J in the next dimension, append J to the
// accumulated indices I... and recurse on the remaining dimensions
// (comma-operator fold over the pack J).
template <template <std::size_t...> class WriteFunc, typename BaseTy,
          std::size_t... I, std::size_t... J, typename... RemainingSequences>
struct write_currying<WriteFunc, BaseTy, std::index_sequence<I...>,
                      std::index_sequence<J...>, RemainingSequences...> {
  void operator()(const BaseTy &data, bool &success) const {
    (write_currying<WriteFunc, BaseTy, std::index_sequence<I..., J>,
                    RemainingSequences...>()(data, success),
     ...);
  }
};
// Base case: all dimensions consumed; I... is one complete multi-dimensional
// index, so invoke the write functor for that single pipe.
template <template <std::size_t...> class WriteFunc, typename BaseTy,
          std::size_t... I>
struct write_currying<WriteFunc, BaseTy, std::index_sequence<I...>> {
  void operator()(const BaseTy &data, bool &success) const {
    WriteFunc<I...>()(data, success);
  }
};
+
+} // namespace detail
+} // namespace pipe
+} // namespace impu
+
+#endif /* __PIPE_ARRAY_INTERNAL_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/produce.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/produce.hpp
new file mode 100644
index 0000000000..ce6c8cb63b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/produce.hpp
@@ -0,0 +1,45 @@
+#ifndef __PRODUCE_HPP__
+#define __PRODUCE_HPP__
+
+#include
+#include
+
+using namespace sycl;
+
+//
+// Produces 'k_width' elements of data per cycle into the merge unit from
+// device memory
+//
+template
+event Produce(queue& q, ValueT *in_ptr, IndexT count, IndexT in_block_count,
+ IndexT start_offset, std::vector& depend_events) {
+ // the number of loop iterations required to produce all of the data
+ const IndexT iterations = count / k_width;
+
+ return q.submit([&](handler& h) {
+ h.depends_on(depend_events);
+
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ // Pointer to the input data.
+ // Creating a device_ptr tells the compiler that this pointer is in
+ // device memory, not host memory, and avoids creating extra connections
+ // to host memory
+ device_ptr in(in_ptr);
+
+ for (IndexT i = 0; i < iterations; i++) {
+ // read 'k_width' elements from device memory
+ sycl::vec pipe_data;
+ #pragma unroll
+ for (unsigned char j = 0; j < k_width; j++) {
+ pipe_data[j] = in[start_offset + i*k_width + j];
+ }
+
+ // write to the output pipe
+ OutPipe::write(pipe_data);
+ }
+ });
+ });
+}
+
+#endif /* __PRODUCE_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/sorting_networks.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/sorting_networks.hpp
new file mode 100644
index 0000000000..d199ecc85c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/sorting_networks.hpp
@@ -0,0 +1,133 @@
+#ifndef __SORTINGNETWORKS_HPP__
+#define __SORTINGNETWORKS_HPP__
+
+#include
+#include
+
+#include "impu_math.hpp"
+
// Exchange the values of (a) and (b) through a temporary.
// Wrapped in do { } while (0) so the macro expands to a single statement
// that is safe inside unbraced if/else bodies.
#define SWAP(a, b)     \
  do {                 \
    auto tmp_ = (a);   \
    (a) = (b);         \
    (b) = tmp_;        \
  } while (0)
+
+using namespace sycl;
+
+//
+// Creates a merge sort network.
+// Takes in two sorted lists ('a' and 'b') of size 'k_width' and merges them
+// into a single sorted output in a single cycle, in the steady state.
+//
+// Convention:
+// a = {data[0], data[2], data[4], ...}
+// b = {data[1], data[3], data[5], ...}
+//
+template
+void MergeSortNetwork(sycl::vec& data,
+ CompareFunc compare) {
+ if constexpr (k_width == 4) {
+ // Special case for k_width==4 that has 1 less compare on the critical path
+ #pragma unroll
+ for (unsigned char i = 0; i < 4; i++) {
+ if (!compare(data[2 * i], data[2 * i + 1])) {
+ SWAP(data[2 * i], data[2 * i + 1]);
+ }
+ }
+
+ if (!compare(data[1], data[4])) {
+ SWAP(data[1], data[4]);
+ }
+ if (!compare(data[3], data[6])) {
+ SWAP(data[3], data[6]);
+ }
+
+ #pragma unroll
+ for (unsigned char i = 0; i < 3; i++) {
+ if (!compare(data[2 * i + 1], data[2 * i + 2])) {
+ SWAP(data[2 * i + 1], data[2 * i + 2]);
+ }
+ }
+ } else {
+ // the general case
+ // this works well for k_width = 1 or 2, but is not optimal for
+ // k_width = 4 (see if-case above) or higher
+ constexpr unsigned char merge_tree_depth = impu::math::Log2(k_width * 2);
+ #pragma unroll
+ for (unsigned i = 0; i < merge_tree_depth; i++) {
+ #pragma unroll
+ for (unsigned j = 0; j < k_width - i; j++) {
+ if (!compare(data[i + 2 * j], data[i + 2 * j + 1])) {
+ SWAP(data[i + 2 * j], data[i + 2 * j + 1]);
+ }
+ }
+ }
+ }
+}
+
+//
+// Creates a bitonic sorting network.
+// It accepts and sorts 'k_width' elements per cycle, in the steady state.
+// For more info see: https://en.wikipedia.org/wiki/Bitonic_sorter
+//
+template
+void BitonicSortNetwork(sycl::vec& data, CompareFunc compare) {
+ #pragma unroll
+ for (unsigned char k = 2; k <= k_width; k *= 2) {
+ #pragma unroll
+ for (unsigned char j = k / 2; j > 0; j /= 2) {
+ #pragma unroll
+ for (unsigned char i = 0; i < k_width; i++) {
+ const unsigned char l = i ^ j;
+ if (l > i) {
+ const bool comp = compare(data[i], data[l]);
+ const bool cond1 = ((i & k) == 0) && !comp;
+ const bool cond2 = ((i & k) != 0) && comp;
+ if (cond1 || cond2) {
+ SWAP(data[i], data[l]);
+ }
+ }
+ }
+ }
+ }
+}
+
+//
+// The sorting network kernel.
+// This kernel streams in 'k_width' elements per cycle, sends them through a
+// 'k_width' wide bitonic sorting network, and writes the sorted output of size
+// 'k_width' to device memory. The result is an output array ('out_ptr') of size
+// 'total_count' where each set of 'k_width' elements is sorted.
+//
+template
+event SortNetworkKernel(queue& q, ValueT* out_ptr, IndexT total_count,
+ CompareFunc compare) {
+ // the number of loop iterations required to process all of the data
+ const IndexT iterations = total_count / k_width;
+
+ return q.submit([&](handler& h) {
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ device_ptr out(out_ptr);
+
+ for (IndexT i = 0; i < iterations; i++) {
+ // read the input data from the pipe
+ sycl::vec data = InPipe::read();
+
+ // bitonic sort network sorts the k_width elements of 'data' in-place
+ // NOTE: there are no dependencies across loop iterations on 'data'
+ // here, so this sorting network can be fully pipelined
+ BitonicSortNetwork(data, compare);
+
+ // write the 'k_width' sorted elements to device memory
+ #pragma unroll
+ for (unsigned char j = 0; j < k_width; j++) {
+ out[i * k_width + j] = data[j];
+ }
+ }
+ });
+ });
+}
+
+#endif /* __SORTINGNETWORKS_HPP__ */
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/unrolled_loop.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/unrolled_loop.hpp
new file mode 100755
index 0000000000..ff58e78b26
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/merge_sort/src/unrolled_loop.hpp
@@ -0,0 +1,188 @@
+#ifndef __UNROLLEDLOOP_HPP__
+#define __UNROLLEDLOOP_HPP__
+#pragma once
+
+#include
+#include
+
+namespace impu {
+//
+// The code below creates the constexprs 'make_integer_range'
+// and 'make_index_range' these are akin to 'std::make_integer_sequence'
+// and 'std::make_index_sequence', respectively.
+// However they allow you to specify a range and can either increment
+// or decrement, rather than a strict increasing sequence
+//
// Primary template; specialized below on the direction of the range.
template <typename T, typename, T begin, bool increase>
struct integer_range_impl;

// incrementing case: shift the base sequence 0..(count-1) up by 'begin'
template <typename T, T... N, T begin>
struct integer_range_impl<T, std::integer_sequence<T, N...>, begin, true> {
  using type = std::integer_sequence<T, N + begin...>;
};

// decrementing case: count down from 'begin'
template <typename T, T... N, T begin>
struct integer_range_impl<T, std::integer_sequence<T, N...>, begin, false> {
  using type = std::integer_sequence<T, begin - N...>;
};

// integer_range
// produces begin,begin+1,...,end-1 when begin < end,
// and begin,begin-1,...,end+1 otherwise
template <typename T, T begin, T end>
using integer_range = typename integer_range_impl<
    T, std::make_integer_sequence<T, (begin < end) ? end - begin : begin - end>,
    begin, (begin < end)>::type;
+
+//
+// make_integer_range
+//
+// USAGE:
+// make_integer_range{} ==> 1,2,...,9
+// make_integer_range{} ==> 10,9,...,2
+//
+template
+using make_integer_range = integer_range;
+
+//
+// make_index_range
+//
+// USAGE:
+// make_index_range<1,10>{} ==> 1,2,...,9
+// make_index_range<10,1>{} ==> 10,9,...,2
+//
+template
+using make_index_range = integer_range;
+
//
// The code below creates the constexprs 'make_integer_pow2_sequence'
// and 'make_index_pow2_sequence'. These generate the sequence
// 2^0, 2^1, 2^2, ... , 2^(N-1) = 1,2,4,...,2^(N-1)
//
template <typename T, typename>
struct integer_pow2_sequence_impl;

// specialization: map each exponent P in 0..(N-1) to 2^P via left shift
template <typename T, T... Pows>
struct integer_pow2_sequence_impl<T, std::integer_sequence<T, Pows...>> {
  using type = std::integer_sequence<T, (T(1) << Pows)...>;
};

// integer_pow2_sequence
template <typename T, T N>
using integer_pow2_sequence = typename integer_pow2_sequence_impl<
    T, std::make_integer_sequence<T, N>>::type;

//
// make_integer_pow2_sequence
//
// USAGE:
//    make_integer_pow2_sequence<int, 5>{} ==> 1,2,4,8,16
//
template <typename T, T N>
using make_integer_pow2_sequence = integer_pow2_sequence<T, N>;

//
// make_index_pow2_sequence
//
// USAGE:
//    make_index_pow2_sequence<5>{} ==> 1,2,4,8,16
//
template <std::size_t N>
using make_index_pow2_sequence = integer_pow2_sequence<std::size_t, N>;
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Example usage for UnrolledLoop constexpr:
+//
+// Base
+//    UnrolledLoop(std::integer_sequence<int, 5, 2, 7, 8>{}, [&](auto i) {
+//      /* i = 5,2,7,8 */
+//    });
+//
+// Case A
+//    UnrolledLoop<int, 10>([&](auto i) {
+//      /* i = 0,1,...,9 (type int) */
+//    });
+//
+// Case B
+//    UnrolledLoop<10>([&](auto i) {
+//      /* i = 0,1,...,9 (type std::size_t) */
+//    });
+//
+// Case C
+//    UnrolledLoop<int, 1, 10>([&](auto i) {
+//      /* i = 1,2,...,9 */
+//    });
+//    UnrolledLoop<int, 10, 1>([&](auto i) {
+//      /* i = 10,9,...,2 */
+//    });
+//
+// Case D
+//    UnrolledLoop<1, 10>([&](auto i) {
+//      /* i = 1,2,...,9 */
+//    });
+//    UnrolledLoop<10, 1>([&](auto i) {
+//      /* i = 10,9,...,2 */
+//    });
+//
+///////////////////////////////////////////////////////////////////////////////
+
//
// Base implementation
// Templated on:
//    ItType    - the type of the iterator (size_t, int, char, ...)
//    ItType... - the indices to iterate on
//    F         - the function to run for each index (i.e. the lambda)
//
template <class ItType, ItType... inds, class F>
constexpr void UnrolledLoop(std::integer_sequence<ItType, inds...>, F&& f) {
  // Fold over the comma operator: call 'f' once per index, passing each index
  // as a std::integral_constant so it remains usable as a compile-time value.
  (f(std::integral_constant<ItType, inds>{}), ...);
}
+
+//
+// Convience implementation (A)
+// performs UnrolledLoop in range [0,n) with iterator of type ItType
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(std::make_integer_sequence{}, std::forward(f));
+}
+
+//
+// Convenience implementation (B)
+// performs UnrolledLoop in range [0,n) with an iterator of type std::size_t
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(std::make_index_sequence{}, std::forward(f));
+}
+
+//
+// Convenience implementation (C)
+// performs UnrolledLoop from start...end with an iterator of type ItType
+// NOTE: start is INCLUSIVE, end is EXCLUSIVE
+// NOTE: if start<=end, sequence is start,start+1,...,end-1
+// if end<=start, sequence is start,start-1,...,end+1
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(make_integer_range{}, std::forward(f));
+}
+
+//
+// Convenience implementation (C)
+// performs UnrolledLoop from start...end with an iterator of type size_t
+// NOTE: start is INCLUSIVE, end is EXCLUSIVE
+// NOTE: if start<=end, sequence is start,start+1,...,end-1
+// if end<=start, sequence is start,start-1,...,end+1
+//
+template
+constexpr void UnrolledLoop(F&& f) {
+ UnrolledLoop(make_index_range{}, std::forward(f));
+}
+
+} // namespace impu
+
+#endif /* __UNROLLEDLOOP_HPP__ */