From 006302686c6277bd473b7e2e07ae68d90aced775 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Thu, 19 Oct 2023 11:57:54 +1100
Subject: [PATCH 01/47] Update jitify2 copyright years

---
 LICENSE                              | 2 +-
 example_headers/class_arg_kernel.cuh | 2 +-
 example_headers/constant_header.cuh  | 2 +-
 example_headers/my_header1.cuh       | 2 +-
 example_headers/my_header2.cuh       | 2 +-
 example_headers/my_header3.cuh       | 2 +-
 jitify2.hpp                          | 2 +-
 jitify2_preprocess.cpp               | 2 +-
 jitify2_test.cu                      | 2 +-
 jitify2_test_kernels.cu              | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/LICENSE b/LICENSE
index b678a46..a4d873b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2017-2024, NVIDIA Corporation
+Copyright (c) 2017-2025, NVIDIA Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/example_headers/class_arg_kernel.cuh b/example_headers/class_arg_kernel.cuh
index 19dd48a..b452ba3 100644
--- a/example_headers/class_arg_kernel.cuh
+++ b/example_headers/class_arg_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/example_headers/constant_header.cuh b/example_headers/constant_header.cuh
index f3f1cc9..0eaf9bf 100644
--- a/example_headers/constant_header.cuh
+++ b/example_headers/constant_header.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/example_headers/my_header1.cuh b/example_headers/my_header1.cuh
index 7f07df7..38027c9 100644
--- a/example_headers/my_header1.cuh
+++ b/example_headers/my_header1.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/example_headers/my_header2.cuh b/example_headers/my_header2.cuh
index f5a90c2..c776fae 100644
--- a/example_headers/my_header2.cuh
+++ b/example_headers/my_header2.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/example_headers/my_header3.cuh b/example_headers/my_header3.cuh
index 4933de5..e5f3cc7 100644
--- a/example_headers/my_header3.cuh
+++ b/example_headers/my_header3.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/jitify2.hpp b/jitify2.hpp
index d5d379d..25592c4 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/jitify2_preprocess.cpp b/jitify2_preprocess.cpp
index 575efe9..93ffdbe 100644
--- a/jitify2_preprocess.cpp
+++ b/jitify2_preprocess.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/jitify2_test.cu b/jitify2_test.cu
index f22c684..e53bc96 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/jitify2_test_kernels.cu b/jitify2_test_kernels.cu
index 8dbcbab..d2681b3 100644
--- a/jitify2_test_kernels.cu
+++ b/jitify2_test_kernels.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions

From f5cd6e1aa6dc03377656a293b3edbf55ccaba9a7 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Mon, 6 Nov 2023 14:39:47 +1100
Subject: [PATCH 02/47] Overhaul parsing and preprocessing

- Replaces C++ lexing/parsing/patching code with a proper lexer
  implementation, which significantly improves robustness and
  maintainability.
- Replaces minification logic with robust token-based minification.
- Replaces preprocessing logic with a new approach that uses custom
  parsing to find include directives. This only requires invoking
  NVRTC (and only its preprocessor) once per preprocess, which speeds
  up preprocessing by 50x in some cases.
- Fixes include directory handling. Relative include paths are now
  handled robustly, and there is no longer any ambiguity between
  external and built-in headers. Note that relative paths (including
  in -I options) now start from the current executable directory
  instead of the current working directory.
- These changes should be almost completely backwards compatible.
---
 jitify2.hpp     | 2303 ++++++++++++++++++++++++++++++++++++-----------
 jitify2_test.cu |  549 ++++++++++-
 2 files changed, 2296 insertions(+), 556 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 25592c4..be1cb51 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -110,6 +110,7 @@
 #include <list>
 #include <map>
 #include <memory>
+#include <queue>
 #include <regex>
 #include <thread>
 #include <type_traits>
@@ -2294,6 +2295,15 @@ inline std::string path_base(const std::string& p) {
   }
 }
 
+inline bool path_is_absolute(const std::string& p) {
+#if defined _WIN32 || defined _WIN64
+  return (p.size() >= 1 && (p[0] == '\\' || p[0] == '/')) ||
+         (p.size() >= 3 && p[1] == ':' && (p[2] == '\\' || p[2] == '/'));
+#else
+  return p.size() >= 1 && p[0] == '/';
+#endif
+}
+
 inline std::string path_join(StringRef p1, StringRef p2) {
 #if defined _WIN32 || defined _WIN64
   // Note that Windows supports both forward and backslash path separators.
@@ -2301,7 +2311,7 @@ inline std::string path_join(StringRef p1, StringRef p2) {
 #else
   const char* sep = "/";
 #endif
-  if (p1.size() && p2.size() && std::strchr(sep, p2[0])) {
+  if (p1.size() && p2.size() && path_is_absolute(p2)) {
     return {};  // Error, cannot join to absolute path
   }
   std::string result;
@@ -3366,6 +3376,15 @@ inline void add_default_device_flag_if_not_specified(OptionsVec* options) {
   }
 }
 
+inline void add_no_source_include_flag_if_not_specified(OptionsVec* options) {
+  // This prevents NVRTC's preprocessor from automatically using the current
+  // working directory as an include path. We need to do this because we must
+  // find all includes ourselves so that we can patch them etc.
+  if (options->find({"--no-source-include", "-no-source-include"}).empty()) {
+    options->emplace_back("-no-source-include");
+  }
+}
+
 // Demangles nested variable names using the PTX name mangling scheme
 // (which mostly follows the Itanium64 ABI). E.g., _ZN1a3Foo2bcE -> a::Foo::bc.
 inline std::string demangle_ptx_variable_name(const char* mangled_name) {
@@ -3463,13 +3482,12 @@ inline void find_lowered_global_variables(StringRef ptx,
 
 inline bool ptx_remove_unused_globals(std::string* ptx);  // Defined below
 
-// Returns false on error.
 // Sets *error on failure if provided.
 // Sets *log if provided.
 // Sets *ptx on success if provided.
 // Adds one entry to *lowered_name_map for each entry in name_expressions as
 //   well as any global definitions found in the generated PTX.
-inline bool compile_program(
+inline nvrtcResult compile_program(
     const std::string& name, const std::string& source,
     const StringMap& header_sources, const OptionsVec& options,
     std::string* error = nullptr, std::string* log = nullptr,
@@ -3478,7 +3496,7 @@ inline bool compile_program(
     StringMap* lowered_name_map = nullptr, bool remove_unused_globals = false) {
   if (!nvrtc()) {
     if (error) *error = nvrtc().error();
-    return false;
+    return NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
   }
 
   std::vector<const char*> header_names_c;
@@ -3509,7 +3527,7 @@ inline bool compile_program(
     nvrtcResult jitify_nvrtc_ret = call;                              \
     if (jitify_nvrtc_ret != NVRTC_SUCCESS) {                          \
       if (error) *error = nvrtc().GetErrorString()(jitify_nvrtc_ret); \
-      return false;                                                   \
+      return jitify_nvrtc_ret;                                        \
     }                                                                 \
   } while (0)
 
@@ -3592,7 +3610,7 @@ inline bool compile_program(
   }
 
 #undef JITIFY_CHECK_NVRTC
-  return true;
+  return NVRTC_SUCCESS;
 }
 
 inline StringVec split_string(std::string str, long maxsplit = -1,
@@ -3743,10 +3761,10 @@ inline CompiledProgram CompiledProgram::compile(
       {"-remove-unused-globals", "--remove-unused-globals"});
   std::string log, ptx, cubin, nvvm;
   StringMap lowered_name_map;
-  if (!detail::compile_program(name, source, header_sources, compiler_options,
-                               &error, &log, &ptx, &cubin, &nvvm,
-                               name_expressions, &lowered_name_map,
-                               should_remove_unused_globals)) {
+  if (detail::compile_program(name, source, header_sources, compiler_options,
+                              &error, &log, &ptx, &cubin, &nvvm,
+                              name_expressions, &lowered_name_map,
+                              should_remove_unused_globals)) {
     std::string options_str = detail::string_join(
         compiler_options, " ", "Compiler options: \"", "\"\n");
     std::vector<std::string> header_names;
@@ -3986,7 +4004,20 @@ class PreprocessedProgramData
   }
 };
 
-using FileCallback = std::function<bool(const std::string&, std::string*)>;
+namespace parser {
+
+class IncludeName;
+
+}  // namespace parser
+
+using parser::IncludeName;  // Pull into main namespace
+
+using HeaderCallback =
+    std::function<bool(const parser::IncludeName&, std::string*)>;
+
+// TODO: Mark with deprecated attribute.
+// Deprecated, use HeaderCallback instead.
+using FileCallback = HeaderCallback;
 
 class PreprocessedProgram
     : public detail::FallibleObjectBase<PreprocessedProgram,
@@ -3999,11 +4030,10 @@ class PreprocessedProgram
 
  public:
   /*! \see ProgramData::preprocess */
-  static PreprocessedProgram preprocess(std::string name, std::string source,
-                                        StringMap header_sources = {},
-                                        OptionsVec compiler_options = {},
-                                        OptionsVec linker_options = {},
-                                        FileCallback header_callback = nullptr);
+  static PreprocessedProgram preprocess(
+      std::string program_name, std::string program_source,
+      StringMap header_sources = {}, OptionsVec compiler_options = {},
+      OptionsVec linker_options = {}, HeaderCallback header_callback = nullptr);
 };
 
 namespace detail {
@@ -5567,102 +5597,6 @@ static const StringMap& get_jitsafe_headers_map() {
   return jitsafe_headers_map;
 }
 
-inline bool extract_include_info_from_compile_error(const std::string& log,
-                                                    std::string* name,
-                                                    std::string* parent,
-                                                    int* line_num) {
-  static const StringVec pattern = {"could not open source file \"",
-                                    "cannot open source file \""};
-  for (auto& p : pattern) {
-    size_t beg = log.find(p);
-    if (beg != std::string::npos) {
-      beg += p.size();
-      size_t end = log.find("\"", beg);
-      *name = log.substr(beg, end - beg);
-
-      size_t line_beg = log.rfind("\n", beg);
-      if (line_beg == std::string::npos) {
-        line_beg = 0;
-      } else {
-        line_beg += 1;
-      }
-
-      size_t split = log.find("(", line_beg);
-      *parent = log.substr(line_beg, split - line_beg);
-      *line_num = std::atoi(
-          log.substr(split + 1, log.find(")", split + 1) - (split + 1))
-              .c_str());
-
-      return true;
-    }
-  }
-  return false;
-}
-
-// Returns the offset of the beginning of the specified line, taking into
-// account any #line directives.
-// TODO: It's not clear what this should do when there is a #line directive
-// that skips lines (e.g., line_num = 2 and there is a '#line 1' several lines
-// into the source, resulting in an ambiguity).
-inline size_t find_source_line(StringRef source, int line_num) {
-  // TODO: This is not robust to `#line` inside comments, strings etc.
-  size_t beg = 0;
-  // HACK: This is a WAR for the ambiguity introduced by jitify's include guard
-  // that is 2 lines followed by '#line 1', when line_num <= 3.
-  if (startswith(source, "#ifndef JITIFY_INCLUDE_GUARD_")) {
-    beg = source.find("#line 1\n") + 8;
-  }
-  for (int i = 1; i < line_num; ++i) {
-    beg = source.find_first_of("\n#", beg);
-    if (beg == std::string::npos) return beg;
-    if (source[beg] == '#' && source.substr(beg, 5) == "#line" &&
-        std::isspace((unsigned char)source[beg + 5])) {
-      // Found a #line directive, parse it and reset the line numbering.
-      beg += 5;
-      while (std::isspace((unsigned char)source[++beg]))
-        ;
-      size_t num_beg = beg;
-      while (std::isdigit((unsigned char)source[++beg]))
-        ;
-      size_t num_end = beg;
-      int num = std::atoi(
-          std::string(source.substr(num_beg, num_end - num_beg)).c_str());
-      i = num - 1;
-      beg = source.find_first_of("\n", beg);
-      if (beg == std::string::npos) return beg;
-    } else if (source[beg] == '#') {
-      // This was just some other # token, don't count it as a new line.
-      --i;
-    }
-    ++beg;
-  }
-  return beg;
-}
-
-inline bool is_include_directive_with_quotes(StringRef source, int line_num,
-                                             std::string* error = nullptr) {
-  // TODO: This implementation does not handle things like
-  // "#define INC <foo>\n #include INC", which Thrust does in some headers.
-  size_t beg = find_source_line(source, line_num);
-  if (beg == std::string::npos) {
-    if (error) *error = "EOF reached before source line was found";
-    return false;
-  }
-  // TODO: This is not robust to inline comments, strings etc.
-  beg = source.find("include", beg);
-  if (beg == std::string::npos) {
-    if (error) *error = "Line does not contain 'include'";
-    return false;
-  }
-  beg += 7;
-  beg = source.find_first_of("\"<", beg);
-  if (beg == std::string::npos) {
-    if (error) *error = "Did not find expected '\"' or '<' character";
-    return false;
-  }
-  return source[beg] == '"';
-}
-
 // Elides "/." and "/.." tokens from path. Returns empty string if illformed.
 inline std::string path_simplify(StringRef path) {
 #if defined _WIN32 || defined _WIN64
@@ -5735,152 +5669,696 @@ inline bool read_text_file(const std::string& fullpath, std::string* content) {
   return true;
 }
 
-static const char* const kJitifyBuiltinHeaderPrefix = "__jitify_builtin";
-static const char* const kJitifyCallbackHeaderPrefix = "__jitify_callback";
+// Prepends the current executable dir (instead of the current working dir,
+// which is the implicit default) to relative paths. This is expected to be more
+// useful than the default because it allows referencing headers that are
+// shipped with the application independent of the current working directory.
+inline std::string expand_include_path(std::string path) {
+  if (path.empty()) return "";
+  if (!path_is_absolute(path)) {
+    path = path_join(path_base(get_current_executable_path()), path);
+  }
+  // TODO: Consider also expanding "$FOO" and "${FOO}" as environment variables.
+  return path;
+}
 
-// Searches for the specified header and loads its contents into *source and its
-// full path into *fullpath. Returns false if not found.
-inline bool load_header_impl(const std::string& filename,
-                             const StringVec& include_paths,
-                             StringRef current_dir, bool search_current_dir,
-                             bool search_builtin_headers,
-                             FileCallback header_callback, std::string* source,
-                             std::string* fullpath) {
-  // Try loading from header callback.
-  if (header_callback) {
-    *fullpath = path_join(kJitifyCallbackHeaderPrefix, filename);
-    if (header_callback(filename, source)) return true;
-  }
-  // Try loading from filesystem.
-  if (search_current_dir) {
-    *fullpath = path_join(current_dir, filename);
-    if (read_text_file(*fullpath, source)) return true;
-  }
-  // Search include directories.
-  for (const std::string& include_path : include_paths) {
-    *fullpath = path_join(include_path, filename);
-    if (read_text_file(*fullpath, source)) return true;
+inline void extract_include_paths(OptionsVec* options,
+                                  StringVec* include_paths) {
+  const std::vector<int> idxs = options->find({"-I"});
+  for (int i = (int)idxs.size() - 1; i >= 0; --i) {
+    const int idx = idxs[i];
+    std::string include_path = (*options)[idx].value();
+    include_path = expand_include_path(std::move(include_path));
+    include_paths->push_back(std::move(include_path));
+    options->erase(idx);
   }
-  // Try loading from builtin headers.
-  if (search_builtin_headers) {
-    *fullpath = path_join(kJitifyBuiltinHeaderPrefix, filename);
-    auto iter = get_jitsafe_headers_map().find(filename);
-    if (iter != get_jitsafe_headers_map().end()) {
-      *source = iter->second;
-      return true;
+}
+
+// Replaces forward and backward slashes with '|'.
+inline std::string sanitize_slashes(std::string s) {
+  for (std::string::iterator it = s.begin(); it != s.end(); ++it) {
+    if (*it == '\\' || *it == '/') {
+      *it = '|';
     }
   }
-  return false;
+  return s;
 }
 
-enum class HeaderLoadStatus {
-  FAILED = 0,
-  ALREADY_LOADED = 1,
-  NEWLY_LOADED = 2,
+// Note: When acting as a reference, this behaves like a raw pointer, so the
+// referenced value must outlive this class. Caution is advised.
+template <typename T>
+class ValueOrRef {
+ public:
+  using value_type = T;
+  using reference = T&;
+  using const_reference = const T&;
+  using pointer = T*;
+
+  ValueOrRef() = default;
+  // Construct as value. Allow implicit conversions.
+  ValueOrRef(value_type _val) : val_(std::move(_val)) {}
+  // Construct as reference.
+  explicit ValueOrRef(pointer _ref) : ref_(_ref) {}
+
+  // Implicit conversion to reference.
+  operator const_reference() const { return ref_ ? *ref_ : val_; }
+  operator reference() { return ref_ ? *ref_ : val_; }
+
+  void copy_to_and_reference(T* dst) {
+    *dst = ref_ ? *ref_ : std::move(val_);
+    ref_ = dst;
+  }
+
+ private:
+  value_type val_;
+  pointer ref_ = nullptr;
 };
 
-// Searches for the specified header and adds its contents to *sources and its
-// simplified full path to *fullpaths (if provided). Returns 0 if not found, -1
-// if alreay found, or 1 if successfully loaded.
-inline HeaderLoadStatus load_header(
-    const std::string& filename, const StringVec& include_paths,
-    StringRef current_dir, bool search_current_dir, bool search_builtin_headers,
-    FileCallback header_callback, StringMap* sources, StringMap* fullpaths) {
-  if (sources->count(filename)) {
-    return HeaderLoadStatus::ALREADY_LOADED;
-  }
-  std::string source, fullpath;
-  if (!load_header_impl(filename, include_paths, current_dir,
-                        search_current_dir, search_builtin_headers,
-                        header_callback, &source, &fullpath)) {
-    return HeaderLoadStatus::FAILED;
-  }
-  sources->emplace(filename, source);
-  if (fullpaths) {
-    // Record the full file path corresponding to this include name.
-    fullpaths->emplace(filename, path_simplify(fullpath));
-  }
-  return HeaderLoadStatus::NEWLY_LOADED;
-}
-
-// Replaces std with cuda::std so that the jit-safe libcudacxx implementations
-// are used instead of the unsafe standard implementations.
-inline std::string replace_std_with_cuda_std(std::string source) {
-  static const std::regex re_qualified_name(
-      R"(::cuda::std::|\bcuda::std::|::std::|\bstd::)", std::regex::optimize);
-  // TODO: This isn't safe because it might already be ns cuda { ns std { } }.
-  // static const std::regex re_namespace(R"(\bnamespace\s+std\s*\{)",
-  //                                     std::regex::optimize);
-  source = std::regex_replace(source, re_qualified_name, "::cuda::std::");
-  // source = std::regex_replace(source, re_namespace, "namespace cuda::std {");
-  return source;
-}
-
-// Helper class for basic lexing of C++ source code.
-class CppLexer {
-  const char* current_;
+using StringOrRef = ValueOrRef<std::string>;
+
+}  // namespace detail
+
+namespace parser {
+
+// This includes whitespace and comment tokens so that it forms a lossless
+// representation of the original source.
+class Token {
+ public:
+  enum class Type : int {
+    kInvalid,
+    kLParen,          // (
+    kRParen,          // )
+    kLBracket,        // [ <: (if not followed by :: or :>)
+    kRBracket,        // ] :>
+    kLBrace,          // { <%
+    kRBrace,          // } %>
+    kDot,             // .
+    kDotStar,         // .*
+    kArrow,           // ->
+    kArrowStar,       // ->*
+    kComma,           // ,
+    kPlus,            // +
+    kPlusPlus,        // ++
+    kPlusEq,          // +=
+    kMinus,           // -
+    kMinusMinus,      // --
+    kMinusEq,         // -=
+    kStar,            // *
+    kStarEq,          // *=
+    kSlash,           // /
+    kSlashEq,         // /=
+    kPercent,         // %
+    kPercentEq,       // %=
+    kQuestion,        // ?
+    kColon,           // :
+    kColonColon,      // ::
+    kAmp,             // &
+    kAmpAmp,          // &&
+    kAmpEq,           // &=
+    kBar,             // |
+    kBarBar,          // ||
+    kBarEq,           // |=
+    kCaret,           // ^
+    kCaretEq,         // ^=
+    kTilde,           // ~
+    kEq,              // =
+    kEqEq,            // ==
+    kBang,            // !
+    kBangEq,          // !=
+    kLt,              // <
+    kLtLt,            // <<
+    kLtEq,            // <=
+    kLtLtEq,          // <<=
+    kGt,              // >
+    kGtGt,            // >>
+    kGtEq,            // >=
+    kGtGtEq,          // >>=
+    kHash,            // # %:
+    kHashHash,        // ## %:%:
+    kSemicolon,       // ;
+    kEndOfDirective,  // Newline at end of a preprocessor directive
+    kWhitespace,      // Any sequence of whitespace
+    kNumber,          // Anything beginning with a digit
+    kString,          // "abc" (or <abc> after a #include directive)
+    kRawString,       // R"delim(abc)delim"
+    kCharacter,       // 'c' (possibly prefixed)
+    kIdentifier,      // abc_def
+    kKeyword,         // class, using, not, etc. (excludes preproc directives)
+    kComment,         // // or /**/ comment
+    kEndOfFile,       // The end of the file
+    kNumTokenTypes
+  };
+
+  // This is useful for debugging.
+  friend std::string to_string(Type token_type) {
+#define JITIFY_DEFINE_TOKEN_CASE(type) \
+  case Type::type:                     \
+    return #type
+
+    switch (token_type) {
+      JITIFY_DEFINE_TOKEN_CASE(kInvalid);
+      JITIFY_DEFINE_TOKEN_CASE(kLParen);
+      JITIFY_DEFINE_TOKEN_CASE(kRParen);
+      JITIFY_DEFINE_TOKEN_CASE(kLBracket);
+      JITIFY_DEFINE_TOKEN_CASE(kRBracket);
+      JITIFY_DEFINE_TOKEN_CASE(kLBrace);
+      JITIFY_DEFINE_TOKEN_CASE(kRBrace);
+      JITIFY_DEFINE_TOKEN_CASE(kDot);
+      JITIFY_DEFINE_TOKEN_CASE(kDotStar);
+      JITIFY_DEFINE_TOKEN_CASE(kArrow);
+      JITIFY_DEFINE_TOKEN_CASE(kArrowStar);
+      JITIFY_DEFINE_TOKEN_CASE(kComma);
+      JITIFY_DEFINE_TOKEN_CASE(kPlus);
+      JITIFY_DEFINE_TOKEN_CASE(kPlusPlus);
+      JITIFY_DEFINE_TOKEN_CASE(kPlusEq);
+      JITIFY_DEFINE_TOKEN_CASE(kMinus);
+      JITIFY_DEFINE_TOKEN_CASE(kMinusMinus);
+      JITIFY_DEFINE_TOKEN_CASE(kMinusEq);
+      JITIFY_DEFINE_TOKEN_CASE(kStar);
+      JITIFY_DEFINE_TOKEN_CASE(kStarEq);
+      JITIFY_DEFINE_TOKEN_CASE(kSlash);
+      JITIFY_DEFINE_TOKEN_CASE(kSlashEq);
+      JITIFY_DEFINE_TOKEN_CASE(kPercent);
+      JITIFY_DEFINE_TOKEN_CASE(kPercentEq);
+      JITIFY_DEFINE_TOKEN_CASE(kQuestion);
+      JITIFY_DEFINE_TOKEN_CASE(kColon);
+      JITIFY_DEFINE_TOKEN_CASE(kColonColon);
+      JITIFY_DEFINE_TOKEN_CASE(kAmp);
+      JITIFY_DEFINE_TOKEN_CASE(kAmpAmp);
+      JITIFY_DEFINE_TOKEN_CASE(kAmpEq);
+      JITIFY_DEFINE_TOKEN_CASE(kBar);
+      JITIFY_DEFINE_TOKEN_CASE(kBarBar);
+      JITIFY_DEFINE_TOKEN_CASE(kBarEq);
+      JITIFY_DEFINE_TOKEN_CASE(kCaret);
+      JITIFY_DEFINE_TOKEN_CASE(kCaretEq);
+      JITIFY_DEFINE_TOKEN_CASE(kTilde);
+      JITIFY_DEFINE_TOKEN_CASE(kEq);
+      JITIFY_DEFINE_TOKEN_CASE(kEqEq);
+      JITIFY_DEFINE_TOKEN_CASE(kBang);
+      JITIFY_DEFINE_TOKEN_CASE(kBangEq);
+      JITIFY_DEFINE_TOKEN_CASE(kLt);
+      JITIFY_DEFINE_TOKEN_CASE(kLtLt);
+      JITIFY_DEFINE_TOKEN_CASE(kLtEq);
+      JITIFY_DEFINE_TOKEN_CASE(kLtLtEq);
+      JITIFY_DEFINE_TOKEN_CASE(kGt);
+      JITIFY_DEFINE_TOKEN_CASE(kGtGt);
+      JITIFY_DEFINE_TOKEN_CASE(kGtEq);
+      JITIFY_DEFINE_TOKEN_CASE(kGtGtEq);
+      JITIFY_DEFINE_TOKEN_CASE(kHash);
+      JITIFY_DEFINE_TOKEN_CASE(kHashHash);
+      JITIFY_DEFINE_TOKEN_CASE(kSemicolon);
+      JITIFY_DEFINE_TOKEN_CASE(kEndOfDirective);
+      JITIFY_DEFINE_TOKEN_CASE(kWhitespace);
+      JITIFY_DEFINE_TOKEN_CASE(kNumber);
+      JITIFY_DEFINE_TOKEN_CASE(kString);
+      JITIFY_DEFINE_TOKEN_CASE(kRawString);
+      JITIFY_DEFINE_TOKEN_CASE(kCharacter);
+      JITIFY_DEFINE_TOKEN_CASE(kIdentifier);
+      JITIFY_DEFINE_TOKEN_CASE(kKeyword);
+      JITIFY_DEFINE_TOKEN_CASE(kComment);
+      JITIFY_DEFINE_TOKEN_CASE(kEndOfFile);
+      JITIFY_DEFINE_TOKEN_CASE(kNumTokenTypes);
+    }
+#undef JITIFY_DEFINE_TOKEN_CASE
+    return "<unknown>";
+  }
+
+  friend std::ostream& operator<<(std::ostream& stream, Type token_type) {
+    return stream << to_string(token_type);
+  }
+
+  friend std::ostream& operator<<(std::ostream& stream, const Token& token) {
+    return stream << token.type() << "(" << token.token_string() << ")";
+  }
+
+  static bool TypeIsValid(Token::Type token_type) {
+    return token_type != Token::Type::kInvalid &&
+           token_type != Token::Type::kEndOfFile;
+  }
+
+  // Efficiently represents a set of token types.
+  class TypeSet {
+   public:
+    constexpr TypeSet() : data_(0) {}
+
+    template <typename... TokenTypes>
+    constexpr TypeSet(Type token_type0, TokenTypes... token_types)
+        : TypeSet(TypeSet(uint64_t(1) << static_cast<int>(token_type0)) |
+                  TypeSet(token_types...)) {}
+
+    // Tests if token_type is in the set.
+    constexpr bool count(Type token_type) const {
+      return data_ & (uint64_t(1) << static_cast<int>(token_type));
+    }
+
+    // Combine sets.
+    friend constexpr TypeSet operator|(TypeSet lhs, TypeSet rhs) {
+      return TypeSet(lhs.data_ | rhs.data_);
+    }
+    friend constexpr TypeSet operator&(TypeSet lhs, TypeSet rhs) {
+      return TypeSet(lhs.data_ & rhs.data_);
+    }
+
+   private:
+    constexpr explicit TypeSet(uint64_t _data) : data_(_data) {}
+
+    uint64_t data_;
+    static_assert(static_cast<size_t>(Type::kNumTokenTypes) <=
+                      sizeof(data_) * 8,
+                  "Too many token types to fit in 64-bit set!");
+  };
+
+  Token() = default;
+  Token(Type _type, const char* _begin, const char* _end,
+        std::string _token_string = {})
+      : begin_(_begin),
+        size_(static_cast<uint32_t>(_end - _begin)),
+        type_(_type),
+        token_string_(std::move(_token_string)) {}
+  Token(Type _type, std::string _token_string)
+      : type_(_type), token_string_(std::move(_token_string)) {}
+
+  const char* begin() const { return begin_; }
+  const char* end() const { return begin_ + size_; }
+  Type type() const { return type_; }
+
+  explicit operator bool() const { return TypeIsValid(type_); }
+
+  friend bool operator==(const Token& lhs, const Token& rhs) {
+    return lhs.type_ == rhs.type_ && lhs.begin_ == rhs.begin_ &&
+           lhs.size_ == rhs.size_ && lhs.token_string_ == rhs.token_string_;
+  }
+  friend bool operator!=(const Token& lhs, const Token& rhs) {
+    return !(lhs == rhs);
+  }
+
+  template <typename... TokenTypes>
+  bool matches(TokenTypes... token_types) const {
+    return TypeSet(token_types...).count(type_);
+  }
+
+  bool matches_identifier(const char* name) const {
+    return type_ == Token::Type::kIdentifier && token_string() == name;
+  }
+
+  // Returns the number of newlines in the token's original source string.
+  // Note that any token can have escaped newlines in it.
+  int num_newlines() const {
+    const std::string source = source_string();
+    return (int)std::count(source.begin(), source.end(), '\n');
+  }
+
+  // Returns the number of newlines (excluding escaped newlines) in the token
+  // string.
+  int num_unescaped_newlines() const {
+    const std::string token = token_string();
+    return (int)std::count(token.begin(), token.end(), '\n');
+  }
+
+  std::string source_string() const {
+    return begin_ ? std::string(begin_, size_) : token_string_;
+  }
+
+  std::string token_string() const {
+    return token_string_.empty() ? source_string() : token_string_;
+  }
+
+ private:
+  // Note: begin_ and size_ delimit a span of the original source. In the
+  // simple case, that span is exactly the token string, and token_string_ is
+  // empty. If the source contains escaped newlines or if tokens have been
+  // concatenated, begin_ and size_ still describe the original source span
+  // (e.g., "foo ## b\\\nar") and token_string_ is set to the logical token
+  // string (e.g., "foobar").
+  const char* begin_ = nullptr;
+  uint32_t size_ = 0;
+  Type type_ = Type::kInvalid;
+  std::string token_string_;
+};
 
-  bool isspace(char c) const {
-    return std::isspace(static_cast<unsigned char>(c));
+// Returns true if token_string is a C++ (or CUDA) language keyword under the
+// given standard, and false otherwise. The cxx_standard_year argument is e.g.,
+// 11, 14, 17, or 20, or -1 for the latest standard including technical
+// specifications.
+inline bool is_keyword(const std::string& token_string,
+                       int cxx_standard_year = -1) {
+  static const std::unordered_set<std::string> keywords = {
+      "and",          "and_eq",      "asm",          "auto",
+      "bitand",       "bitor",       "bool",         "break",
+      "case",         "catch",       "char",         "class",
+      "compl",        "const",       "const_cast",   "continue",
+      "default",      "delete",      "do",           "double",
+      "dynamic_cast", "else",        "enum",         "explicit",
+      "export",       "extern",      "false",        "float",
+      "for",          "friend",      "goto",         "if",
+      "inline",       "int",         "long",         "mutable",
+      "namespace",    "new",         "not",          "not_eq",
+      "operator",     "or",          "or_eq",        "private",
+      "protected",    "public",      "register",     "reinterpret_cast",
+      "return",       "short",       "signed",       "sizeof",
+      "static",       "static_cast", "struct",       "switch",
+      "template",     "this",        "throw",        "true",
+      "try",          "typedef",     "typeid",       "typename",
+      "union",        "unsigned",    "using",        "virtual",
+      "void",         "volatile",    "wchar_t",      "while",
+      "xor",          "xor_eq",      "__restrict__", "__constant__",
+      "__device__",   "__global__",  "__host__",
+  };
+  static const std::unordered_set<std::string> cxx11_keywords = {
+      "alignas",  "alignof",  "char16_t", "char32_t",      "constexpr",
+      "decltype", "noexcept", "nullptr",  "static_assert", "thread_local",
+  };
+  static const std::unordered_set<std::string> cxx20_keywords = {
+      "char8_t",  "concept",   "consteval", "constinit",
+      "co_await", "co_return", "co_yield",  "requires",
+  };
+  static const std::unordered_set<std::string> ts_keywords = {
+      "atomic_cancel", "atomic_commit", "atomic_noexcept",
+      "reflexpr",      "synchronized",
+  };
+  if (cxx_standard_year == -1) {
+    cxx_standard_year = 99;
   }
+  if (keywords.count(token_string)) return true;
+  if (cxx_standard_year < 11) return false;
+  if (cxx11_keywords.count(token_string)) return true;
+  if (cxx_standard_year < 20) return false;
+  if (cxx20_keywords.count(token_string)) return true;
+  if (cxx_standard_year != 99) return false;
+  return ts_keywords.count(token_string);
+}
+
+class CppLexer {
+  class Iterator {  // Single-pass iterator over the lexer's tokens.
+   public:
+    using iterator_category = std::input_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = Token;
+    using pointer = const Token*;
+    using reference = const Token&;
+
+    Iterator() : lexer_(nullptr) {}  // End-of-stream sentinel.
+    explicit Iterator(CppLexer* _lexer)
+        : lexer_(_lexer), current_(lexer_->next()) {}  // Reads first token.
+
+    reference operator*() const { return current_; }
+    pointer operator->() { return &current_; }
+
+    Iterator& operator++() {
+      current_ = lexer_->next();  // Advances the shared lexer.
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      Iterator tmp = *this;  // Copies the token, not the lexer position.
+      ++(*this);
+      return tmp;
+    }
+
+    friend bool operator==(const Iterator& lhs, const Iterator& rhs) {
+      return (lhs.lexer_ == rhs.lexer_ && lhs.current_ == rhs.current_) ||
+             (lhs.current_.type() == Token::Type::kEndOfFile &&
+              rhs.lexer_ == nullptr) ||
+             (lhs.lexer_ == nullptr &&
+              rhs.current_.type() == Token::Type::kEndOfFile);
+    }  // I.e., a default-constructed iterator equals any iterator at EOF.
+    friend bool operator!=(const Iterator& lhs, const Iterator& rhs) {
+      return !(lhs == rhs);
+    }
+
+   private:
+    CppLexer* lexer_;     // Null for the end sentinel.
+    value_type current_;  // Most recently lexed token.
+  };
 
  public:
-  CppLexer(const char* str) : current_(str) {}
-  const char* current() const { return current_; }
-  char advance() { return *current_++; }
-  void skip(int n) { current_ += n; }
-  char peek(int i = 0) const { return *(current_ + i); }
-  bool match(char c) { return peek() == c ? advance() : false; }
-  bool match(const char* s) {
+  using iterator = Iterator;
+
+  template <typename Container>
+  static Container tokenize(const char* source, int _cxx_standard_year = -1) {
+    CppLexer lexer(source, _cxx_standard_year);  // Tokens point into source.
+    Container result;  // Must support push_back(const Token&).
+    for (const Token& token : lexer) {  // Stops at (and omits) kEndOfFile.
+      result.push_back(token);
+    }
+    return result;
+  }
+
+  CppLexer(const char* source, int _cxx_standard_year = -1)
+      : current_(source), cxx_standard_year_(_cxx_standard_year) {}
+
+  iterator begin() { return iterator(this); }
+  iterator end() { return iterator(); }
+
+  Token next() {  // Scans and returns the next token (kEndOfFile at the end).
+    using Tt = Token::Type;
+    token_start_ = current_;
+    char c = advance();
+    // clang-format off
+    switch (c) {
+      case '\0': return token(Tt::kEndOfFile);
+      // Only reached for an escaped newline at the very start of the source
+      // (advance() skips all others) or for a stray backslash (kInvalid).
+      case '\\': return token(match('\n') ? Tt::kWhitespace : Tt::kInvalid);
+      case '\n': return in_directive_
+                     ? (in_directive_ = false, in_include_directive_ = false,
+                        token(Tt::kEndOfDirective))
+                     : whitespace();
+      case '(': return token(Tt::kLParen);
+      case ')': return token(Tt::kRParen);
+      case '[': return token(Tt::kLBracket);
+      case ']': return token(Tt::kRBracket);
+      case '<':  // Note: "<::" is '<' then "::" unless ':' or '>' follows.
+        return in_include_directive_
+                   ? angle_include()
+                   : token(((peek_match(":") && !peek_match("::")) ||
+                            peek_match(":::") || peek_match("::>"))
+                               ? (match(':'), Tt::kLBracket)
+                               : match('%')
+                                     ? Tt::kLBrace
+                                     : match('<')
+                                           ? match('=') ? Tt::kLtLtEq
+                                                        : Tt::kLtLt
+                                           : match('=') ? Tt::kLtEq : Tt::kLt);
+      case '>':  // Note: This does not distinguish template close vs. bitshift
+        return token(match('>') ? match('=') ? Tt::kGtGtEq : Tt::kGtGt
+                                : match('=') ? Tt::kGtEq : Tt::kGt);
+      case ':': return token(match('>')
+                           ? Tt::kRBracket
+                           : match(':') ? Tt::kColonColon : Tt::kColon);
+      case '{': return token(Tt::kLBrace);
+      case '}': return token(Tt::kRBrace);
+      case '%':  // Also the start of the digraphs %> (}), %: (#), %:%: (##).
+        return token(match('>')
+                         ? Tt::kRBrace
+                         : match(':')
+                               // TODO: Probably need to do the in_directive_
+                               // etc. logic here too.
+                               ? match("%:") ? Tt::kHashHash : Tt::kHash
+                               : match('=') ? Tt::kPercentEq : Tt::kPercent);
+      case '.': return token(match('*') ? Tt::kDotStar : Tt::kDot);
+      case '-': return token(match('>') ? match('*') ? Tt::kArrowStar
+                                                     : Tt::kArrow
+                                        : match('-') ? Tt::kMinusMinus
+                                                     : match('=') ? Tt::kMinusEq
+                                                                  : Tt::kMinus);
+
+      case ',': return token(Tt::kComma);
+      case '+': return token(match('+') ? Tt::kPlusPlus
+                                        : match('=') ? Tt::kPlusEq
+                                                     : Tt::kPlus);
+      case '*': return token(match('=') ? Tt::kStarEq : Tt::kStar);
+      case '/': return match('/')
+                           ? line_comment()
+                           : match('*') ? block_comment()
+                                        : token(match('=') ? Tt::kSlashEq
+                                                           : Tt::kSlash);
+      // Note: Trigraphs not supported.
+      case '?': return token(Tt::kQuestion);
+      case '&': return token(match('&') ? Tt::kAmpAmp
+                                        : match('=') ? Tt::kAmpEq : Tt::kAmp);
+      case '|': return token(match('|') ? Tt::kBarBar
+                                        : match('=') ? Tt::kBarEq : Tt::kBar);
+      case '^': return token(match('=') ? Tt::kCaretEq : Tt::kCaret);
+      case '~': return token(Tt::kTilde);
+      case '=': return token(match('=') ? Tt::kEqEq : Tt::kEq);
+      case '!': return token(match('=') ? Tt::kBangEq : Tt::kBang);
+      case '#':
+        return token(match('#') ? Tt::kHashHash
+                                : (is_start_of_directive_ = !in_directive_,
+                                   in_directive_ = true,
+                                   Tt::kHash));
+      case '\'': return character();
+      case '"': return in_include_directive_ ? quote_include() : string();
+      case 'u': match('8');  // Consumes the 8 of a u8 prefix, if present.
+        // fall-through
+        [[gnu::fallthrough]];  // Not sure why gcc complains here without this
+      case 'L':
+        // fall-through
+      case 'U':
+        return match('\'')
+                   ? character()
+                   : match('"') ? string()
+                                : match("R\"") ? raw_string() : identifier();
+      case 'R': return match('"') ? raw_string() : identifier();
+      case ';': return token(Tt::kSemicolon);
+      default:
+        if (is_space(c)) return in_directive_ ? whitespace_except_newlines()
+                                              : whitespace();
+        if (is_digit(c)) return number();
+        if (is_alpha(c)) return identifier();
+    }
+    // clang-format on
+    return token(Tt::kInvalid);  // Unrecognized character.
+  }
+
+ private:
+  bool is_space_except_newline(char c) const {
+    return c == ' ' || c == '\f' || c == '\r' || c == '\t' || c == '\v';
+  }
+  bool is_space(char c) const {
+    // Note: std::isspace is locale-dependent.
+    return is_space_except_newline(c) || c == '\n';
+  }
+  bool is_digit(char c) const { return c >= '0' && c <= '9'; }
+  bool is_alpha(char c) const {
+    // Note: std::isalpha is locale-dependent.
+    //   Also, implementations may accept additional alphabet characters (e.g.,
+    //   MSVC accepts '$', and clang accepts things like Greek alphabet unicode
+    //   chars).
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' ||
+           c == '$';
+  }
+  bool is_alnum(char c) const { return is_alpha(c) || is_digit(c); }
+
+  bool contains_escaped_newlines(const char* begin, const char* end) const {
+    for (const char* ptr = begin; ptr != end; ++ptr) {
+      if (ptr[0] == '\\' && ptr[1] == '\n') return true;
+    }
+    return false;
+  }
+  std::string without_escaped_newlines(const char* begin,
+                                       const char* end) const {
+    std::string result;
+    result.reserve(end - begin);
+    for (const char* ptr = begin; ptr != end; ++ptr) {
+      if (ptr[0] == '\\' && ptr[1] == '\n') {
+        ++ptr;
+      } else {
+        result.push_back(*ptr);
+      }
+    }
+    return result;
+  }
+
+  const char* skip_escaped_newlines(const char* ptr) const {
+    while (*ptr == '\\' && *(ptr + 1) == '\n') ptr += 2;
+    return ptr;
+  }
+  const char* reverse_skip_escaped_newlines(const char* ptr) const {
+    while (*ptr == '\n' && *(ptr - 1) == '\\') ptr -= 2;  // Reads ptr[-1].
+    return ptr;  // NOTE(review): no start-of-buffer check is made here.
+  }
+  const char* advance_by(const char* ptr, int n) const {
+    if (n == 0) return ptr;
+    bool reverse = n < 0;
+    n = reverse ? -n : n;
+    // Skip over escaped newlines (which can appear anywhere, even in the middle
+    // of tokens).
+    for (int i = 0; i < n; ++i) {
+      ptr = reverse ? reverse_skip_escaped_newlines(ptr - 1)
+                    : skip_escaped_newlines(ptr + 1);
+    }
+    return ptr;
+  }
+
+  char advance() {
+    char ret = *current_;
+    // Only advance if this isn't the end of the string.
+    if (ret) current_ = advance_by(current_, 1);
+    return ret;
+  }
+  char peek(int i = 0) const { return *advance_by(current_, i); }
+  int peek_match(const char* s) {
     int i;
     for (i = 0; s[i]; ++i) {
-      if (!peek(i) || peek(i) != s[i]) return false;
+      if (!peek(i) || peek(i) != s[i]) return 0;
     }
-    current_ += i;
+    return i;
+  }
+  bool match(char c) { return peek() == c && advance(); }
+  bool match(const char* s) {
+    int n = peek_match(s);
+    if (!n) return false;
+    current_ = advance_by(current_, n);
     return true;
   }
-  bool match_whitespace() {
-    // Includes line continuations.
-    return (isspace(peek()) || (peek() == '\\' && peek(1) == '\n')) ? advance()
-                                                                    : false;
+  bool match_literal_suffix() {
+    if (!is_alpha(peek())) return false;
+    advance();
+    while (is_alnum(peek())) advance();
+    return true;
   }
-  const char* whitespace() {
-    while (match_whitespace()) {
+  Token escapable_char_delimited_span(char delim, Token::Type token_type,
+                                      bool include_suffix = true,
+                                      bool enable_escapes = true) {
+    bool in_escape = false;
+    // Note: We stop if we reach an unescaped newline because it's a syntax
+    // error and we don't want to run on into the next line.
+    while (peek() && ((peek() != delim && peek() != '\n') || in_escape)) {
+      in_escape = enable_escapes && !in_escape && peek() == '\\';
+      advance();
     }
-    // while (isspace(peek()) || (peek() == '\\' && peek(1) == '\n')) advance();
-    return current_;
-  }
-  const char* escapable_char_delimited_span(char delim) {
-    while (peek() && (peek() != delim || peek(-1) == '\\')) advance();
-    if (peek() == delim) {
-      skip(1);
-    } else {
-      // Error, unexpected end of string.
-    }
-    return current_;
-  }
-  // Excludes the ending newline char.
-  const char* line() { return escapable_char_delimited_span('\n') - 1; }
-  // These all include the ending delimiter chars.
-  const char* string_literal() { return escapable_char_delimited_span('"'); }
-  const char* char_literal() { return escapable_char_delimited_span('\''); }
-  const char* delimited_span(const char* delim, int delim_size) {
-    auto peek_equals_delimiter = [&] {
-      for (int i = 0; i < delim_size; ++i) {
-        if (peek(i) != delim[i]) return false;
+    // Include the ending delimiter.
+    if (match(delim) && include_suffix) {
+      // Include literal suffix.
+      match_literal_suffix();
+    }
+    return token(token_type);
+  }
+
+  // Constructs a token to represent the current part of the source.
+  Token token(Token::Type type) const {
+    std::string token_string;
+    // If the source string contains escaped newlines, we remove them to
+    // construct a clean token string.
+    if (type == Token::Type::kRawString) {
+      // Special processing for raw strings because we must preserve escaped
+      // newlines inside them.
+      // TODO: Check how escaped newlines inside the delimiters should be
+      // handled.
+      const char* first_quotes = token_start_;
+      while (first_quotes != current_ && first_quotes[0] != '"') ++first_quotes;
+      if (contains_escaped_newlines(token_start_, first_quotes)) {
+        token_string = without_escaped_newlines(token_start_, first_quotes) +
+                       std::string(first_quotes, current_);
       }
-      return true;
-    };
-    while (peek() && !peek_equals_delimiter()) advance();
-    if (peek() == delim[0]) {
-      skip(delim_size);
     } else {
-      // Error, unexpected end of string.
+      if (contains_escaped_newlines(token_start_, current_)) {
+        token_string = without_escaped_newlines(token_start_, current_);
+      }
     }
-    return current_;
+    return Token(type, token_start_, current_, std::move(token_string));
   }
-  const char* block_comment() { return delimited_span("*/", 2); }
-  const char* raw_string_literal() {
+
+  Token whitespace() {
+    while (is_space(peek())) {
+      advance();
+    }
+    return token(Token::Type::kWhitespace);
+  }
+  Token whitespace_except_newlines() {
+    while (is_space_except_newline(peek())) advance();
+    return token(Token::Type::kWhitespace);
+  }
+  Token number() {  // Note: only [A-Za-z0-9_$] chars; "1.5" is 3 tokens.
+    while (is_alnum(peek())) advance();
+    return token(Token::Type::kNumber);
+  }
+  Token string() {
+    return escapable_char_delimited_span('"', Token::Type::kString);
+  }
+  Token raw_string() {
     const char* delim_beg = current_;
     while (peek() && peek() != '(') advance();
     std::string delim;
@@ -5888,236 +6366,899 @@ class CppLexer {
     delim += ')';
     delim.append(delim_beg, current_);
     delim += '"';
-    return delimited_span(delim.c_str(), (int)delim.size());
+    while (peek() && !match(delim.c_str())) advance();
+    match_literal_suffix();
+    return token(Token::Type::kRawString);
+  }
+  Token quote_include() {
+    // Note: Strings in #include directives treat backslashes literally, not as
+    // escapes.
+    return escapable_char_delimited_span('"', Token::Type::kString, false,
+                                         false);
+  }
+  Token angle_include() {
+    // Note: Strings in #include directives treat backslashes literally, not as
+    // escapes.
+    return escapable_char_delimited_span('>', Token::Type::kString, false,
+                                         false);
+  }
+  Token character() {
+    return escapable_char_delimited_span('\'', Token::Type::kCharacter);
+  }
+  Token identifier() {  // Lexes an identifier or keyword token.
+    while (is_alnum(peek())) advance();
+    Token result = token(Token::Type::kIdentifier);
+    // A directive name (e.g., #if) is never converted to a keyword. The flag
+    // is always cleared here so that a '#' not followed by a name (e.g., a
+    // null directive) cannot suppress keyword detection on later lines.
+    const bool is_directive_name = in_directive_ && is_start_of_directive_;
+    is_start_of_directive_ = false;
+    if (is_directive_name) {
+      if (result.token_string() == "include") in_include_directive_ = true;
+    } else if (is_keyword(result.token_string(), cxx_standard_year_)) {
+      result = Token(Token::Type::kKeyword, result.begin(), result.end(),
+                     result.token_string());
+    }
+    return result;
+  }
+  Token line_comment() {
+    // Excludes the newline.
+    while (peek() && peek() != '\n') advance();
+    return token(Token::Type::kComment);
+  }
+  Token block_comment() {
+    while (peek() && !match("*/")) advance();
+    return token(Token::Type::kComment);
   }
+
+  const char* current_;
+  int cxx_standard_year_;
+  const char* token_start_;
+  bool in_directive_ = false;
+  bool is_start_of_directive_ = false;
+  bool in_include_directive_ = false;
 };
 
-inline bool find_pragma_once(const std::string& source, size_t* begin_ptr,
-                             size_t* end_ptr) {
-  // Match string literals, comments (/), and preprocessor directives (#).
-  const char* match_chars = "\"'R/#";
-  size_t pos = 0;
-  while ((pos = source.find_first_of(match_chars, pos)) != std::string::npos) {
-    const char* beg = source.c_str() + pos;
-    CppLexer lexer(beg);
-    bool hit = false;
-    const char* end = [&] {
-      // clang-format off
-      switch (lexer.advance()) {
-        case '"':  return lexer.string_literal();
-        case '\'': return lexer.char_literal();
-        case 'R':  return lexer.match('"') ? lexer.raw_string_literal() :
-                          lexer.current();
-        case '/':  return lexer.match('/') ? lexer.line() :
-                          lexer.match('*') ? lexer.block_comment() :
-                          lexer.current();
-        case '#':  return (hit = lexer.match("pragma") &&
-                           lexer.match_whitespace() &&
-                           (lexer.whitespace(), lexer.match("once"))),
-                          lexer.current();
-        default:   return lexer.current(); // Should never be reached
-      }
-      // clang-format on
-    }();
-    if (hit) {
-      *begin_ptr = pos;
-      *end_ptr = end - source.c_str();
-      return true;
+// Pastes two tokens together as per the ## macro operator.
+// Returns a kInvalid token if the concatenation does not form a valid token.
+inline Token concatenate(const Token& lhs, const Token& rhs,
+                         int cxx_standard_year) {
+  using Tt = Token::Type;
+  std::string combined_token_string = lhs.token_string() + rhs.token_string();
+  Token::Type type = [&] {
+    auto match = [&](Token::Type x, Token::Type y) -> bool {
+      return lhs.type() == x && rhs.type() == y;
+    };
+    if (match(Tt::kLt, Tt::kColon)) return Tt::kLBracket;
+    if (match(Tt::kColon, Tt::kGt)) return Tt::kRBracket;
+    if (match(Tt::kLt, Tt::kPercent)) return Tt::kLBrace;
+    if (match(Tt::kPercent, Tt::kGt)) return Tt::kRBrace;
+    if (match(Tt::kDot, Tt::kStar)) return Tt::kDotStar;
+    if (match(Tt::kMinus, Tt::kGt)) return Tt::kArrow;
+    if (match(Tt::kArrow, Tt::kStar)) return Tt::kArrowStar;
+    if (match(Tt::kPlus, Tt::kPlus)) return Tt::kPlusPlus;
+    if (match(Tt::kPlus, Tt::kEq)) return Tt::kPlusEq;
+    if (match(Tt::kMinus, Tt::kMinus)) return Tt::kMinusMinus;
+    if (match(Tt::kMinus, Tt::kEq)) return Tt::kMinusEq;
+    if (match(Tt::kStar, Tt::kEq)) return Tt::kStarEq;
+    if (match(Tt::kSlash, Tt::kEq)) return Tt::kSlashEq;
+    if (match(Tt::kPercent, Tt::kEq)) return Tt::kPercentEq;
+    if (match(Tt::kColon, Tt::kColon)) return Tt::kColonColon;
+    if (match(Tt::kAmp, Tt::kAmp)) return Tt::kAmpAmp;
+    if (match(Tt::kAmp, Tt::kEq)) return Tt::kAmpEq;
+    if (match(Tt::kBar, Tt::kBar)) return Tt::kBarBar;
+    if (match(Tt::kBar, Tt::kEq)) return Tt::kBarEq;
+    if (match(Tt::kCaret, Tt::kEq)) return Tt::kCaretEq;
+    if (match(Tt::kEq, Tt::kEq)) return Tt::kEqEq;
+    if (match(Tt::kBang, Tt::kEq)) return Tt::kBangEq;
+    if (match(Tt::kLt, Tt::kLt)) return Tt::kLtLt;
+    if (match(Tt::kLt, Tt::kEq)) return Tt::kLtEq;
+    if (match(Tt::kLt, Tt::kLtEq)) return Tt::kLtLtEq;
+    if (match(Tt::kLtLt, Tt::kEq)) return Tt::kLtLtEq;
+    if (match(Tt::kGt, Tt::kGt)) return Tt::kGtGt;
+    if (match(Tt::kGt, Tt::kEq)) return Tt::kGtEq;
+    if (match(Tt::kGt, Tt::kGtEq)) return Tt::kGtGtEq;
+    if (match(Tt::kGtGt, Tt::kEq)) return Tt::kGtGtEq;
+    if (match(Tt::kHash, Tt::kHash)) return Tt::kHashHash;
+    if (match(Tt::kPercent, Tt::kColon)) return Tt::kHash;
+    // E.g., 123 ## 456.
+    if (match(Tt::kNumber, Tt::kNumber)) return Tt::kNumber;
+    // E.g., 123 ## ull.
+    if (match(Tt::kNumber, Tt::kIdentifier)) return Tt::kNumber;
+    // E.g., abc ## 123, class ## 123
+    if (lhs.matches(Tt::kIdentifier, Tt::kKeyword) &&
+        rhs.type() == Tt::kNumber) {
+      return Tt::kIdentifier;
     }
-    pos += end - beg;
-  }
-  return false;
+    // E.g., u8 ## 'c'.
+    if (match(Tt::kIdentifier, Tt::kCharacter)) return Tt::kCharacter;
+    // E.g., u8 ## "abc" (but not include ## <abc>).
+    // TODO: Consider using a separate kAngleString instead (it would simplify
+    // this but slightly complicate parsing of include directives).
+    if (match(Tt::kIdentifier, Tt::kString) &&
+        lhs.token_string() != "include") {
+      return Tt::kString;
+    }
+    // E.g., u8 ## R"(abc)".
+    if (match(Tt::kIdentifier, Tt::kRawString)) return Tt::kRawString;
+    // E.g., 'c' ## _foo.
+    if (match(Tt::kCharacter, Tt::kIdentifier)) return Tt::kCharacter;
+    // E.g., "foo" ## s.
+    if (match(Tt::kString, Tt::kIdentifier)) return Tt::kString;
+    // E.g., R"(foo)" ## s.
+    if (match(Tt::kRawString, Tt::kIdentifier)) return Tt::kRawString;
+    // E.g., abc ## def -> ident, cl ## ass -> kw, not ## using -> ident.
+    if (lhs.matches(Tt::kIdentifier, Tt::kKeyword) &&
+        rhs.matches(Tt::kIdentifier, Tt::kKeyword)) {
+      return is_keyword(combined_token_string, cxx_standard_year)
+                 ? Tt::kKeyword
+                 : Tt::kIdentifier;
+    }
+    return Tt::kInvalid;
+  }();
+  return Token(type, lhs.begin(), rhs.end(), std::move(combined_token_string));
 }
 
-inline std::string remove_cpp_comments_and_line_continuations(
-    const std::string& source) {
-  std::string result;
-  result.reserve(source.size());
-  size_t old_pos = 0, pos;
-  // Match string literals, comments (forward slashes), and line continuations
-  // (backslashes).
-  const char* match_chars = "\"'R/\\";
-  while ((pos = source.find_first_of(match_chars, old_pos)) !=
-         std::string::npos) {
-    result.append(source, old_pos, pos - old_pos);
-    const char* beg = source.c_str() + pos;
-    CppLexer lexer(beg);
-    const char* end = [&] {
-      // clang-format off
-      switch (lexer.advance()) {
-        case '"':  return lexer.string_literal();
-        case '\'': return lexer.char_literal();
-        case 'R':  return lexer.match('"') ? lexer.raw_string_literal() :
-                          lexer.current();
-        case '/':  return lexer.match('/') ? lexer.line() :
-                          lexer.match('*') ? lexer.block_comment() :
-                          lexer.current();
-        // Match line continuation (escaped newline).
-        // TODO: Line continuations inside string literals will not be matched
-        // here. Would need to use a separate pass that only matches them and
-        // raw strings.
-        case '\\': return lexer.match('\n'), lexer.current();
-        default:   return lexer.current(); // Should never be reached
+template <int Size>
+class TokenHistoryBuffer {
+ public:
+  using value_type = Token;
+  using reference = Token&;
+  using const_reference = const Token&;
+
+  constexpr int size() const { return Size; }
+
+  void push(const Token& value) {
+    if (++head_ == size()) {
+      head_ = 0;
+    }
+    data_[head_] = value;
+  }
+
+  // Requires i to be in the range (-size(), 0], where
+  // i=0 corresponds to the most recent value.
+  const_reference operator[](int i) const {
+    assert(-size() < i && i <= 0);
+    int idx = head_ + i;
+    if (idx < 0) {
+      idx += size();
+    }
+    return data_[idx];
+  }
+
+  bool match(std::initializer_list<Token::Type> token_types) const {
+    assert((int)token_types.size() <= size());
+    int i = 0;
+    for (Token::Type token_type : token_types) {
+      Token::Type historic_type =
+          (*this)[-(int)token_types.size() + 1 + i++].type();
+      if (historic_type != token_type) {
+        return false;
       }
-      // clang-format on
-    }();
-    old_pos = end - source.c_str();
-    if (end - beg == 1 || *beg == '"' || *beg == '\'' || *beg == 'R') {
-      // Keep single characters ('/') and string literals.
-      result.append(beg, end);
-    } else {
-      // Elide comments and line continuations.
     }
+    return true;
   }
-  result.append(source, old_pos, std::string::npos);
-  return result;
+
+  // Removes the most recent entry.
+  void pop() {
+    data_[head_--] = value_type();
+    if (head_ < 0) head_ += size();
+  }
+
+ private:
+  std::array<value_type, Size> data_ = {};
+  int head_ = -1;
+};
+
+// This filters out whitespace and comments from an iterator over Tokens, and
+// provides several convenience methods to assist parsing.
+template <typename TokenIterator>
+class CppParserIterator {
+ public:
+  using token_iterator = TokenIterator;
+  using iterator_category = typename std::conditional<
+      std::is_same<
+          typename std::iterator_traits<token_iterator>::iterator_category,
+          std::input_iterator_tag>::value,
+      std::input_iterator_tag, std::forward_iterator_tag>::type;
+  using difference_type =
+      typename std::iterator_traits<token_iterator>::difference_type;
+  using value_type = typename std::iterator_traits<token_iterator>::value_type;
+  using reference = typename std::iterator_traits<token_iterator>::reference;
+  using pointer = typename std::iterator_traits<token_iterator>::pointer;
+
+  explicit CppParserIterator(token_iterator token_iter, token_iterator _end)
+      : previous_tokens_(), current_(token_iter), end_(_end) {
+    skip_whitespace_and_comments();  // Start on the first parser token.
+  }
+
+  token_iterator base() const { return current_; }
+
+  // Construct a corresponding end iterator for use with iterator-based
+  // algorithms.
+  CppParserIterator end() const { return CppParserIterator(end_, end_); }
+
+  explicit operator bool() const { return current_ != end_; }
+
+  reference operator*() const { return *current_; }
+  token_iterator operator->() const { return current_; }
+
+  // Advances to the next non-whitespace and non-comment token.
+  CppParserIterator& operator++() {
+    previous_tokens_.push(*current_);  // Record the token being left.
+    ++current_;
+    skip_whitespace_and_comments();
+    return *this;
+  }
+
+  CppParserIterator operator++(int) {
+    CppParserIterator tmp(*this);
+    ++(*this);
+    return tmp;
+  }
+
+  // Requires idx to be in the range (-size(), 0], where
+  // idx=0 corresponds to the most recent value (before current).
+  const value_type& previous_token(int idx = 0) const {
+    return previous_tokens_[idx];
+  }
+
+  bool match(Token::Type token_type) {  // Consumes the token on success.
+    if (current_->type() != token_type) return false;
+    ++(*this);
+    return true;
+  }
+
+  template <class... TokenTypes>
+  bool match(TokenTypes... token_types) {  // Matches any of the given types.
+    if (!current_->matches(token_types...)) return false;
+    ++(*this);
+    return true;
+  }
+
+  bool match_identifier(const char* name) {
+    if (current_->type() != Token::Type::kIdentifier ||
+        current_->token_string() != name) {
+      return false;
+    }
+    ++(*this);
+    return true;
+  }
+
+  // Advances to the first token with the given type.
+  CppParserIterator& advance_to(Token::Type token_type) {
+    while (*this && (*this)->type() != token_type) ++(*this);
+    return *this;
+  }
+
+  // Erases tokens from *token_container in the range [first_to_erase, *this]
+  // inclusive, and sets *this to point to the next parser token.
+  template <typename Container>
+  CppParserIterator& erase_back_to(Container* token_container,
+                                   CppParserIterator first_to_erase) {
+    for (token_iterator it = first_to_erase.base(); it != current_; ++it) {
+      previous_tokens_.pop();  // NOTE(review): counts whitespace/comments too.
+    }
+    current_ = token_container->erase(first_to_erase.base(), ++current_);
+    skip_whitespace_and_comments();  // Assumes end_ remains valid.
+    return *this;
+  }
+
+  int line_number() const { return line_num_; }
+
+  bool has_whitespace_before() const { return whitespace_before_; }
+
+ private:
+  void skip_whitespace_and_comments() {
+    line_num_ += previous_tokens_[0].num_newlines();  // Newest history entry.
+    whitespace_before_ = false;
+    while (current_ != end_ &&
+           current_->matches(Token::Type::kWhitespace, Token::Type::kComment)) {
+      line_num_ += current_->num_newlines();
+      ++current_;
+      whitespace_before_ = true;
+    }
+    using Tt = Token::Type;
+    // Handle #line preprocessor directives.
+    if (previous_tokens_.match(
+            {Tt::kHash, Tt::kIdentifier, Tt::kNumber, Tt::kEndOfDirective}) &&
+        previous_tokens_[-2].matches_identifier("line")) {
+      // TODO: Should check this for invalid values (non-integer or negative
+      // integer; strangely, zero is allowed).
+      line_num_ = std::atoi(previous_tokens_[-1].token_string().c_str());
+    } else if (previous_tokens_.match({Tt::kHash, Tt::kIdentifier, Tt::kNumber,
+                                       Tt::kString, Tt::kEndOfDirective}) &&
+               previous_tokens_[-3].matches_identifier("line")) {
+      line_num_ = std::atoi(previous_tokens_[-2].token_string().c_str());
+      // TODO: The string token should be used as the new filename.
+    }
+  }
+
+  TokenHistoryBuffer<5> previous_tokens_;  // Fits the 5-token #line pattern.
+  token_iterator current_;
+  token_iterator end_;
+  int line_num_ = 1;
+  bool whitespace_before_ = false;
+};
+
+template <typename TokenIterator>
+inline CppParserIterator<TokenIterator> make_cpp_parser_iterator(
+    TokenIterator iter, TokenIterator end) {
+  return CppParserIterator<TokenIterator>(iter, end);
 }
 
-// This removes most but not all whitespace. Remaining whitespace is tricky to
-// handle safely+efficiently.
-inline std::string remove_cpp_whitespace(const std::string& source) {
-  std::string result;
-  result.reserve(source.size());
-  size_t old_pos = 0, pos;
-  // Match string literals, preprocessor directives, whitespace, and chars that
-  // can safely have whitespace after them removed.
-  bool inside_directive = false;
-  const char* match_chars = "\"'R# \f\n\r\t\v.,;!|~^()[]{}";
-  while ((pos = source.find_first_of(match_chars, old_pos)) !=
-         std::string::npos) {
-    result.append(source, old_pos, pos - old_pos);
-    const char* beg = source.c_str() + pos;
-    CppLexer lexer(beg);
-    bool end_of_directive = false;
-    bool is_whitespace = false;
-    const char* end = [&] {
-      // clang-format off
-      char c = lexer.advance();
-      switch (c) {
-        case '"':  return lexer.string_literal();
-        case '\'': return lexer.char_literal();
-        case 'R':  return lexer.match('"') ? lexer.raw_string_literal() :
-                          lexer.current();
-        case '#':  return inside_directive = true, lexer.current();
-        default:   return is_whitespace = true, lexer.whitespace();
+struct SourceLocation {
+  SourceLocation() = default;
+  SourceLocation(std::string _filename, int _line = 0)
+      : filename_(std::move(_filename)), line_(_line) {}
+
+  const std::string& file_name() const noexcept { return filename_; }
+  int line() const noexcept { return line_; }
+
+  friend std::string to_string(const SourceLocation& location) {
+    return location.file_name() + ":" + std::to_string(location.line());
+  }
+
+ private:
+  std::string filename_;
+  int line_ = 0;
+};
+
+static const char* const kJitifyDirPrefix = "__jitify_rel_inc:";
+static const char* const kJitifyNamePrefix = ":__jitify_name:";
+
+class IncludeName {
+ public:
+  IncludeName() = default;
+  /* Construct as a <> include (unless _include_name is a patched name, in which
+   * case it is parsed into a "" include).
+   */
+  explicit IncludeName(std::string _include_name, SourceLocation _location = {})
+      : include_name_(std::move(_include_name)),
+        location_(std::move(_location)) {
+    const size_t prefix_len = std::strlen(kJitifyDirPrefix);
+    if (include_name_.substr(0, prefix_len) == kJitifyDirPrefix) {
+      // Parse patched name.
+      const size_t dir_end = include_name_.find(kJitifyNamePrefix, prefix_len);
+      assert(dir_end != std::string::npos);
+      current_dir_ = include_name_.substr(prefix_len, dir_end - prefix_len);
+      include_name_ =
+          include_name_.substr(dir_end + std::strlen(kJitifyNamePrefix));
+    }
+  }
+  /* Construct as a "" include.
+   */
+  IncludeName(std::string _include_name, std::string _current_dir,
+              SourceLocation _location = {})
+      : include_name_(std::move(_include_name)),
+        current_dir_(std::move(_current_dir)),
+        location_(std::move(_location)) {
+    // Absolute paths should always be treated like <> includes.
+    if (jitify2::detail::path_is_absolute(include_name_)) {
+      current_dir_.clear();
+    }
+  }
+  /*! Returns the filename of the include directive (the part inside "" or <>).
+   */
+  const std::string& name() const { return include_name_; }
+  /*! For "" includes, returns the current directory in which the include
+   *  directive was present. For <> includes, returns empty string.
+   */
+  const std::string& current_dir() const { return current_dir_; }
+  /*! Returns whether this is a "" include (as opposed to a <> include).*/
+  bool is_quote_include() const { return !current_dir_.empty(); }
+  /*! Returns the full path to the header assuming it exists in its current
+   * directory. Must only be called for "" includes, never <> includes.
+   */
+  std::string local_full_path() const {
+    assert(is_quote_include());
+    return is_quote_include() ? current_dir() + "/" + name() : "";
+  }
+  /*! Returns the full path to the header assuming it exists in the given
+   * include directory. May be called for either "" or <> includes.
+   */
+  std::string nonlocal_full_path(const std::string& include_dir) const {
+    return include_dir + "/" + include_name_;
+  }
+  // For quote-includes, this returns a modified name that encodes the current
+  // dir too.
+  std::string patched_name() const {
+    if (!is_quote_include()) return name();
+    return kJitifyDirPrefix + current_dir() + kJitifyNamePrefix + name();
+  }
+
+  friend bool operator==(const IncludeName& lhs, const IncludeName& rhs) {
+    return lhs.name() == rhs.name() && lhs.current_dir() == rhs.current_dir();
+  }
+  friend bool operator!=(const IncludeName& lhs, const IncludeName& rhs) {
+    return !(lhs == rhs);
+  }
+
+  size_t hash() const {
+    using jitify2::detail::string_concat;
+    const std::string hash_str =
+        is_quote_include()
+            ? string_concat('"', include_name_, '"', current_dir_)
+            : string_concat('<', include_name_, '>');
+    return std::hash<std::string>()(hash_str);
+  }
+  struct Hash {
+    size_t operator()(const IncludeName& x) const { return x.hash(); }
+  };
+
+  // Implicit conversion to string to maintain backwards compatibility with
+  // FileCallback.
+  operator const std::string &() const { return name(); }
+
+  friend std::string to_string(const IncludeName& incname) {
+    using jitify2::detail::string_concat;
+    return incname.is_quote_include() ? string_concat('"', incname.name(), '"')
+                                      : string_concat('<', incname.name(), '>');
+  }
+
+  const SourceLocation& location() const { return location_; }
+
+ private:
+  std::string include_name_;
+  std::string current_dir_;  // Empty for <> includes, non-empty for "" includes
+  // Informational only.
+  SourceLocation location_;
+};
+
+// Visitor must be callable with signature:
+//   (IncludeName, CppParserIterator<TokenIterator>) -> void.
+template <typename TokenIterator, typename Visitor>
+inline ErrorMsg visit_all_include_directives(TokenIterator begin,
+                                             TokenIterator end,
+                                             const std::string& full_path,
+                                             Visitor visitor) {
+  auto error_msg = [&](int line_number, const std::string& msg) {
+    return ErrorMsg(full_path + ":" + std::to_string(line_number) +
+                    ": error: " + msg);
+  };
+  using Tt = Token::Type;
+  for (auto iter = make_cpp_parser_iterator(begin, end); iter; ++iter) {
+    if (iter.match(Tt::kHash)) {
+      if (!iter.match(Tt::kIdentifier)) {
+        return error_msg(
+            iter.line_number(),
+            "invalid preprocessing directive #" + iter->source_string());
       }
-      // clang-format on
-    }();
-    if (inside_directive && is_whitespace && std::find(beg, end, '\n') != end) {
-      inside_directive = false;
-      end_of_directive = true;
-    }
-    old_pos = end - source.c_str();
-    if ((end - beg == 1 && !std::isspace((unsigned char)*beg)) || *beg == '"' ||
-        *beg == '\'' || *beg == 'R' || *beg == '#') {
-      // Keep single characters ('R'), string literals, and preprocessor
-      // directives.
-      result.append(beg, end);
-    } else {
-      // Elide or replace whitespace.
-      bool before_directive = !inside_directive && *end == '#';
-      if (!std::isspace((unsigned char)*beg)) {
-        // Remove whitespace after symbol.
-        result += *beg;
-        if (end_of_directive || before_directive) {
-          result += '\n';
-        }
-      } else {
-        if (end_of_directive) {
-          result += '\n';
-        } else {
-          // A newline may already be present from a preprocessor directive.
-          bool after_newline = result.empty() || result.back() == '\n';
-          if (!after_newline || before_directive) {
-            // Replace whitespace.
-            result += before_directive ? '\n' : ' ';
+      if (iter.previous_token().token_string() == "include") {
+        auto prev_iter = iter;
+        // Note: It is possible to have macro substitutions here instead of a
+        // string literal, but it is very rare, and some popular tools are
+        // known to not support it (e.g., scons). Of course, Thrust does it!
+        if (!iter.match(Tt::kString)) {
+          // WAR for Thrust using macro substitutions in an #include directive.
+          if (iter->matches_identifier("__THRUST_HOST_SYSTEM_TAG_HEADER")) {
+            *iter = Token(Tt::kString, iter->begin(), iter->end(),
+                          "<thrust/system/cpp/detail/execution_policy.h>");
+            ++iter;
+          } else if (iter->matches_identifier(
+                         "__THRUST_DEVICE_SYSTEM_TAG_HEADER")) {
+            *iter = Token(Tt::kString, iter->begin(), iter->end(),
+                          "<thrust/system/cuda/detail/execution_policy.h>");
+            ++iter;
+
+          } else {
+            return error_msg(
+                iter.line_number(),
+                "#include expects \"FILENAME\" or <FILENAME>, got " +
+                    iter->source_string());
           }
         }
+
+        std::string include_name = iter.previous_token().token_string();
+        const bool is_quote_include = include_name[0] == '"';
+        // Remove quotes/angles.
+        include_name = include_name.substr(1, include_name.size() - 2);
+        const std::string current_dir = jitify2::detail::path_base(full_path);
+        SourceLocation location(full_path, iter.line_number());
+        IncludeName include =
+            is_quote_include
+                ? IncludeName(std::move(include_name), current_dir,
+                              std::move(location))
+                : IncludeName(std::move(include_name), std::move(location));
+        visitor(std::move(include), prev_iter);
       }
+      iter.advance_to(Tt::kEndOfDirective);
+      if (!iter) break;
     }
   }
-  result.append(source, old_pos, std::string::npos);
-  return result;
-}
-
-// WAR for #pragma once not working when there are multiple inclusions of the
-// same header from different paths.
-inline std::string replace_pragma_once_with_ifndef(const std::string& source) {
+  return {};
+}
+
+template <typename TokenSequence, typename Iterator, int N>
+inline Iterator insert_directive_impl(TokenSequence* tokens, Iterator where,
+                                      const Token (&directive_tokens)[N]) {
+  using Tt = Token::Type;
+  // TODO: Find a safer way to do this.
+  constexpr int kMaxNewTokens = 1 + 1 + (2 * N - 1) + 1;
+  Token new_tokens[kMaxNewTokens];
+  int j = 0;
+  Iterator before_where = where;
+  if (where != tokens->begin()) --before_where;  // --begin() is UB.
+  if (where != tokens->begin() && before_where->type() != Tt::kEndOfDirective &&
+      (before_where->type() != Tt::kWhitespace ||
+       before_where->num_unescaped_newlines() == 0)) {
+    // Must add newline before new directive.
+    new_tokens[j++] = Token(Tt::kWhitespace, "\n");
+  }
+  new_tokens[j++] = Token(Tt::kHash, "#");
+  for (int i = 0; i < N; ++i) {
+    if (i > 0) {
+      new_tokens[j++] = Token(Tt::kWhitespace, " ");
+    }
+    new_tokens[j++] = directive_tokens[i];
+  }
+  new_tokens[j++] = Token(Tt::kEndOfDirective, "\n");
+  assert(j <= kMaxNewTokens);
+  return tokens->insert(where, new_tokens, new_tokens + j);
+}
+
+template <typename TokenSequence, typename Iterator,
+          typename... DirectiveTokens>
+inline Iterator insert_directive(TokenSequence* tokens, Iterator where,
+                                 const std::string& name,
+                                 const DirectiveTokens&... directive_tokens) {
+  Token directive_tokens_array[] = {Token(Token::Type::kIdentifier, name),
+                                    directive_tokens...};
+  return insert_directive_impl(tokens, where, directive_tokens_array);
+}
+
+// Note: List seems to be up to 4x faster than deque.
+using TokenSequence = std::list<Token>;
+
+// Returns true if a pragma once directive was found.
+inline bool replace_pragma_once_with_ifndef(const std::string& unique_source_id,
+                                            TokenSequence* tokens) {
+  using Tt = Token::Type;
+  // Find and remove all "#pragma once" directives.
+  bool found = false;
+  for (auto iter = make_cpp_parser_iterator(tokens->begin(), tokens->end());
+       iter;) {
+    auto start_iter = iter;
+    if (iter.match(Tt::kHash)) {
+      if (iter.match_identifier("pragma") && iter.match_identifier("once")) {
+        iter.advance_to(Tt::kEndOfDirective);
+        if (!iter) break;
+        // Note: The ++ in erase_back_to advances to the next _base_ token (we
+        // don't want to jump over subsequent comment or whitespace tokens).
+        iter.erase_back_to(tokens, start_iter);
+        found = true;
+        // Note: There can be more than one #pragma once.
+        continue;
+      } else {
+        iter.advance_to(Tt::kEndOfDirective);
+        if (!iter) break;
+      }
+    }
+    ++iter;
+  }
   constexpr const char* const kJitifyIncludeGuardPrefix =
       "JITIFY_INCLUDE_GUARD_";
-  if (startswith(source, std::string("#ifndef ") + kJitifyIncludeGuardPrefix)) {
-    return source;  // Already been processed
-  }
-  size_t begin, end;
-  if (!find_pragma_once(source, &begin, &end)) return source;
-  // Replace #pragma once with hash-based include guard around source.
-  std::string include_guard_name =
-      string_concat(kJitifyIncludeGuardPrefix, sha256(source), "\n");
-  // Note: We use `#line 1` to fix the line numbering after adding additional
-  // code at the beginning of the file.
-  std::string prefix = string_concat("#ifndef ", include_guard_name, "#define ",
-                                     include_guard_name, "#line 1\n");
-  std::string suffix = "\n#endif  // " + include_guard_name;
-  std::string result;
-  result.reserve(prefix.size() + source.size() + suffix.size());
-  result += prefix;
-  result.append(source, 0, begin);
-  result.append(source, end, std::string::npos);
-  result += suffix;
-  return result;
+  if (found) {
+    using jitify2::detail::sha256;
+    using jitify2::detail::string_concat;
+    // Insert a hash-based include guard around the source.
+    std::string include_guard_name =
+        string_concat(kJitifyIncludeGuardPrefix, sha256(unique_source_id));
+    Token guard_identifier(Tt::kIdentifier, include_guard_name);
+    // Note: Reverse order due to insertion at the beginning.
+    insert_directive(tokens, tokens->begin(), "define", guard_identifier);
+    insert_directive(tokens, tokens->begin(), "ifndef", guard_identifier);
+    insert_directive(tokens, tokens->end(), "endif",
+                     Token(Tt::kComment, "// " + include_guard_name));
+  }
+  return found;
+}
+
+// Changes usages of "std::" to "cuda::std::".
+// TODO: This isn't completely robust because we don't apply macro
+// substitutions.
+template <typename TokenSequence>
+inline void replace_std_with_cuda_std(TokenSequence* tokens) {
+  using Tt = Token::Type;
+  for (auto iter = make_cpp_parser_iterator(tokens->begin(), tokens->end());
+       iter;) {
+    if (((iter.previous_token().type() != Tt::kIdentifier &&
+          iter.match(Tt::kColonColon)) ||
+         iter.previous_token().type() != Tt::kColonColon)) {
+      auto before_std_iter = iter;
+      if (iter.match_identifier("std") && iter.match(Tt::kColonColon)) {
+        tokens->insert(before_std_iter.base(), Token(Tt::kIdentifier, "cuda"));
+        tokens->insert(before_std_iter.base(), Token(Tt::kColonColon, "::"));
+      } else if (iter.previous_token().type() != Tt::kColonColon) {
+        ++iter;
+      }
+    } else {
+      ++iter;
+    }
+  }
 }
 
-inline std::string patch_cuda_source(std::string source, bool use_cuda_std,
-                                     bool replace_pragma_once) {
-  if (use_cuda_std) {
-    source = detail::replace_std_with_cuda_std(std::move(source));
+inline bool must_separate_tokens(const Token& lhs, const Token& rhs,
+                                 int cxx_standard_year) {
+  using Tt = Token::Type;
+  // Check if concatenating them would form a new token.
+  return concatenate(lhs, rhs, cxx_standard_year) ||
+         // These are parsed greedily, so lhs/rhs would become reversed.
+         // E.g., a+++b == a++ +b.
+         // Note: It's very important to get these right, because otherwise it
+         // will silently introduce bugs in the minified source.
+         (lhs.matches(Tt::kPlus) && rhs.matches(Tt::kPlusPlus)) ||
+         (lhs.matches(Tt::kMinus) && rhs.matches(Tt::kMinusMinus)) ||
+         (lhs.matches(Tt::kColon) && rhs.matches(Tt::kColonColon)) ||
+         (lhs.matches(Tt::kGt) && rhs.matches(Tt::kGtGt));
+}
+
+template <typename TokenIterator>
+inline void minify_cuda_source(TokenIterator begin, TokenIterator end,
+                               int cxx_standard_year,
+                               std::string* minified_source) {
+  using Tt = Token::Type;
+  minified_source->clear();
+  bool in_directive = false;
+  for (auto iter = make_cpp_parser_iterator(begin, end); iter; ++iter) {
+    if (iter.previous_token() &&
+        must_separate_tokens(iter.previous_token(), *iter, cxx_standard_year)) {
+      minified_source->push_back(' ');
+      // TODO: The below condition should really check that the hash is the
+      // start of a directive (and not another hash inside a directive), but
+      // there's not an easy way to do it here. Using a new kStartOfIdentifier
+      // type is a possibility, but it complicates other things.
+    } else if (!iter->matches(Tt::kEndOfDirective) &&
+               iter.has_whitespace_before() &&
+               iter.previous_token().matches(Tt::kIdentifier) &&
+               iter.previous_token(-1).matches_identifier("define") &&
+               iter.previous_token(-2).matches(Tt::kHash)) {
+      // Must separate macro name and definition with whitespace.
+      // E.g., `FOO-123` is OK, but `#define FOO-123` is not.
+      // E.g., `FOO(bar)` is OK, but `#define FOO(bar)` is different to
+      // `#define FOO (bar)`.
+      minified_source->push_back(' ');
+    }
+    if (!in_directive && iter->type() == Tt::kHash) {
+      in_directive = true;
+      if (iter.previous_token() &&
+          !iter.previous_token().matches(Tt::kEndOfDirective)) {
+        // Must start directives on a new line.
+        minified_source->push_back('\n');
+      }
+    } else if (in_directive && iter->type() == Tt::kEndOfDirective) {
+      in_directive = false;
+    }
+    // Note: Using token_string() means that escaped newlines are elided.
+    minified_source->append(iter->token_string());
   }
-  if (replace_pragma_once) {
-    source = detail::replace_pragma_once_with_ifndef(std::move(source));
+}
+
+enum class ProcessFlags : unsigned {
+  kNone = 0,
+  kReplacePragmaOnce = 1 << 0,
+  kReplaceStd = 1 << 1,
+  kMinify = 1 << 2,
+  kAddUsedHeaderWarning = 1 << 3,
+};
+inline ProcessFlags operator|(ProcessFlags lhs, ProcessFlags rhs) {
+  using T = typename std::underlying_type<ProcessFlags>::type;
+  return static_cast<ProcessFlags>(static_cast<T>(lhs) | static_cast<T>(rhs));
+}
+inline ProcessFlags& operator|=(ProcessFlags& lhs, ProcessFlags rhs) {
+  lhs = lhs | rhs;
+  return lhs;
+}
+inline bool operator&(ProcessFlags lhs, ProcessFlags rhs) {
+  using T = typename std::underlying_type<ProcessFlags>::type;
+  return static_cast<T>(lhs) & static_cast<T>(rhs);
+}
+
+// Note: The returned includes are _all_ the includes in the source, even if
+// they end up not being reachable due to #if[def] directives.
+// Note: It is OK if source and *processed_source are the same underlying memory
+// (i.e., in-place operation is OK).
+template <typename IncludeVisitor>
+inline ErrorMsg process_cuda_source(const std::string& source,
+                                    const std::string& full_path,
+                                    ProcessFlags flags, int cxx_standard_year,
+                                    std::string* processed_source,
+                                    IncludeVisitor include_visitor) {
+  using Tt = Token::Type;
+  auto tokens = CppLexer::tokenize<TokenSequence>(source.c_str());
+  using TokenIterator = TokenSequence::iterator;
+  ErrorMsg err = visit_all_include_directives(
+      tokens.begin(), tokens.end(), full_path,
+      [&](IncludeName include, CppParserIterator<TokenIterator> iter) {
+        if (include.is_quote_include()) {
+          // Change `#include "name"` to `#include <patched_name>`, where
+          // patched_name encodes the current dir as well as the name.
+          *iter = Token(Tt::kString, "<" + include.patched_name() + ">");
+        }
+        include_visitor(std::move(include));
+      });
+  if (err) return err;
+  // Insert "#line 1" at the beginning of the file so that line numbering is
+  // not messed up by subsequent line insertions at the beginning.
+  // Note: Reverse order due to insertion at the beginning.
+  insert_directive(&tokens, tokens.begin(), "line", Token(Tt::kNumber, "1"));
+  if (flags & ProcessFlags::kAddUsedHeaderWarning) {
+    // Insert a guarded #warning that we can use to see if this header was
+    // actually included during compilation.
+    insert_directive(&tokens, tokens.begin(), "endif");
+    insert_directive(&tokens, tokens.begin(), "warning",
+                     Token(Tt::kIdentifier, "JITIFY_USED_HEADER"),
+                     Token(Tt::kString, "\"" + full_path + "\""));
+    insert_directive(&tokens, tokens.begin(), "ifdef",
+                     Token(Tt::kIdentifier, "JITIFY_USED_HEADER_WARNINGS"));
+  }
+  if (flags & ProcessFlags::kReplacePragmaOnce) {
+    // Note: Must use source itself as unique identifier because multiple
+    // filenames may refer to the same file (via copy/symlink/hardlink).
+    replace_pragma_once_with_ifndef(source, &tokens);
+  }
+  if (flags & ProcessFlags::kReplaceStd) {
+    replace_std_with_cuda_std(&tokens);
+  }
+  if (flags & ProcessFlags::kMinify) {
+    // Reconstruct minified source.
+    minify_cuda_source(tokens.begin(), tokens.end(), cxx_standard_year,
+                       processed_source);
+  } else {
+    processed_source->clear();
+    // Reconstruct source.
+    for (const Token& token : tokens) {
+      processed_source->append(token.source_string());
+    }
   }
-  // HACK This is a WAR for some CUB sources including a header they shouldn't.
-  size_t pos = source.find("#include \"../util_device.cuh\"");
-  if (pos != std::string::npos) {
-    source[pos] = '/';  // Comment out the line
-    source[pos + 1] = '/';
+  return {};
+}
+
+}  // namespace parser
+
+namespace detail {
+
+static const char* const kJitifyBuiltinHeaderPrefix = "__jitify_builtin";
+static const char* const kJitifyCallbackHeaderPrefix = "__jitify_callback";
+
+enum class HeaderLoadStatus {
+  kFailed = 0,
+  kAlreadyLoaded = 1,
+  kNewlyLoaded = 2,
+};
+
+// Note: StringMapT is to allow the caller to use StringOrRef instead of
+// std::string in the map.
+template <class StringMapT>
+HeaderLoadStatus load_header(const parser::IncludeName& include,
+                             HeaderCallback header_callback,
+                             const std::vector<std::string>& include_paths,
+                             bool use_builtin_headers, std::string* full_path,
+                             StringMapT* fullpath_to_source) {
+  auto already_loaded = [&](const std::string& fp) {
+    return fullpath_to_source->count(fp);
+  };
+  auto newly_loaded = [&](std::string source) {
+    fullpath_to_source->emplace(*full_path, std::move(source));
+    return HeaderLoadStatus::kNewlyLoaded;
+  };
+  std::string source;
+  // Try loading via callback.
+  *full_path = include.nonlocal_full_path(kJitifyCallbackHeaderPrefix);
+  *full_path = path_simplify(*full_path);
+  if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+  if (header_callback and header_callback(include, &source)) {
+    return newly_loaded(std::move(source));
+  }
+  // Try loading from current directory.
+  if (include.is_quote_include()) {
+    *full_path = include.local_full_path();
+    *full_path = path_simplify(*full_path);
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    if (read_text_file(*full_path, &source)) {
+      return newly_loaded(std::move(source));
+    }
   }
-  // HACK This is a WAR for Thrust (pre-CUDA-11) using "#define A #pragma B".
-  pos = source.find("#pragma nv_exec_check_disable");
-  if (pos != std::string::npos) {
-    source[pos] = '/';  // Comment out the (rest of the) line
-    source[pos + 1] = '/';
+  // Try loading from include directories.
+  for (const std::string& include_path : include_paths) {
+    *full_path = include.nonlocal_full_path(include_path);
+    *full_path = path_simplify(*full_path);
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    if (read_text_file(*full_path, &source)) {
+      return newly_loaded(std::move(source));
+    }
   }
-  // HACK This is a WAR for Thrust using
-  pos = source.find("__has_cpp_attribute(gnu::warn_unused_result)");
-  if (pos != std::string::npos) {
-    source[pos + 23] = '_';  // Replace "::" with "__".
-    source[pos + 24] = '_';
+  // Try loading from builtin headers.
+  if (use_builtin_headers) {
+    *full_path = include.nonlocal_full_path(kJitifyBuiltinHeaderPrefix);
+    *full_path = path_simplify(*full_path);
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    auto iter = get_jitsafe_headers_map().find(include.name());
+    if (iter != get_jitsafe_headers_map().end()) {
+      source = iter->second;
+      return newly_loaded(std::move(source));
+    }
   }
-  return source;
+  return HeaderLoadStatus::kFailed;
 }
 
-// Removes comments and most whitespace from C++ source code.
-inline std::string minify_cpp_source(const std::string& source) {
-  return remove_cpp_whitespace(
-      remove_cpp_comments_and_line_continuations(source));
+inline bool remove_stop_compilation_error(std::string* compile_log) {
+  size_t pos = compile_log->find("__JITIFY_STOP_COMPILATION");
+  if (pos == std::string::npos) return false;
+  pos = compile_log->find_last_of('\n', pos);
+  if (pos == std::string::npos) {
+    pos = 0;
+  }
+  compile_log->resize(pos);
+  return true;
 }
 
-inline void extract_include_paths(OptionsVec* options,
-                                  StringVec* include_paths) {
-  const std::vector<int> idxs = options->find({"-I"});
-  for (int i = (int)idxs.size() - 1; i >= 0; --i) {
-    const int idx = idxs[i];
-    include_paths->push_back((*options)[idx].value());
-    options->erase(idx);
+// Finds used header warnings, removes them from the compile log, and adds their
+// fullpaths to *used_headers.
+inline bool extract_used_header_warnings(
+    std::string* compile_log, std::unordered_set<std::string>* used_headers) {
+  // Remove line containing JITIFY_USED_HEADER and the next three lines.
+  // If the line after the first one of these contains -diag-suppress,
+  //   remove that line and the one after it.
+  static const char* const kJitifyUsedHeader = "JITIFY_USED_HEADER";
+  int num_found = 0;
+  size_t pos;
+  while ((pos = compile_log->find(kJitifyUsedHeader)) != std::string::npos) {
+    ++num_found;
+    size_t start = pos + std::strlen(kJitifyUsedHeader) + 2;
+    size_t end = compile_log->find_first_of('"', start);
+    assert(end != std::string::npos);
+    std::string header_fullpath = compile_log->substr(start, end - start);
+    used_headers->emplace(std::move(header_fullpath));
+    start = compile_log->find_last_of('\n', pos);
+    if (start == std::string::npos) {
+      start = (size_t)-1;
+    }
+    ++start;
+    // Each full warning message is 4 lines.
+    for (int i = 0; i < 4; ++i) {
+      size_t new_end = compile_log->find_first_of('\n', end + 1);
+      if (new_end == std::string::npos) break;  // End of log
+      end = new_end;
+    }
+    ++end;
+    std::string tail = compile_log->substr(end);
+    compile_log->resize(start);
+    *compile_log += tail;
+  }
+  const bool found_any = num_found > 0;
+  if (found_any) {
+    if (compile_log->find("#warning directive") == std::string::npos) {
+      // There are no other warnings, remove message about -diag-suppress.
+      pos = compile_log->find("-diag-suppress");
+      if (pos == std::string::npos) return true;
+      size_t start = compile_log->find_last_of('\n', pos);
+      if (start == std::string::npos) {
+        start = (size_t)-1;
+      }
+      ++start;
+      size_t end =
+          compile_log->find_first_of('\n', pos + std::strlen("-diag-suppress"));
+      assert(end != std::string::npos);
+      end = compile_log->find_first_of('\n', end + 1);
+      std::string tail;
+      if (end != std::string::npos) {
+        ++end;
+        tail = compile_log->substr(end);
+      }
+      compile_log->resize(start);
+      *compile_log += tail;
+    }
   }
+  return found_any;
 }
 
 }  // namespace detail
 
 inline PreprocessedProgram PreprocessedProgram::preprocess(
-    std::string name, std::string source, StringMap header_sources,
-    OptionsVec compiler_options, OptionsVec linker_options,
-    FileCallback header_callback) {
+    std::string program_name, std::string program_source,
+    StringMap header_sources, OptionsVec compiler_options,
+    OptionsVec linker_options, HeaderCallback header_callback) {
   // Add pre-include built-in JIT-safe headers.
   bool use_system_headers_war = !compiler_options.pop(
       {"-no-system-headers-workaround", "--no-system-headers-workaround"});
@@ -6141,7 +7282,8 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
         detail::get_jitsafe_headers_map().at("jitify_preinclude.h"));
     compiler_options.push_back(Option("-include", "jitify_preinclude.h"));
   }
-  detail::add_std_flag_if_not_specified(&compiler_options, 11);
+  const int cxx_standard_year =
+      detail::add_std_flag_if_not_specified(&compiler_options, 11);
   detail::add_default_device_flag_if_not_specified(&compiler_options);
   bool minify = compiler_options.pop({"-m", "--minify"});
   // TODO: This flag is experimental, because the implementation does not
@@ -6160,37 +7302,148 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   bool should_remove_unused_globals = compiler_options.pop(
       {"-remove-unused-globals", "--remove-unused-globals"});
 
-  // Patch all given sources.
-  source = detail::patch_cuda_source(source, use_cuda_std, replace_pragma_once);
-  for (auto& name_source : header_sources) {
-    const std::string& header_name = name_source.first;
-    std::string& header_source = name_source.second;
-    bool is_jitify_preinclude = header_name == "jitify_preinclude.h";
-    bool is_cuda_std_header =
-        detail::get_workaround_system_headers().count(header_name);
-    header_source = detail::patch_cuda_source(
-        header_source,
-        use_cuda_std && !is_jitify_preinclude && !is_cuda_std_header,
-        replace_pragma_once);
-  }
+  using parser::IncludeName;
+  using parser::ProcessFlags;
+  std::unordered_map<IncludeName, std::string, IncludeName::Hash>
+      include_to_fullpath;
+  std::unordered_map<std::string, detail::StringOrRef> fullpath_to_source;
+  std::queue<IncludeName> include_queue;
+  ProcessFlags process_flags = ProcessFlags::kNone;
+  if (replace_pragma_once) process_flags |= ProcessFlags::kReplacePragmaOnce;
+  if (minify) process_flags |= ProcessFlags::kMinify;
+  const ProcessFlags replace_std_flag_if_enabled =
+      use_cuda_std ? ProcessFlags::kReplaceStd : ProcessFlags::kNone;
+
+  auto process_cuda_source_fn =
+      [&](std::string* source_ptr, const std::string& fullpath,
+          ProcessFlags extra_flags = ProcessFlags::kNone) {
+        return parser::process_cuda_source(
+            source_ptr->c_str(), fullpath, process_flags | extra_flags,
+            cxx_standard_year, source_ptr, [&](IncludeName include) {
+              if (include_to_fullpath.count(include)) {
+                return;
+              }
+              include_queue.push(std::move(include));
+            });
+      };
 
-  if (minify) {
-    source = detail::minify_cpp_source(source);
-    for (auto& name_source : header_sources) {
-      std::string* header_source = &name_source.second;
-      *header_source = detail::minify_cpp_source(*header_source);
-    }
+  const std::string current_dir =
+      detail::path_base(detail::get_current_executable_path());
+  const std::string program_fullpath =
+      detail::path_join(current_dir, detail::sanitize_slashes(program_name));
+  ErrorMsg err = process_cuda_source_fn(&program_source, program_fullpath,
+                                        replace_std_flag_if_enabled);
+  if (err) return Error(err);
+  static const char* const early_stop_code = R"(
+#ifdef JITIFY_PREPROCESS_ONLY
+#include <__JITIFY_STOP_COMPILATION>
+#endif
+)";
+  program_source += early_stop_code;
+
+  // Put the given header_sources into the include_to_fullpath and
+  // fullpath_to_source maps.
+  for (auto& header_source : header_sources) {
+    const std::string& name = header_source.first;
+    std::string* source_ptr = &header_source.second;
+    std::string fullpath = detail::path_is_absolute(name)
+                               ? name
+                               : detail::path_join(current_dir, name);
+    fullpath = detail::path_simplify(fullpath);
+    err = process_cuda_source_fn(
+        source_ptr, fullpath,
+        replace_std_flag_if_enabled | ProcessFlags::kAddUsedHeaderWarning);
+    if (err) return Error(err);
+    // Note: The names (keys) in header_sources will be matched:
+    // a) directly, for `#include <name>` directives, and
+    // b) as if they are filenames (relative to the current exe dir if not
+    //    absolute), for `#include "name"` directives. This will NOT fall back
+    //    to direct matching like <> includes.
+    // This allows path-based matching.
+    fullpath_to_source.emplace(fullpath, detail::StringOrRef(source_ptr));
+    // This allows direct matching for <> includes.
+    include_to_fullpath.emplace(IncludeName(name), std::move(fullpath));
   }
 
-  // Temporarily add the program source to header_sources for easier processing.
-  header_sources.emplace(name, source);
-
   StringVec include_paths;
   detail::extract_include_paths(&compiler_options, &include_paths);
-  std::string include_paths_msg =
-      detail::string_join(include_paths, "\n", "Include paths:\n", "\n");
+
+  // Recursively load and process all includes, putting them into the
+  // include_to_fullpath and fullpath_to_source maps.
+  std::string header_log;
+  while (!include_queue.empty()) {
+    const IncludeName include_name = std::move(include_queue.front());
+    include_queue.pop();
+    std::string header_fullpath;
+    using detail::HeaderLoadStatus;
+    const HeaderLoadStatus status = detail::load_header(
+        include_name, header_callback, include_paths, use_builtin_headers,
+        &header_fullpath, &fullpath_to_source);
+    // Note: We ignore missing headers here because they may not be needed; if
+    // they are needed, the error will be caught when we invoke the compiler.
+    if (status == HeaderLoadStatus::kFailed) continue;
+    header_log += detail::string_concat("Found #include ", include_name,
+                                        " from ", include_name.location(),
+                                        " at:\n  ", header_fullpath, "\n");
+    if (status == HeaderLoadStatus::kNewlyLoaded) {
+      std::string& header_source = fullpath_to_source.at(header_fullpath);
+      if (detail::endswith(header_fullpath, "cub/util_device.cuh")) {
+        // WAR for CUB header that is full of host-only code.
+        header_source = "";
+      } else {
+        ProcessFlags extra_flags = ProcessFlags::kAddUsedHeaderWarning;
+        const bool is_jitify_preinclude =
+            include_name.name() == "jitify_preinclude.h";
+        const bool is_builtin_header =
+            header_fullpath.find(detail::kJitifyBuiltinHeaderPrefix) == 0;
+        const bool is_cuda_std_header =
+            // TODO: More robust way to detect this?
+            header_fullpath.find("cuda/std/") != std::string::npos ||
+            header_fullpath.find("cuda\\std\\") != std::string::npos;
+        if (!is_jitify_preinclude && !is_builtin_header &&
+            !is_cuda_std_header) {
+          extra_flags |= replace_std_flag_if_enabled;
+        }
+        err = process_cuda_source_fn(&header_source, header_fullpath,
+                                     extra_flags);
+        if (!err.empty()) return Error(err);
+      }
+    }
+    include_to_fullpath.emplace(include_name, header_fullpath);
+  }
+
+  // Put all includes from the maps into header_sources.
+  for (const auto& include_fullpath : include_to_fullpath) {
+    const IncludeName include_name = include_fullpath.first;
+    const std::string& fullpath = include_fullpath.second;
+    assert(fullpath_to_source.count(fullpath));
+    detail::StringOrRef* source_ptr = &fullpath_to_source.at(fullpath);
+    // Note: This will not replace existing headers that were passed in, giving
+    // them the priority. This also makes our use of StringOrRef safe, because
+    // the ones that are references are the ones that are already in
+    // header_sources.
+    // Note: We insert an empty string first and then assign to it.
+    auto iter_inserted =
+        header_sources.emplace(include_name.patched_name(), std::string());
+    auto iter = iter_inserted.first;
+    std::string* out_source_ptr = &iter->second;
+    const bool inserted = iter_inserted.second;
+    if (inserted) {
+      // This is a cheap string move the first time this source_ptr is used.
+      // Subsequent times (i.e., if the same header source is mapped to multiple
+      // include names), it copies the string.
+      // TODO: In theory we could use StringOrRef in header_sources too to avoid
+      // needing copies of the same header sources, and I think it would be safe
+      // as long as we didn't erase any elements from it, but it's a bit risky,
+      // and would be exposed in the public interface.
+      source_ptr->copy_to_and_reference(out_source_ptr);
+    }
+  }
 
   if (!nvrtc()) return Error(nvrtc().error());
+  if (nvrtc().get_version() >= 11060) {
+    detail::add_no_source_include_flag_if_not_specified(&compiler_options);
+  }
   // Parse architecture flags for special handling. If specified here, the arch
   // must be explicit (no auto-detection), and it will not be passed through to
   // the compile phase.
@@ -6248,96 +7501,45 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     // default arch) when none was specified by the user.
     arch_flags.insert({0, false});
   }
+  // We temporarily enable warnings so that we can parse the ones we added.
+  const bool disable_warnings =
+      compiler_options.pop({"--disable-warnings", "-w"});
   // Maps header include names to their full file paths.
   StringMap header_fullpaths;
-  std::string compile_log, header_log;
-  // Repeat preprocessing for each specified architecture.
+  std::string compile_log;
+  std::unordered_set<std::string> used_header_fullpaths;
+  // Repeat preprocessing for each specified architecture, collecting the
+  // full paths of the headers that were used in used_header_fullpaths.
   for (const ArchFlag& arch_flag : arch_flags) {
     if (arch_flag.cc) {
       // Temporarily add this arch flag.
       compiler_options.push_back(static_cast<Option>(arch_flag));
     }
-
+    compiler_options.push_back(Option("-DJITIFY_PREPROCESS_ONLY"));
+    compiler_options.push_back(Option("-DJITIFY_USED_HEADER_WARNINGS"));
     std::string compiler_options_msg = detail::string_join(
         compiler_options, " ", "Compiler options: \"", "\"\n");
     std::string compile_error;
-    while (!detail::compile_program(name, source, header_sources,
-                                    compiler_options, &compile_error,
-                                    &compile_log)) {
-      std::string include_name, include_parent;
-      int line_num = 0;
-      if (!detail::extract_include_info_from_compile_error(
-              compile_log, &include_name, &include_parent, &line_num)) {
-        // There was a non include-related compilation error.
-        return Error("Compilation failed: " + compile_error + "\n" +
-                     compiler_options_msg + header_log + compile_log);
-      }
-
-      bool is_included_with_quotes = false;
-      if (header_sources.count(include_parent)) {
-        const std::string& parent_source = header_sources.at(include_parent);
-        std::string parse_error;
-        is_included_with_quotes = detail::is_include_directive_with_quotes(
-            parent_source, line_num, &parse_error);
-        if (!parse_error.empty()) {
-          // TODO: This happens with at least one Thrust header due to our
-          // parsing not being robust enough. For now we just ignore it instead.
-          // return Error("Internal parsing error for " + include_parent + ":" +
-          //             std::to_string(line_num) + ": " + parse_error);
-          // TODO: Print a warning message, but only if the "-w" option is not
-          // on. std::cerr << "Warning [jitify]: Internal parsing error for "
-          //          << include_parent << ":" << line_num << ": " <<
-          //          parse_error;
-        }
-      }
-
-      // Try to load the new header.
-      // Note: This fullpath lookup is needed because the compiler error
-      // messages have the include name of the header instead of its full path.
-      std::string include_parent_fullpath = header_fullpaths[include_parent];
-      std::string include_path = detail::path_base(include_parent_fullpath);
-
-      using detail::HeaderLoadStatus;
-      HeaderLoadStatus load_status =
-          detail::load_header(include_name, include_paths, include_path,
-                              /*search_current_dir = */ is_included_with_quotes,
-                              use_builtin_headers, header_callback,
-                              &header_sources, &header_fullpaths);
-      if (load_status != HeaderLoadStatus::FAILED) {
-        const std::string& header_fullpath = header_fullpaths.at(include_name);
-        if (load_status == HeaderLoadStatus::NEWLY_LOADED) {
-          // Patch the newly-loaded header.
-          bool is_cuda_std_header =
-              header_fullpath.find(detail::kJitifyBuiltinHeaderPrefix) == 0 ||
-              // TODO: More robust way to detect this?
-              header_fullpath.find(detail::path_join(
-                  detail::path_join("cuda", "std"), "")) != std::string::npos;
-          std::string* header_source = &header_sources.at(include_name);
-          if (!is_cuda_std_header) {
-            *header_source = detail::patch_cuda_source(
-                *header_source, use_cuda_std, replace_pragma_once);
-          }
-          if (minify) {
-            *header_source = detail::minify_cpp_source(*header_source);
-          }
-        }
-        // Log where the header was found.
-        header_log += detail::string_join(
-            {"Found #include ", (is_included_with_quotes ? "\"" : "<"),
-             include_name, (is_included_with_quotes ? "\"" : ">"), " from ",
-             include_parent, ":", std::to_string(line_num), " [",
-             include_parent_fullpath, "]", " at:\n  ", header_fullpath, "\n"},
-            "");
-      } else {
-        // Missing header.
-        std::string current_dir_msg =
-            "Current path: \"" + include_path + "\"\n";
-        return Error("Preprocessing failed: Header not found\n" + header_log +
-                     include_paths_msg + compiler_options_msg +
-                     current_dir_msg + include_parent + "(" +
-                     std::to_string(line_num) + "): error: " + include_name +
-                     ": [jitify] File not found");
-      }
+    // Note: This should always fail because of the stop-compilation #include.
+    const nvrtcResult compile_result =
+        detail::compile_program(program_name, program_source, header_sources,
+                                compiler_options, &compile_error, &compile_log);
+    assert(compile_result != NVRTC_SUCCESS);
+    if (compile_result != NVRTC_ERROR_COMPILATION) {
+      // There was something wrong with the compilation (e.g., invalid option).
+      return Error("Compilation failed: " + compile_error + "\n" +
+                   compiler_options_msg + header_log + compile_log);
+    }
+    compiler_options.pop_back();  // Remove -DJITIFY_USED_HEADER_WARNINGS
+    compiler_options.pop_back();  // Remove -DJITIFY_PREPROCESS_ONLY
+
+    detail::remove_stop_compilation_error(&compile_log);
+    detail::extract_used_header_warnings(&compile_log, &used_header_fullpaths);
+    if (compile_log.find(": error: ") != std::string::npos ||
+        compile_log.find(": catastrophic error: ") != std::string::npos) {
+      // There were real compilation errors.
+      return Error("Compilation failed: " + compile_error + "\n" +
+                   compiler_options_msg + header_log + compile_log);
     }
 
     if (arch_flag.cc) {
@@ -6345,18 +7547,35 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     }
   }
 
-  // Remove the program source from header_sources now that processing is done.
-  header_sources.erase(name);
+  // Remove unused headers from header_sources.
+  for (auto it = header_sources.begin(); it != header_sources.end();) {
+    const std::string& name = it->first;
+    // Note that this parses patched names back into IncludeName.
+    IncludeName include_name(name);
+    assert(include_to_fullpath.count(include_name));
+    const std::string& fullpath = include_to_fullpath.at(include_name);
+    if (!used_header_fullpaths.count(fullpath) &&
+        // WAR for CUB header that is full of host-only code.
+        !detail::endswith(fullpath, "cub/util_device.cuh")) {
+      it = header_sources.erase(it);
+    } else {
+      ++it;
+    }
+  }
 
+  // Re-add the --disable-warnings flag if it was provided.
+  if (disable_warnings) {
+    compiler_options.push_back(Option("-w"));
+  }
   // Re-add the -remove-unused-globals flag if it was provided.
   if (should_remove_unused_globals) {
     compiler_options.push_back(Option("-remove-unused-globals"));
   }
 
   return PreprocessedProgram(
-      std::move(name), std::move(source), std::move(header_sources),
-      std::move(compiler_options), std::move(linker_options),
-      std::move(header_log), std::move(compile_log));
+      std::move(program_name), std::move(program_source),
+      std::move(header_sources), std::move(compiler_options),
+      std::move(linker_options), std::move(header_log), std::move(compile_log));
 }
 
 /*! An object containing CUDA source and header strings and associated metadata.
@@ -6405,9 +7624,9 @@ class ProgramData : public serialization::Serializable<ProgramData> {
    *  \return A PreprocessedProgram object that contains either a valid
    *    PreprocessedProgramData object or an error state.
    */
-  PreprocessedProgram preprocess(OptionsVec compiler_options = {},
-                                 OptionsVec linker_options = {},
-                                 FileCallback header_callback = nullptr) const {
+  PreprocessedProgram preprocess(
+      OptionsVec compiler_options = {}, OptionsVec linker_options = {},
+      HeaderCallback header_callback = nullptr) const {
     return PreprocessedProgram::preprocess(name_, source_, header_sources_,
                                            compiler_options, linker_options,
                                            header_callback);
diff --git a/jitify2_test.cu b/jitify2_test.cu
index e53bc96..2f376e8 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -38,6 +38,13 @@
 #include <string>
 #include <vector>
 
+#if defined _WIN32 || defined _WIN64
+#include <direct.h>
+#define chdir _chdir
+#else
+#include <unistd.h>
+#endif
+
 #include "gtest/gtest.h"
 
 #define CHECK_CUDA(call)                                                  \
@@ -635,10 +642,10 @@ TEST(Jitify2Test, PathSimplify) {
   EXPECT_EQ(jitify2::detail::path_simplify("/foo/bar/"), "/foo/bar/");
   EXPECT_EQ(jitify2::detail::path_simplify("foo/../bar/"), "bar/");
   EXPECT_EQ(jitify2::detail::path_simplify("/foo/../bar/"), "/bar/");
-  EXPECT_EQ(jitify2::detail::path_simplify("/../foo"), "");  // Invalid path
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/../../bar"), // Invalid path
+  EXPECT_EQ(jitify2::detail::path_simplify("/../foo"), "");    // Invalid path
+  EXPECT_EQ(jitify2::detail::path_simplify("/foo/../../bar"),  // Invalid path
             "");
-  EXPECT_EQ(jitify2::detail::path_simplify("/.."), "");  // Invalid path
+  EXPECT_EQ(jitify2::detail::path_simplify("/.."), "");         // Invalid path
   EXPECT_EQ(jitify2::detail::path_simplify("/foo/../.."), "");  // Invalid path
 #if defined _WIN32 || defined _WIN64
   EXPECT_EQ(jitify2::detail::path_simplify(R"(\)"), R"(\)");
@@ -650,8 +657,7 @@ TEST(Jitify2Test, PathSimplify) {
   EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo\..\bar)"), R"(\bar)");
   EXPECT_EQ(jitify2::detail::path_simplify(R"(foo\..\bar)"), R"(bar)");
 
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar)"),
-            R"(\foo/bar)");
+  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar)"), R"(\foo/bar)");
   EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\./cat)"),
             R"(\foo/bar\cat)");
   EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\../cat)"),
@@ -697,11 +703,18 @@ TEST(Jitify2Test, PreprocessedProgram) {
 __global__ void my_kernel() {}
 )";
   static const char* const header_name = "my_header1.cuh";
+
+  Program empty_program(name, "");
+  ASSERT_EQ(get_error(empty_program), "");
+  PreprocessedProgram empty_preprog =
+      empty_program->preprocess({"-no-preinclude-workarounds"});
+  ASSERT_TRUE(static_cast<bool>(empty_preprog));
+
   Program program(name, source);
   ASSERT_EQ(get_error(program), "");
   PreprocessedProgram preprog = program->preprocess();
   ASSERT_EQ(static_cast<bool>(preprog), false);
-  EXPECT_TRUE(CONTAINS(preprog.error(), "File not found"));
+  EXPECT_TRUE(CONTAINS(preprog.error(), "could not open source file"));
   preprog = program->preprocess({"-Iexample_headers"}, {"-lfoo"});
   ASSERT_EQ(get_error(preprog), "");
   EXPECT_EQ(preprog->name(), name);
@@ -711,6 +724,89 @@ __global__ void my_kernel() {}
   EXPECT_EQ(preprog->remaining_linker_options(), StringVec({"-lfoo"}));
   EXPECT_NE(preprog->header_log(), "");
   EXPECT_EQ(preprog->compile_log(), "");
+
+  // Ensure that the --disable-warnings flag doesn't break things.
+  preprog = program->preprocess(
+      {"-Iexample_headers", "--disable-warnings", "-w"}, {"-lfoo"});
+  ASSERT_EQ(get_error(preprog), "");
+  CompiledProgram compiled = preprog->compile();
+  ASSERT_EQ(get_error(compiled), "");
+}
+
+TEST(Jitify2Test, ExplicitHeaderSources) {
+  // This test checks how the keys in a user-provided header_sources map are
+  // matched to #include directives in the source.
+  static const std::string good_header = R"()";
+  static const std::string bad_header =
+      R"(#error TEST FAIL: WRONG HEADER FOUND)";
+  static const std::string angle_header = R"(#include <bar>)";
+  static const std::string quote_header = R"(#include "bar")";
+  static const std::string quote_abs_header = R"(#include "/bar")";
+  PreprocessedProgram preprog;
+  preprog = Program("my_program", R"(#include <foo>)", {{"foo", good_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog =
+      Program("my_program", R"(#include <foo/bar>)", {{"foo/bar", good_header}})
+          ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include <foo/angle>)",
+                    {{"foo/angle", angle_header},
+                     {"bar", good_header},
+                     {"foo/bar", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include "foo")",
+                    {{"foo", good_header}, {"/foo", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include "foo/bar")",
+                    {{"foo/bar", good_header},
+                     {"/foo/bar", bad_header},
+                     {"bar", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include <foo/quote>)",
+                    {{"foo/quote", quote_header},
+                     {"foo/bar", good_header},
+                     {"bar", bad_header},
+                     {"/foo/bar", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include </foo/bar>)",
+                    {{"/foo/bar", good_header},
+                     {"foo/bar", bad_header},
+                     {"bar", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include "/foo/bar")",
+                    {{"/foo/bar", good_header},
+                     {"foo/bar", bad_header},
+                     {"bar", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", R"(#include <foo/quote_abs>)",
+                    {{"foo/quote_abs", quote_abs_header},
+                     {"/bar", good_header},
+                     {"bar", bad_header},
+                     {"/foo/bar", bad_header},
+                     {"foo/bar", bad_header}})
+                ->preprocess();
+  ASSERT_EQ(get_error(preprog), "");
+}
+
+TEST(Jitify2Test, CurrentExeIncludePath) {
+  static const std::string source = R"(
+#include <example_headers/my_header1.cuh>
+)";
+  // Enter the subdir before arming the guard that restores the cwd on exit.
+  ASSERT_EQ(::chdir("example_headers"), 0);
+  std::unique_ptr<const char, int (*)(const char*)> cd_back("..", ::chdir);
+  // -I. must expand to the executable directory, not the working directory.
+  PreprocessedProgram preprog =
+      Program("my_program", source)->preprocess({"-I."});
+  ASSERT_EQ(get_error(preprog), "");
+  ASSERT_EQ(get_error(preprog->compile()), "");
 }
 
 TEST(Jitify2Test, CompiledProgram) {
@@ -1527,14 +1623,6 @@ __global__ void my_kernel() {}
 TEST(Jitify2Test, Thrust) {
   // clang-format off
   static const char* const source = R"(
-// WAR for header include issue (note: order of includes matters):
-//   https://github.com/NVIDIA/jitify/issues/107#issuecomment-1225617951
-#include <cuda/std/cstdint>
-#include <cuda/std/cstddef>
-#include <cuda/std/type_traits>
-#include <cuda/std/limits>
-namespace std { using ::ptrdiff_t; }
-
 #include <thrust/iterator/counting_iterator.h>
 __global__ void my_kernel(thrust::counting_iterator<int> begin,
                           thrust::counting_iterator<int> end) {
@@ -1880,6 +1968,439 @@ TEST(Jitify2Test, SerializationGoldensLinkedProgram) {
   });
 }
 
+// Test helper: tokenizes `source` with CppLexer and checks the result.
+//   source:                 the text to lex.
+//   expected_types:         expected token types, in lexing order.
+//   expected_token_strings: expected token texts, in the same order.
+// Failures are non-fatal (EXPECT_EQ), so the calling test keeps running.
+void expect_tokenization(const char* source,
+                         std::vector<parser::Token::Type> expected_types,
+                         std::vector<std::string> expected_token_strings) {
+  using namespace jitify2::parser;
+  std::vector<Token::Type> actual_types;
+  std::vector<std::string> actual_strings;
+  CppLexer lexer(source);
+  // Stop at the first token that tests false (presumably end of input).
+  for (Token tok = lexer.next(); tok; tok = lexer.next()) {
+    actual_types.push_back(tok.type());
+    // Copy the text so the comparison below uses owned std::strings.
+    actual_strings.push_back(std::string(tok.token_string()));
+  }
+  // Compare the whole sequences at once.
+  EXPECT_EQ(actual_types, expected_types);
+  EXPECT_EQ(actual_strings, expected_token_strings);
+}
+
+TEST(Jitify2ParserTest, SingleTokens) {  // each input lexes to one token
+  using namespace jitify2::parser;
+  using Tt = Token::Type;
+  expect_tokenization("(", {Tt::kLParen}, {"("});
+  expect_tokenization(")", {Tt::kRParen}, {")"});
+  expect_tokenization("[", {Tt::kLBracket}, {"["});
+  expect_tokenization("<:", {Tt::kLBracket}, {"<:"});  // digraph for [
+  expect_tokenization("]", {Tt::kRBracket}, {"]"});
+  expect_tokenization(":>", {Tt::kRBracket}, {":>"});  // digraph for ]
+  expect_tokenization("{", {Tt::kLBrace}, {"{"});
+  expect_tokenization("<%", {Tt::kLBrace}, {"<%"});  // digraph for {
+  expect_tokenization("}", {Tt::kRBrace}, {"}"});
+  expect_tokenization("%>", {Tt::kRBrace}, {"%>"});  // digraph for }
+  expect_tokenization(".", {Tt::kDot}, {"."});
+  expect_tokenization(".*", {Tt::kDotStar}, {".*"});
+  expect_tokenization("->", {Tt::kArrow}, {"->"});
+  expect_tokenization("->*", {Tt::kArrowStar}, {"->*"});
+  expect_tokenization(",", {Tt::kComma}, {","});
+  expect_tokenization("+", {Tt::kPlus}, {"+"});
+  expect_tokenization("++", {Tt::kPlusPlus}, {"++"});
+  expect_tokenization("+=", {Tt::kPlusEq}, {"+="});
+  expect_tokenization("-", {Tt::kMinus}, {"-"});
+  expect_tokenization("--", {Tt::kMinusMinus}, {"--"});
+  expect_tokenization("-=", {Tt::kMinusEq}, {"-="});
+  expect_tokenization("*", {Tt::kStar}, {"*"});
+  expect_tokenization("*=", {Tt::kStarEq}, {"*="});
+  expect_tokenization("/", {Tt::kSlash}, {"/"});
+  expect_tokenization("/=", {Tt::kSlashEq}, {"/="});
+  expect_tokenization("%", {Tt::kPercent}, {"%"});
+  expect_tokenization("%=", {Tt::kPercentEq}, {"%="});
+  expect_tokenization("?", {Tt::kQuestion}, {"?"});
+  expect_tokenization(":", {Tt::kColon}, {":"});
+  expect_tokenization("::", {Tt::kColonColon}, {"::"});
+  expect_tokenization("&", {Tt::kAmp}, {"&"});
+  expect_tokenization("&&", {Tt::kAmpAmp}, {"&&"});
+  expect_tokenization("&=", {Tt::kAmpEq}, {"&="});
+  expect_tokenization("|", {Tt::kBar}, {"|"});
+  expect_tokenization("||", {Tt::kBarBar}, {"||"});
+  expect_tokenization("|=", {Tt::kBarEq}, {"|="});
+  expect_tokenization("^", {Tt::kCaret}, {"^"});
+  expect_tokenization("^=", {Tt::kCaretEq}, {"^="});
+  expect_tokenization("~", {Tt::kTilde}, {"~"});
+  expect_tokenization("=", {Tt::kEq}, {"="});
+  expect_tokenization("==", {Tt::kEqEq}, {"=="});
+  expect_tokenization("!", {Tt::kBang}, {"!"});
+  expect_tokenization("!=", {Tt::kBangEq}, {"!="});
+  expect_tokenization("<", {Tt::kLt}, {"<"});
+  expect_tokenization("<<", {Tt::kLtLt}, {"<<"});
+  expect_tokenization("<=", {Tt::kLtEq}, {"<="});
+  expect_tokenization("<<=", {Tt::kLtLtEq}, {"<<="});
+  expect_tokenization(">", {Tt::kGt}, {">"});
+  expect_tokenization(">>", {Tt::kGtGt}, {">>"});
+  expect_tokenization(">=", {Tt::kGtEq}, {">="});
+  expect_tokenization(">>=", {Tt::kGtGtEq}, {">>="});
+  expect_tokenization("#", {Tt::kHash}, {"#"});
+  expect_tokenization("%:", {Tt::kHash}, {"%:"});  // digraph for #
+  expect_tokenization("##", {Tt::kHashHash}, {"##"});
+  expect_tokenization("%:%:", {Tt::kHashHash}, {"%:%:"});  // digraph for ##
+  expect_tokenization(";", {Tt::kSemicolon}, {";"});
+  expect_tokenization(" ", {Tt::kWhitespace}, {" "});
+  expect_tokenization("\f", {Tt::kWhitespace}, {"\f"});
+  expect_tokenization("\r", {Tt::kWhitespace}, {"\r"});
+  expect_tokenization("\t", {Tt::kWhitespace}, {"\t"});
+  expect_tokenization("\v", {Tt::kWhitespace}, {"\v"});
+  expect_tokenization("\n", {Tt::kWhitespace}, {"\n"});
+  expect_tokenization("0123", {Tt::kNumber}, {"0123"});
+  expect_tokenization("123", {Tt::kNumber}, {"123"});
+  expect_tokenization("0x1F", {Tt::kNumber}, {"0x1F"});
+  expect_tokenization("0b10", {Tt::kNumber}, {"0b10"});  // binary literal
+  expect_tokenization("123u", {Tt::kNumber}, {"123u"});
+  expect_tokenization("123LLu", {Tt::kNumber}, {"123LLu"});
+  expect_tokenization("\"str\"", {Tt::kString}, {"\"str\""});
+  expect_tokenization("u\"str\"", {Tt::kString}, {"u\"str\""});
+  expect_tokenization("u8\"str\"", {Tt::kString}, {"u8\"str\""});
+  expect_tokenization("U\"str\"", {Tt::kString}, {"U\"str\""});
+  expect_tokenization("L\"str\"", {Tt::kString}, {"L\"str\""});
+  expect_tokenization(R"("a \n\"b\"")", {Tt::kString}, {R"("a \n\"b\"")"});
+  expect_tokenization("R\"xx(str)xx\"", {Tt::kRawString}, {"R\"xx(str)xx\""});
+  expect_tokenization("uR\"xx(str)xx\"", {Tt::kRawString}, {"uR\"xx(str)xx\""});
+  expect_tokenization("u8R\"xx(str)xx\"", {Tt::kRawString},
+                      {"u8R\"xx(str)xx\""});
+  expect_tokenization("UR\"xx(str)xx\"", {Tt::kRawString}, {"UR\"xx(str)xx\""});
+  expect_tokenization("LR\"xx(str)xx\"", {Tt::kRawString}, {"LR\"xx(str)xx\""});
+  expect_tokenization(R"yy(R"xx(a\nb
+\c\\")xx")yy",
+                      {Tt::kRawString}, {R"yy(R"xx(a\nb
+\c\\")xx")yy"});
+  expect_tokenization("'c'", {Tt::kCharacter}, {"'c'"});
+  expect_tokenization("u'c'", {Tt::kCharacter}, {"u'c'"});
+  expect_tokenization("u8'c'", {Tt::kCharacter}, {"u8'c'"});
+  expect_tokenization("U'c'", {Tt::kCharacter}, {"U'c'"});
+  expect_tokenization("L'c'", {Tt::kCharacter}, {"L'c'"});
+  expect_tokenization(R"('\'')", {Tt::kCharacter}, {R"('\'')"});
+  expect_tokenization(R"('\\')", {Tt::kCharacter}, {R"('\\')"});
+  expect_tokenization(R"('\n')", {Tt::kCharacter}, {R"('\n')"});
+  expect_tokenization("abc_DEF1", {Tt::kIdentifier}, {"abc_DEF1"});
+  expect_tokenization("u", {Tt::kIdentifier}, {"u"});  // bare literal prefix
+  expect_tokenization("u8", {Tt::kIdentifier}, {"u8"});
+  expect_tokenization("U", {Tt::kIdentifier}, {"U"});
+  expect_tokenization("L", {Tt::kIdentifier}, {"L"});
+  expect_tokenization("uabc", {Tt::kIdentifier}, {"uabc"});
+  expect_tokenization("u8abc", {Tt::kIdentifier}, {"u8abc"});
+  expect_tokenization("Uabc", {Tt::kIdentifier}, {"Uabc"});
+  expect_tokenization("Labc", {Tt::kIdentifier}, {"Labc"});
+  expect_tokenization("class", {Tt::kKeyword}, {"class"});
+  expect_tokenization("not", {Tt::kKeyword}, {"not"});  // alternative token
+  expect_tokenization("consteval", {Tt::kKeyword}, {"consteval"});
+  expect_tokenization("__device__", {Tt::kKeyword}, {"__device__"});  // CUDA
+  expect_tokenization("__constant__", {Tt::kKeyword}, {"__constant__"});
+  expect_tokenization("// A comment", {Tt::kComment}, {"// A comment"});
+  expect_tokenization("// A \"comment\"", {Tt::kComment}, {"// A \"comment\""});
+  expect_tokenization("/* A comment\n*/", {Tt::kComment}, {"/* A comment\n*/"});
+  expect_tokenization("/* A \"comment\"\n*/", {Tt::kComment},
+                      {"/* A \"comment\"\n*/"});
+}
+
+TEST(Jitify2ParserTest, MultipleTokens) {
+  using namespace jitify2::parser;
+  using Tt = Token::Type;
+  // An escaped backslash before the closing quote must not escape that quote.
+  expect_tokenization(R"('\\';)", {Tt::kCharacter, Tt::kSemicolon},
+                      {R"('\\')", ";"});
+  expect_tokenization(R"("\\";)", {Tt::kString, Tt::kSemicolon},
+                      {R"("\\")", ";"});
+  // An unterminated string ends at the newline; the next line lexes afresh.
+  expect_tokenization(R"("foo
+";)",
+                      {Tt::kString, Tt::kWhitespace, Tt::kString},
+                      {R"("foo)", "\n", R"(";)"});
+  // In #include names backslashes are literal, so \" and \> still terminate.
+  expect_tokenization(
+      R"(#include "x\n\\y\"//comment)",
+      {Tt::kHash, Tt::kIdentifier, Tt::kWhitespace, Tt::kString, Tt::kComment},
+      {"#", "include", " ", R"("x\n\\y\")", "//comment"});
+  expect_tokenization(
+      R"(#include <x\n\\y\>//comment)",
+      {Tt::kHash, Tt::kIdentifier, Tt::kWhitespace, Tt::kString, Tt::kComment},
+      {"#", "include", " ", R"(<x\n\\y\>)", "//comment"});
+}
+
+TEST(Jitify2ParserTest, AlternativeOperatorRepresentations) {
+  using namespace jitify2::parser;
+  static const char* const source1 = "vector<::std::string>";
+  CppLexer lexer(source1);
+  ASSERT_EQ(lexer.next().token_string(), "vector");
+  ASSERT_EQ(lexer.next().token_string(), "<");  // not the "<:" digraph
+  ASSERT_EQ(lexer.next().token_string(), "::");
+  ASSERT_EQ(lexer.next().token_string(), "std");
+  ASSERT_EQ(lexer.next().token_string(), "::");
+  ASSERT_EQ(lexer.next().token_string(), "string");
+  // "<::" directly followed by ">" does start with the "<:" digraph.
+  static const char* const source2 = "(argv<::>)";
+  lexer = CppLexer(source2);
+  ASSERT_EQ(lexer.next().token_string(), "(");
+  ASSERT_EQ(lexer.next().token_string(), "argv");
+  ASSERT_EQ(lexer.next().token_string(), "<:");
+  ASSERT_EQ(lexer.next().token_string(), ":>");
+  ASSERT_EQ(lexer.next().token_string(), ")");
+  // "<::" directly followed by ":" also starts with "<:", leaving "::".
+  static const char* const source3 = "foo<:::std::string>";
+  lexer = CppLexer(source3);
+  ASSERT_EQ(lexer.next().token_string(), "foo");
+  ASSERT_EQ(lexer.next().token_string(), "<:");
+  ASSERT_EQ(lexer.next().token_string(), "::");
+  ASSERT_EQ(lexer.next().token_string(), "std");
+  ASSERT_EQ(lexer.next().token_string(), "::");
+  ASSERT_EQ(lexer.next().token_string(), "string");
+}
+
+TEST(Jitify2ParserTest, Minify) {
+  using namespace jitify2::parser;
+  // Drops redundant whitespace but keeps gaps that separate tokens.
+  static const char* const source = R"~(#pragma once
+#define BAR -1
+#define CAT ( x )
+#define CAT2 (x) + (y)
+#define DOG( x )
+#define DOG2( x ) x
+#define DOG3( x ) ( x )
+#define DOG4( x ) (x) + (y)
+#define DOG5( x )x 
+#define DOG6( x )( x )
+
+#define FOO            \
+  do {                 \
+    printf("error\n"); \
+  while (0)
+
+#define _STR(x) #x
+#define STR(x) _STR(x)
+#define PLUS +
+const char* str_suf1 = "foo"s;
+const char* str_suf2 = "foo" STR(bar) "bar";
+#pragma once
+const char* rstr_suf1 = R"(foo)"s;
+const char* rstr_suf2 = R"(foo)" s;
+char char_suf = 'c's;
+#pragma once
+unsigned num_suf1 = 123u;
+unsigned num_suf2 = 123 PLUS 1;
+
+#pragma once
+c += a++ + b;
+c += a + ++b;
+c += a +++ b;
+c += a++++ + ++++b;
+
+a : ::b;
+)~";
+  static const char* const minified_source = R"~(#pragma once
+#define BAR -1
+#define CAT (x)
+#define CAT2 (x)+(y)
+#define DOG(x)
+#define DOG2(x)x
+#define DOG3(x)(x)
+#define DOG4(x)(x)+(y)
+#define DOG5(x)x
+#define DOG6(x)(x)
+#define FOO do{printf("error\n");while(0)
+#define _STR(x)#x
+#define STR(x)_STR(x)
+#define PLUS +
+const char*str_suf1="foo"s;const char*str_suf2="foo" STR(bar)"bar";
+#pragma once
+const char*rstr_suf1=R"(foo)"s;const char*rstr_suf2=R"(foo)" s;char char_suf='c's;
+#pragma once
+unsigned num_suf1=123u;unsigned num_suf2=123 PLUS 1;
+#pragma once
+c+=a+++b;c+=a+ ++b;c+=a+++b;c+=a+++++ ++++b;a: ::b;)~";
+  auto tokens = CppLexer::tokenize<TokenSequence>(source);
+  const int cxx_standard_year = 20;
+  std::string processed_source;
+  minify_cuda_source(tokens.begin(), tokens.end(), cxx_standard_year,
+                     &processed_source);
+  EXPECT_EQ(processed_source, minified_source);
+}
+
+TEST(Jitify2ParserTest, ProcessCudaSource) {
+  using namespace jitify2::parser;
+  static const char* const source = R"~(
+# /*blah*/pragma /*blah*/once  // blah
+const char* include = "#include <x.h>";
+#pragma once
+#pragma once
+// A comment.
+//using std::array;
+using std::array;
+using ::std::array;
+# /*blah*/ inclu\
+de /*blah*/ <a.h> /*blah*/ // blah
+#line 1
+const char* bar = "#include \"y.h\"";
+# /*blah*/ include /*blah*/ "b.h" /*blah*/ // blah
+#include "foo/c.h"
+#include <foo/c.h>
+const char* cat = R"blah(#include "z.h")blah" "dog";
+int i = cat[0 + 1];
+)~";
+  static const char* const expected =
+      R"~(#ifndef JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+#define JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+#ifdef JITIFY_USED_HEADER_WARNINGS
+#warning JITIFY_USED_HEADER "./my_header.cuh"
+#endif
+#line 1
+
+const char* include = "#include <x.h>";
+// A comment.
+//using std::array;
+using cuda::std::array;
+using ::cuda::std::array;
+# /*blah*/ inclu\
+de /*blah*/ <a.h> /*blah*/ // blah
+#line 1
+const char* bar = "#include \"y.h\"";
+# /*blah*/ include /*blah*/ <__jitify_rel_inc:.:__jitify_name:b.h> /*blah*/ // blah
+#include <__jitify_rel_inc:.:__jitify_name:foo/c.h>
+#include <foo/c.h>
+const char* cat = R"blah(#include "z.h")blah" "dog";
+int i = cat[0 + 1];
+#endif // JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+)~";
+  static const char* const expected_minified =
+      R"~(#ifndef JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+#define JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+#ifdef JITIFY_USED_HEADER_WARNINGS
+#warning JITIFY_USED_HEADER "./my_header.cuh"
+#endif
+#line 1
+const char*include="#include <x.h>";using cuda::std::array;using::cuda::std::array;
+#include<a.h>
+#line 1
+const char*bar="#include \"y.h\"";
+#include<__jitify_rel_inc:.:__jitify_name:b.h>
+#include<__jitify_rel_inc:.:__jitify_name:foo/c.h>
+#include<foo/c.h>
+const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
+#endif
+)~";
+  const int cxx_standard_year = -1;  // NOTE(review): presumably "unspecified"
+  std::string processed_source;
+  std::vector<IncludeName> includes;
+  std::string include_name = "my_header.cuh";
+  std::string current_dir = ".";
+  std::string include_fullpath = current_dir + "/" + include_name;
+  EXPECT_TRUE(process_cuda_source(
+                  source, include_fullpath,
+                  ProcessFlags::kReplacePragmaOnce | ProcessFlags::kReplaceStd |
+                      ProcessFlags::kAddUsedHeaderWarning,
+                  cxx_standard_year, &processed_source,
+                  [&](IncludeName include) { includes.push_back(include); })
+                  .empty());
+  std::vector<IncludeName> expected_includes = {  // in order of appearance
+      IncludeName("a.h"), IncludeName("b.h", current_dir),
+      IncludeName("foo/c.h", "."), IncludeName("foo/c.h")};
+  ASSERT_EQ(includes, expected_includes);
+  EXPECT_EQ(includes[0].location().file_name(), include_fullpath);
+  EXPECT_EQ(includes[0].location().line(), 11);  // <a.h>: physical line 11
+  EXPECT_EQ(includes[2].location().file_name(), include_fullpath);
+  EXPECT_EQ(includes[2].location().line(), 3);  // line 3 after "#line 1"
+  EXPECT_EQ(processed_source, expected);
+  includes.clear();  // second pass: same checks with kMinify added
+  EXPECT_TRUE(process_cuda_source(
+                  source, include_fullpath,
+                  ProcessFlags::kReplacePragmaOnce | ProcessFlags::kReplaceStd |
+                      ProcessFlags::kAddUsedHeaderWarning |
+                      ProcessFlags::kMinify,
+                  cxx_standard_year, &processed_source,
+                  [&](IncludeName include) { includes.push_back(include); })
+                  .empty());
+  ASSERT_EQ(includes, expected_includes);
+  EXPECT_EQ(includes[0].location().file_name(), include_fullpath);
+  EXPECT_EQ(includes[0].location().line(), 11);  // same as unminified pass
+  EXPECT_EQ(includes[2].location().file_name(), include_fullpath);
+  EXPECT_EQ(includes[2].location().line(), 3);  // same as unminified pass
+  EXPECT_EQ(processed_source, expected_minified);
+}
+
+TEST(Jitify2ParserTest, CppParserIterator) {
+  static const char* const source =
+      R"(
+#/*blah*/incl\
+ude/*blah*/<foo> //blah
+
+
+#/*blah*/line/*blah*/10
+#/*blah*/include/*blah*/"bar" //blah
+#/*blah*/li\
+ne/*blah*/20 "newfilename"
+# include "cat"
+)";
+  using namespace jitify2::parser;
+  using Tt = Token::Type;
+  auto tokens = CppLexer::tokenize<TokenSequence>(source);
+  auto iter = make_cpp_parser_iterator(tokens.begin(), tokens.end());
+  EXPECT_EQ(iter.line_number(), 2);  // line 1 of the raw string is empty
+  ASSERT_TRUE(iter.match(Tt::kHash));
+  EXPECT_EQ(iter.line_number(), 2);
+  ASSERT_TRUE(iter.match_identifier("include"));
+  EXPECT_EQ(iter.line_number(), 3);  // "incl\<newline>ude" ends on line 3
+  ASSERT_TRUE(iter.match(Tt::kString));
+  EXPECT_EQ(iter.line_number(), 3);
+  ASSERT_EQ(iter.previous_token().token_string(), "<foo>");
+  ASSERT_TRUE(iter.match(Tt::kEndOfDirective));
+  // The "#line 10" directive itself sits on physical line 6.
+  EXPECT_EQ(iter.line_number(), 6);
+  ASSERT_TRUE(iter.match(Tt::kHash));
+  EXPECT_EQ(iter.line_number(), 6);
+  ASSERT_TRUE(iter.match_identifier("line"));
+  EXPECT_EQ(iter.line_number(), 6);
+  ASSERT_TRUE(iter.match(Tt::kNumber));
+  EXPECT_EQ(iter.line_number(), 6);
+  ASSERT_EQ(iter.previous_token().token_string(), "10");
+  ASSERT_TRUE(iter.match(Tt::kEndOfDirective));
+  // The directive following "#line 10" is reported as line 10.
+  EXPECT_EQ(iter.line_number(), 10);
+  ASSERT_TRUE(iter.match(Tt::kHash));
+  EXPECT_EQ(iter.line_number(), 10);
+  ASSERT_TRUE(iter.match_identifier("include"));
+  EXPECT_EQ(iter.line_number(), 10);
+  ASSERT_TRUE(iter.match(Tt::kString));
+  EXPECT_EQ(iter.line_number(), 10);
+  ASSERT_EQ(iter.previous_token().token_string(), "\"bar\"");
+  ASSERT_TRUE(iter.match(Tt::kEndOfDirective));
+  // A "#line" split by a line continuation: it spans lines 11 and 12.
+  EXPECT_EQ(iter.line_number(), 11);
+  ASSERT_TRUE(iter.match(Tt::kHash));
+  EXPECT_EQ(iter.line_number(), 11);
+  ASSERT_TRUE(iter.match_identifier("line"));
+  EXPECT_EQ(iter.line_number(), 12);
+  ASSERT_TRUE(iter.match(Tt::kNumber));
+  EXPECT_EQ(iter.line_number(), 12);
+  ASSERT_EQ(iter.previous_token().token_string(), "20");
+  ASSERT_TRUE(iter.match(Tt::kString));
+  ASSERT_EQ(iter.previous_token().token_string(), "\"newfilename\"");
+  EXPECT_EQ(iter.line_number(), 12);
+  ASSERT_TRUE(iter.match(Tt::kEndOfDirective));
+  // After "#line 20", the final include is reported as line 20.
+  EXPECT_EQ(iter.line_number(), 20);
+  ASSERT_TRUE(iter.match(Tt::kHash));
+  EXPECT_EQ(iter.line_number(), 20);
+  ASSERT_TRUE(iter.match_identifier("include"));
+  EXPECT_EQ(iter.line_number(), 20);
+  ASSERT_TRUE(iter.match(Tt::kString));
+  EXPECT_EQ(iter.line_number(), 20);
+  ASSERT_EQ(iter.previous_token().token_string(), "\"cat\"");
+  ASSERT_TRUE(iter.match(Tt::kEndOfDirective));
+}
+
 int main(int argc, char** argv) {
   cudaSetDevice(0);
   // Initialize the driver context (avoids "initialization error"/"context is

From 619104fec8b576feaeaf74f4c0b0fd68b9cb6400 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:07:24 +1100
Subject: [PATCH 03/47] Add -D_FILE_OFFSET_BITS=64 to the build

- This is required for building on some systems, including
  manylinux2014, and it doesn't hurt on other systems.
---
 CMakeLists.txt | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0736f43..5cffae3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,10 @@ if (MSVC)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /O2")
 else()
   set(CMAKE_CXX_FLAGS
-      "${CMAKE_CXX_FLAGS} -O3 -Wall -Wextra -Wconversion -Wshadow -fmessage-length=80")
+      "${CMAKE_CXX_FLAGS} -O3 \
+      -Wall -Wextra -Wconversion -Wshadow -fmessage-length=80 \
+      -D_FILE_OFFSET_BITS=64 \
+      ")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g")
   if (ASAN)
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} \
@@ -32,7 +35,11 @@ if (MSVC)
     "${CMAKE_CUDA_FLAGS_RELEASE} -O3 -Xcompiler=\"/O2\"")
 else()
   set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -Xcompiler=\"-Wall -Wextra -Wconversion -Wshadow\" -O3 -rdc=true")
+    "${CMAKE_CUDA_FLAGS} -O3 \
+    -Xcompiler=\"-Wall -Wextra -Wconversion -Wshadow -fmessage-length=80 \" \
+    -D_FILE_OFFSET_BITS=64 \
+    -rdc=true \
+    ")
   set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g")
 endif()
 

From 91b74962cc4ef75709e93f331c9fdc823e3ee2e2 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:09:03 +1100
Subject: [PATCH 04/47] Undef all internal macros

---
 jitify2.hpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index be1cb51..cf805ba 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -8675,12 +8675,13 @@ class ProgramCache {
 
 }  // namespace jitify2
 
+#undef JITIFY_DEFINE_SERIALIZABLE_MEMBERS
+
 #ifndef JITIFY_SERIALIZATION_ONLY
 
 #undef JITIFY_PATH_MAX
-#undef JITIFY_THROW_OR_RETURN_IF_CUDA_ERROR
-#undef JITIFY_THROW_OR_RETURN
-#undef JITIFY_THROW_OR_TERMINATE
+#undef JITIFY_DEPRECATED
+#undef JITIFY_IF_THREAD_SAFE
 
 #if defined(_WIN32) || defined(_WIN64)
 #pragma pop_macro("max")
@@ -8688,6 +8689,10 @@ class ProgramCache {
 #pragma pop_macro("strtok_r")
 #endif
 
+#undef JITIFY_THROW_OR_RETURN_IF_CUDA_ERROR
+#undef JITIFY_THROW_OR_RETURN
+#undef JITIFY_THROW_OR_TERMINATE
+
 #endif  // not JITIFY_SERIALIZATION_ONLY
 
 #endif  // JITIFY2_HPP_INCLUDE_GUARD

From 6efe6332b552c7a573284b170a559035316f49d3 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:10:05 +1100
Subject: [PATCH 05/47] Minor formatting fixes

---
 jitify2.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index cf805ba..bbb658d 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -2110,9 +2110,7 @@ class ConfiguredKernelData {
   /*! Launch the configured kernel.
    *  \return An empty string on success, otherwise an error message.
    */
-  ErrorMsg launch() const {
-    return this->launch_raw(nullptr);
-  }
+  ErrorMsg launch() const { return this->launch_raw(nullptr); }
 };
 
 class ConfiguredKernel
@@ -7555,7 +7553,7 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     assert(include_to_fullpath.count(include_name));
     const std::string& fullpath = include_to_fullpath.at(include_name);
     if (!used_header_fullpaths.count(fullpath) &&
-        // // WAR for CUB header that is full of host-only code.
+        // WAR for CUB header that is full of host-only code.
         !detail::endswith(fullpath, "cub/util_device.cuh")) {
       it = header_sources.erase(it);
     } else {

From 7fe899d3061c1acfc573a6797c03f16d2bef7e73 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:11:26 +1100
Subject: [PATCH 06/47] Add deprecated attribute to FileCallback

---
 jitify2.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index bbb658d..ad1ffa4 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -4013,9 +4013,8 @@ using parser::IncludeName;  // Pull into main namespace
 using HeaderCallback =
     std::function<bool(const parser::IncludeName&, std::string*)>;
 
-// TODO: Mark with deprecated attribute.
-// Deprecated, use HeaderCallback instead.
-using FileCallback = HeaderCallback;
+using FileCallback JITIFY_DEPRECATED("Use HeaderCallback instead") =
+    HeaderCallback;
 
 class PreprocessedProgram
     : public detail::FallibleObjectBase<PreprocessedProgram,

From db439e49fcb80d652883198d350100b8cfe9d480 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:12:02 +1100
Subject: [PATCH 07/47] Add specific test for limits headers (issue 107)

---
 jitify2_test.cu | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/jitify2_test.cu b/jitify2_test.cu
index 2f376e8..8716b3a 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1775,6 +1775,20 @@ __global__ void my_kernel() {}
                     "-no-system-headers-workaround", "-no-replace-pragma-once"})
       ->get_kernel("my_kernel");
 }
+
+// See GitHub issue #107: <limits> combined with <cuda/std/limits>.
+TEST(Jitify2Test, LibCudaCxxAndBuiltinLimits) {
+  static const char* const source = R"(
+#include <limits>
+#include <cuda/std/limits>
+)";
+  // Both headers must preprocess and compile together without error.
+  PreprocessedProgram preprog =
+    Program("limits_program", source)->preprocess({"-I" CUDA_INC_DIR});
+  ASSERT_EQ(get_error(preprog), "");
+  CompiledProgram compiled = preprog->compile();
+  ASSERT_EQ(get_error(compiled), "");
+}
 #endif  // CUDA_VERSION >= 11000
 
 TEST(Jitify2Test, AssertHeader) {

From 25ab77b992c9f8e30b4b2cb757646c58b44f2d4d Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:13:13 +1100
Subject: [PATCH 08/47] Fix minor issues for MSVC build

---
 jitify2.hpp | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index ad1ffa4..c785526 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -42,10 +42,12 @@
 #define JITIFY2_HPP_INCLUDE_GUARD
 
 #include <algorithm>
+#include <array>
 #include <cassert>
 #include <climits>
 #include <initializer_list>
 #include <iostream>
+#include <iterator>
 #include <sstream>
 #include <streambuf>
 #include <string>
@@ -6189,7 +6191,9 @@ class CppLexer {
       case '"': return in_include_directive_ ? quote_include() : string();
       case 'u': match('8');
         // fall-through
+#ifdef __GNUC__
         [[gnu::fallthrough]];  // Not sure why gcc complains here without this
+#endif
       case 'L':
         // fall-through
       case 'U':
@@ -6874,13 +6878,14 @@ inline Iterator insert_directive_impl(TokenSequence* tokens, Iterator where,
   constexpr int kMaxNewTokens = 1 + 1 + (2 * N - 1) + 1;
   Token new_tokens[kMaxNewTokens];
   int j = 0;
-  Iterator before_where = where;
-  --before_where;
-  if (where != tokens->begin() && before_where->type() != Tt::kEndOfDirective &&
-      (before_where->type() != Tt::kWhitespace ||
-       before_where->num_unescaped_newlines() == 0)) {
-    // Must add newline before new directive.
-    new_tokens[j++] = Token(Tt::kWhitespace, "\n");
+  if (where != tokens->begin()) {
+    const Iterator before_where = std::prev(where);
+    if (before_where->type() != Tt::kEndOfDirective &&
+        (before_where->type() != Tt::kWhitespace ||
+         before_where->num_unescaped_newlines() == 0)) {
+      // Must add newline before new directive.
+      new_tokens[j++] = Token(Tt::kWhitespace, "\n");
+    }
   }
   new_tokens[j++] = Token(Tt::kHash, "#");
   for (int i = 0; i < N; ++i) {
@@ -7144,7 +7149,7 @@ HeaderLoadStatus load_header(const parser::IncludeName& include,
   *full_path = include.nonlocal_full_path(kJitifyCallbackHeaderPrefix);
   *full_path = path_simplify(*full_path);
   if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
-  if (header_callback and header_callback(include, &source)) {
+  if (header_callback && header_callback(include, &source)) {
     return newly_loaded(std::move(source));
   }
   // Try loading from current directory.

From c604e66a734d1c63ada0d7e1643e92db08abe410 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:13:52 +1100
Subject: [PATCH 09/47] Fix StringRef issue in path_is_absolute for C++17

---
 jitify2.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index c785526..0b1cd35 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -2295,7 +2295,7 @@ inline std::string path_base(const std::string& p) {
   }
 }
 
-inline bool path_is_absolute(const std::string& p) {
+inline bool path_is_absolute(StringRef p) {
 #if defined _WIN32 || defined _WIN64
   return (p.size() >= 1 && (p[0] == '\\' || p[0] == '/')) ||
          (p.size() >= 3 && p[1] == ':' && (p[2] == '\\' || p[2] == '/'));
@@ -3737,7 +3737,7 @@ inline void copy_compiler_flag_for_linker_ptxas(
     const Option linker_option =
         output_key.empty()
             ? compiler_option
-            : Option(std::string{output_key}, compiler_option.value());
+            : Option(std::string(output_key), compiler_option.value());
     linker_options->push_back(linker_option);
   }
 }

From 68b76f401c85cada5ad9a6adba2d5c4df5889bea Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 21 Nov 2023 21:14:50 +1100
Subject: [PATCH 10/47] Fix bad parsing in extract_used_header_warnings

- This makes it more robust to changes in formatting in the compiler
  log output between different nvrtc versions.
---
 jitify2.hpp | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 0b1cd35..db86455 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -7217,14 +7217,12 @@ inline bool extract_used_header_warnings(
       start = (size_t)-1;
     }
     ++start;
-    // Each full warning message is 4 lines.
-    for (int i = 0; i < 4; ++i) {
-      size_t new_end = compile_log->find_first_of('\n', end + 1);
-      if (new_end == std::string::npos) break;  // End of log
-      end = new_end;
-    }
-    ++end;
-    std::string tail = compile_log->substr(end);
+    std::string tail;
+    // Log messages are separated by a blank line.
+    end = compile_log->find("\n\n", end + 1);
+    if (end != std::string::npos) {
+      tail = compile_log->substr(end + 2);
+    }
     compile_log->resize(start);
     *compile_log += tail;
   }
@@ -7239,14 +7237,11 @@ inline bool extract_used_header_warnings(
         start = (size_t)-1;
       }
       ++start;
-      size_t end =
-          compile_log->find_first_of('\n', pos + std::strlen("-diag-suppress"));
-      assert(end != std::string::npos);
-      end = compile_log->find_first_of('\n', end + 1);
       std::string tail;
+      size_t end =
+          compile_log->find("\n\n", pos + std::strlen("-diag-suppress"));
       if (end != std::string::npos) {
-        ++end;
-        tail = compile_log->substr(end);
+        tail = compile_log->substr(end + 2);
       }
       compile_log->resize(start);
       *compile_log += tail;

From 64dbeceead4dacc066f9b1b2e3d164268d13c8ae Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 22 Nov 2023 13:07:22 +1100
Subject: [PATCH 11/47] Add tests for explicit headers with include path

---
 jitify2_test.cu | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/jitify2_test.cu b/jitify2_test.cu
index 8716b3a..3dc9c75 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -773,6 +773,22 @@ TEST(Jitify2Test, ExplicitHeaderSources) {
                      {"/foo/bar", bad_header}})
                 ->preprocess();
   ASSERT_EQ(get_error(preprog), "");
+  // Relative include takes precedence over -I path.
+  preprog = Program("my_program", R"(#include <foo/quote>)",
+                    {{"foo/quote", quote_header},
+                     {"foo/bar", good_header},
+                     {"bar", bad_header},
+                     {"/foo/bar", bad_header}})
+                ->preprocess({"-I."});
+  ASSERT_EQ(get_error(preprog), "");
+  // Finding a header at the root from a quote-include in a subdir requires
+  // explicitly passing the current dir as an include path ("-I.").
+  preprog = Program("my_program", R"(#include <foo/quote>)",
+                    {{"foo/quote", quote_header},
+                     {"bar", good_header},
+                     {"/foo/bar", bad_header}})
+                ->preprocess({"-I."});
+  ASSERT_EQ(get_error(preprog), "");
   preprog = Program("my_program", R"(#include </foo/bar>)",
                     {{"/foo/bar", good_header},
                      {"foo/bar", bad_header},

From 596722b4815d13235251230ee43b81d12ce4257b Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 22 Nov 2023 22:40:44 +1100
Subject: [PATCH 12/47] Handle absolute filenames in load_header

---
 jitify2.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index db86455..27bdcf8 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -7145,6 +7145,18 @@ HeaderLoadStatus load_header(const parser::IncludeName& include,
     return HeaderLoadStatus::kNewlyLoaded;
   };
   std::string source;
+  if (path_is_absolute(include.name())) {
+    // Handle absolute filename.
+    *full_path = include.name();
+    *full_path = path_simplify(*full_path);
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    // Try loading via callback or from the filesystem.
+    if ((header_callback && header_callback(include, &source)) ||
+        read_text_file(*full_path, &source)) {
+      return newly_loaded(std::move(source));
+    }
+    return HeaderLoadStatus::kFailed;
+  }
   // Try loading via callback.
   *full_path = include.nonlocal_full_path(kJitifyCallbackHeaderPrefix);
   *full_path = path_simplify(*full_path);

From 8ca6ca178d5a02fef8c928ec8ab2579e4e5fda1b Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 22 Nov 2023 22:53:33 +1100
Subject: [PATCH 13/47] Handle preincludes during preprocessing

---
 jitify2.hpp     |  7 +++++++
 jitify2_test.cu | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index 27bdcf8..4e4b274 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -7377,6 +7377,13 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   StringVec include_paths;
   detail::extract_include_paths(&compiler_options, &include_paths);
 
+  // Process preincludes as if they are <> includes.
+  for (int idx : compiler_options.find({"--pre-include", "-include"})) {
+    const std::string& preinclude = compiler_options[idx].value();
+    if (preinclude == "jitify_preinclude.h") continue;
+    include_queue.push(IncludeName(preinclude));
+  }
+
   // Recursively load and process all includes, putting them into the
   // include_to_fullpath and fullpath_to_source maps.
   std::string header_log;
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 3dc9c75..587a6f0 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -811,6 +811,24 @@ TEST(Jitify2Test, ExplicitHeaderSources) {
   ASSERT_EQ(get_error(preprog), "");
 }
 
+TEST(Jitify2Test, Preincludes) {
+  // This tests that preincludes get preprocessed and that absolute paths are
+  // handled correctly. Note that cuda.h includes <stdlib.h>, so it must be
+  // preprocessed by Jitify (not simply loaded directly by NVRTC) to work.
+  static const std::string source = R"(
+#ifndef CUDA_VERSION
+#error TEST FAILED
+#endif
+)";
+  PreprocessedProgram preprog;
+  preprog = Program("my_program", source)
+                ->preprocess({"--pre-include=" CUDA_INC_DIR "/cuda.h"});
+  ASSERT_EQ(get_error(preprog), "");
+  preprog = Program("my_program", source)
+                ->preprocess({"-include=" CUDA_INC_DIR "/cuda.h"});
+  ASSERT_EQ(get_error(preprog), "");
+}
+
 TEST(Jitify2Test, CurrentExeIncludePath) {
   static const std::string source = R"(
 #include <example_headers/my_header1.cuh>

From 4b82bddcc00e7245d839ed281ab4c3083fc2b12e Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 22 Nov 2023 23:32:20 +1100
Subject: [PATCH 14/47] Fix read_text_file for Windows line endings

---
 jitify2.hpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 4e4b274..e458e61 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5656,15 +5656,13 @@ inline bool read_text_file(const std::string& fullpath, std::string* content) {
   if (::fseek(file, 0, SEEK_SET)) return false;
   content->resize(size);
   // Note: This supports empty (size=0) files.
-  if ((long)::fread(&(*content)[0], 1, size, file) != size) return false;
-  // Crop off trailing null characters that may arise due to multi-character
-  // newline conversions (e.g., on Windows).
-  const size_t last_char_pos = content->find_last_not_of("\0");
-  if (last_char_pos == std::string::npos) {
-    content->resize(0);
-  } else {
-    content->resize(last_char_pos + 1);
+  const long bytes_read = (long)::fread(&(*content)[0], 1, size, file);
+  // Note: Newline conversions (e.g., on Windows) may cause ::fread to return
+  // < size on success, so we must use ::ferror to check for failure.
+  if (bytes_read != size && ::ferror(file)) {
+    return false;
   }
+  content->resize(bytes_read);
   return true;
 }
 

From 21e84dc6fa08244b2a7273be49c5ce134eea9b5d Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 8 May 2024 12:06:48 +1000
Subject: [PATCH 15/47] Support digit separators when parsing numbers

---
 jitify2.hpp     | 3 ++-
 jitify2_test.cu | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index e458e61..84a5038 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -6351,7 +6351,8 @@ class CppLexer {
     return token(Token::Type::kWhitespace);
   }
   Token number() {
-    while (is_alnum(peek())) advance();
+    const char digits_separator = '\'';
+    while (is_alnum(peek()) || peek() == digits_separator) advance();
     return token(Token::Type::kNumber);
   }
   Token string() {
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 587a6f0..6b44940 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -2110,6 +2110,8 @@ TEST(Jitify2ParserTest, SingleTokens) {
   expect_tokenization("0b10", {Tt::kNumber}, {"0b10"});
   expect_tokenization("123u", {Tt::kNumber}, {"123u"});
   expect_tokenization("123LLu", {Tt::kNumber}, {"123LLu"});
+  expect_tokenization("123'45'6", {Tt::kNumber}, {"123'45'6"});
+  expect_tokenization("0x12'34'56", {Tt::kNumber}, {"0x12'34'56"});
   expect_tokenization("\"str\"", {Tt::kString}, {"\"str\""});
   expect_tokenization("u\"str\"", {Tt::kString}, {"u\"str\""});
   expect_tokenization("u8\"str\"", {Tt::kString}, {"u8\"str\""});

From 37d3025decf08c2a643d1283983b1aec6382f0f5 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Sat, 15 Feb 2025 13:09:19 +1100
Subject: [PATCH 16/47] Add new nvJitLink enums to avoid warnings

---
 jitify2.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index 84a5038..6077b2e 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -1682,6 +1682,12 @@ class LibNvJitLink
     case NVJITLINK_ERROR_INTERNAL: return "NVJITLINK_ERROR_INTERNAL";
 #if CUDA_VERSION >= 12030
     case NVJITLINK_ERROR_THREADPOOL: return "NVJITLINK_ERROR_THREADPOOL";
+#endif
+#if CUDA_VERSION >= 12040
+    case NVJITLINK_ERROR_UNRECOGNIZED_INPUT: return "NVJITLINK_ERROR_UNRECOGNIZED_INPUT";
+#endif
+#if CUDA_VERSION >= 12060
+    case NVJITLINK_ERROR_FINALIZE: return "NVJITLINK_ERROR_FINALIZE";
 #endif
     }
     // clang-format on

From 188796a9de0fbfad9066cc8056f8f556f57a1bb1 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Sat, 15 Feb 2025 15:55:48 +1100
Subject: [PATCH 17/47] WAR NVRTC compilation issue for Thrust iterators

- This appears to be a bug related to inline namespaces.
- Also attempts to avoid warnings about incompatible redefinitions of
  constants.
---
 jitify2.hpp     | 10 ++++++++--
 jitify2_test.cu |  8 ++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 6077b2e..1989ca6 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -4149,9 +4149,15 @@ JITIFY_DEFINE_C_AND_CXX_HEADERS(limits, R"(
 #else
  #define ULONG_MAX  UINT_MAX
 #endif
-#define LLONG_MAX  0x7fffffffffffffff
+#ifndef LLONG_MAX
+#define LLONG_MAX  0x7fffffffffffffffLL
+#endif
+#ifndef LLONG_MIN
 #define LLONG_MIN  (-LLONG_MAX - 1)
-#define ULLONG_MAX 0xffffffffffffffff
+#endif
+#ifndef ULLONG_MAX
+#define ULLONG_MAX 0xffffffffffffffffULL
+#endif
 )",
                                 "");
 
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 6b44940..3d03d4b 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1657,7 +1657,15 @@ __global__ void my_kernel() {}
 TEST(Jitify2Test, Thrust) {
   // clang-format off
   static const char* const source = R"(
+// WAR for NVRTC issue causing compilation error:
+//   `namespace "thrust" has no actual member "iterator_core_access"`.
+#define THRUST_WRAPPED_NAMESPACE jitify_thrust_ns_war
+#define THRUST_DISABLE_ABI_NAMESPACE
+
 #include <thrust/iterator/counting_iterator.h>
+
+using namespace jitify_thrust_ns_war;  // Part of WAR above
+
 __global__ void my_kernel(thrust::counting_iterator<int> begin,
                           thrust::counting_iterator<int> end) {
 })";

From 6c7f7bbfb18fb72ba024ed43c14ba3a006db40c0 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Sat, 15 Feb 2025 15:57:45 +1100
Subject: [PATCH 18/47] WAR missing header in CUB

- This has been fixed in recent versions of CUB.
---
 jitify2_test.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/jitify2_test.cu b/jitify2_test.cu
index 3d03d4b..164d210 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1694,12 +1694,12 @@ TEST(Jitify2Test, CubBlockPrimitives) {
 #define ProcessFloatMinusZero BaseDigitExtractor<KeyT>::ProcessFloatMinusZero
 #endif
 
-// WAR for header include issue (note: order of includes matters):
-//   https://github.com/NVIDIA/jitify/issues/107#issuecomment-1225617951
-#include <cuda/std/cstdint>
-#include <cuda/std/cstddef>
-#include <cuda/std/type_traits>
-#include <cuda/std/limits>
+// WAR for issue in CUB shipped with CUDA 12.4-<12.8.
+// TODO(benbarsdell): Check exactly when this issue was fixed in CUB,
+// this is an upper-bound.
+#if CUB_VERSION < 200700
+#include <type_traits>
+#endif
 
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_radix_sort.cuh>

From 1cff309200cd0053e906775c062e932e9f07cf37 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Sat, 15 Feb 2025 13:11:58 +1100
Subject: [PATCH 19/47] Add NVTX ranges and track file cache stats

- Adds nvtx3 to the build.
- Adds NVTX ranges to key functions and ProgramCache hits/misses,
  gated by a macro JITIFY_ENABLE_NVTX (disabled by default).
- Adds tracking of file-cache hits/misses.
- Improves ProgramCache tests. Ensures that mem-miss file-hit is
  tested.
---
 CMakeLists.txt    | 13 +++++++-
 CMakeLists.txt.in | 10 ++++++
 jitify2.hpp       | 61 ++++++++++++++++++++++++++++++++++--
 jitify2_test.cu   | 79 ++++++++++++++++++++++++++---------------------
 4 files changed, 123 insertions(+), 40 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5cffae3..1ade917 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,6 +83,15 @@ add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
                  ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
                  EXCLUDE_FROM_ALL)
 
+# ----
+# NVTX
+# ----
+# Download and unpack nvtx at configure time.
+configure_file(CMakeLists.txt.in nvtx-download/CMakeLists.txt)
+add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/nvtx-src/c
+                 ${CMAKE_CURRENT_BINARY_DIR}/nvtx-build
+                 EXCLUDE_FROM_ALL)
+
 # ----
 # Executable utilities
 # ----
@@ -142,8 +151,10 @@ foreach(test ${TESTS})
   target_include_directories(${test} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   # Ensure the main jitify header can be found.
   target_include_directories(${test} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
-  target_link_libraries(${test} gtest_main)
+  target_link_libraries(${test} gtest_main nvtx3-cpp)
   set_property(TARGET ${test} PROPERTY CUDA_ARCHITECTURES OFF)
+  target_compile_definitions(${test}
+                               PUBLIC JITIFY_ENABLE_NVTX=1)
   if (${test} MATCHES "_static$")
     target_compile_definitions(${test}
                                PUBLIC JITIFY_LINK_CUDA_STATIC=1
diff --git a/CMakeLists.txt.in b/CMakeLists.txt.in
index 764f48d..f37e590 100644
--- a/CMakeLists.txt.in
+++ b/CMakeLists.txt.in
@@ -13,3 +13,13 @@ ExternalProject_Add(googletest
   INSTALL_COMMAND   ""
   TEST_COMMAND      ""
 )
+ExternalProject_Add(nvtx3
+  GIT_REPOSITORY    https://github.com/NVIDIA/NVTX.git
+  GIT_TAG           release-v3
+  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/nvtx-src"
+  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/nvtx-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
diff --git a/jitify2.hpp b/jitify2.hpp
index 1989ca6..5c9a735 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -204,8 +204,29 @@
 
 #endif  // not JITIFY_SERIALIZATION_ONLY
 
+#ifndef JITIFY_ENABLE_NVTX
+#define JITIFY_ENABLE_NVTX 0
+#endif
+
+#if JITIFY_ENABLE_NVTX
+#include <nvtx3/nvtx3.hpp>
+#define JITIFY_NVTX_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(JitifyNvtxDomain)
+#else
+#define JITIFY_NVTX_FUNC_RANGE()
+#endif  // JITIFY_ENABLE_NVTX
+
 namespace jitify2 {
 
+#if JITIFY_ENABLE_NVTX
+struct JitifyNvtxDomain {
+  static constexpr char const* name{"jitify"};
+};
+
+using nvtx_scoped_range = nvtx3::scoped_range_in<JitifyNvtxDomain>;
+#else
+using nvtx_scoped_range = std::string;
+#endif
+
 // Convenience aliases.
 using StringVec = std::vector<std::string>;
 using StringMap = std::unordered_map<std::string, std::string>;
@@ -706,6 +727,7 @@ class Serializable {
    *  \param stream The stream to output serialized data to.
    */
   void serialize(std::ostream& stream) const {
+    JITIFY_NVTX_FUNC_RANGE();
     const auto* subclass = static_cast<const Subclass*>(this);
     subclass->serialize_members(SerializeImpl(stream));
   }
@@ -718,9 +740,11 @@ class Serializable {
     return ss.str();
   }
   static bool deserialize(std::istream& stream, Subclass* subclass) {
+    JITIFY_NVTX_FUNC_RANGE();
     return subclass->deserialize_members(DeserializeImpl(stream));
   }
   static bool deserialize(StringRef serialized, Subclass* subclass) {
+    JITIFY_NVTX_FUNC_RANGE();
     imemstream ms(serialized);
     return subclass->deserialize_members(DeserializeImpl(ms));
   }
@@ -2012,6 +2036,7 @@ class Kernel : public detail::FallibleObjectBase<Kernel, KernelData> {
 };
 
 inline Kernel Kernel::get_kernel(LoadedProgramData program, std::string name) {
+  JITIFY_NVTX_FUNC_RANGE();
   name = detail::normalize_cuda_symbol_name(name);
   auto iter = program.lowered_name_map().find(name);
   if (iter != program.lowered_name_map().end()) {
@@ -2196,6 +2221,7 @@ class LoadedProgram
 
 inline LoadedProgram LoadedProgram::load(StringRef cubin,
                                          StringMap lowered_name_map) {
+  JITIFY_NVTX_FUNC_RANGE();
   CUmodule module;
   if (!cuda()) return Error(cuda().error());
   CUresult ret = cuda().ModuleLoadData()(&module, cubin.data());
@@ -2920,6 +2946,7 @@ class CompiledProgram
 inline LinkedProgram LinkedProgram::link(
     size_t num_programs, const CompiledProgramData* compiled_programs[],
     OptionsVec options) {
+  JITIFY_NVTX_FUNC_RANGE();
   if (num_programs == 0) return Error("Must have at least one program to link");
   const OptionsVec& prog_linker_options =
       compiled_programs[0]->remaining_linker_options();
@@ -3754,6 +3781,7 @@ inline CompiledProgram CompiledProgram::compile(
     const std::string& name, const std::string& source,
     const StringMap& header_sources, const StringVec& name_expressions,
     OptionsVec compiler_options, OptionsVec linker_options) {
+  JITIFY_NVTX_FUNC_RANGE();
   if (!compiler_options) return Error("Failed to parse compiler options");
   if (!linker_options) return Error("Failed to parse linker options");
   std::string error;
@@ -7279,6 +7307,7 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     std::string program_name, std::string program_source,
     StringMap header_sources, OptionsVec compiler_options,
     OptionsVec linker_options, HeaderCallback header_callback) {
+  JITIFY_NVTX_FUNC_RANGE();
   // Add pre-include built-in JIT-safe headers.
   bool use_system_headers_war = !compiler_options.pop(
       {"-no-system-headers-workaround", "--no-system-headers-workaround"});
@@ -8014,9 +8043,10 @@ class LRUFileCache {
   std::string get(const std::string& name,
                   typename std::result_of<Construct()>::type* result,
                   Construct construct, Serialize serialize,
-                  Deserialize deserialize) const {
+                  Deserialize deserialize, bool* hit = nullptr) const {
     if (path_.empty() || max_size_ == 0) {
       *result = construct();
+      if (hit) *hit = false;
     } else {
       bool is_dir;
       // Create the cache directory if necessary.
@@ -8035,6 +8065,7 @@ class LRUFileCache {
       if (istream) {
         // Found in cache, load it.
         *result = deserialize(istream);
+        if (hit) *hit = true;
       } else {
         // Not found in cache, acquire a file lock for exclusive access.
         FileLock file_lock(lock_file_name_.c_str());
@@ -8046,6 +8077,7 @@ class LRUFileCache {
           // Found in cache now, just load it.
           file_lock.close();
           *result = deserialize(istream);
+          if (hit) *hit = true;
         } else {
           // We must construct the object and write it to the cache.
           auto result_tmp = construct();
@@ -8067,6 +8099,7 @@ class LRUFileCache {
           // Atomically make the new cache file visible to readers.
           std::rename(temp_filename.c_str(), filename.c_str());
           *result = std::move(result_tmp);
+          if (hit) *hit = false;
         }
       }
     }
@@ -8384,6 +8417,8 @@ class ProgramCache {
   JITIFY_IF_THREAD_SAFE(mutable std::mutex mutex_;)
   size_t num_hits_ = 0;
   size_t num_misses_ = 0;
+  size_t num_file_hits_ = 0;
+  size_t num_file_misses_ = 0;
 
   OptionsVec merge_compiler_options(OptionsVec extra_compiler_options) const {
     extra_compiler_options.insert(extra_compiler_options.begin(),
@@ -8494,6 +8529,7 @@ class ProgramCache {
                             const StringMap& extra_header_sources = {},
                             OptionsVec extra_compiler_options = {},
                             OptionsVec extra_linker_options = {}) {
+    JITIFY_NVTX_FUNC_RANGE();
     // Add the current CUDA context to the key, as modules are context-specific.
     CUcontext context;
     if (!cuda()) return LoadedProgram::Error(cuda().error());
@@ -8510,8 +8546,12 @@ class ProgramCache {
     bool found = value_and_found.second;
     if (found) {
       ++num_hits_;
+      // Note: We use (immediately-ended) ranges instead of marks here so that
+      // they conveniently show up in the output of `nsys profile --stat=true`.
+      nvtx_scoped_range("mem_cache_hit");
     } else {
       ++num_misses_;
+      nvtx_scoped_range("mem_cache_miss");
       // Add the SM architecture to the key, as cubins are arch-specific.
       OptionsVec all_compiler_options =
           merge_compiler_options(extra_compiler_options);
@@ -8539,6 +8579,7 @@ class ProgramCache {
       filename_ss << to_filename_(key) << ".sm" << compute_capability << ".v"
                   << std::hex << serialization::kSerializationVersion;
       LinkedProgram linked;
+      bool hit = false;
       error = file_cache_.get(
           filename_ss.str(), &linked,
           [&] {
@@ -8551,8 +8592,16 @@ class ProgramCache {
           },
           [&](std::istream& istream) {
             return LinkedProgram::deserialize(istream);
-          });
+          },
+          &hit);
       if (!error.empty()) return LoadedProgram::Error(error);
+      if (hit) {
+        ++num_file_hits_;
+        nvtx_scoped_range("file_cache_hit");
+      } else {
+        ++num_file_misses_;
+        nvtx_scoped_range("file_cache_miss");
+      }
       if (!linked) return LoadedProgram::Error(linked.error());
       *value = linked->load();
       if (!*value) return LoadedProgram::Error(value->error());
@@ -8682,10 +8731,14 @@ class ProgramCache {
    *    will be stored.
    *  \see reset_stats
    */
-  void get_stats(size_t* num_hits, size_t* num_misses) const {
+  void get_stats(size_t* num_hits, size_t* num_misses,
+                 size_t* num_file_hits = nullptr,
+                 size_t* num_file_misses = nullptr) const {
     JITIFY_IF_THREAD_SAFE(std::lock_guard<std::mutex> lock(mutex_);)
     *num_hits = num_hits_;
     *num_misses = num_misses_;
+    if (num_file_hits) *num_file_hits = num_file_hits_;
+    if (num_file_misses) *num_file_misses = num_file_misses_;
   }
 
   /*! Reset the cache hit and miss statistics to zero.
@@ -8695,6 +8748,8 @@ class ProgramCache {
     JITIFY_IF_THREAD_SAFE(std::lock_guard<std::mutex> lock(mutex_);)
     num_hits_ = 0;
     num_misses_ = 0;
+    num_file_hits_ = 0;
+    num_file_misses_ = 0;
   }
 };
 
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 164d210..0057372 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -342,6 +342,17 @@ inline bool remove_empty_dir(const char* path) {
 #endif
 }
 
+#define JITIFY_TEST_CHECK_HITS(expected_hits, expected_misses,                 \
+                               expected_file_hits, expected_file_misses)       \
+  {                                                                            \
+    size_t num_hits, num_misses, num_file_hits, num_file_misses;               \
+    cache.get_stats(&num_hits, &num_misses, &num_file_hits, &num_file_misses); \
+    EXPECT_EQ(num_hits, expected_hits);                                        \
+    EXPECT_EQ(num_misses, expected_misses);                                    \
+    EXPECT_EQ(num_file_hits, expected_file_hits);                              \
+    EXPECT_EQ(num_file_misses, expected_file_misses);                          \
+  }
+
 TEST(Jitify2Test, ProgramCache) {
   static const char* const source = R"(
 template <typename T>
@@ -353,53 +364,49 @@ __global__ void my_kernel(const T* __restrict__ idata, T* __restrict__ odata) {}
   static const char* const cache_path = "jitify2_test_cache/subdir";
   ProgramCache<key_type> cache(max_size,
                                *Program("my_program", source)->preprocess(),
-                               nullptr, cache_path);
+                               nullptr, cache_path, /*max_files=*/max_size + 1);
   ScopeGuard scoped_cleanup_files([&] {
     cache.clear();
     remove_empty_dir(cache_path);
     remove_empty_dir(cache_path0);
   });
 
-  auto check_hits = [&](size_t expected_hits, size_t expected_misses) {
-    size_t num_hits, num_misses;
-    cache.get_stats(&num_hits, &num_misses);
-    EXPECT_EQ(num_hits, expected_hits);
-    EXPECT_EQ(num_misses, expected_misses);
-  };
-
   Kernel kernel;
   Template my_kernel("my_kernel");
 
-  check_hits(0, 0);
+  JITIFY_TEST_CHECK_HITS(0, 0, 0, 0);
   kernel = cache.get_kernel(/* key = */ 0, my_kernel.instantiate<float>());
   ASSERT_EQ(get_error(kernel), "");
   ASSERT_EQ(kernel->configure(1, 1)->launch(nullptr, nullptr), "");
-  check_hits(0, 1);
+  JITIFY_TEST_CHECK_HITS(0, 1, 0, 1);
   kernel = cache.get_kernel(/* key = */ 1, my_kernel.instantiate<double>());
   ASSERT_EQ(get_error(kernel), "");
-  check_hits(0, 2);
+  JITIFY_TEST_CHECK_HITS(0, 2, 0, 2);
   kernel = cache.get_kernel(/* key = */ 2, my_kernel.instantiate<int>());
   ASSERT_EQ(get_error(kernel), "");
   CUfunction function_int = kernel->function();
-  check_hits(0, 3);
+  JITIFY_TEST_CHECK_HITS(0, 3, 0, 3);
   cache.reset_stats();
-  check_hits(0, 0);
+  JITIFY_TEST_CHECK_HITS(0, 0, 0, 0);
   kernel = cache.get_kernel(/* key = */ 0, my_kernel.instantiate<float>());
   ASSERT_EQ(get_error(kernel), "");
   CUfunction function_float = kernel->function();
-  check_hits(0, 1);
+  JITIFY_TEST_CHECK_HITS(0, 1, 1, 0);
   kernel = cache.get_kernel(/* key = */ 2, my_kernel.instantiate<int>());
   ASSERT_EQ(get_error(kernel), "");
   EXPECT_EQ(kernel->function(), function_int);
-  check_hits(1, 1);
+  JITIFY_TEST_CHECK_HITS(1, 1, 1, 0);
   kernel = cache.get_kernel(/* key = */ 0, my_kernel.instantiate<float>());
   ASSERT_EQ(get_error(kernel), "");
   EXPECT_EQ(kernel->function(), function_float);
-  check_hits(2, 1);
+  JITIFY_TEST_CHECK_HITS(2, 1, 1, 0);
   LoadedProgram program =
       cache.get_program(/* key = */ 2, {my_kernel.instantiate<int>()});
   ASSERT_EQ(get_error(program), "");
-  check_hits(3, 1);
+  JITIFY_TEST_CHECK_HITS(3, 1, 1, 0);
+  kernel = cache.get_kernel(/* key = */ 1, my_kernel.instantiate<double>());
+  ASSERT_EQ(get_error(kernel), "");
+  JITIFY_TEST_CHECK_HITS(3, 2, 2, 0);
 
   // Make sure cache dir was created.
   bool cache_path_is_dir;
@@ -410,7 +417,7 @@ __global__ void my_kernel(const T* __restrict__ idata, T* __restrict__ odata) {}
   // Now clear the cache.
   ASSERT_TRUE(cache.clear());
   EXPECT_EQ(cache.max_in_mem(), max_size);
-  EXPECT_EQ(cache.max_files(), max_size);
+  EXPECT_EQ(cache.max_files(), max_size + 1);
   // Make sure cache dir still exists.
   ASSERT_TRUE(jitify2::detail::path_exists(cache_path, &cache_path_is_dir));
   ASSERT_TRUE(cache_path_is_dir);
@@ -436,52 +443,48 @@ __global__ void my_kernel(const T* __restrict__ idata, T* __restrict__ odata) {}
   static const char* const cache_path0 = "jitify2_test_cache";
   static const char* const cache_path = "jitify2_test_cache/subdir";
   ProgramCache<> cache(max_size, *Program("my_program", source)->preprocess(),
-                       nullptr, cache_path);
+                       nullptr, cache_path, /*max_files=*/max_size + 1);
   ScopeGuard scoped_cleanup_files([&] {
     cache.clear();
     remove_empty_dir(cache_path);
     remove_empty_dir(cache_path0);
   });
 
-  auto check_hits = [&](size_t expected_hits, size_t expected_misses) {
-    size_t num_hits, num_misses;
-    cache.get_stats(&num_hits, &num_misses);
-    EXPECT_EQ(num_hits, expected_hits);
-    EXPECT_EQ(num_misses, expected_misses);
-  };
-
   Kernel kernel;
   Template my_kernel("my_kernel");
 
-  check_hits(0, 0);
+  JITIFY_TEST_CHECK_HITS(0, 0, 0, 0);
   kernel = cache.get_kernel(my_kernel.instantiate<float>());
   ASSERT_EQ(get_error(kernel), "");
   ASSERT_EQ(kernel->configure(1, 1)->launch(nullptr, nullptr), "");
-  check_hits(0, 1);
+  JITIFY_TEST_CHECK_HITS(0, 1, 0, 1);
   kernel = cache.get_kernel(my_kernel.instantiate<double>());
   ASSERT_EQ(get_error(kernel), "");
-  check_hits(0, 2);
+  JITIFY_TEST_CHECK_HITS(0, 2, 0, 2);
   kernel = cache.get_kernel(my_kernel.instantiate<int>());
   ASSERT_EQ(get_error(kernel), "");
   CUfunction function_int = kernel->function();
-  check_hits(0, 3);
+  JITIFY_TEST_CHECK_HITS(0, 3, 0, 3);
   cache.reset_stats();
-  check_hits(0, 0);
+  JITIFY_TEST_CHECK_HITS(0, 0, 0, 0);
   kernel = cache.get_kernel(my_kernel.instantiate<float>());
   ASSERT_EQ(get_error(kernel), "");
   CUfunction function_float = kernel->function();
-  check_hits(0, 1);
+  JITIFY_TEST_CHECK_HITS(0, 1, 1, 0);
   kernel = cache.get_kernel(my_kernel.instantiate<int>());
   ASSERT_EQ(get_error(kernel), "");
   EXPECT_EQ(kernel->function(), function_int);
-  check_hits(1, 1);
+  JITIFY_TEST_CHECK_HITS(1, 1, 1, 0);
   kernel = cache.get_kernel(my_kernel.instantiate<float>());
   ASSERT_EQ(get_error(kernel), "");
   EXPECT_EQ(kernel->function(), function_float);
-  check_hits(2, 1);
+  JITIFY_TEST_CHECK_HITS(2, 1, 1, 0);
   LoadedProgram program = cache.get_program({my_kernel.instantiate<int>()});
   ASSERT_EQ(get_error(program), "");
-  check_hits(3, 1);
+  JITIFY_TEST_CHECK_HITS(3, 1, 1, 0);
+  kernel = cache.get_kernel(my_kernel.instantiate<double>());
+  ASSERT_EQ(get_error(kernel), "");
+  JITIFY_TEST_CHECK_HITS(3, 2, 2, 0);
 
   // Make sure cache dir was created.
   bool cache_path_is_dir;
@@ -492,7 +495,7 @@ __global__ void my_kernel(const T* __restrict__ idata, T* __restrict__ odata) {}
   // Now clear the cache.
   ASSERT_TRUE(cache.clear());
   EXPECT_EQ(cache.max_in_mem(), max_size);
-  EXPECT_EQ(cache.max_files(), max_size);
+  EXPECT_EQ(cache.max_files(), max_size + 1);
   // Make sure cache dir still exists.
   ASSERT_TRUE(jitify2::detail::path_exists(cache_path, &cache_path_is_dir));
   ASSERT_TRUE(cache_path_is_dir);
@@ -509,6 +512,8 @@ __global__ void my_kernel(const T* __restrict__ idata, T* __restrict__ odata) {}
   EXPECT_EQ(cache.max_files(), max_size + 2);
 }
 
+#undef JITIFY_TEST_CHECK_HITS
+
 TEST(Jitify2Test, ProgramCacheFilenameSanitization) {
   static const char* const source = R"(__global__ void my_kernel() {})";
   const size_t max_size = 1;
@@ -2459,6 +2464,8 @@ ne/*blah*/20 "newfilename"
   ASSERT_TRUE(iter.match(Tt::kEndOfDirective));
 }
 
+// TODO(benbarsdell): Add tests for NVTX ranges (using CUPTI).
+
 int main(int argc, char** argv) {
   cudaSetDevice(0);
   // Initialize the driver context (avoids "initialization error"/"context is

From 4be3b9fbb1a1deefc73eb554094250de9e4b917b Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 11 Mar 2025 20:42:34 +1100
Subject: [PATCH 20/47] Build tests for C++17 instead of 11

- Most people are using at least C++17 now, and we have code that is
  only enabled from C++17 onward.
---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ade917..b341304 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,8 +4,8 @@ project(jitify LANGUAGES CXX CUDA)
 option(ASAN "Enable address sanitizer in debug build" ON)
 
 # C++ compiler options.
-set (CMAKE_CXX_STANDARD 11)
-set (CMAKE_CUDA_STANDARD 11)  # Doesn't work?
+set (CMAKE_CXX_STANDARD 17)
+set (CMAKE_CUDA_STANDARD 17)  # Doesn't work?
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ")
 if (MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX")

From bb8dbadeb2b75e222ae54d12798bfe278ad4a635 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 11 Mar 2025 20:46:14 +1100
Subject: [PATCH 21/47] Fix formatting

---
 .clang-format   |  2 +-
 jitify2.hpp     | 23 +++++++++++------------
 jitify2_test.cu |  4 +++-
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/.clang-format b/.clang-format
index 4a88069..23b99fb 100644
--- a/.clang-format
+++ b/.clang-format
@@ -46,7 +46,7 @@ BreakConstructorInitializers: BeforeColon
 BreakAfterJavaFieldAnnotations: false
 BreakStringLiterals: true
 ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
+#CommentPragmas:  '^ IWYU pragma:'
 CommentPragmas:  '^\\.+'
 CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: true
diff --git a/jitify2.hpp b/jitify2.hpp
index 5c9a735..68a6f04 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -2359,7 +2359,7 @@ inline std::string path_join(StringRef p1, StringRef p2) {
 inline bool path_exists(const char* filename, bool* is_dir = nullptr) {
   struct stat stats;
   bool ret = ::stat(filename, &stats) == 0;
-#define JITIFY_S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
+#define JITIFY_S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
   if (is_dir) *is_dir = JITIFY_S_ISDIR(stats.st_mode);
 #undef JITIFY_S_ISDIR
   return ret;
@@ -2389,8 +2389,9 @@ inline bool endswith(StringRef str, StringRef suffix) {
 }
 
 inline bool is_true_value(std::string str) {
-  std::transform(str.begin(), str.end(), str.begin(),
-                 [](unsigned char c) { return static_cast<unsigned char>(std::tolower(c)); });
+  std::transform(str.begin(), str.end(), str.begin(), [](unsigned char c) {
+    return static_cast<unsigned char>(std::tolower(c));
+  });
   return !(str == "false" || str == "off" || str == "no" || str == "0");
 }
 
@@ -2979,18 +2980,16 @@ inline LinkedProgram LinkedProgram::link(
         return Error("Linking LTO IR is not supported with CUDA < 11.4");
       }
     }
-    const std::string& program = !compiled_program.lto_ir().empty()
-                                     ? compiled_program.lto_ir()
-                                     : !compiled_program.cubin().empty()
-                                           ? compiled_program.cubin()
-                                           : compiled_program.ptx();
+    const std::string& program =
+        !compiled_program.lto_ir().empty()  ? compiled_program.lto_ir()
+        : !compiled_program.cubin().empty() ? compiled_program.cubin()
+                                            : compiled_program.ptx();
     CUjitInputType program_type =
 #if CUDA_VERSION >= 11040
         !compiled_program.lto_ir().empty() ? CU_JIT_INPUT_NVVM :
 #endif
-                                           !compiled_program.cubin().empty()
-                                               ? CU_JIT_INPUT_CUBIN
-                                               : CU_JIT_INPUT_PTX;
+        !compiled_program.cubin().empty() ? CU_JIT_INPUT_CUBIN
+                                          : CU_JIT_INPUT_PTX;
     programs.emplace_back(&program);
     program_types.emplace_back(program_type);
   }
@@ -6828,7 +6827,7 @@ class IncludeName {
 
   // Implicit conversion to string to maintain backwards compatibility with
   // FileCallback.
-  operator const std::string &() const { return name(); }
+  operator const std::string&() const { return name(); }
 
   friend std::string to_string(const IncludeName& incname) {
     using jitify2::detail::string_concat;
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 0057372..7c7ea20 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1831,7 +1831,7 @@ TEST(Jitify2Test, LibCudaCxxAndBuiltinLimits) {
 )";
 
   PreprocessedProgram preprog =
-    Program("limits_program", source)->preprocess({"-I" CUDA_INC_DIR});
+      Program("limits_program", source)->preprocess({"-I" CUDA_INC_DIR});
   ASSERT_EQ(get_error(preprog), "");
   CompiledProgram compiled = preprog->compile();
   ASSERT_EQ(get_error(compiled), "");
@@ -1902,10 +1902,12 @@ const char c = '\xff';
 
 // WAR for header include issue (note: order of includes matters):
 //   https://github.com/NVIDIA/jitify/issues/107#issuecomment-1225617951
+// clang-format off
 #include <cuda/std/cstdint>
 #include <cuda/std/cstddef>
 #include <cuda/std/type_traits>
 #include <cuda/std/limits>
+// clang-format on
 
 // CUB headers can be tricky to parse.
 #include <cub/block/block_load.cuh>

From 2f729f8aca35ab6e3bbafd351702c1b9d1fd7ea1 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 11 Mar 2025 20:47:21 +1100
Subject: [PATCH 22/47] Fix multi-statement JITIFY_THROW_OR_TERMINATE

- This was breaking some single-line if statements (thanks to @Robadob
  for pointing this out).
---
 jitify2.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 68a6f04..861808d 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -180,9 +180,11 @@
 #else
 // TODO: Would std::exit or std::abort be better than std::terminate?
 #include <exception>
-#define JITIFY_THROW_OR_TERMINATE(msg)                       \
-  std::cerr << "Jitify fatal error: " << (msg) << std::endl; \
-  std::terminate()
+#define JITIFY_THROW_OR_TERMINATE(msg)                         \
+  do {                                                         \
+    std::cerr << "Jitify fatal error: " << (msg) << std::endl; \
+    std::terminate();                                          \
+  } while (0)
 #endif
 
 #if JITIFY_ENABLE_EXCEPTIONS

From f2be5caf0441042dff33bb767783fff5cc28c8eb Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 11 Mar 2025 20:56:01 +1100
Subject: [PATCH 23/47] Add extra info to compilation error objects

- This allows users to access the error log, options, and header info
  separately from the full error message.
- Default error messages no longer include options and header
  info; define JITIFY_VERBOSE_ERRORS=1 to restore this behaviour.
---
 jitify2.hpp           | 63 +++++++++++++++++++++++++++++++++----------
 jitify2_test.cu       | 15 +++++++++++
 jitify2_user_guide.md | 30 +++++++++++++++++++++
 3 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 861808d..b5db1c6 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -96,6 +96,11 @@
 #define JITIFY_FAIL_IMMEDIATELY 0
 #endif
 
+// Adds options and headers logging to compilation error messages.
+#ifndef JITIFY_VERBOSE_ERRORS
+#define JITIFY_VERBOSE_ERRORS 0
+#endif
+
 #ifndef JITIFY_USE_LIBCUFILT
 #define JITIFY_USE_LIBCUFILT 0  // Use Jitify's builtin demangler by default
 #endif
@@ -1076,13 +1081,26 @@ class Template {
 class ErrorMsg : public std::string {
  public:
   using std::string::string;
-  ErrorMsg(const std::string& str) : std::string(str) {}
-  ErrorMsg(std::string&& str) : std::string(std::move(str)) {}
+  ErrorMsg(const std::string& str, StringMap _extra = {})
+      : std::string(str), extra_(std::move(_extra)) {}
+  ErrorMsg(std::string&& str, StringMap _extra = {})
+      : std::string(std::move(str)), extra_(std::move(_extra)) {}
 
   /*! Returns true if the error message is empty. */
   bool ok() const { return this->empty(); }
   /*! Returns true if the error message is non-empty. */
   explicit operator bool() const { return !this->empty(); }
+
+  const std::string& extra(const std::string& key) const {
+    auto iter = extra_.find(key);
+    if (iter == extra_.end()) {
+      JITIFY_THROW_OR_RETURN("Extra error info key '" + key + "' not found");
+    }
+    return iter->second;
+  }
+
+ private:
+  StringMap extra_;
 };
 
 namespace detail {
@@ -3776,6 +3794,24 @@ inline void copy_compiler_flag_for_linker_ptxas(
   }
 }
 
+inline ErrorMsg make_compilation_error_msg(const std::string& compile_error,
+                                           const std::string& compile_log,
+                                           const OptionsVec& compiler_options,
+                                           const std::string& header_log) {
+  std::string options_str = string_join(compiler_options, " ");
+  std::string msg = "Compilation failed: " + compile_error + "\n";
+#if JITIFY_VERBOSE_ERRORS
+  msg += "Compiler options: \"" + options_str + "\"\nHeaders:\n" + header_log +
+         "\nCompilation log:\n" + compile_log;
+#else
+  msg += compile_log;
+#endif
+  return ErrorMsg(msg, {{"error", compile_error},
+                        {"log", compile_log},
+                        {"options", options_str},
+                        {"headers", header_log}});
+}
+
 }  // namespace detail
 
 inline CompiledProgram CompiledProgram::compile(
@@ -3800,18 +3836,15 @@ inline CompiledProgram CompiledProgram::compile(
                               &error, &log, &ptx, &cubin, &nvvm,
                               name_expressions, &lowered_name_map,
                               should_remove_unused_globals)) {
-    std::string options_str = detail::string_join(
-        compiler_options, " ", "Compiler options: \"", "\"\n");
     std::vector<std::string> header_names;
     header_names.reserve(header_sources.size());
     for (const auto& item : header_sources) {
       header_names.push_back(item.first);
     }
     std::sort(header_names.begin(), header_names.end());
-    std::string headers_str =
-        detail::string_join(header_names, "\n  ", "Header names:\n  ", "\n");
-    return Error("Compilation failed: " + error + "\n" + options_str +
-                 headers_str + "\n" + log);
+    std::string headers_str = detail::string_join(header_names, "\n  ");
+    return Error(detail::make_compilation_error_msg(
+        error, log, compiler_options, headers_str));
   }
 
   // We copy certain compiler options to linker_options so that they are used if
@@ -7574,8 +7607,6 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     }
     compiler_options.push_back(Option("-DJITIFY_PREPROCESS_ONLY"));
     compiler_options.push_back(Option("-DJITIFY_USED_HEADER_WARNINGS"));
-    std::string compiler_options_msg = detail::string_join(
-        compiler_options, " ", "Compiler options: \"", "\"\n");
     std::string compile_error;
     // Note: This should always fail, because we inserted an #error directive.
     const nvrtcResult compile_result =
@@ -7584,8 +7615,10 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     assert(compile_result != NVRTC_SUCCESS);
     if (compile_result != NVRTC_ERROR_COMPILATION) {
       // There was something wrong with the compilation (e.g., invalid option).
-      return Error("Compilation failed: " + compile_error + "\n" +
-                   compiler_options_msg + header_log + compile_log);
+      return Error(detail::make_compilation_error_msg(
+          compile_error, compile_log, compiler_options,
+          header_log.substr(
+              0, header_log.size() - 1)));  // Remove trailing newline
     }
     compiler_options.pop_back();  // Remove -DJITIFY_USED_HEADER_WARNINGS
     compiler_options.pop_back();  // Remove -DJITIFY_PREPROCESS_ONLY
@@ -7595,8 +7628,10 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     if (compile_log.find(": error: ") != std::string::npos ||
         compile_log.find(": catastrophic error: ") != std::string::npos) {
       // There were real compilation errors.
-      return Error("Compilation failed: " + compile_error + "\n" +
-                   compiler_options_msg + header_log + compile_log);
+      return Error(detail::make_compilation_error_msg(
+          compile_error, compile_log, compiler_options,
+          header_log.substr(
+              0, header_log.size() - 1)));  // Remove trailing newline
     }
 
     if (arch_flag.cc) {
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 7c7ea20..f94d404 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -27,6 +27,7 @@
  */
 
 #define JITIFY_ENABLE_EXCEPTIONS 1
+#define JITIFY_VERBOSE_ERRORS 1
 #include "jitify2.hpp"
 
 #include "example_headers/class_arg_kernel.cuh"
@@ -1001,6 +1002,20 @@ TEST(Jitify2Test, InvalidPrograms) {
       "");
   // Not OK.
   EXPECT_NE(get_error(Program("bad_program", "NOT CUDA C!")->preprocess()), "");
+
+  // Check that the returned error object contains correct extra info.
+  PreprocessedProgram preprocessed =
+      Program("bad_program", "NOT CUDA C!")->preprocess();
+  EXPECT_FALSE(preprocessed.ok());
+  const ErrorMsg error = preprocessed.error();
+  EXPECT_THROW(error.extra("foo"), std::runtime_error);
+  EXPECT_EQ(error.extra("error"), "NVRTC_ERROR_COMPILATION");
+  EXPECT_TRUE(error.extra("log").find("identifier \"NOT\" is undefined") !=
+              std::string::npos);
+  EXPECT_TRUE(error.extra("options").find("-default-device") !=
+              std::string::npos);
+  EXPECT_EQ(error.extra("headers"), "");
+  EXPECT_TRUE(error.find("Compilation failed:") != std::string::npos);
 }
 
 TEST(Jitify2Test, CompileLTO_IR) {
diff --git a/jitify2_user_guide.md b/jitify2_user_guide.md
index c65635c..4d657bd 100644
--- a/jitify2_user_guide.md
+++ b/jitify2_user_guide.md
@@ -97,6 +97,26 @@ call or when a method such as `launch()` fails:
   }
 ```
 
+Most errors are simple strings, but compilation errors contain
+additional info that can be accessed via the `extra()` method:
+
+```
+  jitify2::PreprocessedProgram preprocessed =
+      jitify2::Program("bad_program", "NOT CUDA C!")->preprocess();
+  assert(!preprocessed.ok());
+  const jitify2::ErrorMsg error = preprocessed.error();
+  std::cerr << error << std::endl;                   // Full error message
+  std::cerr << error.extra("error") << std::endl;    // "NVRTC_ERROR_COMPILATION"
+  std::cerr << error.extra("log") << std::endl;      // "error: identifier "NOT" is undefined..."
+  std::cerr << error.extra("options") << std::endl;  // "-include=jitify_preinclude.h ..."
+  std::cerr << error.extra("headers") << std::endl;  // (empty)
+```
+
+By default, the full error message only includes the error name
+and compile log. To also include compiler options and headers
+in this message, define the macro `JITIFY_VERBOSE_ERRORS=1` before
+including `jitify2.hpp`.
+
 <a name="basic_workflow"/>
 
 ## Basic workflow example
@@ -211,6 +231,9 @@ $ mkdir build && cd build && cmake ..
 $ make check
 ```
 
+Note that the tests in `jitify2_test.cu` may also be useful as a form of
+documentation for many jitify features.
+
 <a name="build_options"/>
 
 ## Build options
@@ -261,6 +284,13 @@ $ make check
   version 11.4, and the application must be linked with the
   libcufilt.a static library.
 
+- `JITIFY_VERBOSE_ERRORS=0`
+
+  Defining this macro to 1 before including the jitify header causes
+  compilation errors to include options and header info in the error
+  message. Note that this info can always be accessed manually via
+  the `extra()` method of the error object.
+
 <a name="compiler_options"/>
 
 ## Compiler options

From 99a3cc5d582f3e2cee4302e32cc3296716ee9011 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 11 Mar 2025 21:09:23 +1100
Subject: [PATCH 24/47] Document NVTX ranges and enable in tests

---
 jitify2_test.cu       | 1 +
 jitify2_user_guide.md | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/jitify2_test.cu b/jitify2_test.cu
index f94d404..72ce60a 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -27,6 +27,7 @@
  */
 
 #define JITIFY_ENABLE_EXCEPTIONS 1
+#define JITIFY_ENABLE_NVTX 1
 #define JITIFY_VERBOSE_ERRORS 1
 #include "jitify2.hpp"
 
diff --git a/jitify2_user_guide.md b/jitify2_user_guide.md
index 4d657bd..4751184 100644
--- a/jitify2_user_guide.md
+++ b/jitify2_user_guide.md
@@ -284,6 +284,12 @@ documentation for many jitify features.
   version 11.4, and the application must be linked with the
   libcufilt.a static library.
 
+- `JITIFY_ENABLE_NVTX=0`
+
+  Defining this macro to 1 before including the jitify header causes
+  NVTX ranges to be emitted around important functions and cache
+  hits/misses.
+
 - `JITIFY_VERBOSE_ERRORS=0`
 
   Defining this macro to 1 before including the jitify header causes

From 2aa067e161e2f91ad73f7898d5cbff84905b889d Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 12 Mar 2025 18:14:11 +1100
Subject: [PATCH 25/47] WAR compiler warning due to std::move

- It's unclear exactly why this is necessary, but it apparently
  silences a "may be used uninitialized" warning with GCC 12.
---
 jitify2.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index b5db1c6..ece7bca 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5759,7 +5759,9 @@ inline void extract_include_paths(OptionsVec* options,
   for (int i = (int)idxs.size() - 1; i >= 0; --i) {
     const int idx = idxs[i];
     std::string include_path = (*options)[idx].value();
-    include_path = expand_include_path(std::move(include_path));
+    // Note: Not passing the arg with std::move() here due to a "may be used
+    // uninitialized" warning with some compilers.
+    include_path = expand_include_path(include_path);
     include_paths->push_back(std::move(include_path));
     options->erase(idx);
   }

From 73bd8aa804d0451370841565ff5396eece9a563f Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Sun, 16 Mar 2025 14:32:26 +1100
Subject: [PATCH 26/47] Set default JIT -std to match host compilation

- Also works around MSVC not always setting __cplusplus correctly.
- Note that an alternative to matching the host compilation would be
  to match NVRTC's default (which appears to be c++17 since CUDA
  12.0), but this is less intuitive and would require maintenance to
  keep it up to date with NVRTC.
---
 jitify2.hpp           | 32 +++++++++++++++++++----------
 jitify2_test.cu       | 48 +++++++++++++++++++++++++++++++++++++++++++
 jitify2_user_guide.md |  5 +++--
 3 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index ece7bca..30e62cb 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -54,11 +54,19 @@
 #include <unordered_map>
 #include <vector>
 
+#ifdef _MSC_VER  // MSVC compiler
+// In MSVC, __cplusplus is always 199711L unless the `/Zc:__cplusplus` option is
+// specified, so need to use _MSVC_LANG instead.
+#define JITIFY_CPLUSPLUS _MSVC_LANG
+#else
+#define JITIFY_CPLUSPLUS __cplusplus
+#endif
+
 // This macro is used by source files generated by jitify_preprocess to avoid
 // unnecessary dependencies.
 #ifdef JITIFY_SERIALIZATION_ONLY
 
-#if __cplusplus >= 201703L
+#if JITIFY_CPLUSPLUS >= 201703L
 #include <string_view>
 #endif
 
@@ -130,7 +138,7 @@
 #define JITIFY_IF_THREAD_SAFE(x)
 #endif
 
-#if __cplusplus >= 201402L
+#if JITIFY_CPLUSPLUS >= 201402L
 #define JITIFY_DEPRECATED(msg) [[deprecated(msg)]]
 #else
 #define JITIFY_DEPRECATED(msg)
@@ -238,7 +246,7 @@ using nvtx_scoped_range = std::string;
 using StringVec = std::vector<std::string>;
 using StringMap = std::unordered_map<std::string, std::string>;
 
-#if __cplusplus >= 201703L
+#if JITIFY_CPLUSPLUS >= 201703L
 using StringRef = std::string_view;
 using StringSlice = std::string_view;
 #else
@@ -539,7 +547,7 @@ struct imemstream : virtual membuf, std::istream {
   imemstream(const char* data, size_t size)
       : membuf(data, size), std::istream(static_cast<std::streambuf*>(this)) {}
   imemstream(const std::string& str) : imemstream(str.data(), str.size()) {}
-#if __cplusplus >= 201703L
+#if JITIFY_CPLUSPLUS >= 201703L
   imemstream(std::string_view sv) : imemstream(sv.data(), sv.size()) {}
 #endif
 };
@@ -1006,7 +1014,7 @@ inline std::string reflect(const Instance<T>& value) {
 inline std::string reflect(const std::string& s) { return s; }
 /*! Use an existing code string as-is. */
 inline const char* reflect(const char* s) { return s; }
-#if __cplusplus >= 201703L
+#if JITIFY_CPLUSPLUS >= 201703L
 /*! Use an existing code string as-is. */
 inline std::string_view reflect(std::string_view s) { return s; }
 #endif
@@ -3406,7 +3414,11 @@ inline bool process_architecture_flags(OptionsVec* compiler_options,
 
 // Returns the standard year (e.g., 11 for c++11).
 inline int add_std_flag_if_not_specified(OptionsVec* options,
-                                         int default_standard_year = 11) {
+                                         int default_standard_year = 0) {
+  if (!default_standard_year) {
+    // Default to the same C++ dialect as the host binary is compiled for.
+    default_standard_year = JITIFY_CPLUSPLUS / 100 % 100;
+  }
   auto option_inds = options->find({"--std", "-std"});
   if (!option_inds.empty()) {
     const char* value = (*options)[option_inds.back()].value().c_str();
@@ -3414,8 +3426,6 @@ inline int add_std_flag_if_not_specified(OptionsVec* options,
     int standard_year = std::atoi(value);
     return standard_year;
   }
-  // Jitify must be compiled with C++11 support, so we default to enabling it
-  // for the JIT-compiled code too.
   std::string value = "c++" + std::to_string(default_standard_year);
   options->emplace_back("-std", value);
   return default_standard_year;
@@ -3826,7 +3836,7 @@ inline CompiledProgram CompiledProgram::compile(
                                           &error)) {
     return Error("Failed to process architecture flags: " + error);
   }
-  detail::add_std_flag_if_not_specified(&compiler_options, 11);
+  detail::add_std_flag_if_not_specified(&compiler_options);
   detail::add_default_device_flag_if_not_specified(&compiler_options);
   bool should_remove_unused_globals = compiler_options.pop(
       {"-remove-unused-globals", "--remove-unused-globals"});
@@ -7368,7 +7378,7 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     compiler_options.push_back(Option("-include", "jitify_preinclude.h"));
   }
   const int cxx_standard_year =
-      detail::add_std_flag_if_not_specified(&compiler_options, 11);
+      detail::add_std_flag_if_not_specified(&compiler_options);
   detail::add_default_device_flag_if_not_specified(&compiler_options);
   bool minify = compiler_options.pop({"-m", "--minify"});
   // TODO: This flag is experimental, because the implementation does not
@@ -8248,7 +8258,7 @@ class LRUCache {
       touch(rank_iter);
       iter = *rank_iter;
       // Change the key of the LRU entry to the new key.
-#if __cplusplus >= 201703L
+#if JITIFY_CPLUSPLUS >= 201703L
       auto node_handle = cache_.extract(iter);
       node_handle.key() = key;
       iter = cache_.insert(std::move(node_handle)).position;
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 72ce60a..84fc720 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -263,6 +263,54 @@ __global__ void my_kernel2(const float* indata, float* outdata) {
   EXPECT_FLOAT_EQ(inval, outval);
 }
 
+TEST(Jitify2Test, StdFlag) {
+  static const char* const source = R"(
+__global__ void my_kernel(long* cplusplus) {
+  *cplusplus = __cplusplus;
+}
+)";
+
+  long h_data = 0;
+  long* data;
+  CHECK_CUDART(cudaMalloc((void**)&data, sizeof(*data)));
+
+  ASSERT_EQ(Program("my_program", source)
+                ->preprocess()
+                ->get_kernel("my_kernel")
+                ->configure(1, 1)
+                ->launch(data),
+            "");
+  CHECK_CUDART(
+      cudaMemcpy(&h_data, data, sizeof(*data), cudaMemcpyDeviceToHost));
+  CHECK_CUDART(cudaDeviceSynchronize());
+  // Default should be same as host binary.
+  EXPECT_EQ(h_data, JITIFY_CPLUSPLUS);
+
+  ASSERT_EQ(Program("my_program", source)
+                ->preprocess({"-std=c++03"})
+                ->get_kernel("my_kernel")
+                ->configure(1, 1)
+                ->launch(data),
+            "");
+  CHECK_CUDART(
+      cudaMemcpy(&h_data, data, sizeof(*data), cudaMemcpyDeviceToHost));
+  CHECK_CUDART(cudaDeviceSynchronize());
+  EXPECT_EQ(h_data, 199711L);
+
+  ASSERT_EQ(Program("my_program", source)
+                ->preprocess({"-std=c++14"})
+                ->get_kernel("my_kernel")
+                ->configure(1, 1)
+                ->launch(data),
+            "");
+  CHECK_CUDART(
+      cudaMemcpy(&h_data, data, sizeof(*data), cudaMemcpyDeviceToHost));
+  CHECK_CUDART(cudaDeviceSynchronize());
+  EXPECT_EQ(h_data, 201402L);
+
+  CHECK_CUDART(cudaFree(data));
+}
+
 TEST(Jitify2Test, LaunchLatencyBenchmark) {
   static const char* const source = R"(
 template <int N, int M, typename T, typename U>
diff --git a/jitify2_user_guide.md b/jitify2_user_guide.md
index 4751184..3fb2880 100644
--- a/jitify2_user_guide.md
+++ b/jitify2_user_guide.md
@@ -370,8 +370,9 @@ options), some trigger special behavior in Jitify as detailed below:
 - `-std=<std>`
 
   Unless otherwise specified, this flag is automatically passed to
-  NVRTC for all kernels and is set to `c++11` (which is the minimum
-  requirement for Jitify itself). Jitify also supports the value
+  NVRTC for all kernels and is set to the same C++ dialect as the
+  host binary is compiled for (i.e., matching
+  __cplusplus/_MSVC_LANG). Jitify also supports the value
   `-std=c++03` for explicitly selecting the `C++03` standard.
 
 - `--minify (-m)`

From 5f1ad9ddbb1932db1365254248eea86750a58663 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:01:43 +1100
Subject: [PATCH 27/47] Undef NVTX macro

---
 jitify2.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index 30e62cb..b2e0617 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -8807,6 +8807,8 @@ class ProgramCache {
 
 #undef JITIFY_DEFINE_SERIALIZABLE_MEMBERS
 
+#undef JITIFY_NVTX_FUNC_RANGE
+
 #ifndef JITIFY_SERIALIZATION_ONLY
 
 #undef JITIFY_PATH_MAX

From 1187709529e18aae67459df1bf9c0b99d0163a27 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:06:48 +1100
Subject: [PATCH 28/47] Rename ErrorMsg::extra() to info()

---
 jitify2.hpp           | 25 ++++++++++++++++---------
 jitify2_test.cu       | 12 ++++++------
 jitify2_user_guide.md | 14 +++++++-------
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index b2e0617..c02a8eb 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -1089,26 +1089,33 @@ class Template {
 class ErrorMsg : public std::string {
  public:
   using std::string::string;
-  ErrorMsg(const std::string& str, StringMap _extra = {})
-      : std::string(str), extra_(std::move(_extra)) {}
-  ErrorMsg(std::string&& str, StringMap _extra = {})
-      : std::string(std::move(str)), extra_(std::move(_extra)) {}
+  ErrorMsg(const std::string& str, StringMap _info = {})
+      : std::string(str), info_(std::move(_info)) {}
+  ErrorMsg(std::string&& str, StringMap _info = {})
+      : std::string(std::move(str)), info_(std::move(_info)) {}
 
   /*! Returns true if the error message is empty. */
   bool ok() const { return this->empty(); }
   /*! Returns true if the error message is non-empty. */
   explicit operator bool() const { return !this->empty(); }
 
-  const std::string& extra(const std::string& key) const {
-    auto iter = extra_.find(key);
-    if (iter == extra_.end()) {
-      JITIFY_THROW_OR_RETURN("Extra error info key '" + key + "' not found");
+  JITIFY_DEPRECATED("Use info() instead")
+  const std::string& extra(const std::string& key) const { return info(key); }
+
+  /*! Returns additional information about the error.
+   *  \param key The name of the information to return. See the user guide for
+   *    details. An invalid key will cause an exception or termination.
+   */
+  const std::string& info(const std::string& key) const {
+    auto iter = info_.find(key);
+    if (iter == info_.end()) {
+      JITIFY_THROW_OR_TERMINATE("Error info key '" + key + "' not found");
     }
     return iter->second;
   }
 
  private:
-  StringMap extra_;
+  StringMap info_;  // Additional information about the error
 };
 
 namespace detail {
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 84fc720..268a791 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1057,14 +1057,14 @@ TEST(Jitify2Test, InvalidPrograms) {
       Program("bad_program", "NOT CUDA C!")->preprocess();
   EXPECT_FALSE(preprocessed.ok());
   const ErrorMsg error = preprocessed.error();
-  EXPECT_THROW(error.extra("foo"), std::runtime_error);
-  EXPECT_EQ(error.extra("error"), "NVRTC_ERROR_COMPILATION");
-  EXPECT_TRUE(error.extra("log").find("identifier \"NOT\" is undefined") !=
+  EXPECT_TRUE(error.find("Compilation failed:") != std::string::npos);
+  EXPECT_THROW(error.info("foo"), std::runtime_error);
+  EXPECT_EQ(error.info("error"), "NVRTC_ERROR_COMPILATION");
+  EXPECT_TRUE(error.info("log").find("identifier \"NOT\" is undefined") !=
               std::string::npos);
-  EXPECT_TRUE(error.extra("options").find("-default-device") !=
+  EXPECT_TRUE(error.info("options").find("-default-device") !=
               std::string::npos);
-  EXPECT_EQ(error.extra("headers"), "");
-  EXPECT_TRUE(error.find("Compilation failed:") != std::string::npos);
+  EXPECT_EQ(error.info("headers"), "");
 }
 
 TEST(Jitify2Test, CompileLTO_IR) {
diff --git a/jitify2_user_guide.md b/jitify2_user_guide.md
index 3fb2880..c186538 100644
--- a/jitify2_user_guide.md
+++ b/jitify2_user_guide.md
@@ -98,18 +98,18 @@ call or when a method such as `launch()` fails:
 ```
 
 Most errors are simple strings, but compilation errors contain
-additional info that can be accessed via the `extra()` method:
+additional information that can be accessed via the `info()` method:
 
 ```
   jitify2::PreprocessedProgram preprocessed =
       jitify2::Program("bad_program", "NOT CUDA C!")->preprocess();
   assert(!preprocessed.ok());
   const jitify2::ErrorMsg error = preprocessed.error();
-  std::cerr << error << std::endl;                   // Full error message
-  std::cerr << error.extra("error") << std::endl;    // "NVRTC_ERROR_COMPILATION"
-  std::cerr << error.extra("log") << std::endl;      // "error: identifier "NOT" is undefined..."
-  std::cerr << error.extra("options") << std::endl;  // "-include=jitify_preinclude.h ..."
-  std::cerr << error.extra("headers") << std::endl;  // (empty)
+  std::cerr << error << std::endl;                  // Full error message
+  std::cerr << error.info("error") << std::endl;    // "NVRTC_ERROR_COMPILATION"
+  std::cerr << error.info("log") << std::endl;      // "error: identifier "NOT" is undefined..."
+  std::cerr << error.info("options") << std::endl;  // "-include=jitify_preinclude.h ..."
+  std::cerr << error.info("headers") << std::endl;  // (empty)
 ```
 
 By default, the full error message only includes the error name
@@ -295,7 +295,7 @@ documentation for many jitify features.
   Defining this macro to 1 before including the jitify header causes
   compilation errors to include options and header info in the error
   message. Note that this info can always be accessed manually via
-  the `extra()` method of the error object.
+  the `info()` method of the error object.
 
 <a name="compiler_options"/>
 

From 076808ce53446bc9e81090fde95daaeea2b2caa9 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:10:15 +1100
Subject: [PATCH 29/47] Assert that launch args are trivially copyable

---
 jitify2.hpp           | 25 +++++++++++++++++++++++++
 jitify2_user_guide.md |  6 ++++++
 2 files changed, 31 insertions(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index c02a8eb..5a76043 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -113,6 +113,12 @@
 #define JITIFY_USE_LIBCUFILT 0  // Use Jitify's builtin demangler by default
 #endif
 
+// Users can enable this to disable the is_trivially_copyable assertion on
+// kernel args.
+#ifndef JITIFY_IGNORE_NOT_TRIVIALLY_COPYABLE_ARGS
+#define JITIFY_IGNORE_NOT_TRIVIALLY_COPYABLE_ARGS 0
+#endif
+
 #if CUDA_VERSION >= 11040 && JITIFY_USE_LIBCUFILT
 #include <nv_decode.h>  // For __cu_demangle (requires linking with libcufilt.a)
 #endif
@@ -2092,6 +2098,22 @@ inline Kernel LoadedProgramData::get_kernel(std::string name) const {
   return Kernel::get_kernel(*this, std::move(name));
 }
 
+namespace detail {
+
+template <typename...>
+struct are_trivially_copyable;
+
+template <>
+struct are_trivially_copyable<> : std::true_type {};
+
+template <typename First, typename... Rest>
+struct are_trivially_copyable<First, Rest...>
+    : std::conditional<std::is_trivially_copyable<First>::value,
+                       are_trivially_copyable<Rest...>, std::false_type>::type {
+};
+
+}  // namespace detail
+
 /*! An object containing a configured CUDA kernel and associated metadata.
  */
 class ConfiguredKernelData {
@@ -2171,6 +2193,9 @@ class ConfiguredKernelData {
    */
   template <typename Arg, typename... Args>
   ErrorMsg launch(const Arg& arg, const Args&... args) const {
+    static_assert(JITIFY_IGNORE_NOT_TRIVIALLY_COPYABLE_ARGS ||
+                      detail::are_trivially_copyable<Arg, Args...>::value,
+                  "Kernel launch arguments must be trivially copyable");
     void* arg_ptrs[] = {(void*)&arg, (void*)&args...};
     return this->launch_raw(arg_ptrs);
   }
diff --git a/jitify2_user_guide.md b/jitify2_user_guide.md
index c186538..b43b60f 100644
--- a/jitify2_user_guide.md
+++ b/jitify2_user_guide.md
@@ -297,6 +297,12 @@ documentation for many jitify features.
   message. Note that this info can always be accessed manually via
   the `info()` method of the error object.
 
+- `JITIFY_IGNORE_NOT_TRIVIALLY_COPYABLE_ARGS=0`
+
+  Defining this macro to 1 before including the jitify header
+  disables the static assertion that kernel launch arguments are
+  trivially copyable.
+
 <a name="compiler_options"/>
 
 ## Compiler options

From cdabb5fe84bfef4fba44c888d74e229f70132ea6 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:12:13 +1100
Subject: [PATCH 30/47] Add to_string and operator<< to OptionsVec

- These are useful for debugging and logging.
---
 jitify2.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index 5a76043..29a82db 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -385,6 +385,24 @@ class OptionsVec {
     return serialize_impl(false);
   }
 
+  friend std::string to_string(const OptionsVec& options,
+                               bool canonical = false) {
+    StringVec sv =
+        canonical ? options.serialize_canonical() : options.serialize();
+    std::string result;
+    if (sv.size() > 0) {
+      result = sv[0];
+    }
+    for (size_t i = 1; i < sv.size(); ++i) {
+      result += " " + sv[i];
+    }
+    return result;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const OptionsVec& options) {
+    return os << to_string(options);
+  }
+
   // Allow implicit conversion (to avoid breaking the old options API).
   operator StringVec() const { return serialize(); }
 

From 3f39e277cb3c60f01a7d0c5f3ce508b2e50bc24b Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:15:20 +1100
Subject: [PATCH 31/47] Fix handling of Tegra and binary-compatible archs

- Adds Orin Tegra arch.
- Ensures that Tegra archs are not used for forward compatibility.
- Allows falling back to a binary-compatible real arch when NVRTC
  doesn't support the requested arch exactly.
---
 jitify2.hpp     | 33 +++++++++++++++++++++++++--------
 jitify2_test.cu | 31 +++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 29a82db..eac38ee 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -3312,6 +3312,16 @@ inline int get_current_device_compute_capability(std::string* error = nullptr) {
   return cc;
 }
 
+// Tells whether cc is the compute capability of a (hard-coded) Tegra GPU.
+inline bool is_tegra(int cc) {
+  // TODO: It would be better to detect these somehow, rather than hard-coding.
+  // Known Tegra parts: Logan (32), Erista (53), Parker (62), Xavier (72) and
+  // Orin (87).
+  static const int kTegraCcs[] = {32, 53, 62, 72, 87};
+  const int* const last = kTegraCcs + sizeof(kTegraCcs) / sizeof(int);
+  return std::find(kTegraCcs, last, cc) != last;
+}
+
 // Returns 0 on failure and sets *error if provided. Otherwise returns a compute
 // capability that is supported by the current version of NVRTC.
 inline int limit_to_supported_compute_capability(int cc,
@@ -3321,12 +3331,7 @@ inline int limit_to_supported_compute_capability(int cc,
   // newer hardware+driver. Forward compatibility of PTX allows this to work.
   // Tegra chips do not have forwards compatibility so we need to special case
   // them.
-  // TODO: It would be better to detect these somehow, rather than hard-coding.
-  bool is_tegra = (cc == 32 ||  // Logan
-                   cc == 53 ||  // Erista
-                   cc == 62 ||  // Parker
-                   cc == 72);   // Xavier
-  if (is_tegra) return cc;
+  if (is_tegra(cc)) return cc;
 
   if (!nvrtc()) {
     if (error) *error = nvrtc().error();
@@ -3341,6 +3346,8 @@ inline int limit_to_supported_compute_capability(int cc,
       std::vector<int> supported_archs(num_supported_archs);
       nvrtc_ret = nvrtc().GetSupportedArchs()(supported_archs.data());
       if (nvrtc_ret != NVRTC_SUCCESS) return 0;
+      // Don't use tegra archs.
+      while (is_tegra(supported_archs.back())) supported_archs.pop_back();
       return supported_archs.back();
     }();
     cc = std::min(cc, max_supported_arch);
@@ -3363,6 +3370,13 @@ inline int limit_to_supported_compute_capability(int cc,
   return cc;
 }
 
+inline bool is_binary_compatible_cc(int compiled_cc, int device_cc) {
+  auto get_major = [](int _cc) { return _cc / 10; };
+  auto get_minor = [](int _cc) { return _cc % 10; };
+  return get_major(compiled_cc) == get_major(device_cc) &&
+         get_minor(compiled_cc) <= get_minor(device_cc);
+}
+
 // Parses compiler_options and applies automatic architecture detection if
 // necessary, filling in the architecture flag in both compiler_options and
 // linker_options.
@@ -3439,11 +3453,14 @@ inline bool process_architecture_flags(OptionsVec* compiler_options,
     int supported_real_cc =
         limit_to_supported_compute_capability(real_cc, &error);
     if (!check_error()) return false;
-    if (!nvrtc().GetCUBIN() || supported_real_cc != real_cc) {
-      // This NVRTC version does not support compiling to a/the real arch.
+    if (!nvrtc().GetCUBIN() ||
+        !is_binary_compatible_cc(supported_real_cc, real_cc)) {
+      // This NVRTC version does not support compiling to a (compatible) real
+      // arch.
       virt_cc = supported_real_cc;
     } else {
       // Pass the real arch to NVRTC.
+      real_cc = supported_real_cc;
       virt_cc = 0;
     }
   }
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 268a791..2484e7c 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1081,6 +1081,8 @@ const int arch = __CUDA_ARCH__ / 10;
   EXPECT_GT(program->nvvm().size(), 0);
   EXPECT_EQ(program->nvvm().size(), program->lto_ir().size());
   int current_arch = get_current_device_arch();
+  current_arch =
+      jitify2::detail::limit_to_supported_compute_capability(current_arch);
   LinkedProgram linked_program = program->link();
   if (CUDA_VERSION < 11040) {
     ASSERT_FALSE(linked_program.ok());
@@ -1525,6 +1527,8 @@ TEST(Jitify2Test, ArchFlags) {
 const int arch = __CUDA_ARCH__ / 10;
 )";
   int current_arch = get_current_device_arch();
+  current_arch =
+      jitify2::detail::limit_to_supported_compute_capability(current_arch);
   int arch;
   // Test default behavior (automatic architecture detection).
   PreprocessedProgram preprocessed =
@@ -1544,19 +1548,20 @@ const int arch = __CUDA_ARCH__ / 10;
   ASSERT_EQ(program->link()->load()->get_global_value("arch", &arch), "");
   EXPECT_EQ(arch, 50);
 
-  auto expect_cubin_size_if_available = [](size_t cubin_size) {
-    if (jitify2::nvrtc().GetCUBIN()) {
-      EXPECT_GT(cubin_size, 0);
-    } else {
-      EXPECT_EQ(cubin_size, 0);
-    }
-  };
+#define JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE(cubin_size) \
+  do {                                                    \
+    if (jitify2::nvrtc().GetCUBIN()) {                    \
+      EXPECT_GT(cubin_size, 0);                           \
+    } else {                                              \
+      EXPECT_EQ(cubin_size, 0);                           \
+    }                                                     \
+  } while (0)
 
   // Test explicit real architecture (may compile directly to CUBIN).
   program = preprocessed->compile(
       "", {}, {"-arch", "sm_" + std::to_string(current_arch)});
   EXPECT_GT(program->ptx().size(), 0);
-  expect_cubin_size_if_available(program->cubin().size());
+  JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE(program->cubin().size());
   ASSERT_EQ(program->link()->load()->get_global_value("arch", &arch), "");
   EXPECT_EQ(arch, current_arch);
 
@@ -1570,7 +1575,7 @@ const int arch = __CUDA_ARCH__ / 10;
   // Test automatic real architecture (may compile directly to CUBIN).
   program = preprocessed->compile("", {}, {"-arch=sm_."});
   EXPECT_GT(program->ptx().size(), 0);
-  expect_cubin_size_if_available(program->cubin().size());
+  JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE(program->cubin().size());
   ASSERT_EQ(program->link()->load()->get_global_value("arch", &arch), "");
   EXPECT_EQ(arch, current_arch);
 
@@ -1579,10 +1584,12 @@ const int arch = __CUDA_ARCH__ / 10;
                 ->preprocess({"-arch=sm_50"})
                 ->compile("", {}, {"-arch=sm_."});
   EXPECT_GT(program->ptx().size(), 0);
-  expect_cubin_size_if_available(program->cubin().size());
+  JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE(program->cubin().size());
   ASSERT_EQ(program->link()->load()->get_global_value("arch", &arch), "");
   EXPECT_EQ(arch, current_arch);
 
+#undef JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE
+
   // Test that multiple architectures can be specified for preprocessing.
   program = Program("arch_flags_program", source)
                 ->preprocess({"-arch=compute_50", "-arch=compute_52",
@@ -1621,6 +1628,10 @@ const int arch = __CUDA_ARCH__ / 10;
   EXPECT_EQ(linker_options.count("--maxrregcount=100"), 1);
   EXPECT_EQ(linker_options.count("--generate-line-info"), 1);
   EXPECT_EQ(linker_options.count("--device-debug"), 1);
+
+  // TODO: Test "sm_90a/100a/120a".
+  //         Should possibly automatically add 'a' suffix when inferring arch
+  //         from current device.
 }
 
 struct Base {

From 57779f8832a8b28f503ac3fe5b74e250737fb0c4 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:21:12 +1100
Subject: [PATCH 32/47] Fix std::size_t missing in builtin utility header

---
 jitify2.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/jitify2.hpp b/jitify2.hpp
index eac38ee..02921bb 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5479,6 +5479,7 @@ template <typename... Ts> using void_t = typename __jitify_make_void<Ts...>::typ
 
 static const char* const jitsafe_header_utility = R"(
 #pragma once
+#include <cstring>  // For std::size_t
 #include <type_traits>
 
 namespace std {

From f408323d997b167fb0408f7d2386564166c7374c Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 19 Mar 2025 21:23:36 +1100
Subject: [PATCH 33/47] Add NVRTC and linker versions to cache filenames

- This will ensure cache entries are not reused between different
  library versions (e.g., if the libraries are updated to fix
  code-gen bugs).
- The NVRTC build version is extracted via a small hack, but there is
  unfortunately no obvious way to get the build version of the
  linkers.
---
 jitify2.hpp     | 73 +++++++++++++++++++++++++++++++++++++++++++++++--
 jitify2_test.cu |  4 +++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 02921bb..00df62a 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -1747,10 +1747,34 @@ class LibNvJitLink
                                   nvJitLinkHandle, size_t*)
   JITIFY_DEFINE_NVJITLINK_WRAPPER(GetInfoLog, nvJitLinkResult, nvJitLinkHandle,
                                   char*)
+#if JITIFY_LINK_NVJITLINK_STATIC && CUDA_VERSION < 12030
+  // Stub: a null entry point tells get_version() to use its fallback. Must be
+  // const so that it is callable from the const get_version().
+  detail::function_type<nvJitLinkResult, unsigned int*, unsigned int*>*
+  Version() const { return nullptr; }
+#else
+  JITIFY_DEFINE_NVJITLINK_WRAPPER(Version, nvJitLinkResult, unsigned int*,
+                                  unsigned int*)
+#endif
 #undef JITIFY_DEFINE_NVJITLINK_WRAPPER
 #undef JITIFY_STR_IMPL
 #undef JITIFY_STR
 
+  // Returns the runtime nvJitLink version in the same format as CUDA_VERSION
+  // (major * 1000 + minor * 10). The value is computed on first use; 12.0 is
+  // reported if the version entry point is missing or the query fails.
+  int get_version() const {
+    static const int version = [this] {
+      unsigned int major = 0, minor = 0;
+      if (!Version() || Version()(&major, &minor) != NVJITLINK_SUCCESS) {
+        major = 12;
+        minor = 0;  // No way to get this
+      }
+      return static_cast<int>(major * 1000 + minor * 10);
+    }();
+    return version;
+  }
+
   // TODO: Check if an official version of this is added to nvJitLink in future.
   const char* get_error_string(nvJitLinkResult result) const {
     // clang-format off
@@ -1795,6 +1819,8 @@ class LibNvJitLink
  private:
   std::string get_symbol_name(const char* func_name, int major = -1,
                               int minor = -1) const {
+    // Special case for nvJitLinkVersion symbol, which is unversioned.
+    if (func_name == std::string("nvJitLinkVersion")) return func_name;
     const int compiled_major = CUDA_VERSION / 1000;
     const int compiled_minor = CUDA_VERSION % 1000 / 10;
     if (major == -1) major = compiled_major;
@@ -8502,6 +8528,31 @@ struct default_hasher<AutoKey> {
   using type = AutoKey::Hash;
 };
 
+// Returns NVRTC's build number (__CUDACC_VER_BUILD__), or -1 on any failure.
+inline int get_nvrtc_build_version() {
+  static const int version = [] {
+    // This is a HACK to extract the value of __CUDACC_VER_BUILD__ from NVRTC;
+    // there doesn't seem to be any other (cross-platform) way to get this info.
+    const std::string key = "JITIFY_NVRTC_BUILD_VER = ";
+    std::string source =
+        detail::string_concat("__device__ int ", key, "__CUDACC_VER_BUILD__;");
+    std::string ptx;
+    if (detail::compile_program("program", source, {}, {}, nullptr, nullptr,
+                                &ptx)) {
+      return -1;
+    }
+    const size_t start = ptx.find(key);
+    if (start == std::string::npos) return -1;
+    // strtol (unlike stoi) never throws; the number must end at the ';'.
+    const char* const num = ptx.c_str() + start + key.size();
+    char* num_end = nullptr;
+    const long result = std::strtol(num, &num_end, 10);
+    if (num_end == num || *num_end != ';' || result < 0) return -1;
+    return static_cast<int>(result) == result ? static_cast<int>(result) : -1;
+  }();
+  return version;
+}
+
 }  // namespace detail
 
 template <typename Key = detail::AutoKey,
@@ -8690,10 +8741,28 @@ class ProgramCache {
                                       error);
         }
       }
+      if (!nvrtc()) return LoadedProgram::Error(nvrtc().error());
+      const int nvrtc_major = nvrtc().get_version() / 1000;
+      const int nvrtc_minor = nvrtc().get_version() / 10 % 100;
+      const int nvrtc_build = detail::get_nvrtc_build_version();
+      // Note that there's no (cross-platform) way to get the build version of
+      // nvjitlink or libcuda.
+#if CUDA_VERSION >= 12000
+      const int linker_major = nvjitlink().get_version() / 1000;
+      const int linker_minor = nvjitlink().get_version() / 10 % 100;
+      const char* const linker_name = "nvjitlink";
+#else
+      const int linker_major = cuda().get_version() / 1000;
+      const int linker_minor = cuda().get_version() / 10 % 100;
+      const char* const linker_name = "culink";
+#endif
       std::stringstream filename_ss;
       filename_ss.imbue(std::locale::classic());
-      filename_ss << to_filename_(key) << ".sm" << compute_capability << ".v"
-                  << std::hex << serialization::kSerializationVersion;
+      filename_ss << to_filename_(key) << ".sm" << compute_capability
+                  << ".nvrtc" << nvrtc_major << "-" << nvrtc_minor << "-"
+                  << nvrtc_build << "." << linker_name << linker_major << "-"
+                  << linker_minor << ".v" << std::hex
+                  << serialization::kSerializationVersion;
       LinkedProgram linked;
       bool hit = false;
       error = file_cache_.get(
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 2484e7c..cc42c44 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -720,6 +720,10 @@ TEST(Jitify2Test, PathSimplify) {
 #endif
 }
 
+TEST(Jitify2Test, GetNvrtcBuildVersion) {
+  EXPECT_NE(jitify2::detail::get_nvrtc_build_version(), -1);  // -1 == failure
+}
+
 TEST(Jitify2Test, Program) {
   static const char* const name = "my_program";
   static const char* const source = "/* empty source */";

From 9cbe8dd0782422fd8960baf1875db1b280c84181 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 6 May 2025 15:36:48 +1000
Subject: [PATCH 34/47] Refactor Option constructors to improve usability

- Moves option parsing into Option instead of OptionsVec. This
  ensures that single options can be constructed with "-key=val".
  Previously, this silently set key="-key=val", which was bad.
- Also adds an operator bool() for convenience.
---
 jitify2.hpp     | 152 ++++++++++++++++++++++++++----------------------
 jitify2_test.cu |  18 ++++++
 2 files changed, 100 insertions(+), 70 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 00df62a..509085a 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -260,11 +260,81 @@ using StringRef = const std::string&;
 using StringSlice = std::string;
 #endif
 
+namespace detail {
+
+// Strip leading (ltrim), trailing (rtrim) or both (trim) whitespace in-place.
+inline void ltrim(std::string* s) {
+  s->erase(s->begin(), std::find_if(s->begin(), s->end(), [](unsigned char c) {
+             return !std::isspace(c);
+           }));
+}
+inline void rtrim(std::string* s) {
+  s->erase(std::find_if(s->rbegin(), s->rend(),
+                        [](unsigned char c) { return !std::isspace(c); })
+               .base(),
+           s->end());
+}
+inline void trim(std::string* s) {
+  ltrim(s);
+  rtrim(s);
+}
+
+// Same, but return the stripped slice instead of modifying the argument.
+inline StringSlice ltrim(StringRef s) {
+  size_t beg = std::find_if(s.begin(), s.end(),
+                            [](unsigned char c) { return !std::isspace(c); }) -
+               s.begin();
+  return s.substr(beg);
+}
+inline StringSlice rtrim(StringRef s) {
+  size_t end = std::find_if(s.rbegin(), s.rend(),
+                            [](unsigned char c) { return !std::isspace(c); })
+                   .base() -
+               s.begin();
+  return s.substr(0, end);
+}
+inline StringSlice trim(StringRef s) { return rtrim(ltrim(s)); }
+
+}  // namespace detail
+
 class Option {
+  // Fills key_and_value_ with "key", or with "key=value" when value_ is
+  // non-empty. Called by the constructors once key_ and value_ are final.
+  void set_key_and_value() {
+    // TODO: Consider changing key and value to be views into key_and_value to
+    // avoid double-storage.
+    if (!value_.empty()) {
+      key_and_value_.reserve(key_.size() + 1 + value_.size());
+      key_and_value_.append(key_).append("=").append(value_);
+    } else {
+      key_and_value_ = key_;
+    }
+  }
+
  public:
   Option() = default;
-  explicit Option(std::string _key, std::string _value = {},
-                  StringVec _repr = {})
+  // Parses a self-contained "-key=val", "-Kval"/"-l<lib>" or bare "-key".
+  explicit Option(std::string raw) {
+    const size_t eql = raw.find('=');
+    // HACK: Special case for '-l<lib>' linker flag.
+    const bool short_with_value =
+        raw.size() > 2 &&
+        (std::isupper(static_cast<unsigned char>(raw[1])) ||
+         (raw[1] == 'l' && raw.compare(0, 9, "-lineinfo") != 0));
+    size_t key_len = raw.size(), val_pos = std::string::npos;
+    if (eql != std::string::npos) {  // "-key=val"
+      key_len = eql;
+      val_pos = eql + 1;
+    } else if (short_with_value) {  // "-Kval"
+      key_len = val_pos = 2;
+    }  // Otherwise "-key" (no value).
+    key_ = raw.substr(0, key_len);
+    if (val_pos != std::string::npos) value_ = raw.substr(val_pos);
+    detail::trim(&value_);  // Strip whitespace
+    repr_ = {std::move(raw)};
+    set_key_and_value();
+  }
+  Option(std::string _key, std::string _value, StringVec _repr = {})
       : key_(std::move(_key)),
         value_(std::move(_value)),
         repr_(std::move(_repr)) {
@@ -274,16 +344,7 @@ class Option {
         repr_.front() += "=" + value_;
       }
     }
-    // TODO: Consider changing key and value to be views into key_and_value to
-    // avoid double-storage.
-    if (value_.empty()) {
-      key_and_value_ = key_;
-    } else {
-      key_and_value_.reserve(key_.size() + 1 + value_.size());
-      key_and_value_.append(key_);
-      key_and_value_.append("=");
-      key_and_value_.append(value_);
-    }
+    set_key_and_value();
   }
 
   const std::string& key() const { return key_; }
@@ -306,6 +367,8 @@ class Option {
     return !(lhs == rhs);
   }
 
+  explicit operator bool() const { return !key_.empty(); }
+
  private:
   std::string key_;
   std::string value_;
@@ -313,43 +376,6 @@ class Option {
   StringVec repr_;
 };
 
-namespace detail {
-
-// Strip whitespace from string in-place.
-inline void ltrim(std::string* s) {
-  s->erase(s->begin(), std::find_if(s->begin(), s->end(), [](unsigned char c) {
-             return !std::isspace(c);
-           }));
-}
-inline void rtrim(std::string* s) {
-  s->erase(std::find_if(s->rbegin(), s->rend(),
-                        [](unsigned char c) { return !std::isspace(c); })
-               .base(),
-           s->end());
-}
-inline void trim(std::string* s) {
-  ltrim(s);
-  rtrim(s);
-}
-
-// Strip whitespace from a string view.
-inline StringSlice ltrim(StringRef s) {
-  size_t beg = std::find_if(s.begin(), s.end(),
-                            [](unsigned char c) { return !std::isspace(c); }) -
-               s.begin();
-  return s.substr(beg);
-}
-inline StringSlice rtrim(StringRef s) {
-  size_t end = std::find_if(s.rbegin(), s.rend(),
-                            [](unsigned char c) { return !std::isspace(c); })
-                   .base() -
-               s.begin();
-  return s.substr(0, end);
-}
-inline StringSlice trim(StringRef s) { return rtrim(ltrim(s)); }
-
-}  // namespace detail
-
 class OptionsVec {
   using vec_type = std::vector<Option>;
 
@@ -507,31 +533,17 @@ class OptionsVec {
       if (option[0] != '-') {
         return false;  // "Expected an option, got " + option
       }
-      std::string key, val;
-      StringVec repr = {option};
-      const size_t eql = option.find('=');
+      Option new_option;
       if (i + 1 < options.size() && options[i + 1][0] != '-') {
         // Parse "-key" "val".
-        key = option;
-        val = options[++i];
-        repr = {key, val};
-      } else if (eql != std::string::npos) {
-        // Parse "-key=val".
-        key = option.substr(0, eql);
-        val = option.substr(eql + 1);
-      } else if (option.size() > 2 &&
-                 // HACK: Special case for '-l<lib>' linker flag.
-                 (std::isupper(static_cast<unsigned char>(option[1])) ||
-                  (option[1] == 'l' && option.substr(0, 9) != "-lineinfo"))) {
-        // Parse "-Kval".
-        key = option.substr(0, 2);
-        val = option.substr(2);
+        std::string val = options[++i];
+        detail::trim(&val);  // Strip whitespace
+        new_option = Option(option, val, {option, val});
       } else {
-        // Parse "-key" (no value).
-        key = option;
+        // Parse self-contained option.
+        new_option = Option(option);
       }
-      detail::trim(&val);  // Strip whitespace
-      options_.emplace_back(std::move(key), std::move(val), std::move(repr));
+      options_.emplace_back(std::move(new_option));
     }
     return true;
   }
diff --git a/jitify2_test.cu b/jitify2_test.cu
index cc42c44..658598c 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1462,6 +1462,24 @@ __global__ void foo_kernel(int* data) {
   CHECK_CUDART(cudaFree(d_data));
 }
 
+TEST(Jitify2Test, Option) {
+  Option option;  // Default-constructed: empty key, so it converts to false.
+  EXPECT_FALSE(static_cast<bool>(option));
+  option = Option("--restrict");  // Bare "-key" form (no value).
+  EXPECT_TRUE(static_cast<bool>(option));
+  EXPECT_EQ(option.key(), "--restrict");
+  EXPECT_EQ(option.value(), "");
+  option = Option("-include", "foo/bar");  // Explicit (key, value) form.
+  EXPECT_TRUE(static_cast<bool>(option));
+  EXPECT_EQ(option.key(), "-include");
+  EXPECT_EQ(option.value(), "foo/bar");
+  option = Option("-include=foo/bar");  // Self-contained "-key=val" form.
+  EXPECT_TRUE(static_cast<bool>(option));
+  EXPECT_EQ(option.key(), "-include");
+  EXPECT_EQ(option.value(), "foo/bar");
+  EXPECT_EQ(option.key_and_value(), "-include=foo/bar");
+}
+
 TEST(Jitify2Test, OptionsVec) {
   OptionsVec options0;
   EXPECT_TRUE(options0.ok());

From bf85005ea2bf0ee2cacc0eae7b99bc557d1a3cd1 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 6 May 2025 15:48:33 +1000
Subject: [PATCH 35/47] Change OptionsVec::pop() to return a popped option

- This is more useful than just returning bool.
---
 jitify2.hpp | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 509085a..c8aee2d 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -432,18 +432,24 @@ class OptionsVec {
   // Allow implicit conversion (to avoid breaking the old options API).
   operator StringVec() const { return serialize(); }
 
-  // Removes all options with any of the specified keys, and returns whether any
-  // were removed.
-  bool pop(std::initializer_list<std::string> keys) {
+  // Removes every option whose key equals any of the given keys. Returns the
+  // front-most removed option, or an empty (default-constructed) Option when
+  // nothing matched. If count is non-null, *count receives the number of
+  // options removed.
+  Option pop(std::initializer_list<std::string> keys, size_t* count = nullptr) {
+    assert(ok_);
+    const auto has_popped_key = [&keys](const Option& option) {
+      return std::find(keys.begin(), keys.end(), option.key()) != keys.end();
+    };
+    // Remember the front-most match before compacting the vector. It is copied
+    // rather than moved because remove_if() below still inspects its key.
+    Option first_match;
+    const auto first_iter =
+        std::find_if(options_.begin(), options_.end(), has_popped_key);
+    if (first_iter != options_.end()) {
+      first_match = *first_iter;
+    }
+    // Erase-remove: shift the surviving options forward (preserving their
+    // order) and drop the tail once, instead of erasing match by match.
+    const auto new_end =
+        std::remove_if(first_iter, options_.end(), has_popped_key);
+    const size_t num_removed =
+        static_cast<size_t>(std::distance(new_end, options_.end()));
+    options_.erase(new_end, options_.end());
+    if (count) *count = num_removed;
+    return first_match;
+  }
 
   void pop_back() {
@@ -3943,8 +3949,8 @@ inline CompiledProgram CompiledProgram::compile(
   }
   detail::add_std_flag_if_not_specified(&compiler_options);
   detail::add_default_device_flag_if_not_specified(&compiler_options);
-  bool should_remove_unused_globals = compiler_options.pop(
-      {"-remove-unused-globals", "--remove-unused-globals"});
+  bool should_remove_unused_globals = static_cast<bool>(compiler_options.pop(
+      {"-remove-unused-globals", "--remove-unused-globals"}));
   std::string log, ptx, cubin, nvvm;
   StringMap lowered_name_map;
   if (detail::compile_program(name, source, header_sources, compiler_options,
@@ -7486,10 +7492,11 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   const int cxx_standard_year =
       detail::add_std_flag_if_not_specified(&compiler_options);
   detail::add_default_device_flag_if_not_specified(&compiler_options);
-  bool minify = compiler_options.pop({"-m", "--minify"});
+  bool minify = static_cast<bool>(compiler_options.pop({"-m", "--minify"}));
   // TODO: This flag is experimental, because the implementation does not
   // support transformations of "namespace std {" (as used for specializations).
-  bool use_cuda_std = compiler_options.pop({"-cuda-std", "--cuda-std"});
+  bool use_cuda_std =
+      static_cast<bool>(compiler_options.pop({"-cuda-std", "--cuda-std"}));
   bool replace_pragma_once = !compiler_options.pop(
       {"-no-replace-pragma-once", "--no-replace-pragma-once"});
   bool use_builtin_headers = !compiler_options.pop(
@@ -7500,8 +7507,8 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   }
 
   // This is re-added to the remaining options below.
-  bool should_remove_unused_globals = compiler_options.pop(
-      {"-remove-unused-globals", "--remove-unused-globals"});
+  bool should_remove_unused_globals = static_cast<bool>(compiler_options.pop(
+      {"-remove-unused-globals", "--remove-unused-globals"}));
 
   using parser::IncludeName;
   using parser::ProcessFlags;
@@ -7711,7 +7718,7 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   }
   // We temporarily enable warnings so that we can parse the ones we added.
   const bool disable_warnings =
-      compiler_options.pop({"--disable-warnings", "-w"});
+      static_cast<bool>(compiler_options.pop({"--disable-warnings", "-w"}));
   // Maps header include names to their full file paths.
   StringMap header_fullpaths;
   std::string compile_log;

From f9048df06028692ee70137cc9c0ac5864332c135 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 7 May 2025 18:50:44 +1000
Subject: [PATCH 36/47] Don't use current exe dir for include dirs

- This "feature" was poorly thought out for a few reasons:
  - During preprocessing, the current exe dir is jitify2_preprocess,
    not the user's application.
  - It didn't apply to -I or -L flags used after preprocessing.
  - It is not the obvious default behavior.
- If we want to re-add this feature later, it should be opt-in behind
  a "--paths-relative-to-exe-dir" option, and it should (only) be
  applied after preprocessing.
---
 jitify2.hpp     | 19 +------------------
 jitify2_test.cu | 14 --------------
 2 files changed, 1 insertion(+), 32 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index c8aee2d..7cad481 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5862,28 +5862,12 @@ inline bool read_text_file(const std::string& fullpath, std::string* content) {
   return true;
 }
 
-// Prepends the current executable dir (instead of the current working dir,
-// which is the implicit default) to relative paths. This is expected to be more
-// useful than the default because it allows referencing headers that are
-// shipped with the application independent of the current working directory.
-inline std::string expand_include_path(std::string path) {
-  if (path.empty()) return "";
-  if (!path_is_absolute(path)) {
-    path = path_join(path_base(get_current_executable_path()), path);
-  }
-  // TODO: Consider also expanding "$FOO" and "${FOO}" as environment variables.
-  return path;
-}
-
 inline void extract_include_paths(OptionsVec* options,
                                   StringVec* include_paths) {
   const std::vector<int> idxs = options->find({"-I"});
   for (int i = (int)idxs.size() - 1; i >= 0; --i) {
     const int idx = idxs[i];
     std::string include_path = (*options)[idx].value();
-    // Note: Not passing the arg with std::move() here due to a "may be used
-    // uninitialized" warning with some compilers.
-    include_path = expand_include_path(include_path);
     include_paths->push_back(std::move(include_path));
     options->erase(idx);
   }
@@ -7535,8 +7519,7 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
             });
       };
 
-  const std::string current_dir =
-      detail::path_base(detail::get_current_executable_path());
+  const std::string current_dir = ".";
   const std::string program_fullpath =
       detail::path_join(current_dir, detail::sanitize_slashes(program_name));
   ErrorMsg err = process_cuda_source_fn(&program_source, program_fullpath,
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 658598c..8709b10 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -888,20 +888,6 @@ TEST(Jitify2Test, Preincludes) {
   ASSERT_EQ(get_error(preprog), "");
 }
 
-TEST(Jitify2Test, CurrentExeIncludePath) {
-  static const std::string source = R"(
-#include <example_headers/my_header1.cuh>
-)";
-  std::unique_ptr<const char, int (*)(const char*)> cd_back("..", ::chdir);
-  ASSERT_EQ(::chdir("example_headers"), 0);
-  // This requires -I. to be expanded to the current executable directory, not
-  // the current working directory.
-  PreprocessedProgram preprog =
-      Program("my_program", source)->preprocess({"-I."});
-  ASSERT_EQ(get_error(preprog), "");
-  ASSERT_EQ(get_error(preprog->compile()), "");
-}
-
 TEST(Jitify2Test, CompiledProgram) {
   // Tests compilation, lowered name lookup, and basic CompiledProgram API
   // functionality.

From 013185f6a181c281b060764e133897331f9dfebe Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 7 May 2025 22:24:01 +1000
Subject: [PATCH 37/47] Support arch suffixes and fix sm in file cache key

- Adds support for "a" and "f" arch suffixes, such as "sm_90a".
  Previously, such suffixes were silently removed.
- Also fixes the architecture that is put into cache filenames by
  using the same logic to determine it as is used during compilation,
  ensuring it will exactly match the compiled binary.
---
 jitify2.hpp     | 123 ++++++++++++++++++++++++++++++++----------------
 jitify2_test.cu |  27 +++++++++++
 2 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 7cad481..66b674a 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -3300,18 +3300,23 @@ namespace detail {
 // On success, sets *is_virtual to true if a "compute_" value was found, or
 // false for an "arch_" value, and *idx is set to the index of the option within
 // the options vector (e.g., so that it can be erased by the caller).
-inline int parse_arch_flag(const OptionsVec& options, bool* is_virtual,
+inline int parse_arch_flag(const OptionsVec& options,
+                           bool* is_virtual = nullptr,
+                           std::string* suffix = nullptr,
                            std::string* error = nullptr,
                            size_t* idx = nullptr) {
   const std::vector<int> idxs =
       options.find({"-arch", "--gpu-architecture", "--gpu-name"}, 1);
-  if (idxs.empty()) return 0;  // // No architecture flag found
+  if (idxs.empty()) {
+    if (suffix) *suffix = "";
+    return 0;  // No architecture flag found
+  }
   std::string value = options[idxs[0]].value();
   if (startswith(value, "compute_")) {
-    *is_virtual = true;
+    if (is_virtual) *is_virtual = true;
     value = value.substr(std::strlen("compute_"));
   } else if (startswith(value, "sm_")) {
-    *is_virtual = false;
+    if (is_virtual) *is_virtual = false;
     value = value.substr(std::strlen("sm_"));
   } else {
     if (error) *error = "Expected value to begin with 'compute_' or 'sm_'.";
@@ -3320,12 +3325,15 @@ inline int parse_arch_flag(const OptionsVec& options, bool* is_virtual,
   int result;
   if (value == ".") {
     result = -1;
+    if (suffix) *suffix = "";
   } else {
-    int cc = std::atoi(std::string(value).c_str());
+    char* suffix_c;
+    int cc = (int)std::strtol(value.c_str(), &suffix_c, 10);
     if (cc == 0) {
       if (error) *error = "Failed to parse a valid architecture number.";
       return 0;
     }
+    if (suffix) *suffix = suffix_c;
     result = cc;
   }
   if (idx) *idx = idxs[0];
@@ -3414,7 +3422,9 @@ inline int limit_to_supported_compute_capability(int cc,
   return cc;
 }
 
-inline bool is_binary_compatible_cc(int compiled_cc, int device_cc) {
+inline bool is_binary_compatible_cc(int compiled_cc, int device_cc,
+                                    StringRef suffix = "") {
+  if (suffix == "a") return compiled_cc == device_cc;
   auto get_major = [](int _cc) { return _cc / 10; };
   auto get_minor = [](int _cc) { return _cc % 10; };
   return get_major(compiled_cc) == get_major(device_cc) &&
@@ -3437,9 +3447,12 @@ inline bool process_architecture_flags(OptionsVec* compiler_options,
     return true;
   };
   bool is_virtual;
+  std::string linker_suffix;
+  size_t idx;
   // First identify any existing real arch in linker_options (e.g., from a
   // previous call to this function).
-  int linker_cc = parse_arch_flag(*linker_options, &is_virtual, &error);
+  int linker_cc = parse_arch_flag(*linker_options, &is_virtual, &linker_suffix,
+                                  &error, &idx);
   if (!check_error()) return false;
   if (linker_cc < 0) {
     // We do not allow "-arch=sm_." to be given as a linker option.
@@ -3454,14 +3467,33 @@ inline bool process_architecture_flags(OptionsVec* compiler_options,
     }
     return false;
   }
+  // Remove the parsed arch flag entries; they are replaced below.
+  if (linker_cc != 0) {
+    linker_options->erase(idx);
+  }
   // Now parse compiler options.
-  size_t idx;
-  int given_cc = parse_arch_flag(*compiler_options, &is_virtual, &error, &idx);
+  // Note: We only use a suffix if one is explicitly specified, we never
+  // automatically use one.
+  std::string suffix;
+  int given_cc =
+      parse_arch_flag(*compiler_options, &is_virtual, &suffix, &error, &idx);
   if (!check_error()) return false;
   // Remove the parsed arch flag entries; they are replaced below.
   if (given_cc != 0) {
     compiler_options->erase(idx);
   }
+  if (linker_suffix != "" && linker_suffix != suffix) {
+    if (error_ptr) {
+      *error_ptr = "Linker architecture flag has incompatible suffix";
+    }
+    return false;
+  }
+  if (suffix != "" && suffix != "a" && suffix != "f") {
+    if (error_ptr) {
+      *error_ptr = "Unsupported architecture suffix: " + suffix;
+    }
+    return false;
+  }
   int real_cc;
   if (linker_cc != 0) {
     real_cc = linker_cc;
@@ -3498,9 +3530,16 @@ inline bool process_architecture_flags(OptionsVec* compiler_options,
         limit_to_supported_compute_capability(real_cc, &error);
     if (!check_error()) return false;
     if (!nvrtc().GetCUBIN() ||
-        !is_binary_compatible_cc(supported_real_cc, real_cc)) {
-      // This NVRTC version does not support compiling to a (compatible) real
-      // arch.
+        !is_binary_compatible_cc(supported_real_cc, real_cc, suffix)) {
+      // We must use a virtual architecture (PTX).
+      if (suffix != "" &&
+          !is_binary_compatible_cc(supported_real_cc, real_cc, suffix)) {
+        // Even PTX does not provide compatibility here.
+        if (error_ptr) {
+          *error_ptr = "No compatible architecture is supported, due to suffix";
+        }
+        return false;
+      }
       virt_cc = supported_real_cc;
     } else {
       // Pass the real arch to NVRTC.
@@ -3512,14 +3551,13 @@ inline bool process_architecture_flags(OptionsVec* compiler_options,
   // options.
   if (virt_cc) {
     compiler_options->push_back(
-        Option("-arch", "compute_" + std::to_string(virt_cc)));
+        Option("-arch", "compute_" + std::to_string(virt_cc) + suffix));
   } else {
     compiler_options->push_back(
-        Option("-arch", "sm_" + std::to_string(real_cc)));
-  }
-  if (linker_cc == 0) {
-    linker_options->push_back(Option("-arch", "sm_" + std::to_string(real_cc)));
+        Option("-arch", "sm_" + std::to_string(real_cc) + suffix));
   }
+  linker_options->push_back(
+      Option("-arch", "sm_" + std::to_string(real_cc) + suffix));
   return true;
 }
 
@@ -7652,14 +7690,19 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   struct ArchFlag {
     int cc;
     bool is_virtual;
+    std::string suffix;
     explicit operator Option() const {
-      return Option("-arch",
-                    (is_virtual ? "compute_" : "sm_") + std::to_string(cc));
+      return Option("-arch", (is_virtual ? "compute_" : "sm_") +
+                                 std::to_string(cc) + suffix);
     }
     bool operator==(const ArchFlag& other) const {
-      return cc == other.cc && is_virtual == other.is_virtual;
+      return cc == other.cc && is_virtual == other.is_virtual &&
+             suffix == other.suffix;
+    }
+    size_t hash() const {
+      return detail::hash_value(suffix,
+                                detail::fasthash64(cc) ^ (is_virtual * ~0));
     }
-    size_t hash() const { return detail::fasthash64(cc) ^ (is_virtual * ~0); }
     struct Hash {
       size_t operator()(const ArchFlag& x) const { return x.hash(); }
     };
@@ -7670,8 +7713,9 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     std::string error;
     size_t idx;
     bool is_virtual = false;
-    int given_cc =
-        detail::parse_arch_flag(compiler_options, &is_virtual, &error, &idx);
+    std::string suffix = "";
+    int given_cc = detail::parse_arch_flag(compiler_options, &is_virtual,
+                                           &suffix, &error, &idx);
     if (!error.empty()) {
       return Error("Failed to parse architecture flag: " + error);
     }
@@ -7690,14 +7734,14 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
       }
       is_virtual = true;
     }
-    arch_flags.insert({given_cc, is_virtual});
+    arch_flags.insert({given_cc, is_virtual, std::move(suffix)});
     // Remove the parsed arch flag entries; they are replaced below.
     compiler_options.erase(idx);
   }
   if (arch_flags.empty()) {
     // Push a placeholder entry so that preprocessing still runs (with the
     // default arch) when none was specified by the user.
-    arch_flags.insert({0, false});
+    arch_flags.insert({0, false, ""});
   }
   // We temporarily enable warnings so that we can parse the ones we added.
   const bool disable_warnings =
@@ -8724,25 +8768,24 @@ class ProgramCache {
       // Add the SM architecture to the key, as cubins are arch-specific.
       OptionsVec all_compiler_options =
           merge_compiler_options(extra_compiler_options);
+      OptionsVec all_linker_options =
+          merge_linker_options(extra_linker_options);
       std::string error;
-      bool is_virtual;
-      int given_cc =
-          detail::parse_arch_flag(all_compiler_options, &is_virtual, &error);
+      // This ensures the linker options will contain a fully-specified arch
+      // that matches what will be used during compilation and linking.
+      if (!detail::process_architecture_flags(&all_compiler_options,
+                                              &all_linker_options, &error)) {
+        return LoadedProgram::Error("Failed to process architecture flags: " +
+                                    error);
+      }
+      // Get the binary SM architecture.
+      std::string suffix;
+      const int linker_cc =
+          detail::parse_arch_flag(all_linker_options, nullptr, &suffix, &error);
       if (!error.empty()) {
         return LoadedProgram::Error("Failed to parse architecture flag: " +
                                     error);
       }
-      int compute_capability;
-      if (given_cc > 0 && !is_virtual) {
-        compute_capability = given_cc;
-      } else {
-        compute_capability =
-            detail::get_current_device_compute_capability(&error);
-        if (!error.empty()) {
-          return LoadedProgram::Error("Failed to detect device architecture: " +
-                                      error);
-        }
-      }
       if (!nvrtc()) return LoadedProgram::Error(nvrtc().error());
       const int nvrtc_major = nvrtc().get_version() / 1000;
       const int nvrtc_minor = nvrtc().get_version() / 10 % 100;
@@ -8760,7 +8803,7 @@ class ProgramCache {
 #endif
       std::stringstream filename_ss;
       filename_ss.imbue(std::locale::classic());
-      filename_ss << to_filename_(key) << ".sm" << compute_capability
+      filename_ss << to_filename_(key) << ".sm" << linker_cc << suffix
                   << ".nvrtc" << nvrtc_major << "-" << nvrtc_minor << "-"
                   << nvrtc_build << "." << linker_name << linker_major << "-"
                   << linker_minor << ".v" << std::hex
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 8709b10..16eedd3 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1596,6 +1596,33 @@ const int arch = __CUDA_ARCH__ / 10;
   ASSERT_EQ(program->link()->load()->get_global_value("arch", &arch), "");
   EXPECT_EQ(arch, current_arch);
 
+#if CUDA_VERSION >= 12010
+  // Test architecture-specific "a" suffix.
+  program = Program("arch_flags_program", source)
+                ->preprocess({"-arch=compute_90a"})
+                ->compile("", {}, {"-arch=compute_90a"});
+  EXPECT_GT(program->ptx().size(), 0);
+  EXPECT_EQ(program->cubin().size(), 0);
+  EXPECT_NE(program->ptx().find(".target sm_90a"), std::string::npos);
+
+  program = Program("arch_flags_program", source)
+                ->preprocess({"-arch=sm_90a"})
+                ->compile("", {}, {"-arch=sm_90a"});
+  EXPECT_GT(program->ptx().size(), 0);
+  JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE(program->cubin().size());
+  EXPECT_NE(program->ptx().find(".target sm_90a"), std::string::npos);
+#endif
+
+#if CUDA_VERSION >= 12090
+  // Test family-specific "f" suffix.
+  program = Program("arch_flags_program", source)
+                ->preprocess({"-arch=sm_100f"})
+                ->compile("", {}, {"-arch=sm_100f"});
+  EXPECT_GT(program->ptx().size(), 0);
+  JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE(program->cubin().size());
+  EXPECT_NE(program->ptx().find(".target sm_100f"), std::string::npos);
+#endif
+
 #undef JITIFY_EXPECT_CUBIN_SIZE_IF_AVAILABLE
 
   // Test that multiple architectures can be specified for preprocessing.

From 757196cf4b264226f85627417fd77c9091a30004 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Thu, 8 May 2025 19:02:06 +1000
Subject: [PATCH 38/47] Handle empty path in jitify2_preprocess

---
 jitify2_preprocess.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/jitify2_preprocess.cpp b/jitify2_preprocess.cpp
index 93ffdbe..f25d618 100644
--- a/jitify2_preprocess.cpp
+++ b/jitify2_preprocess.cpp
@@ -118,7 +118,9 @@ std::string sanitize_varname(const std::string& s) {
 bool make_directories_for(const std::string& filename) {
   using jitify2::detail::make_directories;
   using jitify2::detail::path_base;
-  if (!make_directories(path_base(filename))) {
+  std::string path = path_base(filename);
+  if (path.empty()) return true;
+  if (!make_directories(path)) {
     std::cerr << "Error creating directories for output file " << filename
               << std::endl;
     return false;

From 049d7c38ab9647545ab31c73298ea8aa22118b74 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 13 May 2025 21:14:36 +1000
Subject: [PATCH 39/47] Fix some include handling issues

- Fixes include paths being processed in reverse order.
- Fixes IncludeName path handling not respecting empty paths.
- Uses '@' instead of ':' as the separator in patched include names so
  that the names remain valid filenames (on Windows).
- Encodes include paths that appear in quote-includes, and sanitizes
  injected warnings, so that full paths are not included in shipped
  binaries.
- Adds jitify2::quote_include_name(), which can be used to override
  quote-includes (if really needed).
---
 jitify2.hpp     | 123 +++++++++++++++++++++++++++++++++++++++++-------
 jitify2_test.cu |  83 +++++++++++++++++++++++++++++---
 2 files changed, 184 insertions(+), 22 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 66b674a..8f4d87d 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5831,7 +5831,7 @@ static const StringMap& get_jitsafe_headers_map() {
 }
 
 // Elides "/." and "/.." tokens from path. Returns empty string if illformed.
-inline std::string path_simplify(StringRef path) {
+inline std::string path_simplify(StringRef path, bool canonicalize = false) {
 #if defined _WIN32 || defined _WIN64
   // Note that Windows supports both forward and backslash path separators.
   const char* sep = "\\/";
@@ -5856,7 +5856,7 @@ inline std::string path_simplify(StringRef path) {
       } else if (cur_dir != ".") {  // Ignore /./
         dirs.push_back(cur_dir);
         if (after_slash) {
-          seps.push_back(path[i]);
+          seps.push_back(canonicalize ? '/' : path[i]);
         }
       }
       cur_dir.clear();
@@ -5903,10 +5903,12 @@ inline bool read_text_file(const std::string& fullpath, std::string* content) {
 inline void extract_include_paths(OptionsVec* options,
                                   StringVec* include_paths) {
   const std::vector<int> idxs = options->find({"-I"});
+  include_paths->clear();
+  include_paths->resize(idxs.size());
   for (int i = (int)idxs.size() - 1; i >= 0; --i) {
     const int idx = idxs[i];
     std::string include_path = (*options)[idx].value();
-    include_paths->push_back(std::move(include_path));
+    (*include_paths)[i] = std::move(include_path);
     options->erase(idx);
   }
 }
@@ -6924,8 +6926,8 @@ struct SourceLocation {
   int line_ = 0;
 };
 
-static const char* const kJitifyDirPrefix = "__jitify_rel_inc:";
-static const char* const kJitifyNamePrefix = ":__jitify_name:";
+static const char* const kJitifyDirPrefix = "__jitify_rel_inc@";
+static const char* const kJitifyNamePrefix = "@__jitify_name@";
 
 class IncludeName {
  public:
@@ -6972,13 +6974,13 @@ class IncludeName {
    */
   std::string local_full_path() const {
     assert(is_quote_include());
-    return is_quote_include() ? current_dir() + "/" + name() : "";
+    return is_quote_include() ? nonlocal_full_path(current_dir()) : "";
   }
   /*! Returns the full path to the header assuming it exists in the given
    * include directory. May be called for either "" or <> includes.
    */
   std::string nonlocal_full_path(const std::string& include_dir) const {
-    return include_dir + "/" + include_name_;
+    return jitify2::detail::path_join(include_dir, include_name_);
   }
   // For quote-includes, this returns a modified name that encodes the current
   // dir too.
@@ -6987,6 +6989,10 @@ class IncludeName {
     return kJitifyDirPrefix + current_dir() + kJitifyNamePrefix + name();
   }
 
+  IncludeName with_current_dir(std::string _current_dir) const {
+    return IncludeName(name(), _current_dir, location());
+  }
+
   friend bool operator==(const IncludeName& lhs, const IncludeName& rhs) {
     return lhs.name() == rhs.name() && lhs.current_dir() == rhs.current_dir();
   }
@@ -7279,6 +7285,8 @@ inline bool operator&(ProcessFlags lhs, ProcessFlags rhs) {
 // they end up not being reachable due to #if[def] directives.
 // Note: It is OK if source and *processed_source are the same underlying memory
 // (i.e., in-place operation is OK).
+// Note: include_visitor should be a callable with signature
+// (IncludeName*)->void.
 template <typename IncludeVisitor>
 inline ErrorMsg process_cuda_source(const std::string& source,
                                     const std::string& full_path,
@@ -7291,12 +7299,13 @@ inline ErrorMsg process_cuda_source(const std::string& source,
   ErrorMsg err = visit_all_include_directives(
       tokens.begin(), tokens.end(), full_path,
       [&](IncludeName include, CppParserIterator<TokenIterator> iter) {
+        // Note: We pass by mutable pointer to allow the visitor to modify it.
+        include_visitor(&include);
         if (include.is_quote_include()) {
           // Change `#include "name"` to `#include <patched_name>`, where
           // patched_name encodes the current dir as well as the name.
           *iter = Token(Tt::kString, "<" + include.patched_name() + ">");
         }
-        include_visitor(std::move(include));
       });
   if (err) return err;
   // Insert "#line 1" at the beginning of the file so that line numbering is
@@ -7337,6 +7346,20 @@ inline ErrorMsg process_cuda_source(const std::string& source,
 
 }  // namespace parser
 
+/*! Converts a quote-include name into Jitify's internal representation. This
+ *  can be used when specifying extra_header_sources in order to override a
+ *  quote-include. Note that this only works for quote-includes that appear
+ *  in the same directory as the source file (i.e., the current working dir).
+ *  E.g., if the source file contains `#include "foo/bar.h"`, that header can
+ *  be overridden via extra_header_sources by specifying the name
+ *  `quote_include_name("foo/bar.h")`.
+ *  Note that this isn't really recommended. It's likely better to use
+ *  angle-includes, or to use "-include" to add a completely new header.
+ */
+inline std::string quote_include_name(std::string name) {
+  return IncludeName(name, ".").patched_name();
+}
+
 namespace detail {
 
 static const char* const kJitifyBuiltinHeaderPrefix = "__jitify_builtin";
@@ -7532,6 +7555,27 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   bool should_remove_unused_globals = static_cast<bool>(compiler_options.pop(
       {"-remove-unused-globals", "--remove-unused-globals"}));
 
+  StringVec include_paths;
+  detail::extract_include_paths(&compiler_options, &include_paths);
+  for (std::string& include_path : include_paths) {
+    include_path = detail::path_simplify(include_path, /*canonicalize=*/true);
+  }
+  // Returns index of longest matching include dir, or -1 if no match.
+  auto match_include_path = [&](std::string path, size_t* length) -> int {
+    path = detail::path_simplify(path, /*canonicalize=*/true);
+    *length = 0;
+    int matched_index = -1;
+    for (int i = 0; i < (int)include_paths.size(); ++i) {
+      const std::string& include_path = include_paths[i];
+      if (include_path.size() > *length &&
+          detail::startswith(path, include_path)) {
+        *length = include_path.size();
+        matched_index = i;
+      }
+    }
+    return matched_index;
+  };
+
   using parser::IncludeName;
   using parser::ProcessFlags;
   std::unordered_map<IncludeName, std::string, IncludeName::Hash>
@@ -7544,16 +7588,52 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   const ProcessFlags replace_std_flag_if_enabled =
       use_cuda_std ? ProcessFlags::kReplaceStd : ProcessFlags::kNone;
 
+  static const char* const kJitifyEncodedIncludePath = "__jitify_I";
+  // Replaces an include path prefix with an index to avoid it appearing
+  // in the shipped binary.
+  auto encode_include_path = [&](IncludeName include) -> IncludeName {
+    size_t prefix_length;
+    const int matched_include_path_index =
+        match_include_path(include.current_dir(), &prefix_length);
+    if (matched_include_path_index != -1) {
+      include = include.with_current_dir(
+          kJitifyEncodedIncludePath +
+          std::to_string(matched_include_path_index) + "@" +
+          include.current_dir().substr(prefix_length));
+    }
+    return include;
+  };
+  auto decode_include_path = [&](IncludeName include) -> IncludeName {
+    if (include.is_quote_include()) {
+      std::string current_dir = include.current_dir();
+      size_t pos = current_dir.find(kJitifyEncodedIncludePath);
+      if (pos != std::string::npos) {
+        pos += std::strlen(kJitifyEncodedIncludePath);
+        const size_t end = current_dir.find("@", pos);
+        const int index = std::stoi(current_dir.substr(pos, end - pos));
+        current_dir = include_paths.at(index) + current_dir.substr(end + 1);
+        include = include.with_current_dir(current_dir);
+      }
+    }
+    return include;
+  };
+
   auto process_cuda_source_fn =
       [&](std::string* source_ptr, const std::string& fullpath,
           ProcessFlags extra_flags = ProcessFlags::kNone) {
         return parser::process_cuda_source(
             source_ptr->c_str(), fullpath, process_flags | extra_flags,
-            cxx_standard_year, source_ptr, [&](IncludeName include) {
-              if (include_to_fullpath.count(include)) {
+            cxx_standard_year, source_ptr, [&](IncludeName* include) {
+              if (include->is_quote_include() &&
+                  detail::path_is_absolute(include->current_dir())) {
+                // Replace an include path prefix with an index to avoid it
+                // appearing in the shipped binary.
+                *include = encode_include_path(*include);
+              }
+              if (include_to_fullpath.count(*include)) {
                 return;
               }
-              include_queue.push(std::move(include));
+              include_queue.push(*include);
             });
       };
 
@@ -7594,9 +7674,6 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     include_to_fullpath.emplace(IncludeName(name), std::move(fullpath));
   }
 
-  StringVec include_paths;
-  detail::extract_include_paths(&compiler_options, &include_paths);
-
   // Process preincludes as if they are <> includes.
   for (int idx : compiler_options.find({"--pre-include", "-include"})) {
     const std::string& preinclude = compiler_options[idx].value();
@@ -7612,9 +7689,10 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     include_queue.pop();
     std::string header_fullpath;
     using detail::HeaderLoadStatus;
+    const IncludeName decoded_include_name = decode_include_path(include_name);
     const HeaderLoadStatus status = detail::load_header(
-        include_name, header_callback, include_paths, use_builtin_headers,
-        &header_fullpath, &fullpath_to_source);
+        decoded_include_name, header_callback, include_paths,
+        use_builtin_headers, &header_fullpath, &fullpath_to_source);
     // Note: We ignore missing headers here because they may not be needed; if
     // they are needed, the error will be caught when we invoke the compiler.
     if (status == HeaderLoadStatus::kFailed) continue;
@@ -7807,6 +7885,19 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     }
   }
 
+  // Redact full include paths from used header warnings in header sources
+  // so that they aren't shipped with the binary.
+  for (auto& header_name_and_source : header_sources) {
+    std::string& header_source = header_name_and_source.second;
+    static const char* const warning_string = "#warning JITIFY_USED_HEADER ";
+    size_t pos = header_source.find(warning_string);
+    assert(pos != std::string::npos);
+    pos += std::strlen(warning_string);
+    const size_t end = header_source.find("\n", pos);
+    assert(end != std::string::npos);
+    std::memset(&header_source[pos], '*', end - pos);
+  }
+
   // Re-add the --disable-warnings flag if it was provided.
   if (disable_warnings) {
     compiler_options.push_back(Option("-w"));
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 16eedd3..92e55f0 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -717,6 +717,12 @@ TEST(Jitify2Test, PathSimplify) {
             R"(\foo/bar\cat)");
   EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\../cat)"),
             R"(\foo/cat)");
+  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\../cat)",
+                                           /*canonicalize=*/true),
+            R"(/foo/cat)");
+  EXPECT_EQ(jitify2::detail::path_simplify(R"(///foo///.\\\bar\\\..///cat)",
+                                           /*canonicalize=*/true),
+            R"(/foo/cat)");
 #endif
 }
 
@@ -792,7 +798,36 @@ __global__ void my_kernel() {}
   ASSERT_EQ(get_error(compiled), "");
 }
 
+TEST(Jitify2Test, EncodedQuoteIncludes) {
+  // This tests that encoding of quote-includes works, and that the full include
+  // path is not left in the modified header source.
+  // Note that cuda_fp16.h contains `#include "cuda_fp16.hpp"`, which will be
+  // encoded with the cuda include dir.
+  static const char* const source = R"(
+#include <cuda_fp16.h>
+__global__ void my_kernel() {}
+)";
+  auto preprog =
+      Program("my_program", source)
+          ->preprocess({"-Ifoo/bar", "-I/cat/dog", "-I" CUDA_INC_DIR});
+  ASSERT_EQ(get_error(preprog), "");
+  auto compiled = preprog->compile();
+  ASSERT_EQ(get_error(compiled), "");
+  // Note: The '2' here is the index of the cuda include dir amongst the
+  // "-I" options.
+  ASSERT_TRUE(
+      preprog->header_sources().at("cuda_fp16.h").find("__jitify_I2@") !=
+      std::string::npos);
+  for (const auto& name_header : preprog->header_sources()) {
+    const std::string& header_source = name_header.second;
+    ASSERT_FALSE(header_source.find(CUDA_INC_DIR) != std::string::npos);
+  }
+}
+
 TEST(Jitify2Test, ExplicitHeaderSources) {
+  // TODO: This is currently only testing preprocess(), not compile(). We need
+  // to test compile() as well, because it behaves differently and may be the
+  // more common use-case.
   // This test checks how the keys in a user-provided header_sources map are
   // matched to #include directives in the source.
   static const std::string good_header = R"()";
@@ -870,6 +905,42 @@ TEST(Jitify2Test, ExplicitHeaderSources) {
   ASSERT_EQ(get_error(preprog), "");
 }
 
+TEST(Jitify2Test, ExtraHeaderSourcesOverride) {
+  // This tests that includes can be overridden by extra_header_sources.
+  static const char* const source = R"(
+#ifdef USE_QUOTE_INCLUDE
+#include "example_headers/my_header1.cuh"
+#else
+#include <example_headers/my_header1.cuh>
+#endif
+
+template <typename T>
+__global__ void my_kernel(T* data) {
+  *data = cube(*data);
+}
+)";
+  static const char* const header = R"(
+template <typename T>
+__device__ T cube(T x) { return x * x * x; }
+)";
+  // Test angle-include.
+  PreprocessedProgram preprog =
+      Program("my_program", source)->preprocess({"-I."});
+  *preprog->get_kernel("my_kernel<int>", {},
+                       {{"example_headers/my_header1.cuh", header}});
+
+  // Test quote-include.
+  // Note that this requires the use of jitify2::quote_include_name().
+  // Note also that this isn't really recommended. It's likely better to use
+  // angle-includes, or to use "-include" to add a completely new header.
+  preprog = Program("my_program", source)
+                ->preprocess({"-DUSE_QUOTE_INCLUDE", "-I" CUDA_INC_DIR});
+  *preprog->get_kernel(
+      "my_kernel<int>", {},
+      {{jitify2::quote_include_name("example_headers/my_header1.cuh"),
+        header}});
+}
+
 TEST(Jitify2Test, Preincludes) {
   // This tests that preincludes get preprocessed and that absolute paths are
   // handled correctly. Note that cuda.h includes <stdlib.h>, so it must be
@@ -2442,8 +2513,8 @@ using ::cuda::std::array;
 de /*blah*/ <a.h> /*blah*/ // blah
 #line 1
 const char* bar = "#include \"y.h\"";
-# /*blah*/ include /*blah*/ <__jitify_rel_inc:.:__jitify_name:b.h> /*blah*/ // blah
-#include <__jitify_rel_inc:.:__jitify_name:foo/c.h>
+# /*blah*/ include /*blah*/ <__jitify_rel_inc@.@__jitify_name@b.h> /*blah*/ // blah
+#include <__jitify_rel_inc@.@__jitify_name@foo/c.h>
 #include <foo/c.h>
 const char* cat = R"blah(#include "z.h")blah" "dog";
 int i = cat[0 + 1];
@@ -2460,8 +2531,8 @@ const char*include="#include <x.h>";using cuda::std::array;using::cuda::std::arr
 #include<a.h>
 #line 1
 const char*bar="#include \"y.h\"";
-#include<__jitify_rel_inc:.:__jitify_name:b.h>
-#include<__jitify_rel_inc:.:__jitify_name:foo/c.h>
+#include<__jitify_rel_inc@.@__jitify_name@b.h>
+#include<__jitify_rel_inc@.@__jitify_name@foo/c.h>
 #include<foo/c.h>
 const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
 #endif
@@ -2477,7 +2548,7 @@ const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
                   ProcessFlags::kReplacePragmaOnce | ProcessFlags::kReplaceStd |
                       ProcessFlags::kAddUsedHeaderWarning,
                   cxx_standard_year, &processed_source,
-                  [&](IncludeName include) { includes.push_back(include); })
+                  [&](IncludeName* include) { includes.push_back(*include); })
                   .empty());
   std::vector<IncludeName> expected_includes = {
       IncludeName("a.h"), IncludeName("b.h", current_dir),
@@ -2495,7 +2566,7 @@ const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
                       ProcessFlags::kAddUsedHeaderWarning |
                       ProcessFlags::kMinify,
                   cxx_standard_year, &processed_source,
-                  [&](IncludeName include) { includes.push_back(include); })
+                  [&](IncludeName* include) { includes.push_back(*include); })
                   .empty());
   ASSERT_EQ(includes, expected_includes);
   EXPECT_EQ(includes[0].location().file_name(), include_fullpath);

From adbeb714fdb016275fa1686dca6a54c537328e2b Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 14 May 2025 09:44:09 +1000
Subject: [PATCH 40/47] Fix lonely hash "invalid preprocessing directive"

- Preprocessor directives containing no identifier (i.e., just a hash)
  are legal code, but we were treating them as an error.
- This fixes an issue with a CCCL header in CUDA 12.9.
- This commit also fixes an unnecessary assertion failure with old CUDA
  versions.
---
 jitify2.hpp     | 14 ++++++--------
 jitify2_test.cu | 31 +++++++++++++++++--------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 8f4d87d..b9e318b 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -7045,12 +7045,9 @@ inline ErrorMsg visit_all_include_directives(TokenIterator begin,
   using Tt = Token::Type;
   for (auto iter = make_cpp_parser_iterator(begin, end); iter; ++iter) {
     if (iter.match(Tt::kHash)) {
-      if (!iter.match(Tt::kIdentifier)) {
-        return error_msg(
-            iter.line_number(),
-            "invalid preprocessing directive #" + iter->source_string());
-      }
-      if (iter.previous_token().token_string() == "include") {
+      // Note that preprocessor directives are allowed to have no identifier.
+      if (iter.match(Tt::kIdentifier) &&
+          iter.previous_token().token_string() == "include") {
         auto prev_iter = iter;
         // Note: It is possible to have macro substitutions here instead of a
         // string literal, but it is very rare, and some popular tools are
@@ -7891,10 +7888,11 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     std::string& header_source = header_name_and_source.second;
     static const char* const warning_string = "#warning JITIFY_USED_HEADER ";
     size_t pos = header_source.find(warning_string);
-    assert(pos != std::string::npos);
+    // Ignore special cases (e.g., CUB header that we force to empty).
+    if (pos == std::string::npos) continue;
     pos += std::strlen(warning_string);
     const size_t end = header_source.find("\n", pos);
-    assert(end != std::string::npos);
+    if (end == std::string::npos) continue;
     std::memset(&header_source[pos], '*', end - pos);
   }
 
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 92e55f0..7f60330 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -2478,6 +2478,7 @@ c+=a+++b;c+=a+ ++b;c+=a+++b;c+=a+++++ ++++b;a: ::b;)~";
 TEST(Jitify2ParserTest, ProcessCudaSource) {
   using namespace jitify2::parser;
   static const char* const source = R"~(
+# // Nothing here
 # /*blah*/pragma /*blah*/once  // blah
 const char* include = "#include <x.h>";
 #pragma once
@@ -2497,13 +2498,14 @@ const char* cat = R"blah(#include "z.h")blah" "dog";
 int i = cat[0 + 1];
 )~";
   static const char* const expected =
-      R"~(#ifndef JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
-#define JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+      R"~(#ifndef JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
+#define JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
 #ifdef JITIFY_USED_HEADER_WARNINGS
 #warning JITIFY_USED_HEADER "./my_header.cuh"
 #endif
 #line 1
 
+# // Nothing here
 const char* include = "#include <x.h>";
 // A comment.
 //using std::array;
@@ -2518,15 +2520,16 @@ const char* bar = "#include \"y.h\"";
 #include <foo/c.h>
 const char* cat = R"blah(#include "z.h")blah" "dog";
 int i = cat[0 + 1];
-#endif // JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+#endif // JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
 )~";
   static const char* const expected_minified =
-      R"~(#ifndef JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
-#define JITIFY_INCLUDE_GUARD_D17F1E6F8466B0A8F5157A76D6618008AF6353BBABC40BE8FC2AFF6B38D21883
+      R"~(#ifndef JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
+#define JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
 #ifdef JITIFY_USED_HEADER_WARNINGS
 #warning JITIFY_USED_HEADER "./my_header.cuh"
 #endif
 #line 1
+#
 const char*include="#include <x.h>";using cuda::std::array;using::cuda::std::array;
 #include<a.h>
 #line 1
@@ -2543,19 +2546,19 @@ const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
   std::string include_name = "my_header.cuh";
   std::string current_dir = ".";
   std::string include_fullpath = current_dir + "/" + include_name;
-  EXPECT_TRUE(process_cuda_source(
-                  source, include_fullpath,
-                  ProcessFlags::kReplacePragmaOnce | ProcessFlags::kReplaceStd |
-                      ProcessFlags::kAddUsedHeaderWarning,
-                  cxx_standard_year, &processed_source,
-                  [&](IncludeName* include) { includes.push_back(*include); })
-                  .empty());
+  EXPECT_EQ((const std::string&)process_cuda_source(
+                source, include_fullpath,
+                ProcessFlags::kReplacePragmaOnce | ProcessFlags::kReplaceStd |
+                    ProcessFlags::kAddUsedHeaderWarning,
+                cxx_standard_year, &processed_source,
+                [&](IncludeName* include) { includes.push_back(*include); }),
+            "");
   std::vector<IncludeName> expected_includes = {
       IncludeName("a.h"), IncludeName("b.h", current_dir),
       IncludeName("foo/c.h", "."), IncludeName("foo/c.h")};
   ASSERT_EQ(includes, expected_includes);
   EXPECT_EQ(includes[0].location().file_name(), include_fullpath);
-  EXPECT_EQ(includes[0].location().line(), 11);
+  EXPECT_EQ(includes[0].location().line(), 12);
   EXPECT_EQ(includes[2].location().file_name(), include_fullpath);
   EXPECT_EQ(includes[2].location().line(), 3);
   EXPECT_EQ(processed_source, expected);
@@ -2570,7 +2573,7 @@ const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
                   .empty());
   ASSERT_EQ(includes, expected_includes);
   EXPECT_EQ(includes[0].location().file_name(), include_fullpath);
-  EXPECT_EQ(includes[0].location().line(), 11);
+  EXPECT_EQ(includes[0].location().line(), 12);
   EXPECT_EQ(includes[2].location().file_name(), include_fullpath);
   EXPECT_EQ(includes[2].location().line(), 3);
   EXPECT_EQ(processed_source, expected_minified);

From be6f65f50ddd649911adda0217fc23cd4b818f1d Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Thu, 15 May 2025 12:30:24 +1000
Subject: [PATCH 41/47] Change ULLONG_MAX definition to match libcudacxx

- This avoids "incompatible redefinition" warnings when both <limits>
  and <cuda/std/limits> are included (and in that order).
---
 jitify2.hpp     | 2 +-
 jitify2_test.cu | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index b9e318b..f82b167 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -4377,7 +4377,7 @@ JITIFY_DEFINE_C_AND_CXX_HEADERS(limits, R"(
 #define LLONG_MIN  (-LLONG_MAX - 1)
 #endif
 #ifndef ULLONG_MAX
-#define ULLONG_MAX 0xffffffffffffffffULL
+#define ULLONG_MAX 0xffffffffffffffffUL
 #endif
 )",
                                 "");
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 7f60330..ec5786b 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -2016,6 +2016,7 @@ TEST(Jitify2Test, LibCudaCxxAndBuiltinLimits) {
   ASSERT_EQ(get_error(preprog), "");
   CompiledProgram compiled = preprog->compile();
   ASSERT_EQ(get_error(compiled), "");
+  ASSERT_EQ(compiled->log(), "");  // Ensure no warnings
 }
 #endif  // CUDA_VERSION >= 11000
 

From 9df3902f73d511313af0c79b956b0a9f6bbca988 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Thu, 15 May 2025 12:38:29 +1000
Subject: [PATCH 42/47] Fix user guide code sample and tweak test command

---
 jitify2_user_guide.md | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/jitify2_user_guide.md b/jitify2_user_guide.md
index b43b60f..b85d8e1 100644
--- a/jitify2_user_guide.md
+++ b/jitify2_user_guide.md
@@ -48,16 +48,15 @@ It provides a simple API for compiling and executing CUDA source code at runtime
   dim3 grid(1), block(1);
   float* data;
   cudaMalloc((void**)&data, sizeof(float));
-  jitify2::LoadedProgram program =
-      jitify2::Program(program_name, program_source)
-          // Preprocess source code and load all included headers.
-          ->preprocess({"-std=c++14"})
-          // Compile, link, and load the program, and obtain the loaded kernel.
-          ->get_kernel("my_kernel<float>")
-          // Configure the kernel launch.
-          ->configure(grid, block)
-          // Launch the kernel.
-          ->launch(data);
+  jitify2::Program(program_name, program_source)
+      // Preprocess source code and load all included headers.
+      ->preprocess({"-std=c++14"})
+      // Compile, link, and load the program, and obtain the loaded kernel.
+      ->get_kernel("my_kernel<float>")
+      // Configure the kernel launch.
+      ->configure(grid, block)
+      // Launch the kernel.
+      ->launch(data);
 ```
 
 <a name="error_handling"/>
@@ -228,7 +227,7 @@ The unit tests can be built and run using CMake as follows:
 
 ```bash
 $ mkdir build && cd build && cmake ..
-$ make check
+$ make check -j6
 ```
 
 Note that the tests in `jitify2_test.cu` may also be useful as a form of

From 359d766bf5dbd0d195dd49623a64db3f2b3e332b Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Fri, 16 May 2025 15:19:33 +1000
Subject: [PATCH 43/47] Fix pragma once removal breaking line numbers

- Previously, the whole pragma once line was removed, which messed up
  line numbering. Now, only the "pragma ... once" tokens are removed,
  leaving "#\n" and maintaining line numbers.
- Also adds a test that line numbers are maintained.
- Note that the existing tests for line numbers of include directives
  were not affected by this because they are extracted before the
  pragma once directives are removed.
---
 jitify2.hpp     | 29 +++++++++++++--------------
 jitify2_test.cu | 52 +++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index f82b167..0161a29 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -6838,11 +6838,13 @@ class CppParserIterator {
     return true;
   }
 
+  bool peek_identifier(const char* name) {
+    return current_->type() == Token::Type::kIdentifier &&
+           current_->token_string() == name;
+  }
+
   bool match_identifier(const char* name) {
-    if (current_->type() != Token::Type::kIdentifier ||
-        current_->token_string() != name) {
-      return false;
-    }
+    if (!peek_identifier(name)) return false;
     ++(*this);
     return true;
   }
@@ -6861,6 +6863,8 @@ class CppParserIterator {
     for (token_iterator it = first_to_erase.base(); it != current_; ++it) {
       previous_tokens_.pop();
     }
+    // Note: The ++ here advances to the next _base_ token (because we don't
+    // want to jump over subsequent comment or whitespace tokens).
     current_ = token_container->erase(first_to_erase.base(), ++current_);
     skip_whitespace_and_comments();
     return *this;
@@ -7142,21 +7146,16 @@ inline bool replace_pragma_once_with_ifndef(const std::string& unique_source_id,
   bool found = false;
   for (auto iter = make_cpp_parser_iterator(tokens->begin(), tokens->end());
        iter;) {
-    auto start_iter = iter;
     if (iter.match(Tt::kHash)) {
-      if (iter.match_identifier("pragma") && iter.match_identifier("once")) {
-        iter.advance_to(Tt::kEndOfDirective);
-        if (!iter) break;
-        // Note: The ++ here advances to the next _base_ token (because we don't
-        // want to jump over subsequent comment or whitespace tokens).
+      auto start_iter = iter;
+      if (iter.match_identifier("pragma") && iter.peek_identifier("once")) {
+        // Erase "pragma ... once", leaving "#\n".
         iter.erase_back_to(tokens, start_iter);
         found = true;
-        // Note: There can be more than one #pragma once.
-        continue;
-      } else {
-        iter.advance_to(Tt::kEndOfDirective);
-        if (!iter) break;
+        // Note: There can be more than one #pragma once, so we don't break out.
       }
+      iter.advance_to(Tt::kEndOfDirective);
+      if (!iter) break;
     }
     ++iter;
   }
diff --git a/jitify2_test.cu b/jitify2_test.cu
index ec5786b..c5346d5 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -2041,6 +2041,30 @@ __global__ void my_assert_kernel() {
   // longer be used for CUDA operations after this point.
 }
 
+TEST(Jitify2Test, LineNumbers) {
+  // This checks that line numbers are maintained after preprocessing replaces
+  // #pragma once directives etc.
+  static const char* const source = R"(
+#include <test_line_numbers.cuh>
+)";
+  static const char* const header_source = R"(// Line 1
+// Line 2
+#pragma once  // Line 3
+// Line 4
+#warning TEST_WARNING  // Line 5
+)";
+  CompiledProgram compiled =
+      jitify2::Program("line_numbers_program", source,
+                       {{"test_line_numbers.cuh", header_source}})
+          ->preprocess()
+          ->compile();
+  ASSERT_EQ(get_error(compiled), "");
+  const std::string expected =
+      "test_line_numbers.cuh(5): warning #1105-D: #warning directive: "
+      "TEST_WARNING";
+  EXPECT_EQ(compiled->log().substr(0, expected.size()), expected);
+}
+
 TEST(Jitify2Test, Minify) {
   static const char* const name = "my_program";
   // This source is intentionally tricky to parse so that it stresses the
@@ -2484,6 +2508,8 @@ TEST(Jitify2ParserTest, ProcessCudaSource) {
 const char* include = "#include <x.h>";
 #pragma once
 #pragma once
+#pragma pack(1)
+struct PackedStruct {};
 // A comment.
 //using std::array;
 using std::array;
@@ -2499,15 +2525,20 @@ const char* cat = R"blah(#include "z.h")blah" "dog";
 int i = cat[0 + 1];
 )~";
   static const char* const expected =
-      R"~(#ifndef JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
-#define JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
+      R"~(#ifndef JITIFY_INCLUDE_GUARD_02AF516366B59070A1EF7711992CA9E66C3BE9955A6440B49A271CF7EE1D3239
+#define JITIFY_INCLUDE_GUARD_02AF516366B59070A1EF7711992CA9E66C3BE9955A6440B49A271CF7EE1D3239
 #ifdef JITIFY_USED_HEADER_WARNINGS
 #warning JITIFY_USED_HEADER "./my_header.cuh"
 #endif
 #line 1
 
 # // Nothing here
+# /*blah*/  // blah
 const char* include = "#include <x.h>";
+#
+#
+#pragma pack(1)
+struct PackedStruct {};
 // A comment.
 //using std::array;
 using cuda::std::array;
@@ -2521,17 +2552,22 @@ const char* bar = "#include \"y.h\"";
 #include <foo/c.h>
 const char* cat = R"blah(#include "z.h")blah" "dog";
 int i = cat[0 + 1];
-#endif // JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
+#endif // JITIFY_INCLUDE_GUARD_02AF516366B59070A1EF7711992CA9E66C3BE9955A6440B49A271CF7EE1D3239
 )~";
   static const char* const expected_minified =
-      R"~(#ifndef JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
-#define JITIFY_INCLUDE_GUARD_05492F4AF61B4B5D5B3321684DD52AD05E5B1A44B9116CEFDF6F999843941EFF
+      R"~(#ifndef JITIFY_INCLUDE_GUARD_02AF516366B59070A1EF7711992CA9E66C3BE9955A6440B49A271CF7EE1D3239
+#define JITIFY_INCLUDE_GUARD_02AF516366B59070A1EF7711992CA9E66C3BE9955A6440B49A271CF7EE1D3239
 #ifdef JITIFY_USED_HEADER_WARNINGS
 #warning JITIFY_USED_HEADER "./my_header.cuh"
 #endif
 #line 1
 #
-const char*include="#include <x.h>";using cuda::std::array;using::cuda::std::array;
+#
+const char*include="#include <x.h>";
+#
+#
+#pragma pack(1)
+struct PackedStruct{};using cuda::std::array;using::cuda::std::array;
 #include<a.h>
 #line 1
 const char*bar="#include \"y.h\"";
@@ -2559,7 +2595,7 @@ const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
       IncludeName("foo/c.h", "."), IncludeName("foo/c.h")};
   ASSERT_EQ(includes, expected_includes);
   EXPECT_EQ(includes[0].location().file_name(), include_fullpath);
-  EXPECT_EQ(includes[0].location().line(), 12);
+  EXPECT_EQ(includes[0].location().line(), 14);
   EXPECT_EQ(includes[2].location().file_name(), include_fullpath);
   EXPECT_EQ(includes[2].location().line(), 3);
   EXPECT_EQ(processed_source, expected);
@@ -2574,7 +2610,7 @@ const char*cat=R"blah(#include "z.h")blah""dog";int i=cat[0+1];
                   .empty());
   ASSERT_EQ(includes, expected_includes);
   EXPECT_EQ(includes[0].location().file_name(), include_fullpath);
-  EXPECT_EQ(includes[0].location().line(), 12);
+  EXPECT_EQ(includes[0].location().line(), 14);
   EXPECT_EQ(includes[2].location().file_name(), include_fullpath);
   EXPECT_EQ(includes[2].location().line(), 3);
   EXPECT_EQ(processed_source, expected_minified);

From 3e47c9984686ca67131620cc7c762d01e906f109 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Fri, 16 May 2025 16:04:18 +1000
Subject: [PATCH 44/47] Refactor and clean up builtin headers

- Refactors builtin headers to properly handle the "header.h" and
  "cheader" versions both being included without conflicting.
- Fixes incompatibility between our tuple header and libcudacxx's.
- Makes other minor fixes and enhancements to some headers.
- Adds tests for compatibility within and between all builtin headers.
---
 jitify2.hpp     | 340 ++++++++++++++++++++++++++++++++----------------
 jitify2_test.cu |  49 +++++++
 2 files changed, 280 insertions(+), 109 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index 0161a29..eb4da7a 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -4287,22 +4287,18 @@ static const char* const jitsafe_header_preinclude_h =
 #endif
     ;
 
-#define JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(name, header, std_and_global_impl, \
-                                           std_only_impl)                     \
-  static const char* const jitsafe_header_##name##_h =                        \
-      "#pragma once\n" header "\n" std_and_global_impl;                       \
-  static const char* const jitsafe_header_c##name =                           \
-      "#pragma once\n" header                                                 \
-      "\n"                                                                    \
-      "namespace std {\n" std_only_impl std_and_global_impl                   \
-      "}  // namespace std\n" std_and_global_impl
-
-#define JITIFY_DEFINE_C_AND_CXX_HEADERS(name, header, std_and_global_impl) \
-  JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(name, header, std_and_global_impl, "")
-
-JITIFY_DEFINE_C_AND_CXX_HEADERS(assert, "", "");
-
-JITIFY_DEFINE_C_AND_CXX_HEADERS(float, R"(
+static const char* const jitsafe_header_assert_h = R"(
+#pragma once
+// Note: NVRTC defines the assert() macro.
+)";
+
+static const char* const jitsafe_header_cassert = R"(
+#pragma once
+#include <assert.h>
+)";
+
+static const char* const jitsafe_header_float_h = R"(
+#pragma once
 #define FLT_RADIX       2
 #define FLT_MANT_DIG    24
 #define DBL_MANT_DIG    53
@@ -4327,10 +4323,23 @@ JITIFY_DEFINE_C_AND_CXX_HEADERS(float, R"(
 #define FLT_EVAL_METHOD 0
 #define DECIMAL_DIG     21
 #endif
-)",
-                                "");
+#if defined __cplusplus && __cplusplus >= 201703L
+#define FLT_DECIMAL_DIG 9
+#define DBL_DECIMAL_DIG 17
+#define FLT_TRUE_MIN    1.40129846432481707092372958328991613e-45f
+#define DBL_TRUE_MIN    4.94065645841246544176568792868221372e-324
+#define FLT_HAS_SUBNORM 1
+#define DBL_HAS_SUBNORM 1
+#endif
+)";
+
+static const char* const jitsafe_header_cfloat = R"(
+#pragma once
+#include <float.h>
+)";
 
-JITIFY_DEFINE_C_AND_CXX_HEADERS(limits, R"(
+static const char* const jitsafe_header_limits_h = R"(
+#pragma once
 #if defined _WIN32 || defined _WIN64
  #define __WORDSIZE 32
 #else
@@ -4379,39 +4388,50 @@ JITIFY_DEFINE_C_AND_CXX_HEADERS(limits, R"(
 #ifndef ULLONG_MAX
 #define ULLONG_MAX 0xffffffffffffffffUL
 #endif
-)",
-                                "");
+)";
+
+static const char* const jitsafe_header_climits = R"(
+#pragma once
+#include <limits.h>
+)";
+
+static const char* const jitsafe_header_math_h = R"(
+#pragma once
+#define M_PI 3.14159265358979323846
+// Note: Global namespace already includes CUDA math funcs.
+)";
 
-// Note: Global namespace already includes CUDA math funcs
-JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(math, "#define M_PI 3.14159265358979323846",
-                                   "", R"(
+static const char* const jitsafe_header_cmath = R"(
+#pragma once
+#include <math.h>
+namespace std {
 #if __cplusplus >= 201103L
-#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f)                       \
+#define JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(f)                \
   inline double f(double x) { return ::f(x); }                  \
   inline float f##f(float x) { return ::f(x); }                 \
   /*inline long double f##l(long double x) { return ::f(x); }*/ \
   inline float f(float x) { return ::f(x); }                    \
   /*inline long double f(long double x)    { return ::f(x); }*/
 #else
-#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f)       \
-  inline double f(double x) { return ::f(x); }  \
-  inline float f##f(float x) { return ::f(x); } \
+#define JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \
+  inline double f(double x) { return ::f(x); }   \
+  inline float f##f(float x) { return ::f(x); }  \
   /*inline long double f##l(long double x) { return ::f(x); }*/
 #endif
-DEFINE_MATH_UNARY_FUNC_WRAPPER(cos)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(sin)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(tan)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(acos)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(asin)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(atan)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(cos)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(sin)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(tan)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(acos)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(asin)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(atan)
 template <typename T>
 inline T atan2(T y, T x) {
   return ::atan2(y, x);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(cosh)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(sinh)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(tanh)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(exp)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(cosh)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(sinh)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(tanh)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(exp)
 template <typename T>
 inline T frexp(T x, int* exp) {
   return ::frexp(x, exp);
@@ -4420,8 +4440,8 @@ template <typename T>
 inline T ldexp(T x, int exp) {
   return ::ldexp(x, exp);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(log)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(log10)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(log)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(log10)
 template <typename T>
 inline T modf(T x, T* intpart) {
   return ::modf(x, intpart);
@@ -4430,31 +4450,31 @@ template <typename T>
 inline T pow(T x, T y) {
   return ::pow(x, y);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(sqrt)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(ceil)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(floor)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(sqrt)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(ceil)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(floor)
 template <typename T>
 inline T fmod(T n, T d) {
   return ::fmod(n, d);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(fabs)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(fabs)
 template <typename T>
 inline T abs(T x) {
   return ::abs(x);
 }
 #if __cplusplus >= 201103L
-DEFINE_MATH_UNARY_FUNC_WRAPPER(acosh)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(asinh)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(atanh)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(exp2)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(expm1)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(acosh)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(asinh)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(atanh)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(exp2)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(expm1)
 template <typename T>
 inline int ilogb(T x) {
   return ::ilogb(x);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(log1p)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(log2)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(logb)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(log1p)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(log2)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(logb)
 template <typename T>
 inline T scalbn(T x, int n) {
   return ::scalbn(x, n);
@@ -4463,17 +4483,17 @@ template <typename T>
 inline T scalbln(T x, long n) {
   return ::scalbn(x, n);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(cbrt)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(cbrt)
 template <typename T>
 inline T hypot(T x, T y) {
   return ::hypot(x, y);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(erf)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(erfc)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(tgamma)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(lgamma)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(trunc)
-DEFINE_MATH_UNARY_FUNC_WRAPPER(round)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(erf)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(erfc)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(tgamma)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(lgamma)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(trunc)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(round)
 template <typename T>
 inline long lround(T x) {
   return ::lround(x);
@@ -4482,7 +4502,7 @@ template <typename T>
 inline long long llround(T x) {
   return ::llround(x);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(rint)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(rint)
 template <typename T>
 inline long lrint(T x) {
   return ::lrint(x);
@@ -4491,15 +4511,19 @@ template <typename T>
 inline long long llrint(T x) {
   return ::llrint(x);
 }
-DEFINE_MATH_UNARY_FUNC_WRAPPER(nearbyint)
+JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER(nearbyint)
 // TODO: remainder, remquo, copysign, nan, nextafter, nexttoward, fdim,
 // fmax, fmin, fma
 #endif  // __cplusplus >= 201103L
-#undef DEFINE_MATH_UNARY_FUNC_WRAPPER
-)");
+#undef JITIFY_DEFINE_MATH_UNARY_FUNC_WRAPPER
+}  // namespace std
+)";
 
+static const char* const jitsafe_header_stddef_h = R"(
+#pragma once
+#define NULL 0
 // TODO: offsetof
-JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(stddef, "#include <climits>", R"(
+// Note: NVRTC provides built-in definitions of ::size_t, ::ptrdiff_t, and ::wchar_t.
 #if __cplusplus >= 201103L
 typedef decltype(nullptr) nullptr_t;
 #if defined(_MSC_VER)
@@ -4516,18 +4540,31 @@ typedef decltype(nullptr) nullptr_t;
   } max_align_t;
 #endif
 #endif  // __cplusplus >= 201103L
+)";
+
+static const char* const jitsafe_header_cstddef = R"(
+#pragma once
+#include <stddef.h>
+namespace std {
+using ::size_t;
+using ::ptrdiff_t;
+// Note: NVRTC defines wchar_t as a macro, so we can't define std::wchar_t.
+#if __cplusplus >= 201103L
+using ::nullptr_t;
+using ::max_align_t;
+#endif  // __cplusplus >= 201103L
 #if __cplusplus >= 201703L
 enum class byte : unsigned char {};
+// TODO: byte operators.
+template <class I>
+constexpr I to_integer(byte b) noexcept;
 #endif  // __cplusplus >= 201703L
-)",
-                                   R"(
-// NVRTC provides built-in definitions of ::size_t and ::ptrdiff_t.
-using ::size_t;
-using ::ptrdiff_t;
-)");
+}  // namespace std
+)";
 
-JITIFY_DEFINE_C_AND_CXX_HEADERS(stdint, R"(
-#include <climits>
+static const char* const jitsafe_header_stdint_h = R"(
+#pragma once
+#include <limits.h>
 #define INT8_MIN SCHAR_MIN
 #define INT16_MIN SHRT_MIN
 #define INT32_MIN INT_MIN
@@ -4556,8 +4593,7 @@ JITIFY_DEFINE_C_AND_CXX_HEADERS(stdint, R"(
 #define WCHAR_MAX                                                              \
     (sizeof(wchar_t) == 2 ? _JITIFY_WCHAR_T_IS_UNSIGNED ? USHRT_MAX : SHRT_MAX \
                           : _JITIFY_WCHAR_T_IS_UNSIGNED ? UINT_MAX : INT_MAX)
-)",
-                                R"(
+static_assert(INT8_MIN == SCHAR_MIN, "");  // Sanity test that both are defined
 typedef signed char int8_t;
 typedef signed short int16_t;
 typedef signed int int32_t;
@@ -4586,41 +4622,115 @@ typedef unsigned long long uint_least64_t;
 typedef unsigned long long uintmax_t;
 typedef int64_t intptr_t;  // optional
 typedef uint64_t uintptr_t;  // optional
-)");
+)";
+
+static const char* const jitsafe_header_cstdint = R"(
+#pragma once
+#include <stdint.h>
+namespace std {
+using ::int8_t;
+using ::int16_t;
+using ::int32_t;
+using ::int64_t;
+using ::int_fast8_t;
+using ::int_fast16_t;
+using ::int_fast32_t;
+using ::int_fast64_t;
+using ::int_least8_t;
+using ::int_least16_t;
+using ::int_least32_t;
+using ::int_least64_t;
+using ::intmax_t;
+using ::uint8_t;
+using ::uint16_t;
+using ::uint32_t;
+using ::uint64_t;
+using ::uint_fast8_t;
+using ::uint_fast16_t;
+using ::uint_fast32_t;
+using ::uint_fast64_t;
+using ::uint_least8_t;
+using ::uint_least16_t;
+using ::uint_least32_t;
+using ::uint_least64_t;
+using ::uintmax_t;
+using ::intptr_t;  // optional
+using ::uintptr_t;  // optional
+}  // namespace std
+)";
 
-JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(stdio, "#include <cstddef>", R"(
+static const char* const jitsafe_header_stdio_h = R"(
+#pragma once
+#define NULL 0
 using FILE = int;
 int fflush(FILE* stream);
 int fprintf(FILE* stream, const char* format, ...);
-)",
-                                   R"(
-// NVRTC provides a built-in definition of ::size_t.
+)";
+
+static const char* const jitsafe_header_cstdio = R"(
+#pragma once
+#include <stdio.h>
+namespace std {
 using ::size_t;
-)");
+using ::FILE;
+using ::fflush;
+using ::fprintf;
+}  // namespace std
+)";
 
-JITIFY_DEFINE_C_AND_CXX_HEADERS(stdlib, "#include <cstddef>", "");
+static const char* const jitsafe_header_stdlib_h = R"(
+#pragma once
+#define NULL 0
+)";
+
+static const char* const jitsafe_header_cstdlib = R"(
+#pragma once
+#include <stdlib.h>
+namespace std {
+using ::size_t;
+}  // namespace std
+)";
 
-JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(string, "", R"(
-//#include <cstddef>
+static const char* const jitsafe_header_string_h = R"(
+#pragma once
+#define NULL 0
 char* strcpy(char* destination, const char* source);
 int strcmp(const char* str1, const char* str2);
 char* strerror(int errnum);
 char* strcat(char* dest, const char* src);
-)",
-                                   R"(
-// NVRTC provides a built-in definition of ::size_t.
+)";
+
+static const char* const jitsafe_header_cstring = R"(
+#pragma once
+#include <string.h>
+namespace std {
 using ::size_t;
-)");
+using ::strcpy;
+using ::strcmp;
+using ::strerror;
+using ::strcat;
+}  // namespace std
+)";
 
-// va_start, va_arg etc. are predefined by NVRTC, but we still need a header.
-JITIFY_DEFINE_C_AND_CXX_HEADERS(stdarg, "", "");
+static const char* const jitsafe_header_stdarg_h = R"(
+#pragma once
+// Note: va_start, va_arg etc. are predefined by NVRTC.
+)";
+
+static const char* const jitsafe_header_cstdarg = R"(
+#pragma once
+#include <stdarg.h>
+namespace std {
+using ::va_list;
+}  // namespace std
+)";
 
-JITIFY_DEFINE_C_AND_CXX_HEADERS_EX(time, R"(
+static const char* const jitsafe_header_time_h = R"(
+#pragma once
 #define NULL 0
 #define CLOCKS_PER_SEC 1000000
-)",
-                                   R"(
 typedef long time_t;
+// Note: NVRTC provides built-in definitions of ::size_t and ::clock_t.
 struct tm {
   int tm_sec;
   int tm_min;
@@ -4638,15 +4748,21 @@ struct timespec {
   long tv_nsec;
 };
 #endif
-)",
-                                   R"(
-// NVRTC provides built-in definitions of ::size_t and ::clock_t.
+)";
+
+static const char* const jitsafe_header_ctime = R"(
+#pragma once
+#include <time.h>
+namespace std {
+using ::time_t;
 using ::size_t;
 using ::clock_t;
-)");
-
-#undef JITIFY_DEFINE_C_AND_CXX_HEADERS
-#undef JITIFY_DEFINE_C_AND_CXX_HEADERS_EX
+using ::tm;
+#if __cplusplus >= 201703L
+using ::timespec;
+#endif
+}  // namespace std
+)";
 
 static const char* const jitsafe_header_algorithm = R"(
 #pragma once
@@ -4674,7 +4790,8 @@ JITIFY_CXX14_CONSTEXPR const T& min(const T& a, const T& b) {
 static const char* const jitsafe_header_array = R"(
 #pragma once
 namespace std {
-template <class T, std::size_t N>
+using ::size_t;
+template <class T, size_t N>
 class array {
   T data_[N];
 
@@ -5056,7 +5173,8 @@ static const char* const jitsafe_header_tuple = R"(
 namespace std {
 template <class... Types> class tuple;
 
-template <size_t I, class T>
+// Note: T is variadic only so that it matches libcudacxx's definition.
+template <size_t I, class... T>
 struct tuple_element;
 // Recursive case.
 template <size_t I, class Head, class... Tail>
@@ -5561,11 +5679,12 @@ template <typename... Ts> using void_t = typename __jitify_make_void<Ts...>::typ
 
 static const char* const jitsafe_header_utility = R"(
 #pragma once
-#include <cstring>  // For std::size_t
 #include <type_traits>
 
 namespace std {
 
+using ::size_t;
+
 template <class T1, class T2>
 struct pair {
   T1 first;
@@ -5602,16 +5721,16 @@ class integer_sequence {
  public:
   using type = integer_sequence;  // Needed by make_index_sequence
   using value_type = T;
-  static constexpr std::size_t size() noexcept { return sizeof...(Ints); }
+  static constexpr size_t size() noexcept { return sizeof...(Ints); }
 };
 
-template <std::size_t... Ints>
-using index_sequence = std::integer_sequence<std::size_t, Ints...>;
+template <size_t... Ints>
+using index_sequence = std::integer_sequence<size_t, Ints...>;
 
 namespace __jitify_detail {
-template <std::size_t Sequence1Length, class Sequence1, class Sequence2>
+template <size_t Sequence1Length, class Sequence1, class Sequence2>
 struct concat_integer_sequence;
-template <std::size_t Sequence1Length, typename T, T... Ints1, T... Ints2>
+template <size_t Sequence1Length, typename T, T... Ints1, T... Ints2>
 struct concat_integer_sequence<Sequence1Length, integer_sequence<T, Ints1...>,
                                integer_sequence<T, Ints2...>>
     : integer_sequence<T, Ints1..., (Sequence1Length + Ints2)...> {};
@@ -5641,8 +5760,8 @@ JITIFY_DEFINE_MAKE_INTEGER_SEQUENCE_TYPE(long long)
 JITIFY_DEFINE_MAKE_INTEGER_SEQUENCE_TYPE(unsigned long long)
 #undef JITIFY_DEFINE_MAKE_INTEGER_SEQUENCE_TYPE
 
-template <std::size_t N>
-using make_index_sequence = std::make_integer_sequence<std::size_t, N>;
+template <size_t N>
+using make_index_sequence = std::make_integer_sequence<size_t, N>;
 
 template <class... T>
 using index_sequence_for = std::make_index_sequence<sizeof...(T)>;
@@ -5738,6 +5857,9 @@ namespace std {
 // even with nvcc.
 static const char* const jitsafe_header_typeinfo = R"(
 #pragma once
+namespace std {
+using ::size_t;
+}  // namespace std
 // WAR for typeid being builtin but not supported in device code.
 #define typeid(x) type_info{}
 class type_info {
diff --git a/jitify2_test.cu b/jitify2_test.cu
index c5346d5..0f936bd 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -1795,6 +1795,41 @@ __global__ void nontype_kernel() {}
 #undef JITIFY_NONTYPE_REFLECTION_TEST
 }
 
+static const StringVec& get_jitsafe_headers_list() {
+  static StringVec headers = [] {
+    StringVec result;
+    for (const auto& name_source : jitify2::detail::get_jitsafe_headers_map()) {
+      result.push_back(name_source.first);
+    }
+    std::sort(result.begin(), result.end());
+    return result;
+  }();
+  return headers;
+}
+
+TEST(Jitify2Test, BuiltinHeadersAllCompatible) {
+  std::string source;
+  for (const std::string& header_name : get_jitsafe_headers_list()) {
+    source += "#include <" + header_name + ">\n";
+  }
+  CompiledProgram compiled =
+      jitify2::Program("my_program", source)->preprocess({"-std=c++17"})->compile();
+  ASSERT_EQ(get_error(compiled), "");
+  EXPECT_EQ(compiled->log(), "");
+}
+
+TEST(Jitify2Test, BuiltinHeadersIndividual) {
+  for (const std::string& header_name : get_jitsafe_headers_list()) {
+    std::string source = "#include <" + header_name + ">\n";
+    CompiledProgram compiled =
+      jitify2::Program("my_program", source)->preprocess({"-std=c++17"})->compile();
+    EXPECT_EQ(get_error(compiled), "");
+    if (compiled) {
+      EXPECT_EQ(compiled->log(), "");
+    }
+  }
+}
+
 TEST(Jitify2Test, BuiltinNumericLimitsHeader) {
   static const char* const source = R"(
 #include <limits>
@@ -2018,6 +2053,20 @@ TEST(Jitify2Test, LibCudaCxxAndBuiltinLimits) {
   ASSERT_EQ(get_error(compiled), "");
   ASSERT_EQ(compiled->log(), "");  // Ensure no warnings
 }
+
+TEST(Jitify2Test, LibCudaCxxAndBuiltinTuple) {
+  static const char* const source = R"(
+#include <tuple>
+#include <cuda/std/tuple>
+)";
+
+  PreprocessedProgram preprog =
+      Program("tuple_program", source)->preprocess({"-I" CUDA_INC_DIR});
+  ASSERT_EQ(get_error(preprog), "");
+  CompiledProgram compiled = preprog->compile();
+  ASSERT_EQ(get_error(compiled), "");
+  ASSERT_EQ(compiled->log(), "");  // Ensure no warnings
+}
 #endif  // CUDA_VERSION >= 11000
 
 TEST(Jitify2Test, AssertHeader) {

From 937f9fd5bce6cad72b7db693a5f185c5c691433a Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 20 May 2025 18:57:10 +1000
Subject: [PATCH 45/47] Add exception, numeric, and cxxabi builtin headers

- Adding placeholder headers like these is of low value, but it
  improves out-of-the-box compatibility with existing codes that
  aren't RTC-safe.
---
 jitify2.hpp | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 5 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index eb4da7a..a2e2e5a 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5044,6 +5044,10 @@ struct IntegerLimits {
     traps             = false
   };
 };
+
+#undef JITIFY_CXX11_NOEXCEPT
+#undef JITIFY_CXX11_CONSTEXPR
+
 }  // namespace __jitify_detail
 template <typename T>
 struct numeric_limits {
@@ -5139,14 +5143,29 @@ static const char* const jitsafe_header_sstream = R"(
 #include <istream>
 )";
 
-static const char* const jitsafe_header_stdexcept = R"(
+static const char* const jitsafe_header_exception = R"(
 #pragma once
 #include <string>
 namespace std {
-struct runtime_error {
-  explicit runtime_error( const string& what_arg );
-  explicit runtime_error( const char* what_arg );
-  virtual const char* what() const;
+class exception {
+ public:
+  exception() noexcept;
+  exception(const exception& other) noexcept;
+  exception& operator=(const exception& other) noexcept;
+  virtual ~exception() {}
+  virtual const char* what() const noexcept;
+};
+}  // namespace std
+)";
+
+static const char* const jitsafe_header_stdexcept = R"(
+#pragma once
+#include <exception>
+namespace std {
+struct runtime_error : public exception {
+  explicit runtime_error(const string& what_arg);
+  explicit runtime_error(const char* what_arg);
+  const char* what() const noexcept override;
 };
 }  // namespace std
 )";
@@ -5887,6 +5906,71 @@ int setitimer(int, const struct itimerval*, struct itimerval*);
 int utimes(const char*, const struct timeval[2]);
 )";
 
+static const char* const jitsafe_header_numeric = R"(
+#pragma once
+namespace std {
+
+#if __cplusplus >= 202002L
+#define JITIFY_CXX20_CONSTEXPR constexpr
+#else
+#define JITIFY_CXX20_CONSTEXPR
+#endif
+
+template <class InputIter, class T>
+JITIFY_CXX20_CONSTEXPR T accumulate(InputIter first, InputIter last, T init);
+
+template <class InputIter, class T, class BinaryOp>
+JITIFY_CXX20_CONSTEXPR T accumulate(
+    InputIter first, InputIter last, T init, BinaryOp op);
+
+template <class InputIter1, class InputIter2, class T>
+JITIFY_CXX20_CONSTEXPR T inner_product(
+    InputIter1 first1, InputIter1 last1, InputIter2 first2, T init);
+
+template <class InputIter1, class InputIter2, class T,
+         class BinaryOp1, class BinaryOp2>
+JITIFY_CXX20_CONSTEXPR T inner_product(
+    InputIter1 first1, InputIter1 last1, InputIter2 first2, T init,
+    BinaryOp1 op1, BinaryOp2 op2);
+
+template <class InputIter, class OutputIter>
+JITIFY_CXX20_CONSTEXPR OutputIter adjacent_difference(
+    InputIter first, InputIter last, OutputIter d_first);
+
+template <class InputIter, class OutputIter, class BinaryOp>
+JITIFY_CXX20_CONSTEXPR OutputIter adjacent_difference(
+    InputIter first, InputIter last, OutputIter d_first, BinaryOp op);
+
+template <class InputIter, class OutputIter>
+JITIFY_CXX20_CONSTEXPR OutputIter partial_sum(
+    InputIter first, InputIter last, OutputIter d_first);
+
+template <class InputIter, class OutputIter, class BinaryOp>
+JITIFY_CXX20_CONSTEXPR OutputIter partial_sum(
+    InputIter first, InputIter last, OutputIter d_first, BinaryOp op);
+
+#if __cplusplus >= 201103L
+
+template <class ForwardIter, class T>
+JITIFY_CXX20_CONSTEXPR void iota(ForwardIter first, ForwardIter last, T value);
+
+#endif  // __cplusplus >= 201103L
+
+// TODO: More functions added since C++17.
+
+#undef JITIFY_CXX20_CONSTEXPR
+
+}  // namespace std
+)";
+
+static const char* const jitsafe_header_cxxabi_h = R"(
+#pragma once
+namespace abi {
+extern "C" char* __cxa_demangle(
+    const char* mangled_name, char* output_buffer, size_t* length, int* status);
+}  // namespace abi
+)";
+
 // WAR: These need to be pre-added as a workaround for NVRTC implicitly using
 // /usr/include as an include path. The other built-in headers will be included
 // lazily as needed.
@@ -5935,6 +6019,7 @@ static const StringMap& get_jitsafe_headers_map() {
       {"mutex", jitsafe_header_mutex},
       {"ostream", jitsafe_header_ostream},
       {"sstream", jitsafe_header_sstream},
+      {"exception", jitsafe_header_exception},
       {"stdexcept", jitsafe_header_stdexcept},
       {"string", jitsafe_header_string},
       {"tuple", jitsafe_header_tuple},
@@ -5948,6 +6033,8 @@ static const StringMap& get_jitsafe_headers_map() {
       {"iomanip", jitsafe_header_iomanip},
       {"typeinfo", jitsafe_header_typeinfo},
       {"sys/time.h", jitsafe_header_sys_time},
+      {"numeric", jitsafe_header_numeric},
+      {"cxxabi.h", jitsafe_header_cxxabi_h},
   };
   return jitsafe_headers_map;
 }

From 28b4df93d626bc807b8fe6e4a63ed9304df5a73e Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Tue, 20 May 2025 19:01:07 +1000
Subject: [PATCH 46/47] Fix invoke_result implementation

- This is still incomplete, but covers most cases.
- It's recommended to use <cuda/std/type_traits> instead; at some
  point we should look into doing this automatically.
---
 jitify2.hpp     | 73 ++++++++++++++++++++++++++++++++++---------------
 jitify2_test.cu | 11 ++++++++
 2 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index a2e2e5a..ed08011 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -5211,6 +5211,9 @@ struct tuple_element<0, tuple<Head, Tail...>> {
 // TODO: This is incomplete.
 static const char* const jitsafe_header_type_traits = R"(
 #pragma once
+
+#include <utility>  // For std::declval
+
 #if __cplusplus >= 201103L
 namespace std {
 
@@ -5331,25 +5334,6 @@ struct is_function<Ret(Args...)> : true_type {};  // regular
 template <class Ret, class... Args>
 struct is_function<Ret(Args......)> : true_type {};  // variadic
 
-template <class>
-struct result_of;
-template <class F, typename... Args>
-struct result_of<F(Args...)> {
-  // TODO: This is a hack; a proper implem is quite complicated.
-  typedef typename F::result_type type;
-};
-// Note: We include this before C++17 for convenience.
-// TODO: This implementation is probably not standard-conforming.
-template <class F, class... Args>
-struct invoke_result : result_of<F(Args...)> {};
-
-#if __cplusplus >= 201402L
-template <class T>
-using result_of_t = typename result_of<T>::type;
-template <class F, class... Args>
-using invoke_result_t = typename invoke_result<F, Args...>::type;
-#endif  // __cplusplus >= 201402L
-
 template <class T> struct is_pointer                    : false_type {};
 template <class T> struct is_pointer<T*>                : true_type {};
 template <class T> struct is_pointer<T* const>          : true_type {};
@@ -5491,6 +5475,41 @@ template <class T>
 using remove_cvref_t = typename remove_cvref<T>::type;
 #endif
 
+namespace __jitify_detail {
+// TODO: Need specialization for member function pointers.
+template <class T>
+struct invoke_impl {
+  template <class Func, class... Args>
+  static auto call(Func&& func, Args&&... args)
+      -> decltype(std::forward<Func>(func)(std::forward<Args>(args)...));
+};
+template <class Func, class... Args, class Fn = typename std::decay<Func>::type>
+auto invoke(Func&& func, Args&&... args)
+    -> decltype(invoke_impl<Fn>::call(std::forward<Func>(func),
+                                      std::forward<Args>(args)...));
+template <typename Void, typename, typename...>
+struct invoke_result {};
+template <typename Func, typename... Args>
+struct invoke_result<decltype(void(invoke(std::declval<Func>(),
+                                          std::declval<Args>()...))),
+                     Func, Args...> {
+  using type = decltype(invoke(std::declval<Func>(), std::declval<Args>()...));
+};
+}  // namespace __jitify_detail
+
+template<class> struct result_of;
+template <class Func, class... Args>
+struct result_of<Func(Args...)> :
+    __jitify_detail::invoke_result<void, Func, Args...> {};
+
+template <class Func, class... Args>
+struct invoke_result : __jitify_detail::invoke_result<void, Func, Args...> {};
+
+template <class T>
+using result_of_t = typename result_of<T>::type;
+template <class Func, class... Args>
+using invoke_result_t = typename invoke_result<Func, Args...>::type;
+
 template <class T, T v>
 struct integral_constant {
   static constexpr T value = v;
@@ -5698,7 +5717,6 @@ template <typename... Ts> using void_t = typename __jitify_make_void<Ts...>::typ
 
 static const char* const jitsafe_header_utility = R"(
 #pragma once
-#include <type_traits>
 
 namespace std {
 
@@ -5721,17 +5739,28 @@ pair<T1, T2> make_pair(const T1& first, const T2& second) {
 
 #if __cplusplus >= 201103L
 
+namespace __jitify_utility_detail {
+template <class T>
+struct type_identity { using type = T; };
+template <class T>
+auto add_rvalue_reference_impl(int) -> type_identity<T&&>;
+template <class T>
+auto add_rvalue_reference_impl(...) -> type_identity<T>;
+template <class T>
+struct add_rvalue_reference : decltype(add_rvalue_reference_impl<T>(0)) {};
+}  // namespace __jitify_utility_detail
+
 template <typename T>
 struct __jitify_always_false {
   static constexpr bool value = false;
 };
 template <typename T>
-typename std::add_rvalue_reference<T>::type declval() noexcept {
+typename __jitify_utility_detail::add_rvalue_reference<T>::type declval() noexcept {
   static_assert(__jitify_always_false<T>::value,
                 "declval not allowed in an evaluated context");
+}
 
 #endif  // __cplusplus >= 201103L
-}
 
 #if __cplusplus >= 201402L
 
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 0f936bd..0c8a766 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -2195,6 +2195,17 @@ __global__ void my_kernel() {}
   ASSERT_EQ(compiled->ptx(), orig_ptx);
 }
 
+TEST(Jitify2Test, InvokeResult) {
+  static const std::string source = R"(
+#include <type_traits>
+double op(float, int) { return 0.0; }
+static_assert(
+    std::is_same<std::invoke_result_t<decltype(op), float, int>, double>::value,
+    "");
+)";
+  *Program("my_program", source)->preprocess()->compile();
+}
+
 bool read_binary_file(const char* filename, std::string* contents) {
   std::ifstream file(filename, std::ios::binary | std::ios::ate);
   if (!file) return false;

From a4ef1f8425af3772bdd6dd646e025ccfbb9a7a51 Mon Sep 17 00:00:00 2001
From: Ben Barsdell <bbarsdell@nvidia.com>
Date: Wed, 21 May 2025 20:45:12 +1000
Subject: [PATCH 47/47] Improve directory handling and support symlinks

- Replaces path_simplify() with get_real_path(), which returns the
  full canonical path with symlinks expanded. This requires the
  path to exist, so explicit header sources (which are not real
  files) needed some special handling.
- Full path expansion is also applied to "." paths. To avoid these
  being embedded in the application, they are now explicitly encoded
  as ".".
- Also adds a test for include directory order.
- Also fixes some formatting.
---
 jitify2.hpp     | 112 +++++++++++++++++-------------------
 jitify2_test.cu | 148 +++++++++++++++++++++++++-----------------------
 2 files changed, 129 insertions(+), 131 deletions(-)

diff --git a/jitify2.hpp b/jitify2.hpp
index ed08011..740a086 100644
--- a/jitify2.hpp
+++ b/jitify2.hpp
@@ -169,6 +169,7 @@
 #include <direct.h>       // For mkdir
 #include <fcntl.h>        // For open, O_RDWR etc.
 #include <io.h>           // For _sopen_s
+#include <stdlib.h>       // For _fullpath
 #include <sys/locking.h>  // For _LK_LOCK etc.
 #define JITIFY_PATH_MAX MAX_PATH
 #else
@@ -2489,7 +2490,7 @@ inline bool path_exists(const char* filename, bool* is_dir = nullptr) {
 
 inline const char* get_current_executable_path() {
   static const char* path = []() -> const char* {
-    static char buffer[JITIFY_PATH_MAX] = {};
+    static char buffer[JITIFY_PATH_MAX + 1] = {};
 #ifdef __linux__
     if (!::realpath("/proc/self/exe", buffer)) return nullptr;
 #elif defined(_WIN32) || defined(_WIN64)
@@ -6068,48 +6069,19 @@ static const StringMap& get_jitsafe_headers_map() {
   return jitsafe_headers_map;
 }
 
-// Elides "/." and "/.." tokens from path. Returns empty string if illformed.
-inline std::string path_simplify(StringRef path, bool canonicalize = false) {
-#if defined _WIN32 || defined _WIN64
-  // Note that Windows supports both forward and backslash path separators.
-  const char* sep = "\\/";
+// Returns the canonical full path (resolving all symlinks, "." and ".."
+// references, and repeat slashes) for the given filename, or an empty string on
+// failure. The filename must exist and be accessible.
+// Note: "." -> current working directory.
+// Note: "" -> "".
+inline std::string get_real_path(const char* filename) {
+  char buffer[JITIFY_PATH_MAX + 1] = {};
+#if defined(_WIN32) || defined(_WIN64)
+  if (!::_fullpath(buffer, filename, JITIFY_PATH_MAX)) return "";
 #else
-  const char* sep = "/";
+  if (!::realpath(filename, buffer)) return "";
 #endif
-  const int n = (int)path.size();
-  StringVec dirs;
-  std::string seps;
-  std::string cur_dir;
-  bool after_slash = false;
-  for (int i = 0; i < n + 1; ++i) {
-    if (i == n || std::strchr(sep, path[i])) {
-      if (after_slash) continue;  // Ignore repeat slashes
-      after_slash = i < n;
-      if (cur_dir == ".." && !dirs.empty() && dirs.back() != "..") {
-        if (dirs.size() == 1 && dirs.front().empty()) {
-          return {};  // Bad path: back-traversals exceed depth of absolute path
-        }
-        dirs.pop_back();
-        seps.pop_back();
-      } else if (cur_dir != ".") {  // Ignore /./
-        dirs.push_back(cur_dir);
-        if (after_slash) {
-          seps.push_back(canonicalize ? '/' : path[i]);
-        }
-      }
-      cur_dir.clear();
-    } else {
-      after_slash = false;
-      cur_dir.push_back(path[i]);
-    }
-  }
-  std::ostringstream ss;
-  for (int i = 0; i < (int)dirs.size() - 1; ++i) {
-    ss << dirs[i] << seps[i];
-  }
-  if (!dirs.empty()) ss << dirs.back();
-  if (after_slash) ss << seps.back();
-  return ss.str();
+  return std::string(buffer);
 }
 
 // Reads a whole text file into *content. Returns false on failure.
@@ -7591,6 +7563,10 @@ inline ErrorMsg process_cuda_source(const std::string& source,
  *  angle-includes, or to use "-include" to add a completely new header.
  */
 inline std::string quote_include_name(std::string name) {
+  // Note: Preprocessing encodes the current directory as ".", so this will
+  // match that. We also wouldn't want to use get_real_path(".") anyway because
+  // it wouldn't necessarily match the current directory that was used during
+  // preprocessing.
   return IncludeName(name, ".").patched_name();
 }
 
@@ -7614,7 +7590,7 @@ HeaderLoadStatus load_header(const parser::IncludeName& include,
                              bool use_builtin_headers, std::string* full_path,
                              StringMapT* fullpath_to_source) {
   auto already_loaded = [&](const std::string& fp) {
-    return fullpath_to_source->count(fp);
+    return !fp.empty() && fullpath_to_source->count(fp);
   };
   auto newly_loaded = [&](std::string source) {
     fullpath_to_source->emplace(*full_path, std::move(source));
@@ -7624,18 +7600,21 @@ HeaderLoadStatus load_header(const parser::IncludeName& include,
   if (path_is_absolute(include.name())) {
     // Handle absolute filename.
     *full_path = include.name();
-    *full_path = path_simplify(*full_path);
+    // Try loading absolute filename via callback.
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    if (header_callback && header_callback(include, &source)) {
+      return newly_loaded(std::move(source));
+    }
+    // Try loading absolute filename from the filesystem.
+    *full_path = get_real_path(full_path->c_str());
     if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
-    // Try loading via callback or from the filesystem.
-    if ((header_callback && header_callback(include, &source)) ||
-        read_text_file(*full_path, &source)) {
+    if (!full_path->empty() && read_text_file(*full_path, &source)) {
       return newly_loaded(std::move(source));
     }
     return HeaderLoadStatus::kFailed;
   }
   // Try loading via callback.
   *full_path = include.nonlocal_full_path(kJitifyCallbackHeaderPrefix);
-  *full_path = path_simplify(*full_path);
   if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
   if (header_callback && header_callback(include, &source)) {
     return newly_loaded(std::move(source));
@@ -7643,25 +7622,30 @@ HeaderLoadStatus load_header(const parser::IncludeName& include,
   // Try loading from current directory.
   if (include.is_quote_include()) {
     *full_path = include.local_full_path();
-    *full_path = path_simplify(*full_path);
+    // Note: We first match with existing full paths _before_ applying
+    // get_real_path() so that extra header sources (which may not actually
+    // exist in the filesystem) are found.
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    *full_path = get_real_path(full_path->c_str());
     if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
-    if (read_text_file(*full_path, &source)) {
+    if (!full_path->empty() && read_text_file(*full_path, &source)) {
       return newly_loaded(std::move(source));
     }
   }
   // Try loading from include directories.
   for (const std::string& include_path : include_paths) {
     *full_path = include.nonlocal_full_path(include_path);
-    *full_path = path_simplify(*full_path);
+    // See comment above for why we do this here.
     if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
-    if (read_text_file(*full_path, &source)) {
+    *full_path = get_real_path(full_path->c_str());
+    if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
+    if (!full_path->empty() && read_text_file(*full_path, &source)) {
       return newly_loaded(std::move(source));
     }
   }
   // Try loading from builtin headers.
   if (use_builtin_headers) {
     *full_path = include.nonlocal_full_path(kJitifyBuiltinHeaderPrefix);
-    *full_path = path_simplify(*full_path);
     if (already_loaded(*full_path)) return HeaderLoadStatus::kAlreadyLoaded;
     auto iter = get_jitsafe_headers_map().find(include.name());
     if (iter != get_jitsafe_headers_map().end()) {
@@ -7792,11 +7776,12 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   StringVec include_paths;
   detail::extract_include_paths(&compiler_options, &include_paths);
   for (std::string& include_path : include_paths) {
-    include_path = detail::path_simplify(include_path, /*canonicalize=*/true);
+    include_path = detail::get_real_path(include_path.c_str());
   }
+  // Remove empty (non-existent) include paths (erase-remove idiom).
+  include_paths.erase(std::remove(include_paths.begin(), include_paths.end(), std::string{}), include_paths.end());
   // Returns index of longest matching include dir, or -1 if no match.
   auto match_include_path = [&](std::string path, size_t* length) -> int {
-    path = detail::path_simplify(path, /*canonicalize=*/true);
     *length = 0;
     int matched_index = -1;
     for (int i = 0; i < (int)include_paths.size(); ++i) {
@@ -7822,6 +7807,8 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
   const ProcessFlags replace_std_flag_if_enabled =
       use_cuda_std ? ProcessFlags::kReplaceStd : ProcessFlags::kNone;
 
+  const std::string starting_dir = detail::get_real_path(".");
+
   static const char* const kJitifyEncodedIncludePath = "__jitify_I";
   // Replaces an include path prefix with an index to avoid it appearing
   // in the shipped binary.
@@ -7834,6 +7821,12 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
           kJitifyEncodedIncludePath +
           std::to_string(matched_include_path_index) + "@" +
           include.current_dir().substr(prefix_length));
+    } else {
+      // Try matching current directory and encode as ".".
+      if (detail::startswith(include.current_dir(), starting_dir)) {
+        include = include.with_current_dir(
+            "." + include.current_dir().substr(starting_dir.size()));
+      }
     }
     return include;
   };
@@ -7847,6 +7840,9 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
         const int index = std::stoi(current_dir.substr(pos, end - pos));
         current_dir = include_paths.at(index) + current_dir.substr(end + 1);
         include = include.with_current_dir(current_dir);
+      } else if (detail::startswith(current_dir, ".")) {
+        current_dir = starting_dir + current_dir.substr(1);
+        include = include.with_current_dir(current_dir);
       }
     }
     return include;
@@ -7871,9 +7867,8 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
             });
       };
 
-  const std::string current_dir = ".";
   const std::string program_fullpath =
-      detail::path_join(current_dir, detail::sanitize_slashes(program_name));
+      detail::path_join(starting_dir, detail::sanitize_slashes(program_name));
   ErrorMsg err = process_cuda_source_fn(&program_source, program_fullpath,
                                         replace_std_flag_if_enabled);
   if (err) return Error(err);
@@ -7891,15 +7886,14 @@ inline PreprocessedProgram PreprocessedProgram::preprocess(
     std::string* source_ptr = &header_source.second;
     std::string fullpath = detail::path_is_absolute(name)
                                ? name
-                               : detail::path_join(current_dir, name);
-    fullpath = detail::path_simplify(fullpath);
+                               : detail::path_join(starting_dir, name);
     err = process_cuda_source_fn(
         source_ptr, fullpath,
         replace_std_flag_if_enabled | ProcessFlags::kAddUsedHeaderWarning);
     if (err) return Error(err);
     // Note: The names (keys) in header_sources will be matched:
     // a) directly, for `#include <name>` directives, and
-    // b) as if they are filenames (relative to the current exe dir if not
+    // b) as if they are filenames (relative to the current working dir if not
     //    absolute), for `#include "name"` directives. This will NOT fall back
     //    to direct matching like <> includes.
     // This allows path-based matching.
diff --git a/jitify2_test.cu b/jitify2_test.cu
index 0c8a766..e6a10ee 100644
--- a/jitify2_test.cu
+++ b/jitify2_test.cu
@@ -669,63 +669,6 @@ TEST(Jitify2Test, PathJoin) {
 #endif
 }
 
-TEST(Jitify2Test, PathSimplify) {
-  EXPECT_EQ(jitify2::detail::path_simplify(""), "");
-  EXPECT_EQ(jitify2::detail::path_simplify("/"), "/");
-  EXPECT_EQ(jitify2::detail::path_simplify("//"), "/");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/bar"), "/foo/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/bar"), "foo/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/./bar"), "/foo/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/./bar"), "foo/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/../bar"), "/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/../bar"), "bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/cat/../../bar"), "/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/cat/../../bar"), "bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("/./bar"), "/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("./bar"), "bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("../bar"), "../bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("../../bar"), "../../bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("../.././bar"), "../../bar");
-  EXPECT_EQ(jitify2::detail::path_simplify(".././../bar"), "../../bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("./../../bar"), "../../bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/bar/.."), "/foo");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/bar/.."), "foo");
-  EXPECT_EQ(jitify2::detail::path_simplify("//foo///..////bar"), "/bar");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/"), "foo/");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/"), "/foo/");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/bar/"), "foo/bar/");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/bar/"), "/foo/bar/");
-  EXPECT_EQ(jitify2::detail::path_simplify("foo/../bar/"), "bar/");
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/../bar/"), "/bar/");
-  EXPECT_EQ(jitify2::detail::path_simplify("/../foo"), "");    // Invalid path
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/../../bar"),  // Invalid path
-            "");
-  EXPECT_EQ(jitify2::detail::path_simplify("/.."), "");         // Invalid path
-  EXPECT_EQ(jitify2::detail::path_simplify("/foo/../.."), "");  // Invalid path
-#if defined _WIN32 || defined _WIN64
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\)"), R"(\)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\\)"), R"(\)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo\bar)"), R"(\foo\bar)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(foo\bar)"), R"(foo\bar)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo\.\bar)"), R"(\foo\bar)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(foo\.\bar)"), R"(foo\bar)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo\..\bar)"), R"(\bar)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(foo\..\bar)"), R"(bar)");
-
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar)"), R"(\foo/bar)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\./cat)"),
-            R"(\foo/bar\cat)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\../cat)"),
-            R"(\foo/cat)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(\foo/.\bar\../cat)",
-                                           /*canonicalize=*/true),
-            R"(/foo/cat)");
-  EXPECT_EQ(jitify2::detail::path_simplify(R"(///foo///.\\\bar\\\..///cat)",
-                                           /*canonicalize=*/true),
-            R"(/foo/cat)");
-#endif
-}
-
 TEST(Jitify2Test, GetNvrtcBuildVersion) {
   EXPECT_NE(jitify2::detail::get_nvrtc_build_version(), -1);
 }
@@ -805,22 +748,43 @@ TEST(Jitify2Test, EncodedQuoteIncludes) {
   // encoded with the cuda include dir.
   static const char* const source = R"(
 #include <cuda_fp16.h>
+#include "example_headers/my_header1.cuh"
 __global__ void my_kernel() {}
 )";
-  auto preprog =
-      Program("my_program", source)
-          ->preprocess({"-Ifoo/bar", "-I/cat/dog", "-I" CUDA_INC_DIR});
+  auto preprog = Program("my_program", source)
+                     ->preprocess({"-I.", "-Iexample_headers", "-Ifoo/bar",
+                                   "-I" CUDA_INC_DIR});
   ASSERT_EQ(get_error(preprog), "");
   auto compiled = preprog->compile();
   ASSERT_EQ(get_error(compiled), "");
-  // Note: The '2' here is the index of the cuda include dir amongst the
-  // "-I" options.
-  ASSERT_TRUE(
+  // Note: The '2' in "I2@" here is the index of the cuda include dir amongst
+  // the "-I" options (excluding invalid paths like "foo/bar").
+  EXPECT_TRUE(
       preprog->header_sources().at("cuda_fp16.h").find("__jitify_I2@") !=
       std::string::npos);
+  std::string cwd = jitify2::detail::get_real_path(".");
+  for (const auto& name_header : preprog->header_sources()) {
+    const std::string& header_name = name_header.first;
+    const std::string& header_source = name_header.second;
+    EXPECT_FALSE(header_name.find(CUDA_INC_DIR) != std::string::npos);
+    EXPECT_FALSE(header_source.find(CUDA_INC_DIR) != std::string::npos);
+    EXPECT_FALSE(header_name.find(cwd) != std::string::npos);
+  }
+  // Repeat without "-I.", which will rely on the implicit current working
+  // directory include path for quote includes.
+  preprog = Program("my_program", source)->preprocess({"-I" CUDA_INC_DIR});
+  ASSERT_EQ(get_error(preprog), "");
+  compiled = preprog->compile();
+  ASSERT_EQ(get_error(compiled), "");
+  EXPECT_TRUE(
+      preprog->header_sources().at("cuda_fp16.h").find("__jitify_I0@") !=
+      std::string::npos);
   for (const auto& name_header : preprog->header_sources()) {
+    const std::string& header_name = name_header.first;
     const std::string& header_source = name_header.second;
-    ASSERT_FALSE(header_source.find(CUDA_INC_DIR) != std::string::npos);
+    EXPECT_FALSE(header_name.find(CUDA_INC_DIR) != std::string::npos);
+    EXPECT_FALSE(header_source.find(CUDA_INC_DIR) != std::string::npos);
+    EXPECT_FALSE(header_name.find(cwd) != std::string::npos);
   }
 }
 
@@ -875,7 +839,7 @@ TEST(Jitify2Test, ExplicitHeaderSources) {
                      {"/foo/bar", bad_header}})
                 ->preprocess({"-I."});
   ASSERT_EQ(get_error(preprog), "");
-  // Finding a header at the root from a quote-include in a subdir requires
+  // Finding a header at "." from a quote-include in a subdir requires
   // explicitly passing the current dir as an include path ("-I.").
   preprog = Program("my_program", R"(#include <foo/quote>)",
                     {{"foo/quote", quote_header},
@@ -926,8 +890,10 @@ __device__ T cube(T x) { return x * x * x; }
   // Test angle-include.
   PreprocessedProgram preprog =
       Program("my_program", source)->preprocess({"-I."});
-  *preprog->get_kernel("my_kernel<int>", {},
-                       {{"example_headers/my_header1.cuh", header}});
+  ASSERT_EQ(get_error(preprog), "");
+  Kernel kernel = preprog->get_kernel(
+      "my_kernel<int>", {}, {{"example_headers/my_header1.cuh", header}});
+  ASSERT_EQ(get_error(kernel), "");
 
   // Test quote-include.
   // Note that this requires the use of jitify2::quote_include_name().
@@ -935,10 +901,12 @@ __device__ T cube(T x) { return x * x * x; }
   // angle-includes, or to use "-include" to add a completely new header.
   preprog = Program("my_program", source)
                 ->preprocess({"-DUSE_QUOTE_INCLUDE", "-I" CUDA_INC_DIR});
-  *preprog->get_kernel(
+  ASSERT_EQ(get_error(preprog), "");
+  kernel = preprog->get_kernel(
       "my_kernel<int>", {},
       {{jitify2::quote_include_name("example_headers/my_header1.cuh"),
         header}});
+  ASSERT_EQ(get_error(kernel), "");
 }
 
 TEST(Jitify2Test, Preincludes) {
@@ -1812,8 +1780,9 @@ TEST(Jitify2Test, BuiltinHeadersAllCompatible) {
   for (const std::string& header_name : get_jitsafe_headers_list()) {
     source += "#include <" + header_name + ">\n";
   }
-  CompiledProgram compiled =
-      jitify2::Program("my_program", source)->preprocess({"-std=c++17"})->compile();
+  CompiledProgram compiled = jitify2::Program("my_program", source)
+                                 ->preprocess({"-std=c++17"})
+                                 ->compile();
   ASSERT_EQ(get_error(compiled), "");
   EXPECT_EQ(compiled->log(), "");
 }
@@ -1821,8 +1790,9 @@ TEST(Jitify2Test, BuiltinHeadersAllCompatible) {
 TEST(Jitify2Test, BuiltinHeadersIndividual) {
   for (const std::string& header_name : get_jitsafe_headers_list()) {
     std::string source = "#include <" + header_name + ">\n";
-    CompiledProgram compiled =
-      jitify2::Program("my_program", source)->preprocess({"-std=c++17"})->compile();
+    CompiledProgram compiled = jitify2::Program("my_program", source)
+                                   ->preprocess({"-std=c++17"})
+                                   ->compile();
     EXPECT_EQ(get_error(compiled), "");
     if (compiled) {
       EXPECT_EQ(compiled->log(), "");
@@ -2114,6 +2084,40 @@ TEST(Jitify2Test, LineNumbers) {
   EXPECT_EQ(compiled->log().substr(0, expected.size()), expected);
 }
 
+TEST(Jitify2Test, IncludeDirectoryOrder) {
+  static const char* const source = R"(
+#include <my_header1.cuh>
+
+__global__ void my_kernel(int* data) {
+  *data = square(*data);
+}
+)";
+  std::unique_ptr<const char, int (*)(const char*)> tmp_header_filename(
+      "my_header1.cuh", std::remove);
+  // Create empty header in current directory with same name as correct header.
+  std::ofstream tmp_header(tmp_header_filename.get());
+
+  // Passing -I. first should cause the empty header to be used, so that
+  // preprocessing fails because square() is undefined.
+  PreprocessedProgram preprog =
+      jitify2::Program("include_dirs_program", source)
+          ->preprocess(
+              {"-I.", "-Iexample_headers", "-no-preinclude-workarounds",
+               "-no-system-headers-workaround", "-arch=sm_80", "-std=c++17"});
+  ASSERT_NE(get_error(preprog), "");
+  EXPECT_TRUE(preprog.error().find("identifier \"square\" is undefined") !=
+              std::string::npos);
+  // Passing -Iexample_headers first should succeed.
+  CompiledProgram compiled =
+      jitify2::Program("include_dirs_program", source)
+          ->preprocess(
+              {"-Iexample_headers", "-I.", "-no-preinclude-workarounds",
+               "-no-system-headers-workaround", "-arch=sm_80", "-std=c++17"})
+          ->compile();
+  ASSERT_EQ(get_error(compiled), "");
+  EXPECT_EQ(compiled->log(), "");
+}
+
 TEST(Jitify2Test, Minify) {
   static const char* const name = "my_program";
   // This source is intentionally tricky to parse so that it stresses the