Add support for compiling CUDA code on both amd64 and the orin

Change-Id: Ic7aca062c454fa1da3480667363e7250ddc2a8fe
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/WORKSPACE b/WORKSPACE
index efeb2a4..e60524a 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1600,3 +1600,23 @@
     strip_prefix = "julia-1.8.5",
     url = "https://julialang-s3.julialang.org/bin/linux/x64/1.8/julia-1.8.5-linux-x86_64.tar.gz",
 )
+
+http_archive(
+    name = "com_github_nvidia_cuco",
+    build_file = "//third_party/cuco:cuco.BUILD",
+    patch_args = ["-p1"],
+    patches = [
+        "//third_party/cuco:template.patch",
+    ],
+    sha256 = "eecc9a111956a195f28ebc4b4fd23ac6991d072f5c1d7c68a59d059e05d7ad78",
+    strip_prefix = "cuCollections-b7514d2010967fdfe4a1d414894bb945bc09fddc",
+    url = "https://github.com/NVIDIA/cuCollections/archive/b7514d2010967fdfe4a1d414894bb945bc09fddc.zip",
+)
+
+http_archive(
+    name = "com_github_nvidia_cccl",
+    build_file = "//third_party/cccl:cccl.BUILD",
+    sha256 = "38160c628a9e32b7cd55553f299768f72b24074cc9c1a993ba40a177877b3421",
+    strip_prefix = "cccl-931dc6793482c61edbc97b7a19256874fd264313",
+    url = "https://github.com/NVIDIA/cccl/archive/931dc6793482c61edbc97b7a19256874fd264313.zip",
+)
diff --git a/third_party/BUILD b/third_party/BUILD
index 7195123..0430621 100644
--- a/third_party/BUILD
+++ b/third_party/BUILD
@@ -117,3 +117,23 @@
         "//conditions:default": [":unavailable"],
     }),
 )
+
+cc_library(
+    name = "cudart",
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tools:cpu_k8": ["@amd64_debian_sysroot//:cudart"],
+        "//tools:cpu_arm64": ["@arm64_debian_sysroot//:cudart"],
+        "//conditions:default": [":unavailable"],
+    }),
+)
+
+cc_library(
+    name = "nppi",
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tools:cpu_k8": ["@amd64_debian_sysroot//:nppi"],
+        "//tools:cpu_arm64": ["@arm64_debian_sysroot//:nppi"],
+        "//conditions:default": [":unavailable"],
+    }),
+)
diff --git a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
index 0044dce..2aa6e7e 100755
--- a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
@@ -221,6 +221,15 @@
                 with_features = [with_feature_set(features = ["fastbuild"])],
             ),
             flag_set(
+                actions = all_compile_actions,
+                with_features = [with_feature_set(features = ["cuda"])],
+                flag_groups = ([
+                    flag_group(
+                        flags = ctx.attr.cuda_flags,
+                    ),
+                ]),
+            ),
+            flag_set(
                 actions = [ACTION_NAMES.c_compile],
                 flag_groups = ([
                     flag_group(
@@ -1131,6 +1140,51 @@
         ],
     )
 
+    cuda_flags = [
+        "--cuda-gpu-arch=sm_80",
+        "--cuda-gpu-arch=sm_86",
+        "--cuda-gpu-arch=sm_87",
+        "-x",
+        "cuda",
+    ]
+
+    if ctx.attr.cpu == "aarch64":
+        cuda_flags += [
+            "--cuda-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/",
+            "--ptxas-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/bin/ptxas",
+            "-D__CUDACC_VER_MAJOR__=11",
+            "-D__CUDACC_VER_MINOR__=8",
+        ]
+        pass
+    elif ctx.attr.cpu == "k8":
+        cuda_flags += [
+            "--cuda-path=external/amd64_debian_sysroot/usr/lib/cuda/",
+            "--ptxas-path=external/amd64_debian_sysroot/usr/bin/ptxas",
+            "-D__CUDACC_VER_MAJOR__=11",
+            "-D__CUDACC_VER_MINOR__=8",
+        ]
+    else:
+        fail("Unknown cpu", ctx.attr.cpu)
+
+    cuda_feature = feature(
+        name = "cuda",
+        provides = ["cuda"],
+        flag_sets = [
+            flag_set(
+                actions = [
+                    ACTION_NAMES.cpp_compile,
+                    ACTION_NAMES.cpp_header_parsing,
+                    ACTION_NAMES.cpp_module_compile,
+                ],
+                flag_groups = [
+                    flag_group(
+                        flags = cuda_flags,
+                    ),
+                ],
+            ),
+        ],
+    )
+
     thinlto_feature = feature(
         name = "thin_lto",
         flag_sets = [
@@ -1210,6 +1264,7 @@
             cs_fdo_instrument_feature,
             cs_fdo_optimize_feature,
             thinlto_feature,
+            cuda_feature,
             fdo_prefetch_hints_feature,
             autofdo_feature,
             build_interface_libraries_feature,
@@ -1312,6 +1367,7 @@
         "coverage_link_flags": attr.string_list(),
         "supports_start_end_lib": attr.bool(),
         "builtin_sysroot": attr.string(),
+        "cuda_flags": attr.string_list(),
     },
     provides = [CcToolchainConfigInfo],
 )
diff --git a/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl b/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
index b167f6e..88fd82d 100644
--- a/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
@@ -406,6 +406,7 @@
     opt_compile_flags.extend(opt_copts)
     fastbuild_compile_flags.extend(fastbuild_copts)
     link_flags.extend(linkopts)
+    cuda_flags = ["-isystem", target_toolchain_path_prefix + "lib/clang/" + llvm_subfolder + "/include/cuda_wrappers"]
 
     # Source: https://cs.opensource.google/bazel/bazel/+/master:tools/cpp/unix_cc_toolchain_config.bzl
     unix_cc_toolchain_config(
@@ -435,4 +436,5 @@
         coverage_link_flags = coverage_link_flags,
         supports_start_end_lib = supports_start_end_lib,
         builtin_sysroot = sysroot_path,
+        cuda_flags = cuda_flags,
     )
diff --git a/third_party/cccl/BUILD b/third_party/cccl/BUILD
new file mode 100644
index 0000000..9a3294f
--- /dev/null
+++ b/third_party/cccl/BUILD
@@ -0,0 +1 @@
+exports_files(["cccl.BUILD"])
diff --git a/third_party/cccl/cccl.BUILD b/third_party/cccl/cccl.BUILD
new file mode 100644
index 0000000..6e15929
--- /dev/null
+++ b/third_party/cccl/cccl.BUILD
@@ -0,0 +1,19 @@
+cc_library(
+    name = "cccl",
+    hdrs = glob(include = [
+        "thrust/thrust/**",
+        "libcudacxx/include/**",
+        "cub/cub/**",
+    ]),
+    features = ["cuda"],
+    includes = [
+        "cub",
+        "libcudacxx/include",
+        "thrust",
+    ],
+    target_compatible_with = [
+        "@//tools/platforms/gpu:nvidia",
+        "@platforms//os:linux",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/cuco/BUILD b/third_party/cuco/BUILD
new file mode 100644
index 0000000..5b878ee
--- /dev/null
+++ b/third_party/cuco/BUILD
@@ -0,0 +1 @@
+exports_files(["cuco.BUILD", "template.patch"])
diff --git a/third_party/cuco/cuco.BUILD b/third_party/cuco/cuco.BUILD
new file mode 100644
index 0000000..f88e569
--- /dev/null
+++ b/third_party/cuco/cuco.BUILD
@@ -0,0 +1,15 @@
+cc_library(
+    name = "cuco",
+    hdrs = glob(include = ["include/**"]),
+    defines = [
+        "__CUDACC_RELAXED_CONSTEXPR__",
+        "__CUDACC_EXTENDED_LAMBDA__",
+    ],
+    features = ["cuda"],
+    includes = ["include"],
+    target_compatible_with = [
+        "@//tools/platforms/gpu:nvidia",
+        "@platforms//os:linux",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/cuco/template.patch b/third_party/cuco/template.patch
new file mode 100644
index 0000000..488eb9e
--- /dev/null
+++ b/third_party/cuco/template.patch
@@ -0,0 +1,138 @@
+diff --git a/include/cuco/detail/pair/pair.inl b/include/cuco/detail/pair/pair.inl
+index 3279a91..44be993 100644
+--- a/include/cuco/detail/pair/pair.inl
++++ b/include/cuco/detail/pair/pair.inl
+@@ -51,7 +51,53 @@ __host__ __device__ constexpr bool operator==(cuco::pair<T1, T2> const& lhs,
+ }  // namespace cuco
+ 
+ namespace thrust {
+-#include <cuco/detail/pair/tuple_helpers.inl>
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2>& p) ->
++  typename tuple_element<I, cuco::pair<T1, T2>>::type&
++{
++  static_assert(I < 2);
++  if constexpr (I == 0) {
++    return p.first;
++  } else {
++    return p.second;
++  }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2>&& p) ->
++  typename tuple_element<I, cuco::pair<T1, T2>>::type&&
++{
++  static_assert(I < 2);
++  if constexpr (I == 0) {
++    return std::move(p.first);
++  } else {
++    return std::move(p.second);
++  }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2> const& p) ->
++  typename tuple_element<I, cuco::pair<T1, T2>>::type const&
++{
++  static_assert(I < 2);
++  if constexpr (I == 0) {
++    return p.first;
++  } else {
++    return p.second;
++  }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2> const&& p) ->
++  typename tuple_element<I, cuco::pair<T1, T2>>::type const&&
++{
++  static_assert(I < 2);
++  if constexpr (I == 0) {
++    return std::move(p.first);
++  } else {
++    return std::move(p.second);
++  }
++}
+ }  // namespace thrust
+ 
+ namespace cuda::std {
+diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
+index 48799a7..336f6de 100644
+--- a/include/cuco/detail/static_map.inl
++++ b/include/cuco/detail/static_map.inl
+@@ -443,7 +443,7 @@ __device__
+                 "insert_and_find is not supported for unpackable data on pre-Volta GPUs.");
+ #endif
+ 
+-  auto current_slot{initial_slot(insert_pair.first, hash)};
++  auto current_slot{this->initial_slot(insert_pair.first, hash)};
+ 
+   while (true) {
+     key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
+@@ -514,7 +514,7 @@ __device__
+ 
+     // if we couldn't insert the key, but it wasn't a duplicate, then there must
+     // have been some other key there, so we keep looking for a slot
+-    current_slot = next_slot(current_slot);
++    current_slot = this->next_slot(current_slot);
+   }
+ }
+ 
+diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
+index f27f21e..e90948f 100644
+--- a/include/cuco/detail/static_map/static_map_ref.inl
++++ b/include/cuco/detail/static_map/static_map_ref.inl
+@@ -141,21 +141,6 @@ static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>
+   return impl_.empty_value_sentinel();
+ }
+ 
+-template <typename Key,
+-          typename T,
+-          cuda::thread_scope Scope,
+-          typename KeyEqual,
+-          typename ProbingScheme,
+-          typename StorageRef,
+-          typename... Operators>
+-template <typename... NewOperators>
+-auto static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::with(
+-  NewOperators...) && noexcept
+-{
+-  return static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, NewOperators...>(
+-    std::move(*this));
+-}
+-
+ namespace detail {
+ 
+ template <typename Key,
+diff --git a/include/cuco/pair.cuh b/include/cuco/pair.cuh
+index d28cae5..1caaa24 100644
+--- a/include/cuco/pair.cuh
++++ b/include/cuco/pair.cuh
+@@ -87,8 +87,7 @@ struct alignas(detail::pair_alignment<First, Second>()) pair {
+    */
+   template <typename T, std::enable_if_t<detail::is_std_pair_like<T>::value>* = nullptr>
+   __host__ __device__ constexpr pair(T const& p)
+-    : pair{cuda::std::get<0>(thrust::raw_reference_cast(p)),
+-           cuda::std::get<1>(thrust::raw_reference_cast(p))}
++    : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))}
+   {
+   }
+ 
+diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
+index 88e40f8..7cf1d74 100644
+--- a/include/cuco/static_map_ref.cuh
++++ b/include/cuco/static_map_ref.cuh
+@@ -174,7 +174,11 @@ class static_map_ref
+    * @return `*this` with `NewOperators...`
+    */
+   template <typename... NewOperators>
+-  [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept;
++  [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept
++  {
++    return static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, NewOperators...>(
++      std::move(*this));
++  }
+ 
+  private:
+   impl_type impl_;  ///< Static map ref implementation
diff --git a/tools/platforms/BUILD b/tools/platforms/BUILD
index f485894..599f926 100644
--- a/tools/platforms/BUILD
+++ b/tools/platforms/BUILD
@@ -8,6 +8,7 @@
         "//tools/platforms/go:has_support",
         "//tools/platforms/rust:has_support",
         "//tools/platforms/nodejs:has_support",
+        "//tools/platforms/gpu:nvidia",
     ],
 )
 
@@ -20,6 +21,7 @@
         "//tools/platforms/go:lacks_support",
         "//tools/platforms/rust:has_support",
         "//tools/platforms/nodejs:lacks_support",
+        "//tools/platforms/gpu:nvidia",
     ],
 )
 
@@ -37,6 +39,7 @@
         # handle that, need to figure out how to do that here or switch linkers.
         "//tools/platforms/rust:lacks_support",
         "//tools/platforms/nodejs:lacks_support",
+        "//tools/platforms/gpu:none",
     ],
 )
 
@@ -48,6 +51,7 @@
         "//tools/platforms/go:lacks_support",
         "//tools/platforms/rust:lacks_support",
         "//tools/platforms/nodejs:lacks_support",
+        "//tools/platforms/gpu:none",
     ],
 )
 
@@ -59,6 +63,7 @@
         "//tools/platforms/go:lacks_support",
         "//tools/platforms/rust:lacks_support",
         "//tools/platforms/nodejs:lacks_support",
+        "//tools/platforms/gpu:none",
     ],
 )
 
diff --git a/tools/platforms/gpu/BUILD b/tools/platforms/gpu/BUILD
new file mode 100644
index 0000000..9686148
--- /dev/null
+++ b/tools/platforms/gpu/BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+constraint_setting(name = "gpu")
+
+constraint_value(
+    name = "nvidia",
+    constraint_setting = ":gpu",
+)
+
+constraint_value(
+    name = "none",
+    constraint_setting = ":gpu",
+)