Add support for compiling CUDA code on both amd64 and the orin
Change-Id: Ic7aca062c454fa1da3480667363e7250ddc2a8fe
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/third_party/BUILD b/third_party/BUILD
index 7195123..0430621 100644
--- a/third_party/BUILD
+++ b/third_party/BUILD
@@ -117,3 +117,23 @@
"//conditions:default": [":unavailable"],
}),
)
+
+cc_library(
+ name = "cudart",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "//tools:cpu_k8": ["@amd64_debian_sysroot//:cudart"],
+ "//tools:cpu_arm64": ["@arm64_debian_sysroot//:cudart"],
+ "//conditions:default": [":unavailable"],
+ }),
+)
+
+cc_library(
+ name = "nppi",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "//tools:cpu_k8": ["@amd64_debian_sysroot//:nppi"],
+ "//tools:cpu_arm64": ["@arm64_debian_sysroot//:nppi"],
+ "//conditions:default": [":unavailable"],
+ }),
+)
diff --git a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
index 0044dce..2aa6e7e 100755
--- a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
@@ -221,6 +221,15 @@
with_features = [with_feature_set(features = ["fastbuild"])],
),
flag_set(
+ actions = all_compile_actions,
+ with_features = [with_feature_set(features = ["cuda"])],
+ flag_groups = ([
+ flag_group(
+ flags = ctx.attr.cuda_flags,
+ ),
+ ]),
+ ),
+ flag_set(
actions = [ACTION_NAMES.c_compile],
flag_groups = ([
flag_group(
@@ -1131,6 +1140,51 @@
],
)
+ cuda_flags = [
+ "--cuda-gpu-arch=sm_80",
+ "--cuda-gpu-arch=sm_86",
+ "--cuda-gpu-arch=sm_87",
+ "-x",
+ "cuda",
+ ]
+
+ if ctx.attr.cpu == "aarch64":
+ cuda_flags += [
+ "--cuda-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/",
+ "--ptxas-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/bin/ptxas",
+ "-D__CUDACC_VER_MAJOR__=11",
+ "-D__CUDACC_VER_MINOR__=8",
+ ]
+ pass
+ elif ctx.attr.cpu == "k8":
+ cuda_flags += [
+ "--cuda-path=external/amd64_debian_sysroot/usr/lib/cuda/",
+ "--ptxas-path=external/amd64_debian_sysroot/usr/bin/ptxas",
+ "-D__CUDACC_VER_MAJOR__=11",
+ "-D__CUDACC_VER_MINOR__=8",
+ ]
+ else:
+ fail("Unknown cpu", ctx.attr.cpu)
+
+ cuda_feature = feature(
+ name = "cuda",
+ provides = ["cuda"],
+ flag_sets = [
+ flag_set(
+ actions = [
+ ACTION_NAMES.cpp_compile,
+ ACTION_NAMES.cpp_header_parsing,
+ ACTION_NAMES.cpp_module_compile,
+ ],
+ flag_groups = [
+ flag_group(
+ flags = cuda_flags,
+ ),
+ ],
+ ),
+ ],
+ )
+
thinlto_feature = feature(
name = "thin_lto",
flag_sets = [
@@ -1210,6 +1264,7 @@
cs_fdo_instrument_feature,
cs_fdo_optimize_feature,
thinlto_feature,
+ cuda_feature,
fdo_prefetch_hints_feature,
autofdo_feature,
build_interface_libraries_feature,
@@ -1312,6 +1367,7 @@
"coverage_link_flags": attr.string_list(),
"supports_start_end_lib": attr.bool(),
"builtin_sysroot": attr.string(),
+ "cuda_flags": attr.string_list(),
},
provides = [CcToolchainConfigInfo],
)
diff --git a/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl b/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
index b167f6e..88fd82d 100644
--- a/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
@@ -406,6 +406,7 @@
opt_compile_flags.extend(opt_copts)
fastbuild_compile_flags.extend(fastbuild_copts)
link_flags.extend(linkopts)
+ cuda_flags = ["-isystem", target_toolchain_path_prefix + "lib/clang/" + llvm_subfolder + "/include/cuda_wrappers"]
# Source: https://cs.opensource.google/bazel/bazel/+/master:tools/cpp/unix_cc_toolchain_config.bzl
unix_cc_toolchain_config(
@@ -435,4 +436,5 @@
coverage_link_flags = coverage_link_flags,
supports_start_end_lib = supports_start_end_lib,
builtin_sysroot = sysroot_path,
+ cuda_flags = cuda_flags,
)
diff --git a/third_party/cccl/BUILD b/third_party/cccl/BUILD
new file mode 100644
index 0000000..9a3294f
--- /dev/null
+++ b/third_party/cccl/BUILD
@@ -0,0 +1 @@
+exports_files(["cccl.BUILD"])
diff --git a/third_party/cccl/cccl.BUILD b/third_party/cccl/cccl.BUILD
new file mode 100644
index 0000000..6e15929
--- /dev/null
+++ b/third_party/cccl/cccl.BUILD
@@ -0,0 +1,19 @@
+cc_library(
+ name = "cccl",
+ hdrs = glob(include = [
+ "thrust/thrust/**",
+ "libcudacxx/include/**",
+ "cub/cub/**",
+ ]),
+ features = ["cuda"],
+ includes = [
+ "cub",
+ "libcudacxx/include",
+ "thrust",
+ ],
+ target_compatible_with = [
+ "@//tools/platforms/gpu:nvidia",
+ "@platforms//os:linux",
+ ],
+ visibility = ["//visibility:public"],
+)
diff --git a/third_party/cuco/BUILD b/third_party/cuco/BUILD
new file mode 100644
index 0000000..5b878ee
--- /dev/null
+++ b/third_party/cuco/BUILD
@@ -0,0 +1 @@
+exports_files(["cuco.BUILD", "template.patch"])
diff --git a/third_party/cuco/cuco.BUILD b/third_party/cuco/cuco.BUILD
new file mode 100644
index 0000000..f88e569
--- /dev/null
+++ b/third_party/cuco/cuco.BUILD
@@ -0,0 +1,15 @@
+cc_library(
+ name = "cuco",
+ hdrs = glob(include = ["include/**"]),
+ defines = [
+ "__CUDACC_RELAXED_CONSTEXPR__",
+ "__CUDACC_EXTENDED_LAMBDA__",
+ ],
+ features = ["cuda"],
+ includes = ["include"],
+ target_compatible_with = [
+ "@//tools/platforms/gpu:nvidia",
+ "@platforms//os:linux",
+ ],
+ visibility = ["//visibility:public"],
+)
diff --git a/third_party/cuco/template.patch b/third_party/cuco/template.patch
new file mode 100644
index 0000000..488eb9e
--- /dev/null
+++ b/third_party/cuco/template.patch
@@ -0,0 +1,138 @@
+diff --git a/include/cuco/detail/pair/pair.inl b/include/cuco/detail/pair/pair.inl
+index 3279a91..44be993 100644
+--- a/include/cuco/detail/pair/pair.inl
++++ b/include/cuco/detail/pair/pair.inl
+@@ -51,7 +51,53 @@ __host__ __device__ constexpr bool operator==(cuco::pair<T1, T2> const& lhs,
+ } // namespace cuco
+
+ namespace thrust {
+-#include <cuco/detail/pair/tuple_helpers.inl>
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2>& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return p.first;
++ } else {
++ return p.second;
++ }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2>&& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type&&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return std::move(p.first);
++ } else {
++ return std::move(p.second);
++ }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2> const& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type const&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return p.first;
++ } else {
++ return p.second;
++ }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2> const&& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type const&&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return std::move(p.first);
++ } else {
++ return std::move(p.second);
++ }
++}
+ } // namespace thrust
+
+ namespace cuda::std {
+diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
+index 48799a7..336f6de 100644
+--- a/include/cuco/detail/static_map.inl
++++ b/include/cuco/detail/static_map.inl
+@@ -443,7 +443,7 @@ __device__
+ "insert_and_find is not supported for unpackable data on pre-Volta GPUs.");
+ #endif
+
+- auto current_slot{initial_slot(insert_pair.first, hash)};
++ auto current_slot{this->initial_slot(insert_pair.first, hash)};
+
+ while (true) {
+ key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
+@@ -514,7 +514,7 @@ __device__
+
+ // if we couldn't insert the key, but it wasn't a duplicate, then there must
+ // have been some other key there, so we keep looking for a slot
+- current_slot = next_slot(current_slot);
++ current_slot = this->next_slot(current_slot);
+ }
+ }
+
+diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
+index f27f21e..e90948f 100644
+--- a/include/cuco/detail/static_map/static_map_ref.inl
++++ b/include/cuco/detail/static_map/static_map_ref.inl
+@@ -141,21 +141,6 @@ static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>
+ return impl_.empty_value_sentinel();
+ }
+
+-template <typename Key,
+- typename T,
+- cuda::thread_scope Scope,
+- typename KeyEqual,
+- typename ProbingScheme,
+- typename StorageRef,
+- typename... Operators>
+-template <typename... NewOperators>
+-auto static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::with(
+- NewOperators...) && noexcept
+-{
+- return static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, NewOperators...>(
+- std::move(*this));
+-}
+-
+ namespace detail {
+
+ template <typename Key,
+diff --git a/include/cuco/pair.cuh b/include/cuco/pair.cuh
+index d28cae5..1caaa24 100644
+--- a/include/cuco/pair.cuh
++++ b/include/cuco/pair.cuh
+@@ -87,8 +87,7 @@ struct alignas(detail::pair_alignment<First, Second>()) pair {
+ */
+ template <typename T, std::enable_if_t<detail::is_std_pair_like<T>::value>* = nullptr>
+ __host__ __device__ constexpr pair(T const& p)
+- : pair{cuda::std::get<0>(thrust::raw_reference_cast(p)),
+- cuda::std::get<1>(thrust::raw_reference_cast(p))}
++ : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))}
+ {
+ }
+
+diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
+index 88e40f8..7cf1d74 100644
+--- a/include/cuco/static_map_ref.cuh
++++ b/include/cuco/static_map_ref.cuh
+@@ -174,7 +174,11 @@ class static_map_ref
+ * @return `*this` with `NewOperators...`
+ */
+ template <typename... NewOperators>
+- [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept;
++ [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept
++ {
++ return static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, NewOperators...>(
++ std::move(*this));
++ }
+
+ private:
+ impl_type impl_; ///< Static map ref implementation