Add support for compiling CUDA code on both amd64 and the orin
Change-Id: Ic7aca062c454fa1da3480667363e7250ddc2a8fe
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/WORKSPACE b/WORKSPACE
index efeb2a4..e60524a 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1600,3 +1600,23 @@
strip_prefix = "julia-1.8.5",
url = "https://julialang-s3.julialang.org/bin/linux/x64/1.8/julia-1.8.5-linux-x86_64.tar.gz",
)
+
+http_archive(
+ name = "com_github_nvidia_cuco",
+ build_file = "//third_party/cuco:cuco.BUILD",
+ patch_args = ["-p1"],
+ patches = [
+ "//third_party/cuco:template.patch",
+ ],
+ sha256 = "eecc9a111956a195f28ebc4b4fd23ac6991d072f5c1d7c68a59d059e05d7ad78",
+ strip_prefix = "cuCollections-b7514d2010967fdfe4a1d414894bb945bc09fddc",
+ url = "https://github.com/NVIDIA/cuCollections/archive/b7514d2010967fdfe4a1d414894bb945bc09fddc.zip",
+)
+
+http_archive(
+ name = "com_github_nvidia_cccl",
+ build_file = "//third_party/cccl:cccl.BUILD",
+ sha256 = "38160c628a9e32b7cd55553f299768f72b24074cc9c1a993ba40a177877b3421",
+ strip_prefix = "cccl-931dc6793482c61edbc97b7a19256874fd264313",
+ url = "https://github.com/NVIDIA/cccl/archive/931dc6793482c61edbc97b7a19256874fd264313.zip",
+)
diff --git a/third_party/BUILD b/third_party/BUILD
index 7195123..0430621 100644
--- a/third_party/BUILD
+++ b/third_party/BUILD
@@ -117,3 +117,23 @@
"//conditions:default": [":unavailable"],
}),
)
+
+cc_library(
+ name = "cudart",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "//tools:cpu_k8": ["@amd64_debian_sysroot//:cudart"],
+ "//tools:cpu_arm64": ["@arm64_debian_sysroot//:cudart"],
+ "//conditions:default": [":unavailable"],
+ }),
+)
+
+cc_library(
+ name = "nppi",
+ visibility = ["//visibility:public"],
+ deps = select({
+ "//tools:cpu_k8": ["@amd64_debian_sysroot//:nppi"],
+ "//tools:cpu_arm64": ["@arm64_debian_sysroot//:nppi"],
+ "//conditions:default": [":unavailable"],
+ }),
+)
diff --git a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
index 0044dce..2aa6e7e 100755
--- a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
@@ -221,6 +221,15 @@
with_features = [with_feature_set(features = ["fastbuild"])],
),
flag_set(
+ actions = all_compile_actions,
+ with_features = [with_feature_set(features = ["cuda"])],
+ flag_groups = ([
+ flag_group(
+ flags = ctx.attr.cuda_flags,
+ ),
+ ]),
+ ),
+ flag_set(
actions = [ACTION_NAMES.c_compile],
flag_groups = ([
flag_group(
@@ -1131,6 +1140,51 @@
],
)
+ cuda_flags = [
+ "--cuda-gpu-arch=sm_80",
+ "--cuda-gpu-arch=sm_86",
+ "--cuda-gpu-arch=sm_87",
+ "-x",
+ "cuda",
+ ]
+
+ if ctx.attr.cpu == "aarch64":
+ cuda_flags += [
+ "--cuda-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/",
+ "--ptxas-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/bin/ptxas",
+ "-D__CUDACC_VER_MAJOR__=11",
+ "-D__CUDACC_VER_MINOR__=8",
+ ]
+ pass
+ elif ctx.attr.cpu == "k8":
+ cuda_flags += [
+ "--cuda-path=external/amd64_debian_sysroot/usr/lib/cuda/",
+ "--ptxas-path=external/amd64_debian_sysroot/usr/bin/ptxas",
+ "-D__CUDACC_VER_MAJOR__=11",
+ "-D__CUDACC_VER_MINOR__=8",
+ ]
+ else:
+ fail("Unknown cpu", ctx.attr.cpu)
+
+ cuda_feature = feature(
+ name = "cuda",
+ provides = ["cuda"],
+ flag_sets = [
+ flag_set(
+ actions = [
+ ACTION_NAMES.cpp_compile,
+ ACTION_NAMES.cpp_header_parsing,
+ ACTION_NAMES.cpp_module_compile,
+ ],
+ flag_groups = [
+ flag_group(
+ flags = cuda_flags,
+ ),
+ ],
+ ),
+ ],
+ )
+
thinlto_feature = feature(
name = "thin_lto",
flag_sets = [
@@ -1210,6 +1264,7 @@
cs_fdo_instrument_feature,
cs_fdo_optimize_feature,
thinlto_feature,
+ cuda_feature,
fdo_prefetch_hints_feature,
autofdo_feature,
build_interface_libraries_feature,
@@ -1312,6 +1367,7 @@
"coverage_link_flags": attr.string_list(),
"supports_start_end_lib": attr.bool(),
"builtin_sysroot": attr.string(),
+ "cuda_flags": attr.string_list(),
},
provides = [CcToolchainConfigInfo],
)
diff --git a/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl b/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
index b167f6e..88fd82d 100644
--- a/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/toolchain/cc_toolchain_config.bzl
@@ -406,6 +406,7 @@
opt_compile_flags.extend(opt_copts)
fastbuild_compile_flags.extend(fastbuild_copts)
link_flags.extend(linkopts)
+ cuda_flags = ["-isystem", target_toolchain_path_prefix + "lib/clang/" + llvm_subfolder + "/include/cuda_wrappers"]
# Source: https://cs.opensource.google/bazel/bazel/+/master:tools/cpp/unix_cc_toolchain_config.bzl
unix_cc_toolchain_config(
@@ -435,4 +436,5 @@
coverage_link_flags = coverage_link_flags,
supports_start_end_lib = supports_start_end_lib,
builtin_sysroot = sysroot_path,
+ cuda_flags = cuda_flags,
)
diff --git a/third_party/cccl/BUILD b/third_party/cccl/BUILD
new file mode 100644
index 0000000..9a3294f
--- /dev/null
+++ b/third_party/cccl/BUILD
@@ -0,0 +1 @@
+exports_files(["cccl.BUILD"])
diff --git a/third_party/cccl/cccl.BUILD b/third_party/cccl/cccl.BUILD
new file mode 100644
index 0000000..6e15929
--- /dev/null
+++ b/third_party/cccl/cccl.BUILD
@@ -0,0 +1,19 @@
+cc_library(
+ name = "cccl",
+ hdrs = glob(include = [
+ "thrust/thrust/**",
+ "libcudacxx/include/**",
+ "cub/cub/**",
+ ]),
+ features = ["cuda"],
+ includes = [
+ "cub",
+ "libcudacxx/include",
+ "thrust",
+ ],
+ target_compatible_with = [
+ "@//tools/platforms/gpu:nvidia",
+ "@platforms//os:linux",
+ ],
+ visibility = ["//visibility:public"],
+)
diff --git a/third_party/cuco/BUILD b/third_party/cuco/BUILD
new file mode 100644
index 0000000..5b878ee
--- /dev/null
+++ b/third_party/cuco/BUILD
@@ -0,0 +1 @@
+exports_files(["cuco.BUILD", "template.patch"])
diff --git a/third_party/cuco/cuco.BUILD b/third_party/cuco/cuco.BUILD
new file mode 100644
index 0000000..f88e569
--- /dev/null
+++ b/third_party/cuco/cuco.BUILD
@@ -0,0 +1,15 @@
+cc_library(
+ name = "cuco",
+ hdrs = glob(include = ["include/**"]),
+ defines = [
+ "__CUDACC_RELAXED_CONSTEXPR__",
+ "__CUDACC_EXTENDED_LAMBDA__",
+ ],
+ features = ["cuda"],
+ includes = ["include"],
+ target_compatible_with = [
+ "@//tools/platforms/gpu:nvidia",
+ "@platforms//os:linux",
+ ],
+ visibility = ["//visibility:public"],
+)
diff --git a/third_party/cuco/template.patch b/third_party/cuco/template.patch
new file mode 100644
index 0000000..488eb9e
--- /dev/null
+++ b/third_party/cuco/template.patch
@@ -0,0 +1,138 @@
+diff --git a/include/cuco/detail/pair/pair.inl b/include/cuco/detail/pair/pair.inl
+index 3279a91..44be993 100644
+--- a/include/cuco/detail/pair/pair.inl
++++ b/include/cuco/detail/pair/pair.inl
+@@ -51,7 +51,53 @@ __host__ __device__ constexpr bool operator==(cuco::pair<T1, T2> const& lhs,
+ } // namespace cuco
+
+ namespace thrust {
+-#include <cuco/detail/pair/tuple_helpers.inl>
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2>& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return p.first;
++ } else {
++ return p.second;
++ }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2>&& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type&&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return std::move(p.first);
++ } else {
++ return std::move(p.second);
++ }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2> const& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type const&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return p.first;
++ } else {
++ return p.second;
++ }
++}
++
++template <std::size_t I, typename T1, typename T2>
++__host__ __device__ constexpr auto get(cuco::pair<T1, T2> const&& p) ->
++ typename tuple_element<I, cuco::pair<T1, T2>>::type const&&
++{
++ static_assert(I < 2);
++ if constexpr (I == 0) {
++ return std::move(p.first);
++ } else {
++ return std::move(p.second);
++ }
++}
+ } // namespace thrust
+
+ namespace cuda::std {
+diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
+index 48799a7..336f6de 100644
+--- a/include/cuco/detail/static_map.inl
++++ b/include/cuco/detail/static_map.inl
+@@ -443,7 +443,7 @@ __device__
+ "insert_and_find is not supported for unpackable data on pre-Volta GPUs.");
+ #endif
+
+- auto current_slot{initial_slot(insert_pair.first, hash)};
++ auto current_slot{this->initial_slot(insert_pair.first, hash)};
+
+ while (true) {
+ key_type const existing_key = current_slot->first.load(cuda::std::memory_order_relaxed);
+@@ -514,7 +514,7 @@ __device__
+
+ // if we couldn't insert the key, but it wasn't a duplicate, then there must
+ // have been some other key there, so we keep looking for a slot
+- current_slot = next_slot(current_slot);
++ current_slot = this->next_slot(current_slot);
+ }
+ }
+
+diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
+index f27f21e..e90948f 100644
+--- a/include/cuco/detail/static_map/static_map_ref.inl
++++ b/include/cuco/detail/static_map/static_map_ref.inl
+@@ -141,21 +141,6 @@ static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>
+ return impl_.empty_value_sentinel();
+ }
+
+-template <typename Key,
+- typename T,
+- cuda::thread_scope Scope,
+- typename KeyEqual,
+- typename ProbingScheme,
+- typename StorageRef,
+- typename... Operators>
+-template <typename... NewOperators>
+-auto static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::with(
+- NewOperators...) && noexcept
+-{
+- return static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, NewOperators...>(
+- std::move(*this));
+-}
+-
+ namespace detail {
+
+ template <typename Key,
+diff --git a/include/cuco/pair.cuh b/include/cuco/pair.cuh
+index d28cae5..1caaa24 100644
+--- a/include/cuco/pair.cuh
++++ b/include/cuco/pair.cuh
+@@ -87,8 +87,7 @@ struct alignas(detail::pair_alignment<First, Second>()) pair {
+ */
+ template <typename T, std::enable_if_t<detail::is_std_pair_like<T>::value>* = nullptr>
+ __host__ __device__ constexpr pair(T const& p)
+- : pair{cuda::std::get<0>(thrust::raw_reference_cast(p)),
+- cuda::std::get<1>(thrust::raw_reference_cast(p))}
++ : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))}
+ {
+ }
+
+diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
+index 88e40f8..7cf1d74 100644
+--- a/include/cuco/static_map_ref.cuh
++++ b/include/cuco/static_map_ref.cuh
+@@ -174,7 +174,11 @@ class static_map_ref
+ * @return `*this` with `NewOperators...`
+ */
+ template <typename... NewOperators>
+- [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept;
++ [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept
++ {
++ return static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, NewOperators...>(
++ std::move(*this));
++ }
+
+ private:
+ impl_type impl_; ///< Static map ref implementation
diff --git a/tools/platforms/BUILD b/tools/platforms/BUILD
index f485894..599f926 100644
--- a/tools/platforms/BUILD
+++ b/tools/platforms/BUILD
@@ -8,6 +8,7 @@
"//tools/platforms/go:has_support",
"//tools/platforms/rust:has_support",
"//tools/platforms/nodejs:has_support",
+ "//tools/platforms/gpu:nvidia",
],
)
@@ -20,6 +21,7 @@
"//tools/platforms/go:lacks_support",
"//tools/platforms/rust:has_support",
"//tools/platforms/nodejs:lacks_support",
+ "//tools/platforms/gpu:nvidia",
],
)
@@ -37,6 +39,7 @@
# handle that, need to figure out how to do that here or switch linkers.
"//tools/platforms/rust:lacks_support",
"//tools/platforms/nodejs:lacks_support",
+ "//tools/platforms/gpu:none",
],
)
@@ -48,6 +51,7 @@
"//tools/platforms/go:lacks_support",
"//tools/platforms/rust:lacks_support",
"//tools/platforms/nodejs:lacks_support",
+ "//tools/platforms/gpu:none",
],
)
@@ -59,6 +63,7 @@
"//tools/platforms/go:lacks_support",
"//tools/platforms/rust:lacks_support",
"//tools/platforms/nodejs:lacks_support",
+ "//tools/platforms/gpu:none",
],
)
diff --git a/tools/platforms/gpu/BUILD b/tools/platforms/gpu/BUILD
new file mode 100644
index 0000000..9686148
--- /dev/null
+++ b/tools/platforms/gpu/BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+constraint_setting(name = "gpu")
+
+constraint_value(
+ name = "nvidia",
+ constraint_setting = ":gpu",
+)
+
+constraint_value(
+ name = "none",
+ constraint_setting = ":gpu",
+)