| #ifndef FRC971_ORIN_CUDA_H_ |
| #define FRC971_ORIN_CUDA_H_ |
| |
| #include <chrono>
| #include <cstdint>
| #include <cstring>
| #include <span>
| #include <string_view>
| #include <vector>
| |
| #include "absl/log/check.h" |
| #include "absl/log/log.h" |
| |
| #include "cuda_runtime.h" |
| #include "device_launch_parameters.h" |
| |
| // CHECKs that a CUDA method returned success.
| // TODO(austin): This will not handle if and else statements quite right, fix if |
| // we care. |
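| //
| // Example usage (a minimal sketch; the allocation size is arbitrary):
| //
| //   void *ptr;
| //   CHECK_CUDA(cudaMalloc(&ptr, 1024));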
| #define CHECK_CUDA(condition) \ |
| if (auto c = condition) \ |
| LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \ |
| << ") " |
| |
| namespace frc971::apriltag { |
| |
| // Class to manage the lifetime of a Cuda stream. This is used to provide |
| // relative ordering between kernels on the same stream. |
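| //
| // Example usage (a minimal sketch; `MyKernel`, `blocks`, `threads`, and
| // `args` are hypothetical):
| //
| //   CudaStream stream;
| //   MyKernel<<<blocks, threads, 0, stream.get()>>>(args...);
| //   CHECK_CUDA(cudaStreamSynchronize(stream.get()));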
| class CudaStream { |
| public: |
| CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); } |
| |
| CudaStream(const CudaStream &) = delete; |
| CudaStream &operator=(const CudaStream &) = delete; |
| |
| virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); } |
| |
| // Returns the stream. |
| cudaStream_t get() { return stream_; } |
| |
| private: |
| cudaStream_t stream_; |
| }; |
| |
| // Class to manage the lifetime of a Cuda event. Events are used to timestamp
| // points in a stream's execution so work on that stream can be timed.
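| //
| // Example usage (a minimal sketch; `MyKernel` and its launch configuration
| // are hypothetical):
| //
| //   CudaStream stream;
| //   CudaEvent start, stop;
| //   start.Record(&stream);
| //   MyKernel<<<blocks, threads, 0, stream.get()>>>(args...);
| //   stop.Record(&stream);
| //   stop.Synchronize();
| //   LOG(INFO) << "Kernel took " << stop.ElapsedTime(start).count() << "ns";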
| class CudaEvent { |
| public: |
| CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); } |
| |
| CudaEvent(const CudaEvent &) = delete; |
| CudaEvent &operator=(const CudaEvent &) = delete; |
| |
| virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); } |
| |
| // Queues up an event to be timestamped on the stream when it is executed. |
| void Record(CudaStream *stream) { |
| CHECK_CUDA(cudaEventRecord(event_, stream->get())); |
| } |
| |
| // Returns the time elapsed between start and this event if it has been |
| // triggered. |
| std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) { |
| float ms; |
| CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_)); |
| return std::chrono::duration_cast<std::chrono::nanoseconds>( |
| std::chrono::duration<float, std::milli>(ms)); |
| } |
| |
| // Waits until the event has been triggered. |
| void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); } |
| |
| private: |
| cudaEvent_t event_; |
| }; |
| |
| // Class to manage the lifetime of page-locked (pinned) host memory, which
| // enables fast copies between the host and the device.
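| //
| // Example usage (a minimal sketch; the element count and fill value are
| // arbitrary):
| //
| //   HostMemory<uint8_t> pinned(1024);
| //   std::vector<uint8_t> data(pinned.size(), 0xff);
| //   pinned.MemcpyFrom(data.data());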
| template <typename T> |
| class HostMemory { |
| public: |
| // Allocates a block of memory for holding up to size objects of type T. |
| HostMemory(size_t size) { |
| T *memory; |
| CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T))); |
| span_ = std::span<T>(memory, size); |
| } |
| HostMemory(const HostMemory &) = delete; |
| HostMemory &operator=(const HostMemory &) = delete; |
| |
| virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); } |
| |
| // Returns a pointer to the memory. |
| T *get() { return span_.data(); } |
| const T *get() const { return span_.data(); } |
| |
| // Returns the number of objects the memory can hold. |
| size_t size() const { return span_.size(); } |
| |
| // Copies data from other (host memory) into this memory.
| void MemcpyFrom(const T *other) {
| memcpy(span_.data(), other, sizeof(T) * size());
| }
| // Copies data from this memory to other (host memory).
| void MemcpyTo(T *other) const {
| memcpy(other, span_.data(), sizeof(T) * size());
| }
| |
| private: |
| std::span<T> span_; |
| }; |
| |
| // Class to manage the lifetime of device memory. |
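| //
| // Example usage (a minimal sketch; `MyKernel`, the image dimensions, and the
| // launch configuration are hypothetical):
| //
| //   CudaStream stream;
| //   HostMemory<uint8_t> host_image(width * height);
| //   GpuMemory<uint8_t> device_image(width * height);
| //   device_image.MemcpyAsyncFrom(&host_image, &stream);
| //   MyKernel<<<blocks, threads, 0, stream.get()>>>(device_image.get(), ...);
| //   device_image.MemcpyAsyncTo(&host_image, &stream);
| //   CHECK_CUDA(cudaStreamSynchronize(stream.get()));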
| template <typename T> |
| class GpuMemory { |
| public: |
| // Allocates a block of memory for holding up to size objects of type T in |
| // device memory. |
| GpuMemory(size_t size) : size_(size) { |
| CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T))); |
| } |
| GpuMemory(const GpuMemory &) = delete; |
| GpuMemory &operator=(const GpuMemory &) = delete; |
| |
| virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); } |
| |
| // Returns the device pointer to the memory. |
| T *get() { return memory_; } |
| const T *get() const { return memory_; } |
| |
| // Returns the number of objects this memory can hold. |
| size_t size() const { return size_; } |
| |
| // Copies data from host memory to this memory asynchronously on the provided |
| // stream. |
| void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) { |
| CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_, |
| cudaMemcpyHostToDevice, stream->get())); |
| } |
| void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) { |
| MemcpyAsyncFrom(host_memory->get(), stream); |
| } |
| |
| // Copies data to host memory from this memory asynchronously on the provided |
| // stream. |
| void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const { |
| CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory), |
| reinterpret_cast<void *>(memory_), |
| sizeof(T) * size, cudaMemcpyDeviceToHost, |
| stream->get())); |
| } |
| void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const { |
| MemcpyAsyncTo(host_memory, size_, stream); |
| } |
| void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const { |
| MemcpyAsyncTo(host_memory->get(), stream); |
| } |
| |
| // Copies data from host_memory to this memory, blocking until the copy completes.
| void MemcpyFrom(const T *host_memory) { |
| CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_), |
| reinterpret_cast<const void *>(host_memory), |
| sizeof(T) * size_, cudaMemcpyHostToDevice)); |
| } |
| void MemcpyFrom(const HostMemory<T> *host_memory) { |
| MemcpyFrom(host_memory->get()); |
| } |
| |
| // Copies data to host_memory from this memory. Only copies size objects. |
| void MemcpyTo(T *host_memory, size_t size) const { |
| CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_, |
| sizeof(T) * size, cudaMemcpyDeviceToHost)); |
| } |
| // Copies data to host_memory from this memory. |
| void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); } |
| void MemcpyTo(HostMemory<T> *host_memory) const { |
| MemcpyTo(host_memory->get()); |
| } |
| |
| // Asynchronously sets every byte of this memory to 'val' on the provided
| // stream.
| void MemsetAsync(const uint8_t val, CudaStream *stream) const { |
| CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get())); |
| } |
| |
| // Allocates a vector on the host, copies the first s objects into it, and
| // returns it.
| std::vector<T> Copy(size_t s) const { |
| CHECK_LE(s, size_); |
| std::vector<T> result(s); |
| MemcpyTo(result.data(), s); |
| return result; |
| } |
| |
| // Copies all the objects in this memory to a vector on the host and returns |
| // it. |
| std::vector<T> Copy() const { return Copy(size_); } |
| |
| private: |
| T *memory_; |
| const size_t size_; |
| }; |
| |
| // Synchronizes the device and CHECKs that the last CUDA operation succeeded.
| void CheckAndSynchronize(std::string_view message = ""); |
| |
| // Synchronizes and CHECKs iff --sync is passed on the command line. This lets
| // us leave debugging checks in the code.
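| //
| // Example usage (a minimal sketch; `MyKernel` and its launch configuration
| // are hypothetical):
| //
| //   MyKernel<<<blocks, threads, 0, stream.get()>>>(args...);
| //   MaybeCheckAndSynchronize("MyKernel");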
| void MaybeCheckAndSynchronize(); |
| void MaybeCheckAndSynchronize(std::string_view message); |
| |
| } // namespace frc971::apriltag |
| |
| #endif // FRC971_ORIN_CUDA_H_ |