#ifndef FRC971_ORIN_CUDA_H_
#define FRC971_ORIN_CUDA_H_

#include <chrono>
#include <cstdint>
#include <cstring>
#include <span>
#include <string_view>
#include <vector>

#include "glog/logging.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// CHECKs that a CUDA method returned success.
// TODO(austin): This will not handle if and else statements quite right, fix
// if we care.
#define CHECK_CUDA(condition)                                                \
  if (auto c = condition)                                                    \
  LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c)   \
             << ") "
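
// Example usage (a sketch, not part of this header's API):
//
//   CHECK_CUDA(cudaSetDevice(0));
//
// Because the macro expands to a bare if statement, wrap it in braces when
// using it as the body of another if/else.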

namespace frc971::apriltag {

// Class to manage the lifetime of a CUDA stream. This is used to provide
// relative ordering between kernels on the same stream.
class CudaStream {
 public:
  CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }

  CudaStream(const CudaStream &) = delete;
  CudaStream &operator=(const CudaStream &) = delete;

  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }

  // Returns the stream.
  cudaStream_t get() { return stream_; }

 private:
  cudaStream_t stream_;
};

// Class to manage the lifetime of a CUDA event. CUDA events are used for
// timing events on a stream.
class CudaEvent {
 public:
  CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }

  CudaEvent(const CudaEvent &) = delete;
  CudaEvent &operator=(const CudaEvent &) = delete;

  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }

  // Queues up an event to be timestamped on the stream when it is executed.
  void Record(CudaStream *stream) {
    CHECK_CUDA(cudaEventRecord(event_, stream->get()));
  }

  // Returns the time elapsed between start and this event if it has been
  // triggered.
  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::duration<float, std::milli>(ms));
  }

  // Waits until the event has been triggered.
  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }

 private:
  cudaEvent_t event_;
};
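
// Example (a sketch, not part of this header): timing a kernel launch with
// CudaEvent on a CudaStream. `MyKernel` and its launch configuration are
// hypothetical placeholders.
//
//   CudaStream stream;
//   CudaEvent start, stop;
//   start.Record(&stream);
//   MyKernel<<<blocks, threads, 0, stream.get()>>>(args);
//   stop.Record(&stream);
//   stop.Synchronize();
//   LOG(INFO) << "Kernel took " << stop.ElapsedTime(start).count() << "ns";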

// Class to manage the lifetime of page-locked host memory for fast copies back
// to host memory.
template <typename T>
class HostMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T.
  HostMemory(size_t size) {
    T *memory;
    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
    span_ = std::span<T>(memory, size);
  }
  HostMemory(const HostMemory &) = delete;
  HostMemory &operator=(const HostMemory &) = delete;

  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }

  // Returns a pointer to the memory.
  T *get() { return span_.data(); }
  const T *get() const { return span_.data(); }

  // Returns the number of objects the memory can hold.
  size_t size() const { return span_.size(); }

  // Copies data from other (host memory) into this memory.
  void MemcpyFrom(const T *other) {
    memcpy(span_.data(), other, sizeof(T) * size());
  }
  // Copies data from this memory to other (host memory).
  void MemcpyTo(T *other) const {
    memcpy(other, span_.data(), sizeof(T) * size());
  }

 private:
  std::span<T> span_;
};
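
// Example (a sketch, not part of this header): a pinned staging buffer.
// Pinned (page-locked) allocations are what let cudaMemcpyAsync overlap with
// kernel execution. The buffer size and `camera_frame` are hypothetical.
//
//   HostMemory<uint8_t> staging(640 * 480);
//   staging.MemcpyFrom(camera_frame);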

// Class to manage the lifetime of device memory.
template <typename T>
class GpuMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T in
  // device memory.
  GpuMemory(size_t size) : size_(size) {
    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
  }
  GpuMemory(const GpuMemory &) = delete;
  GpuMemory &operator=(const GpuMemory &) = delete;

  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }

  // Returns the device pointer to the memory.
  T *get() { return memory_; }
  const T *get() const { return memory_; }

  // Returns the number of objects this memory can hold.
  size_t size() const { return size_; }

  // Copies data from host memory to this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
                               cudaMemcpyHostToDevice, stream->get()));
  }
  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
    MemcpyAsyncFrom(host_memory->get(), stream);
  }

  // Copies data to host memory from this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const {
    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
                               reinterpret_cast<void *>(memory_),
                               sizeof(T) * size, cudaMemcpyDeviceToHost,
                               stream->get()));
  }
  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory, size_, stream);
  }
  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory->get(), stream);
  }

  // Copies data from host_memory to this memory, blocking.
  void MemcpyFrom(const T *host_memory) {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
                          reinterpret_cast<const void *>(host_memory),
                          sizeof(T) * size_, cudaMemcpyHostToDevice));
  }
  void MemcpyFrom(const HostMemory<T> *host_memory) {
    MemcpyFrom(host_memory->get());
  }

  // Copies data to host_memory from this memory. Only copies size objects.
  void MemcpyTo(T *host_memory, size_t size) const {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
                          sizeof(T) * size, cudaMemcpyDeviceToHost));
  }
  // Copies data to host_memory from this memory.
  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
  void MemcpyTo(HostMemory<T> *host_memory) const {
    MemcpyTo(host_memory->get());
  }

  // Asynchronously sets every byte of this memory to 'val' on the provided
  // stream.
  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
  }

  // Allocates a vector on the host, copies s objects into it, and returns it.
  std::vector<T> Copy(size_t s) const {
    CHECK_LE(s, size_);
    std::vector<T> result(s);
    MemcpyTo(result.data(), s);
    return result;
  }

  // Copies all the objects in this memory to a vector on the host and returns
  // it.
  std::vector<T> Copy() const { return Copy(size_); }

 private:
  T *memory_;
  const size_t size_;
};
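
// Example (a sketch, not part of this header): round-tripping data through
// device memory on a stream. Sizes and variable names are hypothetical.
//
//   CudaStream stream;
//   HostMemory<uint32_t> host_buffer(1024);
//   GpuMemory<uint32_t> device_buffer(1024);
//   device_buffer.MemcpyAsyncFrom(&host_buffer, &stream);
//   // ... launch kernels on stream.get() that read/write device_buffer ...
//   device_buffer.MemcpyAsyncTo(&host_buffer, &stream);
//   CHECK_CUDA(cudaStreamSynchronize(stream.get()));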

// Synchronizes and CHECKs for success the last CUDA operation.
void CheckAndSynchronize(std::string_view message = "");

// Synchronizes and CHECKs only if --sync is passed on the command line. This
// lets debugging synchronization points stay in the code.
void MaybeCheckAndSynchronize();
void MaybeCheckAndSynchronize(std::string_view message);
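
// Example (a sketch, not part of this header): leaving named synchronization
// points between kernel launches while debugging. The label is a hypothetical
// placeholder.
//
//   MaybeCheckAndSynchronize("after threshold kernel");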

}  // namespace frc971::apriltag

#endif  // FRC971_ORIN_CUDA_H_