blob: 0e44fb4d6afc333f66f01112a653f0a8efc74889 [file] [log] [blame]
Austin Schuh8c267c72023-11-18 14:05:14 -08001#ifndef FRC971_ORIN_CUDA_H_
2#define FRC971_ORIN_CUDA_H_
3
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <span>
#include <string_view>
#include <vector>

#include "glog/logging.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
11
// CHECKs that a cuda method returned success.
//
// Expands to a complete if/else statement so that using the macro as the body
// of an unbraced if at the call site cannot capture a dangling "else".  The
// trailing stream insertion is deliberately left open so callers can append
// extra context:
//   CHECK_CUDA(cudaMalloc(...)) << " while allocating foo";
#define CHECK_CUDA(condition)                                                 \
  if (auto c = (condition); c == cudaSuccess) {                               \
  } else                                                                      \
    LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c)   \
               << ") "
19
Stephan Pleinesd99b1ee2024-02-02 20:56:44 -080020namespace frc971::apriltag {
Austin Schuh8c267c72023-11-18 14:05:14 -080021
22// Class to manage the lifetime of a Cuda stream. This is used to provide
23// relative ordering between kernels on the same stream.
24class CudaStream {
25 public:
26 CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }
27
28 CudaStream(const CudaStream &) = delete;
29 CudaStream &operator=(const CudaStream &) = delete;
30
31 virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }
32
33 // Returns the stream.
34 cudaStream_t get() { return stream_; }
35
36 private:
37 cudaStream_t stream_;
38};
39
40// Class to manage the lifetime of a Cuda Event. Cuda events are used for
41// timing events on a stream.
42class CudaEvent {
43 public:
44 CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }
45
46 CudaEvent(const CudaEvent &) = delete;
47 CudaEvent &operator=(const CudaEvent &) = delete;
48
49 virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }
50
51 // Queues up an event to be timestamped on the stream when it is executed.
52 void Record(CudaStream *stream) {
53 CHECK_CUDA(cudaEventRecord(event_, stream->get()));
54 }
55
56 // Returns the time elapsed between start and this event if it has been
57 // triggered.
58 std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
59 float ms;
60 CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
61 return std::chrono::duration_cast<std::chrono::nanoseconds>(
62 std::chrono::duration<float, std::milli>(ms));
63 }
64
65 // Waits until the event has been triggered.
66 void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }
67
68 private:
69 cudaEvent_t event_;
70};
71
72// Class to manage the lifetime of page locked host memory for fast copies back
73// to host memory.
74template <typename T>
75class HostMemory {
76 public:
77 // Allocates a block of memory for holding up to size objects of type T.
78 HostMemory(size_t size) {
79 T *memory;
80 CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
81 span_ = std::span<T>(memory, size);
82 }
83 HostMemory(const HostMemory &) = delete;
84 HostMemory &operator=(const HostMemory &) = delete;
85
86 virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
87
88 // Returns a pointer to the memory.
89 T *get() { return span_.data(); }
90 const T *get() const { return span_.data(); }
91
92 // Returns the number of objects the memory can hold.
93 size_t size() const { return span_.size(); }
94
95 // Copies data from other (host memory) to this's memory.
96 void MemcpyFrom(const T *other) {
97 memcpy(span_.data(), other, sizeof(T) * size());
98 }
99 // Copies data to other (host memory) from this's memory.
100 void MemcpyTo(const T *other) {
101 memcpy(other, span_.data(), sizeof(T) * size());
102 }
103
104 private:
105 std::span<T> span_;
106};
107
108// Class to manage the lifetime of device memory.
109template <typename T>
110class GpuMemory {
111 public:
112 // Allocates a block of memory for holding up to size objects of type T in
113 // device memory.
114 GpuMemory(size_t size) : size_(size) {
115 CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
116 }
117 GpuMemory(const GpuMemory &) = delete;
118 GpuMemory &operator=(const GpuMemory &) = delete;
119
120 virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
121
122 // Returns the device pointer to the memory.
123 T *get() { return memory_; }
124 const T *get() const { return memory_; }
125
126 // Returns the number of objects this memory can hold.
127 size_t size() const { return size_; }
128
129 // Copies data from host memory to this memory asynchronously on the provided
130 // stream.
131 void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
132 CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
133 cudaMemcpyHostToDevice, stream->get()));
134 }
135 void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
136 MemcpyAsyncFrom(host_memory->get(), stream);
137 }
138
139 // Copies data to host memory from this memory asynchronously on the provided
140 // stream.
Austin Schuh1fc51fa2024-01-01 12:34:00 -0800141 void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const {
Austin Schuh8c267c72023-11-18 14:05:14 -0800142 CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
143 reinterpret_cast<void *>(memory_),
Austin Schuh1fc51fa2024-01-01 12:34:00 -0800144 sizeof(T) * size, cudaMemcpyDeviceToHost,
Austin Schuh8c267c72023-11-18 14:05:14 -0800145 stream->get()));
146 }
Austin Schuh1fc51fa2024-01-01 12:34:00 -0800147 void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
148 MemcpyAsyncTo(host_memory, size_, stream);
149 }
Austin Schuh8c267c72023-11-18 14:05:14 -0800150 void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
151 MemcpyAsyncTo(host_memory->get(), stream);
152 }
153
154 // Copies data from host_memory to this memory blocking.
155 void MemcpyFrom(const T *host_memory) {
156 CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
157 reinterpret_cast<const void *>(host_memory),
158 sizeof(T) * size_, cudaMemcpyHostToDevice));
159 }
160 void MemcpyFrom(const HostMemory<T> *host_memory) {
161 MemcpyFrom(host_memory->get());
162 }
163
164 // Copies data to host_memory from this memory. Only copies size objects.
165 void MemcpyTo(T *host_memory, size_t size) const {
166 CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
167 sizeof(T) * size, cudaMemcpyDeviceToHost));
168 }
169 // Copies data to host_memory from this memory.
170 void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
171 void MemcpyTo(HostMemory<T> *host_memory) const {
172 MemcpyTo(host_memory->get());
173 }
174
175 // Sets the memory asynchronously to contain data of type 'val' on the provide
176 // stream.
177 void MemsetAsync(const uint8_t val, CudaStream *stream) const {
178 CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
179 }
180
181 // Allocates a vector on the host, copies size objects into it, and returns
182 // it.
183 std::vector<T> Copy(size_t s) const {
184 CHECK_LE(s, size_);
185 std::vector<T> result(s);
186 MemcpyTo(result.data(), s);
187 return result;
188 }
189
190 // Copies all the objects in this memory to a vector on the host and returns
191 // it.
192 std::vector<T> Copy() const { return Copy(size_); }
193
194 private:
195 T *memory_;
196 const size_t size_;
197};
198
// Synchronizes the device and CHECKs that the last CUDA operation succeeded.
// NOTE(review): message presumably tags the failure output — confirm against
// the definition in the corresponding .cc file.
void CheckAndSynchronize(std::string_view message = "");

// Synchronizes and CHECKs iff --sync is passed on the command line.  Makes it
// so we can leave debugging in the code.
void MaybeCheckAndSynchronize();
void MaybeCheckAndSynchronize(std::string_view message);
Austin Schuh8c267c72023-11-18 14:05:14 -0800206
Stephan Pleinesd99b1ee2024-02-02 20:56:44 -0800207} // namespace frc971::apriltag
Austin Schuh8c267c72023-11-18 14:05:14 -0800208
209#endif // FRC971_ORIN_CUDA_H_