blob: e386aa624f4819cc1bd8597a726732ab89769c15 [file] [log] [blame]
Austin Schuh8c267c72023-11-18 14:05:14 -08001#ifndef FRC971_ORIN_CUDA_H_
2#define FRC971_ORIN_CUDA_H_
3
4#include <chrono>
5#include <span>
6
7#include "glog/logging.h"
8
9#include "cuda_runtime.h"
10#include "device_launch_parameters.h"
11
12// CHECKs that a cuda method returned success.
13// TODO(austin): This will not handle if and else statements quite right, fix if
14// we care.
15#define CHECK_CUDA(condition) \
16 if (auto c = condition) \
17 LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \
18 << ") "
19
20namespace frc971 {
21namespace apriltag {
22
23// Class to manage the lifetime of a Cuda stream. This is used to provide
24// relative ordering between kernels on the same stream.
25class CudaStream {
26 public:
27 CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }
28
29 CudaStream(const CudaStream &) = delete;
30 CudaStream &operator=(const CudaStream &) = delete;
31
32 virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }
33
34 // Returns the stream.
35 cudaStream_t get() { return stream_; }
36
37 private:
38 cudaStream_t stream_;
39};
40
41// Class to manage the lifetime of a Cuda Event. Cuda events are used for
42// timing events on a stream.
43class CudaEvent {
44 public:
45 CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }
46
47 CudaEvent(const CudaEvent &) = delete;
48 CudaEvent &operator=(const CudaEvent &) = delete;
49
50 virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }
51
52 // Queues up an event to be timestamped on the stream when it is executed.
53 void Record(CudaStream *stream) {
54 CHECK_CUDA(cudaEventRecord(event_, stream->get()));
55 }
56
57 // Returns the time elapsed between start and this event if it has been
58 // triggered.
59 std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
60 float ms;
61 CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
62 return std::chrono::duration_cast<std::chrono::nanoseconds>(
63 std::chrono::duration<float, std::milli>(ms));
64 }
65
66 // Waits until the event has been triggered.
67 void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }
68
69 private:
70 cudaEvent_t event_;
71};
72
73// Class to manage the lifetime of page locked host memory for fast copies back
74// to host memory.
75template <typename T>
76class HostMemory {
77 public:
78 // Allocates a block of memory for holding up to size objects of type T.
79 HostMemory(size_t size) {
80 T *memory;
81 CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
82 span_ = std::span<T>(memory, size);
83 }
84 HostMemory(const HostMemory &) = delete;
85 HostMemory &operator=(const HostMemory &) = delete;
86
87 virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
88
89 // Returns a pointer to the memory.
90 T *get() { return span_.data(); }
91 const T *get() const { return span_.data(); }
92
93 // Returns the number of objects the memory can hold.
94 size_t size() const { return span_.size(); }
95
96 // Copies data from other (host memory) to this's memory.
97 void MemcpyFrom(const T *other) {
98 memcpy(span_.data(), other, sizeof(T) * size());
99 }
100 // Copies data to other (host memory) from this's memory.
101 void MemcpyTo(const T *other) {
102 memcpy(other, span_.data(), sizeof(T) * size());
103 }
104
105 private:
106 std::span<T> span_;
107};
108
109// Class to manage the lifetime of device memory.
110template <typename T>
111class GpuMemory {
112 public:
113 // Allocates a block of memory for holding up to size objects of type T in
114 // device memory.
115 GpuMemory(size_t size) : size_(size) {
116 CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
117 }
118 GpuMemory(const GpuMemory &) = delete;
119 GpuMemory &operator=(const GpuMemory &) = delete;
120
121 virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
122
123 // Returns the device pointer to the memory.
124 T *get() { return memory_; }
125 const T *get() const { return memory_; }
126
127 // Returns the number of objects this memory can hold.
128 size_t size() const { return size_; }
129
130 // Copies data from host memory to this memory asynchronously on the provided
131 // stream.
132 void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
133 CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
134 cudaMemcpyHostToDevice, stream->get()));
135 }
136 void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
137 MemcpyAsyncFrom(host_memory->get(), stream);
138 }
139
140 // Copies data to host memory from this memory asynchronously on the provided
141 // stream.
142 void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
143 CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
144 reinterpret_cast<void *>(memory_),
145 sizeof(T) * size_, cudaMemcpyDeviceToHost,
146 stream->get()));
147 }
148 void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
149 MemcpyAsyncTo(host_memory->get(), stream);
150 }
151
152 // Copies data from host_memory to this memory blocking.
153 void MemcpyFrom(const T *host_memory) {
154 CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
155 reinterpret_cast<const void *>(host_memory),
156 sizeof(T) * size_, cudaMemcpyHostToDevice));
157 }
158 void MemcpyFrom(const HostMemory<T> *host_memory) {
159 MemcpyFrom(host_memory->get());
160 }
161
162 // Copies data to host_memory from this memory. Only copies size objects.
163 void MemcpyTo(T *host_memory, size_t size) const {
164 CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
165 sizeof(T) * size, cudaMemcpyDeviceToHost));
166 }
167 // Copies data to host_memory from this memory.
168 void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
169 void MemcpyTo(HostMemory<T> *host_memory) const {
170 MemcpyTo(host_memory->get());
171 }
172
173 // Sets the memory asynchronously to contain data of type 'val' on the provide
174 // stream.
175 void MemsetAsync(const uint8_t val, CudaStream *stream) const {
176 CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
177 }
178
179 // Allocates a vector on the host, copies size objects into it, and returns
180 // it.
181 std::vector<T> Copy(size_t s) const {
182 CHECK_LE(s, size_);
183 std::vector<T> result(s);
184 MemcpyTo(result.data(), s);
185 return result;
186 }
187
188 // Copies all the objects in this memory to a vector on the host and returns
189 // it.
190 std::vector<T> Copy() const { return Copy(size_); }
191
192 private:
193 T *memory_;
194 const size_t size_;
195};
196
197// Synchronizes and CHECKs for success the last CUDA operation.
198void CheckAndSynchronize();
199
200// Synchronizes and CHECKS iff --sync is passed on the command line. Makes it
201// so we can leave debugging in the code.
202void MaybeCheckAndSynchronize();
203
204} // namespace apriltag
205} // namespace frc971
206
207#endif // FRC971_ORIN_CUDA_H_