#ifndef FRC971_ORIN_CUDA_H_
#define FRC971_ORIN_CUDA_H_

#include <chrono>
#include <cstdint>
#include <cstring>
#include <span>
#include <string_view>
#include <vector>

#include "absl/log/check.h"
#include "absl/log/log.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// CHECKs that a cuda method returned success.
// TODO(austin): This will not handle if and else statements quite right, fix if
// we care.
#define CHECK_CUDA(condition)                                                \
  if (auto c = condition)                                                    \
  LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c)   \
             << ") "

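// Example usage (illustrative only): wrap each CUDA runtime call so a
// non-success return code aborts with the CUDA error string attached.
//
//   CHECK_CUDA(cudaSetDevice(0));
//
// Note the TODO above: because the macro expands to a bare `if`, writing
// `if (x) CHECK_CUDA(...); else ...;` attaches the `else` to the macro's
// internal `if` rather than the caller's.
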
namespace frc971::apriltag {

// Class to manage the lifetime of a Cuda stream. This is used to provide
// relative ordering between kernels on the same stream.
class CudaStream {
 public:
  CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }

  CudaStream(const CudaStream &) = delete;
  CudaStream &operator=(const CudaStream &) = delete;

  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }

  // Returns the stream.
  cudaStream_t get() { return stream_; }

 private:
  cudaStream_t stream_;
};

// Class to manage the lifetime of a Cuda event. Cuda events are used for
// timing events on a stream.
class CudaEvent {
 public:
  CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }

  CudaEvent(const CudaEvent &) = delete;
  CudaEvent &operator=(const CudaEvent &) = delete;

  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }

  // Queues up an event to be timestamped on the stream when it is executed.
  void Record(CudaStream *stream) {
    CHECK_CUDA(cudaEventRecord(event_, stream->get()));
  }

  // Returns the time elapsed between start and this event if it has been
  // triggered.
  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::duration<float, std::milli>(ms));
  }

  // Waits until the event has been triggered.
  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }

 private:
  cudaEvent_t event_;
};

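// Example usage (illustrative sketch): timing work on a stream by recording
// an event before and after it. `SomeKernel`, `blocks`, and `threads` are
// placeholders, not part of this header.
//
//   CudaStream stream;
//   CudaEvent start, stop;
//   start.Record(&stream);
//   SomeKernel<<<blocks, threads, 0, stream.get()>>>(...);
//   stop.Record(&stream);
//   stop.Synchronize();
//   LOG(INFO) << "Kernel took " << stop.ElapsedTime(start).count() << "ns";
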
// Class to manage the lifetime of page locked host memory for fast copies back
// to host memory.
template <typename T>
class HostMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T.
  HostMemory(size_t size) {
    T *memory;
    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
    span_ = std::span<T>(memory, size);
  }
  HostMemory(const HostMemory &) = delete;
  HostMemory &operator=(const HostMemory &) = delete;

  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }

  // Returns a pointer to the memory.
  T *get() { return span_.data(); }
  const T *get() const { return span_.data(); }

  // Returns the number of objects the memory can hold.
  size_t size() const { return span_.size(); }

  // Copies data from other (host memory) into this block's memory.
  void MemcpyFrom(const T *other) {
    memcpy(span_.data(), other, sizeof(T) * size());
  }
  // Copies data from this block's memory out to other (host memory).
  void MemcpyTo(T *other) const {
    memcpy(other, span_.data(), sizeof(T) * size());
  }

 private:
  std::span<T> span_;
};

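// Example usage (illustrative): a pinned staging buffer. The element type,
// size, and `source_pixels` pointer are arbitrary placeholders.
//
//   HostMemory<uint16_t> staging(640 * 480);
//   staging.MemcpyFrom(source_pixels);  // Plain host memory -> pinned memory.
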
// Class to manage the lifetime of device memory.
template <typename T>
class GpuMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T in
  // device memory.
  GpuMemory(size_t size) : size_(size) {
    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
  }
  GpuMemory(const GpuMemory &) = delete;
  GpuMemory &operator=(const GpuMemory &) = delete;

  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }

  // Returns the device pointer to the memory.
  T *get() { return memory_; }
  const T *get() const { return memory_; }

  // Returns the number of objects this memory can hold.
  size_t size() const { return size_; }

  // Copies data from host memory to this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
                               cudaMemcpyHostToDevice, stream->get()));
  }
  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
    MemcpyAsyncFrom(host_memory->get(), stream);
  }

  // Copies data to host memory from this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const {
    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
                               reinterpret_cast<void *>(memory_),
                               sizeof(T) * size, cudaMemcpyDeviceToHost,
                               stream->get()));
  }
  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory, size_, stream);
  }
  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory->get(), stream);
  }

  // Copies data from host_memory to this memory, blocking until done.
  void MemcpyFrom(const T *host_memory) {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
                          reinterpret_cast<const void *>(host_memory),
                          sizeof(T) * size_, cudaMemcpyHostToDevice));
  }
  void MemcpyFrom(const HostMemory<T> *host_memory) {
    MemcpyFrom(host_memory->get());
  }

  // Copies data to host_memory from this memory. Only copies size objects.
  void MemcpyTo(T *host_memory, size_t size) const {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
                          sizeof(T) * size, cudaMemcpyDeviceToHost));
  }
  // Copies data to host_memory from this memory.
  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
  void MemcpyTo(HostMemory<T> *host_memory) const {
    MemcpyTo(host_memory->get());
  }

  // Asynchronously sets every byte of this memory to 'val' on the provided
  // stream.
  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
  }

  // Allocates a vector on the host, copies size objects into it, and returns
  // it.
  std::vector<T> Copy(size_t s) const {
    CHECK_LE(s, size_);
    std::vector<T> result(s);
    MemcpyTo(result.data(), s);
    return result;
  }

  // Copies all the objects in this memory to a vector on the host and returns
  // it.
  std::vector<T> Copy() const { return Copy(size_); }

 private:
  T *memory_;
  const size_t size_;
};

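// Example usage (illustrative sketch): staging data through pinned host memory
// and device memory on one stream. `kNumPoints` and `points` are placeholders,
// not part of this header.
//
//   CudaStream stream;
//   HostMemory<float> host_points(kNumPoints);
//   GpuMemory<float> device_points(kNumPoints);
//
//   host_points.MemcpyFrom(points);                         // Host -> pinned.
//   device_points.MemcpyAsyncFrom(&host_points, &stream);   // Pinned -> device.
//   // ... launch kernels on stream.get() that use device_points.get() ...
//   device_points.MemcpyAsyncTo(&host_points, &stream);     // Device -> pinned.
//   CHECK_CUDA(cudaStreamSynchronize(stream.get()));
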
// Synchronizes and CHECKs that the last CUDA operation succeeded.
void CheckAndSynchronize(std::string_view message = "");

// Synchronizes and CHECKs only if --sync is passed on the command line. This
// lets debugging synchronization points be left in the code.
void MaybeCheckAndSynchronize();
void MaybeCheckAndSynchronize(std::string_view message);

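// Example usage (illustrative): optional synchronization points between kernel
// launches so --sync can localize which launch failed. `SomeKernel`, `blocks`,
// and `threads` are placeholders.
//
//   SomeKernel<<<blocks, threads, 0, stream.get()>>>(...);
//   MaybeCheckAndSynchronize("after SomeKernel");
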
}  // namespace frc971::apriltag

#endif  // FRC971_ORIN_CUDA_H_