#ifndef FRC971_ORIN_CUDA_H_
#define FRC971_ORIN_CUDA_H_

#include <chrono>
#include <cstdint>
#include <cstring>
#include <span>
#include <string_view>
#include <vector>

#include "glog/logging.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// CHECKs that a CUDA method returned success.
// TODO(austin): This will not handle if and else statements quite right, fix
// if we care.
#define CHECK_CUDA(condition)                                                \
  if (auto c = condition)                                                    \
  LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c)   \
             << ") "
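
// Example usage (a sketch, not part of this header's API):
//
//   CHECK_CUDA(cudaSetDevice(0));
//
// Because the macro expands to a bare if statement, wrap it in braces when
// using it as the body of another if/else.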

namespace frc971::apriltag {

// Class to manage the lifetime of a CUDA stream. This is used to provide
// relative ordering between kernels on the same stream.
class CudaStream {
 public:
  CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }

  CudaStream(const CudaStream &) = delete;
  CudaStream &operator=(const CudaStream &) = delete;

  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }

  // Returns the stream.
  cudaStream_t get() { return stream_; }

 private:
  cudaStream_t stream_;
};

// Class to manage the lifetime of a CUDA event. CUDA events are used for
// timing events on a stream.
class CudaEvent {
 public:
  CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }

  CudaEvent(const CudaEvent &) = delete;
  CudaEvent &operator=(const CudaEvent &) = delete;

  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }

  // Queues up an event to be timestamped on the stream when it is executed.
  void Record(CudaStream *stream) {
    CHECK_CUDA(cudaEventRecord(event_, stream->get()));
  }

  // Returns the time elapsed between start and this event if it has been
  // triggered.
  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::duration<float, std::milli>(ms));
  }

  // Waits until the event has been triggered.
  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }

 private:
  cudaEvent_t event_;
};
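
// Example (a sketch, not part of this header): timing a kernel launch with
// CudaEvent on a CudaStream. `MyKernel` and its launch configuration are
// hypothetical placeholders.
//
//   CudaStream stream;
//   CudaEvent start, stop;
//   start.Record(&stream);
//   MyKernel<<<blocks, threads, 0, stream.get()>>>(args);
//   stop.Record(&stream);
//   stop.Synchronize();
//   LOG(INFO) << "Kernel took " << stop.ElapsedTime(start).count() << "ns";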

// Class to manage the lifetime of page-locked host memory for fast copies back
// to host memory.
template <typename T>
class HostMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T.
  HostMemory(size_t size) {
    T *memory;
    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
    span_ = std::span<T>(memory, size);
  }
  HostMemory(const HostMemory &) = delete;
  HostMemory &operator=(const HostMemory &) = delete;

  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }

  // Returns a pointer to the memory.
  T *get() { return span_.data(); }
  const T *get() const { return span_.data(); }

  // Returns the number of objects the memory can hold.
  size_t size() const { return span_.size(); }

  // Copies data from other (host memory) into this memory.
  void MemcpyFrom(const T *other) {
    memcpy(span_.data(), other, sizeof(T) * size());
  }
  // Copies data from this memory to other (host memory).
  void MemcpyTo(T *other) const {
    memcpy(other, span_.data(), sizeof(T) * size());
  }

 private:
  std::span<T> span_;
};
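
// Example (a sketch, not part of this header): a pinned staging buffer.
// Pinned (page-locked) allocations are what let cudaMemcpyAsync overlap with
// kernel execution. The buffer size and `camera_frame` are hypothetical.
//
//   HostMemory<uint8_t> staging(640 * 480);
//   staging.MemcpyFrom(camera_frame);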

// Class to manage the lifetime of device memory.
template <typename T>
class GpuMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T in
  // device memory.
  GpuMemory(size_t size) : size_(size) {
    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
  }
  GpuMemory(const GpuMemory &) = delete;
  GpuMemory &operator=(const GpuMemory &) = delete;

  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }

  // Returns the device pointer to the memory.
  T *get() { return memory_; }
  const T *get() const { return memory_; }

  // Returns the number of objects this memory can hold.
  size_t size() const { return size_; }

  // Copies data from host memory to this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
                               cudaMemcpyHostToDevice, stream->get()));
  }
  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
    MemcpyAsyncFrom(host_memory->get(), stream);
  }

  // Copies data to host memory from this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const {
    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
                               reinterpret_cast<void *>(memory_),
                               sizeof(T) * size, cudaMemcpyDeviceToHost,
                               stream->get()));
  }
  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory, size_, stream);
  }
  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory->get(), stream);
  }

  // Copies data from host_memory to this memory, blocking.
  void MemcpyFrom(const T *host_memory) {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
                          reinterpret_cast<const void *>(host_memory),
                          sizeof(T) * size_, cudaMemcpyHostToDevice));
  }
  void MemcpyFrom(const HostMemory<T> *host_memory) {
    MemcpyFrom(host_memory->get());
  }

  // Copies data to host_memory from this memory. Only copies size objects.
  void MemcpyTo(T *host_memory, size_t size) const {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
                          sizeof(T) * size, cudaMemcpyDeviceToHost));
  }
  // Copies data to host_memory from this memory.
  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
  void MemcpyTo(HostMemory<T> *host_memory) const {
    MemcpyTo(host_memory->get());
  }

  // Asynchronously sets every byte of this memory to 'val' on the provided
  // stream.
  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
  }

  // Allocates a vector on the host, copies s objects into it, and returns it.
  std::vector<T> Copy(size_t s) const {
    CHECK_LE(s, size_);
    std::vector<T> result(s);
    MemcpyTo(result.data(), s);
    return result;
  }

  // Copies all the objects in this memory to a vector on the host and returns
  // it.
  std::vector<T> Copy() const { return Copy(size_); }

 private:
  T *memory_;
  const size_t size_;
};
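
// Example (a sketch, not part of this header): round-tripping data through
// device memory on a stream. Sizes and variable names are hypothetical.
//
//   CudaStream stream;
//   HostMemory<uint32_t> host_buffer(1024);
//   GpuMemory<uint32_t> device_buffer(1024);
//   device_buffer.MemcpyAsyncFrom(&host_buffer, &stream);
//   // ... launch kernels on stream.get() that read/write device_buffer ...
//   device_buffer.MemcpyAsyncTo(&host_buffer, &stream);
//   CHECK_CUDA(cudaStreamSynchronize(stream.get()));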

// Synchronizes and CHECKs for success the last CUDA operation.
void CheckAndSynchronize(std::string_view message = "");

// Synchronizes and CHECKs only if --sync is passed on the command line. This
// lets debugging synchronization points stay in the code.
void MaybeCheckAndSynchronize();
void MaybeCheckAndSynchronize(std::string_view message);
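
// Example (a sketch, not part of this header): leaving named synchronization
// points between kernel launches while debugging. The label is a hypothetical
// placeholder.
//
//   MaybeCheckAndSynchronize("after threshold kernel");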

}  // namespace frc971::apriltag

#endif  // FRC971_ORIN_CUDA_H_