Blame - frc971/orin/cuda.h - RealtimeRoboticsGroup/test

blob: e386aa624f4819cc1bd8597a726732ab89769c15 [file] [log] [blame]

Austin Schuh	8c267c7	2023-11-18 14:05:14 -0800	[diff] [blame^]	1	#ifndef FRC971_ORIN_CUDA_H_
				2	#define FRC971_ORIN_CUDA_H_
				3
				4	#include <chrono>
				5	#include <span>
				6
				7	#include "glog/logging.h"
				8
				9	#include "cuda_runtime.h"
				10	#include "device_launch_parameters.h"
				11
				12	// CHECKs that a cuda method returned success.
				13	// TODO(austin): This will not handle if and else statements quite right, fix if
				14	// we care.
				15	#define CHECK_CUDA(condition) \
				16	if (auto c = condition) \
				17	LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \
				18	<< ") "
				19
				20	namespace frc971 {
				21	namespace apriltag {
				22
				23	// Class to manage the lifetime of a Cuda stream. This is used to provide
				24	// relative ordering between kernels on the same stream.
				25	class CudaStream {
				26	public:
				27	CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }
				28
				29	CudaStream(const CudaStream &) = delete;
				30	CudaStream &operator=(const CudaStream &) = delete;
				31
				32	virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }
				33
				34	// Returns the stream.
				35	cudaStream_t get() { return stream_; }
				36
				37	private:
				38	cudaStream_t stream_;
				39	};
				40
				41	// Class to manage the lifetime of a Cuda Event. Cuda events are used for
				42	// timing events on a stream.
				43	class CudaEvent {
				44	public:
				45	CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }
				46
				47	CudaEvent(const CudaEvent &) = delete;
				48	CudaEvent &operator=(const CudaEvent &) = delete;
				49
				50	virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }
				51
				52	// Queues up an event to be timestamped on the stream when it is executed.
				53	void Record(CudaStream *stream) {
				54	CHECK_CUDA(cudaEventRecord(event_, stream->get()));
				55	}
				56
				57	// Returns the time elapsed between start and this event if it has been
				58	// triggered.
				59	std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
				60	float ms;
				61	CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
				62	return std::chrono::duration_cast<std::chrono::nanoseconds>(
				63	std::chrono::duration<float, std::milli>(ms));
				64	}
				65
				66	// Waits until the event has been triggered.
				67	void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }
				68
				69	private:
				70	cudaEvent_t event_;
				71	};
				72
				73	// Class to manage the lifetime of page locked host memory for fast copies back
				74	// to host memory.
				75	template <typename T>
				76	class HostMemory {
				77	public:
				78	// Allocates a block of memory for holding up to size objects of type T.
				79	HostMemory(size_t size) {
				80	T *memory;
				81	CHECK_CUDA(cudaMallocHost((void *)(&memory), size sizeof(T)));
				82	span_ = std::span<T>(memory, size);
				83	}
				84	HostMemory(const HostMemory &) = delete;
				85	HostMemory &operator=(const HostMemory &) = delete;
				86
				87	virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
				88
				89	// Returns a pointer to the memory.
				90	T *get() { return span_.data(); }
				91	const T *get() const { return span_.data(); }
				92
				93	// Returns the number of objects the memory can hold.
				94	size_t size() const { return span_.size(); }
				95
				96	// Copies data from other (host memory) to this's memory.
				97	void MemcpyFrom(const T *other) {
				98	memcpy(span_.data(), other, sizeof(T) * size());
				99	}
				100	// Copies data to other (host memory) from this's memory.
				101	void MemcpyTo(const T *other) {
				102	memcpy(other, span_.data(), sizeof(T) * size());
				103	}
				104
				105	private:
				106	std::span<T> span_;
				107	};
				108
				109	// Class to manage the lifetime of device memory.
				110	template <typename T>
				111	class GpuMemory {
				112	public:
				113	// Allocates a block of memory for holding up to size objects of type T in
				114	// device memory.
				115	GpuMemory(size_t size) : size_(size) {
				116	CHECK_CUDA(cudaMalloc((void *)(&memory_), size sizeof(T)));
				117	}
				118	GpuMemory(const GpuMemory &) = delete;
				119	GpuMemory &operator=(const GpuMemory &) = delete;
				120
				121	virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
				122
				123	// Returns the device pointer to the memory.
				124	T *get() { return memory_; }
				125	const T *get() const { return memory_; }
				126
				127	// Returns the number of objects this memory can hold.
				128	size_t size() const { return size_; }
				129
				130	// Copies data from host memory to this memory asynchronously on the provided
				131	// stream.
				132	void MemcpyAsyncFrom(const T host_memory, CudaStream stream) {
				133	CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
				134	cudaMemcpyHostToDevice, stream->get()));
				135	}
				136	void MemcpyAsyncFrom(const HostMemory<T> host_memory, CudaStream stream) {
				137	MemcpyAsyncFrom(host_memory->get(), stream);
				138	}
				139
				140	// Copies data to host memory from this memory asynchronously on the provided
				141	// stream.
				142	void MemcpyAsyncTo(T host_memory, CudaStream stream) const {
				143	CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
				144	reinterpret_cast<void *>(memory_),
				145	sizeof(T) * size_, cudaMemcpyDeviceToHost,
				146	stream->get()));
				147	}
				148	void MemcpyAsyncTo(HostMemory<T> host_memory, CudaStream stream) const {
				149	MemcpyAsyncTo(host_memory->get(), stream);
				150	}
				151
				152	// Copies data from host_memory to this memory blocking.
				153	void MemcpyFrom(const T *host_memory) {
				154	CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
				155	reinterpret_cast<const void *>(host_memory),
				156	sizeof(T) * size_, cudaMemcpyHostToDevice));
				157	}
				158	void MemcpyFrom(const HostMemory<T> *host_memory) {
				159	MemcpyFrom(host_memory->get());
				160	}
				161
				162	// Copies data to host_memory from this memory. Only copies size objects.
				163	void MemcpyTo(T *host_memory, size_t size) const {
				164	CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
				165	sizeof(T) * size, cudaMemcpyDeviceToHost));
				166	}
				167	// Copies data to host_memory from this memory.
				168	void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
				169	void MemcpyTo(HostMemory<T> *host_memory) const {
				170	MemcpyTo(host_memory->get());
				171	}
				172
				173	// Sets the memory asynchronously to contain data of type 'val' on the provide
				174	// stream.
				175	void MemsetAsync(const uint8_t val, CudaStream *stream) const {
				176	CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
				177	}
				178
				179	// Allocates a vector on the host, copies size objects into it, and returns
				180	// it.
				181	std::vector<T> Copy(size_t s) const {
				182	CHECK_LE(s, size_);
				183	std::vector<T> result(s);
				184	MemcpyTo(result.data(), s);
				185	return result;
				186	}
				187
				188	// Copies all the objects in this memory to a vector on the host and returns
				189	// it.
				190	std::vector<T> Copy() const { return Copy(size_); }
				191
				192	private:
				193	T *memory_;
				194	const size_t size_;
				195	};
				196
				197	// Synchronizes and CHECKs for success the last CUDA operation.
				198	void CheckAndSynchronize();
				199
				200	// Synchronizes and CHECKS iff --sync is passed on the command line. Makes it
				201	// so we can leave debugging in the code.
				202	void MaybeCheckAndSynchronize();
				203
				204	} // namespace apriltag
				205	} // namespace frc971
				206
				207	#endif // FRC971_ORIN_CUDA_H_