#ifndef FRC971_ORIN_CUDA_H_
#define FRC971_ORIN_CUDA_H_

#include <chrono>
#include <cstdint>
#include <cstring>
#include <span>
#include <string_view>
#include <vector>

#include "absl/log/check.h"
#include "absl/log/log.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// CHECKs that a cuda method returned success.
// TODO(austin): This will not handle if and else statements quite right, fix if
// we care.
#define CHECK_CUDA(condition)                                                \
  if (auto c = condition)                                                    \
  LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c)   \
             << ") "

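// Example usage (illustrative only): wrap each CUDA runtime call so a
// non-success return code aborts with the CUDA error string attached.
//
//   CHECK_CUDA(cudaSetDevice(0));
//
// Note the TODO above: because the macro expands to a bare `if`, writing
// `if (x) CHECK_CUDA(...); else ...;` attaches the `else` to the macro's
// internal `if` rather than the caller's.
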
namespace frc971::apriltag {

// Class to manage the lifetime of a Cuda stream. This is used to provide
// relative ordering between kernels on the same stream.
class CudaStream {
 public:
  CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }

  CudaStream(const CudaStream &) = delete;
  CudaStream &operator=(const CudaStream &) = delete;

  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }

  // Returns the stream.
  cudaStream_t get() { return stream_; }

 private:
  cudaStream_t stream_;
};

// Class to manage the lifetime of a Cuda event. Cuda events are used for
// timing events on a stream.
class CudaEvent {
 public:
  CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }

  CudaEvent(const CudaEvent &) = delete;
  CudaEvent &operator=(const CudaEvent &) = delete;

  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }

  // Queues up an event to be timestamped on the stream when it is executed.
  void Record(CudaStream *stream) {
    CHECK_CUDA(cudaEventRecord(event_, stream->get()));
  }

  // Returns the time elapsed between start and this event if it has been
  // triggered.
  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::duration<float, std::milli>(ms));
  }

  // Waits until the event has been triggered.
  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }

 private:
  cudaEvent_t event_;
};

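// Example usage (illustrative sketch): timing work on a stream by recording
// an event before and after it. `SomeKernel`, `blocks`, and `threads` are
// placeholders, not part of this header.
//
//   CudaStream stream;
//   CudaEvent start, stop;
//   start.Record(&stream);
//   SomeKernel<<<blocks, threads, 0, stream.get()>>>(...);
//   stop.Record(&stream);
//   stop.Synchronize();
//   LOG(INFO) << "Kernel took " << stop.ElapsedTime(start).count() << "ns";
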
// Class to manage the lifetime of page locked host memory for fast copies back
// to host memory.
template <typename T>
class HostMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T.
  HostMemory(size_t size) {
    T *memory;
    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
    span_ = std::span<T>(memory, size);
  }
  HostMemory(const HostMemory &) = delete;
  HostMemory &operator=(const HostMemory &) = delete;

  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }

  // Returns a pointer to the memory.
  T *get() { return span_.data(); }
  const T *get() const { return span_.data(); }

  // Returns the number of objects the memory can hold.
  size_t size() const { return span_.size(); }

  // Copies data from other (host memory) into this block's memory.
  void MemcpyFrom(const T *other) {
    memcpy(span_.data(), other, sizeof(T) * size());
  }
  // Copies data from this block's memory out to other (host memory).
  void MemcpyTo(T *other) const {
    memcpy(other, span_.data(), sizeof(T) * size());
  }

 private:
  std::span<T> span_;
};

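// Example usage (illustrative): a pinned staging buffer. The element type,
// size, and `source_pixels` pointer are arbitrary placeholders.
//
//   HostMemory<uint16_t> staging(640 * 480);
//   staging.MemcpyFrom(source_pixels);  // Plain host memory -> pinned memory.
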
// Class to manage the lifetime of device memory.
template <typename T>
class GpuMemory {
 public:
  // Allocates a block of memory for holding up to size objects of type T in
  // device memory.
  GpuMemory(size_t size) : size_(size) {
    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
  }
  GpuMemory(const GpuMemory &) = delete;
  GpuMemory &operator=(const GpuMemory &) = delete;

  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }

  // Returns the device pointer to the memory.
  T *get() { return memory_; }
  const T *get() const { return memory_; }

  // Returns the number of objects this memory can hold.
  size_t size() const { return size_; }

  // Copies data from host memory to this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
                               cudaMemcpyHostToDevice, stream->get()));
  }
  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
    MemcpyAsyncFrom(host_memory->get(), stream);
  }

  // Copies data to host memory from this memory asynchronously on the provided
  // stream.
  void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const {
    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
                               reinterpret_cast<void *>(memory_),
                               sizeof(T) * size, cudaMemcpyDeviceToHost,
                               stream->get()));
  }
  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory, size_, stream);
  }
  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
    MemcpyAsyncTo(host_memory->get(), stream);
  }

  // Copies data from host_memory to this memory, blocking until done.
  void MemcpyFrom(const T *host_memory) {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
                          reinterpret_cast<const void *>(host_memory),
                          sizeof(T) * size_, cudaMemcpyHostToDevice));
  }
  void MemcpyFrom(const HostMemory<T> *host_memory) {
    MemcpyFrom(host_memory->get());
  }

  // Copies data to host_memory from this memory. Only copies size objects.
  void MemcpyTo(T *host_memory, size_t size) const {
    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
                          sizeof(T) * size, cudaMemcpyDeviceToHost));
  }
  // Copies data to host_memory from this memory.
  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
  void MemcpyTo(HostMemory<T> *host_memory) const {
    MemcpyTo(host_memory->get());
  }

  // Asynchronously sets every byte of this memory to 'val' on the provided
  // stream.
  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
  }

  // Allocates a vector on the host, copies size objects into it, and returns
  // it.
  std::vector<T> Copy(size_t s) const {
    CHECK_LE(s, size_);
    std::vector<T> result(s);
    MemcpyTo(result.data(), s);
    return result;
  }

  // Copies all the objects in this memory to a vector on the host and returns
  // it.
  std::vector<T> Copy() const { return Copy(size_); }

 private:
  T *memory_;
  const size_t size_;
};

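// Example usage (illustrative sketch): staging data through pinned host memory
// and device memory on one stream. `kNumPoints` and `points` are placeholders,
// not part of this header.
//
//   CudaStream stream;
//   HostMemory<float> host_points(kNumPoints);
//   GpuMemory<float> device_points(kNumPoints);
//
//   host_points.MemcpyFrom(points);                         // Host -> pinned.
//   device_points.MemcpyAsyncFrom(&host_points, &stream);   // Pinned -> device.
//   // ... launch kernels on stream.get() that use device_points.get() ...
//   device_points.MemcpyAsyncTo(&host_points, &stream);     // Device -> pinned.
//   CHECK_CUDA(cudaStreamSynchronize(stream.get()));
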
// Synchronizes and CHECKs that the last CUDA operation succeeded.
void CheckAndSynchronize(std::string_view message = "");

// Synchronizes and CHECKs only if --sync is passed on the command line. This
// lets debugging synchronization points be left in the code.
void MaybeCheckAndSynchronize();
void MaybeCheckAndSynchronize(std::string_view message);

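// Example usage (illustrative): optional synchronization points between kernel
// launches so --sync can localize which launch failed. `SomeKernel`, `blocks`,
// and `threads` are placeholders.
//
//   SomeKernel<<<blocks, threads, 0, stream.get()>>>(...);
//   MaybeCheckAndSynchronize("after SomeKernel");
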
}  // namespace frc971::apriltag

#endif  // FRC971_ORIN_CUDA_H_