Add first half of GPU based april tag detector
This detects blob boundaries, filters them, and then orders the points
in a circle in preparation for line fitting. It takes 2ms for a 720p
image on the Orin NX 8GB.
Future commits will do the quad fitting and merge back with the original
algorithm.
Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/cuda.h b/frc971/orin/cuda.h
new file mode 100644
index 0000000..e386aa6
--- /dev/null
+++ b/frc971/orin/cuda.h
@@ -0,0 +1,207 @@
+#ifndef FRC971_ORIN_CUDA_H_
+#define FRC971_ORIN_CUDA_H_
+
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <span>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+// CHECKs that a cuda method returned success.
+//
+// NOTE: this expands to a bare `if` with no `else`, so using it as the body of
+// an outer brace-less if/else will bind the `else` to this hidden `if`
+// (classic dangling-else).  Always use braces around CHECK_CUDA call sites
+// inside conditionals.  The trailing `<< ") "` intentionally leaves the
+// LOG(FATAL) stream open so callers may append extra context.
+// TODO(austin): This will not handle if and else statements quite right, fix if
+// we care.
+#define CHECK_CUDA(condition) \
+  if (auto c = condition) \
+    LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \
+               << ") "
+
+namespace frc971 {
+namespace apriltag {
+
+// RAII wrapper around a CUDA stream.  Work enqueued on one stream executes in
+// submission order, which is how we get relative ordering between kernels.
+class CudaStream {
+ public:
+  CudaStream() { CHECK_CUDA(cudaStreamCreate(&handle_)); }
+
+  // The stream owns a driver resource, so copying is forbidden.
+  CudaStream(const CudaStream &) = delete;
+  CudaStream &operator=(const CudaStream &) = delete;
+
+  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(handle_)); }
+
+  // Returns the underlying cudaStream_t handle for use in CUDA API calls.
+  cudaStream_t get() { return handle_; }
+
+ private:
+  cudaStream_t handle_;
+};
+
+// RAII wrapper around a CUDA event.  Events are markers recorded into a
+// stream; we use them to timestamp points in the stream's execution.
+class CudaEvent {
+ public:
+  CudaEvent() { CHECK_CUDA(cudaEventCreate(&handle_)); }
+
+  // The event owns a driver resource, so copying is forbidden.
+  CudaEvent(const CudaEvent &) = delete;
+  CudaEvent &operator=(const CudaEvent &) = delete;
+
+  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(handle_)); }
+
+  // Queues up an event to be timestamped on the stream when it is executed.
+  void Record(CudaStream *stream) {
+    CHECK_CUDA(cudaEventRecord(handle_, stream->get()));
+  }
+
+  // Returns the time elapsed between start and this event if it has been
+  // triggered.  cudaEventElapsedTime reports float milliseconds.
+  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
+    float milliseconds;
+    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start.handle_, handle_));
+    const std::chrono::duration<float, std::milli> elapsed(milliseconds);
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed);
+  }
+
+  // Waits until the event has been triggered.
+  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(handle_)); }
+
+ private:
+  cudaEvent_t handle_;
+};
+
+// Class to manage the lifetime of page locked host memory for fast copies back
+// to host memory.
+template <typename T>
+class HostMemory {
+ public:
+  // Allocates a block of page-locked memory for holding up to size objects of
+  // type T.  Page-locked memory is required for cudaMemcpyAsync to actually be
+  // asynchronous.
+  HostMemory(size_t size) {
+    T *memory;
+    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
+    span_ = std::span<T>(memory, size);
+  }
+  HostMemory(const HostMemory &) = delete;
+  HostMemory &operator=(const HostMemory &) = delete;
+
+  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
+
+  // Returns a pointer to the memory.
+  T *get() { return span_.data(); }
+  const T *get() const { return span_.data(); }
+
+  // Returns the number of objects the memory can hold.
+  size_t size() const { return span_.size(); }
+
+  // Copies data from other (host memory) to this's memory.
+  void MemcpyFrom(const T *other) {
+    memcpy(span_.data(), other, sizeof(T) * size());
+  }
+  // Copies data to other (host memory) from this's memory.  other is the
+  // destination of the copy, so it must be non-const.  (It was previously
+  // declared `const T *`, which does not compile when instantiated since
+  // memcpy's destination is `void *`.)  Reading only, so the method is const.
+  void MemcpyTo(T *other) const {
+    memcpy(other, span_.data(), sizeof(T) * size());
+  }
+
+ private:
+  std::span<T> span_;
+};
+
+// Class to manage the lifetime of device memory.
+template <typename T>
+class GpuMemory {
+ public:
+  // Allocates a block of memory for holding up to size objects of type T in
+  // device memory.
+  GpuMemory(size_t size) : size_(size) {
+    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
+  }
+  GpuMemory(const GpuMemory &) = delete;
+  GpuMemory &operator=(const GpuMemory &) = delete;
+
+  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
+
+  // Returns the device pointer to the memory.  Only dereference this on the
+  // device.
+  T *get() { return memory_; }
+  const T *get() const { return memory_; }
+
+  // Returns the number of objects this memory can hold.
+  size_t size() const { return size_; }
+
+  // Copies data from host memory to this memory asynchronously on the provided
+  // stream.  host_memory must remain valid until the copy completes, and
+  // should be page locked for the copy to be truly asynchronous.
+  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
+    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
+                               cudaMemcpyHostToDevice, stream->get()));
+  }
+  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
+    // The source must hold at least as many objects as we read.
+    CHECK_LE(size_, host_memory->size());
+    MemcpyAsyncFrom(host_memory->get(), stream);
+  }
+
+  // Copies data to host memory from this memory asynchronously on the provided
+  // stream.
+  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
+    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
+                               reinterpret_cast<void *>(memory_),
+                               sizeof(T) * size_, cudaMemcpyDeviceToHost,
+                               stream->get()));
+  }
+  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
+    // The destination must be able to hold everything we write.
+    CHECK_LE(size_, host_memory->size());
+    MemcpyAsyncTo(host_memory->get(), stream);
+  }
+
+  // Copies data from host_memory to this memory blocking.
+  void MemcpyFrom(const T *host_memory) {
+    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
+                          reinterpret_cast<const void *>(host_memory),
+                          sizeof(T) * size_, cudaMemcpyHostToDevice));
+  }
+  void MemcpyFrom(const HostMemory<T> *host_memory) {
+    // The source must hold at least as many objects as we read.
+    CHECK_LE(size_, host_memory->size());
+    MemcpyFrom(host_memory->get());
+  }
+
+  // Copies data to host_memory from this memory.  Only copies size objects.
+  void MemcpyTo(T *host_memory, size_t size) const {
+    // Never read past the end of the device allocation.
+    CHECK_LE(size, size_);
+    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
+                          sizeof(T) * size, cudaMemcpyDeviceToHost));
+  }
+  // Copies data to host_memory from this memory.
+  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
+  void MemcpyTo(HostMemory<T> *host_memory) const {
+    // The destination must be able to hold everything we write.
+    CHECK_LE(size_, host_memory->size());
+    MemcpyTo(host_memory->get());
+  }
+
+  // Sets the memory asynchronously to contain data of type 'val' on the
+  // provided stream.  Note: cudaMemsetAsync is byte-wise, so this fills every
+  // byte of every T with val rather than assigning whole T values.
+  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
+    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
+  }
+
+  // Allocates a vector on the host, copies s objects into it, and returns
+  // it.  Blocks until the copy completes.
+  std::vector<T> Copy(size_t s) const {
+    CHECK_LE(s, size_);
+    std::vector<T> result(s);
+    MemcpyTo(result.data(), s);
+    return result;
+  }
+
+  // Copies all the objects in this memory to a vector on the host and returns
+  // it.
+  std::vector<T> Copy() const { return Copy(size_); }
+
+ private:
+  T *memory_;
+  const size_t size_;
+};
+
+// Synchronizes and CHECKs for success the last CUDA operation.
+void CheckAndSynchronize();
+
+// Synchronizes and CHECKS iff --sync is passed on the command line. Makes it
+// so we can leave debugging in the code.
+void MaybeCheckAndSynchronize();
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_CUDA_H_