Add first half of GPU-based AprilTag detector

This detects blob boundaries, filters them, and then orders the points
in a circle in preparation for line fitting.  It takes 2 ms for a 720p
image on the Orin NX 8GB.

Future commits will do the quad fitting and merge back with the original
algorithm.

Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/cuda.h b/frc971/orin/cuda.h
new file mode 100644
index 0000000..e386aa6
--- /dev/null
+++ b/frc971/orin/cuda.h
@@ -0,0 +1,207 @@
+#ifndef FRC971_ORIN_CUDA_H_
+#define FRC971_ORIN_CUDA_H_
+
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <span>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+// CHECKs that a cuda method returned success.  The empty-if/else form makes
+// the macro expand to a single complete statement, so it is safe inside
+// unbraced if/else chains, while still letting callers stream additional
+// context after the macro (the LOG(FATAL) is the else branch).
+#define CHECK_CUDA(condition)                                               \
+  if (auto c = (condition); c == cudaSuccess) {                             \
+  } else                                                                    \
+    LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \
+               << ") "
+
+namespace frc971 {
+namespace apriltag {
+
+// RAII wrapper around a CUDA stream.  Work enqueued on a single stream
+// executes in the order it was submitted, which provides relative ordering
+// between kernels and copies.
+class CudaStream {
+ public:
+  CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }
+  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }
+
+  // Streams own a unique CUDA resource, so copying is disallowed.
+  CudaStream(const CudaStream &) = delete;
+  CudaStream &operator=(const CudaStream &) = delete;
+
+  // Returns the underlying stream handle.
+  cudaStream_t get() { return stream_; }
+
+ private:
+  cudaStream_t stream_;
+};
+
+// RAII wrapper around a CUDA event.  Events are timestamped when the stream
+// they were recorded on reaches them, and are used for timing GPU work.
+class CudaEvent {
+ public:
+  CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }
+  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }
+
+  // Events own a unique CUDA resource, so copying is disallowed.
+  CudaEvent(const CudaEvent &) = delete;
+  CudaEvent &operator=(const CudaEvent &) = delete;
+
+  // Enqueues this event on the provided stream.  It is timestamped when the
+  // stream executes it.
+  void Record(CudaStream *stream) {
+    CHECK_CUDA(cudaEventRecord(event_, stream->get()));
+  }
+
+  // Returns the time elapsed between start and this event.  Both events must
+  // have been triggered already.
+  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
+    float elapsed_ms;
+    CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start.event_, event_));
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::duration<float, std::milli>(elapsed_ms));
+  }
+
+  // Blocks until the event has been triggered.
+  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }
+
+ private:
+  cudaEvent_t event_;
+};
+
+// Class to manage the lifetime of page locked (pinned) host memory.  Pinned
+// memory enables fast, genuinely asynchronous copies to and from the device.
+template <typename T>
+class HostMemory {
+ public:
+  // Allocates a block of memory for holding up to size objects of type T.
+  HostMemory(size_t size) {
+    T *memory;
+    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
+    span_ = std::span<T>(memory, size);
+  }
+  HostMemory(const HostMemory &) = delete;
+  HostMemory &operator=(const HostMemory &) = delete;
+
+  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
+
+  // Returns a pointer to the memory.
+  T *get() { return span_.data(); }
+  const T *get() const { return span_.data(); }
+
+  // Returns the number of objects the memory can hold.
+  size_t size() const { return span_.size(); }
+
+  // Copies size() objects from other (host memory) into this memory.
+  void MemcpyFrom(const T *other) {
+    memcpy(span_.data(), other, sizeof(T) * size());
+  }
+  // Copies size() objects from this memory into other (host memory).  The
+  // destination is written to, so it must be a non-const pointer (the
+  // previous const T * signature could not compile when instantiated, since
+  // memcpy's destination is void *).  Reading this's memory doesn't modify
+  // it, so the method itself is const.
+  void MemcpyTo(T *other) const {
+    memcpy(other, span_.data(), sizeof(T) * size());
+  }
+
+ private:
+  std::span<T> span_;
+};
+
+// Class to manage the lifetime of device memory.  The pointer returned by
+// get() is a device pointer and must not be dereferenced on the host.
+template <typename T>
+class GpuMemory {
+ public:
+  // Allocates a block of memory for holding up to size objects of type T in
+  // device memory.
+  GpuMemory(size_t size) : size_(size) {
+    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
+  }
+  GpuMemory(const GpuMemory &) = delete;
+  GpuMemory &operator=(const GpuMemory &) = delete;
+
+  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
+
+  // Returns the device pointer to the memory.
+  T *get() { return memory_; }
+  const T *get() const { return memory_; }
+
+  // Returns the number of objects this memory can hold.
+  size_t size() const { return size_; }
+
+  // Copies data from host memory to this memory asynchronously on the provided
+  // stream.  host_memory should be page locked (see HostMemory) for the copy
+  // to actually be asynchronous.
+  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
+    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
+                               cudaMemcpyHostToDevice, stream->get()));
+  }
+  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
+    MemcpyAsyncFrom(host_memory->get(), stream);
+  }
+
+  // Copies data to host memory from this memory asynchronously on the provided
+  // stream.  The data is only valid once the stream has been synchronized.
+  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
+    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
+                               reinterpret_cast<void *>(memory_),
+                               sizeof(T) * size_, cudaMemcpyDeviceToHost,
+                               stream->get()));
+  }
+  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
+    MemcpyAsyncTo(host_memory->get(), stream);
+  }
+
+  // Copies data from host_memory to this memory, blocking until the copy is
+  // complete.
+  void MemcpyFrom(const T *host_memory) {
+    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
+                          reinterpret_cast<const void *>(host_memory),
+                          sizeof(T) * size_, cudaMemcpyHostToDevice));
+  }
+  void MemcpyFrom(const HostMemory<T> *host_memory) {
+    MemcpyFrom(host_memory->get());
+  }
+
+  // Copies data to host_memory from this memory, blocking.  Only copies size
+  // objects (the caller must ensure host_memory holds at least that many).
+  void MemcpyTo(T *host_memory, size_t size) const {
+    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
+                          sizeof(T) * size, cudaMemcpyDeviceToHost));
+  }
+  // Copies all size_ objects to host_memory from this memory, blocking.
+  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
+  void MemcpyTo(HostMemory<T> *host_memory) const {
+    MemcpyTo(host_memory->get());
+  }
+
+  // Sets the memory asynchronously to contain data of type 'val' on the
+  // provided stream.  Note this is a byte-wise fill (cudaMemset semantics),
+  // not a per-object fill of T values.
+  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
+    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
+  }
+
+  // Allocates a vector on the host, copies s objects into it, and returns
+  // it.  Blocks until the copy is complete.
+  std::vector<T> Copy(size_t s) const {
+    CHECK_LE(s, size_);
+    std::vector<T> result(s);
+    MemcpyTo(result.data(), s);
+    return result;
+  }
+
+  // Copies all the objects in this memory to a vector on the host and returns
+  // it.  Blocks until the copy is complete.
+  std::vector<T> Copy() const { return Copy(size_); }
+
+ private:
+  T *memory_;       // Device pointer allocated with cudaMalloc.
+  const size_t size_;  // Capacity in objects of type T, fixed at construction.
+};
+
+// Synchronizes and CHECKs for success the last CUDA operation.
+void CheckAndSynchronize();
+
+// Synchronizes and CHECKS iff --sync is passed on the command line.  Makes it
+// so we can leave debugging in the code.
+void MaybeCheckAndSynchronize();
+
+}  // namespace apriltag
+}  // namespace frc971
+
+#endif  // FRC971_ORIN_CUDA_H_