Add first half of GPU based april tag detector

This detects blob boundaries, filters them, and then orders the points
in a circle in preparation for line fitting.  It takes 2ms for a 720p
image on the Orin NX 8gb.

Future commits will do the quad fitting and merge back with the original
algorithm.

Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/points.h b/frc971/orin/points.h
new file mode 100644
index 0000000..312d90a
--- /dev/null
+++ b/frc971/orin/points.h
@@ -0,0 +1,295 @@
+#ifndef FRC971_ORIN_POINTS_H_
+#define FRC971_ORIN_POINTS_H_
+
+#include <stdint.h>
+
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cuda/std/tuple>
+#include <iomanip>
+#include <ostream>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the 2 adjacent blob IDs, a point in decimated image space, the
+// half pixel offset, and the gradient.
+//
+// rep0 and rep1 are the two blob ids, and are each allocated 20 bits.
+// point is the base point and is allocated 10 bits for x and 10 bits for y.
+// dx and dy are allocated 2 bits, and can only take on set values.
+// black_to_white captures the direction of the gradient in 1 bit.
+//
+// This adds up to 63 bits so we can load this with one big load.
+struct QuadBoundaryPoint {
+  static constexpr size_t kRepEndBit = 24;
+  static constexpr size_t kBitsInKey = 64;
+
+  __forceinline__ __host__ __device__ QuadBoundaryPoint() : key(0) {}
+
+  // Sets rep0, the 0th blob id.  This only respects the bottom 20 bits.
+  __forceinline__ __host__ __device__ void set_rep0(uint32_t rep0) {
+    key = (key & 0xfffff00000ffffffull) |
+          (static_cast<uint64_t>(rep0 & 0xfffff) << 24);
+  }
+  // Returns rep0.
+  __forceinline__ __host__ __device__ uint32_t rep0() const {
+    return ((key >> 24) & 0xfffff);
+  }
+
+  // Sets rep1, the 1st blob id.  This only respects the bottom 20 bits.
+  __forceinline__ __host__ __device__ void set_rep1(uint32_t rep1) {
+    key = (key & 0xfffffffffffull) |
+          (static_cast<uint64_t>(rep1 & 0xfffff) << 44);
+  }
+  // Returns rep1.
+  __forceinline__ __host__ __device__ uint32_t rep1() const {
+    return ((key >> 44) & 0xfffff);
+  }
+
+  // Returns both rep0 and rep1 concatenated into a single 40 bit number.
+  __forceinline__ __host__ __device__ uint64_t rep01() const {
+    return ((key >> 24) & 0xffffffffff);
+  }
+
+  // Returns all the bits used to hold position and gradient information.
+  __forceinline__ __host__ __device__ uint32_t point_bits() const {
+    return key & 0xffffff;
+  }
+
+  // Sets the 10 bit x and y.
+  __forceinline__ __host__ __device__ void set_base_xy(uint32_t x, uint32_t y) {
+    key = (key & 0xffffffffff00000full) |
+          (static_cast<uint64_t>(x & 0x3ff) << 14) |
+          (static_cast<uint64_t>(y & 0x3ff) << 4);
+  }
+
+  // Returns the base 10 bit x and y.
+  __forceinline__ __host__ __device__ uint32_t base_x() const {
+    return ((key >> 14) & 0x3ff);
+  }
+  __forceinline__ __host__ __device__ uint32_t base_y() const {
+    return ((key >> 4) & 0x3ff);
+  }
+
+  // Sets dxy, the integer representing which of the 4 search directions we
+  // went.
+  __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+    key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+  }
+
+  // Returns the change in x derived from the search direction.
+  __forceinline__ __host__ __device__ int32_t dx() const {
+    switch (key & 0x3) {
+      case 0:
+        return 1;
+      case 1:
+        return 1;
+      case 2:
+        return 0;
+      case 3:
+        return -1;
+    }
+    return 0;
+  }
+
+  // Returns the change in y derived from the search direction.
+  __forceinline__ __host__ __device__ int32_t dy() const {
+    switch (key & 0x3) {
+      case 0:
+        return 0;
+      case 1:
+      case 2:
+      case 3:
+        return 1;
+    }
+    return 0;
+  }
+
+  // Returns the un-decimated x and y positions.
+  __forceinline__ __host__ __device__ uint32_t x() const {
+    return static_cast<int32_t>(base_x() * 2) + dx();
+  }
+  __forceinline__ __host__ __device__ uint32_t y() const {
+    return static_cast<int32_t>(base_y() * 2) + dy();
+  }
+
+  // Returns the gradient that this point represents, taking into account which
+  // direction the color transitioned.
+  __forceinline__ __host__ __device__ int8_t gx() const {
+    return black_to_white() ? dx() : -dx();
+  }
+  __forceinline__ __host__ __device__ int8_t gy() const {
+    return black_to_white() ? dy() : -dy();
+  }
+
+  // Returns the black to white or white to black bit.
+  __forceinline__ __host__ __device__ void set_black_to_white(
+      bool black_to_white) {
+    key = (key & 0xfffffffffffffff7ull) |
+          (static_cast<uint64_t>(black_to_white) << 3);
+  }
+  __forceinline__ __host__ __device__ bool black_to_white() const {
+    return (key & 0x8) != 0;
+  }
+
+  // Various operators to make it easy to compare points.
+  __forceinline__ __host__ __device__ bool operator!=(
+      const QuadBoundaryPoint other) const {
+    return other.key != key;
+  }
+  __forceinline__ __host__ __device__ bool operator==(
+      const QuadBoundaryPoint other) const {
+    return other.key == key;
+  }
+  __forceinline__ __host__ __device__ bool operator<(
+      const QuadBoundaryPoint other) const {
+    return key < other.key;
+  }
+
+  // Returns true if this point has been set.  Zero is reserved for "invalid"
+  __forceinline__ __host__ __device__ bool nonzero() const {
+    return key != 0ull;
+  }
+
+  // Returns true if this point is about the other point.
+  bool near(QuadBoundaryPoint other) const { return other == *this; }
+
+  // The key.  This shouldn't be parsed directly.
+  uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const QuadBoundaryPoint &point);
+
+// Holds a compacted blob index, the angle to the X axis from the center of the
+// blob, and the coordinate of the point.
+//
+// The blob index is 12 bits, the angle is 28 bits, and the point is 24 bits.
+struct IndexPoint {
+  // Max number of blob IDs we can hold.
+  static constexpr size_t kMaxBlobs = 2048;
+
+  static constexpr size_t kRepEndBit = 24;
+  static constexpr size_t kBitsInKey = 64;
+
+  __forceinline__ __host__ __device__ IndexPoint() : key(0) {}
+
+  // Constructor to build a point with just the blob index, and point bits.  The
+  // point bits should be grabbed from a QuadBoundaryPoint rather than built up
+  // by hand.
+  __forceinline__ __host__ __device__ IndexPoint(uint32_t blob_index,
+                                                 uint32_t point_bits)
+      : key((static_cast<uint64_t>(blob_index & 0xfff) << 52) |
+            (static_cast<uint64_t>(point_bits & 0xffffff))) {}
+
+  // Sets and gets the 12 bit blob index.
+  __forceinline__ __host__ __device__ void set_blob_index(uint32_t blob_index) {
+    key = (key & 0x000fffffffffffffull) |
+          (static_cast<uint64_t>(blob_index & 0xfff) << 52);
+  }
+  __forceinline__ __host__ __device__ uint32_t blob_index() const {
+    return ((key >> 52) & 0xfff);
+  }
+
+  // Sets and gets the 28 bit angle.
+  __forceinline__ __host__ __device__ void set_theta(uint32_t theta) {
+    key = (key & 0xfff0000000ffffffull) |
+          (static_cast<uint64_t>(theta & 0xfffffff) << 24);
+  }
+  __forceinline__ __host__ __device__ uint32_t theta() const {
+    return ((key >> 24) & 0xfffffff);
+  }
+
+  // See QuadBoundaryPoint for a description of the rest of these.
+  __forceinline__ __host__ __device__ uint32_t base_x() const {
+    return ((key >> 14) & 0x3ff);
+  }
+  __forceinline__ __host__ __device__ uint32_t base_y() const {
+    return ((key >> 4) & 0x3ff);
+  }
+
+  __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+    key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+  }
+
+  __forceinline__ __host__ __device__ int32_t dx() const {
+    switch (key & 0x3) {
+      case 0:
+        return 1;
+      case 1:
+        return 1;
+      case 2:
+        return 0;
+      case 3:
+        return -1;
+    }
+    return 0;
+  }
+
+  __forceinline__ __host__ __device__ int32_t dy() const {
+    switch (key & 0x3) {
+      case 0:
+        return 0;
+      case 1:
+      case 2:
+      case 3:
+        return 1;
+    }
+    return 0;
+  }
+
+  __forceinline__ __host__ __device__ uint32_t x() const {
+    return static_cast<int32_t>(base_x() * 2) + dx();
+  }
+  __forceinline__ __host__ __device__ uint32_t y() const {
+    return static_cast<int32_t>(base_y() * 2) + dy();
+  }
+
+  __forceinline__ __host__ __device__ int8_t gx() const {
+    return black_to_white() ? dx() : -dx();
+  }
+  __forceinline__ __host__ __device__ int8_t gy() const {
+    return black_to_white() ? dy() : -dy();
+  }
+
+  __forceinline__ __host__ __device__ uint32_t point_bits() const {
+    return key & 0xffffff;
+  }
+
+  __forceinline__ __host__ __device__ void set_black_to_white(
+      bool black_to_white) {
+    key = (key & 0xfffffffffffffff7ull) |
+          (static_cast<uint64_t>(black_to_white) << 3);
+  }
+  __forceinline__ __host__ __device__ bool black_to_white() const {
+    return (key & 0x8) != 0;
+  }
+
+  // The key.  This shouldn't be parsed directly.
+  uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const IndexPoint &point);
+
+// Decomposer for sorting which just returns the key.
+struct QuadBoundaryPointDecomposer {
+  __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+      QuadBoundaryPoint &key) const {
+    return {key.key};
+  }
+};
+
+// Decomposer for sorting which just returns the key.
+struct QuadIndexPointDecomposer {
+  __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+      IndexPoint &key) const {
+    return {key.key};
+  }
+};
+
+}  // namespace apriltag
+}  // namespace frc971
+
+#endif  // FRC971_ORIN_POINTS_H_