Add first half of GPU based april tag detector
This detects blob boundaries, filters them, and then orders the points
in a circle in preparation for line fitting. It takes 2ms for a 720p
image on the Orin NX 8gb.
Future commits will do the quad fitting and merge back with the original
algorithm.
Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/points.h b/frc971/orin/points.h
new file mode 100644
index 0000000..312d90a
--- /dev/null
+++ b/frc971/orin/points.h
@@ -0,0 +1,295 @@
+#ifndef FRC971_ORIN_POINTS_H_
+#define FRC971_ORIN_POINTS_H_
+
+#include <stdint.h>
+
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cuda/std/tuple>
+#include <iomanip>
+#include <ostream>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the 2 adjacent blob IDs, a point in decimated image space, the
+// half pixel offset, and the gradient.
+//
+// rep0 and rep1 are the two blob ids, and are each allocated 20 bits.
+// point is the base point and is allocated 10 bits for x and 10 bits for y.
+// dx and dy are allocated 2 bits, and can only take on set values.
+// black_to_white captures the direction of the gradient in 1 bit.
+//
+// This adds up to 63 bits so we can load this with one big load.
+struct QuadBoundaryPoint {
+ static constexpr size_t kRepEndBit = 24;
+ static constexpr size_t kBitsInKey = 64;
+
+ __forceinline__ __host__ __device__ QuadBoundaryPoint() : key(0) {}
+
+ // Sets rep0, the 0th blob id. This only respects the bottom 20 bits.
+ __forceinline__ __host__ __device__ void set_rep0(uint32_t rep0) {
+ key = (key & 0xfffff00000ffffffull) |
+ (static_cast<uint64_t>(rep0 & 0xfffff) << 24);
+ }
+ // Returns rep0.
+ __forceinline__ __host__ __device__ uint32_t rep0() const {
+ return ((key >> 24) & 0xfffff);
+ }
+
+ // Sets rep1, the 1st blob id. This only respects the bottom 20 bits.
+ __forceinline__ __host__ __device__ void set_rep1(uint32_t rep1) {
+ key = (key & 0xfffffffffffull) |
+ (static_cast<uint64_t>(rep1 & 0xfffff) << 44);
+ }
+ // Returns rep1.
+ __forceinline__ __host__ __device__ uint32_t rep1() const {
+ return ((key >> 44) & 0xfffff);
+ }
+
+ // Returns both rep0 and rep1 concatenated into a single 40 bit number.
+ __forceinline__ __host__ __device__ uint64_t rep01() const {
+ return ((key >> 24) & 0xffffffffff);
+ }
+
+ // Returns all the bits used to hold position and gradient information.
+ __forceinline__ __host__ __device__ uint32_t point_bits() const {
+ return key & 0xffffff;
+ }
+
+ // Sets the 10 bit x and y.
+ __forceinline__ __host__ __device__ void set_base_xy(uint32_t x, uint32_t y) {
+ key = (key & 0xffffffffff00000full) |
+ (static_cast<uint64_t>(x & 0x3ff) << 14) |
+ (static_cast<uint64_t>(y & 0x3ff) << 4);
+ }
+
+ // Returns the base 10 bit x and y.
+ __forceinline__ __host__ __device__ uint32_t base_x() const {
+ return ((key >> 14) & 0x3ff);
+ }
+ __forceinline__ __host__ __device__ uint32_t base_y() const {
+ return ((key >> 4) & 0x3ff);
+ }
+
+ // Sets dxy, the integer representing which of the 4 search directions we
+ // went.
+ __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+ key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+ }
+
+ // Returns the change in x derived from the search direction.
+ __forceinline__ __host__ __device__ int32_t dx() const {
+ switch (key & 0x3) {
+ case 0:
+ return 1;
+ case 1:
+ return 1;
+ case 2:
+ return 0;
+ case 3:
+ return -1;
+ }
+ return 0;
+ }
+
+ // Returns the change in y derived from the search direction.
+ __forceinline__ __host__ __device__ int32_t dy() const {
+ switch (key & 0x3) {
+ case 0:
+ return 0;
+ case 1:
+ case 2:
+ case 3:
+ return 1;
+ }
+ return 0;
+ }
+
+ // Returns the un-decimated x and y positions.
+ __forceinline__ __host__ __device__ uint32_t x() const {
+ return static_cast<int32_t>(base_x() * 2) + dx();
+ }
+ __forceinline__ __host__ __device__ uint32_t y() const {
+ return static_cast<int32_t>(base_y() * 2) + dy();
+ }
+
+ // Returns the gradient that this point represents, taking into account which
+ // direction the color transitioned.
+ __forceinline__ __host__ __device__ int8_t gx() const {
+ return black_to_white() ? dx() : -dx();
+ }
+ __forceinline__ __host__ __device__ int8_t gy() const {
+ return black_to_white() ? dy() : -dy();
+ }
+
+ // Returns the black to white or white to black bit.
+ __forceinline__ __host__ __device__ void set_black_to_white(
+ bool black_to_white) {
+ key = (key & 0xfffffffffffffff7ull) |
+ (static_cast<uint64_t>(black_to_white) << 3);
+ }
+ __forceinline__ __host__ __device__ bool black_to_white() const {
+ return (key & 0x8) != 0;
+ }
+
+ // Various operators to make it easy to compare points.
+ __forceinline__ __host__ __device__ bool operator!=(
+ const QuadBoundaryPoint other) const {
+ return other.key != key;
+ }
+ __forceinline__ __host__ __device__ bool operator==(
+ const QuadBoundaryPoint other) const {
+ return other.key == key;
+ }
+ __forceinline__ __host__ __device__ bool operator<(
+ const QuadBoundaryPoint other) const {
+ return key < other.key;
+ }
+
+ // Returns true if this point has been set. Zero is reserved for "invalid"
+ __forceinline__ __host__ __device__ bool nonzero() const {
+ return key != 0ull;
+ }
+
+ // Returns true if this point is about the other point.
+ bool near(QuadBoundaryPoint other) const { return other == *this; }
+
+ // The key. This shouldn't be parsed directly.
+ uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const QuadBoundaryPoint &point);
+
+// Holds a compacted blob index, the angle to the X axis from the center of the
+// blob, and the coordinate of the point.
+//
+// The blob index is 12 bits, the angle is 28 bits, and the point is 24 bits.
+struct IndexPoint {
+ // Max number of blob IDs we can hold.
+ static constexpr size_t kMaxBlobs = 2048;
+
+ static constexpr size_t kRepEndBit = 24;
+ static constexpr size_t kBitsInKey = 64;
+
+ __forceinline__ __host__ __device__ IndexPoint() : key(0) {}
+
+ // Constructor to build a point with just the blob index, and point bits. The
+ // point bits should be grabbed from a QuadBoundaryPoint rather than built up
+ // by hand.
+ __forceinline__ __host__ __device__ IndexPoint(uint32_t blob_index,
+ uint32_t point_bits)
+ : key((static_cast<uint64_t>(blob_index & 0xfff) << 52) |
+ (static_cast<uint64_t>(point_bits & 0xffffff))) {}
+
+ // Sets and gets the 12 bit blob index.
+ __forceinline__ __host__ __device__ void set_blob_index(uint32_t blob_index) {
+ key = (key & 0x000fffffffffffffull) |
+ (static_cast<uint64_t>(blob_index & 0xfff) << 52);
+ }
+ __forceinline__ __host__ __device__ uint32_t blob_index() const {
+ return ((key >> 52) & 0xfff);
+ }
+
+ // Sets and gets the 28 bit angle.
+ __forceinline__ __host__ __device__ void set_theta(uint32_t theta) {
+ key = (key & 0xfff0000000ffffffull) |
+ (static_cast<uint64_t>(theta & 0xfffffff) << 24);
+ }
+ __forceinline__ __host__ __device__ uint32_t theta() const {
+ return ((key >> 24) & 0xfffffff);
+ }
+
+ // See QuadBoundaryPoint for a description of the rest of these.
+ __forceinline__ __host__ __device__ uint32_t base_x() const {
+ return ((key >> 14) & 0x3ff);
+ }
+ __forceinline__ __host__ __device__ uint32_t base_y() const {
+ return ((key >> 4) & 0x3ff);
+ }
+
+ __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+ key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+ }
+
+ __forceinline__ __host__ __device__ int32_t dx() const {
+ switch (key & 0x3) {
+ case 0:
+ return 1;
+ case 1:
+ return 1;
+ case 2:
+ return 0;
+ case 3:
+ return -1;
+ }
+ return 0;
+ }
+
+ __forceinline__ __host__ __device__ int32_t dy() const {
+ switch (key & 0x3) {
+ case 0:
+ return 0;
+ case 1:
+ case 2:
+ case 3:
+ return 1;
+ }
+ return 0;
+ }
+
+ __forceinline__ __host__ __device__ uint32_t x() const {
+ return static_cast<int32_t>(base_x() * 2) + dx();
+ }
+ __forceinline__ __host__ __device__ uint32_t y() const {
+ return static_cast<int32_t>(base_y() * 2) + dy();
+ }
+
+ __forceinline__ __host__ __device__ int8_t gx() const {
+ return black_to_white() ? dx() : -dx();
+ }
+ __forceinline__ __host__ __device__ int8_t gy() const {
+ return black_to_white() ? dy() : -dy();
+ }
+
+ __forceinline__ __host__ __device__ uint32_t point_bits() const {
+ return key & 0xffffff;
+ }
+
+ __forceinline__ __host__ __device__ void set_black_to_white(
+ bool black_to_white) {
+ key = (key & 0xfffffffffffffff7ull) |
+ (static_cast<uint64_t>(black_to_white) << 3);
+ }
+ __forceinline__ __host__ __device__ bool black_to_white() const {
+ return (key & 0x8) != 0;
+ }
+
+ // The key. This shouldn't be parsed directly.
+ uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const IndexPoint &point);
+
+// Decomposer for sorting which just returns the key.
+struct QuadBoundaryPointDecomposer {
+ __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+ QuadBoundaryPoint &key) const {
+ return {key.key};
+ }
+};
+
+// Decomposer for sorting which just returns the key.
+struct QuadIndexPointDecomposer {
+ __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+ IndexPoint &key) const {
+ return {key.key};
+ }
+};
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_POINTS_H_