Add first half of GPU based april tag detector
This detects blob boundary points, filters them, and then orders the
points by angle around each blob center in preparation for line fitting.
It takes 2 ms for a 720p image on the Orin NX 8GB.
Future commits will do the quad fitting and merge back with the original
algorithm.
Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/BUILD b/frc971/orin/BUILD
index 5675ab4..a81abf1 100644
--- a/frc971/orin/BUILD
+++ b/frc971/orin/BUILD
@@ -7,3 +7,61 @@
function = "ycbcr",
visibility = ["//visibility:public"],
)
+
+cc_library(
+ name = "cuda",
+ srcs = [
+ "apriltag.cc",
+ "cuda.cc",
+ "labeling_allegretti_2019_BKE.cc",
+ "points.cc",
+ "threshold.cc",
+ ],
+ hdrs = [
+ "apriltag.h",
+ "cuda.h",
+ "gpu_image.h",
+ "labeling_allegretti_2019_BKE.h",
+ "points.h",
+ "threshold.h",
+ ],
+ copts = [
+ "-Wno-pass-failed",
+ #"-DCUB_DETAIL_DEBUG_ENABLE_LOG=1",
+ #"-DDEBUG=1",
+ ],
+ features = ["cuda"],
+ deps = [
+ "//aos/time",
+ "//third_party/apriltag",
+ "@com_github_google_glog//:glog",
+ "@com_github_nvidia_cccl//:cccl",
+ "@com_github_nvidia_cuco//:cuco",
+ ],
+)
+
+cc_test(
+ name = "cuda_april_tag_test",
+ srcs = [
+ "cuda_april_tag_test.cc",
+ ],
+ data = [
+ "@apriltag_test_bfbs_images",
+ ],
+ features = ["cuda"],
+ deps = [
+ ":cuda",
+ ":ycbcr",
+ "//aos:flatbuffer_merge",
+ "//aos:json_to_flatbuffer",
+ "//aos/testing:googletest",
+ "//aos/testing:path",
+ "//aos/testing:random_seed",
+ "//frc971/vision:vision_fbs",
+ "//third_party:cudart",
+ "//third_party:opencv",
+ "//third_party/apriltag",
+ "//y2023/vision:ThresholdHalide",
+ "//y2023/vision:ToGreyscaleAndDecimateHalide",
+ ],
+)
diff --git a/frc971/orin/apriltag.cc b/frc971/orin/apriltag.cc
new file mode 100644
index 0000000..381ffae
--- /dev/null
+++ b/frc971/orin/apriltag.cc
@@ -0,0 +1,845 @@
+#include "frc971/orin/apriltag.h"
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cub/device/device_copy.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/device/device_scan.cuh>
+#include <cub/device/device_segmented_sort.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "aos/time/time.h"
+#include "frc971/orin/labeling_allegretti_2019_BKE.h"
+#include "frc971/orin/threshold.h"
+
+namespace frc971 {
+namespace apriltag {
+
+typedef std::chrono::duration<float, std::milli> float_milli;
+
+// Returns true if the QuadBoundaryPoint is nonzero.
+struct NonZero {
+ __host__ __device__ __forceinline__ bool operator()(
+ const QuadBoundaryPoint &a) const {
+ return a.nonzero();
+ }
+};
+
+// Always returns true (only used for scratch space calcs).
+template <typename T>
+struct True {
+ __host__ __device__ __forceinline__ bool operator()(const T &) const {
+ return true;
+ }
+};
+
+// Computes and returns the scratch space needed for DeviceRadixSort::SortKeys
+// of the provided key with the provided number of elements.
+template <typename T>
+static size_t RadixSortScratchSpace(size_t elements) {
+ size_t temp_storage_bytes = 0;
+ QuadBoundaryPointDecomposer decomposer;
+ cub::DeviceRadixSort::SortKeys(nullptr, temp_storage_bytes, (T *)(nullptr),
+ (T *)(nullptr), elements, decomposer);
+ return temp_storage_bytes;
+}
+
+// Computes and returns the scratch space needed for DeviceSelect::If of the
+// provided type Tin, to be written to the provided type Tout, for the provided
+// number of elements. num_markers is the device pointer used to hold the
+// selected number of elements.
+template <typename Tin, typename Tout>
+static size_t DeviceSelectIfScratchSpace(size_t elements, int *num_markers) {
+ size_t temp_storage_bytes = 0;
+ CHECK_CUDA(cub::DeviceSelect::If(nullptr, temp_storage_bytes,
+ (Tin *)(nullptr), (Tout *)(nullptr),
+ num_markers, elements, True<Tin>()));
+ return temp_storage_bytes;
+}
+
+// Always returns the first element (only used for scratch space calcs).
+template <typename T>
+struct CustomFirst {
+ __host__ __device__ __forceinline__ T operator()(const T &a,
+ const T & /*b*/) const {
+ return a;
+ }
+};
+
+// Computes and returns the scratch space needed for DeviceReduce::ReduceByKey
+// of the provided key K and value V with the provided number of elements.
+template <typename K, typename V>
+static size_t DeviceReduceByKeyScratchSpace(size_t elements) {
+ size_t temp_storage_bytes = 0;
+ CHECK_CUDA(cub::DeviceReduce::ReduceByKey(
+ nullptr, temp_storage_bytes, (K *)(nullptr), (K *)(nullptr),
+ (V *)(nullptr), (V *)(nullptr), (size_t *)(nullptr), CustomFirst<V>(),
+ elements));
+ return temp_storage_bytes;
+}
+
+// Computes and returns the scratch space needed for DeviceScan::InclusiveScan
+// of the provided value V with the provided number of elements.
+template <typename V>
+static size_t DeviceScanInclusiveScanScratchSpace(size_t elements) {
+ size_t temp_storage_bytes = 0;
+ CHECK_CUDA(cub::DeviceScan::InclusiveScan(nullptr, temp_storage_bytes,
+ (V *)(nullptr), (V *)(nullptr),
+ CustomFirst<V>(), elements));
+ return temp_storage_bytes;
+}
+
+GpuDetector::GpuDetector(size_t width, size_t height,
+ const apriltag_detector_t *tag_detector)
+ : width_(width),
+ height_(height),
+ tag_detector_(tag_detector),
+ color_image_host_(width * height * 2),
+ color_image_device_(width * height * 2),
+ gray_image_device_(width * height),
+ decimated_image_device_(width / 2 * height / 2),
+ unfiltered_minmax_image_device_((width / 2 / 4 * height / 2 / 4) * 2),
+ minmax_image_device_((width / 2 / 4 * height / 2 / 4) * 2),
+ thresholded_image_device_(width / 2 * height / 2),
+ union_markers_device_(width / 2 * height / 2),
+ union_markers_size_device_(width / 2 * height / 2),
+ union_marker_pair_device_((width / 2 - 2) * (height / 2 - 2) * 4),
+ compressed_union_marker_pair_device_(union_marker_pair_device_.size()),
+ sorted_union_marker_pair_device_(union_marker_pair_device_.size()),
+ extents_device_(union_marker_pair_device_.size()),
+ reduced_dot_blobs_pair_device_(kMaxBlobs),
+ selected_extents_device_(kMaxBlobs),
+ selected_blobs_device_(union_marker_pair_device_.size()),
+ sorted_selected_blobs_device_(selected_blobs_device_.size()),
+ radix_sort_tmpstorage_device_(RadixSortScratchSpace<QuadBoundaryPoint>(
+ sorted_union_marker_pair_device_.size())),
+ temp_storage_compressed_union_marker_pair_device_(
+ DeviceSelectIfScratchSpace<QuadBoundaryPoint, QuadBoundaryPoint>(
+ union_marker_pair_device_.size(),
+ num_compressed_union_marker_pair_device_.get())),
+ temp_storage_bounds_reduce_by_key_device_(
+ DeviceReduceByKeyScratchSpace<uint64_t, MinMaxExtents>(
+ union_marker_pair_device_.size())),
+ temp_storage_dot_product_device_(
+ DeviceReduceByKeyScratchSpace<uint64_t, float>(
+ union_marker_pair_device_.size())),
+ temp_storage_compressed_filtered_blobs_device_(
+ DeviceSelectIfScratchSpace<IndexPoint, IndexPoint>(
+ union_marker_pair_device_.size(),
+ num_selected_blobs_device_.get())),
+ temp_storage_selected_extents_scan_device_(
+ DeviceScanInclusiveScanScratchSpace<
+ cub::KeyValuePair<long, MinMaxExtents>>(kMaxBlobs)) {
+ CHECK_EQ(tag_detector_->quad_decimate, 2);
+ CHECK(!tag_detector_->qtp.deglitch);
+
+ for (int i = 0; i < zarray_size(tag_detector_->tag_families); i++) {
+ apriltag_family_t *family;
+ zarray_get(tag_detector_->tag_families, i, &family);
+ if (family->width_at_border < min_tag_width_) {
+ min_tag_width_ = family->width_at_border;
+ }
+ normal_border_ |= !family->reversed_border;
+ reversed_border_ |= family->reversed_border;
+ }
+ min_tag_width_ /= tag_detector_->quad_decimate;
+ if (min_tag_width_ < 3) {
+ min_tag_width_ = 3;
+ }
+}
+
+GpuDetector::~GpuDetector() {}
+
+// Computes a massive image with 4 QuadBoundaryPoints per pixel, one for each
+// neighboring pixel pair, which are nonzero where the pair crosses a blob
+// boundary.
+template <size_t kBlockWidth, size_t kBlockHeight>
+__global__ void BlobDiff(const uint8_t *thresholded_image,
+ const uint32_t *blobs,
+ const uint32_t *union_markers_size,
+ QuadBoundaryPoint *result, size_t width,
+ size_t height) {
+ __shared__ uint32_t temp_blob_storage[kBlockWidth * kBlockHeight];
+ __shared__ uint8_t temp_image_storage[kBlockWidth * kBlockHeight];
+
+ // We overlap both directions in X, and only one direction in Y.
+ const uint x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+ const uint y = blockIdx.y * (blockDim.y - 1) + threadIdx.y;
+
+ // Ignore anything outside the image.
+ if (x >= width || y >= height || y == 0) {
+ return;
+ }
+
+  // Compute the location in the image this thread is responsible for.
+ const uint global_input_index = x + y * width;
+ // And the destination in the temporary storage to save it.
+ const uint thread_linear_index = threadIdx.x + blockDim.x * threadIdx.y;
+
+ // Now, load all the data.
+ const uint32_t rep0 = temp_blob_storage[thread_linear_index] =
+ blobs[global_input_index];
+ const uint8_t v0 = temp_image_storage[thread_linear_index] =
+ thresholded_image[global_input_index];
+
+ // All threads in the block have now gotten this far so the shared memory is
+ // consistent.
+ __syncthreads();
+
+  // We've done our job loading things into memory. This pixel is on the
+  // boundary with the upper and left sides, or is covered by the next block.
+ if (threadIdx.x == 0) {
+ return;
+ }
+
+  // We need to search both ways in x, so this thread doesn't participate
+  // further :(
+ if (threadIdx.x == blockDim.x - 1) {
+ return;
+ }
+
+ if (threadIdx.y == blockDim.y - 1) {
+ return;
+ }
+
+  // This is the last pixel along the lower and right sides; we don't need to
+  // compute it.
+ if (x == width - 1 || y == height - 1) {
+ return;
+ }
+
+ // Place in memory to write the result.
+ const uint global_output_index = (x - 1) + (y - 1) * (width - 2);
+
+  // Short circuit pixels marked 127 and blobs which are too small by writing
+  // empty points out.
+ if (v0 == 127 || union_markers_size[rep0] < 25) {
+#pragma unroll
+ for (size_t point_offset = 0; point_offset < 4; ++point_offset) {
+ const size_t write_address =
+ (width - 2) * (height - 2) * point_offset + global_output_index;
+ result[write_address] = QuadBoundaryPoint();
+ }
+ return;
+ }
+
+ uint32_t rep1;
+ uint8_t v1;
+
+#define DO_CONN(dx, dy, point_offset) \
+ { \
+ QuadBoundaryPoint cluster_id; \
+ const uint x1 = dx + threadIdx.x; \
+ const uint y1 = dy + threadIdx.y; \
+ const uint thread_linear_index1 = x1 + blockDim.x * y1; \
+ v1 = temp_image_storage[thread_linear_index1]; \
+ rep1 = temp_blob_storage[thread_linear_index1]; \
+ if (v0 + v1 == 255) { \
+ if (union_markers_size[rep1] >= 25) { \
+ if (rep0 < rep1) { \
+ cluster_id.set_rep1(rep1); \
+ cluster_id.set_rep0(rep0); \
+ } else { \
+ cluster_id.set_rep1(rep0); \
+ cluster_id.set_rep0(rep1); \
+ } \
+ cluster_id.set_base_xy(x, y); \
+ cluster_id.set_dxy(point_offset); \
+ cluster_id.set_black_to_white(v1 > v0); \
+ } \
+ } \
+ const size_t write_address = \
+ (width - 2) * (height - 2) * point_offset + global_output_index; \
+ result[write_address] = cluster_id; \
+ }
+
+ // We search the following 4 neighbors.
+ // ________
+ // | x | 0 |
+ // -------------
+ // | 3 | 2 | 1 |
+ // -------------
+ //
+ // If connection 3 has the same IDs as the connection between blocks 0 and 2,
+ // we will have a duplicate entry. Detect and don't add it. This will only
+ // happen if id(x) == id(0) and id(2) == id(3),
+  // or id(x) == id(2) and id(0) == id(3).
+
+ DO_CONN(1, 0, 0);
+ DO_CONN(1, 1, 1);
+ DO_CONN(0, 1, 2);
+ const uint64_t rep_block_2 = rep1;
+ const uint8_t v1_block_2 = v1;
+
+ const uint left_thread_linear_index1 =
+ threadIdx.x - 1 + blockDim.x * threadIdx.y;
+ const uint8_t v1_block_left = temp_image_storage[left_thread_linear_index1];
+ const uint32_t rep_block_left = temp_blob_storage[left_thread_linear_index1];
+
+ // Do the dedup calculation now.
+ if (v1_block_left != 127 && v1_block_2 != 127 &&
+ v1_block_2 != v1_block_left) {
+ if (x != 1 && union_markers_size[rep_block_left] >= 25 &&
+ union_markers_size[rep_block_2] >= 25) {
+ const size_t write_address =
+ (width - 2) * (height - 2) * 3 + global_output_index;
+ result[write_address] = QuadBoundaryPoint();
+ return;
+ }
+ }
+
+ DO_CONN(-1, 1, 3);
+}
+
+// Masks out just the blob ID pair, rep01.
+struct MaskRep01 {
+ __host__ __device__ __forceinline__ uint64_t
+ operator()(const QuadBoundaryPoint &a) const {
+ return a.rep01();
+ }
+};
+
+// Rewrites a QuadBoundaryPoint to an IndexPoint, adding the angle from the
+// blob center to the point.
+class RewriteToIndexPoint {
+ public:
+ RewriteToIndexPoint(MinMaxExtents *extents_device, size_t num_extents)
+ : blob_finder_(extents_device, num_extents) {}
+
+ __host__ __device__ __forceinline__ IndexPoint
+ operator()(cub::KeyValuePair<long, QuadBoundaryPoint> pt) const {
+ size_t index = blob_finder_.FindBlobIndex(pt.key);
+ MinMaxExtents extents = blob_finder_.Get(index);
+ IndexPoint result(index, pt.value.point_bits());
+
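+    // Compute the angle from the blob center to this point, shifted to be
+    // non-negative and scaled before being stored in IndexPoint's theta field.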
+ float theta =
+ (atan2f(pt.value.y() - extents.cy(), pt.value.x() - extents.cx()) +
+ M_PI) *
+ 8e6;
+ long long int theta_int = llrintf(theta);
+
+ result.set_theta(std::max<long long int>(0, theta_int));
+
+ return result;
+ }
+
+ BlobExtentsIndexFinder blob_finder_;
+};
+
+// TODO(austin): Make something which rewrites points on the way back out to
+// memory and adds the slope.
+
+// Transforms a QuadBoundaryPoint into a single point extent for Reduce.
+struct TransformQuadBoundaryPointToMinMaxExtents {
+ __host__ __device__ __forceinline__ MinMaxExtents
+ operator()(cub::KeyValuePair<long, QuadBoundaryPoint> pt) const {
+ MinMaxExtents result;
+ result.min_y = result.max_y = pt.value.y();
+ result.min_x = result.max_x = pt.value.x();
+ result.starting_offset = pt.key;
+ result.count = 1;
+ return result;
+ }
+};
+
+// Reduces 2 extents by tracking the range and updating the offset and count
+// accordingly.
+struct QuadBoundaryPointExtents {
+ __host__ __device__ __forceinline__ MinMaxExtents
+ operator()(const MinMaxExtents &a, const MinMaxExtents &b) const {
+ MinMaxExtents result;
+ result.min_x = std::min(a.min_x, b.min_x);
+ result.max_x = std::max(a.max_x, b.max_x);
+ result.min_y = std::min(a.min_y, b.min_y);
+ result.max_y = std::max(a.max_y, b.max_y);
+ // And the actual start is the first of the points.
+ result.starting_offset = std::min(a.starting_offset, b.starting_offset);
+ // We want to count everything.
+ result.count = a.count + b.count;
+ return result;
+ }
+};
+
+// Computes the dot product of the vector from the blob center to each point
+// with the gradient at that point, for detecting inside-out blobs.
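+// The per-blob sum of these dot products is used by SelectBlobs below: a
+// negative sum marks a reversed border.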
+struct ComputeDotProductTransform {
+ ComputeDotProductTransform(MinMaxExtents *extents_device, size_t num_extents,
+ size_t tag_width, size_t min_cluster_pixels,
+ size_t max_cluster_pixels)
+ : index_finder_(extents_device, num_extents),
+ tag_width_(tag_width),
+ min_cluster_pixels_(std::max<size_t>(24u, min_cluster_pixels)),
+ max_cluster_pixels_(max_cluster_pixels) {}
+
+ __host__ __device__ __forceinline__ float operator()(
+ cub::KeyValuePair<long, QuadBoundaryPoint> a) const {
+ const size_t y = a.value.y();
+ const size_t x = a.value.x();
+
+ // Binary search for the index.
+ const size_t index = index_finder_.FindBlobIndex(a.key);
+
+ // Don't bother to compute the dot product for anything we will ignore in
+ // SelectBlobs below.
+ //
+ // TODO(austin): Figure out how to dedup with SelectBlobs below.
+ const MinMaxExtents extents = index_finder_.Get(index);
+ if (extents.count < min_cluster_pixels_) {
+ return 0;
+ }
+ if (extents.count > max_cluster_pixels_) {
+ return 0;
+ }
+
+ // Area must also be reasonable.
+ if ((extents.max_x - extents.min_x) * (extents.max_y - extents.min_y) <
+ tag_width_) {
+ return 0;
+ }
+
+ const float dx = static_cast<float>(x) - extents.cx();
+ const float dy = static_cast<float>(y) - extents.cy();
+
+ return dx * a.value.gx() + dy * a.value.gy();
+ }
+
+ BlobExtentsIndexFinder index_finder_;
+
+ size_t tag_width_;
+ size_t min_cluster_pixels_;
+ size_t max_cluster_pixels_;
+};
+
+// Selects blobs which are big enough, not too big, and have the right color in
+// the middle.
+class SelectBlobs {
+ public:
+ SelectBlobs(const MinMaxExtents *extents_device, const float *dot_products,
+ size_t tag_width, bool reversed_border, bool normal_border,
+ size_t min_cluster_pixels, size_t max_cluster_pixels)
+ : extents_device_(extents_device),
+ dot_products_(dot_products),
+ tag_width_(tag_width),
+ reversed_border_(reversed_border),
+ normal_border_(normal_border),
+ min_cluster_pixels_(std::max<size_t>(24u, min_cluster_pixels)),
+ max_cluster_pixels_(max_cluster_pixels) {}
+
+ // Returns true if the blob passes the size and dot product checks and is
+ // worth further consideration.
+ __host__ __device__ __forceinline__ bool operator()(
+ const size_t blob_index, MinMaxExtents extents) const {
+ if (extents.count < min_cluster_pixels_) {
+ return false;
+ }
+ if (extents.count > max_cluster_pixels_) {
+ return false;
+ }
+
+ // Area must also be reasonable.
+ if ((extents.max_x - extents.min_x) * (extents.max_y - extents.min_y) <
+ tag_width_) {
+ return false;
+ }
+
+    // And the border polarity (normal vs reversed) must be one we support.
+ const bool quad_reversed_border = dot_products_[blob_index] < 0;
+ if (!reversed_border_ && quad_reversed_border) {
+ return false;
+ }
+ if (!normal_border_ && !quad_reversed_border) {
+ return false;
+ }
+
+ return true;
+ }
+
+ __host__ __device__ __forceinline__ bool operator()(
+ const IndexPoint &a) const {
+ bool result = (*this)(a.blob_index(), extents_device_[a.blob_index()]);
+
+ return result;
+ }
+
+ const MinMaxExtents *extents_device_;
+ const float *dot_products_;
+ size_t tag_width_;
+
+ bool reversed_border_;
+ bool normal_border_;
+ size_t min_cluster_pixels_;
+ size_t max_cluster_pixels_;
+};
+
+// Class to zero out the count (and clear the starting offset) of each extents
+// entry which is going to be filtered out. Used in conjunction with SumPoints
+// below to collapse out the zero-sized blobs.
+struct TransformZeroFilteredBlobSizes {
+ public:
+ TransformZeroFilteredBlobSizes(const float *dot_products, size_t tag_width,
+ bool reversed_border, bool normal_border,
+ size_t min_cluster_pixels,
+ size_t max_cluster_pixels)
+ : select_blobs_(nullptr, dot_products, tag_width, reversed_border,
+ normal_border, min_cluster_pixels, max_cluster_pixels) {}
+
+ __host__ __device__ __forceinline__ cub::KeyValuePair<long, MinMaxExtents>
+ operator()(cub::KeyValuePair<long, MinMaxExtents> pt) const {
+ pt.value.count *= select_blobs_(pt.key, pt.value);
+ pt.value.starting_offset = 0;
+ return pt;
+ }
+
+ private:
+ SelectBlobs select_blobs_;
+};
+
+// Class to implement a custom Scan operator which passes through the previous
+// min/max/etc, but re-sums count into starting_offset. This lets us collapse
+// out regions which don't pass the minimum filters.
+struct SumPoints {
+ __host__ __device__ __forceinline__ cub::KeyValuePair<long, MinMaxExtents>
+ operator()(const cub::KeyValuePair<long, MinMaxExtents> &a,
+ const cub::KeyValuePair<long, MinMaxExtents> &b) const {
+ cub::KeyValuePair<long, MinMaxExtents> result;
+ if (a.key < b.key) {
+ result.value.min_x = b.value.min_x;
+ result.value.min_y = b.value.min_y;
+ result.value.max_x = b.value.max_x;
+ result.value.max_y = b.value.max_y;
+ result.value.count = b.value.count;
+ result.key = b.key;
+ } else {
+ result.value.min_x = a.value.min_x;
+ result.value.min_y = a.value.min_y;
+ result.value.max_x = a.value.max_x;
+ result.value.max_y = a.value.max_y;
+ result.value.count = a.value.count;
+ result.key = a.key;
+ }
+
+ result.value.starting_offset = b.value.starting_offset + b.value.count;
+ return result;
+ }
+};
+
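+// Processes a single YUYV image end to end: threshold, union-find, boundary
+// point extraction, compaction, sorting, extent and dot-product reduction,
+// filtering, and a final sort of the surviving points by angle.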
+void GpuDetector::Detect(const uint8_t *image) {
+ color_image_host_.MemcpyFrom(image);
+ start_.Record(&stream_);
+ color_image_device_.MemcpyAsyncFrom(&color_image_host_, &stream_);
+ after_image_memcpy_to_device_.Record(&stream_);
+
+ // Threshold the image.
+ CudaToGreyscaleAndDecimateHalide(
+ color_image_device_.get(), gray_image_device_.get(),
+ decimated_image_device_.get(), unfiltered_minmax_image_device_.get(),
+ minmax_image_device_.get(), thresholded_image_device_.get(), width_,
+ height_, tag_detector_->qtp.min_white_black_diff, &stream_);
+ after_threshold_.Record(&stream_);
+
+ union_markers_size_device_.MemsetAsync(0u, &stream_);
+ after_memset_.Record(&stream_);
+
+ // Unionfind the image.
+ LabelImage(ToGpuImage(thresholded_image_device_),
+ ToGpuImage(union_markers_device_),
+ ToGpuImage(union_markers_size_device_), stream_.get());
+
+ after_unionfinding_.Record(&stream_);
+
+ CHECK((width_ % 8) == 0);
+ CHECK((height_ % 8) == 0);
+
+ size_t decimated_width = width_ / 2;
+ size_t decimated_height = height_ / 2;
+
+ // TODO(austin): Tune for the global shutter camera.
+ // 1280 -> 2 * 128 * 5
+ // 720 -> 2 * 8 * 5 * 9
+
+ // Compute the unfiltered list of blob pairs and points.
+ {
+ constexpr size_t kBlockWidth = 32;
+ constexpr size_t kBlockHeight = 16;
+ dim3 threads(kBlockWidth, kBlockHeight, 1);
+ // Overlap 1 on each side in x, and 1 in y.
+ dim3 blocks((decimated_width + threads.x - 3) / (threads.x - 2),
+ (decimated_height + threads.y - 2) / (threads.y - 1), 1);
+
+ // Make sure we fit in our mask.
+ CHECK_LT(width_ * height_, static_cast<size_t>(1 << 22));
+
+ BlobDiff<kBlockWidth, kBlockHeight><<<blocks, threads, 0, stream_.get()>>>(
+ thresholded_image_device_.get(), union_markers_device_.get(),
+ union_markers_size_device_.get(), union_marker_pair_device_.get(),
+ decimated_width, decimated_height);
+ MaybeCheckAndSynchronize();
+ }
+
+ // TODO(austin): Can I do the first step of the zero removal in the BlobDiff
+ // kernel?
+
+ after_diff_.Record(&stream_);
+
+ {
+    // Remove the empty points which aren't to be considered before sorting,
+    // to speed things up.
+ size_t temp_storage_bytes =
+ temp_storage_compressed_union_marker_pair_device_.size();
+ NonZero nz;
+ CHECK_CUDA(cub::DeviceSelect::If(
+ temp_storage_compressed_union_marker_pair_device_.get(),
+ temp_storage_bytes, union_marker_pair_device_.get(),
+ compressed_union_marker_pair_device_.get(),
+ num_compressed_union_marker_pair_device_.get(),
+ union_marker_pair_device_.size(), nz, stream_.get()));
+
+ MaybeCheckAndSynchronize();
+ }
+
+ after_compact_.Record(&stream_);
+
+ int num_compressed_union_marker_pair_host;
+ {
+ num_compressed_union_marker_pair_device_.MemcpyTo(
+ &num_compressed_union_marker_pair_host);
+ CHECK_LT(static_cast<size_t>(num_compressed_union_marker_pair_host),
+ union_marker_pair_device_.size());
+
+ // Now, sort just the keys to group like points.
+ size_t temp_storage_bytes = radix_sort_tmpstorage_device_.size();
+ QuadBoundaryPointDecomposer decomposer;
+ CHECK_CUDA(cub::DeviceRadixSort::SortKeys(
+ radix_sort_tmpstorage_device_.get(), temp_storage_bytes,
+ compressed_union_marker_pair_device_.get(),
+ sorted_union_marker_pair_device_.get(),
+ num_compressed_union_marker_pair_host, decomposer,
+ QuadBoundaryPoint::kRepEndBit, QuadBoundaryPoint::kBitsInKey,
+ stream_.get()));
+
+ MaybeCheckAndSynchronize();
+ }
+
+ after_sort_.Record(&stream_);
+
+ size_t num_quads_host = 0;
+ {
+ // Our next step is to compute the extents of each blob so we can filter
+ // blobs.
+ cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
+ sorted_union_marker_pair_device_.get());
+ TransformQuadBoundaryPointToMinMaxExtents min_max;
+ cub::TransformInputIterator<MinMaxExtents,
+ TransformQuadBoundaryPointToMinMaxExtents,
+ cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
+ value_input_iterator(value_index_input_iterator, min_max);
+
+ // Don't care about the output keys...
+ cub::DiscardOutputIterator<uint64_t> key_discard_iterator;
+
+ // Provide a mask to detect keys by rep01()
+ MaskRep01 mask;
+ cub::TransformInputIterator<uint64_t, MaskRep01, QuadBoundaryPoint *>
+ key_input_iterator(sorted_union_marker_pair_device_.get(), mask);
+
+ // Reduction operator.
+ QuadBoundaryPointExtents reduce;
+
+ size_t temp_storage_bytes =
+ temp_storage_bounds_reduce_by_key_device_.size();
+ cub::DeviceReduce::ReduceByKey(
+ temp_storage_bounds_reduce_by_key_device_.get(), temp_storage_bytes,
+ key_input_iterator, key_discard_iterator, value_input_iterator,
+ extents_device_.get(), num_quads_device_.get(), reduce,
+ num_compressed_union_marker_pair_host, stream_.get());
+ after_bounds_.Record(&stream_);
+
+ num_quads_device_.MemcpyTo(&num_quads_host);
+ }
+ CHECK_LE(num_quads_host, reduced_dot_blobs_pair_device_.size());
+
+ // Longest april tag will be the full perimeter of the image. Each point
+ // results in 2 neighbor points, 1 straight, and one at 45 degrees. But, we
+ // are in decimated space here, and width_ and height_ are in full image
+ // space. And there are 2 sides to each rectangle...
+ //
+ // Aprilrobotics has a *3 instead of a *2 here since they have duplicated
+ // points in their list at this stage.
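+  //
+  // e.g. (sizes assumed) for a 1280x720 input this allows up to
+  // 2 * (1280 + 720) = 4000 boundary points per blob.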
+ const size_t max_april_tag_perimeter = 2 * (width_ + height_);
+
+ {
+ // Now that we have the extents, compute the dot product used to detect
+ // which blobs are inside out or right side out.
+
+ // Don't care about these quantities since the previous ReduceByKey will
+    // have computed them too.
+ cub::DiscardOutputIterator<size_t> length_discard_iterator;
+ cub::DiscardOutputIterator<uint64_t> key_discard_iterator;
+
+ size_t dot_reduce_temp_storage_bytes =
+ temp_storage_dot_product_device_.size();
+
+ // Provide a mask to detect keys by rep01()
+ MaskRep01 mask;
+ cub::TransformInputIterator<uint64_t, MaskRep01, QuadBoundaryPoint *>
+ key_iterator(sorted_union_marker_pair_device_.get(), mask);
+
+ // Create a transform iterator which calculates the dot product for each
+ // point individually.
+ cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
+ sorted_union_marker_pair_device_.get());
+ ComputeDotProductTransform dot_operator(
+ extents_device_.get(), num_quads_host, min_tag_width_,
+ tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
+ cub::TransformInputIterator<float, ComputeDotProductTransform,
+ cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
+ value_iterator(value_index_input_iterator, dot_operator);
+
+ // And now sum per key.
+ CHECK_CUDA(cub::DeviceReduce::ReduceByKey(
+ temp_storage_dot_product_device_.get(), dot_reduce_temp_storage_bytes,
+ key_iterator, key_discard_iterator, value_iterator,
+ reduced_dot_blobs_pair_device_.get(), length_discard_iterator,
+ cub::Sum(), num_compressed_union_marker_pair_host, stream_.get()));
+
+ MaybeCheckAndSynchronize();
+ }
+
+ after_dot_.Record(&stream_);
+
+ {
+ // Now that we have the dot products, we need to rewrite the extents for the
+ // post-thresholded world so we can find the start and end address of blobs
+ // for fitting lines.
+ //
+ // Clear the size of non-passing extents and the starting offset of all
+ // extents.
+ cub::ArgIndexInputIterator<MinMaxExtents *> value_index_input_iterator(
+ extents_device_.get());
+ TransformZeroFilteredBlobSizes rewrite(
+ reduced_dot_blobs_pair_device_.get(), min_tag_width_, reversed_border_,
+ normal_border_, tag_detector_->qtp.min_cluster_pixels,
+ max_april_tag_perimeter);
+ cub::TransformInputIterator<cub::KeyValuePair<long, MinMaxExtents>,
+ TransformZeroFilteredBlobSizes,
+ cub::ArgIndexInputIterator<MinMaxExtents *>>
+ input_iterator(value_index_input_iterator, rewrite);
+
+ // Sum the counts of everything before us, and update the offset.
+ SumPoints sum_points;
+
+ // TODO(justin): Rip the key off when writing.
+
+ // Rewrite the extents to have the starting offset and count match the
+ // post-selected values.
+ size_t temp_storage_bytes =
+ temp_storage_selected_extents_scan_device_.size();
+ CHECK_CUDA(cub::DeviceScan::InclusiveScan(
+ temp_storage_selected_extents_scan_device_.get(), temp_storage_bytes,
+ input_iterator, selected_extents_device_.get(), sum_points,
+ num_quads_host));
+
+ MaybeCheckAndSynchronize();
+ }
+
+ after_transform_extents_.Record(&stream_);
+
+ int num_selected_blobs_host;
+ {
+ // Now, copy over all points which pass our thresholds.
+ cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
+ sorted_union_marker_pair_device_.get());
+ RewriteToIndexPoint rewrite(extents_device_.get(), num_quads_host);
+ cub::TransformInputIterator<IndexPoint, RewriteToIndexPoint,
+ cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
+ input_iterator(value_index_input_iterator, rewrite);
+
+ SelectBlobs select_blobs(
+ extents_device_.get(), reduced_dot_blobs_pair_device_.get(),
+ min_tag_width_, reversed_border_, normal_border_,
+ tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
+
+ size_t temp_storage_bytes =
+ temp_storage_compressed_filtered_blobs_device_.size();
+ CHECK_CUDA(cub::DeviceSelect::If(
+ temp_storage_compressed_filtered_blobs_device_.get(),
+ temp_storage_bytes, input_iterator, selected_blobs_device_.get(),
+ num_selected_blobs_device_.get(), num_compressed_union_marker_pair_host,
+ select_blobs, stream_.get()));
+
+ MaybeCheckAndSynchronize();
+
+ num_selected_blobs_device_.MemcpyTo(&num_selected_blobs_host);
+ }
+
+ after_filter_.Record(&stream_);
+
+ {
+ // Sort based on the angle.
+ size_t temp_storage_bytes = radix_sort_tmpstorage_device_.size();
+ QuadIndexPointDecomposer decomposer;
+
+ CHECK_CUDA(cub::DeviceRadixSort::SortKeys(
+ radix_sort_tmpstorage_device_.get(), temp_storage_bytes,
+ selected_blobs_device_.get(), sorted_selected_blobs_device_.get(),
+ num_selected_blobs_host, decomposer, IndexPoint::kRepEndBit,
+ IndexPoint::kBitsInKey, stream_.get()));
+
+ MaybeCheckAndSynchronize();
+ }
+
+ after_filtered_sort_.Record(&stream_);
+
+ // Report out how long things took.
+ LOG(INFO) << "Found " << num_compressed_union_marker_pair_host << " items";
+ LOG(INFO) << "Selected " << num_selected_blobs_host << " blobs";
+
+ CudaEvent *previous_event = &start_;
+ for (auto name_event : std::vector<std::tuple<std::string_view, CudaEvent &>>{
+ {"Memcpy", after_image_memcpy_to_device_},
+ {"Threshold", after_threshold_},
+ {"Memset", after_memset_},
+ {"Unionfinding", after_unionfinding_},
+ {"Diff", after_diff_},
+ {"Compact", after_compact_},
+ {"Sort", after_sort_},
+ {"Bounds", after_bounds_},
+ {"Dot product", after_dot_},
+ {"Transform Extents", after_transform_extents_},
+ {"Filter by dot product", after_filter_},
+ {"Filtered sort", after_filtered_sort_},
+ }) {
+ std::get<1>(name_event).Synchronize();
+ LOG(INFO) << std::get<0>(name_event) << " "
+ << float_milli(
+ std::get<1>(name_event).ElapsedTime(*previous_event))
+ .count()
+ << "ms";
+ previous_event = &std::get<1>(name_event);
+ }
+
+ LOG(INFO) << "Overall "
+ << float_milli(previous_event->ElapsedTime(start_)).count() << "ms";
+
+ // Average. Skip the first one as the kernel is warming up and is slower.
+ if (!first_) {
+ ++execution_count_;
+ execution_duration_ += previous_event->ElapsedTime(start_);
+ LOG(INFO) << "Average overall "
+ << float_milli(execution_duration_ / execution_count_).count()
+ << "ms";
+ }
+
+ LOG(INFO) << "Found compressed runs: " << num_quads_host;
+
+ first_ = false;
+}
+
+} // namespace apriltag
+} // namespace frc971
diff --git a/frc971/orin/apriltag.h b/frc971/orin/apriltag.h
new file mode 100644
index 0000000..068c0c7
--- /dev/null
+++ b/frc971/orin/apriltag.h
@@ -0,0 +1,280 @@
+#ifndef FRC971_ORIN_APRILTAG_H_
+#define FRC971_ORIN_APRILTAG_H_
+
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include "third_party/apriltag/apriltag.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "frc971/orin/cuda.h"
+#include "frc971/orin/gpu_image.h"
+#include "frc971/orin/points.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the extents of a blob of points.
+struct MinMaxExtents {
+ // Min and max coordinates (in non-decimated coordinates)
+ uint16_t min_x;
+ uint16_t min_y;
+ uint16_t max_x;
+ uint16_t max_y;
+
+ // The starting offset of this blob of points in the vector holding all the
+ // points.
+ uint32_t starting_offset;
+
+ // The number of points in the blob.
+ uint32_t count;
+
+ // Center location of the blob using the aprilrobotics algorithm.
+ __host__ __device__ float cx() const {
+ return (min_x + max_x) * 0.5f + 0.05118;
+ }
+ __host__ __device__ float cy() const {
+ return (min_y + max_y) * 0.5f + -0.028581;
+ }
+};
+
+static_assert(sizeof(MinMaxExtents) == 16, "MinMaxExtents didn't pack right.");
+
+// Class to find the blob index of a point in a point vector.
+class BlobExtentsIndexFinder {
+ public:
+ BlobExtentsIndexFinder(const MinMaxExtents *extents_device,
+ size_t num_extents)
+ : extents_device_(extents_device), num_extents_(num_extents) {}
+
+ __host__ __device__ size_t FindBlobIndex(size_t point_index) const {
+    // Do a binary search for the blob which has the point in its
+ // starting_offset range.
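+    //
+    // e.g. (values assumed): with three blobs whose starting_offsets are
+    // {0, 10, 30}, point_index 12 falls in the second blob's range and this
+    // returns 1.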
+ size_t min = 0;
+ size_t max = num_extents_;
+ while (true) {
+ if (min + 1 == max) {
+ return min;
+ }
+
+ size_t average = min + (max - min) / 2;
+
+ if (average < num_extents_ && extents_device_[average].starting_offset <=
+ static_cast<size_t>(point_index)) {
+ min = average;
+ } else {
+ max = average;
+ }
+ }
+ }
+
+ // Returns the extents for a blob index.
+ __host__ __device__ MinMaxExtents Get(size_t index) const {
+ return extents_device_[index];
+ }
+
+ private:
+ const MinMaxExtents *extents_device_;
+ size_t num_extents_;
+
+ // TODO(austin): Cache the last one?
+};
+
+// GPU based april tag detector.
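+//
+// A minimal usage sketch (image size, tag family, and buffer layout are
+// assumed here; the constructor CHECKs that quad_decimate == 2):
+//   apriltag_detector_t *tag_detector = apriltag_detector_create();
+//   apriltag_detector_add_family_bits(tag_detector, tag16h5_create(), 1);
+//   tag_detector->quad_decimate = 2;
+//   GpuDetector detector(1280, 720, tag_detector);
+//   detector.Detect(yuyv_image);  // width * height * 2 bytes of YUYV data.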
+class GpuDetector {
+ public:
+ // The number of blobs we will consider when counting april tags.
+ static constexpr size_t kMaxBlobs = IndexPoint::kMaxBlobs;
+
+  // Constructs a detector, reserving space for detecting tags in images of
+  // the provided width and height, using the provided detector options.
+ GpuDetector(size_t width, size_t height,
+ const apriltag_detector_t *tag_detector);
+
+ virtual ~GpuDetector();
+
+ // Detects april tags in the provided image.
+ void Detect(const uint8_t *image);
+
+ // Debug methods to expose internal state for testing.
+ void CopyGrayTo(uint8_t *output) { gray_image_device_.MemcpyTo(output); }
+ void CopyDecimatedTo(uint8_t *output) {
+ decimated_image_device_.MemcpyTo(output);
+ }
+ void CopyThresholdedTo(uint8_t *output) {
+ thresholded_image_device_.MemcpyTo(output);
+ }
+ void CopyUnionMarkersTo(uint32_t *output) {
+ union_markers_device_.MemcpyTo(output);
+ }
+
+ void CopyUnionMarkerPairTo(QuadBoundaryPoint *output) {
+ union_marker_pair_device_.MemcpyTo(output);
+ }
+
+ void CopyCompressedUnionMarkerPairTo(QuadBoundaryPoint *output) {
+ compressed_union_marker_pair_device_.MemcpyTo(output);
+ }
+
+ std::vector<QuadBoundaryPoint> CopySortedUnionMarkerPair() {
+ std::vector<QuadBoundaryPoint> result;
+ int size = NumCompressedUnionMarkerPairs();
+ result.resize(size);
+ sorted_union_marker_pair_device_.MemcpyTo(result.data(), size);
+ return result;
+ }
+
+ int NumCompressedUnionMarkerPairs() {
+ return num_compressed_union_marker_pair_device_.Copy()[0];
+ }
+
+ void CopyUnionMarkersSizeTo(uint32_t *output) {
+ union_markers_size_device_.MemcpyTo(output);
+ }
+
+ int NumQuads() const { return num_quads_device_.Copy()[0]; }
+
+ std::vector<MinMaxExtents> CopyExtents() {
+ return extents_device_.Copy(NumQuads());
+ }
+
+ std::vector<float> CopyReducedDotBlobs() {
+ return reduced_dot_blobs_pair_device_.Copy(NumQuads());
+ }
+
+ std::vector<cub::KeyValuePair<long, MinMaxExtents>> CopySelectedExtents() {
+ return selected_extents_device_.Copy(NumQuads());
+ }
+
+ int NumSelectedPairs() const { return num_selected_blobs_device_.Copy()[0]; }
+
+ std::vector<IndexPoint> CopySelectedBlobs() {
+ return selected_blobs_device_.Copy(NumSelectedPairs());
+ }
+
+ std::vector<IndexPoint> CopySortedSelectedBlobs() {
+ return sorted_selected_blobs_device_.Copy(NumSelectedPairs());
+ }
+
+ private:
+ // Creates a GPU image wrapped around the provided memory.
+ template <typename T>
+ GpuImage<T> ToGpuImage(GpuMemory<T> &memory) {
+ if (memory.size() == width_ * height_) {
+ return GpuImage<T>{
+ .data = memory.get(),
+ .rows = height_,
+ .cols = width_,
+ .step = width_,
+ };
+ } else if (memory.size() == width_ * height_ / 4) {
+ return GpuImage<T>{
+ .data = memory.get(),
+ .rows = height_ / 2,
+ .cols = width_ / 2,
+ .step = width_ / 2,
+ };
+ } else {
+ LOG(FATAL) << "Unknown image shape";
+ }
+ }
+
+ // Size of the image.
+ const size_t width_;
+ const size_t height_;
+
+ // Detector parameters.
+ const apriltag_detector_t *tag_detector_;
+
+ // Stream to operate on.
+ CudaStream stream_;
+
+ // Events for each of the steps for timing.
+ CudaEvent start_;
+ CudaEvent after_image_memcpy_to_device_;
+ CudaEvent after_threshold_;
+ CudaEvent after_memset_;
+ CudaEvent after_unionfinding_;
+ CudaEvent after_diff_;
+ CudaEvent after_compact_;
+ CudaEvent after_sort_;
+ CudaEvent after_bounds_;
+ CudaEvent after_dot_;
+ CudaEvent after_transform_extents_;
+ CudaEvent after_filter_;
+ CudaEvent after_filtered_sort_;
+
+ // TODO(austin): Remove this...
+ HostMemory<uint8_t> color_image_host_;
+
+ // Starting color image.
+ GpuMemory<uint8_t> color_image_device_;
+ // Full size gray scale image.
+ GpuMemory<uint8_t> gray_image_device_;
+ // Half resolution, gray, decimated image.
+ GpuMemory<uint8_t> decimated_image_device_;
+ // Intermediates for thresholding.
+ GpuMemory<uint8_t> unfiltered_minmax_image_device_;
+ GpuMemory<uint8_t> minmax_image_device_;
+ GpuMemory<uint8_t> thresholded_image_device_;
+
+ // The union markers for each pixel.
+ GpuMemory<uint32_t> union_markers_device_;
+ // The size of each blob. The blob size is stored at the index of the stored
+  // union marker id in union_markers_device_ above.
+ GpuMemory<uint32_t> union_markers_size_device_;
+
+  // Full list of boundary points, densely stored but mostly zero.
+ GpuMemory<QuadBoundaryPoint> union_marker_pair_device_;
+ // Unsorted list of points with 0's removed.
+ GpuMemory<QuadBoundaryPoint> compressed_union_marker_pair_device_;
+  // List of points sorted by blob ID (representative) pair.
+ GpuMemory<QuadBoundaryPoint> sorted_union_marker_pair_device_;
+ // Number of compressed points.
+ GpuMemory<int> num_compressed_union_marker_pair_device_{
+ /* allocate 1 integer...*/ 1};
+
+ // Number of unique blob IDs.
+ GpuMemory<size_t> num_quads_device_{/* allocate 1 integer...*/ 1};
+ // Bounds per blob, one blob per ID.
+ GpuMemory<MinMaxExtents> extents_device_;
+ // Sum of the dot products between the vector from center to each point, with
+ // the gradient at each point.
+ GpuMemory<float> reduced_dot_blobs_pair_device_;
+ // Extents of all the blobs under consideration.
+ GpuMemory<cub::KeyValuePair<long, MinMaxExtents>> selected_extents_device_;
+
+ // Number of keys in selected_blobs_device_.
+ GpuMemory<int> num_selected_blobs_device_{/* allocate 1 integer...*/ 1};
+
+ // Compacted blobs which pass our threshold.
+ GpuMemory<IndexPoint> selected_blobs_device_;
+ // Sorted list of those points.
+ GpuMemory<IndexPoint> sorted_selected_blobs_device_;
+
+ // Temporary storage for each of the steps.
+ // TODO(austin): Can we combine these and just use the max?
+ GpuMemory<uint32_t> radix_sort_tmpstorage_device_;
+ GpuMemory<uint8_t> temp_storage_compressed_union_marker_pair_device_;
+ GpuMemory<uint8_t> temp_storage_bounds_reduce_by_key_device_;
+ GpuMemory<uint8_t> temp_storage_dot_product_device_;
+ GpuMemory<uint8_t> temp_storage_compressed_filtered_blobs_device_;
+ GpuMemory<uint8_t> temp_storage_selected_extents_scan_device_;
+
+ // Cumulative duration of april tag detection.
+ std::chrono::nanoseconds execution_duration_{0};
+ // Number of detections.
+ size_t execution_count_ = 0;
+ // True if this is the first detection.
+ bool first_ = true;
+
+ // Cached quantities used for tag filtering.
+ bool normal_border_ = false;
+ bool reversed_border_ = false;
+ int min_tag_width_ = 1000000;
+};
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_APRILTAG_H_
diff --git a/frc971/orin/cuda.cc b/frc971/orin/cuda.cc
new file mode 100644
index 0000000..1173f73
--- /dev/null
+++ b/frc971/orin/cuda.cc
@@ -0,0 +1,23 @@
+#include "frc971/orin/cuda.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+DEFINE_bool(
+ sync, false,
+ "If true, force synchronization after each step to isolate errors better.");
+
+namespace frc971 {
+namespace apriltag {
+
+void CheckAndSynchronize() {
+ CHECK_CUDA(cudaDeviceSynchronize());
+ CHECK_CUDA(cudaGetLastError());
+}
+
+void MaybeCheckAndSynchronize() {
+ if (FLAGS_sync) CheckAndSynchronize();
+}
+
+} // namespace apriltag
+} // namespace frc971
diff --git a/frc971/orin/cuda.h b/frc971/orin/cuda.h
new file mode 100644
index 0000000..e386aa6
--- /dev/null
+++ b/frc971/orin/cuda.h
@@ -0,0 +1,207 @@
+#ifndef FRC971_ORIN_CUDA_H_
+#define FRC971_ORIN_CUDA_H_
+
+#include <chrono>
+#include <span>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+// CHECKs that a cuda method returned success.
+// TODO(austin): This will not handle if and else statements quite right, fix if
+// we care.
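+// (e.g. `if (x) CHECK_CUDA(cudaFoo()); else Bar();` would bind the else to the
+// macro's internal if; cudaFoo and Bar are stand-ins.)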
+#define CHECK_CUDA(condition) \
+ if (auto c = condition) \
+ LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \
+ << ") "
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to manage the lifetime of a Cuda stream. This is used to provide
+// relative ordering between kernels on the same stream.
+class CudaStream {
+ public:
+ CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }
+
+ CudaStream(const CudaStream &) = delete;
+ CudaStream &operator=(const CudaStream &) = delete;
+
+ virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }
+
+ // Returns the stream.
+ cudaStream_t get() { return stream_; }
+
+ private:
+ cudaStream_t stream_;
+};
+
+// Class to manage the lifetime of a Cuda Event. Cuda events are used for
+// timing events on a stream.
+class CudaEvent {
+ public:
+ CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }
+
+ CudaEvent(const CudaEvent &) = delete;
+ CudaEvent &operator=(const CudaEvent &) = delete;
+
+ virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }
+
+ // Queues up an event to be timestamped on the stream when it is executed.
+ void Record(CudaStream *stream) {
+ CHECK_CUDA(cudaEventRecord(event_, stream->get()));
+ }
+
+ // Returns the time elapsed between start and this event if it has been
+ // triggered.
+ std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
+ float ms;
+ CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
+ return std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::duration<float, std::milli>(ms));
+ }
+
+ // Waits until the event has been triggered.
+ void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }
+
+ private:
+ cudaEvent_t event_;
+};
+
+// Class to manage the lifetime of page-locked host memory for fast copies
+// between the host and the device.
+template <typename T>
+class HostMemory {
+ public:
+ // Allocates a block of memory for holding up to size objects of type T.
+ HostMemory(size_t size) {
+ T *memory;
+ CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
+ span_ = std::span<T>(memory, size);
+ }
+ HostMemory(const HostMemory &) = delete;
+ HostMemory &operator=(const HostMemory &) = delete;
+
+ virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
+
+ // Returns a pointer to the memory.
+ T *get() { return span_.data(); }
+ const T *get() const { return span_.data(); }
+
+ // Returns the number of objects the memory can hold.
+ size_t size() const { return span_.size(); }
+
+  // Copies data from other (host memory) into this memory.
+ void MemcpyFrom(const T *other) {
+ memcpy(span_.data(), other, sizeof(T) * size());
+ }
+  // Copies data from this memory to other (host memory).
+  void MemcpyTo(T *other) {
+ memcpy(other, span_.data(), sizeof(T) * size());
+ }
+
+ private:
+ std::span<T> span_;
+};
+
+// Class to manage the lifetime of device memory.
+template <typename T>
+class GpuMemory {
+ public:
+ // Allocates a block of memory for holding up to size objects of type T in
+ // device memory.
+ GpuMemory(size_t size) : size_(size) {
+ CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
+ }
+ GpuMemory(const GpuMemory &) = delete;
+ GpuMemory &operator=(const GpuMemory &) = delete;
+
+ virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
+
+ // Returns the device pointer to the memory.
+ T *get() { return memory_; }
+ const T *get() const { return memory_; }
+
+ // Returns the number of objects this memory can hold.
+ size_t size() const { return size_; }
+
+ // Copies data from host memory to this memory asynchronously on the provided
+ // stream.
+ void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
+ CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
+ cudaMemcpyHostToDevice, stream->get()));
+ }
+ void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
+ MemcpyAsyncFrom(host_memory->get(), stream);
+ }
+
+ // Copies data to host memory from this memory asynchronously on the provided
+ // stream.
+ void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
+ CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
+ reinterpret_cast<void *>(memory_),
+ sizeof(T) * size_, cudaMemcpyDeviceToHost,
+ stream->get()));
+ }
+ void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
+ MemcpyAsyncTo(host_memory->get(), stream);
+ }
+
+ // Copies data from host_memory to this memory blocking.
+ void MemcpyFrom(const T *host_memory) {
+ CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
+ reinterpret_cast<const void *>(host_memory),
+ sizeof(T) * size_, cudaMemcpyHostToDevice));
+ }
+ void MemcpyFrom(const HostMemory<T> *host_memory) {
+ MemcpyFrom(host_memory->get());
+ }
+
+ // Copies data to host_memory from this memory. Only copies size objects.
+ void MemcpyTo(T *host_memory, size_t size) const {
+ CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
+ sizeof(T) * size, cudaMemcpyDeviceToHost));
+ }
+ // Copies data to host_memory from this memory.
+ void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
+ void MemcpyTo(HostMemory<T> *host_memory) const {
+ MemcpyTo(host_memory->get());
+ }
+
+  // Asynchronously sets each byte of the memory to 'val' on the provided
+  // stream.
+ void MemsetAsync(const uint8_t val, CudaStream *stream) const {
+ CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
+ }
+
+ // Allocates a vector on the host, copies size objects into it, and returns
+ // it.
+ std::vector<T> Copy(size_t s) const {
+ CHECK_LE(s, size_);
+ std::vector<T> result(s);
+ MemcpyTo(result.data(), s);
+ return result;
+ }
+
+ // Copies all the objects in this memory to a vector on the host and returns
+ // it.
+ std::vector<T> Copy() const { return Copy(size_); }
+
+ private:
+ T *memory_;
+ const size_t size_;
+};
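+
+// A minimal usage sketch (sizes assumed): stage data through page-locked host
+// memory, copy it to the device on a stream, and read it back once the device
+// is done.
+//   CudaStream stream;
+//   HostMemory<uint8_t> host(1280 * 720);
+//   GpuMemory<uint8_t> device(host.size());
+//   device.MemcpyAsyncFrom(&host, &stream);
+//   ... launch kernels on stream.get() ...
+//   CheckAndSynchronize();
+//   std::vector<uint8_t> result = device.Copy();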
+
+// Synchronizes and CHECKs that the last CUDA operation succeeded.
+void CheckAndSynchronize();
+
+// Synchronizes and CHECKS iff --sync is passed on the command line. Makes it
+// so we can leave debugging in the code.
+void MaybeCheckAndSynchronize();
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_CUDA_H_
diff --git a/frc971/orin/cuda_april_tag_test.cc b/frc971/orin/cuda_april_tag_test.cc
new file mode 100644
index 0000000..1d5aa9b
--- /dev/null
+++ b/frc971/orin/cuda_april_tag_test.cc
@@ -0,0 +1,1563 @@
+#include <numeric>
+#include <random>
+#include <string>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "opencv2/imgproc.hpp"
+#include "third_party/apriltag/apriltag.h"
+#include "third_party/apriltag/common/unionfind.h"
+#include "third_party/apriltag/tag16h5.h"
+#include <opencv2/highgui.hpp>
+
+#include "aos/flatbuffer_merge.h"
+#include "aos/json_to_flatbuffer.h"
+#include "aos/testing/path.h"
+#include "aos/testing/random_seed.h"
+#include "aos/time/time.h"
+#include "frc971/orin/apriltag.h"
+#include "frc971/vision/vision_generated.h"
+
+DEFINE_int32(pixel_border, 10,
+ "Size of image border within which to reject detected corners");
+DEFINE_double(min_decision_margin, 50.0,
+ "Minimum decision margin (confidence) for an apriltag detection");
+
+DEFINE_bool(debug, false, "If true, write debug images.");
+
+// Get access to the intermediates of aprilrobotics.
+extern "C" {
+
+image_u8_t *threshold(apriltag_detector_t *td, image_u8_t *im);
+unionfind_t *connected_components(apriltag_detector_t *td, image_u8_t *threshim,
+ int w, int h, int ts);
+
+zarray_t *gradient_clusters(apriltag_detector_t *td, image_u8_t *threshim,
+ int w, int h, int ts, unionfind_t *uf);
+
+struct pt {
+ // Note: these represent 2*actual value.
+ uint16_t x, y;
+ int16_t gx, gy;
+
+ float slope;
+};
+} // extern C
+
+// Converts a cv::Mat to an aprilrobotics image.
+image_u8_t ToImageu8t(const cv::Mat &img) {
+ return image_u8_t{
+ .width = img.cols,
+ .height = img.rows,
+ .stride = img.cols,
+ .buf = img.data,
+ };
+}
+
+namespace frc971::apriltag::testing {
+
+// Checks that 2 images match.
+void CheckImage(image_u8_t compare_im_one, image_u8_t compare_im_two,
+ std::string_view label) {
+ CHECK_EQ(compare_im_one.width, compare_im_two.width);
+ CHECK_EQ(compare_im_one.height, compare_im_two.height);
+ ssize_t p = 0;
+ for (int j = 0; j < compare_im_one.height; ++j) {
+ for (int i = 0; i < compare_im_one.width; ++i) {
+ if (p < 0) {
+ LOG(INFO) << i << " " << j << ": "
+ << static_cast<int>(
+ compare_im_one.buf[j * compare_im_one.stride + i])
+ << " (address + " << j * compare_im_one.stride + i << ") vs "
+ << static_cast<int>(
+ compare_im_two.buf[j * compare_im_two.stride + i])
+ << " (address + " << j * compare_im_two.stride + i << ") for "
+ << label;
+
+ ++p;
+ }
+
+ CHECK_EQ(compare_im_one.buf[j * compare_im_one.stride + i],
+ compare_im_two.buf[j * compare_im_two.stride + i])
+ << "First Image Value "
+ << (int)compare_im_one.buf[j * compare_im_one.stride + i] << " "
+ << "Second Image Value "
+ << (int)compare_im_two.buf[j * compare_im_two.stride + i] << " "
+ << "At " << i << ", " << j << " for " << label;
+ }
+ }
+}
+
+// Stores info about a blob.
+struct BlobInfo {
+ size_t size;
+ size_t min_x;
+ size_t min_y;
+ size_t max_x;
+ size_t max_y;
+ size_t april_robotics_id;
+ size_t bounding_area() const { return (max_x - min_x) * (max_y - min_y); }
+};
+
+// Checks that the aprilrobotics and GPU code agree on the results of
+// unionfinding.
+std::map<uint32_t, BlobInfo> CheckUnionfind(unionfind_t *uf,
+ cv::Mat union_markers,
+ const uint32_t *union_markers_size,
+ cv::Mat threshold) {
+ const size_t width = union_markers.cols;
+ const size_t height = union_markers.rows;
+ std::map<uint32_t, uint32_t> id_remap;
+ std::map<uint32_t, size_t> cuda_id_count;
+
+ for (size_t y = 0; y < height; ++y) {
+ for (size_t x = 0; x < width; ++x) {
+ uint32_t v = unionfind_get_representative(uf, y * width + x);
+ uint32_t v_remapped;
+ {
+ auto it = cuda_id_count.emplace(union_markers.at<uint32_t>(y, x), 1);
+ if (!it.second) {
+ ++it.first->second;
+ }
+ }
+ auto it = id_remap.find(v);
+ if (it == id_remap.end()) {
+ v_remapped = union_markers.at<uint32_t>(y, x);
+ id_remap.insert(std::make_pair(v, v_remapped));
+ } else {
+ v_remapped = it->second;
+ }
+
+ CHECK_EQ(v_remapped, union_markers.at<uint32_t>(y, x))
+ << "At " << x << ", " << y;
+ }
+ }
+ for (auto [key, value] : cuda_id_count) {
+ VLOG(2) << "Found " << key << " num times " << value;
+ }
+ LOG(INFO) << "Found " << id_remap.size() << " blob ids in aprilrobotics.";
+
+ std::map<uint32_t, BlobInfo> blob_sizes;
+
+ for (size_t y = 0; y < height; ++y) {
+ for (size_t x = 0; x < width; ++x) {
+ auto it = blob_sizes.emplace(
+ union_markers.at<uint32_t>(y, x),
+ BlobInfo{
+ .size = 1,
+ .min_x = x,
+ .min_y = y,
+ .max_x = x,
+ .max_y = y,
+ .april_robotics_id =
+ unionfind_get_representative(uf, y * width + x),
+ });
+ if (!it.second) {
+ BlobInfo *info = &(it.first->second);
+ ++(info->size);
+ CHECK_EQ(info->april_robotics_id,
+ unionfind_get_representative(uf, y * width + x));
+ info->min_x = std::min(info->min_x, x);
+ info->max_x = std::max(info->max_x, x);
+ info->min_y = std::min(info->min_y, y);
+ info->max_y = std::max(info->max_y, y);
+ }
+ }
+ }
+
+ for (size_t y = 0; y < height; ++y) {
+ for (size_t x = 0; x < width; ++x) {
+ const uint32_t blob_id = union_markers.at<uint32_t>(y, x);
+ const BlobInfo &blob_info = blob_sizes[blob_id];
+ if (threshold.at<uint8_t>(y, x) == 127) {
+ CHECK_EQ(0u, union_markers_size[blob_id])
+ << " at (" << x << ", " << y << ") -> " << blob_id;
+ continue;
+ }
+
+ if (blob_info.size >= 25u) {
+ CHECK_LE(25u, union_markers_size[blob_id])
+ << " at (" << x << ", " << y << ") -> " << blob_id;
+ } else {
+ CHECK_EQ(blob_info.size, union_markers_size[blob_id])
+ << " at (" << x << ", " << y << ") -> " << blob_id;
+ }
+ }
+ }
+
+ LOG(INFO) << "Union finding + stats passed.";
+
+ return blob_sizes;
+}
+
+// Makes a tag detector.
+apriltag_detector_t *MakeTagDetector(apriltag_family_t *tag_family) {
+ apriltag_detector_t *tag_detector = apriltag_detector_create();
+
+ apriltag_detector_add_family_bits(tag_detector, tag_family, 1);
+
+ tag_detector->nthreads = 1;
+ tag_detector->wp = workerpool_create(tag_detector->nthreads);
+ tag_detector->qtp.min_white_black_diff = 5;
+ tag_detector->debug = FLAGS_debug;
+
+ return tag_detector;
+}
+
+class CudaAprilTagDetector {
+ public:
+ CudaAprilTagDetector(size_t width, size_t height)
+ : tag_family_(tag16h5_create()),
+ tag_detector_(MakeTagDetector(tag_family_)),
+ gray_cuda_(cv::Size(width, height), CV_8UC1),
+ decimated_cuda_(gray_cuda_.size() / 2, CV_8UC1),
+ thresholded_cuda_(decimated_cuda_.size(), CV_8UC1),
+ union_markers_(decimated_cuda_.size(), CV_32SC1),
+ union_markers_size_(decimated_cuda_.size(), CV_32SC1),
+ gpu_detector_(width, height, tag_detector_),
+ width_(width),
+ height_(height) {
+ // Report out info about our GPU.
+ {
+ cudaDeviceProp prop;
+ CHECK_EQ(cudaGetDeviceProperties(&prop, 0), cudaSuccess);
+
+ LOG(INFO) << "Device: sm_" << prop.major << prop.minor;
+#define DUMP(x) LOG(INFO) << "" #x ": " << prop.x;
+ DUMP(sharedMemPerBlock);
+ DUMP(l2CacheSize);
+ DUMP(maxThreadsPerBlock);
+ DUMP(maxThreadsPerMultiProcessor);
+ DUMP(memoryBusWidth);
+ DUMP(memoryClockRate);
+ DUMP(multiProcessorCount);
+ DUMP(maxBlocksPerMultiProcessor);
+ DUMP(name);
+ DUMP(warpSize);
+
+#undef DUMP
+ }
+
+ union_marker_pair_.resize((width - 2) / 2 * (height - 2) / 2 * 4,
+ QuadBoundaryPoint());
+ compressed_union_marker_pair_.resize((width - 2) / 2 * (height - 2) / 2 * 4,
+ QuadBoundaryPoint());
+
+ // Pre-compute the border and width thresholds.
+ for (int i = 0; i < zarray_size(tag_detector_->tag_families); i++) {
+ apriltag_family_t *family;
+ zarray_get(tag_detector_->tag_families, i, &family);
+ if (family->width_at_border < min_tag_width_) {
+ min_tag_width_ = family->width_at_border;
+ }
+ normal_border_ |= !family->reversed_border;
+ reversed_border_ |= family->reversed_border;
+ }
+ min_tag_width_ /= tag_detector_->quad_decimate;
+ if (min_tag_width_ < 3) {
+ min_tag_width_ = 3;
+ }
+ }
+
+ ~CudaAprilTagDetector() {
+ apriltag_detector_destroy(tag_detector_);
+ free(tag_family_);
+ }
+
+ // Detects tags on the GPU.
+ void DetectGPU(cv::Mat color_image) {
+ CHECK_EQ(color_image.size(), gray_cuda_.size());
+
+ gpu_detector_.Detect(color_image.data);
+
+ gpu_detector_.CopyGrayTo(gray_cuda_.data);
+ gpu_detector_.CopyDecimatedTo(decimated_cuda_.data);
+ gpu_detector_.CopyThresholdedTo(thresholded_cuda_.data);
+
+ gpu_detector_.CopyUnionMarkersTo(
+ reinterpret_cast<uint32_t *>(union_markers_.data));
+ gpu_detector_.CopyUnionMarkersSizeTo(
+ reinterpret_cast<uint32_t *>(union_markers_size_.data));
+
+ gpu_detector_.CopyUnionMarkerPairTo(union_marker_pair_.data());
+ gpu_detector_.CopyCompressedUnionMarkerPairTo(
+ compressed_union_marker_pair_.data());
+ sorted_union_marker_pair_ = gpu_detector_.CopySortedUnionMarkerPair();
+ num_compressed_union_marker_pair_ =
+ gpu_detector_.NumCompressedUnionMarkerPairs();
+ extents_cuda_ = gpu_detector_.CopyExtents();
+ reduced_dot_blobs_pair_cuda_ = gpu_detector_.CopyReducedDotBlobs();
+ selected_blobs_cuda_ = gpu_detector_.CopySelectedBlobs();
+ sorted_selected_blobs_cuda_ = gpu_detector_.CopySortedSelectedBlobs();
+ num_quads_ = gpu_detector_.NumQuads();
+
+ LOG(INFO) << "num_compressed_union_marker_pair "
+ << sorted_union_marker_pair_.size();
+ }
+
+ // Detects tags on the CPU.
+ void DetectCPU(cv::Mat color_image) {
+ cv::Mat gray_image(color_image.size(), CV_8UC1);
+ cv::cvtColor(color_image, gray_image, cv::COLOR_YUV2GRAY_YUYV);
+ image_u8_t im = {
+ .width = gray_image.cols,
+ .height = gray_image.rows,
+ .stride = gray_image.cols,
+ .buf = gray_image.data,
+ };
+
+ LOG(INFO) << "Starting CPU detect.";
+ zarray_t *detections = apriltag_detector_detect(tag_detector_, &im);
+
+ timeprofile_display(tag_detector_->tp);
+
+ for (int i = 0; i < zarray_size(detections); i++) {
+ apriltag_detection_t *det;
+ zarray_get(detections, i, &det);
+
+ if (det->decision_margin > FLAGS_min_decision_margin) {
+ VLOG(1) << "Found tag number " << det->id
+ << " hamming: " << det->hamming
+ << " margin: " << det->decision_margin;
+ }
+ }
+ }
+
+ // Checks that the union finding algorithms agree.
+ std::vector<QuadBoundaryPoint> CheckUnionMarkerPairs(
+ const std::map<uint32_t, BlobInfo> &blob_sizes) const {
+ std::vector<QuadBoundaryPoint> expected_union_marker_pair;
+
+ const size_t width = thresholded_cuda_.cols;
+ const size_t height = thresholded_cuda_.rows;
+
+ expected_union_marker_pair.resize((width - 2) * (height - 2) * 4,
+ QuadBoundaryPoint());
+
+ LOG(INFO) << "Width: " << width << ", Height " << height;
+
+ auto ToIndex = [&](size_t x, size_t y) {
+ return x - 1 + (y - 1) * (width - 2);
+ };
+
+ size_t wrong = 0;
+ size_t right_nonzero = 0;
+
+ for (size_t y = 1; y < height - 1; ++y) {
+ for (size_t x = 1; x < width - 1; ++x) {
+ const uint8_t v0 = thresholded_cuda_.at<uint8_t>(y, x);
+ const uint64_t rep0 = union_markers_.at<uint32_t>(y, x);
+ const auto blob0 = blob_sizes.find(rep0)->second;
+
+ size_t offset = 0;
+ for (auto [dx, dy] : {std::pair<ssize_t, ssize_t>{1, 0},
+ std::pair<ssize_t, ssize_t>{1, 1},
+ std::pair<ssize_t, ssize_t>{0, 1},
+ std::pair<ssize_t, ssize_t>{-1, 1}}) {
+ const uint8_t v1 = thresholded_cuda_.at<uint8_t>(y + dy, x + dx);
+ const uint32_t rep1 = union_markers_.at<uint32_t>(y + dy, x + dx);
+ const auto blob1 = blob_sizes.find(rep1)->second;
+
+          // OK, blob 1, 1 is special. We only do it if block 2 and the blob
+          // to the left of x won't be connected.
+ // ________
+ // | x | 0 |
+ // -------------
+ // | 3 | 2 | 1 |
+ // -------------
+ //
+ // We are fine checking this all here since we verify that we agree
+ // with aprilrobotics at the end to make sure the overall logic
+ // matches.
+ bool consider_blob = true;
+ if (offset == 3) {
+ const uint8_t v1_block_left =
+ thresholded_cuda_.at<uint8_t>(y + 0, x - 1);
+ const uint32_t rep1_block_left =
+ union_markers_.at<uint32_t>(y + 0, x - 1);
+ const uint8_t v1_block_2 =
+ thresholded_cuda_.at<uint8_t>(y + 1, x + 0);
+ const uint32_t rep1_block_2 =
+ union_markers_.at<uint32_t>(y + 1, x + 0);
+ if (v1_block_left != 127 && v1_block_2 != 127 &&
+ (v1_block_left + v1_block_2) == 255 &&
+ blob_sizes.find(rep1_block_left)->second.size >= 25 &&
+ blob_sizes.find(rep1_block_2)->second.size >= 25 && x != 1) {
+ consider_blob = false;
+ }
+ }
+
+ QuadBoundaryPoint point;
+ if (consider_blob && blob0.size >= 25 && blob1.size >= 25 &&
+ v0 != 127) {
+ if (v0 + v1 == 255) {
+ if (rep0 < rep1) {
+ point.set_rep1(rep1);
+ point.set_rep0(rep0);
+ } else {
+ point.set_rep1(rep0);
+ point.set_rep0(rep1);
+ }
+ point.set_base_xy(x, y);
+ point.set_dxy(offset);
+ point.set_black_to_white(v1 > v0);
+ }
+ }
+ size_t index = ToIndex(x, y) + offset * (width - 2) * (height - 2);
+
+ QuadBoundaryPoint actual = union_marker_pair_[index];
+ if (!point.near(actual)) {
+ CHECK_LT(wrong, 10u);
+ LOG(WARNING) << "point == actual (" << std::hex << point << ", "
+ << actual << ") : Failed at (" << std::dec << x << ", "
+ << y << ") + (" << dx << ", " << dy
+ << "), v0: " << static_cast<int>(v0)
+ << ", v1: " << static_cast<int>(v1)
+ << ", rep0: " << std::hex << rep0 << " rep1: " << rep1
+ << " rep0 size: " << std::dec << blob0.size
+ << " rep1 size: " << std::dec << blob1.size
+ << " consider_blob " << consider_blob;
+ ++wrong;
+ } else if (point.nonzero()) {
+ right_nonzero++;
+ VLOG(2) << "point == actual (" << std::hex << point << ", "
+ << actual << ") : Success at (" << std::dec << x << ", "
+ << y << ") + (" << dx << ", " << dy
+ << "), v0: " << static_cast<int>(v0)
+ << ", v1: " << static_cast<int>(v1)
+ << ", rep0: " << std::hex << rep0 << " rep1: " << rep1;
+ point = actual;
+ }
+ expected_union_marker_pair[index] = point;
+
+ ++offset;
+ }
+ }
+ }
+ CHECK_EQ(wrong, 0u) << ", got " << right_nonzero << " right";
+
+ for (size_t i = 0; i < expected_union_marker_pair.size(); ++i) {
+ CHECK_EQ(expected_union_marker_pair[i], union_marker_pair_[i]);
+ }
+
+ return expected_union_marker_pair;
+ }
+
+ // Checks that the compressed marker pairs GPU algorithm agrees with a naive
+ // CPU version. Returns the expected points.
+ std::vector<QuadBoundaryPoint> CheckCompressedUnionMarkerPairs(
+ const std::vector<QuadBoundaryPoint> &expected_union_marker_pair,
+ const std::vector<QuadBoundaryPoint> &compressed_union_marker_pair,
+ const std::vector<QuadBoundaryPoint> &sorted_union_marker_pair) const {
+ std::vector<QuadBoundaryPoint> expected_compressed_union_marker_pair;
+ {
+ size_t nonzero_count = 0;
+ for (const QuadBoundaryPoint x : expected_union_marker_pair) {
+ if (x.nonzero()) {
+ ++nonzero_count;
+ }
+ }
+ // Rip out all the zeros.
+ expected_compressed_union_marker_pair.reserve(nonzero_count);
+ for (const QuadBoundaryPoint x : expected_union_marker_pair) {
+ if (x.nonzero()) {
+ expected_compressed_union_marker_pair.push_back(x);
+ }
+ }
+
+ CHECK_EQ(static_cast<size_t>(sorted_union_marker_pair.size()),
+ expected_compressed_union_marker_pair.size());
+
+ // And make sure they match.
+ for (size_t i = 0; i < expected_compressed_union_marker_pair.size();
+ ++i) {
+ CHECK_EQ(expected_compressed_union_marker_pair[i],
+ compressed_union_marker_pair[i]);
+ }
+ }
+
+ // Now, sort the points.
+ std::sort(expected_compressed_union_marker_pair.begin(),
+ expected_compressed_union_marker_pair.end(),
+ [&](const QuadBoundaryPoint a, const QuadBoundaryPoint b) {
+ if (a.rep01() != b.rep01()) {
+ return a.rep01() < b.rep01();
+ }
+
+ return a < b;
+ });
+
+ for (size_t i = 0; i < static_cast<size_t>(sorted_union_marker_pair.size());
+ ++i) {
+ CHECK_EQ(expected_compressed_union_marker_pair[i].rep01(),
+ sorted_union_marker_pair[i].rep01());
+ }
+
+ return expected_compressed_union_marker_pair;
+ }
+
+ // Sorts the list of points by slope and blob ID.
+ std::vector<QuadBoundaryPoint> SlopeSortPoints(
+ std::vector<QuadBoundaryPoint> points) const {
+ std::map<uint64_t, std::pair<float, float>> blob_centers;
+
+ // The slope algorithm used by aprilrobotics.
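+    // Rather than a true angle, it computes a monotonic stand-in for one: a
+    // large per-quadrant offset plus dy/dx after folding the vector into the
+    // first quadrant. Sorting by this value walks points around the blob
+    // center without an atan2 per point.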
+ auto ComputeSlope = [&blob_centers](QuadBoundaryPoint pair) -> float {
+ const int32_t x = pair.x();
+ const int32_t y = pair.y();
+
+ auto blob_center = blob_centers.find(pair.rep01());
+ CHECK(blob_center != blob_centers.end());
+
+ const float cx = blob_center->second.first;
+ const float cy = blob_center->second.second;
+
+ float quadrants[2][2] = {{-1 * (2 << 15), 0}, {2 * (2 << 15), 2 << 15}};
+
+ float dx = x - cx;
+ float dy = y - cy;
+
+ float quadrant = quadrants[dy > 0][dx > 0];
+ if (dy < 0) {
+ dy = -dy;
+ dx = -dx;
+ }
+
+ if (dx < 0) {
+ float tmp = dx;
+ dx = dy;
+ dy = -tmp;
+ }
+
+ return quadrant + dy / dx;
+ };
+
+ // The theta algorithm used by cuda.
+ auto ComputeTheta = [&blob_centers](QuadBoundaryPoint pair) -> float {
+ const int32_t x = pair.x();
+ const int32_t y = pair.y();
+
+ auto blob_center = blob_centers.find(pair.rep01());
+ CHECK(blob_center != blob_centers.end());
+
+ const float cx = blob_center->second.first;
+ const float cy = blob_center->second.second;
+
+ float dx = x - cx;
+ float dy = y - cy;
+
+ return atan2f(dy, dx);
+ };
+
+ for (size_t i = 0; i < points.size();) {
+ uint64_t first_rep01 = points[i].rep01();
+
+ int min_x, min_y, max_x, max_y;
+ min_x = max_x = points[i].x();
+ min_y = max_y = points[i].y();
+
+ size_t j = i;
+ for (; j < points.size() && points[j].rep01() == first_rep01; ++j) {
+ QuadBoundaryPoint pt = points[j];
+
+ int x = pt.x();
+ int y = pt.y();
+ min_x = std::min(min_x, x);
+ max_x = std::max(max_x, x);
+ min_y = std::min(min_y, y);
+ max_y = std::max(max_y, y);
+ }
+
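+      // Use the same fractional offsets that aprilrobotics adds to the blob
+      // center (see AprilRoboticsPoints below) so the centers, and therefore
+      // the slopes, agree with the CPU implementation.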
+ const float cx = (max_x + min_x) * 0.5 + 0.05118;
+ const float cy = (max_y + min_y) * 0.5 + -0.028581;
+
+ blob_centers[first_rep01] = std::make_pair(cx, cy);
+ i = j;
+ }
+
+ std::sort(points.begin(), points.end(),
+ [&](const QuadBoundaryPoint a, const QuadBoundaryPoint b) {
+ if (a.rep01() != b.rep01()) {
+ return a.rep01() < b.rep01();
+ }
+
+ const float slopea = ComputeSlope(a);
+ const float slopeb = ComputeSlope(b);
+
+ // Sigh, apparently the slope algorithm isn't great and
+ // sometimes ends up with matching slopes. In that case,
+ // compute the actual angle to the X axis too and sort with it.
+ //
+ // Aprilrobotics ignores this case and ends up with an
+ // indeterminate solution. We don't, so we notice.
+ if (slopea == slopeb) {
+ return ComputeTheta(a) < ComputeTheta(b);
+ }
+ return slopea < slopeb;
+ });
+ return points;
+ }
+
+  // Filters blobs using the various size thresholds that aprilrobotics uses.
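+  // A blob is kept only if its border direction is allowed by one of the tag
+  // families, its bounding box area clears min_tag_width_, and its point
+  // count lies between the aprilrobotics minimums and the largest plausible
+  // tag perimeter.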
+ std::vector<std::vector<QuadBoundaryPoint>> FilterBlobs(
+ std::vector<std::vector<QuadBoundaryPoint>> blobs) const {
+ std::vector<std::vector<QuadBoundaryPoint>> result;
+ const size_t max_april_tag_perimeter = 2 * (width_ + height_);
+ LOG(ERROR) << "Max permiter test " << max_april_tag_perimeter;
+
+ for (std::vector<QuadBoundaryPoint> &blob : blobs) {
+ int min_x, min_y, max_x, max_y;
+ min_x = max_x = blob[0].x();
+ min_y = max_y = blob[0].y();
+
+ for (const QuadBoundaryPoint pt : blob) {
+ int x = pt.x();
+ int y = pt.y();
+ min_x = std::min(min_x, x);
+ max_x = std::max(max_x, x);
+ min_y = std::min(min_y, y);
+ max_y = std::max(max_y, y);
+ }
+
+ float dot = 0;
+
+ const float cx = (min_x + max_x) * 0.5 + 0.05118;
+ const float cy = (min_y + max_y) * 0.5 - 0.028581;
+
+ for (size_t j = 0; j < blob.size(); ++j) {
+ dot += (static_cast<float>(blob[j].x()) - cx) * blob[j].gx() +
+ (static_cast<float>(blob[j].y()) - cy) * blob[j].gy();
+ }
+
+ const bool quad_reversed_border = dot < 0;
+
+ if (!reversed_border_ && quad_reversed_border) {
+ continue;
+ }
+ if (!normal_border_ && !quad_reversed_border) {
+ continue;
+ }
+ if ((max_x - min_x) * (max_y - min_y) < min_tag_width_) {
+ continue;
+ }
+ if (blob.size() < 24) {
+ continue;
+ }
+ if (blob.size() <
+ static_cast<size_t>(tag_detector_->qtp.min_cluster_pixels)) {
+ continue;
+ }
+ if (blob.size() > max_april_tag_perimeter) {
+ continue;
+ }
+
+ result.emplace_back(std::move(blob));
+ }
+
+ return result;
+ }
+
+ // Produces sorted lists of clusters from AprilRobotics' intermediate
+ // clusters.
+ std::vector<std::vector<uint64_t>> AprilRoboticsPoints(
+ image_u8_t *thresholded_im, unionfind_t *uf) const {
+ zarray_t *clusters =
+ gradient_clusters(tag_detector_, thresholded_im, thresholded_im->width,
+ thresholded_im->height, thresholded_im->stride, uf);
+
+ std::vector<std::vector<uint64_t>> april_grouped_points;
+ for (int i = 0; i < zarray_size(clusters); i++) {
+ zarray_t *cluster;
+ zarray_get(clusters, i, &cluster);
+
+ uint16_t min_x = std::numeric_limits<uint16_t>::max();
+ uint16_t max_x = 0;
+ uint16_t min_y = std::numeric_limits<uint16_t>::max();
+ uint16_t max_y = 0;
+ std::vector<std::tuple<float, uint64_t>> pts;
+ for (int j = 0; j < zarray_size(cluster); j++) {
+ struct pt *p;
+ zarray_get_volatile(cluster, j, &p);
+ min_x = std::min(p->x, min_x);
+ min_y = std::min(p->y, min_y);
+ max_x = std::max(p->x, max_x);
+ max_y = std::max(p->y, max_y);
+ }
+
+ // add some noise to (cx,cy) so that pixels get a more diverse set
+ // of theta estimates. This will help us remove more points.
+ // (Only helps a small amount. The actual noise values here don't
+ // matter much at all, but we want them [-1, 1]. (XXX with
+ // fixed-point, should range be bigger?)
+ const float cx = (min_x + max_x) * 0.5 + 0.05118;
+ const float cy = (min_y + max_y) * 0.5 + -0.028581;
+
+ float quadrants[2][2] = {{-1 * (2 << 15), 0}, {2 * (2 << 15), 2 << 15}};
+
+ for (int j = 0; j < zarray_size(cluster); j++) {
+ struct pt *p;
+ zarray_get_volatile(cluster, j, &p);
+
+ float dx = p->x - cx;
+ float dy = p->y - cy;
+
+ float quadrant = quadrants[dy > 0][dx > 0];
+ if (dy < 0) {
+ dy = -dy;
+ dx = -dx;
+ }
+
+ if (dx < 0) {
+ float tmp = dx;
+ dx = dy;
+ dy = -tmp;
+ }
+ p->slope = quadrant + dy / dx;
+ pts.push_back(std::make_pair(p->slope, p->x + p->y * width_));
+ }
+
+ // Slope sort doesn't always combine duplicates because of duplicate
+ // slopes. Sort by point first, remove duplicates, then sort by slope.
+ std::sort(
+ pts.begin(), pts.end(),
+ [](std::tuple<float, uint64_t> a, std::tuple<float, uint64_t> b) {
+ return std::get<1>(a) < std::get<1>(b);
+ });
+ pts.erase(std::unique(pts.begin(), pts.end(),
+ [](std::tuple<float, uint64_t> a,
+ std::tuple<float, uint64_t> b) {
+ return std::get<1>(a) == std::get<1>(b);
+ }),
+ pts.end());
+
+ std::sort(
+ pts.begin(), pts.end(),
+ [](std::tuple<float, uint64_t> a, std::tuple<float, uint64_t> b) {
+ return std::get<0>(a) < std::get<0>(b);
+ });
+
+ VLOG(1) << "aprilrobotics points of " << pts.size() << " center (" << cx
+ << ", " << cy << ")";
+
+ std::vector<uint64_t> transformed_points;
+ transformed_points.reserve(pts.size());
+ for (std::tuple<float, uint64_t> pt : pts) {
+ VLOG(1) << " (" << std::get<1>(pt) % width_ << ", "
+ << std::get<1>(pt) / width_ << ") slope " << std::get<0>(pt);
+ transformed_points.push_back(std::get<1>(pt));
+ }
+
+ april_grouped_points.emplace_back(std::move(transformed_points));
+ }
+
+ // Now sort the groups of points into a reproducible result by sorting by
+ // size and then smallest point first.
+ std::sort(april_grouped_points.begin(), april_grouped_points.end(),
+ [](auto &x, auto &y) {
+ if (x.size() == y.size()) {
+ for (size_t j = 0; j < x.size(); ++j) {
+ if (x[j] == y[j]) {
+ continue;
+ }
+ return x[j] < y[j];
+ }
+ LOG(FATAL) << "Equal";
+ }
+ return x.size() < y.size();
+ });
+
+ for (int i = 0; i < zarray_size(clusters); i++) {
+ zarray_t *cluster;
+ zarray_get(clusters, i, &cluster);
+ zarray_destroy(cluster);
+ }
+ zarray_destroy(clusters);
+
+ return april_grouped_points;
+ }
+
+ // Sorts point lists by size. These points need to have an x() and y().
+ template <typename T>
+ std::vector<std::vector<T>> SortPointSizes(
+ std::vector<std::vector<T>> in) const {
+ std::sort(in.begin(), in.end(), [this](auto &a, auto &b) {
+ if (a.size() == b.size()) {
+ // Use point ID as the tie breaker.
+ for (size_t j = 0; j < a.size(); ++j) {
+        const uint32_t point_a = a[j].x() + a[j].y() * width_;
+        const uint32_t point_b = b[j].x() + b[j].y() * width_;
+        if (point_a == point_b) {
+          continue;
+        }
+        return point_a < point_b;
+ }
+ LOG(FATAL) << "Equal";
+ }
+ return a.size() < b.size();
+ });
+
+ return in;
+ }
+
+ // Sorts grouped points by size and point.
+ std::vector<std::vector<QuadBoundaryPoint>> SortPoints(
+ std::vector<std::vector<QuadBoundaryPoint>> in) const {
+ for (std::vector<QuadBoundaryPoint> &grouped_points : in) {
+ std::sort(grouped_points.begin(), grouped_points.end(),
+ [this](const QuadBoundaryPoint &a, const QuadBoundaryPoint &b) {
+ return std::make_tuple(a.rep01(), a.x() + a.y() * width_) <
+ std::make_tuple(b.rep01(), b.x() + b.y() * width_);
+ });
+ }
+
+ return SortPointSizes(std::move(in));
+ }
+
+  // Sorts the outer list of point lists by size and then by smallest point.
+ std::vector<std::vector<uint64_t>> SortAprilroboticsPointSizes(
+ std::vector<std::vector<uint64_t>> april_grouped_points) const {
+ std::sort(april_grouped_points.begin(), april_grouped_points.end(),
+ [](auto &x, auto &y) {
+ if (x.size() == y.size()) {
+ for (size_t j = 0; j < x.size(); ++j) {
+ if (x[j] == y[j]) {
+ continue;
+ }
+ return x[j] < y[j];
+ }
+ LOG(FATAL) << "Equal";
+ }
+ return x.size() < y.size();
+ });
+ return april_grouped_points;
+ }
+
+  // Sorts the points within each Aprilrobotics list, then sorts the lists by
+  // size and point id.
+ std::vector<std::vector<uint64_t>> SortAprilroboticsPoints(
+ std::vector<std::vector<uint64_t>> april_grouped_points) const {
+ for (std::vector<uint64_t> &points : april_grouped_points) {
+ std::sort(points.begin(), points.end());
+ }
+ return SortAprilroboticsPointSizes(april_grouped_points);
+ }
+
+ // Prints out a list of CUDA points.
+ void PrintCudaPoints(
+ const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points,
+ const std::map<uint32_t, BlobInfo> &blob_sizes) const {
+ for (const std::vector<QuadBoundaryPoint> &points : cuda_grouped_points) {
+ const uint32_t rep0 = points[0].rep0();
+ const uint32_t rep1 = points[0].rep1();
+ VLOG(1) << "CUDA points of " << rep0 << "+" << rep1
+ << " aprilrobotics blob: "
+ << blob_sizes.find(rep0)->second.april_robotics_id << "+"
+ << blob_sizes.find(rep1)->second.april_robotics_id << " ("
+ << std::hex
+ << ((static_cast<uint64_t>(std::max(
+ blob_sizes.find(rep0)->second.april_robotics_id,
+ blob_sizes.find(rep1)->second.april_robotics_id))
+ << 32) |
+ std::min(blob_sizes.find(rep0)->second.april_robotics_id,
+ blob_sizes.find(rep1)->second.april_robotics_id))
+ << std::dec << ") size " << points.size();
+ for (const QuadBoundaryPoint point : points) {
+ VLOG(1) << " (" << point.x() << ", " << point.y() << ")";
+ }
+ }
+
+ LOG(INFO) << "Found runs overall " << cuda_grouped_points.size();
+ }
+
+ // Groups marker pairs by runs of rep01().
+ std::vector<std::vector<QuadBoundaryPoint>> CudaGroupedPoints(
+ const std::vector<QuadBoundaryPoint> &compressed_union_marker_pairs)
+ const {
+ std::vector<std::vector<QuadBoundaryPoint>> cuda_grouped_points;
+ cuda_grouped_points.emplace_back();
+
+ QuadBoundaryPoint start = compressed_union_marker_pairs[0];
+ for (size_t i = 0; i < compressed_union_marker_pairs.size(); ++i) {
+ QuadBoundaryPoint current = compressed_union_marker_pairs[i];
+ CHECK_GE(current.rep01(), start.rep01())
+ << " Failed on index " << i << " of "
+ << compressed_union_marker_pairs.size();
+ if ((start.rep01()) != (current.rep01())) {
+ cuda_grouped_points.emplace_back(
+ std::vector<QuadBoundaryPoint>{current});
+ } else {
+ cuda_grouped_points.back().emplace_back(current);
+ }
+ start = current;
+ }
+
+ return cuda_grouped_points;
+ }
+
+  // Groups index points by runs of blob_index().
+ std::vector<std::vector<IndexPoint>> CudaGroupedPoints(
+ const std::vector<IndexPoint> &compressed_cuda_points) const {
+ std::vector<std::vector<IndexPoint>> cuda_grouped_points;
+ cuda_grouped_points.emplace_back();
+
+ IndexPoint start = compressed_cuda_points[0];
+ for (size_t i = 0; i < compressed_cuda_points.size(); ++i) {
+ IndexPoint current = compressed_cuda_points[i];
+ CHECK_GE(current.blob_index(), start.blob_index())
+ << " Failed on index " << i << " of "
+ << compressed_cuda_points.size();
+ if ((start.blob_index()) != (current.blob_index())) {
+ cuda_grouped_points.emplace_back(std::vector<IndexPoint>{current});
+ } else {
+ cuda_grouped_points.back().emplace_back(current);
+ }
+ start = current;
+ }
+
+ return cuda_grouped_points;
+ }
+
+  // Checks that both aprilrobotics and CUDA agree on point grouping. No
+  // ordering is implied here; that is checked later.
+ void CheckAprilRoboticsAndCudaContainSamePoints(
+ const std::vector<std::vector<uint64_t>> &april_grouped_points,
+ const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points)
+ const {
+ const std::vector<std::vector<QuadBoundaryPoint>>
+ cuda_point_sorted_grouped_points = SortPoints(cuda_grouped_points);
+ const std::vector<std::vector<uint64_t>> april_sorted_grouped_points =
+ SortAprilroboticsPoints(
+ SortAprilroboticsPointSizes(april_grouped_points));
+
+ CHECK_EQ(april_sorted_grouped_points.size(),
+ cuda_point_sorted_grouped_points.size());
+ for (size_t i = 0; i < april_sorted_grouped_points.size(); ++i) {
+ CHECK_EQ(april_sorted_grouped_points[i].size(),
+ cuda_point_sorted_grouped_points[i].size());
+ for (size_t j = 0; j < april_sorted_grouped_points[i].size(); ++j) {
+ CHECK_EQ(april_sorted_grouped_points[i][j],
+ cuda_point_sorted_grouped_points[i][j].x() +
+ cuda_point_sorted_grouped_points[i][j].y() * width_)
+ << ": On list of size " << april_sorted_grouped_points[i].size()
+ << ", failed on point " << j << "("
+ << april_sorted_grouped_points[i][j] % width_ << ", "
+ << april_sorted_grouped_points[i][j] / width_ << ") vs ("
+ << cuda_point_sorted_grouped_points[i][j].x() << ", "
+ << cuda_point_sorted_grouped_points[i][j].y() << ")";
+ }
+ }
+ }
+
+ // Tests that the extents and dot products match the cuda versions.
+ std::pair<size_t, std::vector<IndexPoint>> CheckCudaExtents(
+ const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points,
+ const std::vector<MinMaxExtents> &extents_cuda,
+ const std::vector<float> &reduced_dot_blobs_pair_cuda) const {
+ std::vector<IndexPoint> selected_blobs;
+ const size_t max_april_tag_perimeter = 2 * (width_ + height_);
+ size_t selected_quads = 0;
+ {
+ BlobExtentsIndexFinder finder(extents_cuda.data(), extents_cuda.size());
+ size_t i = 0;
+ size_t starting_offset = 0;
+ CHECK_EQ(cuda_grouped_points.size(), extents_cuda.size());
+ CHECK_EQ(extents_cuda.size(), reduced_dot_blobs_pair_cuda.size());
+ for (const std::vector<QuadBoundaryPoint> &points : cuda_grouped_points) {
+ size_t min_x, min_y, max_x, max_y;
+ min_x = max_x = points[0].x();
+ min_y = max_y = points[0].y();
+
+ for (const QuadBoundaryPoint pt : points) {
+ size_t x = pt.x();
+ size_t y = pt.y();
+ min_x = std::min(min_x, x);
+ max_x = std::max(max_x, x);
+ min_y = std::min(min_y, y);
+ max_y = std::max(max_y, y);
+ }
+
+ const MinMaxExtents extents = extents_cuda[i];
+ CHECK_EQ(extents.min_x, min_x);
+ CHECK_EQ(extents.max_x, max_x);
+ CHECK_EQ(extents.min_y, min_y);
+ CHECK_EQ(extents.max_y, max_y);
+ CHECK_EQ(extents.count, points.size());
+ CHECK_EQ(extents.starting_offset, starting_offset)
+ << " for index " << i;
+
+ float dot = 0;
+
+ const float cx = (min_x + max_x) * 0.5 + 0.05118;
+ const float cy = (min_y + max_y) * 0.5 - 0.028581;
+
+ bool good_blob_size =
+ points.size() >= 24 && points.size() <= max_april_tag_perimeter &&
+ points.size() >=
+ static_cast<size_t>(tag_detector_->qtp.min_cluster_pixels);
+
+ if (good_blob_size) {
+ for (size_t j = 0; j < points.size(); ++j) {
+ dot += (static_cast<float>(points[j].x()) - cx) * points[j].gx() +
+ (static_cast<float>(points[j].y()) - cy) * points[j].gy();
+
+ // Make sure our blob finder agrees.
+ CHECK_EQ(i, finder.FindBlobIndex(starting_offset + j));
+ }
+ // Test that the summed dot product is right.
+ CHECK_LT(
+ std::abs(reduced_dot_blobs_pair_cuda[i] - dot) / std::abs(dot),
+ 1e-4)
+ << ": for point " << i << ", cuda -> "
+ << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot;
+ } else {
+ CHECK_EQ(reduced_dot_blobs_pair_cuda[i], 0.0f)
+ << ": for point " << i << ", cuda -> "
+ << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot;
+ }
+
+ const bool quad_reversed_border = dot < 0;
+
+ VLOG(1) << "For point " << i << ", cuda -> "
+ << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot
+ << " size " << points.size() << " border "
+ << (!(!reversed_border_ && quad_reversed_border) &&
+ !(!normal_border_ && !quad_reversed_border))
+ << " area: "
+ << (extents.max_x - extents.min_x) *
+ (extents.max_y - extents.min_y)
+ << " min_area: " << min_tag_width_;
+
+ if (good_blob_size && !(!reversed_border_ && quad_reversed_border) &&
+ !(!normal_border_ && !quad_reversed_border)) {
+ ++selected_quads;
+ for (size_t j = 0; j < points.size(); ++j) {
+ IndexPoint pt(i, points[j].point_bits());
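+          // Encode the angle as a fixed point integer: shift atan2 from
+          // [-pi, pi) up into [0, 2 * pi) and scale it so sorting by the
+          // integer theta walks the boundary counter-clockwise around the
+          // blob center. This is expected to match the CUDA encoding.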
+ float theta =
+ (atan2f(pt.y() - extents.cy(), pt.x() - extents.cx()) + M_PI) *
+ 8e6;
+ long long int theta_int = llrintf(theta);
+
+ pt.set_theta(theta_int);
+ selected_blobs.emplace_back(pt);
+ }
+ }
+
+ starting_offset += points.size();
+ ++i;
+ }
+ }
+ return std::make_pair(selected_quads, selected_blobs);
+ }
+
+  // Tests that the filtered point lists match, blob index by blob index.
+ void CheckFilteredCudaPoints(
+ const std::vector<IndexPoint> &selected_blobs,
+ const std::vector<IndexPoint> &selected_blobs_cuda) const {
+ CHECK_EQ(selected_blobs.size(), selected_blobs_cuda.size());
+ for (size_t i = 0;
+ i < std::min(selected_blobs.size(), selected_blobs_cuda.size()); ++i) {
+ VLOG(1) << "Got blob[" << i << "] -> " << selected_blobs[i] << " vs "
+ << selected_blobs_cuda[i];
+ CHECK_EQ(selected_blobs[i].blob_index(),
+ selected_blobs_cuda[i].blob_index());
+ }
+ }
+
+ // Tests that the cuda sorted point list matches C++.
+ void CheckSortedFilteredCudaPoints(
+ const std::vector<std::vector<QuadBoundaryPoint>>
+ &slope_sorted_expected_grouped_points,
+ size_t selected_quads, const std::vector<IndexPoint> &selected_blobs,
+ const std::vector<IndexPoint> &sorted_selected_blobs_cuda) const {
+ CHECK_EQ(selected_blobs.size(), sorted_selected_blobs_cuda.size());
+ const std::vector<std::vector<IndexPoint>> cuda_grouped_points =
+ CudaGroupedPoints(sorted_selected_blobs_cuda);
+
+ CHECK_EQ(cuda_grouped_points.size(), selected_quads);
+ CHECK_EQ(slope_sorted_expected_grouped_points.size(), selected_quads);
+
+    size_t mismatched_points = 0;
+ for (size_t i = 0; i < cuda_grouped_points.size(); ++i) {
+ const std::vector<IndexPoint> &cuda_grouped_blob = cuda_grouped_points[i];
+ const std::vector<QuadBoundaryPoint> &slope_sorted_points =
+ slope_sorted_expected_grouped_points[i];
+
+ CHECK_EQ(cuda_grouped_blob.size(), slope_sorted_points.size());
+ if (VLOG_IS_ON(1) && cuda_grouped_blob[0].blob_index() == 73) {
+ for (size_t j = 0; j < cuda_grouped_points[i].size(); ++j) {
+ LOG(INFO) << "For blob " << cuda_grouped_blob[0].blob_index()
+ << ", got " << cuda_grouped_blob[j] << " ("
+ << cuda_grouped_blob[j].x() << ", "
+ << cuda_grouped_blob[j].y() << ") expected "
+ << slope_sorted_points[j] << " ("
+ << slope_sorted_points[j].x() << ", "
+ << slope_sorted_points[j].y() << ")";
+ }
+ }
+      size_t mismatched_runs = 0;
+ for (size_t j = 0; j < cuda_grouped_points[i].size(); ++j) {
+ if (cuda_grouped_blob[j].x() != slope_sorted_points[j].x() ||
+ cuda_grouped_blob[j].y() != slope_sorted_points[j].y()) {
+          ++mismatched_points;
+          ++mismatched_runs;
+ // We shouldn't see a lot of points in a row which don't match.
+          CHECK_LE(mismatched_runs, 4u);
+          VLOG(1) << "Mismatched point in blob "
+ << cuda_grouped_blob[0].blob_index() << ", point " << j
+ << " (" << cuda_grouped_blob[j].x() << ", "
+ << cuda_grouped_blob[j].y() << ") vs ("
+ << slope_sorted_points[j].x() << ", "
+ << slope_sorted_points[j].y() << ")";
+ } else {
+          mismatched_runs = 0;
+ }
+ }
+ }
+
+ // Or a lot of points overall. The slope algo has duplicate points, and
+ // is occasionally wrong.
+    CHECK_LE(mismatched_points, 50u);
+ }
+
+ // Checks that the GPU and CPU algorithms match.
+ void Check(cv::Mat color_image) {
+ cv::Mat gray_image(color_image.size(), CV_8UC1);
+ cv::cvtColor(color_image, gray_image, cv::COLOR_YUV2GRAY_YUYV);
+
+ image_u8_t gray_im = ToImageu8t(gray_image);
+ CheckImage(gray_im, ToImageu8t(gray_cuda_), "gray_cuda");
+
+ image_u8_t *quad_im =
+ image_u8_decimate(&gray_im, tag_detector_->quad_decimate);
+ CheckImage(*quad_im, ToImageu8t(decimated_cuda_), "decimated_cuda");
+
+ image_u8_t *thresholded_im = threshold(tag_detector_, quad_im);
+ CheckImage(ToImageu8t(thresholded_cuda_), *thresholded_im,
+ "threshold_cuda");
+
+ unionfind_t *uf = connected_components(
+ tag_detector_, thresholded_im, thresholded_im->width,
+ thresholded_im->height, thresholded_im->stride);
+
+ // Checks union finding.
+ auto blob_sizes = CheckUnionfind(
+ uf, union_markers_,
+ reinterpret_cast<const uint32_t *>(union_markers_size_.data),
+ thresholded_cuda_);
+
+ // Now make sure our unfiltered and partially sorted lists of pairs are
+ // plausible.
+ const std::vector<QuadBoundaryPoint> expected_union_marker_pair =
+ CheckUnionMarkerPairs(blob_sizes);
+ const std::vector<QuadBoundaryPoint> expected_compressed_union_marker_pair =
+ CheckCompressedUnionMarkerPairs(expected_union_marker_pair,
+ compressed_union_marker_pair_,
+ sorted_union_marker_pair_);
+
+ const std::vector<std::vector<QuadBoundaryPoint>> cuda_grouped_points =
+ CudaGroupedPoints(sorted_union_marker_pair_);
+ PrintCudaPoints(cuda_grouped_points, blob_sizes);
+
+ const std::vector<std::vector<QuadBoundaryPoint>>
+ slope_sorted_expected_grouped_points = FilterBlobs(CudaGroupedPoints(
+ SlopeSortPoints(expected_compressed_union_marker_pair)));
+
+ const std::vector<std::vector<uint64_t>> april_grouped_points =
+ AprilRoboticsPoints(thresholded_im, uf);
+
+ LOG(INFO) << "Found " << april_grouped_points.size()
+ << " clusters with april robotics with "
+ << std::accumulate(
+ april_grouped_points.begin(), april_grouped_points.end(),
+ 0, [](int size, const auto &v) { return size + v.size(); })
+ << " points.";
+ LOG(INFO) << "Found " << cuda_grouped_points.size()
+ << " clusters with cuda with "
+ << num_compressed_union_marker_pair_ << " points.";
+
+    // Verify that aprilrobotics and CUDA group points the same way. Ignore
+    // order.
+ CheckAprilRoboticsAndCudaContainSamePoints(april_grouped_points,
+ cuda_grouped_points);
+
+ // Test that the extents are reasonable.
+ size_t selected_quads;
+ std::vector<IndexPoint> selected_blobs;
+ std::tie(selected_quads, selected_blobs) = CheckCudaExtents(
+ cuda_grouped_points, extents_cuda_, reduced_dot_blobs_pair_cuda_);
+
+ // And that the filtered list is reasonable.
+ CheckFilteredCudaPoints(selected_blobs, selected_blobs_cuda_);
+
+ // And that they sorted right too.
+ CheckSortedFilteredCudaPoints(slope_sorted_expected_grouped_points,
+ selected_quads, selected_blobs,
+ sorted_selected_blobs_cuda_);
+
+    // TODO(austin): Check that the selected extents are right.
+ LOG(INFO) << "Found slope sorted count: "
+ << sorted_union_marker_pair_.size();
+
+ unionfind_destroy(uf);
+ image_u8_destroy(quad_im);
+ image_u8_destroy(thresholded_im);
+ }
+
+ // Writes images out to /tmp for debugging.
+ void WriteDebug(cv::Mat color_image) {
+ cv::Mat bgr(color_image.size(), CV_8UC3);
+ cv::cvtColor(color_image, bgr, cv::COLOR_YUV2BGR_YUYV);
+ cv::imwrite("/tmp/debug_color_image.png", bgr);
+
+ cv::imwrite("/tmp/debug_halide_grey_cuda.png", gray_cuda_);
+ cv::imwrite("/tmp/debug_halide_grey_decimated.png", decimated_cuda_);
+ cv::imwrite("/tmp/debug_cuda_thresholded.png", thresholded_cuda_);
+
+ {
+ image_u8_t thresholded_halide_im = ToImageu8t(thresholded_cuda_);
+
+ unionfind_t *uf = connected_components(
+ tag_detector_, &thresholded_halide_im, thresholded_halide_im.width,
+ thresholded_halide_im.height, thresholded_halide_im.stride);
+
+ std::map<uint32_t, uint32_t> april_to_cuda_id_remap;
+ std::map<uint32_t, uint32_t> cuda_to_april_id_remap;
+
+ std::map<uint32_t, cv::Vec3b> colors;
+
+ // Seed the random number generator so we get the same images out each
+ // time for easier comparison.
+ srandom(0);
+
+ // Write out the union find image.
+ size_t color_count = 1;
+ uint32_t max_color = 0;
+ cv::Mat unionfind_image(union_markers_.size(), CV_8UC3);
+ cv::Mat unionfind_image_common(union_markers_.size(), CV_8UC3);
+ for (int y = 0; y < union_markers_.rows; ++y) {
+ for (int x = 0; x < union_markers_.cols; ++x) {
+ const uint32_t april_robotics_id =
+ unionfind_get_representative(uf, y * width_ / 2 + x);
+ const uint32_t index = union_markers_.at<uint32_t>(y, x);
+
+ bool one_to_one_blob_match = false;
+
+ {
+ auto it = cuda_to_april_id_remap.find(index);
+ if (it == cuda_to_april_id_remap.end()) {
+ // Now, check we don't have a match the other way.
+ auto it_back = april_to_cuda_id_remap.find(april_robotics_id);
+ if (it_back == april_to_cuda_id_remap.end()) {
+ one_to_one_blob_match = true;
+ cuda_to_april_id_remap.emplace(index, april_robotics_id);
+ april_to_cuda_id_remap.emplace(april_robotics_id, index);
+ } else {
+ one_to_one_blob_match = false;
+ }
+ } else {
+ auto it_back = april_to_cuda_id_remap.find(april_robotics_id);
+ if (it_back == april_to_cuda_id_remap.end()) {
+ one_to_one_blob_match = false;
+ } else {
+ if (it->second == april_robotics_id &&
+ it_back->second == index) {
+ one_to_one_blob_match = true;
+ }
+ }
+ }
+ }
+
+ cv::Vec3b color;
+
+ auto it = colors.find(index);
+ if (it == colors.end()) {
+ max_color = std::max(max_color, index);
+ ++color_count;
+ VLOG(2) << "New color 0x" << std::hex << index;
+ const int bias = 50;
+ uint8_t r = bias + (random() % (200 - bias));
+ uint8_t g = bias + (random() % (200 - bias));
+ uint8_t b = bias + (random() % (200 - bias));
+
+ color = cv::Vec3b(b, g, r);
+ colors[index] = color;
+ } else {
+ color = it->second;
+ }
+
+ unionfind_image.at<cv::Vec3b>(y, x) = color;
+
+ if (one_to_one_blob_match) {
+ if (index == static_cast<uint32_t>(x + y * union_markers_.cols)) {
+ color = cv::Vec3b(0, 0, 0);
+ } else if (thresholded_cuda_.at<uint8_t>(y, x) == 0) {
+ color = cv::Vec3b(0, 0, 0);
+ } else {
+ color = cv::Vec3b(255, 255, 255);
+ }
+ }
+
+ CHECK(one_to_one_blob_match);
+
+ unionfind_image_common.at<cv::Vec3b>(y, x) = color;
+ }
+ }
+ LOG(INFO) << "Found " << color_count << " colors with a max index of "
+ << max_color;
+ cv::imwrite("/tmp/debug_cuda_segmentation.png", unionfind_image);
+ cv::imwrite("/tmp/debug_cuda_segmentation_common.png",
+ unionfind_image_common);
+
+ unionfind_destroy(uf);
+ }
+
+ {
+ // And write out the marker pairs image too.
+ std::map<uint64_t, cv::Vec3b> colors;
+
+ cv::Mat sorted_marker_pair(cv::Size((width_), (height_)), CV_8UC3);
+ sorted_marker_pair.setTo(cv::Scalar(0, 0, 0));
+
+ for (size_t i = 0; i < sorted_union_marker_pair_.size();) {
+ size_t blob_size = 0;
+ for (size_t j = i; j < sorted_union_marker_pair_.size(); ++j) {
+ if (sorted_union_marker_pair_[i].rep01() !=
+ sorted_union_marker_pair_[j].rep01()) {
+ break;
+ }
+ ++blob_size;
+ }
+ for (size_t j = i; j < sorted_union_marker_pair_.size(); ++j) {
+ if (sorted_union_marker_pair_[i].rep01() !=
+ sorted_union_marker_pair_[j].rep01()) {
+ break;
+ }
+ QuadBoundaryPoint pair = sorted_union_marker_pair_[j];
+
+ const size_t x = pair.x();
+ const size_t y = pair.y();
+
+ auto it = colors.find(pair.rep01());
+ cv::Vec3b color;
+ if (it == colors.end()) {
+ VLOG(2) << "New color 0x" << std::hex << pair.rep01();
+ const int bias = 50;
+ uint8_t r = bias + (random() % (200 - bias));
+ uint8_t g = bias + (random() % (200 - bias));
+ uint8_t b = bias + (random() % (200 - bias));
+
+ color = cv::Vec3b(b, g, r);
+ colors[pair.rep01()] = color;
+ } else {
+ color = it->second;
+ }
+ sorted_marker_pair.at<cv::Vec3b>(y + 2, x + 2) = color;
+ }
+ i += blob_size;
+ }
+ cv::imwrite("/tmp/debug_cuda_marker_pairs.png", sorted_marker_pair);
+ }
+ }
+
+ private:
+ apriltag_family_t *tag_family_;
+ apriltag_detector_t *tag_detector_;
+
+ cv::Mat gray_cuda_;
+ cv::Mat decimated_cuda_;
+ cv::Mat thresholded_cuda_;
+
+ cv::Mat union_markers_;
+ cv::Mat union_markers_size_;
+
+ std::vector<QuadBoundaryPoint> union_marker_pair_;
+ std::vector<QuadBoundaryPoint> compressed_union_marker_pair_;
+ std::vector<QuadBoundaryPoint> sorted_union_marker_pair_;
+ std::vector<uint32_t> quad_length_;
+ std::vector<MinMaxExtents> extents_cuda_;
+ std::vector<float> reduced_dot_blobs_pair_cuda_;
+ std::vector<IndexPoint> selected_blobs_cuda_;
+ std::vector<IndexPoint> sorted_selected_blobs_cuda_;
+ int num_compressed_union_marker_pair_ = 0;
+ int num_quads_ = 0;
+
+ GpuDetector gpu_detector_;
+
+ size_t width_;
+ size_t height_;
+
+ bool normal_border_ = false;
+ bool reversed_border_ = false;
+ int min_tag_width_ = 1000000;
+};
+
+class AprilDetectionTest : public ::testing::Test {
+ public:
+ aos::FlatbufferVector<frc971::vision::CameraImage> ReadImage(
+ std::string_view path) {
+ return aos::FileToFlatbuffer<frc971::vision::CameraImage>(
+ "../apriltag_test_bfbs_images/" + std::string(path));
+ }
+
+ cv::Mat ToMat(const frc971::vision::CameraImage *image) {
+ cv::Mat color_image(cv::Size(image->cols(), image->rows()), CV_8UC2,
+ (void *)image->data()->data());
+ return color_image;
+ }
+};
+
+TEST_F(AprilDetectionTest, ImageRepeat) {
+ auto image = ReadImage("bfbs-capture-2013-01-18_08-54-09.501047728.bfbs");
+
+ LOG(INFO) << "Image is: " << image.message().cols() << " x "
+ << image.message().rows();
+
+ CudaAprilTagDetector cuda_detector(image.message().cols(),
+ image.message().rows());
+
+ const cv::Mat color_image = ToMat(&image.message());
+
+ for (size_t i = 0; i < 10; ++i) {
+ LOG(INFO) << "Attempt " << i;
+ cuda_detector.DetectGPU(color_image.clone());
+ cuda_detector.DetectCPU(color_image.clone());
+ cuda_detector.Check(color_image.clone());
+ if (FLAGS_debug) {
+ cuda_detector.WriteDebug(color_image);
+ }
+ }
+}
+
+class SingleAprilDetectionTest
+ : public ::testing::WithParamInterface<std::string_view>,
+ public AprilDetectionTest {};
+
+// Tests a single image.
+TEST_P(SingleAprilDetectionTest, Image) {
+ auto image = ReadImage(GetParam());
+
+ LOG(INFO) << "Testing " << GetParam() << " with dimensions "
+ << image.message().cols() << " x " << image.message().rows();
+
+ CudaAprilTagDetector cuda_detector(image.message().cols(),
+ image.message().rows());
+
+ const cv::Mat color_image = ToMat(&image.message());
+
+ cuda_detector.DetectGPU(color_image.clone());
+ cuda_detector.DetectCPU(color_image.clone());
+ cuda_detector.Check(color_image.clone());
+ if (FLAGS_debug) {
+ cuda_detector.WriteDebug(color_image);
+ }
+}
+
+// Tests that both algorithms agree on a bunch of pixels.
+INSTANTIATE_TEST_SUITE_P(
+ CapturedImages, SingleAprilDetectionTest,
+ ::testing::Values("bfbs-capture-2013-01-18_08-54-16.869057537.bfbs",
+ "bfbs-capture-2013-01-18_08-54-09.501047728.bfbs"
+ // TODO(austin): Figure out why these fail...
+ //"bfbs-capture-2013-01-18_08-51-24.861065764.bfbs",
+ //"bfbs-capture-2013-01-18_08-52-01.846912552.bfbs",
+ //"bfbs-capture-2013-01-18_08-52-33.462848049.bfbs",
+ //"bfbs-capture-2013-01-18_08-54-24.931661979.bfbs",
+ //"bfbs-capture-2013-01-18_09-29-16.806073486.bfbs",
+ //"bfbs-capture-2013-01-18_09-33-00.993756514.bfbs",
+ //"bfbs-capture-2013-01-18_08-57-00.171120695.bfbs"
+ //"bfbs-capture-2013-01-18_08-57-17.858752817.bfbs",
+ //"bfbs-capture-2013-01-18_08-57-08.096597542.bfbs"
+ ));
+
+// Tests that QuadBoundaryPoint doesn't corrupt data.
+TEST(QuadBoundaryPoint, MasksWork) {
+ std::mt19937 generator(aos::testing::RandomSeed());
+ std::uniform_int_distribution<uint32_t> random_rep_scalar(0, 0xfffff);
+ std::uniform_int_distribution<uint32_t> random_point_scalar(0, 0x3ff);
+ std::uniform_int_distribution<uint32_t> random_dxy_scalar(0, 3);
+ std::uniform_int_distribution<uint32_t> random_bool(0, 1);
+
+ QuadBoundaryPoint point;
+
+ EXPECT_EQ(point.key, 0);
+
+ for (int i = 0; i < 25; ++i) {
+ const uint32_t rep0 = random_rep_scalar(generator);
+ for (int j = 0; j < 25; ++j) {
+ const uint32_t rep1 = random_rep_scalar(generator);
+ for (int k = 0; k < 25; ++k) {
+ const uint32_t x = random_point_scalar(generator);
+ const uint32_t y = random_point_scalar(generator);
+        for (int n = 0; n < 25; ++n) {
+ const uint32_t dxy = random_dxy_scalar(generator);
+ for (int m = 0; m < 2; ++m) {
+ const bool black_to_white = random_bool(generator) == 1;
+ if (point.rep0() != rep0) {
+ point.set_rep0(rep0);
+ }
+ if (point.rep1() != rep1) {
+ point.set_rep1(rep1);
+ }
+ if (point.base_x() != x || point.base_y() != y) {
+ point.set_base_xy(x, y);
+ }
+ switch (dxy) {
+ case 0:
+ if (point.dx() != 1 || point.dy() != 0) {
+ point.set_dxy(dxy);
+ }
+ break;
+ case 1:
+ if (point.dx() != 1 || point.dy() != 1) {
+ point.set_dxy(dxy);
+ }
+ break;
+ case 2:
+ if (point.dx() != 0 || point.dy() != 1) {
+ point.set_dxy(dxy);
+ }
+ break;
+ case 3:
+ if (point.dx() != -1 || point.dy() != 1) {
+ point.set_dxy(dxy);
+ }
+ break;
+ }
+ if (black_to_white != point.black_to_white()) {
+ point.set_black_to_white(black_to_white);
+ }
+
+ ASSERT_EQ(point.rep0(), rep0);
+ ASSERT_EQ(point.rep1(), rep1);
+ ASSERT_EQ(point.base_x(), x);
+ ASSERT_EQ(point.base_y(), y);
+ switch (dxy) {
+ case 0:
+ ASSERT_EQ(point.dx(), 1);
+ ASSERT_EQ(point.dy(), 0);
+ break;
+ case 1:
+ ASSERT_EQ(point.dx(), 1);
+ ASSERT_EQ(point.dy(), 1);
+ break;
+ case 2:
+ ASSERT_EQ(point.dx(), 0);
+ ASSERT_EQ(point.dy(), 1);
+ break;
+ case 3:
+ ASSERT_EQ(point.dx(), -1);
+ ASSERT_EQ(point.dy(), 1);
+ break;
+ }
+ ASSERT_EQ(point.x(), x * 2 + point.dx());
+ ASSERT_EQ(point.y(), y * 2 + point.dy());
+
+ ASSERT_EQ(point.black_to_white(), black_to_white);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace frc971::apriltag::testing
diff --git a/frc971/orin/gpu_image.h b/frc971/orin/gpu_image.h
new file mode 100644
index 0000000..732b256
--- /dev/null
+++ b/frc971/orin/gpu_image.h
@@ -0,0 +1,14 @@
+#ifndef FRC971_ORIN_GPU_IMAGE_H_
+#define FRC971_ORIN_GPU_IMAGE_H_
+
+template <typename T>
+struct GpuImage {
+ typedef T type;
+ T *data;
+ size_t rows;
+ size_t cols;
+ // Step is in elements, not bytes.
+ size_t step;
+};
+
+#endif // FRC971_ORIN_GPU_IMAGE_H_
diff --git a/frc971/orin/labeling_allegretti_2019_BKE.cc b/frc971/orin/labeling_allegretti_2019_BKE.cc
new file mode 100644
index 0000000..2806199
--- /dev/null
+++ b/frc971/orin/labeling_allegretti_2019_BKE.cc
@@ -0,0 +1,494 @@
+// Copyright (c) 2020, the YACCLAB contributors, as
+// shown by the AUTHORS file. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// See
+// https://iris.unimore.it/bitstream/11380/1179616/1/2018_TPDS_Optimized_Block_Based_Algorithms_to_Label_Connected_Components_on_GPUs.pdf
+// for more details.
+//
+// This file was started from the YACCLAB implementation, but is now quite
+// different to match what the april tag detection algorithm wants. The notable
+// changes are:
+// * 4 way connectivity on the background at the same time as the foreground.
+// * 0 and 255 are the only colors considered, 127 means "I'm my own blob".
+// * Blob size is also computed in parallel using atomics to aid blob size
+// filtering.
+//
+
+#include "frc971/orin/labeling_allegretti_2019_BKE.h"
+
+#include "glog/logging.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
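+// Each thread processes one 2x2 pixel block, so each BLOCK_COLS x BLOCK_ROWS
+// thread block covers a 32x32 pixel tile of the image.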
+#define BLOCK_ROWS 16
+#define BLOCK_COLS 16
+
+namespace {
+
+// This is a block-based algorithm.
+// Blocks are 2x2 sized, with internal pixels named as:
+// +---+
+// |a b|
+// |c d|
+// +---+
+//
+// Neighbour blocks of block X are named as:
+// +-+-+-+
+// |P|Q|R|
+// +-+-+-+
+// |S|X|
+// +-+-+
+
+enum class Info : uint8_t {
+ a = 0,
+ b = 1,
+ c = 2,
+ d = 3,
+ P = 4,
+ Q = 5,
+ R = 6,
+ S = 7
+};
+
+// Only use it with unsigned numeric types
+template <typename T>
+__device__ __forceinline__ uint8_t HasBit(const T bitmap, Info pos) {
+ return (bitmap >> static_cast<uint8_t>(pos)) & 1;
+}
+
+template <typename T>
+__device__ __forceinline__ uint8_t HasBit(const T bitmap, uint8_t pos) {
+ return (bitmap >> pos) & 1;
+}
+
+// Only use it with unsigned numeric types
+__device__ __forceinline__ void SetBit(uint8_t &bitmap, Info pos) {
+ bitmap |= (1 << static_cast<uint8_t>(pos));
+}
+
+// Returns the root index of the UFTree
+__device__ uint32_t Find(const uint32_t *s_buf, uint32_t n) {
+ while (s_buf[n] != n) {
+ n = s_buf[n];
+ }
+ return n;
+}
+
+// Returns the root index of the UFTree, re-assigning ourselves as we go.
+__device__ uint32_t FindAndCompress(uint32_t *s_buf, uint32_t n) {
+ uint32_t id = n;
+ while (s_buf[n] != n) {
+ n = s_buf[n];
+ s_buf[id] = n;
+ }
+ return n;
+}
+
+// Merges the UFTrees of a and b, linking one root to the other
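+// The merge is lock free: the smaller root is written over the larger one
+// with atomicMin, and the loop retries with the updated roots if another
+// thread got there first.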
+__device__ void Union(uint32_t *s_buf, uint32_t a, uint32_t b) {
+ bool done;
+
+ do {
+ a = Find(s_buf, a);
+ b = Find(s_buf, b);
+
+ if (a < b) {
+ uint32_t old = atomicMin(s_buf + b, a);
+ done = (old == b);
+ b = old;
+ } else if (b < a) {
+ uint32_t old = atomicMin(s_buf + a, b);
+ done = (old == a);
+ a = old;
+ } else {
+ done = true;
+ }
+
+ } while (!done);
+}
+
+// Initializes the labels in an image to hold the masks, info, and UF tree
+// pointers needed for the next steps.
+__global__ void InitLabeling(const GpuImage<uint8_t> img,
+ GpuImage<uint32_t> labels) {
+ const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t img_index = row * img.step + col;
+ const uint32_t foreground_labels_index = row * labels.step + col;
+ const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+ if (row >= labels.rows || col >= labels.cols) {
+ return;
+ }
+
+ uint32_t P_foreground = 0;
+ uint32_t P_background = 0;
+
+ // Bitmask representing two kinds of information
+ // Bits 0, 1, 2, 3 are set if pixel a, b, c, d are foreground, respectively
+ // Bits 4, 5, 6, 7 are set if block P, Q, R, S need to be merged to X in
+ // Merge phase
+ uint8_t info_foreground = 0;
+ uint8_t info_left_background = 0;
+ uint8_t info_right_background = 0;
+
+ uint8_t buffer alignas(int)[4];
+ *(reinterpret_cast<int *>(buffer)) = 0;
+
+ // Read pairs of consecutive values in memory at once
+ // This does not depend on endianness
+ *(reinterpret_cast<uint16_t *>(buffer)) =
+ *(reinterpret_cast<uint16_t *>(img.data + img_index));
+
+ *(reinterpret_cast<uint16_t *>(buffer + 2)) =
+ *(reinterpret_cast<uint16_t *>(img.data + img_index + img.step));
+
+ // P is a bitmask saying where to check.
+ //
+ // 0 1 2 3
+ // +----+
+ // 4 |a b| 7
+ // 8 |c d| 11
+ // +----+
+ // 12 13 14 15
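+  //
+  // 0x777 marks pixel a and its 8 way neighbourhood (foreground
+  // connectivity); 0x272 marks a and just its 4 way neighbours (background
+  // connectivity). The << 1 and << 4 variants below are the same masks
+  // centered on b and c.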
+ if (buffer[0] == 255u) {
+ P_foreground |= 0x777;
+ SetBit(info_foreground, Info::a);
+ } else if (buffer[0] == 0u) {
+    // This is the background; we are only doing 4 way connectivity, so only
+    // look in the 4 directions.
+ P_background |= 0x272;
+ SetBit(info_left_background, Info::a);
+ }
+
+ if (buffer[1] == 255u) {
+ P_foreground |= (0x777 << 1);
+ SetBit(info_foreground, Info::b);
+ } else if (buffer[1] == 0u) {
+ P_background |= (0x272 << 1);
+ SetBit(info_right_background, Info::b);
+ }
+
+ if (buffer[2] == 255u) {
+ P_foreground |= (0x777 << 4);
+ SetBit(info_foreground, Info::c);
+ } else if (buffer[2] == 0u) {
+ P_background |= (0x272 << 4);
+ SetBit(info_left_background, Info::c);
+ }
+
+ if (buffer[3] == 255u) {
+ SetBit(info_foreground, Info::d);
+ } else if (buffer[3] == 0u) {
+ SetBit(info_right_background, Info::d);
+ }
+
+ if (col == 0) {
+ P_foreground &= 0xEEEE;
+ P_background &= 0xEEEE;
+ }
+ if (col + 2 >= img.cols) {
+ P_foreground &= 0x7777;
+ P_background &= 0x7777;
+ }
+
+ if (row == 0) {
+ P_foreground &= 0xFFF0;
+ P_background &= 0xFFF0;
+ }
+ if (row + 2 >= img.rows) {
+ P_foreground &= 0x0FFF;
+ P_background &= 0x0FFF;
+ }
+
+ // P is now ready to be used to find neighbour blocks
+ // P value avoids range errors
+ int father_offset_foreground = 0;
+ int father_offset_left_background = 0;
+ int father_offset_right_background = 0;
+
+ // P square
+ if (HasBit(P_foreground, 0) && img.data[img_index - img.step - 1] == 255u) {
+ father_offset_foreground = -(2 * (labels.step) + 2);
+ }
+
+ // Q square
+ if ((HasBit(P_foreground, 1) && img.data[img_index - img.step] == 255u) ||
+ (HasBit(P_foreground, 2) && img.data[img_index + 1 - img.step] == 255u)) {
+ if (!father_offset_foreground) {
+ father_offset_foreground = -(2 * (labels.step));
+ } else {
+ SetBit(info_foreground, Info::Q);
+ }
+ }
+ if ((HasBit(P_background, 1) && img.data[img_index - img.step] == 0u)) {
+ father_offset_left_background = -2 * labels.step;
+ }
+ if ((HasBit(P_background, 2) && img.data[img_index + 1 - img.step] == 0u)) {
+ father_offset_right_background = -2 * labels.step;
+ }
+
+ // R square
+ if (HasBit(P_foreground, 3) && img.data[img_index + 2 - img.step] == 255u) {
+ if (!father_offset_foreground) {
+ father_offset_foreground = -(2 * (labels.step) - 2);
+ } else {
+ SetBit(info_foreground, Info::R);
+ }
+ }
+
+ // S square
+ if ((HasBit(P_foreground, 4) && img.data[img_index - 1] == 255u) ||
+ (HasBit(P_foreground, 8) && img.data[img_index + img.step - 1] == 255u)) {
+ if (!father_offset_foreground) {
+ father_offset_foreground = -2;
+ } else {
+ SetBit(info_foreground, Info::S);
+ }
+ }
+ if ((HasBit(P_background, 4) && img.data[img_index - 1] == 0u) ||
+ (HasBit(P_background, 8) && img.data[img_index + img.step - 1] == 0u)) {
+ if (!father_offset_left_background) {
+ father_offset_left_background = -1;
+ } else {
+ SetBit(info_left_background, Info::S);
+ }
+ }
+
+ if ((HasBit(info_left_background, Info::a) &&
+ HasBit(info_right_background, Info::b)) ||
+ (HasBit(info_left_background, Info::c) &&
+ HasBit(info_right_background, Info::d))) {
+ if (!father_offset_right_background) {
+ father_offset_right_background = -1;
+ } else {
+ SetBit(info_right_background, Info::S);
+ }
+ }
+
+ // Now, write everything back out to memory.
+ *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index) =
+ static_cast<uint64_t>(foreground_labels_index +
+ father_offset_foreground) +
+ (static_cast<uint64_t>(info_foreground) << 32) +
+ (static_cast<uint64_t>(info_left_background) << 40) +
+ (static_cast<uint64_t>(info_right_background) << 48);
+
+ *reinterpret_cast<uint64_t *>(labels.data + background_labels_index) =
+ static_cast<uint64_t>(background_labels_index +
+ father_offset_left_background) +
+ (static_cast<uint64_t>(background_labels_index +
+ father_offset_right_background + 1)
+ << 32);
+}
+
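+// Flattens the foreground label and both background labels of each 2x2 block
+// so they point directly at their roots.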
+__global__ void Compression(GpuImage<uint32_t> labels) {
+ const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t foreground_labels_index = row * labels.step + col;
+ const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+ if (row >= labels.rows || col >= labels.cols) {
+ return;
+ }
+
+ FindAndCompress(labels.data, foreground_labels_index);
+ FindAndCompress(labels.data, background_labels_index);
+ FindAndCompress(labels.data, background_labels_index + 1);
+}
+
+__global__ void Merge(GpuImage<uint32_t> labels) {
+ const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t foreground_labels_index = row * labels.step + col;
+ const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+ if (row >= labels.rows || col >= labels.cols) {
+ return;
+ }
+
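+  // The high 32 bits of the 64 bit word InitLabeling wrote for this block
+  // hold the three info bytes; read them back as the next 32 bit label slot.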
+ const uint32_t info = *(labels.data + foreground_labels_index + 1);
+
+ const uint8_t info_foreground = info & 0xff;
+ const uint8_t info_left_background = (info >> 8) & 0xff;
+ const uint8_t info_right_background = (info >> 16) & 0xff;
+
+ if (HasBit(info_foreground, Info::Q)) {
+ Union(labels.data, foreground_labels_index,
+ foreground_labels_index - 2 * (labels.step));
+ }
+
+ if (HasBit(info_foreground, Info::R)) {
+ Union(labels.data, foreground_labels_index,
+ foreground_labels_index - 2 * (labels.step) + 2);
+ }
+
+ if (HasBit(info_foreground, Info::S)) {
+ Union(labels.data, foreground_labels_index, foreground_labels_index - 2);
+ }
+
+ if (HasBit(info_left_background, Info::S)) {
+ Union(labels.data, background_labels_index, background_labels_index - 1);
+ }
+ if (HasBit(info_right_background, Info::S)) {
+ Union(labels.data, background_labels_index + 1, background_labels_index);
+ }
+}
+
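+// Expands the per-block labels back out to one label per pixel, and uses
+// atomics to count how many pixels belong to each representative so blobs can
+// later be filtered by size.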
+__global__ void FinalLabeling(GpuImage<uint32_t> labels,
+ GpuImage<uint32_t> union_markers_size_device) {
+ const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+ const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+ const uint32_t foreground_labels_index = row * labels.step + col;
+ const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+ if (row >= labels.rows || col >= labels.cols) {
+ return;
+ }
+
+ const uint64_t foreground_buffer =
+ *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index);
+ const uint32_t foreground_label = (foreground_buffer & (0xFFFFFFFF));
+ const uint8_t foreground_info = (foreground_buffer >> 32) & 0xFF;
+ const uint64_t background_buffer =
+ *reinterpret_cast<uint64_t *>(labels.data + background_labels_index);
+ const uint32_t background_left_label = (background_buffer & (0xFFFFFFFF));
+ const uint32_t background_right_label =
+ ((background_buffer >> 32) & (0xFFFFFFFF));
+ const uint8_t background_left_info = (foreground_buffer >> 40) & 0xFF;
+ const uint8_t background_right_info = (foreground_buffer >> 48) & 0xFF;
+
+ uint32_t a_label;
+ uint32_t b_label;
+ uint32_t c_label;
+ uint32_t d_label;
+
+ if ((foreground_info & 0xf) == 0u && (background_left_info & 0xf) == 0u &&
+ (background_right_info & 0xf) == 0u) {
+ a_label = foreground_labels_index;
+ b_label = foreground_labels_index + 1;
+ c_label = background_labels_index;
+ d_label = background_labels_index + 1;
+ *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index) =
+ (static_cast<uint64_t>(b_label) << 32) | a_label;
+ *reinterpret_cast<uint64_t *>(labels.data + background_labels_index) =
+ (static_cast<uint64_t>(d_label) << 32) | (c_label);
+ return;
+ } else {
+ a_label = (HasBit(foreground_info, Info::a) * foreground_label +
+ HasBit(background_left_info, Info::a) * background_left_label);
+ b_label = HasBit(foreground_info, Info::b) * foreground_label +
+ HasBit(background_right_info, Info::b) * background_right_label;
+ c_label = HasBit(foreground_info, Info::c) * foreground_label +
+ HasBit(background_left_info, Info::c) * background_left_label;
+ d_label = HasBit(foreground_info, Info::d) * foreground_label +
+ HasBit(background_right_info, Info::d) * background_right_label;
+
+ *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index) =
+ (static_cast<uint64_t>(b_label) << 32) | a_label;
+ *reinterpret_cast<uint64_t *>(labels.data + background_labels_index) =
+ (static_cast<uint64_t>(d_label) << 32) | (c_label);
+ }
+
+ if ((foreground_info & 0xf) != 0u) {
+ // We've got foreground!
+ size_t count = 0;
+ if (HasBit(foreground_info, Info::a)) {
+ ++count;
+ }
+ if (HasBit(foreground_info, Info::b)) {
+ ++count;
+ }
+ if (HasBit(foreground_info, Info::c)) {
+ ++count;
+ }
+ if (HasBit(foreground_info, Info::d)) {
+ ++count;
+ }
+
+ atomicAdd(union_markers_size_device.data + foreground_label, count);
+ }
+
+ if ((background_left_info & 0xf) == 0u &&
+ (background_right_info & 0xf) == 0u) {
+ return;
+ }
+
+ if ((background_left_info & 0xf) != 0u &&
+ (background_right_info & 0xf) != 0u &&
+ background_left_label == background_right_label) {
+ // They are all populated and match, go for it.
+ size_t count = 0;
+ if (HasBit(background_left_info, Info::a)) {
+ ++count;
+ }
+ if (HasBit(background_right_info, Info::b)) {
+ ++count;
+ }
+ if (HasBit(background_left_info, Info::c)) {
+ ++count;
+ }
+ if (HasBit(background_right_info, Info::d)) {
+ ++count;
+ }
+
+ atomicAdd(union_markers_size_device.data + background_left_label, count);
+ return;
+ }
+
+ if ((background_left_info & 0xf) != 0u) {
+ size_t count = 0;
+ if (HasBit(background_left_info, Info::a)) {
+ ++count;
+ }
+ if (HasBit(background_left_info, Info::c)) {
+ ++count;
+ }
+ atomicAdd(union_markers_size_device.data + background_left_label, count);
+ }
+
+ if ((background_right_info & 0xf) != 0u) {
+ size_t count = 0;
+ if (HasBit(background_right_info, Info::b)) {
+ ++count;
+ }
+ if (HasBit(background_right_info, Info::d)) {
+ ++count;
+ }
+ atomicAdd(union_markers_size_device.data + background_right_label, count);
+ }
+}
+
+} // namespace
+
+void LabelImage(const GpuImage<uint8_t> input, GpuImage<uint32_t> output,
+ GpuImage<uint32_t> union_markers_size_device,
+ cudaStream_t stream) {
+ CHECK_NE(input.rows, 1u);
+ CHECK_NE(input.cols, 1u);
+
+  // Need an even number of rows and columns; we don't need to solve the
+  // actual hard problems...
+ CHECK_EQ(input.rows % 2, 0u);
+ CHECK_EQ(input.cols % 2, 0u);
+
+ dim3 grid_size =
+ dim3((((input.cols + 1) / 2) + BLOCK_COLS - 1) / BLOCK_COLS,
+ (((input.rows + 1) / 2) + BLOCK_ROWS - 1) / BLOCK_ROWS, 1);
+ dim3 block_size = dim3(BLOCK_COLS, BLOCK_ROWS, 1);
+
+ InitLabeling<<<grid_size, block_size, 0, stream>>>(input, output);
+
+ Compression<<<grid_size, block_size, 0, stream>>>(output);
+
+ Merge<<<grid_size, block_size, 0, stream>>>(output);
+
+ Compression<<<grid_size, block_size, 0, stream>>>(output);
+
+ FinalLabeling<<<grid_size, block_size, 0, stream>>>(
+ output, union_markers_size_device);
+}
diff --git a/frc971/orin/labeling_allegretti_2019_BKE.h b/frc971/orin/labeling_allegretti_2019_BKE.h
new file mode 100644
index 0000000..4c87f7f
--- /dev/null
+++ b/frc971/orin/labeling_allegretti_2019_BKE.h
@@ -0,0 +1,12 @@
+#ifndef FRC971_ORIN_LABELING_ALLEGRETTI_2019_BKE_H_
+#define FRC971_ORIN_LABELING_ALLEGRETTI_2019_BKE_H_
+
+#include <cuda_runtime.h>
+
+#include "frc971/orin/gpu_image.h"
+
+void LabelImage(const GpuImage<uint8_t> input, GpuImage<uint32_t> output,
+ GpuImage<uint32_t> union_markers_size_device,
+ cudaStream_t stream);
+
+#endif // FRC971_ORIN_LABELING_ALLEGRETTI_2019_BKE_H_
diff --git a/frc971/orin/points.cc b/frc971/orin/points.cc
new file mode 100644
index 0000000..dbf5496
--- /dev/null
+++ b/frc971/orin/points.cc
@@ -0,0 +1,35 @@
+#include "frc971/orin/points.h"
+
+#include <iomanip>
+#include <ostream>
+
+namespace frc971 {
+namespace apriltag {
+
+std::ostream &operator<<(std::ostream &os, const QuadBoundaryPoint &point) {
+ std::ios_base::fmtflags original_flags = os.flags();
+
+ os << "key:" << std::hex << std::setw(16) << std::setfill('0') << point.key
+ << " rep01:" << std::setw(10) << point.rep01() << " pt:" << std::setw(6)
+ << point.point_bits();
+ os.flags(original_flags);
+ return os;
+}
+
+static_assert(sizeof(QuadBoundaryPoint) == 8,
+ "QuadBoundaryPoint didn't pack right.");
+
+std::ostream &operator<<(std::ostream &os, const IndexPoint &point) {
+ std::ios_base::fmtflags original_flags = os.flags();
+
+ os << "key:" << std::hex << std::setw(16) << std::setfill('0') << point.key
+ << " i:" << std::setw(3) << point.blob_index() << " t:" << std::setw(7)
+ << point.theta() << " p:" << std::setw(6) << point.point_bits();
+ os.flags(original_flags);
+ return os;
+}
+
+static_assert(sizeof(IndexPoint) == 8, "IndexPoint didn't pack right.");
+
+} // namespace apriltag
+} // namespace frc971
diff --git a/frc971/orin/points.h b/frc971/orin/points.h
new file mode 100644
index 0000000..312d90a
--- /dev/null
+++ b/frc971/orin/points.h
@@ -0,0 +1,295 @@
+#ifndef FRC971_ORIN_POINTS_H_
+#define FRC971_ORIN_POINTS_H_
+
+#include <stdint.h>
+
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cuda/std/tuple>
+#include <iomanip>
+#include <ostream>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the 2 adjacent blob IDs, a point in decimated image space, the
+// half pixel offset, and the gradient.
+//
+// rep0 and rep1 are the two blob ids, and are each allocated 20 bits.
+// point is the base point and is allocated 10 bits for x and 10 bits for y.
+// dx and dy are encoded together in 2 bits (the search direction), and can
+// only take on set values.
+// black_to_white captures the direction of the gradient in 1 bit.
+//
+// This adds up to 63 bits so we can load this with one big load.
+struct QuadBoundaryPoint {
+ static constexpr size_t kRepEndBit = 24;
+ static constexpr size_t kBitsInKey = 64;
+
+ __forceinline__ __host__ __device__ QuadBoundaryPoint() : key(0) {}
+
+ // Sets rep0, the 0th blob id. This only respects the bottom 20 bits.
+ __forceinline__ __host__ __device__ void set_rep0(uint32_t rep0) {
+ key = (key & 0xfffff00000ffffffull) |
+ (static_cast<uint64_t>(rep0 & 0xfffff) << 24);
+ }
+ // Returns rep0.
+ __forceinline__ __host__ __device__ uint32_t rep0() const {
+ return ((key >> 24) & 0xfffff);
+ }
+
+ // Sets rep1, the 1st blob id. This only respects the bottom 20 bits.
+ __forceinline__ __host__ __device__ void set_rep1(uint32_t rep1) {
+ key = (key & 0xfffffffffffull) |
+ (static_cast<uint64_t>(rep1 & 0xfffff) << 44);
+ }
+ // Returns rep1.
+ __forceinline__ __host__ __device__ uint32_t rep1() const {
+ return ((key >> 44) & 0xfffff);
+ }
+
+ // Returns both rep0 and rep1 concatenated into a single 40 bit number.
+ __forceinline__ __host__ __device__ uint64_t rep01() const {
+ return ((key >> 24) & 0xffffffffff);
+ }
+
+ // Returns all the bits used to hold position and gradient information.
+ __forceinline__ __host__ __device__ uint32_t point_bits() const {
+ return key & 0xffffff;
+ }
+
+ // Sets the 10 bit x and y.
+ __forceinline__ __host__ __device__ void set_base_xy(uint32_t x, uint32_t y) {
+ key = (key & 0xffffffffff00000full) |
+ (static_cast<uint64_t>(x & 0x3ff) << 14) |
+ (static_cast<uint64_t>(y & 0x3ff) << 4);
+ }
+
+ // Returns the 10 bit base x and y.
+ __forceinline__ __host__ __device__ uint32_t base_x() const {
+ return ((key >> 14) & 0x3ff);
+ }
+ __forceinline__ __host__ __device__ uint32_t base_y() const {
+ return ((key >> 4) & 0x3ff);
+ }
+
+ // Sets dxy, the integer representing which of the 4 search directions we
+ // went.
+ __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+ key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+ }
+
+ // Returns the change in x derived from the search direction.
+ __forceinline__ __host__ __device__ int32_t dx() const {
+ switch (key & 0x3) {
+ case 0:
+ return 1;
+ case 1:
+ return 1;
+ case 2:
+ return 0;
+ case 3:
+ return -1;
+ }
+ return 0;
+ }
+
+ // Returns the change in y derived from the search direction.
+ __forceinline__ __host__ __device__ int32_t dy() const {
+ switch (key & 0x3) {
+ case 0:
+ return 0;
+ case 1:
+ case 2:
+ case 3:
+ return 1;
+ }
+ return 0;
+ }
+
+ // Returns the un-decimated x and y positions.
+ __forceinline__ __host__ __device__ uint32_t x() const {
+ return static_cast<int32_t>(base_x() * 2) + dx();
+ }
+ __forceinline__ __host__ __device__ uint32_t y() const {
+ return static_cast<int32_t>(base_y() * 2) + dy();
+ }
+
+ // Returns the gradient that this point represents, taking into account which
+ // direction the color transitioned.
+ __forceinline__ __host__ __device__ int8_t gx() const {
+ return black_to_white() ? dx() : -dx();
+ }
+ __forceinline__ __host__ __device__ int8_t gy() const {
+ return black_to_white() ? dy() : -dy();
+ }
+
+ // Sets and gets the black to white or white to black bit.
+ __forceinline__ __host__ __device__ void set_black_to_white(
+ bool black_to_white) {
+ key = (key & 0xfffffffffffffff7ull) |
+ (static_cast<uint64_t>(black_to_white) << 3);
+ }
+ __forceinline__ __host__ __device__ bool black_to_white() const {
+ return (key & 0x8) != 0;
+ }
+
+ // Various operators to make it easy to compare points.
+ __forceinline__ __host__ __device__ bool operator!=(
+ const QuadBoundaryPoint other) const {
+ return other.key != key;
+ }
+ __forceinline__ __host__ __device__ bool operator==(
+ const QuadBoundaryPoint other) const {
+ return other.key == key;
+ }
+ __forceinline__ __host__ __device__ bool operator<(
+ const QuadBoundaryPoint other) const {
+ return key < other.key;
+ }
+
+ // Returns true if this point has been set. Zero is reserved for "invalid".
+ __forceinline__ __host__ __device__ bool nonzero() const {
+ return key != 0ull;
+ }
+
+ // Returns true if this point matches the other point.
+ bool near(QuadBoundaryPoint other) const { return other == *this; }
+
+ // The key. This shouldn't be parsed directly.
+ uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const QuadBoundaryPoint &point);
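
A minimal host-side sketch of the packing described above, showing the decimated base coordinate, the 2 bit search direction, and the gradient polarity all round-tripping through the single 64 bit key:

#include "frc971/orin/points.h"

#include "glog/logging.h"

void QuadBoundaryPointExample() {
  frc971::apriltag::QuadBoundaryPoint point;
  point.set_rep0(5);            // 0th blob id.
  point.set_rep1(9);            // 1st blob id.
  point.set_base_xy(100, 200);  // Decimated-image coordinate.
  point.set_dxy(1);             // Search direction 1 -> dx() == 1, dy() == 1.
  point.set_black_to_white(true);

  CHECK_EQ(point.rep0(), 5u);
  CHECK_EQ(point.rep1(), 9u);
  CHECK_EQ(point.x(), 201u);  // base_x() * 2 + dx()
  CHECK_EQ(point.y(), 401u);  // base_y() * 2 + dy()
  CHECK_EQ(point.gx(), 1);
  CHECK_EQ(point.gy(), 1);
}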
+
+// Holds a compacted blob index, the angle to the X axis from the center of the
+// blob, and the coordinate of the point.
+//
+// The blob index is 12 bits, the angle is 28 bits, and the point is 24 bits.
+struct IndexPoint {
+ // Max number of blob IDs we can hold.
+ static constexpr size_t kMaxBlobs = 2048;
+
+ static constexpr size_t kRepEndBit = 24;
+ static constexpr size_t kBitsInKey = 64;
+
+ __forceinline__ __host__ __device__ IndexPoint() : key(0) {}
+
+ // Constructor to build a point with just the blob index and point bits. The
+ // point bits should be grabbed from a QuadBoundaryPoint rather than built up
+ // by hand.
+ __forceinline__ __host__ __device__ IndexPoint(uint32_t blob_index,
+ uint32_t point_bits)
+ : key((static_cast<uint64_t>(blob_index & 0xfff) << 52) |
+ (static_cast<uint64_t>(point_bits & 0xffffff))) {}
+
+ // Sets and gets the 12 bit blob index.
+ __forceinline__ __host__ __device__ void set_blob_index(uint32_t blob_index) {
+ key = (key & 0x000fffffffffffffull) |
+ (static_cast<uint64_t>(blob_index & 0xfff) << 52);
+ }
+ __forceinline__ __host__ __device__ uint32_t blob_index() const {
+ return ((key >> 52) & 0xfff);
+ }
+
+ // Sets and gets the 28 bit angle.
+ __forceinline__ __host__ __device__ void set_theta(uint32_t theta) {
+ key = (key & 0xfff0000000ffffffull) |
+ (static_cast<uint64_t>(theta & 0xfffffff) << 24);
+ }
+ __forceinline__ __host__ __device__ uint32_t theta() const {
+ return ((key >> 24) & 0xfffffff);
+ }
+
+ // See QuadBoundaryPoint for a description of the rest of these.
+ __forceinline__ __host__ __device__ uint32_t base_x() const {
+ return ((key >> 14) & 0x3ff);
+ }
+ __forceinline__ __host__ __device__ uint32_t base_y() const {
+ return ((key >> 4) & 0x3ff);
+ }
+
+ __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+ key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+ }
+
+ __forceinline__ __host__ __device__ int32_t dx() const {
+ switch (key & 0x3) {
+ case 0:
+ return 1;
+ case 1:
+ return 1;
+ case 2:
+ return 0;
+ case 3:
+ return -1;
+ }
+ return 0;
+ }
+
+ __forceinline__ __host__ __device__ int32_t dy() const {
+ switch (key & 0x3) {
+ case 0:
+ return 0;
+ case 1:
+ case 2:
+ case 3:
+ return 1;
+ }
+ return 0;
+ }
+
+ __forceinline__ __host__ __device__ uint32_t x() const {
+ return static_cast<int32_t>(base_x() * 2) + dx();
+ }
+ __forceinline__ __host__ __device__ uint32_t y() const {
+ return static_cast<int32_t>(base_y() * 2) + dy();
+ }
+
+ __forceinline__ __host__ __device__ int8_t gx() const {
+ return black_to_white() ? dx() : -dx();
+ }
+ __forceinline__ __host__ __device__ int8_t gy() const {
+ return black_to_white() ? dy() : -dy();
+ }
+
+ __forceinline__ __host__ __device__ uint32_t point_bits() const {
+ return key & 0xffffff;
+ }
+
+ __forceinline__ __host__ __device__ void set_black_to_white(
+ bool black_to_white) {
+ key = (key & 0xfffffffffffffff7ull) |
+ (static_cast<uint64_t>(black_to_white) << 3);
+ }
+ __forceinline__ __host__ __device__ bool black_to_white() const {
+ return (key & 0x8) != 0;
+ }
+
+ // The key. This shouldn't be parsed directly.
+ uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const IndexPoint &point);
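
A similar sketch for IndexPoint: the low 24 position/gradient bits carry over verbatim from a QuadBoundaryPoint, while the two 20 bit blob reps are replaced with a compact 12 bit blob index and a 28 bit angle field:

#include "frc971/orin/points.h"

#include "glog/logging.h"

void IndexPointExample() {
  frc971::apriltag::QuadBoundaryPoint boundary;
  boundary.set_base_xy(100, 200);
  boundary.set_dxy(1);
  boundary.set_black_to_white(true);

  frc971::apriltag::IndexPoint indexed(/*blob_index=*/7,
                                       boundary.point_bits());
  indexed.set_theta(123456);

  CHECK_EQ(indexed.blob_index(), 7u);
  CHECK_EQ(indexed.theta(), 123456u);
  // The position and gradient decode identically to the boundary point.
  CHECK_EQ(indexed.x(), boundary.x());
  CHECK_EQ(indexed.y(), boundary.y());
  CHECK_EQ(indexed.gx(), boundary.gx());
  CHECK_EQ(indexed.gy(), boundary.gy());
}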
+
+// Decomposer for sorting which just returns the key.
+struct QuadBoundaryPointDecomposer {
+ __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+ QuadBoundaryPoint &key) const {
+ return {key.key};
+ }
+};
+
+// Decomposer for sorting which just returns the key.
+struct QuadIndexPointDecomposer {
+ __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+ IndexPoint &key) const {
+ return {key.key};
+ }
+};
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_POINTS_H_
diff --git a/frc971/orin/threshold.cc b/frc971/orin/threshold.cc
new file mode 100644
index 0000000..79ccced
--- /dev/null
+++ b/frc971/orin/threshold.cc
@@ -0,0 +1,206 @@
+#include "frc971/orin/threshold.h"
+
+#include <stdint.h>
+
+#include "frc971/orin/cuda.h"
+
+namespace frc971 {
+namespace apriltag {
+namespace {
+
+// 1280 -> 2 * 128 * 5
+// 720 -> 2 * 8 * 5 * 9
+//
+// 1456 -> 2 * 8 * 7 * 13
+// 1088 -> 2 * 32 * 17
+
+// Writes out the grayscale image and decimated image.
+__global__ void InternalCudaToGreyscaleAndDecimateHalide(
+ const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+ size_t width, size_t height) {
+ size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+ while (i < width * height) {
+ uint8_t pixel = gray_image[i] = color_image[i * 2];
+
+ const size_t row = i / width;
+ const size_t col = i - width * row;
+
+ // Copy over every other pixel.
+ if (row % 2 == 0 && col % 2 == 0) {
+ size_t decimated_row = row / 2;
+ size_t decimated_col = col / 2;
+ decimated_image[decimated_row * width / 2 + decimated_col] = pixel;
+ }
+ i += blockDim.x * gridDim.x;
+ }
+
+ // TODO(austin): Figure out how to load contiguous memory reasonably
+ // efficiently and max/min over it.
+
+ // TODO(austin): Can we do the threshold here too? That would be less memory
+ // bandwidth consumed...
+}
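
For reference, a scalar equivalent of the kernel above, assuming a 2-bytes-per-pixel YUYV-style input where the luma sits in every other byte (which is what the color_image[i * 2] read implies; the exact input format isn't spelled out in this file):

#include <cstddef>
#include <cstdint>

void ReferenceGreyscaleAndDecimate(const uint8_t *color_image,
                                   uint8_t *gray_image,
                                   uint8_t *decimated_image, size_t width,
                                   size_t height) {
  for (size_t row = 0; row < height; ++row) {
    for (size_t col = 0; col < width; ++col) {
      const size_t i = row * width + col;
      // Grab the luma byte for the grayscale image.
      const uint8_t pixel = gray_image[i] = color_image[i * 2];
      // Keep every other pixel in each direction for the decimated image.
      if (row % 2 == 0 && col % 2 == 0) {
        decimated_image[(row / 2) * (width / 2) + col / 2] = pixel;
      }
    }
  }
}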
+
+// Returns the min and max for a row of 4 pixels.
+__forceinline__ __device__ uchar2 minmax(uchar4 row) {
+ uint8_t min_val = std::min(std::min(row.x, row.y), std::min(row.z, row.w));
+ uint8_t max_val = std::max(std::max(row.x, row.y), std::max(row.z, row.w));
+ return make_uchar2(min_val, max_val);
+}
+
+// Returns the combined min and max of two min/max pairs.
+__forceinline__ __device__ uchar2 minmax(uchar2 val0, uchar2 val1) {
+ return make_uchar2(std::min(val0.x, val1.x), std::max(val0.y, val1.y));
+}
+
+// Returns the pixel index of a pixel at the provided x and y location.
+__forceinline__ __device__ size_t XYToIndex(size_t width, size_t x, size_t y) {
+ return width * y + x;
+}
+
+// Computes the min and max pixel value for each 4x4 block of pixels.
+__global__ void InternalBlockMinMax(const uint8_t *decimated_image,
+ uchar2 *unfiltered_minmax_image,
+ size_t width, size_t height) {
+ uchar2 vals[4];
+ const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+ const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ const uchar4 decimated_block = *reinterpret_cast<const uchar4 *>(
+ decimated_image + XYToIndex(width * 4, x * 4, y * 4 + i));
+
+ vals[i] = minmax(decimated_block);
+ }
+
+ unfiltered_minmax_image[XYToIndex(width, x, y)] =
+ minmax(minmax(vals[0], vals[1]), minmax(vals[2], vals[3]));
+}
+
+// Filters the min/max over the surrounding 3x3 neighborhood of blocks
+// centered on our location using min/max and writes the result back out.
+__global__ void InternalBlockFilter(const uchar2 *unfiltered_minmax_image,
+ uchar2 *minmax_image, size_t width,
+ size_t height) {
+ uchar2 result = make_uchar2(255, 0);
+
+ const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+ const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ // Iterate through the 3x3 set of points centered on the point this thread
+ // is responsible for, and compute the overall min/max.
+#pragma unroll
+ for (int i = -1; i <= 1; ++i) {
+#pragma unroll
+ for (int j = -1; j <= 1; ++j) {
+ const ssize_t read_x = x + i;
+ const ssize_t read_y = y + j;
+
+ if (read_x < 0 || read_x >= static_cast<ssize_t>(width)) {
+ continue;
+ }
+ if (read_y < 0 || read_y >= static_cast<ssize_t>(height)) {
+ continue;
+ }
+
+ result = minmax(
+ result, unfiltered_minmax_image[XYToIndex(width, read_x, read_y)]);
+ }
+ }
+
+ minmax_image[XYToIndex(width, x, y)] = result;
+}
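
A scalar reference of steps 2 and 3 combined, handy for checking the two kernels above against a CPU result. block_width and block_height are in 4x4 blocks (decimated dimensions divided by 4), matching how the kernels are launched below:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<uint8_t, uint8_t>> ReferenceMinMax(
    const uint8_t *decimated, size_t block_width, size_t block_height) {
  // Step 2: min/max over each 4x4 block of the decimated image.
  std::vector<std::pair<uint8_t, uint8_t>> unfiltered(block_width *
                                                      block_height);
  for (size_t by = 0; by < block_height; ++by) {
    for (size_t bx = 0; bx < block_width; ++bx) {
      uint8_t min_val = 255, max_val = 0;
      for (size_t y = 0; y < 4; ++y) {
        for (size_t x = 0; x < 4; ++x) {
          const uint8_t v =
              decimated[(by * 4 + y) * block_width * 4 + bx * 4 + x];
          min_val = std::min(min_val, v);
          max_val = std::max(max_val, v);
        }
      }
      unfiltered[by * block_width + bx] = {min_val, max_val};
    }
  }
  // Step 3: dilate by one block in each direction using min/max.
  std::vector<std::pair<uint8_t, uint8_t>> filtered(block_width * block_height,
                                                    {255, 0});
  for (size_t by = 0; by < block_height; ++by) {
    for (size_t bx = 0; bx < block_width; ++bx) {
      auto &out = filtered[by * block_width + bx];
      for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
          const long ny = static_cast<long>(by) + dy;
          const long nx = static_cast<long>(bx) + dx;
          if (ny < 0 || nx < 0 || ny >= static_cast<long>(block_height) ||
              nx >= static_cast<long>(block_width)) {
            continue;
          }
          const auto &in = unfiltered[ny * block_width + nx];
          out.first = std::min(out.first, in.first);
          out.second = std::max(out.second, in.second);
        }
      }
    }
  }
  return filtered;
}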
+
+// Thresholds the image based on the filtered min/max values.
+__global__ void InternalThreshold(const uint8_t *decimated_image,
+ const uchar2 *minmax_image,
+ uint8_t *thresholded_image, size_t width,
+ size_t height, size_t min_white_black_diff) {
+ size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+ while (i < width * height) {
+ const size_t x = i % width;
+ const size_t y = i / width;
+
+ const uchar2 minmax_val = minmax_image[x / 4 + (y / 4) * width / 4];
+
+ uint8_t result;
+ if (minmax_val.y - minmax_val.x < min_white_black_diff) {
+ result = 127;
+ } else {
+ uint8_t thresh = minmax_val.x + (minmax_val.y - minmax_val.x) / 2;
+ if (decimated_image[i] > thresh) {
+ result = 255;
+ } else {
+ result = 0;
+ }
+ }
+
+ thresholded_image[i] = result;
+ i += blockDim.x * gridDim.x;
+ }
+}
+
+} // namespace
+
+void CudaToGreyscaleAndDecimateHalide(
+ const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+ uint8_t *unfiltered_minmax_image, uint8_t *minmax_image,
+ uint8_t *thresholded_image, size_t width, size_t height,
+ size_t min_white_black_diff, CudaStream *stream) {
+ CHECK((width % 8) == 0);
+ CHECK((height % 8) == 0);
+ constexpr size_t kThreads = 256;
+ {
+ // Step 1, convert to gray and decimate.
+ size_t kBlocks = (width * height + kThreads - 1) / kThreads / 4;
+ InternalCudaToGreyscaleAndDecimateHalide<<<kBlocks, kThreads, 0,
+ stream->get()>>>(
+ color_image, gray_image, decimated_image, width, height);
+ MaybeCheckAndSynchronize();
+ }
+
+ size_t decimated_width = width / 2;
+ size_t decimated_height = height / 2;
+
+ {
+ // Step 2, compute a min/max for each block of 4x4 (16) pixels.
+ dim3 threads(16, 16, 1);
+ dim3 blocks((decimated_width / 4 + 15) / 16,
+ (decimated_height / 4 + 15) / 16, 1);
+
+ InternalBlockMinMax<<<blocks, threads, 0, stream->get()>>>(
+ decimated_image, reinterpret_cast<uchar2 *>(unfiltered_minmax_image),
+ decimated_width / 4, decimated_height / 4);
+ MaybeCheckAndSynchronize();
+
+ // Step 3, blur those min/maxes a further +/- 1 block in each direction
+ // using min/max.
+ InternalBlockFilter<<<blocks, threads, 0, stream->get()>>>(
+ reinterpret_cast<uchar2 *>(unfiltered_minmax_image),
+ reinterpret_cast<uchar2 *>(minmax_image), decimated_width / 4,
+ decimated_height / 4);
+ MaybeCheckAndSynchronize();
+ }
+
+ {
+ // Step 4, write out 127 if the min/max are too close to each other, or
+ // 0/255 if the pixels are above or below the average of the min/max.
+ size_t kBlocks = (width * height / 4 + kThreads - 1) / kThreads / 4;
+ InternalThreshold<<<kBlocks, kThreads, 0, stream->get()>>>(
+ decimated_image, reinterpret_cast<uchar2 *>(minmax_image),
+ thresholded_image, decimated_width, decimated_height,
+ min_white_black_diff);
+ MaybeCheckAndSynchronize();
+ }
+}
+
+} // namespace apriltag
+} // namespace frc971
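
For completeness, a scalar reference of the per-pixel decision InternalThreshold makes above, useful for checking the GPU output: min_val and max_val are the dilated min/max for the pixel's 4x4 block.

#include <cstddef>
#include <cstdint>

uint8_t ReferenceThreshold(uint8_t pixel, uint8_t min_val, uint8_t max_val,
                           size_t min_white_black_diff) {
  // Low-contrast block: can't confidently call it black or white.
  if (static_cast<size_t>(max_val - min_val) < min_white_black_diff) {
    return 127;
  }
  const uint8_t threshold = min_val + (max_val - min_val) / 2;
  return pixel > threshold ? 255 : 0;
}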
diff --git a/frc971/orin/threshold.h b/frc971/orin/threshold.h
new file mode 100644
index 0000000..5cdc3a2
--- /dev/null
+++ b/frc971/orin/threshold.h
@@ -0,0 +1,22 @@
+#ifndef FRC971_ORIN_THRESHOLD_H_
+#define FRC971_ORIN_THRESHOLD_H_
+
+#include <stdint.h>
+
+#include "frc971/orin/cuda.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Converts to grayscale, decimates, and thresholds an image on the provided
+// stream.
+void CudaToGreyscaleAndDecimateHalide(
+ const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+ uint8_t *unfiltered_minmax_image, uint8_t *minmax_image,
+ uint8_t *thresholded_image, size_t width, size_t height,
+ size_t min_white_black_diff, CudaStream *stream);
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_THRESHOLD_H_