Add first half of GPU based april tag detector

This detects blob boundaries, filters them, and then orders the points
in a circle in preparation for line fitting.  It takes 2ms for a 720p
image on the Orin NX 8gb.

Future commits will do the quad fitting and merge back with the original
algorithm.

Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/BUILD b/frc971/orin/BUILD
index 5675ab4..a81abf1 100644
--- a/frc971/orin/BUILD
+++ b/frc971/orin/BUILD
@@ -7,3 +7,61 @@
     function = "ycbcr",
     visibility = ["//visibility:public"],
 )
+
+cc_library(
+    name = "cuda",
+    srcs = [
+        "apriltag.cc",
+        "cuda.cc",
+        "labeling_allegretti_2019_BKE.cc",
+        "points.cc",
+        "threshold.cc",
+    ],
+    hdrs = [
+        "apriltag.h",
+        "cuda.h",
+        "gpu_image.h",
+        "labeling_allegretti_2019_BKE.h",
+        "points.h",
+        "threshold.h",
+    ],
+    copts = [
+        "-Wno-pass-failed",
+        #"-DCUB_DETAIL_DEBUG_ENABLE_LOG=1",
+        #"-DDEBUG=1",
+    ],
+    features = ["cuda"],
+    deps = [
+        "//aos/time",
+        "//third_party/apriltag",
+        "@com_github_google_glog//:glog",
+        "@com_github_nvidia_cccl//:cccl",
+        "@com_github_nvidia_cuco//:cuco",
+    ],
+)
+
+cc_test(
+    name = "cuda_april_tag_test",
+    srcs = [
+        "cuda_april_tag_test.cc",
+    ],
+    data = [
+        "@apriltag_test_bfbs_images",
+    ],
+    features = ["cuda"],
+    deps = [
+        ":cuda",
+        ":ycbcr",
+        "//aos:flatbuffer_merge",
+        "//aos:json_to_flatbuffer",
+        "//aos/testing:googletest",
+        "//aos/testing:path",
+        "//aos/testing:random_seed",
+        "//frc971/vision:vision_fbs",
+        "//third_party:cudart",
+        "//third_party:opencv",
+        "//third_party/apriltag",
+        "//y2023/vision:ThresholdHalide",
+        "//y2023/vision:ToGreyscaleAndDecimateHalide",
+    ],
+)
diff --git a/frc971/orin/apriltag.cc b/frc971/orin/apriltag.cc
new file mode 100644
index 0000000..381ffae
--- /dev/null
+++ b/frc971/orin/apriltag.cc
@@ -0,0 +1,845 @@
+#include "frc971/orin/apriltag.h"
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cub/device/device_copy.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/device/device_scan.cuh>
+#include <cub/device/device_segmented_sort.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "aos/time/time.h"
+#include "frc971/orin/labeling_allegretti_2019_BKE.h"
+#include "frc971/orin/threshold.h"
+
+namespace frc971 {
+namespace apriltag {
+
+typedef std::chrono::duration<float, std::milli> float_milli;
+
+// Returns true if the QuadBoundaryPoint is nonzero.
+struct NonZero {
+  __host__ __device__ __forceinline__ bool operator()(
+      const QuadBoundaryPoint &a) const {
+    return a.nonzero();
+  }
+};
+
+// Always returns true (only used for scratch space calcs).
+template <typename T>
+struct True {
+  __host__ __device__ __forceinline__ bool operator()(const T &) const {
+    return true;
+  }
+};
+
+// Computes and returns the scratch space needed for DeviceRadixSort::SortKeys
+// of the provided key with the provided number of elements.
+template <typename T>
+static size_t RadixSortScratchSpace(size_t elements) {
+  size_t temp_storage_bytes = 0;
+  QuadBoundaryPointDecomposer decomposer;
+  cub::DeviceRadixSort::SortKeys(nullptr, temp_storage_bytes, (T *)(nullptr),
+                                 (T *)(nullptr), elements, decomposer);
+  return temp_storage_bytes;
+}
+
+// Computes and returns the scratch space needed for DeviceSelect::If of the
+// provided type Tin, to be written to the provided type Tout, for the provided
+// number of elements.  num_markers is the device pointer used to hold the
+// selected number of elements.
+template <typename Tin, typename Tout>
+static size_t DeviceSelectIfScratchSpace(size_t elements, int *num_markers) {
+  size_t temp_storage_bytes = 0;
+  CHECK_CUDA(cub::DeviceSelect::If(nullptr, temp_storage_bytes,
+                                   (Tin *)(nullptr), (Tout *)(nullptr),
+                                   num_markers, elements, True<Tin>()));
+  return temp_storage_bytes;
+}
+
+// Always returns the first element (only used for scratch space calcs).
+template <typename T>
+struct CustomFirst {
+  __host__ __device__ __forceinline__ T operator()(const T &a,
+                                                   const T & /*b*/) const {
+    return a;
+  }
+};
+
+// Computes and returns the scratch space needed for DeviceReduce::ReduceByKey
+// of the provided key K and value V with the provided number of elements.
+template <typename K, typename V>
+static size_t DeviceReduceByKeyScratchSpace(size_t elements) {
+  size_t temp_storage_bytes = 0;
+  CHECK_CUDA(cub::DeviceReduce::ReduceByKey(
+      nullptr, temp_storage_bytes, (K *)(nullptr), (K *)(nullptr),
+      (V *)(nullptr), (V *)(nullptr), (size_t *)(nullptr), CustomFirst<V>(),
+      elements));
+  return temp_storage_bytes;
+}
+
+// Computes and returns the scratch space needed for DeviceScan::InclusiveScan
+// of the provided value V with the provided number of elements.
+template <typename V>
+static size_t DeviceScanInclusiveScanScratchSpace(size_t elements) {
+  size_t temp_storage_bytes = 0;
+  CHECK_CUDA(cub::DeviceScan::InclusiveScan(nullptr, temp_storage_bytes,
+                                            (V *)(nullptr), (V *)(nullptr),
+                                            CustomFirst<V>(), elements));
+  return temp_storage_bytes;
+}
+
+GpuDetector::GpuDetector(size_t width, size_t height,
+                         const apriltag_detector_t *tag_detector)
+    : width_(width),
+      height_(height),
+      tag_detector_(tag_detector),
+      color_image_host_(width * height * 2),
+      color_image_device_(width * height * 2),
+      gray_image_device_(width * height),
+      decimated_image_device_(width / 2 * height / 2),
+      unfiltered_minmax_image_device_((width / 2 / 4 * height / 2 / 4) * 2),
+      minmax_image_device_((width / 2 / 4 * height / 2 / 4) * 2),
+      thresholded_image_device_(width / 2 * height / 2),
+      union_markers_device_(width / 2 * height / 2),
+      union_markers_size_device_(width / 2 * height / 2),
+      union_marker_pair_device_((width / 2 - 2) * (height / 2 - 2) * 4),
+      compressed_union_marker_pair_device_(union_marker_pair_device_.size()),
+      sorted_union_marker_pair_device_(union_marker_pair_device_.size()),
+      extents_device_(union_marker_pair_device_.size()),
+      reduced_dot_blobs_pair_device_(kMaxBlobs),
+      selected_extents_device_(kMaxBlobs),
+      selected_blobs_device_(union_marker_pair_device_.size()),
+      sorted_selected_blobs_device_(selected_blobs_device_.size()),
+      radix_sort_tmpstorage_device_(RadixSortScratchSpace<QuadBoundaryPoint>(
+          sorted_union_marker_pair_device_.size())),
+      temp_storage_compressed_union_marker_pair_device_(
+          DeviceSelectIfScratchSpace<QuadBoundaryPoint, QuadBoundaryPoint>(
+              union_marker_pair_device_.size(),
+              num_compressed_union_marker_pair_device_.get())),
+      temp_storage_bounds_reduce_by_key_device_(
+          DeviceReduceByKeyScratchSpace<uint64_t, MinMaxExtents>(
+              union_marker_pair_device_.size())),
+      temp_storage_dot_product_device_(
+          DeviceReduceByKeyScratchSpace<uint64_t, float>(
+              union_marker_pair_device_.size())),
+      temp_storage_compressed_filtered_blobs_device_(
+          DeviceSelectIfScratchSpace<IndexPoint, IndexPoint>(
+              union_marker_pair_device_.size(),
+              num_selected_blobs_device_.get())),
+      temp_storage_selected_extents_scan_device_(
+          DeviceScanInclusiveScanScratchSpace<
+              cub::KeyValuePair<long, MinMaxExtents>>(kMaxBlobs)) {
+  CHECK_EQ(tag_detector_->quad_decimate, 2);
+  CHECK(!tag_detector_->qtp.deglitch);
+
+  for (int i = 0; i < zarray_size(tag_detector_->tag_families); i++) {
+    apriltag_family_t *family;
+    zarray_get(tag_detector_->tag_families, i, &family);
+    if (family->width_at_border < min_tag_width_) {
+      min_tag_width_ = family->width_at_border;
+    }
+    normal_border_ |= !family->reversed_border;
+    reversed_border_ |= family->reversed_border;
+  }
+  min_tag_width_ /= tag_detector_->quad_decimate;
+  if (min_tag_width_ < 3) {
+    min_tag_width_ = 3;
+  }
+}
+
+GpuDetector::~GpuDetector() {}
+
+// Computes a massive image of 4x QuadBoundaryPoint per pixel with a
+// QuadBoundaryPoint for each pixel pair which crosses a blob boundary.
+template <size_t kBlockWidth, size_t kBlockHeight>
+__global__ void BlobDiff(const uint8_t *thresholded_image,
+                         const uint32_t *blobs,
+                         const uint32_t *union_markers_size,
+                         QuadBoundaryPoint *result, size_t width,
+                         size_t height) {
+  __shared__ uint32_t temp_blob_storage[kBlockWidth * kBlockHeight];
+  __shared__ uint8_t temp_image_storage[kBlockWidth * kBlockHeight];
+
+  // We overlap both directions in X, and only one direction in Y.
+  const uint x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+  const uint y = blockIdx.y * (blockDim.y - 1) + threadIdx.y;
+
+  // Ignore anything outside the image.
+  if (x >= width || y >= height || y == 0) {
+    return;
+  }
+
+  // Compute the location in the image this threads is responsible for.
+  const uint global_input_index = x + y * width;
+  // And the destination in the temporary storage to save it.
+  const uint thread_linear_index = threadIdx.x + blockDim.x * threadIdx.y;
+
+  // Now, load all the data.
+  const uint32_t rep0 = temp_blob_storage[thread_linear_index] =
+      blobs[global_input_index];
+  const uint8_t v0 = temp_image_storage[thread_linear_index] =
+      thresholded_image[global_input_index];
+
+  // All threads in the block have now gotten this far so the shared memory is
+  // consistent.
+  __syncthreads();
+
+  // We've done our job loading things into memory, this pixel is a boundary
+  // with the upper and left sides, or covered by the next block.
+  if (threadIdx.x == 0) {
+    return;
+  }
+
+  // We need to search both ways in x, this thread doesn't participate further
+  // :(
+  if (threadIdx.x == blockDim.x - 1) {
+    return;
+  }
+
+  if (threadIdx.y == blockDim.y - 1) {
+    return;
+  }
+
+  // This is the last pixel along the lower and right sides, we don't need to
+  // compute it.
+  if (x == width - 1 || y == height - 1) {
+    return;
+  }
+
+  // Place in memory to write the result.
+  const uint global_output_index = (x - 1) + (y - 1) * (width - 2);
+
+  // Short circuit 127's and write an empty point out.
+  if (v0 == 127 || union_markers_size[rep0] < 25) {
+#pragma unroll
+    for (size_t point_offset = 0; point_offset < 4; ++point_offset) {
+      const size_t write_address =
+          (width - 2) * (height - 2) * point_offset + global_output_index;
+      result[write_address] = QuadBoundaryPoint();
+    }
+    return;
+  }
+
+  uint32_t rep1;
+  uint8_t v1;
+
+#define DO_CONN(dx, dy, point_offset)                                    \
+  {                                                                      \
+    QuadBoundaryPoint cluster_id;                                        \
+    const uint x1 = dx + threadIdx.x;                                    \
+    const uint y1 = dy + threadIdx.y;                                    \
+    const uint thread_linear_index1 = x1 + blockDim.x * y1;              \
+    v1 = temp_image_storage[thread_linear_index1];                       \
+    rep1 = temp_blob_storage[thread_linear_index1];                      \
+    if (v0 + v1 == 255) {                                                \
+      if (union_markers_size[rep1] >= 25) {                              \
+        if (rep0 < rep1) {                                               \
+          cluster_id.set_rep1(rep1);                                     \
+          cluster_id.set_rep0(rep0);                                     \
+        } else {                                                         \
+          cluster_id.set_rep1(rep0);                                     \
+          cluster_id.set_rep0(rep1);                                     \
+        }                                                                \
+        cluster_id.set_base_xy(x, y);                                    \
+        cluster_id.set_dxy(point_offset);                                \
+        cluster_id.set_black_to_white(v1 > v0);                          \
+      }                                                                  \
+    }                                                                    \
+    const size_t write_address =                                         \
+        (width - 2) * (height - 2) * point_offset + global_output_index; \
+    result[write_address] = cluster_id;                                  \
+  }
+
+  // We search the following 4 neighbors.
+  //      ________
+  //      | x | 0 |
+  //  -------------
+  //  | 3 | 2 | 1 |
+  //  -------------
+  //
+  //  If connection 3 has the same IDs as the connection between blocks 0 and 2,
+  //  we will have a duplicate entry.  Detect and don't add it.  This will only
+  //  happen if id(x) == id(0) and id(2) == id(3),
+  //         or id(x) == id(2) and id(0) ==id(3).
+
+  DO_CONN(1, 0, 0);
+  DO_CONN(1, 1, 1);
+  DO_CONN(0, 1, 2);
+  const uint64_t rep_block_2 = rep1;
+  const uint8_t v1_block_2 = v1;
+
+  const uint left_thread_linear_index1 =
+      threadIdx.x - 1 + blockDim.x * threadIdx.y;
+  const uint8_t v1_block_left = temp_image_storage[left_thread_linear_index1];
+  const uint32_t rep_block_left = temp_blob_storage[left_thread_linear_index1];
+
+  // Do the dedup calculation now.
+  if (v1_block_left != 127 && v1_block_2 != 127 &&
+      v1_block_2 != v1_block_left) {
+    if (x != 1 && union_markers_size[rep_block_left] >= 25 &&
+        union_markers_size[rep_block_2] >= 25) {
+      const size_t write_address =
+          (width - 2) * (height - 2) * 3 + global_output_index;
+      result[write_address] = QuadBoundaryPoint();
+      return;
+    }
+  }
+
+  DO_CONN(-1, 1, 3);
+}
+
+// Masks out just the blob ID pair, rep01.
+struct MaskRep01 {
+  __host__ __device__ __forceinline__ uint64_t
+  operator()(const QuadBoundaryPoint &a) const {
+    return a.rep01();
+  }
+};
+
+// Rewrites a QuadBoundaryPoint to an IndexPoint, adding the angle to the
+// center.
+class RewriteToIndexPoint {
+ public:
+  RewriteToIndexPoint(MinMaxExtents *extents_device, size_t num_extents)
+      : blob_finder_(extents_device, num_extents) {}
+
+  __host__ __device__ __forceinline__ IndexPoint
+  operator()(cub::KeyValuePair<long, QuadBoundaryPoint> pt) const {
+    size_t index = blob_finder_.FindBlobIndex(pt.key);
+    MinMaxExtents extents = blob_finder_.Get(index);
+    IndexPoint result(index, pt.value.point_bits());
+
+    float theta =
+        (atan2f(pt.value.y() - extents.cy(), pt.value.x() - extents.cx()) +
+         M_PI) *
+        8e6;
+    long long int theta_int = llrintf(theta);
+
+    result.set_theta(std::max<long long int>(0, theta_int));
+
+    return result;
+  }
+
+  BlobExtentsIndexFinder blob_finder_;
+};
+
+// TODO(austin): Make something which rewrites points on the way back out to
+// memory and adds the slope.
+
+// Transforms aQuadBoundaryPoint into a single point extent for Reduce.
+struct TransformQuadBoundaryPointToMinMaxExtents {
+  __host__ __device__ __forceinline__ MinMaxExtents
+  operator()(cub::KeyValuePair<long, QuadBoundaryPoint> pt) const {
+    MinMaxExtents result;
+    result.min_y = result.max_y = pt.value.y();
+    result.min_x = result.max_x = pt.value.x();
+    result.starting_offset = pt.key;
+    result.count = 1;
+    return result;
+  }
+};
+
+// Reduces 2 extents by tracking the range and updating the offset and count
+// accordingly.
+struct QuadBoundaryPointExtents {
+  __host__ __device__ __forceinline__ MinMaxExtents
+  operator()(const MinMaxExtents &a, const MinMaxExtents &b) const {
+    MinMaxExtents result;
+    result.min_x = std::min(a.min_x, b.min_x);
+    result.max_x = std::max(a.max_x, b.max_x);
+    result.min_y = std::min(a.min_y, b.min_y);
+    result.max_y = std::max(a.max_y, b.max_y);
+    // And the actual start is the first of the points.
+    result.starting_offset = std::min(a.starting_offset, b.starting_offset);
+    // We want to count everything.
+    result.count = a.count + b.count;
+    return result;
+  }
+};
+
+// Computes the dot product of the point vector and gradient for detecting
+// inside-out blobs.
+struct ComputeDotProductTransform {
+  ComputeDotProductTransform(MinMaxExtents *extents_device, size_t num_extents,
+                             size_t tag_width, size_t min_cluster_pixels,
+                             size_t max_cluster_pixels)
+      : index_finder_(extents_device, num_extents),
+        tag_width_(tag_width),
+        min_cluster_pixels_(std::max<size_t>(24u, min_cluster_pixels)),
+        max_cluster_pixels_(max_cluster_pixels) {}
+
+  __host__ __device__ __forceinline__ float operator()(
+      cub::KeyValuePair<long, QuadBoundaryPoint> a) const {
+    const size_t y = a.value.y();
+    const size_t x = a.value.x();
+
+    // Binary search for the index.
+    const size_t index = index_finder_.FindBlobIndex(a.key);
+
+    // Don't bother to compute the dot product for anything we will ignore in
+    // SelectBlobs below.
+    //
+    // TODO(austin): Figure out how to dedup with SelectBlobs below.
+    const MinMaxExtents extents = index_finder_.Get(index);
+    if (extents.count < min_cluster_pixels_) {
+      return 0;
+    }
+    if (extents.count > max_cluster_pixels_) {
+      return 0;
+    }
+
+    // Area must also be reasonable.
+    if ((extents.max_x - extents.min_x) * (extents.max_y - extents.min_y) <
+        tag_width_) {
+      return 0;
+    }
+
+    const float dx = static_cast<float>(x) - extents.cx();
+    const float dy = static_cast<float>(y) - extents.cy();
+
+    return dx * a.value.gx() + dy * a.value.gy();
+  }
+
+  BlobExtentsIndexFinder index_finder_;
+
+  size_t tag_width_;
+  size_t min_cluster_pixels_;
+  size_t max_cluster_pixels_;
+};
+
+// Selects blobs which are big enough, not too big, and have the right color in
+// the middle.
+class SelectBlobs {
+ public:
+  SelectBlobs(const MinMaxExtents *extents_device, const float *dot_products,
+              size_t tag_width, bool reversed_border, bool normal_border,
+              size_t min_cluster_pixels, size_t max_cluster_pixels)
+      : extents_device_(extents_device),
+        dot_products_(dot_products),
+        tag_width_(tag_width),
+        reversed_border_(reversed_border),
+        normal_border_(normal_border),
+        min_cluster_pixels_(std::max<size_t>(24u, min_cluster_pixels)),
+        max_cluster_pixels_(max_cluster_pixels) {}
+
+  // Returns true if the blob passes the size and dot product checks and is
+  // worth further consideration.
+  __host__ __device__ __forceinline__ bool operator()(
+      const size_t blob_index, MinMaxExtents extents) const {
+    if (extents.count < min_cluster_pixels_) {
+      return false;
+    }
+    if (extents.count > max_cluster_pixels_) {
+      return false;
+    }
+
+    // Area must also be reasonable.
+    if ((extents.max_x - extents.min_x) * (extents.max_y - extents.min_y) <
+        tag_width_) {
+      return false;
+    }
+
+    // And the right side must be inside.
+    const bool quad_reversed_border = dot_products_[blob_index] < 0;
+    if (!reversed_border_ && quad_reversed_border) {
+      return false;
+    }
+    if (!normal_border_ && !quad_reversed_border) {
+      return false;
+    }
+
+    return true;
+  }
+
+  __host__ __device__ __forceinline__ bool operator()(
+      const IndexPoint &a) const {
+    bool result = (*this)(a.blob_index(), extents_device_[a.blob_index()]);
+
+    return result;
+  }
+
+  const MinMaxExtents *extents_device_;
+  const float *dot_products_;
+  size_t tag_width_;
+
+  bool reversed_border_;
+  bool normal_border_;
+  size_t min_cluster_pixels_;
+  size_t max_cluster_pixels_;
+};
+
+// Class to zero out the count (and clear the starting offset) for each extents
+// which is going to be filtered out.  Used in conjunction with SumPoints to
+// update zero sized blobs.
+struct TransformZeroFilteredBlobSizes {
+ public:
+  TransformZeroFilteredBlobSizes(const float *dot_products, size_t tag_width,
+                                 bool reversed_border, bool normal_border,
+                                 size_t min_cluster_pixels,
+                                 size_t max_cluster_pixels)
+      : select_blobs_(nullptr, dot_products, tag_width, reversed_border,
+                      normal_border, min_cluster_pixels, max_cluster_pixels) {}
+
+  __host__ __device__ __forceinline__ cub::KeyValuePair<long, MinMaxExtents>
+  operator()(cub::KeyValuePair<long, MinMaxExtents> pt) const {
+    pt.value.count *= select_blobs_(pt.key, pt.value);
+    pt.value.starting_offset = 0;
+    return pt;
+  }
+
+ private:
+  SelectBlobs select_blobs_;
+};
+
+// Class to implement a custom Scan operator which passes through the previous
+// min/max/etc, but re-sums count into starting_offset.  This lets us collapse
+// out regions which don't pass the minimum filters.
+struct SumPoints {
+  __host__ __device__ __forceinline__ cub::KeyValuePair<long, MinMaxExtents>
+  operator()(const cub::KeyValuePair<long, MinMaxExtents> &a,
+             const cub::KeyValuePair<long, MinMaxExtents> &b) const {
+    cub::KeyValuePair<long, MinMaxExtents> result;
+    if (a.key < b.key) {
+      result.value.min_x = b.value.min_x;
+      result.value.min_y = b.value.min_y;
+      result.value.max_x = b.value.max_x;
+      result.value.max_y = b.value.max_y;
+      result.value.count = b.value.count;
+      result.key = b.key;
+    } else {
+      result.value.min_x = a.value.min_x;
+      result.value.min_y = a.value.min_y;
+      result.value.max_x = a.value.max_x;
+      result.value.max_y = a.value.max_y;
+      result.value.count = a.value.count;
+      result.key = a.key;
+    }
+
+    result.value.starting_offset = b.value.starting_offset + b.value.count;
+    return result;
+  }
+};
+
+void GpuDetector::Detect(const uint8_t *image) {
+  color_image_host_.MemcpyFrom(image);
+  start_.Record(&stream_);
+  color_image_device_.MemcpyAsyncFrom(&color_image_host_, &stream_);
+  after_image_memcpy_to_device_.Record(&stream_);
+
+  // Threshold the image.
+  CudaToGreyscaleAndDecimateHalide(
+      color_image_device_.get(), gray_image_device_.get(),
+      decimated_image_device_.get(), unfiltered_minmax_image_device_.get(),
+      minmax_image_device_.get(), thresholded_image_device_.get(), width_,
+      height_, tag_detector_->qtp.min_white_black_diff, &stream_);
+  after_threshold_.Record(&stream_);
+
+  union_markers_size_device_.MemsetAsync(0u, &stream_);
+  after_memset_.Record(&stream_);
+
+  // Unionfind the image.
+  LabelImage(ToGpuImage(thresholded_image_device_),
+             ToGpuImage(union_markers_device_),
+             ToGpuImage(union_markers_size_device_), stream_.get());
+
+  after_unionfinding_.Record(&stream_);
+
+  CHECK((width_ % 8) == 0);
+  CHECK((height_ % 8) == 0);
+
+  size_t decimated_width = width_ / 2;
+  size_t decimated_height = height_ / 2;
+
+  // TODO(austin): Tune for the global shutter camera.
+  // 1280 -> 2 * 128 * 5
+  // 720 -> 2 * 8 * 5 * 9
+
+  // Compute the unfiltered list of blob pairs and points.
+  {
+    constexpr size_t kBlockWidth = 32;
+    constexpr size_t kBlockHeight = 16;
+    dim3 threads(kBlockWidth, kBlockHeight, 1);
+    // Overlap 1 on each side in x, and 1 in y.
+    dim3 blocks((decimated_width + threads.x - 3) / (threads.x - 2),
+                (decimated_height + threads.y - 2) / (threads.y - 1), 1);
+
+    //  Make sure we fit in our mask.
+    CHECK_LT(width_ * height_, static_cast<size_t>(1 << 22));
+
+    BlobDiff<kBlockWidth, kBlockHeight><<<blocks, threads, 0, stream_.get()>>>(
+        thresholded_image_device_.get(), union_markers_device_.get(),
+        union_markers_size_device_.get(), union_marker_pair_device_.get(),
+        decimated_width, decimated_height);
+    MaybeCheckAndSynchronize();
+  }
+
+  // TODO(austin): Can I do the first step of the zero removal in the BlobDiff
+  // kernel?
+
+  after_diff_.Record(&stream_);
+
+  {
+    // Remove empty points which aren't to be considered before sorting to speed
+    // things up.
+    size_t temp_storage_bytes =
+        temp_storage_compressed_union_marker_pair_device_.size();
+    NonZero nz;
+    CHECK_CUDA(cub::DeviceSelect::If(
+        temp_storage_compressed_union_marker_pair_device_.get(),
+        temp_storage_bytes, union_marker_pair_device_.get(),
+        compressed_union_marker_pair_device_.get(),
+        num_compressed_union_marker_pair_device_.get(),
+        union_marker_pair_device_.size(), nz, stream_.get()));
+
+    MaybeCheckAndSynchronize();
+  }
+
+  after_compact_.Record(&stream_);
+
+  int num_compressed_union_marker_pair_host;
+  {
+    num_compressed_union_marker_pair_device_.MemcpyTo(
+        &num_compressed_union_marker_pair_host);
+    CHECK_LT(static_cast<size_t>(num_compressed_union_marker_pair_host),
+             union_marker_pair_device_.size());
+
+    // Now, sort just the keys to group like points.
+    size_t temp_storage_bytes = radix_sort_tmpstorage_device_.size();
+    QuadBoundaryPointDecomposer decomposer;
+    CHECK_CUDA(cub::DeviceRadixSort::SortKeys(
+        radix_sort_tmpstorage_device_.get(), temp_storage_bytes,
+        compressed_union_marker_pair_device_.get(),
+        sorted_union_marker_pair_device_.get(),
+        num_compressed_union_marker_pair_host, decomposer,
+        QuadBoundaryPoint::kRepEndBit, QuadBoundaryPoint::kBitsInKey,
+        stream_.get()));
+
+    MaybeCheckAndSynchronize();
+  }
+
+  after_sort_.Record(&stream_);
+
+  size_t num_quads_host = 0;
+  {
+    // Our next step is to compute the extents of each blob so we can filter
+    // blobs.
+    cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
+        sorted_union_marker_pair_device_.get());
+    TransformQuadBoundaryPointToMinMaxExtents min_max;
+    cub::TransformInputIterator<MinMaxExtents,
+                                TransformQuadBoundaryPointToMinMaxExtents,
+                                cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
+        value_input_iterator(value_index_input_iterator, min_max);
+
+    // Don't care about the output keys...
+    cub::DiscardOutputIterator<uint64_t> key_discard_iterator;
+
+    // Provide a mask to detect keys by rep01()
+    MaskRep01 mask;
+    cub::TransformInputIterator<uint64_t, MaskRep01, QuadBoundaryPoint *>
+        key_input_iterator(sorted_union_marker_pair_device_.get(), mask);
+
+    // Reduction operator.
+    QuadBoundaryPointExtents reduce;
+
+    size_t temp_storage_bytes =
+        temp_storage_bounds_reduce_by_key_device_.size();
+    cub::DeviceReduce::ReduceByKey(
+        temp_storage_bounds_reduce_by_key_device_.get(), temp_storage_bytes,
+        key_input_iterator, key_discard_iterator, value_input_iterator,
+        extents_device_.get(), num_quads_device_.get(), reduce,
+        num_compressed_union_marker_pair_host, stream_.get());
+    after_bounds_.Record(&stream_);
+
+    num_quads_device_.MemcpyTo(&num_quads_host);
+  }
+  CHECK_LE(num_quads_host, reduced_dot_blobs_pair_device_.size());
+
+  // Longest april tag will be the full perimeter of the image.  Each point
+  // results in 2 neighbor points, 1 straight, and one at 45 degrees.  But, we
+  // are in decimated space here, and width_ and height_ are in full image
+  // space.  And there are 2 sides to each rectangle...
+  //
+  // Aprilrobotics has a *3 instead of a *2 here since they have duplicated
+  // points in their list at this stage.
+  const size_t max_april_tag_perimeter = 2 * (width_ + height_);
+
+  {
+    // Now that we have the extents, compute the dot product used to detect
+    // which blobs are inside out or right side out.
+
+    // Don't care about these quantities since the previous ReduceByKey will
+    // ahve computed them too.
+    cub::DiscardOutputIterator<size_t> length_discard_iterator;
+    cub::DiscardOutputIterator<uint64_t> key_discard_iterator;
+
+    size_t dot_reduce_temp_storage_bytes =
+        temp_storage_dot_product_device_.size();
+
+    // Provide a mask to detect keys by rep01()
+    MaskRep01 mask;
+    cub::TransformInputIterator<uint64_t, MaskRep01, QuadBoundaryPoint *>
+        key_iterator(sorted_union_marker_pair_device_.get(), mask);
+
+    // Create a transform iterator which calculates the dot product for each
+    // point individually.
+    cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
+        sorted_union_marker_pair_device_.get());
+    ComputeDotProductTransform dot_operator(
+        extents_device_.get(), num_quads_host, min_tag_width_,
+        tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
+    cub::TransformInputIterator<float, ComputeDotProductTransform,
+                                cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
+        value_iterator(value_index_input_iterator, dot_operator);
+
+    // And now sum per key.
+    CHECK_CUDA(cub::DeviceReduce::ReduceByKey(
+        temp_storage_dot_product_device_.get(), dot_reduce_temp_storage_bytes,
+        key_iterator, key_discard_iterator, value_iterator,
+        reduced_dot_blobs_pair_device_.get(), length_discard_iterator,
+        cub::Sum(), num_compressed_union_marker_pair_host, stream_.get()));
+
+    MaybeCheckAndSynchronize();
+  }
+
+  after_dot_.Record(&stream_);
+
+  {
+    // Now that we have the dot products, we need to rewrite the extents for the
+    // post-thresholded world so we can find the start and end address of blobs
+    // for fitting lines.
+    //
+    // Clear the size of non-passing extents and the starting offset of all
+    // extents.
+    cub::ArgIndexInputIterator<MinMaxExtents *> value_index_input_iterator(
+        extents_device_.get());
+    TransformZeroFilteredBlobSizes rewrite(
+        reduced_dot_blobs_pair_device_.get(), min_tag_width_, reversed_border_,
+        normal_border_, tag_detector_->qtp.min_cluster_pixels,
+        max_april_tag_perimeter);
+    cub::TransformInputIterator<cub::KeyValuePair<long, MinMaxExtents>,
+                                TransformZeroFilteredBlobSizes,
+                                cub::ArgIndexInputIterator<MinMaxExtents *>>
+        input_iterator(value_index_input_iterator, rewrite);
+
+    // Sum the counts of everything before us, and update the offset.
+    SumPoints sum_points;
+
+    // TODO(justin): Rip the key off when writing.
+
+    // Rewrite the extents to have the starting offset and count match the
+    // post-selected values.
+    size_t temp_storage_bytes =
+        temp_storage_selected_extents_scan_device_.size();
+    CHECK_CUDA(cub::DeviceScan::InclusiveScan(
+        temp_storage_selected_extents_scan_device_.get(), temp_storage_bytes,
+        input_iterator, selected_extents_device_.get(), sum_points,
+        num_quads_host));
+
+    MaybeCheckAndSynchronize();
+  }
+
+  after_transform_extents_.Record(&stream_);
+
+  int num_selected_blobs_host;
+  {
+    // Now, copy over all points which pass our thresholds.
+    cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
+        sorted_union_marker_pair_device_.get());
+    RewriteToIndexPoint rewrite(extents_device_.get(), num_quads_host);
+    cub::TransformInputIterator<IndexPoint, RewriteToIndexPoint,
+                                cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
+        input_iterator(value_index_input_iterator, rewrite);
+
+    SelectBlobs select_blobs(
+        extents_device_.get(), reduced_dot_blobs_pair_device_.get(),
+        min_tag_width_, reversed_border_, normal_border_,
+        tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
+
+    size_t temp_storage_bytes =
+        temp_storage_compressed_filtered_blobs_device_.size();
+    CHECK_CUDA(cub::DeviceSelect::If(
+        temp_storage_compressed_filtered_blobs_device_.get(),
+        temp_storage_bytes, input_iterator, selected_blobs_device_.get(),
+        num_selected_blobs_device_.get(), num_compressed_union_marker_pair_host,
+        select_blobs, stream_.get()));
+
+    MaybeCheckAndSynchronize();
+
+    num_selected_blobs_device_.MemcpyTo(&num_selected_blobs_host);
+  }
+
+  after_filter_.Record(&stream_);
+
+  {
+    // Sort based on the angle.
+    size_t temp_storage_bytes = radix_sort_tmpstorage_device_.size();
+    QuadIndexPointDecomposer decomposer;
+
+    CHECK_CUDA(cub::DeviceRadixSort::SortKeys(
+        radix_sort_tmpstorage_device_.get(), temp_storage_bytes,
+        selected_blobs_device_.get(), sorted_selected_blobs_device_.get(),
+        num_selected_blobs_host, decomposer, IndexPoint::kRepEndBit,
+        IndexPoint::kBitsInKey, stream_.get()));
+
+    MaybeCheckAndSynchronize();
+  }
+
+  after_filtered_sort_.Record(&stream_);
+
+  // Report out how long things took.
+  LOG(INFO) << "Found " << num_compressed_union_marker_pair_host << " items";
+  LOG(INFO) << "Selected " << num_selected_blobs_host << " blobs";
+
+  CudaEvent *previous_event = &start_;
+  for (auto name_event : std::vector<std::tuple<std::string_view, CudaEvent &>>{
+           {"Memcpy", after_image_memcpy_to_device_},
+           {"Threshold", after_threshold_},
+           {"Memset", after_memset_},
+           {"Unionfinding", after_unionfinding_},
+           {"Diff", after_diff_},
+           {"Compact", after_compact_},
+           {"Sort", after_sort_},
+           {"Bounds", after_bounds_},
+           {"Dot product", after_dot_},
+           {"Transform Extents", after_transform_extents_},
+           {"Filter by dot product", after_filter_},
+           {"Filtered sort", after_filtered_sort_},
+       }) {
+    std::get<1>(name_event).Synchronize();
+    LOG(INFO) << std::get<0>(name_event) << " "
+              << float_milli(
+                     std::get<1>(name_event).ElapsedTime(*previous_event))
+                     .count()
+              << "ms";
+    previous_event = &std::get<1>(name_event);
+  }
+
+  LOG(INFO) << "Overall "
+            << float_milli(previous_event->ElapsedTime(start_)).count() << "ms";
+
+  // Average.  Skip the first one as the kernel is warming up and is slower.
+  if (!first_) {
+    ++execution_count_;
+    execution_duration_ += previous_event->ElapsedTime(start_);
+    LOG(INFO) << "Average overall "
+              << float_milli(execution_duration_ / execution_count_).count()
+              << "ms";
+  }
+
+  LOG(INFO) << "Found compressed runs: " << num_quads_host;
+
+  first_ = false;
+}
+
+}  // namespace apriltag
+}  // namespace frc971
diff --git a/frc971/orin/apriltag.h b/frc971/orin/apriltag.h
new file mode 100644
index 0000000..068c0c7
--- /dev/null
+++ b/frc971/orin/apriltag.h
@@ -0,0 +1,280 @@
+#ifndef FRC971_ORIN_APRILTAG_H_
+#define FRC971_ORIN_APRILTAG_H_
+
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include "third_party/apriltag/apriltag.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "frc971/orin/cuda.h"
+#include "frc971/orin/gpu_image.h"
+#include "frc971/orin/points.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the extents of a blob of points.
+struct MinMaxExtents {
+  // Min and max coordinates (in non-decimated coordinates)
+  uint16_t min_x;
+  uint16_t min_y;
+  uint16_t max_x;
+  uint16_t max_y;
+
+  // The starting offset of this blob of points in the vector holding all the
+  // points.
+  uint32_t starting_offset;
+
+  // The number of points in the blob.
+  uint32_t count;
+
+  // Center location of the blob using the aprilrobotics algorithm.
+  __host__ __device__ float cx() const {
+    return (min_x + max_x) * 0.5f + 0.05118;
+  }
+  __host__ __device__ float cy() const {
+    return (min_y + max_y) * 0.5f + -0.028581;
+  }
+};
+
+static_assert(sizeof(MinMaxExtents) == 16, "MinMaxExtents didn't pack right.");
+
+// Class to find the blob index of a point in a point vector.
+class BlobExtentsIndexFinder {
+ public:
+  BlobExtentsIndexFinder(const MinMaxExtents *extents_device,
+                         size_t num_extents)
+      : extents_device_(extents_device), num_extents_(num_extents) {}
+
+  __host__ __device__ size_t FindBlobIndex(size_t point_index) const {
+    // Do a binary search for the blob which has the point in it's
+    // starting_offset range.
+    size_t min = 0;
+    size_t max = num_extents_;
+    while (true) {
+      if (min + 1 == max) {
+        return min;
+      }
+
+      size_t average = min + (max - min) / 2;
+
+      if (average < num_extents_ && extents_device_[average].starting_offset <=
+                                        static_cast<size_t>(point_index)) {
+        min = average;
+      } else {
+        max = average;
+      }
+    }
+  }
+
+  // Returns the extents for a blob index.
+  __host__ __device__ MinMaxExtents Get(size_t index) const {
+    return extents_device_[index];
+  }
+
+ private:
+  const MinMaxExtents *extents_device_;
+  size_t num_extents_;
+
+  // TODO(austin): Cache the last one?
+};
+
+// GPU based april tag detector.
+class GpuDetector {
+ public:
+  // The number of blobs we will consider when counting april tags.
+  static constexpr size_t kMaxBlobs = IndexPoint::kMaxBlobs;
+
+  // Constructs a detector, reserving space for detecting tags of the provided
+  // with and height, using the provided detector options.
+  GpuDetector(size_t width, size_t height,
+              const apriltag_detector_t *tag_detector);
+
+  virtual ~GpuDetector();
+
+  // Detects april tags in the provided image.
+  void Detect(const uint8_t *image);
+
+  // Debug methods to expose internal state for testing.
+  void CopyGrayTo(uint8_t *output) { gray_image_device_.MemcpyTo(output); }
+  void CopyDecimatedTo(uint8_t *output) {
+    decimated_image_device_.MemcpyTo(output);
+  }
+  void CopyThresholdedTo(uint8_t *output) {
+    thresholded_image_device_.MemcpyTo(output);
+  }
+  void CopyUnionMarkersTo(uint32_t *output) {
+    union_markers_device_.MemcpyTo(output);
+  }
+
+  void CopyUnionMarkerPairTo(QuadBoundaryPoint *output) {
+    union_marker_pair_device_.MemcpyTo(output);
+  }
+
+  void CopyCompressedUnionMarkerPairTo(QuadBoundaryPoint *output) {
+    compressed_union_marker_pair_device_.MemcpyTo(output);
+  }
+
+  std::vector<QuadBoundaryPoint> CopySortedUnionMarkerPair() {
+    std::vector<QuadBoundaryPoint> result;
+    int size = NumCompressedUnionMarkerPairs();
+    result.resize(size);
+    sorted_union_marker_pair_device_.MemcpyTo(result.data(), size);
+    return result;
+  }
+
+  int NumCompressedUnionMarkerPairs() {
+    return num_compressed_union_marker_pair_device_.Copy()[0];
+  }
+
+  void CopyUnionMarkersSizeTo(uint32_t *output) {
+    union_markers_size_device_.MemcpyTo(output);
+  }
+
+  int NumQuads() const { return num_quads_device_.Copy()[0]; }
+
+  std::vector<MinMaxExtents> CopyExtents() {
+    return extents_device_.Copy(NumQuads());
+  }
+
+  std::vector<float> CopyReducedDotBlobs() {
+    return reduced_dot_blobs_pair_device_.Copy(NumQuads());
+  }
+
+  std::vector<cub::KeyValuePair<long, MinMaxExtents>> CopySelectedExtents() {
+    return selected_extents_device_.Copy(NumQuads());
+  }
+
+  int NumSelectedPairs() const { return num_selected_blobs_device_.Copy()[0]; }
+
+  std::vector<IndexPoint> CopySelectedBlobs() {
+    return selected_blobs_device_.Copy(NumSelectedPairs());
+  }
+
+  std::vector<IndexPoint> CopySortedSelectedBlobs() {
+    return sorted_selected_blobs_device_.Copy(NumSelectedPairs());
+  }
+
+ private:
+  // Creates a GPU image wrapped around the provided memory.
+  template <typename T>
+  GpuImage<T> ToGpuImage(GpuMemory<T> &memory) {
+    if (memory.size() == width_ * height_) {
+      return GpuImage<T>{
+          .data = memory.get(),
+          .rows = height_,
+          .cols = width_,
+          .step = width_,
+      };
+    } else if (memory.size() == width_ * height_ / 4) {
+      return GpuImage<T>{
+          .data = memory.get(),
+          .rows = height_ / 2,
+          .cols = width_ / 2,
+          .step = width_ / 2,
+      };
+    } else {
+      LOG(FATAL) << "Unknown image shape";
+    }
+  }
+
+  // Size of the image.
+  const size_t width_;
+  const size_t height_;
+
+  // Detector parameters.
+  const apriltag_detector_t *tag_detector_;
+
+  // Stream to operate on.
+  CudaStream stream_;
+
+  // Events for each of the steps for timing.
+  CudaEvent start_;
+  CudaEvent after_image_memcpy_to_device_;
+  CudaEvent after_threshold_;
+  CudaEvent after_memset_;
+  CudaEvent after_unionfinding_;
+  CudaEvent after_diff_;
+  CudaEvent after_compact_;
+  CudaEvent after_sort_;
+  CudaEvent after_bounds_;
+  CudaEvent after_dot_;
+  CudaEvent after_transform_extents_;
+  CudaEvent after_filter_;
+  CudaEvent after_filtered_sort_;
+
+  // TODO(austin): Remove this...
+  HostMemory<uint8_t> color_image_host_;
+
+  // Starting color image.
+  GpuMemory<uint8_t> color_image_device_;
+  // Full size gray scale image.
+  GpuMemory<uint8_t> gray_image_device_;
+  // Half resolution, gray, decimated image.
+  GpuMemory<uint8_t> decimated_image_device_;
+  // Intermediates for thresholding.
+  GpuMemory<uint8_t> unfiltered_minmax_image_device_;
+  GpuMemory<uint8_t> minmax_image_device_;
+  GpuMemory<uint8_t> thresholded_image_device_;
+
+  // The union markers for each pixel.
+  GpuMemory<uint32_t> union_markers_device_;
+  // The size of each blob.  The blob size is stored at the index of the stored
+  // union marker id in union_markers_device_ aboe.
+  GpuMemory<uint32_t> union_markers_size_device_;
+
+  // Full list of boundary points, densly stored but mostly zero.
+  GpuMemory<QuadBoundaryPoint> union_marker_pair_device_;
+  // Unsorted list of points with 0's removed.
+  GpuMemory<QuadBoundaryPoint> compressed_union_marker_pair_device_;
+  // Blob representation sorted list of points.
+  GpuMemory<QuadBoundaryPoint> sorted_union_marker_pair_device_;
+  // Number of compressed points.
+  GpuMemory<int> num_compressed_union_marker_pair_device_{
+      /* allocate 1 integer...*/ 1};
+
+  // Number of unique blob IDs.
+  GpuMemory<size_t> num_quads_device_{/* allocate 1 integer...*/ 1};
+  // Bounds per blob, one blob per ID.
+  GpuMemory<MinMaxExtents> extents_device_;
+  // Sum of the dot products between the vector from center to each point, with
+  // the gradient at each point.
+  GpuMemory<float> reduced_dot_blobs_pair_device_;
+  // Extents of all the blobs under consideration.
+  GpuMemory<cub::KeyValuePair<long, MinMaxExtents>> selected_extents_device_;
+
+  // Number of keys in selected_blobs_device_.
+  GpuMemory<int> num_selected_blobs_device_{/* allocate 1 integer...*/ 1};
+
+  // Compacted blobs which pass our threshold.
+  GpuMemory<IndexPoint> selected_blobs_device_;
+  // Sorted list of those points.
+  GpuMemory<IndexPoint> sorted_selected_blobs_device_;
+
+  // Temporary storage for each of the steps.
+  // TODO(austin): Can we combine these and just use the max?
+  GpuMemory<uint32_t> radix_sort_tmpstorage_device_;
+  GpuMemory<uint8_t> temp_storage_compressed_union_marker_pair_device_;
+  GpuMemory<uint8_t> temp_storage_bounds_reduce_by_key_device_;
+  GpuMemory<uint8_t> temp_storage_dot_product_device_;
+  GpuMemory<uint8_t> temp_storage_compressed_filtered_blobs_device_;
+  GpuMemory<uint8_t> temp_storage_selected_extents_scan_device_;
+
+  // Cumulative duration of april tag detection.
+  std::chrono::nanoseconds execution_duration_{0};
+  // Number of detections.
+  size_t execution_count_ = 0;
+  // True if this is the first detection.
+  bool first_ = true;
+
+  // Cached quantities used for tag filtering.
+  bool normal_border_ = false;
+  bool reversed_border_ = false;
+  int min_tag_width_ = 1000000;
+};
+
+}  // namespace apriltag
+}  // namespace frc971
+
+#endif  // FRC971_ORIN_APRILTAG_H_
diff --git a/frc971/orin/cuda.cc b/frc971/orin/cuda.cc
new file mode 100644
index 0000000..1173f73
--- /dev/null
+++ b/frc971/orin/cuda.cc
@@ -0,0 +1,23 @@
+#include "frc971/orin/cuda.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+DEFINE_bool(
+    sync, false,
+    "If true, force synchronization after each step to isolate errors better.");
+
+namespace frc971 {
+namespace apriltag {
+
+void CheckAndSynchronize() {
+  CHECK_CUDA(cudaDeviceSynchronize());
+  CHECK_CUDA(cudaGetLastError());
+}
+
+void MaybeCheckAndSynchronize() {
+  if (FLAGS_sync) CheckAndSynchronize();
+}
+
+}  // namespace apriltag
+}  // namespace frc971
diff --git a/frc971/orin/cuda.h b/frc971/orin/cuda.h
new file mode 100644
index 0000000..e386aa6
--- /dev/null
+++ b/frc971/orin/cuda.h
@@ -0,0 +1,207 @@
+#ifndef FRC971_ORIN_CUDA_H_
+#define FRC971_ORIN_CUDA_H_
+
+#include <chrono>
+#include <span>
+
+#include "glog/logging.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+// CHECKs that a cuda method returned success.
+// TODO(austin): This will not handle if and else statements quite right, fix if
+// we care.
+#define CHECK_CUDA(condition)                                             \
+  if (auto c = condition)                                                 \
+  LOG(FATAL) << "Check failed: " #condition " (" << cudaGetErrorString(c) \
+             << ") "
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to manage the lifetime of a Cuda stream.  This is used to provide
+// relative ordering between kernels on the same stream.
+class CudaStream {
+ public:
+  CudaStream() { CHECK_CUDA(cudaStreamCreate(&stream_)); }
+
+  CudaStream(const CudaStream &) = delete;
+  CudaStream &operator=(const CudaStream &) = delete;
+
+  virtual ~CudaStream() { CHECK_CUDA(cudaStreamDestroy(stream_)); }
+
+  // Returns the stream.
+  cudaStream_t get() { return stream_; }
+
+ private:
+  cudaStream_t stream_;
+};
+
+// Class to manage the lifetime of a Cuda Event.  Cuda events are used for
+// timing events on a stream.
+class CudaEvent {
+ public:
+  CudaEvent() { CHECK_CUDA(cudaEventCreate(&event_)); }
+
+  CudaEvent(const CudaEvent &) = delete;
+  CudaEvent &operator=(const CudaEvent &) = delete;
+
+  virtual ~CudaEvent() { CHECK_CUDA(cudaEventDestroy(event_)); }
+
+  // Queues up an event to be timestamped on the stream when it is executed.
+  void Record(CudaStream *stream) {
+    CHECK_CUDA(cudaEventRecord(event_, stream->get()));
+  }
+
+  // Returns the time elapsed between start and this event if it has been
+  // triggered.
+  std::chrono::nanoseconds ElapsedTime(const CudaEvent &start) {
+    float ms;
+    CHECK_CUDA(cudaEventElapsedTime(&ms, start.event_, event_));
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::duration<float, std::milli>(ms));
+  }
+
+  // Waits until the event has been triggered.
+  void Synchronize() { CHECK_CUDA(cudaEventSynchronize(event_)); }
+
+ private:
+  cudaEvent_t event_;
+};
+
+// Class to manage the lifetime of page locked host memory for fast copies back
+// to host memory.
+template <typename T>
+class HostMemory {
+ public:
+  // Allocates a block of memory for holding up to size objects of type T.
+  HostMemory(size_t size) {
+    T *memory;
+    CHECK_CUDA(cudaMallocHost((void **)(&memory), size * sizeof(T)));
+    span_ = std::span<T>(memory, size);
+  }
+  HostMemory(const HostMemory &) = delete;
+  HostMemory &operator=(const HostMemory &) = delete;
+
+  virtual ~HostMemory() { CHECK_CUDA(cudaFreeHost(span_.data())); }
+
+  // Returns a pointer to the memory.
+  T *get() { return span_.data(); }
+  const T *get() const { return span_.data(); }
+
+  // Returns the number of objects the memory can hold.
+  size_t size() const { return span_.size(); }
+
+  // Copies data from other (host memory) to this's memory.
+  void MemcpyFrom(const T *other) {
+    memcpy(span_.data(), other, sizeof(T) * size());
+  }
+  // Copies data to other (host memory) from this's memory.
+  void MemcpyTo(const T *other) {
+    memcpy(other, span_.data(), sizeof(T) * size());
+  }
+
+ private:
+  std::span<T> span_;
+};
+
+// Class to manage the lifetime of device memory.
+template <typename T>
+class GpuMemory {
+ public:
+  // Allocates a block of memory for holding up to size objects of type T in
+  // device memory.
+  GpuMemory(size_t size) : size_(size) {
+    CHECK_CUDA(cudaMalloc((void **)(&memory_), size * sizeof(T)));
+  }
+  GpuMemory(const GpuMemory &) = delete;
+  GpuMemory &operator=(const GpuMemory &) = delete;
+
+  virtual ~GpuMemory() { CHECK_CUDA(cudaFree(memory_)); }
+
+  // Returns the device pointer to the memory.
+  T *get() { return memory_; }
+  const T *get() const { return memory_; }
+
+  // Returns the number of objects this memory can hold.
+  size_t size() const { return size_; }
+
+  // Copies data from host memory to this memory asynchronously on the provided
+  // stream.
+  void MemcpyAsyncFrom(const T *host_memory, CudaStream *stream) {
+    CHECK_CUDA(cudaMemcpyAsync(memory_, host_memory, sizeof(T) * size_,
+                               cudaMemcpyHostToDevice, stream->get()));
+  }
+  void MemcpyAsyncFrom(const HostMemory<T> *host_memory, CudaStream *stream) {
+    MemcpyAsyncFrom(host_memory->get(), stream);
+  }
+
+  // Copies data to host memory from this memory asynchronously on the provided
+  // stream.
+  void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
+    CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
+                               reinterpret_cast<void *>(memory_),
+                               sizeof(T) * size_, cudaMemcpyDeviceToHost,
+                               stream->get()));
+  }
+  void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
+    MemcpyAsyncTo(host_memory->get(), stream);
+  }
+
+  // Copies data from host_memory to this memory blocking.
+  void MemcpyFrom(const T *host_memory) {
+    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(memory_),
+                          reinterpret_cast<const void *>(host_memory),
+                          sizeof(T) * size_, cudaMemcpyHostToDevice));
+  }
+  void MemcpyFrom(const HostMemory<T> *host_memory) {
+    MemcpyFrom(host_memory->get());
+  }
+
+  // Copies data to host_memory from this memory.  Only copies size objects.
+  void MemcpyTo(T *host_memory, size_t size) const {
+    CHECK_CUDA(cudaMemcpy(reinterpret_cast<void *>(host_memory), memory_,
+                          sizeof(T) * size, cudaMemcpyDeviceToHost));
+  }
+  // Copies data to host_memory from this memory.
+  void MemcpyTo(T *host_memory) const { MemcpyTo(host_memory, size_); }
+  void MemcpyTo(HostMemory<T> *host_memory) const {
+    MemcpyTo(host_memory->get());
+  }
+
+  // Sets the memory asynchronously to contain data of type 'val' on the provide
+  // stream.
+  void MemsetAsync(const uint8_t val, CudaStream *stream) const {
+    CHECK_CUDA(cudaMemsetAsync(memory_, val, sizeof(T) * size_, stream->get()));
+  }
+
+  // Allocates a vector on the host, copies size objects into it, and returns
+  // it.
+  std::vector<T> Copy(size_t s) const {
+    CHECK_LE(s, size_);
+    std::vector<T> result(s);
+    MemcpyTo(result.data(), s);
+    return result;
+  }
+
+  // Copies all the objects in this memory to a vector on the host and returns
+  // it.
+  std::vector<T> Copy() const { return Copy(size_); }
+
+ private:
+  T *memory_;
+  const size_t size_;
+};
+
+// Synchronizes and CHECKs for success the last CUDA operation.
+void CheckAndSynchronize();
+
+// Synchronizes and CHECKS iff --sync is passed on the command line.  Makes it
+// so we can leave debugging in the code.
+void MaybeCheckAndSynchronize();
+
+}  // namespace apriltag
+}  // namespace frc971
+
+#endif  // FRC971_ORIN_CUDA_H_
diff --git a/frc971/orin/cuda_april_tag_test.cc b/frc971/orin/cuda_april_tag_test.cc
new file mode 100644
index 0000000..1d5aa9b
--- /dev/null
+++ b/frc971/orin/cuda_april_tag_test.cc
@@ -0,0 +1,1563 @@
+#include <numeric>
+#include <random>
+#include <string>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "opencv2/imgproc.hpp"
+#include "third_party/apriltag/apriltag.h"
+#include "third_party/apriltag/common/unionfind.h"
+#include "third_party/apriltag/tag16h5.h"
+#include <opencv2/highgui.hpp>
+
+#include "aos/flatbuffer_merge.h"
+#include "aos/json_to_flatbuffer.h"
+#include "aos/testing/path.h"
+#include "aos/testing/random_seed.h"
+#include "aos/time/time.h"
+#include "frc971/orin/apriltag.h"
+#include "frc971/vision/vision_generated.h"
+
+DEFINE_int32(pixel_border, 10,
+             "Size of image border within which to reject detected corners");
+DEFINE_double(min_decision_margin, 50.0,
+              "Minimum decision margin (confidence) for an apriltag detection");
+
+DEFINE_bool(debug, false, "If true, write debug images.");
+
+// Get access to the intermediates of aprilrobotics.
+extern "C" {
+
+image_u8_t *threshold(apriltag_detector_t *td, image_u8_t *im);
+unionfind_t *connected_components(apriltag_detector_t *td, image_u8_t *threshim,
+                                  int w, int h, int ts);
+
+zarray_t *gradient_clusters(apriltag_detector_t *td, image_u8_t *threshim,
+                            int w, int h, int ts, unionfind_t *uf);
+
+struct pt {
+  // Note: these represent 2*actual value.
+  uint16_t x, y;
+  int16_t gx, gy;
+
+  float slope;
+};
+}  // extern C
+
+// Converts a cv::Mat to an aprilrobotics image.
+image_u8_t ToImageu8t(const cv::Mat &img) {
+  return image_u8_t{
+      .width = img.cols,
+      .height = img.rows,
+      .stride = img.cols,
+      .buf = img.data,
+  };
+}
+
+namespace frc971::apriltag::testing {
+
+// Checks that 2 images match.
+void CheckImage(image_u8_t compare_im_one, image_u8_t compare_im_two,
+                std::string_view label) {
+  CHECK_EQ(compare_im_one.width, compare_im_two.width);
+  CHECK_EQ(compare_im_one.height, compare_im_two.height);
+  ssize_t p = 0;
+  for (int j = 0; j < compare_im_one.height; ++j) {
+    for (int i = 0; i < compare_im_one.width; ++i) {
+      if (p < 0) {
+        LOG(INFO) << i << " " << j << ": "
+                  << static_cast<int>(
+                         compare_im_one.buf[j * compare_im_one.stride + i])
+                  << " (address + " << j * compare_im_one.stride + i << ") vs "
+                  << static_cast<int>(
+                         compare_im_two.buf[j * compare_im_two.stride + i])
+                  << " (address + " << j * compare_im_two.stride + i << ") for "
+                  << label;
+
+        ++p;
+      }
+
+      CHECK_EQ(compare_im_one.buf[j * compare_im_one.stride + i],
+               compare_im_two.buf[j * compare_im_two.stride + i])
+          << "First Image Value "
+          << (int)compare_im_one.buf[j * compare_im_one.stride + i] << " "
+          << "Second Image Value "
+          << (int)compare_im_two.buf[j * compare_im_two.stride + i] << " "
+          << "At " << i << ", " << j << " for " << label;
+    }
+  }
+}
+
+// Stores info about a blob.
+struct BlobInfo {
+  size_t size;
+  size_t min_x;
+  size_t min_y;
+  size_t max_x;
+  size_t max_y;
+  size_t april_robotics_id;
+  size_t bounding_area() const { return (max_x - min_x) * (max_y - min_y); }
+};
+
+// Checks that the aprilrobotics and GPU code agree on the results of
+// unionfinding.
+std::map<uint32_t, BlobInfo> CheckUnionfind(unionfind_t *uf,
+                                            cv::Mat union_markers,
+                                            const uint32_t *union_markers_size,
+                                            cv::Mat threshold) {
+  const size_t width = union_markers.cols;
+  const size_t height = union_markers.rows;
+  std::map<uint32_t, uint32_t> id_remap;
+  std::map<uint32_t, size_t> cuda_id_count;
+
+  for (size_t y = 0; y < height; ++y) {
+    for (size_t x = 0; x < width; ++x) {
+      uint32_t v = unionfind_get_representative(uf, y * width + x);
+      uint32_t v_remapped;
+      {
+        auto it = cuda_id_count.emplace(union_markers.at<uint32_t>(y, x), 1);
+        if (!it.second) {
+          ++it.first->second;
+        }
+      }
+      auto it = id_remap.find(v);
+      if (it == id_remap.end()) {
+        v_remapped = union_markers.at<uint32_t>(y, x);
+        id_remap.insert(std::make_pair(v, v_remapped));
+      } else {
+        v_remapped = it->second;
+      }
+
+      CHECK_EQ(v_remapped, union_markers.at<uint32_t>(y, x))
+          << "At " << x << ", " << y;
+    }
+  }
+  for (auto [key, value] : cuda_id_count) {
+    VLOG(2) << "Found " << key << " num times " << value;
+  }
+  LOG(INFO) << "Found " << id_remap.size() << " blob ids in aprilrobotics.";
+
+  std::map<uint32_t, BlobInfo> blob_sizes;
+
+  for (size_t y = 0; y < height; ++y) {
+    for (size_t x = 0; x < width; ++x) {
+      auto it = blob_sizes.emplace(
+          union_markers.at<uint32_t>(y, x),
+          BlobInfo{
+              .size = 1,
+              .min_x = x,
+              .min_y = y,
+              .max_x = x,
+              .max_y = y,
+              .april_robotics_id =
+                  unionfind_get_representative(uf, y * width + x),
+          });
+      if (!it.second) {
+        BlobInfo *info = &(it.first->second);
+        ++(info->size);
+        CHECK_EQ(info->april_robotics_id,
+                 unionfind_get_representative(uf, y * width + x));
+        info->min_x = std::min(info->min_x, x);
+        info->max_x = std::max(info->max_x, x);
+        info->min_y = std::min(info->min_y, y);
+        info->max_y = std::max(info->max_y, y);
+      }
+    }
+  }
+
+  for (size_t y = 0; y < height; ++y) {
+    for (size_t x = 0; x < width; ++x) {
+      const uint32_t blob_id = union_markers.at<uint32_t>(y, x);
+      const BlobInfo &blob_info = blob_sizes[blob_id];
+      if (threshold.at<uint8_t>(y, x) == 127) {
+        CHECK_EQ(0u, union_markers_size[blob_id])
+            << " at (" << x << ", " << y << ") -> " << blob_id;
+        continue;
+      }
+
+      if (blob_info.size >= 25u) {
+        CHECK_LE(25u, union_markers_size[blob_id])
+            << " at (" << x << ", " << y << ") -> " << blob_id;
+      } else {
+        CHECK_EQ(blob_info.size, union_markers_size[blob_id])
+            << " at (" << x << ", " << y << ") -> " << blob_id;
+      }
+    }
+  }
+
+  LOG(INFO) << "Union finding + stats passed.";
+
+  return blob_sizes;
+}
+
+// Makes a tag detector.
+apriltag_detector_t *MakeTagDetector(apriltag_family_t *tag_family) {
+  apriltag_detector_t *tag_detector = apriltag_detector_create();
+
+  apriltag_detector_add_family_bits(tag_detector, tag_family, 1);
+
+  tag_detector->nthreads = 1;
+  tag_detector->wp = workerpool_create(tag_detector->nthreads);
+  tag_detector->qtp.min_white_black_diff = 5;
+  tag_detector->debug = FLAGS_debug;
+
+  return tag_detector;
+}
+
+class CudaAprilTagDetector {
+ public:
+  CudaAprilTagDetector(size_t width, size_t height)
+      : tag_family_(tag16h5_create()),
+        tag_detector_(MakeTagDetector(tag_family_)),
+        gray_cuda_(cv::Size(width, height), CV_8UC1),
+        decimated_cuda_(gray_cuda_.size() / 2, CV_8UC1),
+        thresholded_cuda_(decimated_cuda_.size(), CV_8UC1),
+        union_markers_(decimated_cuda_.size(), CV_32SC1),
+        union_markers_size_(decimated_cuda_.size(), CV_32SC1),
+        gpu_detector_(width, height, tag_detector_),
+        width_(width),
+        height_(height) {
+    // Report out info about our GPU.
+    {
+      cudaDeviceProp prop;
+      CHECK_EQ(cudaGetDeviceProperties(&prop, 0), cudaSuccess);
+
+      LOG(INFO) << "Device: sm_" << prop.major << prop.minor;
+#define DUMP(x) LOG(INFO) << "" #x ": " << prop.x;
+      DUMP(sharedMemPerBlock);
+      DUMP(l2CacheSize);
+      DUMP(maxThreadsPerBlock);
+      DUMP(maxThreadsPerMultiProcessor);
+      DUMP(memoryBusWidth);
+      DUMP(memoryClockRate);
+      DUMP(multiProcessorCount);
+      DUMP(maxBlocksPerMultiProcessor);
+      DUMP(name);
+      DUMP(warpSize);
+
+#undef DUMP
+    }
+
+    union_marker_pair_.resize((width - 2) / 2 * (height - 2) / 2 * 4,
+                              QuadBoundaryPoint());
+    compressed_union_marker_pair_.resize((width - 2) / 2 * (height - 2) / 2 * 4,
+                                         QuadBoundaryPoint());
+
+    // Pre-compute the border and width thresholds.
+    for (int i = 0; i < zarray_size(tag_detector_->tag_families); i++) {
+      apriltag_family_t *family;
+      zarray_get(tag_detector_->tag_families, i, &family);
+      if (family->width_at_border < min_tag_width_) {
+        min_tag_width_ = family->width_at_border;
+      }
+      normal_border_ |= !family->reversed_border;
+      reversed_border_ |= family->reversed_border;
+    }
+    min_tag_width_ /= tag_detector_->quad_decimate;
+    if (min_tag_width_ < 3) {
+      min_tag_width_ = 3;
+    }
+  }
+
+  ~CudaAprilTagDetector() {
+    apriltag_detector_destroy(tag_detector_);
+    free(tag_family_);
+  }
+
+  // Detects tags on the GPU.
+  void DetectGPU(cv::Mat color_image) {
+    CHECK_EQ(color_image.size(), gray_cuda_.size());
+
+    gpu_detector_.Detect(color_image.data);
+
+    gpu_detector_.CopyGrayTo(gray_cuda_.data);
+    gpu_detector_.CopyDecimatedTo(decimated_cuda_.data);
+    gpu_detector_.CopyThresholdedTo(thresholded_cuda_.data);
+
+    gpu_detector_.CopyUnionMarkersTo(
+        reinterpret_cast<uint32_t *>(union_markers_.data));
+    gpu_detector_.CopyUnionMarkersSizeTo(
+        reinterpret_cast<uint32_t *>(union_markers_size_.data));
+
+    gpu_detector_.CopyUnionMarkerPairTo(union_marker_pair_.data());
+    gpu_detector_.CopyCompressedUnionMarkerPairTo(
+        compressed_union_marker_pair_.data());
+    sorted_union_marker_pair_ = gpu_detector_.CopySortedUnionMarkerPair();
+    num_compressed_union_marker_pair_ =
+        gpu_detector_.NumCompressedUnionMarkerPairs();
+    extents_cuda_ = gpu_detector_.CopyExtents();
+    reduced_dot_blobs_pair_cuda_ = gpu_detector_.CopyReducedDotBlobs();
+    selected_blobs_cuda_ = gpu_detector_.CopySelectedBlobs();
+    sorted_selected_blobs_cuda_ = gpu_detector_.CopySortedSelectedBlobs();
+    num_quads_ = gpu_detector_.NumQuads();
+
+    LOG(INFO) << "num_compressed_union_marker_pair "
+              << sorted_union_marker_pair_.size();
+  }
+
+  // Detects tags on the CPU.
+  void DetectCPU(cv::Mat color_image) {
+    cv::Mat gray_image(color_image.size(), CV_8UC1);
+    cv::cvtColor(color_image, gray_image, cv::COLOR_YUV2GRAY_YUYV);
+    image_u8_t im = {
+        .width = gray_image.cols,
+        .height = gray_image.rows,
+        .stride = gray_image.cols,
+        .buf = gray_image.data,
+    };
+
+    LOG(INFO) << "Starting CPU detect.";
+    zarray_t *detections = apriltag_detector_detect(tag_detector_, &im);
+
+    timeprofile_display(tag_detector_->tp);
+
+    for (int i = 0; i < zarray_size(detections); i++) {
+      apriltag_detection_t *det;
+      zarray_get(detections, i, &det);
+
+      if (det->decision_margin > FLAGS_min_decision_margin) {
+        VLOG(1) << "Found tag number " << det->id
+                << " hamming: " << det->hamming
+                << " margin: " << det->decision_margin;
+      }
+    }
+  }
+
+  // Checks that the union finding algorithms agree.
+  std::vector<QuadBoundaryPoint> CheckUnionMarkerPairs(
+      const std::map<uint32_t, BlobInfo> &blob_sizes) const {
+    std::vector<QuadBoundaryPoint> expected_union_marker_pair;
+
+    const size_t width = thresholded_cuda_.cols;
+    const size_t height = thresholded_cuda_.rows;
+
+    expected_union_marker_pair.resize((width - 2) * (height - 2) * 4,
+                                      QuadBoundaryPoint());
+
+    LOG(INFO) << "Width: " << width << ", Height " << height;
+
+    auto ToIndex = [&](size_t x, size_t y) {
+      return x - 1 + (y - 1) * (width - 2);
+    };
+
+    size_t wrong = 0;
+    size_t right_nonzero = 0;
+
+    for (size_t y = 1; y < height - 1; ++y) {
+      for (size_t x = 1; x < width - 1; ++x) {
+        const uint8_t v0 = thresholded_cuda_.at<uint8_t>(y, x);
+        const uint64_t rep0 = union_markers_.at<uint32_t>(y, x);
+        const auto blob0 = blob_sizes.find(rep0)->second;
+
+        size_t offset = 0;
+        for (auto [dx, dy] : {std::pair<ssize_t, ssize_t>{1, 0},
+                              std::pair<ssize_t, ssize_t>{1, 1},
+                              std::pair<ssize_t, ssize_t>{0, 1},
+                              std::pair<ssize_t, ssize_t>{-1, 1}}) {
+          const uint8_t v1 = thresholded_cuda_.at<uint8_t>(y + dy, x + dx);
+          const uint32_t rep1 = union_markers_.at<uint32_t>(y + dy, x + dx);
+          const auto blob1 = blob_sizes.find(rep1)->second;
+
+          // OK, blob 1, 1 is special.  We only do it if blocks 2 and the blob
+          // let of x won't be connected.
+          //      ________
+          //      | x | 0 |
+          //  -------------
+          //  | 3 | 2 | 1 |
+          //  -------------
+          //
+          //  We are fine checking this all here since we verify that we agree
+          //  with aprilrobotics at the end to make sure the overall logic
+          //  matches.
+          bool consider_blob = true;
+          if (offset == 3) {
+            const uint8_t v1_block_left =
+                thresholded_cuda_.at<uint8_t>(y + 0, x - 1);
+            const uint32_t rep1_block_left =
+                union_markers_.at<uint32_t>(y + 0, x - 1);
+            const uint8_t v1_block_2 =
+                thresholded_cuda_.at<uint8_t>(y + 1, x + 0);
+            const uint32_t rep1_block_2 =
+                union_markers_.at<uint32_t>(y + 1, x + 0);
+            if (v1_block_left != 127 && v1_block_2 != 127 &&
+                (v1_block_left + v1_block_2) == 255 &&
+                blob_sizes.find(rep1_block_left)->second.size >= 25 &&
+                blob_sizes.find(rep1_block_2)->second.size >= 25 && x != 1) {
+              consider_blob = false;
+            }
+          }
+
+          QuadBoundaryPoint point;
+          if (consider_blob && blob0.size >= 25 && blob1.size >= 25 &&
+              v0 != 127) {
+            if (v0 + v1 == 255) {
+              if (rep0 < rep1) {
+                point.set_rep1(rep1);
+                point.set_rep0(rep0);
+              } else {
+                point.set_rep1(rep0);
+                point.set_rep0(rep1);
+              }
+              point.set_base_xy(x, y);
+              point.set_dxy(offset);
+              point.set_black_to_white(v1 > v0);
+            }
+          }
+          size_t index = ToIndex(x, y) + offset * (width - 2) * (height - 2);
+
+          QuadBoundaryPoint actual = union_marker_pair_[index];
+          if (!point.near(actual)) {
+            CHECK_LT(wrong, 10u);
+            LOG(WARNING) << "point == actual (" << std::hex << point << ", "
+                         << actual << ") : Failed at (" << std::dec << x << ", "
+                         << y << ") + (" << dx << ", " << dy
+                         << "), v0: " << static_cast<int>(v0)
+                         << ", v1: " << static_cast<int>(v1)
+                         << ", rep0: " << std::hex << rep0 << " rep1: " << rep1
+                         << " rep0 size: " << std::dec << blob0.size
+                         << " rep1 size: " << std::dec << blob1.size
+                         << " consider_blob " << consider_blob;
+            ++wrong;
+          } else if (point.nonzero()) {
+            right_nonzero++;
+            VLOG(2) << "point == actual (" << std::hex << point << ", "
+                    << actual << ") : Success at (" << std::dec << x << ", "
+                    << y << ") + (" << dx << ", " << dy
+                    << "), v0: " << static_cast<int>(v0)
+                    << ", v1: " << static_cast<int>(v1)
+                    << ", rep0: " << std::hex << rep0 << " rep1: " << rep1;
+            point = actual;
+          }
+          expected_union_marker_pair[index] = point;
+
+          ++offset;
+        }
+      }
+    }
+    CHECK_EQ(wrong, 0u) << ", got " << right_nonzero << " right";
+
+    for (size_t i = 0; i < expected_union_marker_pair.size(); ++i) {
+      CHECK_EQ(expected_union_marker_pair[i], union_marker_pair_[i]);
+    }
+
+    return expected_union_marker_pair;
+  }
+
+  // Checks that the compressed marker pairs GPU algorithm agrees with a naive
+  // CPU version.  Returns the expected points.
+  std::vector<QuadBoundaryPoint> CheckCompressedUnionMarkerPairs(
+      const std::vector<QuadBoundaryPoint> &expected_union_marker_pair,
+      const std::vector<QuadBoundaryPoint> &compressed_union_marker_pair,
+      const std::vector<QuadBoundaryPoint> &sorted_union_marker_pair) const {
+    std::vector<QuadBoundaryPoint> expected_compressed_union_marker_pair;
+    {
+      size_t nonzero_count = 0;
+      for (const QuadBoundaryPoint x : expected_union_marker_pair) {
+        if (x.nonzero()) {
+          ++nonzero_count;
+        }
+      }
+      // Rip out all the zeros.
+      expected_compressed_union_marker_pair.reserve(nonzero_count);
+      for (const QuadBoundaryPoint x : expected_union_marker_pair) {
+        if (x.nonzero()) {
+          expected_compressed_union_marker_pair.push_back(x);
+        }
+      }
+
+      CHECK_EQ(static_cast<size_t>(sorted_union_marker_pair.size()),
+               expected_compressed_union_marker_pair.size());
+
+      // And make sure they match.
+      for (size_t i = 0; i < expected_compressed_union_marker_pair.size();
+           ++i) {
+        CHECK_EQ(expected_compressed_union_marker_pair[i],
+                 compressed_union_marker_pair[i]);
+      }
+    }
+
+    // Now, sort the points.
+    std::sort(expected_compressed_union_marker_pair.begin(),
+              expected_compressed_union_marker_pair.end(),
+              [&](const QuadBoundaryPoint a, const QuadBoundaryPoint b) {
+                if (a.rep01() != b.rep01()) {
+                  return a.rep01() < b.rep01();
+                }
+
+                return a < b;
+              });
+
+    for (size_t i = 0; i < static_cast<size_t>(sorted_union_marker_pair.size());
+         ++i) {
+      CHECK_EQ(expected_compressed_union_marker_pair[i].rep01(),
+               sorted_union_marker_pair[i].rep01());
+    }
+
+    return expected_compressed_union_marker_pair;
+  }
+
+  // Sorts the list of points by slope and blob ID.
+  std::vector<QuadBoundaryPoint> SlopeSortPoints(
+      std::vector<QuadBoundaryPoint> points) const {
+    std::map<uint64_t, std::pair<float, float>> blob_centers;
+
+    // The slope algorithm used by aprilrobotics.
+    auto ComputeSlope = [&blob_centers](QuadBoundaryPoint pair) -> float {
+      const int32_t x = pair.x();
+      const int32_t y = pair.y();
+
+      auto blob_center = blob_centers.find(pair.rep01());
+      CHECK(blob_center != blob_centers.end());
+
+      const float cx = blob_center->second.first;
+      const float cy = blob_center->second.second;
+
+      float quadrants[2][2] = {{-1 * (2 << 15), 0}, {2 * (2 << 15), 2 << 15}};
+
+      float dx = x - cx;
+      float dy = y - cy;
+
+      float quadrant = quadrants[dy > 0][dx > 0];
+      if (dy < 0) {
+        dy = -dy;
+        dx = -dx;
+      }
+
+      if (dx < 0) {
+        float tmp = dx;
+        dx = dy;
+        dy = -tmp;
+      }
+
+      return quadrant + dy / dx;
+    };
+
+    // The theta algorithm used by cuda.
+    auto ComputeTheta = [&blob_centers](QuadBoundaryPoint pair) -> float {
+      const int32_t x = pair.x();
+      const int32_t y = pair.y();
+
+      auto blob_center = blob_centers.find(pair.rep01());
+      CHECK(blob_center != blob_centers.end());
+
+      const float cx = blob_center->second.first;
+      const float cy = blob_center->second.second;
+
+      float dx = x - cx;
+      float dy = y - cy;
+
+      return atan2f(dy, dx);
+    };
+
+    for (size_t i = 0; i < points.size();) {
+      uint64_t first_rep01 = points[i].rep01();
+
+      int min_x, min_y, max_x, max_y;
+      min_x = max_x = points[i].x();
+      min_y = max_y = points[i].y();
+
+      size_t j = i;
+      for (; j < points.size() && points[j].rep01() == first_rep01; ++j) {
+        QuadBoundaryPoint pt = points[j];
+
+        int x = pt.x();
+        int y = pt.y();
+        min_x = std::min(min_x, x);
+        max_x = std::max(max_x, x);
+        min_y = std::min(min_y, y);
+        max_y = std::max(max_y, y);
+      }
+
+      const float cx = (max_x + min_x) * 0.5 + 0.05118;
+      const float cy = (max_y + min_y) * 0.5 + -0.028581;
+
+      blob_centers[first_rep01] = std::make_pair(cx, cy);
+      i = j;
+    }
+
+    std::sort(points.begin(), points.end(),
+              [&](const QuadBoundaryPoint a, const QuadBoundaryPoint b) {
+                if (a.rep01() != b.rep01()) {
+                  return a.rep01() < b.rep01();
+                }
+
+                const float slopea = ComputeSlope(a);
+                const float slopeb = ComputeSlope(b);
+
+                // Sigh, apparently the slope algorithm isn't great and
+                // sometimes ends up with matching slopes.  In that case,
+                // compute the actual angle to the X axis too and sort with it.
+                //
+                // Aprilrobotics ignores this case and ends up with an
+                // indeterminate solution.  We don't, so we notice.
+                if (slopea == slopeb) {
+                  return ComputeTheta(a) < ComputeTheta(b);
+                }
+                return slopea < slopeb;
+              });
+    return points;
+  }
+
+  // Filters blobs using the the various size thresholds that aprilrobotics
+  // uses.
+  std::vector<std::vector<QuadBoundaryPoint>> FilterBlobs(
+      std::vector<std::vector<QuadBoundaryPoint>> blobs) const {
+    std::vector<std::vector<QuadBoundaryPoint>> result;
+    const size_t max_april_tag_perimeter = 2 * (width_ + height_);
+    LOG(ERROR) << "Max permiter test " << max_april_tag_perimeter;
+
+    for (std::vector<QuadBoundaryPoint> &blob : blobs) {
+      int min_x, min_y, max_x, max_y;
+      min_x = max_x = blob[0].x();
+      min_y = max_y = blob[0].y();
+
+      for (const QuadBoundaryPoint pt : blob) {
+        int x = pt.x();
+        int y = pt.y();
+        min_x = std::min(min_x, x);
+        max_x = std::max(max_x, x);
+        min_y = std::min(min_y, y);
+        max_y = std::max(max_y, y);
+      }
+
+      float dot = 0;
+
+      const float cx = (min_x + max_x) * 0.5 + 0.05118;
+      const float cy = (min_y + max_y) * 0.5 - 0.028581;
+
+      for (size_t j = 0; j < blob.size(); ++j) {
+        dot += (static_cast<float>(blob[j].x()) - cx) * blob[j].gx() +
+               (static_cast<float>(blob[j].y()) - cy) * blob[j].gy();
+      }
+
+      const bool quad_reversed_border = dot < 0;
+
+      if (!reversed_border_ && quad_reversed_border) {
+        continue;
+      }
+      if (!normal_border_ && !quad_reversed_border) {
+        continue;
+      }
+      if ((max_x - min_x) * (max_y - min_y) < min_tag_width_) {
+        continue;
+      }
+      if (blob.size() < 24) {
+        continue;
+      }
+      if (blob.size() <
+          static_cast<size_t>(tag_detector_->qtp.min_cluster_pixels)) {
+        continue;
+      }
+      if (blob.size() > max_april_tag_perimeter) {
+        continue;
+      }
+
+      result.emplace_back(std::move(blob));
+    }
+
+    return result;
+  }
+
+  // Produces sorted lists of clusters from AprilRobotics' intermediate
+  // clusters.
+  std::vector<std::vector<uint64_t>> AprilRoboticsPoints(
+      image_u8_t *thresholded_im, unionfind_t *uf) const {
+    zarray_t *clusters =
+        gradient_clusters(tag_detector_, thresholded_im, thresholded_im->width,
+                          thresholded_im->height, thresholded_im->stride, uf);
+
+    std::vector<std::vector<uint64_t>> april_grouped_points;
+    for (int i = 0; i < zarray_size(clusters); i++) {
+      zarray_t *cluster;
+      zarray_get(clusters, i, &cluster);
+
+      uint16_t min_x = std::numeric_limits<uint16_t>::max();
+      uint16_t max_x = 0;
+      uint16_t min_y = std::numeric_limits<uint16_t>::max();
+      uint16_t max_y = 0;
+      std::vector<std::tuple<float, uint64_t>> pts;
+      for (int j = 0; j < zarray_size(cluster); j++) {
+        struct pt *p;
+        zarray_get_volatile(cluster, j, &p);
+        min_x = std::min(p->x, min_x);
+        min_y = std::min(p->y, min_y);
+        max_x = std::max(p->x, max_x);
+        max_y = std::max(p->y, max_y);
+      }
+
+      // add some noise to (cx,cy) so that pixels get a more diverse set
+      // of theta estimates. This will help us remove more points.
+      // (Only helps a small amount. The actual noise values here don't
+      // matter much at all, but we want them [-1, 1]. (XXX with
+      // fixed-point, should range be bigger?)
+      const float cx = (min_x + max_x) * 0.5 + 0.05118;
+      const float cy = (min_y + max_y) * 0.5 + -0.028581;
+
+      float quadrants[2][2] = {{-1 * (2 << 15), 0}, {2 * (2 << 15), 2 << 15}};
+
+      for (int j = 0; j < zarray_size(cluster); j++) {
+        struct pt *p;
+        zarray_get_volatile(cluster, j, &p);
+
+        float dx = p->x - cx;
+        float dy = p->y - cy;
+
+        float quadrant = quadrants[dy > 0][dx > 0];
+        if (dy < 0) {
+          dy = -dy;
+          dx = -dx;
+        }
+
+        if (dx < 0) {
+          float tmp = dx;
+          dx = dy;
+          dy = -tmp;
+        }
+        p->slope = quadrant + dy / dx;
+        pts.push_back(std::make_pair(p->slope, p->x + p->y * width_));
+      }
+
+      // Slope sort doesn't always combine duplicates because of duplicate
+      // slopes. Sort by point first, remove duplicates, then sort by slope.
+      std::sort(
+          pts.begin(), pts.end(),
+          [](std::tuple<float, uint64_t> a, std::tuple<float, uint64_t> b) {
+            return std::get<1>(a) < std::get<1>(b);
+          });
+      pts.erase(std::unique(pts.begin(), pts.end(),
+                            [](std::tuple<float, uint64_t> a,
+                               std::tuple<float, uint64_t> b) {
+                              return std::get<1>(a) == std::get<1>(b);
+                            }),
+                pts.end());
+
+      std::sort(
+          pts.begin(), pts.end(),
+          [](std::tuple<float, uint64_t> a, std::tuple<float, uint64_t> b) {
+            return std::get<0>(a) < std::get<0>(b);
+          });
+
+      VLOG(1) << "aprilrobotics points of " << pts.size() << " center (" << cx
+              << ", " << cy << ")";
+
+      std::vector<uint64_t> transformed_points;
+      transformed_points.reserve(pts.size());
+      for (std::tuple<float, uint64_t> pt : pts) {
+        VLOG(1) << "    (" << std::get<1>(pt) % width_ << ", "
+                << std::get<1>(pt) / width_ << ") slope " << std::get<0>(pt);
+        transformed_points.push_back(std::get<1>(pt));
+      }
+
+      april_grouped_points.emplace_back(std::move(transformed_points));
+    }
+
+    // Now sort the groups of points into a reproducible result by sorting by
+    // size and then smallest point first.
+    std::sort(april_grouped_points.begin(), april_grouped_points.end(),
+              [](auto &x, auto &y) {
+                if (x.size() == y.size()) {
+                  for (size_t j = 0; j < x.size(); ++j) {
+                    if (x[j] == y[j]) {
+                      continue;
+                    }
+                    return x[j] < y[j];
+                  }
+                  LOG(FATAL) << "Equal";
+                }
+                return x.size() < y.size();
+              });
+
+    for (int i = 0; i < zarray_size(clusters); i++) {
+      zarray_t *cluster;
+      zarray_get(clusters, i, &cluster);
+      zarray_destroy(cluster);
+    }
+    zarray_destroy(clusters);
+
+    return april_grouped_points;
+  }
+
+  // Sorts point lists by size.  These points need to have an x() and y().
+  template <typename T>
+  std::vector<std::vector<T>> SortPointSizes(
+      std::vector<std::vector<T>> in) const {
+    std::sort(in.begin(), in.end(), [this](auto &a, auto &b) {
+      if (a.size() == b.size()) {
+        // Use point ID as the tie breaker.
+        for (size_t j = 0; j < a.size(); ++j) {
+          const uint32_t point_x = a[j].x() + a[j].y() * width_;
+          const uint32_t point_y = b[j].x() + b[j].y() * width_;
+          if (point_x == point_y) {
+            continue;
+          }
+          return point_x < point_y;
+        }
+        LOG(FATAL) << "Equal";
+      }
+      return a.size() < b.size();
+    });
+
+    return in;
+  }
+
+  // Sorts grouped points by size and point.
+  std::vector<std::vector<QuadBoundaryPoint>> SortPoints(
+      std::vector<std::vector<QuadBoundaryPoint>> in) const {
+    for (std::vector<QuadBoundaryPoint> &grouped_points : in) {
+      std::sort(grouped_points.begin(), grouped_points.end(),
+                [this](const QuadBoundaryPoint &a, const QuadBoundaryPoint &b) {
+                  return std::make_tuple(a.rep01(), a.x() + a.y() * width_) <
+                         std::make_tuple(b.rep01(), b.x() + b.y() * width_);
+                });
+    }
+
+    return SortPointSizes(std::move(in));
+  }
+
+  // Sorts outer points list by size and point.
+  std::vector<std::vector<uint64_t>> SortAprilroboticsPointSizes(
+      std::vector<std::vector<uint64_t>> april_grouped_points) const {
+    std::sort(april_grouped_points.begin(), april_grouped_points.end(),
+              [](auto &x, auto &y) {
+                if (x.size() == y.size()) {
+                  for (size_t j = 0; j < x.size(); ++j) {
+                    if (x[j] == y[j]) {
+                      continue;
+                    }
+                    return x[j] < y[j];
+                  }
+                  LOG(FATAL) << "Equal";
+                }
+                return x.size() < y.size();
+              });
+    return april_grouped_points;
+  }
+
+  // Sorts points and points lists for Aprilrobotics by size and point id.
+  std::vector<std::vector<uint64_t>> SortAprilroboticsPoints(
+      std::vector<std::vector<uint64_t>> april_grouped_points) const {
+    for (std::vector<uint64_t> &points : april_grouped_points) {
+      std::sort(points.begin(), points.end());
+    }
+    return SortAprilroboticsPointSizes(april_grouped_points);
+  }
+
+  // Prints out a list of CUDA points.
+  void PrintCudaPoints(
+      const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points,
+      const std::map<uint32_t, BlobInfo> &blob_sizes) const {
+    for (const std::vector<QuadBoundaryPoint> &points : cuda_grouped_points) {
+      const uint32_t rep0 = points[0].rep0();
+      const uint32_t rep1 = points[0].rep1();
+      VLOG(1) << "CUDA points of " << rep0 << "+" << rep1
+              << " aprilrobotics blob: "
+              << blob_sizes.find(rep0)->second.april_robotics_id << "+"
+              << blob_sizes.find(rep1)->second.april_robotics_id << " ("
+              << std::hex
+              << ((static_cast<uint64_t>(std::max(
+                       blob_sizes.find(rep0)->second.april_robotics_id,
+                       blob_sizes.find(rep1)->second.april_robotics_id))
+                   << 32) |
+                  std::min(blob_sizes.find(rep0)->second.april_robotics_id,
+                           blob_sizes.find(rep1)->second.april_robotics_id))
+              << std::dec << ") size " << points.size();
+      for (const QuadBoundaryPoint point : points) {
+        VLOG(1) << "    (" << point.x() << ", " << point.y() << ")";
+      }
+    }
+
+    LOG(INFO) << "Found runs overall " << cuda_grouped_points.size();
+  }
+
+  // Groups marker pairs by runs of rep01().
+  std::vector<std::vector<QuadBoundaryPoint>> CudaGroupedPoints(
+      const std::vector<QuadBoundaryPoint> &compressed_union_marker_pairs)
+      const {
+    std::vector<std::vector<QuadBoundaryPoint>> cuda_grouped_points;
+    cuda_grouped_points.emplace_back();
+
+    QuadBoundaryPoint start = compressed_union_marker_pairs[0];
+    for (size_t i = 0; i < compressed_union_marker_pairs.size(); ++i) {
+      QuadBoundaryPoint current = compressed_union_marker_pairs[i];
+      CHECK_GE(current.rep01(), start.rep01())
+          << " Failed on index " << i << " of "
+          << compressed_union_marker_pairs.size();
+      if ((start.rep01()) != (current.rep01())) {
+        cuda_grouped_points.emplace_back(
+            std::vector<QuadBoundaryPoint>{current});
+      } else {
+        cuda_grouped_points.back().emplace_back(current);
+      }
+      start = current;
+    }
+
+    return cuda_grouped_points;
+  }
+
+  // Groups marker pairs by runs of rep01().
+  std::vector<std::vector<IndexPoint>> CudaGroupedPoints(
+      const std::vector<IndexPoint> &compressed_cuda_points) const {
+    std::vector<std::vector<IndexPoint>> cuda_grouped_points;
+    cuda_grouped_points.emplace_back();
+
+    IndexPoint start = compressed_cuda_points[0];
+    for (size_t i = 0; i < compressed_cuda_points.size(); ++i) {
+      IndexPoint current = compressed_cuda_points[i];
+      CHECK_GE(current.blob_index(), start.blob_index())
+          << " Failed on index " << i << " of "
+          << compressed_cuda_points.size();
+      if ((start.blob_index()) != (current.blob_index())) {
+        cuda_grouped_points.emplace_back(std::vector<IndexPoint>{current});
+      } else {
+        cuda_grouped_points.back().emplace_back(current);
+      }
+      start = current;
+    }
+
+    return cuda_grouped_points;
+  }
+
+  // Checks that both april robotics and Cuda agree on point grouping.  No
+  // ordering is implied here, that is checked later.
+  void CheckAprilRoboticsAndCudaContainSamePoints(
+      const std::vector<std::vector<uint64_t>> &april_grouped_points,
+      const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points)
+      const {
+    const std::vector<std::vector<QuadBoundaryPoint>>
+        cuda_point_sorted_grouped_points = SortPoints(cuda_grouped_points);
+    const std::vector<std::vector<uint64_t>> april_sorted_grouped_points =
+        SortAprilroboticsPoints(
+            SortAprilroboticsPointSizes(april_grouped_points));
+
+    CHECK_EQ(april_sorted_grouped_points.size(),
+             cuda_point_sorted_grouped_points.size());
+    for (size_t i = 0; i < april_sorted_grouped_points.size(); ++i) {
+      CHECK_EQ(april_sorted_grouped_points[i].size(),
+               cuda_point_sorted_grouped_points[i].size());
+      for (size_t j = 0; j < april_sorted_grouped_points[i].size(); ++j) {
+        CHECK_EQ(april_sorted_grouped_points[i][j],
+                 cuda_point_sorted_grouped_points[i][j].x() +
+                     cuda_point_sorted_grouped_points[i][j].y() * width_)
+            << ": On list of size " << april_sorted_grouped_points[i].size()
+            << ", failed on point " << j << "("
+            << april_sorted_grouped_points[i][j] % width_ << ", "
+            << april_sorted_grouped_points[i][j] / width_ << ") vs ("
+            << cuda_point_sorted_grouped_points[i][j].x() << ", "
+            << cuda_point_sorted_grouped_points[i][j].y() << ")";
+      }
+    }
+  }
+
+  // Tests that the extents and dot products match the cuda versions.
+  std::pair<size_t, std::vector<IndexPoint>> CheckCudaExtents(
+      const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points,
+      const std::vector<MinMaxExtents> &extents_cuda,
+      const std::vector<float> &reduced_dot_blobs_pair_cuda) const {
+    std::vector<IndexPoint> selected_blobs;
+    const size_t max_april_tag_perimeter = 2 * (width_ + height_);
+    size_t selected_quads = 0;
+    {
+      BlobExtentsIndexFinder finder(extents_cuda.data(), extents_cuda.size());
+      size_t i = 0;
+      size_t starting_offset = 0;
+      CHECK_EQ(cuda_grouped_points.size(), extents_cuda.size());
+      CHECK_EQ(extents_cuda.size(), reduced_dot_blobs_pair_cuda.size());
+      for (const std::vector<QuadBoundaryPoint> &points : cuda_grouped_points) {
+        size_t min_x, min_y, max_x, max_y;
+        min_x = max_x = points[0].x();
+        min_y = max_y = points[0].y();
+
+        for (const QuadBoundaryPoint pt : points) {
+          size_t x = pt.x();
+          size_t y = pt.y();
+          min_x = std::min(min_x, x);
+          max_x = std::max(max_x, x);
+          min_y = std::min(min_y, y);
+          max_y = std::max(max_y, y);
+        }
+
+        const MinMaxExtents extents = extents_cuda[i];
+        CHECK_EQ(extents.min_x, min_x);
+        CHECK_EQ(extents.max_x, max_x);
+        CHECK_EQ(extents.min_y, min_y);
+        CHECK_EQ(extents.max_y, max_y);
+        CHECK_EQ(extents.count, points.size());
+        CHECK_EQ(extents.starting_offset, starting_offset)
+            << " for index " << i;
+
+        float dot = 0;
+
+        const float cx = (min_x + max_x) * 0.5 + 0.05118;
+        const float cy = (min_y + max_y) * 0.5 - 0.028581;
+
+        bool good_blob_size =
+            points.size() >= 24 && points.size() <= max_april_tag_perimeter &&
+            points.size() >=
+                static_cast<size_t>(tag_detector_->qtp.min_cluster_pixels);
+
+        if (good_blob_size) {
+          for (size_t j = 0; j < points.size(); ++j) {
+            dot += (static_cast<float>(points[j].x()) - cx) * points[j].gx() +
+                   (static_cast<float>(points[j].y()) - cy) * points[j].gy();
+
+            // Make sure our blob finder agrees.
+            CHECK_EQ(i, finder.FindBlobIndex(starting_offset + j));
+          }
+          // Test that the summed dot product is right.
+          CHECK_LT(
+              std::abs(reduced_dot_blobs_pair_cuda[i] - dot) / std::abs(dot),
+              1e-4)
+              << ": for point " << i << ", cuda -> "
+              << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot;
+        } else {
+          CHECK_EQ(reduced_dot_blobs_pair_cuda[i], 0.0f)
+              << ": for point " << i << ", cuda -> "
+              << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot;
+        }
+
+        const bool quad_reversed_border = dot < 0;
+
+        VLOG(1) << "For point " << i << ", cuda -> "
+                << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot
+                << " size " << points.size() << " border "
+                << (!(!reversed_border_ && quad_reversed_border) &&
+                    !(!normal_border_ && !quad_reversed_border))
+                << " area: "
+                << (extents.max_x - extents.min_x) *
+                       (extents.max_y - extents.min_y)
+                << " min_area: " << min_tag_width_;
+
+        if (good_blob_size && !(!reversed_border_ && quad_reversed_border) &&
+            !(!normal_border_ && !quad_reversed_border)) {
+          ++selected_quads;
+          for (size_t j = 0; j < points.size(); ++j) {
+            IndexPoint pt(i, points[j].point_bits());
+            float theta =
+                (atan2f(pt.y() - extents.cy(), pt.x() - extents.cx()) + M_PI) *
+                8e6;
+            long long int theta_int = llrintf(theta);
+
+            pt.set_theta(theta_int);
+            selected_blobs.emplace_back(pt);
+          }
+        }
+
+        starting_offset += points.size();
+        ++i;
+      }
+    }
+    return std::make_pair(selected_quads, selected_blobs);
+  }
+
+  // Tests that the filtered points lists are sorted by key.
+  void CheckFilteredCudaPoints(
+      const std::vector<IndexPoint> &selected_blobs,
+      const std::vector<IndexPoint> &selected_blobs_cuda) const {
+    CHECK_EQ(selected_blobs.size(), selected_blobs_cuda.size());
+    for (size_t i = 0;
+         i < std::min(selected_blobs.size(), selected_blobs_cuda.size()); ++i) {
+      VLOG(1) << "Got blob[" << i << "] -> " << selected_blobs[i] << " vs "
+              << selected_blobs_cuda[i];
+      CHECK_EQ(selected_blobs[i].blob_index(),
+               selected_blobs_cuda[i].blob_index());
+    }
+  }
+
+  // Tests that the cuda sorted point list matches C++.
+  void CheckSortedFilteredCudaPoints(
+      const std::vector<std::vector<QuadBoundaryPoint>>
+          &slope_sorted_expected_grouped_points,
+      size_t selected_quads, const std::vector<IndexPoint> &selected_blobs,
+      const std::vector<IndexPoint> &sorted_selected_blobs_cuda) const {
+    CHECK_EQ(selected_blobs.size(), sorted_selected_blobs_cuda.size());
+    const std::vector<std::vector<IndexPoint>> cuda_grouped_points =
+        CudaGroupedPoints(sorted_selected_blobs_cuda);
+
+    CHECK_EQ(cuda_grouped_points.size(), selected_quads);
+    CHECK_EQ(slope_sorted_expected_grouped_points.size(), selected_quads);
+
+    size_t missmatched_points = 0;
+    for (size_t i = 0; i < cuda_grouped_points.size(); ++i) {
+      const std::vector<IndexPoint> &cuda_grouped_blob = cuda_grouped_points[i];
+      const std::vector<QuadBoundaryPoint> &slope_sorted_points =
+          slope_sorted_expected_grouped_points[i];
+
+      CHECK_EQ(cuda_grouped_blob.size(), slope_sorted_points.size());
+      if (VLOG_IS_ON(1) && cuda_grouped_blob[0].blob_index() == 73) {
+        for (size_t j = 0; j < cuda_grouped_points[i].size(); ++j) {
+          LOG(INFO) << "For blob " << cuda_grouped_blob[0].blob_index()
+                    << ", got " << cuda_grouped_blob[j] << " ("
+                    << cuda_grouped_blob[j].x() << ", "
+                    << cuda_grouped_blob[j].y() << ") expected "
+                    << slope_sorted_points[j] << " ("
+                    << slope_sorted_points[j].x() << ", "
+                    << slope_sorted_points[j].y() << ")";
+        }
+      }
+      size_t missmatched_runs = 0;
+      for (size_t j = 0; j < cuda_grouped_points[i].size(); ++j) {
+        if (cuda_grouped_blob[j].x() != slope_sorted_points[j].x() ||
+            cuda_grouped_blob[j].y() != slope_sorted_points[j].y()) {
+          ++missmatched_points;
+          ++missmatched_runs;
+          // We shouldn't see a lot of points in a row which don't match.
+          CHECK_LE(missmatched_runs, 4u);
+          VLOG(1) << "Missmatched point in blob "
+                  << cuda_grouped_blob[0].blob_index() << ", point " << j
+                  << " (" << cuda_grouped_blob[j].x() << ", "
+                  << cuda_grouped_blob[j].y() << ") vs ("
+                  << slope_sorted_points[j].x() << ", "
+                  << slope_sorted_points[j].y() << ")";
+        } else {
+          missmatched_runs = 0;
+        }
+      }
+    }
+
+    // Or a lot of points overall.  The slope algo has duplicate points, and
+    // is occasionally wrong.
+    CHECK_LE(missmatched_points, 50u);
+  }
+
+  // Checks that the GPU and CPU algorithms match.
+  void Check(cv::Mat color_image) {
+    cv::Mat gray_image(color_image.size(), CV_8UC1);
+    cv::cvtColor(color_image, gray_image, cv::COLOR_YUV2GRAY_YUYV);
+
+    image_u8_t gray_im = ToImageu8t(gray_image);
+    CheckImage(gray_im, ToImageu8t(gray_cuda_), "gray_cuda");
+
+    image_u8_t *quad_im =
+        image_u8_decimate(&gray_im, tag_detector_->quad_decimate);
+    CheckImage(*quad_im, ToImageu8t(decimated_cuda_), "decimated_cuda");
+
+    image_u8_t *thresholded_im = threshold(tag_detector_, quad_im);
+    CheckImage(ToImageu8t(thresholded_cuda_), *thresholded_im,
+               "threshold_cuda");
+
+    unionfind_t *uf = connected_components(
+        tag_detector_, thresholded_im, thresholded_im->width,
+        thresholded_im->height, thresholded_im->stride);
+
+    // Checks union finding.
+    auto blob_sizes = CheckUnionfind(
+        uf, union_markers_,
+        reinterpret_cast<const uint32_t *>(union_markers_size_.data),
+        thresholded_cuda_);
+
+    // Now make sure our unfiltered and partially sorted lists of pairs are
+    // plausible.
+    const std::vector<QuadBoundaryPoint> expected_union_marker_pair =
+        CheckUnionMarkerPairs(blob_sizes);
+    const std::vector<QuadBoundaryPoint> expected_compressed_union_marker_pair =
+        CheckCompressedUnionMarkerPairs(expected_union_marker_pair,
+                                        compressed_union_marker_pair_,
+                                        sorted_union_marker_pair_);
+
+    const std::vector<std::vector<QuadBoundaryPoint>> cuda_grouped_points =
+        CudaGroupedPoints(sorted_union_marker_pair_);
+    PrintCudaPoints(cuda_grouped_points, blob_sizes);
+
+    const std::vector<std::vector<QuadBoundaryPoint>>
+        slope_sorted_expected_grouped_points = FilterBlobs(CudaGroupedPoints(
+            SlopeSortPoints(expected_compressed_union_marker_pair)));
+
+    const std::vector<std::vector<uint64_t>> april_grouped_points =
+        AprilRoboticsPoints(thresholded_im, uf);
+
+    LOG(INFO) << "Found " << april_grouped_points.size()
+              << " clusters with april robotics with "
+              << std::accumulate(
+                     april_grouped_points.begin(), april_grouped_points.end(),
+                     0, [](int size, const auto &v) { return size + v.size(); })
+              << " points.";
+    LOG(INFO) << "Found " << cuda_grouped_points.size()
+              << " clusters with cuda with "
+              << num_compressed_union_marker_pair_ << " points.";
+
+    // Verify that both aprilrobotics and us group points the same.  Ignore
+    // order.
+    CheckAprilRoboticsAndCudaContainSamePoints(april_grouped_points,
+                                               cuda_grouped_points);
+
+    // Test that the extents are reasonable.
+    size_t selected_quads;
+    std::vector<IndexPoint> selected_blobs;
+    std::tie(selected_quads, selected_blobs) = CheckCudaExtents(
+        cuda_grouped_points, extents_cuda_, reduced_dot_blobs_pair_cuda_);
+
+    // And that the filtered list is reasonable.
+    CheckFilteredCudaPoints(selected_blobs, selected_blobs_cuda_);
+
+    // And that they sorted right too.
+    CheckSortedFilteredCudaPoints(slope_sorted_expected_grouped_points,
+                                  selected_quads, selected_blobs,
+                                  sorted_selected_blobs_cuda_);
+
+    // TODO(austin): Check the selected extents is right.
+    LOG(INFO) << "Found slope sorted count: "
+              << sorted_union_marker_pair_.size();
+
+    unionfind_destroy(uf);
+    image_u8_destroy(quad_im);
+    image_u8_destroy(thresholded_im);
+  }
+
+  // Writes images out to /tmp for debugging.
+  void WriteDebug(cv::Mat color_image) {
+    cv::Mat bgr(color_image.size(), CV_8UC3);
+    cv::cvtColor(color_image, bgr, cv::COLOR_YUV2BGR_YUYV);
+    cv::imwrite("/tmp/debug_color_image.png", bgr);
+
+    cv::imwrite("/tmp/debug_halide_grey_cuda.png", gray_cuda_);
+    cv::imwrite("/tmp/debug_halide_grey_decimated.png", decimated_cuda_);
+    cv::imwrite("/tmp/debug_cuda_thresholded.png", thresholded_cuda_);
+
+    {
+      image_u8_t thresholded_halide_im = ToImageu8t(thresholded_cuda_);
+
+      unionfind_t *uf = connected_components(
+          tag_detector_, &thresholded_halide_im, thresholded_halide_im.width,
+          thresholded_halide_im.height, thresholded_halide_im.stride);
+
+      std::map<uint32_t, uint32_t> april_to_cuda_id_remap;
+      std::map<uint32_t, uint32_t> cuda_to_april_id_remap;
+
+      std::map<uint32_t, cv::Vec3b> colors;
+
+      // Seed the random number generator so we get the same images out each
+      // time for easier comparison.
+      srandom(0);
+
+      // Write out the union find image.
+      size_t color_count = 1;
+      uint32_t max_color = 0;
+      cv::Mat unionfind_image(union_markers_.size(), CV_8UC3);
+      cv::Mat unionfind_image_common(union_markers_.size(), CV_8UC3);
+      for (int y = 0; y < union_markers_.rows; ++y) {
+        for (int x = 0; x < union_markers_.cols; ++x) {
+          const uint32_t april_robotics_id =
+              unionfind_get_representative(uf, y * width_ / 2 + x);
+          const uint32_t index = union_markers_.at<uint32_t>(y, x);
+
+          bool one_to_one_blob_match = false;
+
+          {
+            auto it = cuda_to_april_id_remap.find(index);
+            if (it == cuda_to_april_id_remap.end()) {
+              // Now, check we don't have a match the other way.
+              auto it_back = april_to_cuda_id_remap.find(april_robotics_id);
+              if (it_back == april_to_cuda_id_remap.end()) {
+                one_to_one_blob_match = true;
+                cuda_to_april_id_remap.emplace(index, april_robotics_id);
+                april_to_cuda_id_remap.emplace(april_robotics_id, index);
+              } else {
+                one_to_one_blob_match = false;
+              }
+            } else {
+              auto it_back = april_to_cuda_id_remap.find(april_robotics_id);
+              if (it_back == april_to_cuda_id_remap.end()) {
+                one_to_one_blob_match = false;
+              } else {
+                if (it->second == april_robotics_id &&
+                    it_back->second == index) {
+                  one_to_one_blob_match = true;
+                }
+              }
+            }
+          }
+
+          cv::Vec3b color;
+
+          auto it = colors.find(index);
+          if (it == colors.end()) {
+            max_color = std::max(max_color, index);
+            ++color_count;
+            VLOG(2) << "New color 0x" << std::hex << index;
+            const int bias = 50;
+            uint8_t r = bias + (random() % (200 - bias));
+            uint8_t g = bias + (random() % (200 - bias));
+            uint8_t b = bias + (random() % (200 - bias));
+
+            color = cv::Vec3b(b, g, r);
+            colors[index] = color;
+          } else {
+            color = it->second;
+          }
+
+          unionfind_image.at<cv::Vec3b>(y, x) = color;
+
+          if (one_to_one_blob_match) {
+            if (index == static_cast<uint32_t>(x + y * union_markers_.cols)) {
+              color = cv::Vec3b(0, 0, 0);
+            } else if (thresholded_cuda_.at<uint8_t>(y, x) == 0) {
+              color = cv::Vec3b(0, 0, 0);
+            } else {
+              color = cv::Vec3b(255, 255, 255);
+            }
+          }
+
+          CHECK(one_to_one_blob_match);
+
+          unionfind_image_common.at<cv::Vec3b>(y, x) = color;
+        }
+      }
+      LOG(INFO) << "Found " << color_count << " colors with a max index of "
+                << max_color;
+      cv::imwrite("/tmp/debug_cuda_segmentation.png", unionfind_image);
+      cv::imwrite("/tmp/debug_cuda_segmentation_common.png",
+                  unionfind_image_common);
+
+      unionfind_destroy(uf);
+    }
+
+    {
+      // And write out the marker pairs image too.
+      std::map<uint64_t, cv::Vec3b> colors;
+
+      cv::Mat sorted_marker_pair(cv::Size((width_), (height_)), CV_8UC3);
+      sorted_marker_pair.setTo(cv::Scalar(0, 0, 0));
+
+      for (size_t i = 0; i < sorted_union_marker_pair_.size();) {
+        size_t blob_size = 0;
+        for (size_t j = i; j < sorted_union_marker_pair_.size(); ++j) {
+          if (sorted_union_marker_pair_[i].rep01() !=
+              sorted_union_marker_pair_[j].rep01()) {
+            break;
+          }
+          ++blob_size;
+        }
+        for (size_t j = i; j < sorted_union_marker_pair_.size(); ++j) {
+          if (sorted_union_marker_pair_[i].rep01() !=
+              sorted_union_marker_pair_[j].rep01()) {
+            break;
+          }
+          QuadBoundaryPoint pair = sorted_union_marker_pair_[j];
+
+          const size_t x = pair.x();
+          const size_t y = pair.y();
+
+          auto it = colors.find(pair.rep01());
+          cv::Vec3b color;
+          if (it == colors.end()) {
+            VLOG(2) << "New color 0x" << std::hex << pair.rep01();
+            const int bias = 50;
+            uint8_t r = bias + (random() % (200 - bias));
+            uint8_t g = bias + (random() % (200 - bias));
+            uint8_t b = bias + (random() % (200 - bias));
+
+            color = cv::Vec3b(b, g, r);
+            colors[pair.rep01()] = color;
+          } else {
+            color = it->second;
+          }
+          sorted_marker_pair.at<cv::Vec3b>(y + 2, x + 2) = color;
+        }
+        i += blob_size;
+      }
+      cv::imwrite("/tmp/debug_cuda_marker_pairs.png", sorted_marker_pair);
+    }
+  }
+
+ private:
+  apriltag_family_t *tag_family_;
+  apriltag_detector_t *tag_detector_;
+
+  cv::Mat gray_cuda_;
+  cv::Mat decimated_cuda_;
+  cv::Mat thresholded_cuda_;
+
+  cv::Mat union_markers_;
+  cv::Mat union_markers_size_;
+
+  std::vector<QuadBoundaryPoint> union_marker_pair_;
+  std::vector<QuadBoundaryPoint> compressed_union_marker_pair_;
+  std::vector<QuadBoundaryPoint> sorted_union_marker_pair_;
+  std::vector<uint32_t> quad_length_;
+  std::vector<MinMaxExtents> extents_cuda_;
+  std::vector<float> reduced_dot_blobs_pair_cuda_;
+  std::vector<IndexPoint> selected_blobs_cuda_;
+  std::vector<IndexPoint> sorted_selected_blobs_cuda_;
+  int num_compressed_union_marker_pair_ = 0;
+  int num_quads_ = 0;
+
+  GpuDetector gpu_detector_;
+
+  size_t width_;
+  size_t height_;
+
+  bool normal_border_ = false;
+  bool reversed_border_ = false;
+  int min_tag_width_ = 1000000;
+};
+
+class AprilDetectionTest : public ::testing::Test {
+ public:
+  aos::FlatbufferVector<frc971::vision::CameraImage> ReadImage(
+      std::string_view path) {
+    return aos::FileToFlatbuffer<frc971::vision::CameraImage>(
+        "../apriltag_test_bfbs_images/" + std::string(path));
+  }
+
+  cv::Mat ToMat(const frc971::vision::CameraImage *image) {
+    cv::Mat color_image(cv::Size(image->cols(), image->rows()), CV_8UC2,
+                        (void *)image->data()->data());
+    return color_image;
+  }
+};
+
+TEST_F(AprilDetectionTest, ImageRepeat) {
+  auto image = ReadImage("bfbs-capture-2013-01-18_08-54-09.501047728.bfbs");
+
+  LOG(INFO) << "Image is: " << image.message().cols() << " x "
+            << image.message().rows();
+
+  CudaAprilTagDetector cuda_detector(image.message().cols(),
+                                     image.message().rows());
+
+  const cv::Mat color_image = ToMat(&image.message());
+
+  for (size_t i = 0; i < 10; ++i) {
+    LOG(INFO) << "Attempt " << i;
+    cuda_detector.DetectGPU(color_image.clone());
+    cuda_detector.DetectCPU(color_image.clone());
+    cuda_detector.Check(color_image.clone());
+    if (FLAGS_debug) {
+      cuda_detector.WriteDebug(color_image);
+    }
+  }
+}
+
+class SingleAprilDetectionTest
+    : public ::testing::WithParamInterface<std::string_view>,
+      public AprilDetectionTest {};
+
+// Tests a single image.
+TEST_P(SingleAprilDetectionTest, Image) {
+  auto image = ReadImage(GetParam());
+
+  LOG(INFO) << "Testing " << GetParam() << " with dimensions "
+            << image.message().cols() << " x " << image.message().rows();
+
+  CudaAprilTagDetector cuda_detector(image.message().cols(),
+                                     image.message().rows());
+
+  const cv::Mat color_image = ToMat(&image.message());
+
+  cuda_detector.DetectGPU(color_image.clone());
+  cuda_detector.DetectCPU(color_image.clone());
+  cuda_detector.Check(color_image.clone());
+  if (FLAGS_debug) {
+    cuda_detector.WriteDebug(color_image);
+  }
+}
+
+// Tests that both algorithms agree on a bunch of pixels.
+INSTANTIATE_TEST_SUITE_P(
+    CapturedImages, SingleAprilDetectionTest,
+    ::testing::Values("bfbs-capture-2013-01-18_08-54-16.869057537.bfbs",
+                      "bfbs-capture-2013-01-18_08-54-09.501047728.bfbs"
+                      // TODO(austin): Figure out why these fail...
+                      //"bfbs-capture-2013-01-18_08-51-24.861065764.bfbs",
+                      //"bfbs-capture-2013-01-18_08-52-01.846912552.bfbs",
+                      //"bfbs-capture-2013-01-18_08-52-33.462848049.bfbs",
+                      //"bfbs-capture-2013-01-18_08-54-24.931661979.bfbs",
+                      //"bfbs-capture-2013-01-18_09-29-16.806073486.bfbs",
+                      //"bfbs-capture-2013-01-18_09-33-00.993756514.bfbs",
+                      //"bfbs-capture-2013-01-18_08-57-00.171120695.bfbs"
+                      //"bfbs-capture-2013-01-18_08-57-17.858752817.bfbs",
+                      //"bfbs-capture-2013-01-18_08-57-08.096597542.bfbs"
+                      ));
+
+// Tests that QuadBoundaryPoint doesn't corrupt data.
+TEST(QuadBoundaryPoint, MasksWork) {
+  std::mt19937 generator(aos::testing::RandomSeed());
+  std::uniform_int_distribution<uint32_t> random_rep_scalar(0, 0xfffff);
+  std::uniform_int_distribution<uint32_t> random_point_scalar(0, 0x3ff);
+  std::uniform_int_distribution<uint32_t> random_dxy_scalar(0, 3);
+  std::uniform_int_distribution<uint32_t> random_bool(0, 1);
+
+  QuadBoundaryPoint point;
+
+  EXPECT_EQ(point.key, 0);
+
+  for (int i = 0; i < 25; ++i) {
+    const uint32_t rep0 = random_rep_scalar(generator);
+    for (int j = 0; j < 25; ++j) {
+      const uint32_t rep1 = random_rep_scalar(generator);
+      for (int k = 0; k < 25; ++k) {
+        const uint32_t x = random_point_scalar(generator);
+        const uint32_t y = random_point_scalar(generator);
+        for (int k = 0; k < 25; ++k) {
+          const uint32_t dxy = random_dxy_scalar(generator);
+          for (int m = 0; m < 2; ++m) {
+            const bool black_to_white = random_bool(generator) == 1;
+            if (point.rep0() != rep0) {
+              point.set_rep0(rep0);
+            }
+            if (point.rep1() != rep1) {
+              point.set_rep1(rep1);
+            }
+            if (point.base_x() != x || point.base_y() != y) {
+              point.set_base_xy(x, y);
+            }
+            switch (dxy) {
+              case 0:
+                if (point.dx() != 1 || point.dy() != 0) {
+                  point.set_dxy(dxy);
+                }
+                break;
+              case 1:
+                if (point.dx() != 1 || point.dy() != 1) {
+                  point.set_dxy(dxy);
+                }
+                break;
+              case 2:
+                if (point.dx() != 0 || point.dy() != 1) {
+                  point.set_dxy(dxy);
+                }
+                break;
+              case 3:
+                if (point.dx() != -1 || point.dy() != 1) {
+                  point.set_dxy(dxy);
+                }
+                break;
+            }
+            if (black_to_white != point.black_to_white()) {
+              point.set_black_to_white(black_to_white);
+            }
+
+            ASSERT_EQ(point.rep0(), rep0);
+            ASSERT_EQ(point.rep1(), rep1);
+            ASSERT_EQ(point.base_x(), x);
+            ASSERT_EQ(point.base_y(), y);
+            switch (dxy) {
+              case 0:
+                ASSERT_EQ(point.dx(), 1);
+                ASSERT_EQ(point.dy(), 0);
+                break;
+              case 1:
+                ASSERT_EQ(point.dx(), 1);
+                ASSERT_EQ(point.dy(), 1);
+                break;
+              case 2:
+                ASSERT_EQ(point.dx(), 0);
+                ASSERT_EQ(point.dy(), 1);
+                break;
+              case 3:
+                ASSERT_EQ(point.dx(), -1);
+                ASSERT_EQ(point.dy(), 1);
+                break;
+            }
+            ASSERT_EQ(point.x(), x * 2 + point.dx());
+            ASSERT_EQ(point.y(), y * 2 + point.dy());
+
+            ASSERT_EQ(point.black_to_white(), black_to_white);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace frc971::apriltag::testing
diff --git a/frc971/orin/gpu_image.h b/frc971/orin/gpu_image.h
new file mode 100644
index 0000000..732b256
--- /dev/null
+++ b/frc971/orin/gpu_image.h
@@ -0,0 +1,14 @@
+#ifndef FRC971_ORIN_GPU_IMAGE_H_
+#define FRC971_ORIN_GPU_IMAGE_H_
+
+template <typename T>
+struct GpuImage {
+  typedef T type;
+  T *data;
+  size_t rows;
+  size_t cols;
+  // Step is in elements, not bytes.
+  size_t step;
+};
+
+#endif  // FRC971_ORIN_GPU_IMAGE_H_
diff --git a/frc971/orin/labeling_allegretti_2019_BKE.cc b/frc971/orin/labeling_allegretti_2019_BKE.cc
new file mode 100644
index 0000000..2806199
--- /dev/null
+++ b/frc971/orin/labeling_allegretti_2019_BKE.cc
@@ -0,0 +1,494 @@
+// Copyright (c) 2020, the YACCLAB contributors, as
+// shown by the AUTHORS file. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// See
+// https://iris.unimore.it/bitstream/11380/1179616/1/2018_TPDS_Optimized_Block_Based_Algorithms_to_Label_Connected_Components_on_GPUs.pdf
+// for more details.
+//
+// This file was started from the YACCLAB implementation, but is now quite
+// different to match what the april tag detection algorithm wants.  The notable
+// changes are:
+//  * 4 way connectivity on the background at the same time as the foreground.
+//  * 0 and 255 are the only colors considered, 127 means "I'm my own blob".
+//  * Blob size is also computed in parallel using atomics to aid blob size
+//    filtering.
+//
+
+#include "frc971/orin/labeling_allegretti_2019_BKE.h"
+
+#include "glog/logging.h"
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#define BLOCK_ROWS 16
+#define BLOCK_COLS 16
+
+namespace {
+
+//         This is a block-based algorithm.
+// Blocks are 2x2 sized, with internal pixels named as:
+//                       +---+
+//                       |a b|
+//                       |c d|
+//                       +---+
+//
+//       Neighbour blocks of block X are named as:
+//                      +-+-+-+
+//                      |P|Q|R|
+//                      +-+-+-+
+//                      |S|X|
+//                      +-+-+
+
+enum class Info : uint8_t {
+  a = 0,
+  b = 1,
+  c = 2,
+  d = 3,
+  P = 4,
+  Q = 5,
+  R = 6,
+  S = 7
+};
+
+// Only use it with unsigned numeric types
+template <typename T>
+__device__ __forceinline__ uint8_t HasBit(const T bitmap, Info pos) {
+  return (bitmap >> static_cast<uint8_t>(pos)) & 1;
+}
+
+template <typename T>
+__device__ __forceinline__ uint8_t HasBit(const T bitmap, uint8_t pos) {
+  return (bitmap >> pos) & 1;
+}
+
+// Only use it with unsigned numeric types
+__device__ __forceinline__ void SetBit(uint8_t &bitmap, Info pos) {
+  bitmap |= (1 << static_cast<uint8_t>(pos));
+}
+
+// Returns the root index of the UFTree
+__device__ uint32_t Find(const uint32_t *s_buf, uint32_t n) {
+  while (s_buf[n] != n) {
+    n = s_buf[n];
+  }
+  return n;
+}
+
+// Returns the root index of the UFTree, re-assigning ourselves as we go.
+__device__ uint32_t FindAndCompress(uint32_t *s_buf, uint32_t n) {
+  uint32_t id = n;
+  while (s_buf[n] != n) {
+    n = s_buf[n];
+    s_buf[id] = n;
+  }
+  return n;
+}
+
+// Merges the UFTrees of a and b, linking one root to the other
+__device__ void Union(uint32_t *s_buf, uint32_t a, uint32_t b) {
+  bool done;
+
+  do {
+    a = Find(s_buf, a);
+    b = Find(s_buf, b);
+
+    if (a < b) {
+      uint32_t old = atomicMin(s_buf + b, a);
+      done = (old == b);
+      b = old;
+    } else if (b < a) {
+      uint32_t old = atomicMin(s_buf + a, b);
+      done = (old == a);
+      a = old;
+    } else {
+      done = true;
+    }
+
+  } while (!done);
+}
+
+// Initializes the labels in an image to hold the masks, info, and UF tree
+// pointers needed for the next steps.
+__global__ void InitLabeling(const GpuImage<uint8_t> img,
+                             GpuImage<uint32_t> labels) {
+  const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t img_index = row * img.step + col;
+  const uint32_t foreground_labels_index = row * labels.step + col;
+  const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+  if (row >= labels.rows || col >= labels.cols) {
+    return;
+  }
+
+  uint32_t P_foreground = 0;
+  uint32_t P_background = 0;
+
+  // Bitmask representing two kinds of information
+  // Bits 0, 1, 2, 3 are set if pixel a, b, c, d are foreground, respectively
+  // Bits 4, 5, 6, 7 are set if block P, Q, R, S need to be merged to X in
+  // Merge phase
+  uint8_t info_foreground = 0;
+  uint8_t info_left_background = 0;
+  uint8_t info_right_background = 0;
+
+  uint8_t buffer alignas(int)[4];
+  *(reinterpret_cast<int *>(buffer)) = 0;
+
+  // Read pairs of consecutive values in memory at once
+  // This does not depend on endianness
+  *(reinterpret_cast<uint16_t *>(buffer)) =
+      *(reinterpret_cast<uint16_t *>(img.data + img_index));
+
+  *(reinterpret_cast<uint16_t *>(buffer + 2)) =
+      *(reinterpret_cast<uint16_t *>(img.data + img_index + img.step));
+
+  // P is a bitmask saying where to check.
+  //
+  //                     0  1  2  3
+  //                       +----+
+  //                     4 |a  b| 7
+  //                     8 |c  d| 11
+  //                       +----+
+  //                    12  13 14 15
+  if (buffer[0] == 255u) {
+    P_foreground |= 0x777;
+    SetBit(info_foreground, Info::a);
+  } else if (buffer[0] == 0u) {
+    // This is the background, we are only doing 4 way connectivity, only look
+    // in the 4 directions.
+    P_background |= 0x272;
+    SetBit(info_left_background, Info::a);
+  }
+
+  if (buffer[1] == 255u) {
+    P_foreground |= (0x777 << 1);
+    SetBit(info_foreground, Info::b);
+  } else if (buffer[1] == 0u) {
+    P_background |= (0x272 << 1);
+    SetBit(info_right_background, Info::b);
+  }
+
+  if (buffer[2] == 255u) {
+    P_foreground |= (0x777 << 4);
+    SetBit(info_foreground, Info::c);
+  } else if (buffer[2] == 0u) {
+    P_background |= (0x272 << 4);
+    SetBit(info_left_background, Info::c);
+  }
+
+  if (buffer[3] == 255u) {
+    SetBit(info_foreground, Info::d);
+  } else if (buffer[3] == 0u) {
+    SetBit(info_right_background, Info::d);
+  }
+
+  if (col == 0) {
+    P_foreground &= 0xEEEE;
+    P_background &= 0xEEEE;
+  }
+  if (col + 2 >= img.cols) {
+    P_foreground &= 0x7777;
+    P_background &= 0x7777;
+  }
+
+  if (row == 0) {
+    P_foreground &= 0xFFF0;
+    P_background &= 0xFFF0;
+  }
+  if (row + 2 >= img.rows) {
+    P_foreground &= 0x0FFF;
+    P_background &= 0x0FFF;
+  }
+
+  // P is now ready to be used to find neighbour blocks
+  // P value avoids range errors
+  int father_offset_foreground = 0;
+  int father_offset_left_background = 0;
+  int father_offset_right_background = 0;
+
+  // P square
+  if (HasBit(P_foreground, 0) && img.data[img_index - img.step - 1] == 255u) {
+    father_offset_foreground = -(2 * (labels.step) + 2);
+  }
+
+  // Q square
+  if ((HasBit(P_foreground, 1) && img.data[img_index - img.step] == 255u) ||
+      (HasBit(P_foreground, 2) && img.data[img_index + 1 - img.step] == 255u)) {
+    if (!father_offset_foreground) {
+      father_offset_foreground = -(2 * (labels.step));
+    } else {
+      SetBit(info_foreground, Info::Q);
+    }
+  }
+  if ((HasBit(P_background, 1) && img.data[img_index - img.step] == 0u)) {
+    father_offset_left_background = -2 * labels.step;
+  }
+  if ((HasBit(P_background, 2) && img.data[img_index + 1 - img.step] == 0u)) {
+    father_offset_right_background = -2 * labels.step;
+  }
+
+  // R square
+  if (HasBit(P_foreground, 3) && img.data[img_index + 2 - img.step] == 255u) {
+    if (!father_offset_foreground) {
+      father_offset_foreground = -(2 * (labels.step) - 2);
+    } else {
+      SetBit(info_foreground, Info::R);
+    }
+  }
+
+  // S square
+  if ((HasBit(P_foreground, 4) && img.data[img_index - 1] == 255u) ||
+      (HasBit(P_foreground, 8) && img.data[img_index + img.step - 1] == 255u)) {
+    if (!father_offset_foreground) {
+      father_offset_foreground = -2;
+    } else {
+      SetBit(info_foreground, Info::S);
+    }
+  }
+  if ((HasBit(P_background, 4) && img.data[img_index - 1] == 0u) ||
+      (HasBit(P_background, 8) && img.data[img_index + img.step - 1] == 0u)) {
+    if (!father_offset_left_background) {
+      father_offset_left_background = -1;
+    } else {
+      SetBit(info_left_background, Info::S);
+    }
+  }
+
+  if ((HasBit(info_left_background, Info::a) &&
+       HasBit(info_right_background, Info::b)) ||
+      (HasBit(info_left_background, Info::c) &&
+       HasBit(info_right_background, Info::d))) {
+    if (!father_offset_right_background) {
+      father_offset_right_background = -1;
+    } else {
+      SetBit(info_right_background, Info::S);
+    }
+  }
+
+  // Now, write everything back out to memory.
+  *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index) =
+      static_cast<uint64_t>(foreground_labels_index +
+                            father_offset_foreground) +
+      (static_cast<uint64_t>(info_foreground) << 32) +
+      (static_cast<uint64_t>(info_left_background) << 40) +
+      (static_cast<uint64_t>(info_right_background) << 48);
+
+  *reinterpret_cast<uint64_t *>(labels.data + background_labels_index) =
+      static_cast<uint64_t>(background_labels_index +
+                            father_offset_left_background) +
+      (static_cast<uint64_t>(background_labels_index +
+                             father_offset_right_background + 1)
+       << 32);
+}
+
+__global__ void Compression(GpuImage<uint32_t> labels) {
+  const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t foreground_labels_index = row * labels.step + col;
+  const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+  if (row >= labels.rows || col >= labels.cols) {
+    return;
+  }
+
+  FindAndCompress(labels.data, foreground_labels_index);
+  FindAndCompress(labels.data, background_labels_index);
+  FindAndCompress(labels.data, background_labels_index + 1);
+}
+
+__global__ void Merge(GpuImage<uint32_t> labels) {
+  const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t foreground_labels_index = row * labels.step + col;
+  const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+  if (row >= labels.rows || col >= labels.cols) {
+    return;
+  }
+
+  const uint32_t info = *(labels.data + foreground_labels_index + 1);
+
+  const uint8_t info_foreground = info & 0xff;
+  const uint8_t info_left_background = (info >> 8) & 0xff;
+  const uint8_t info_right_background = (info >> 16) & 0xff;
+
+  if (HasBit(info_foreground, Info::Q)) {
+    Union(labels.data, foreground_labels_index,
+          foreground_labels_index - 2 * (labels.step));
+  }
+
+  if (HasBit(info_foreground, Info::R)) {
+    Union(labels.data, foreground_labels_index,
+          foreground_labels_index - 2 * (labels.step) + 2);
+  }
+
+  if (HasBit(info_foreground, Info::S)) {
+    Union(labels.data, foreground_labels_index, foreground_labels_index - 2);
+  }
+
+  if (HasBit(info_left_background, Info::S)) {
+    Union(labels.data, background_labels_index, background_labels_index - 1);
+  }
+  if (HasBit(info_right_background, Info::S)) {
+    Union(labels.data, background_labels_index + 1, background_labels_index);
+  }
+}
+
+__global__ void FinalLabeling(GpuImage<uint32_t> labels,
+                              GpuImage<uint32_t> union_markers_size_device) {
+  const unsigned row = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
+  const unsigned col = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
+  const uint32_t foreground_labels_index = row * labels.step + col;
+  const uint32_t background_labels_index = (row + 1) * labels.step + col;
+
+  if (row >= labels.rows || col >= labels.cols) {
+    return;
+  }
+
+  const uint64_t foreground_buffer =
+      *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index);
+  const uint32_t foreground_label = (foreground_buffer & (0xFFFFFFFF));
+  const uint8_t foreground_info = (foreground_buffer >> 32) & 0xFF;
+  const uint64_t background_buffer =
+      *reinterpret_cast<uint64_t *>(labels.data + background_labels_index);
+  const uint32_t background_left_label = (background_buffer & (0xFFFFFFFF));
+  const uint32_t background_right_label =
+      ((background_buffer >> 32) & (0xFFFFFFFF));
+  const uint8_t background_left_info = (foreground_buffer >> 40) & 0xFF;
+  const uint8_t background_right_info = (foreground_buffer >> 48) & 0xFF;
+
+  uint32_t a_label;
+  uint32_t b_label;
+  uint32_t c_label;
+  uint32_t d_label;
+
+  if ((foreground_info & 0xf) == 0u && (background_left_info & 0xf) == 0u &&
+      (background_right_info & 0xf) == 0u) {
+    a_label = foreground_labels_index;
+    b_label = foreground_labels_index + 1;
+    c_label = background_labels_index;
+    d_label = background_labels_index + 1;
+    *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index) =
+        (static_cast<uint64_t>(b_label) << 32) | a_label;
+    *reinterpret_cast<uint64_t *>(labels.data + background_labels_index) =
+        (static_cast<uint64_t>(d_label) << 32) | (c_label);
+    return;
+  } else {
+    a_label = (HasBit(foreground_info, Info::a) * foreground_label +
+               HasBit(background_left_info, Info::a) * background_left_label);
+    b_label = HasBit(foreground_info, Info::b) * foreground_label +
+              HasBit(background_right_info, Info::b) * background_right_label;
+    c_label = HasBit(foreground_info, Info::c) * foreground_label +
+              HasBit(background_left_info, Info::c) * background_left_label;
+    d_label = HasBit(foreground_info, Info::d) * foreground_label +
+              HasBit(background_right_info, Info::d) * background_right_label;
+
+    *reinterpret_cast<uint64_t *>(labels.data + foreground_labels_index) =
+        (static_cast<uint64_t>(b_label) << 32) | a_label;
+    *reinterpret_cast<uint64_t *>(labels.data + background_labels_index) =
+        (static_cast<uint64_t>(d_label) << 32) | (c_label);
+  }
+
+  if ((foreground_info & 0xf) != 0u) {
+    // We've got foreground!
+    size_t count = 0;
+    if (HasBit(foreground_info, Info::a)) {
+      ++count;
+    }
+    if (HasBit(foreground_info, Info::b)) {
+      ++count;
+    }
+    if (HasBit(foreground_info, Info::c)) {
+      ++count;
+    }
+    if (HasBit(foreground_info, Info::d)) {
+      ++count;
+    }
+
+    atomicAdd(union_markers_size_device.data + foreground_label, count);
+  }
+
+  if ((background_left_info & 0xf) == 0u &&
+      (background_right_info & 0xf) == 0u) {
+    return;
+  }
+
+  if ((background_left_info & 0xf) != 0u &&
+      (background_right_info & 0xf) != 0u &&
+      background_left_label == background_right_label) {
+    // They are all populated and match, go for it.
+    size_t count = 0;
+    if (HasBit(background_left_info, Info::a)) {
+      ++count;
+    }
+    if (HasBit(background_right_info, Info::b)) {
+      ++count;
+    }
+    if (HasBit(background_left_info, Info::c)) {
+      ++count;
+    }
+    if (HasBit(background_right_info, Info::d)) {
+      ++count;
+    }
+
+    atomicAdd(union_markers_size_device.data + background_left_label, count);
+    return;
+  }
+
+  if ((background_left_info & 0xf) != 0u) {
+    size_t count = 0;
+    if (HasBit(background_left_info, Info::a)) {
+      ++count;
+    }
+    if (HasBit(background_left_info, Info::c)) {
+      ++count;
+    }
+    atomicAdd(union_markers_size_device.data + background_left_label, count);
+  }
+
+  if ((background_right_info & 0xf) != 0u) {
+    size_t count = 0;
+    if (HasBit(background_right_info, Info::b)) {
+      ++count;
+    }
+    if (HasBit(background_right_info, Info::d)) {
+      ++count;
+    }
+    atomicAdd(union_markers_size_device.data + background_right_label, count);
+  }
+}
+
+}  // namespace
+
+void LabelImage(const GpuImage<uint8_t> input, GpuImage<uint32_t> output,
+                GpuImage<uint32_t> union_markers_size_device,
+                cudaStream_t stream) {
+  CHECK_NE(input.rows, 1u);
+  CHECK_NE(input.cols, 1u);
+
+  // Need an even number of rows and colums, we don't need to solve the actual
+  // hard problems...
+  CHECK_EQ(input.rows % 2, 0u);
+  CHECK_EQ(input.cols % 2, 0u);
+
+  dim3 grid_size =
+      dim3((((input.cols + 1) / 2) + BLOCK_COLS - 1) / BLOCK_COLS,
+           (((input.rows + 1) / 2) + BLOCK_ROWS - 1) / BLOCK_ROWS, 1);
+  dim3 block_size = dim3(BLOCK_COLS, BLOCK_ROWS, 1);
+
+  InitLabeling<<<grid_size, block_size, 0, stream>>>(input, output);
+
+  Compression<<<grid_size, block_size, 0, stream>>>(output);
+
+  Merge<<<grid_size, block_size, 0, stream>>>(output);
+
+  Compression<<<grid_size, block_size, 0, stream>>>(output);
+
+  FinalLabeling<<<grid_size, block_size, 0, stream>>>(
+      output, union_markers_size_device);
+}
diff --git a/frc971/orin/labeling_allegretti_2019_BKE.h b/frc971/orin/labeling_allegretti_2019_BKE.h
new file mode 100644
index 0000000..4c87f7f
--- /dev/null
+++ b/frc971/orin/labeling_allegretti_2019_BKE.h
@@ -0,0 +1,12 @@
+#ifndef FRC971_ORIN_LABELING_ALLEGRETTI_2019_BKE_H_
+#define FRC971_ORIN_LABELING_ALLEGRETTI_2019_BKE_H_
+
+#include <cuda_runtime.h>
+
+#include "frc971/orin/gpu_image.h"
+
+void LabelImage(const GpuImage<uint8_t> input, GpuImage<uint32_t> output,
+                GpuImage<uint32_t> union_markers_size_device,
+                cudaStream_t stream);
+
+#endif  // FRC971_ORIN_LABELING_ALLEGRETTI_2019_BKE_H_
diff --git a/frc971/orin/points.cc b/frc971/orin/points.cc
new file mode 100644
index 0000000..dbf5496
--- /dev/null
+++ b/frc971/orin/points.cc
@@ -0,0 +1,35 @@
+#include "frc971/orin/points.h"
+
+#include <iomanip>
+#include <ostream>
+
+namespace frc971 {
+namespace apriltag {
+
+std::ostream &operator<<(std::ostream &os, const QuadBoundaryPoint &point) {
+  std::ios_base::fmtflags original_flags = os.flags();
+
+  os << "key:" << std::hex << std::setw(16) << std::setfill('0') << point.key
+     << " rep01:" << std::setw(10) << point.rep01() << " pt:" << std::setw(6)
+     << point.point_bits();
+  os.flags(original_flags);
+  return os;
+}
+
+static_assert(sizeof(QuadBoundaryPoint) == 8,
+              "QuadBoundaryPoint didn't pack right.");
+
+std::ostream &operator<<(std::ostream &os, const IndexPoint &point) {
+  std::ios_base::fmtflags original_flags = os.flags();
+
+  os << "key:" << std::hex << std::setw(16) << std::setfill('0') << point.key
+     << " i:" << std::setw(3) << point.blob_index() << " t:" << std::setw(7)
+     << point.theta() << " p:" << std::setw(6) << point.point_bits();
+  os.flags(original_flags);
+  return os;
+}
+
+static_assert(sizeof(IndexPoint) == 8, "IndexPoint didn't pack right.");
+
+}  // namespace apriltag
+}  // namespace frc971
diff --git a/frc971/orin/points.h b/frc971/orin/points.h
new file mode 100644
index 0000000..312d90a
--- /dev/null
+++ b/frc971/orin/points.h
@@ -0,0 +1,295 @@
+#ifndef FRC971_ORIN_POINTS_H_
+#define FRC971_ORIN_POINTS_H_
+
+#include <stdint.h>
+
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cuda/std/tuple>
+#include <iomanip>
+#include <ostream>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the 2 adjacent blob IDs, a point in decimated image space, the
+// half pixel offset, and the gradient.
+//
+// rep0 and rep1 are the two blob ids, and are each allocated 20 bits.
+// point is the base point and is allocated 10 bits for x and 10 bits for y.
+// dx and dy are allocated 2 bits, and can only take on set values.
+// black_to_white captures the direction of the gradient in 1 bit.
+//
+// This adds up to 63 bits so we can load this with one big load.
+struct QuadBoundaryPoint {
+  static constexpr size_t kRepEndBit = 24;
+  static constexpr size_t kBitsInKey = 64;
+
+  __forceinline__ __host__ __device__ QuadBoundaryPoint() : key(0) {}
+
+  // Sets rep0, the 0th blob id.  This only respects the bottom 20 bits.
+  __forceinline__ __host__ __device__ void set_rep0(uint32_t rep0) {
+    key = (key & 0xfffff00000ffffffull) |
+          (static_cast<uint64_t>(rep0 & 0xfffff) << 24);
+  }
+  // Returns rep0.
+  __forceinline__ __host__ __device__ uint32_t rep0() const {
+    return ((key >> 24) & 0xfffff);
+  }
+
+  // Sets rep1, the 1st blob id.  This only respects the bottom 20 bits.
+  __forceinline__ __host__ __device__ void set_rep1(uint32_t rep1) {
+    key = (key & 0xfffffffffffull) |
+          (static_cast<uint64_t>(rep1 & 0xfffff) << 44);
+  }
+  // Returns rep1.
+  __forceinline__ __host__ __device__ uint32_t rep1() const {
+    return ((key >> 44) & 0xfffff);
+  }
+
+  // Returns both rep0 and rep1 concatenated into a single 40 bit number.
+  __forceinline__ __host__ __device__ uint64_t rep01() const {
+    return ((key >> 24) & 0xffffffffff);
+  }
+
+  // Returns all the bits used to hold position and gradient information.
+  __forceinline__ __host__ __device__ uint32_t point_bits() const {
+    return key & 0xffffff;
+  }
+
+  // Sets the 10 bit x and y.
+  __forceinline__ __host__ __device__ void set_base_xy(uint32_t x, uint32_t y) {
+    key = (key & 0xffffffffff00000full) |
+          (static_cast<uint64_t>(x & 0x3ff) << 14) |
+          (static_cast<uint64_t>(y & 0x3ff) << 4);
+  }
+
+  // Returns the base 10 bit x and y.
+  __forceinline__ __host__ __device__ uint32_t base_x() const {
+    return ((key >> 14) & 0x3ff);
+  }
+  __forceinline__ __host__ __device__ uint32_t base_y() const {
+    return ((key >> 4) & 0x3ff);
+  }
+
+  // Sets dxy, the integer representing which of the 4 search directions we
+  // went.
+  __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+    key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+  }
+
+  // Returns the change in x derived from the search direction.
+  __forceinline__ __host__ __device__ int32_t dx() const {
+    switch (key & 0x3) {
+      case 0:
+        return 1;
+      case 1:
+        return 1;
+      case 2:
+        return 0;
+      case 3:
+        return -1;
+    }
+    return 0;
+  }
+
+  // Returns the change in y derived from the search direction.
+  __forceinline__ __host__ __device__ int32_t dy() const {
+    switch (key & 0x3) {
+      case 0:
+        return 0;
+      case 1:
+      case 2:
+      case 3:
+        return 1;
+    }
+    return 0;
+  }
+
+  // Returns the un-decimated x and y positions.
+  __forceinline__ __host__ __device__ uint32_t x() const {
+    return static_cast<int32_t>(base_x() * 2) + dx();
+  }
+  __forceinline__ __host__ __device__ uint32_t y() const {
+    return static_cast<int32_t>(base_y() * 2) + dy();
+  }
+
+  // Returns the gradient that this point represents, taking into account which
+  // direction the color transitioned.
+  __forceinline__ __host__ __device__ int8_t gx() const {
+    return black_to_white() ? dx() : -dx();
+  }
+  __forceinline__ __host__ __device__ int8_t gy() const {
+    return black_to_white() ? dy() : -dy();
+  }
+
+  // Returns the black to white or white to black bit.
+  __forceinline__ __host__ __device__ void set_black_to_white(
+      bool black_to_white) {
+    key = (key & 0xfffffffffffffff7ull) |
+          (static_cast<uint64_t>(black_to_white) << 3);
+  }
+  __forceinline__ __host__ __device__ bool black_to_white() const {
+    return (key & 0x8) != 0;
+  }
+
+  // Various operators to make it easy to compare points.
+  __forceinline__ __host__ __device__ bool operator!=(
+      const QuadBoundaryPoint other) const {
+    return other.key != key;
+  }
+  __forceinline__ __host__ __device__ bool operator==(
+      const QuadBoundaryPoint other) const {
+    return other.key == key;
+  }
+  __forceinline__ __host__ __device__ bool operator<(
+      const QuadBoundaryPoint other) const {
+    return key < other.key;
+  }
+
+  // Returns true if this point has been set.  Zero is reserved for "invalid"
+  __forceinline__ __host__ __device__ bool nonzero() const {
+    return key != 0ull;
+  }
+
+  // Returns true if this point is about the other point.
+  bool near(QuadBoundaryPoint other) const { return other == *this; }
+
+  // The key.  This shouldn't be parsed directly.
+  uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const QuadBoundaryPoint &point);
+
+// Holds a compacted blob index, the angle to the X axis from the center of the
+// blob, and the coordinate of the point.
+//
+// The blob index is 12 bits, the angle is 28 bits, and the point is 24 bits.
+struct IndexPoint {
+  // Max number of blob IDs we can hold.
+  static constexpr size_t kMaxBlobs = 2048;
+
+  static constexpr size_t kRepEndBit = 24;
+  static constexpr size_t kBitsInKey = 64;
+
+  __forceinline__ __host__ __device__ IndexPoint() : key(0) {}
+
+  // Constructor to build a point with just the blob index, and point bits.  The
+  // point bits should be grabbed from a QuadBoundaryPoint rather than built up
+  // by hand.
+  __forceinline__ __host__ __device__ IndexPoint(uint32_t blob_index,
+                                                 uint32_t point_bits)
+      : key((static_cast<uint64_t>(blob_index & 0xfff) << 52) |
+            (static_cast<uint64_t>(point_bits & 0xffffff))) {}
+
+  // Sets and gets the 12 bit blob index.
+  __forceinline__ __host__ __device__ void set_blob_index(uint32_t blob_index) {
+    key = (key & 0x000fffffffffffffull) |
+          (static_cast<uint64_t>(blob_index & 0xfff) << 52);
+  }
+  __forceinline__ __host__ __device__ uint32_t blob_index() const {
+    return ((key >> 52) & 0xfff);
+  }
+
+  // Sets and gets the 28 bit angle.
+  __forceinline__ __host__ __device__ void set_theta(uint32_t theta) {
+    key = (key & 0xfff0000000ffffffull) |
+          (static_cast<uint64_t>(theta & 0xfffffff) << 24);
+  }
+  __forceinline__ __host__ __device__ uint32_t theta() const {
+    return ((key >> 24) & 0xfffffff);
+  }
+
+  // See QuadBoundaryPoint for a description of the rest of these.
+  __forceinline__ __host__ __device__ uint32_t base_x() const {
+    return ((key >> 14) & 0x3ff);
+  }
+  __forceinline__ __host__ __device__ uint32_t base_y() const {
+    return ((key >> 4) & 0x3ff);
+  }
+
+  __forceinline__ __host__ __device__ void set_dxy(uint64_t dxy) {
+    key = (key & 0xfffffffffffffffcull) | (static_cast<uint64_t>(dxy & 0x3));
+  }
+
+  __forceinline__ __host__ __device__ int32_t dx() const {
+    switch (key & 0x3) {
+      case 0:
+        return 1;
+      case 1:
+        return 1;
+      case 2:
+        return 0;
+      case 3:
+        return -1;
+    }
+    return 0;
+  }
+
+  __forceinline__ __host__ __device__ int32_t dy() const {
+    switch (key & 0x3) {
+      case 0:
+        return 0;
+      case 1:
+      case 2:
+      case 3:
+        return 1;
+    }
+    return 0;
+  }
+
+  __forceinline__ __host__ __device__ uint32_t x() const {
+    return static_cast<int32_t>(base_x() * 2) + dx();
+  }
+  __forceinline__ __host__ __device__ uint32_t y() const {
+    return static_cast<int32_t>(base_y() * 2) + dy();
+  }
+
+  __forceinline__ __host__ __device__ int8_t gx() const {
+    return black_to_white() ? dx() : -dx();
+  }
+  __forceinline__ __host__ __device__ int8_t gy() const {
+    return black_to_white() ? dy() : -dy();
+  }
+
+  __forceinline__ __host__ __device__ uint32_t point_bits() const {
+    return key & 0xffffff;
+  }
+
+  __forceinline__ __host__ __device__ void set_black_to_white(
+      bool black_to_white) {
+    key = (key & 0xfffffffffffffff7ull) |
+          (static_cast<uint64_t>(black_to_white) << 3);
+  }
+  __forceinline__ __host__ __device__ bool black_to_white() const {
+    return (key & 0x8) != 0;
+  }
+
+  // The key.  This shouldn't be parsed directly.
+  uint64_t key;
+};
+
+std::ostream &operator<<(std::ostream &os, const IndexPoint &point);
+
+// Decomposer for sorting which just returns the key.
+struct QuadBoundaryPointDecomposer {
+  __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+      QuadBoundaryPoint &key) const {
+    return {key.key};
+  }
+};
+
+// Decomposer for sorting which just returns the key.
+struct QuadIndexPointDecomposer {
+  __host__ __device__ ::cuda::std::tuple<uint64_t &> operator()(
+      IndexPoint &key) const {
+    return {key.key};
+  }
+};
+
+}  // namespace apriltag
+}  // namespace frc971
+
+#endif  // FRC971_ORIN_POINTS_H_
diff --git a/frc971/orin/threshold.cc b/frc971/orin/threshold.cc
new file mode 100644
index 0000000..79ccced
--- /dev/null
+++ b/frc971/orin/threshold.cc
@@ -0,0 +1,206 @@
+#include "frc971/orin/threshold.h"
+
+#include <stdint.h>
+
+#include "frc971/orin/cuda.h"
+
+namespace frc971 {
+namespace apriltag {
+namespace {
+
+// 1280 -> 2 * 128 * 5
+// 720 -> 2 * 8 * 5 * 9
+//
+// 1456 -> 2 * 8 * 7 * 13
+// 1088 -> 2 * 32 * 17
+
+// Writes out the grayscale image and decimated image.
+__global__ void InternalCudaToGreyscaleAndDecimateHalide(
+    const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+    size_t width, size_t height) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  while (i < width * height) {
+    uint8_t pixel = gray_image[i] = color_image[i * 2];
+
+    const size_t row = i / width;
+    const size_t col = i - width * row;
+
+    // Copy over every other pixel.
+    if (row % 2 == 0 && col % 2 == 0) {
+      size_t decimated_row = row / 2;
+      size_t decimated_col = col / 2;
+      decimated_image[decimated_row * width / 2 + decimated_col] = pixel;
+    }
+    i += blockDim.x * gridDim.x;
+  }
+
+  // TODO(austin): Figure out how to load contiguous memory reasonably
+  // efficiently and max/min over it.
+
+  // TODO(austin): Can we do the threshold here too?  That would be less memory
+  // bandwidth consumed...
+}
+
+// Returns the min and max for a row of 4 pixels.
+__forceinline__ __device__ uchar2 minmax(uchar4 row) {
+  uint8_t min_val = std::min(std::min(row.x, row.y), std::min(row.z, row.w));
+  uint8_t max_val = std::max(std::max(row.x, row.y), std::max(row.z, row.w));
+  return make_uchar2(min_val, max_val);
+}
+
+// Returns the min and max for a set of min and maxes.
+__forceinline__ __device__ uchar2 minmax(uchar2 val0, uchar2 val1) {
+  return make_uchar2(std::min(val0.x, val1.x), std::max(val0.y, val1.y));
+}
+
+// Returns the pixel index of a pixel at the provided x and y location.
+__forceinline__ __device__ size_t XYToIndex(size_t width, size_t x, size_t y) {
+  return width * y + x;
+}
+
+// Computes the min and max pixel value for each block of 4 pixels.
+__global__ void InternalBlockMinMax(const uint8_t *decimated_image,
+                                    uchar2 *unfiltered_minmax_image,
+                                    size_t width, size_t height) {
+  uchar2 vals[4];
+  const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (x >= width || y >= height) {
+    return;
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    const uchar4 decimated_block = *reinterpret_cast<const uchar4 *>(
+        decimated_image + XYToIndex(width * 4, x * 4, y * 4 + i));
+
+    vals[i] = minmax(decimated_block);
+  }
+
+  unfiltered_minmax_image[XYToIndex(width, x, y)] =
+      minmax(minmax(vals[0], vals[1]), minmax(vals[2], vals[3]));
+}
+
+// Filters the min/max for the surrounding block of 9 pixels centered on our
+// location using min/max and writes the result back out.
+__global__ void InternalBlockFilter(const uchar2 *unfiltered_minmax_image,
+                                    uchar2 *minmax_image, size_t width,
+                                    size_t height) {
+  uchar2 result = make_uchar2(255, 0);
+
+  const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (x >= width || y >= height) {
+    return;
+  }
+
+  // Iterate through the 3x3 set of points centered on the point this image is
+  // responsible for, and compute the overall min/max.
+#pragma unroll
+  for (int i = -1; i <= 1; ++i) {
+#pragma unroll
+    for (int j = -1; j <= 1; ++j) {
+      const ssize_t read_x = x + i;
+      const ssize_t read_y = y + j;
+
+      if (read_x < 0 || read_x >= static_cast<ssize_t>(width)) {
+        continue;
+      }
+      if (read_y < 0 || read_y >= static_cast<ssize_t>(height)) {
+        continue;
+      }
+
+      result = minmax(
+          result, unfiltered_minmax_image[XYToIndex(width, read_x, read_y)]);
+    }
+  }
+
+  minmax_image[XYToIndex(width, x, y)] = result;
+}
+
+// Thresholds the image based on the filtered thresholds.
+__global__ void InternalThreshold(const uint8_t *decimated_image,
+                                  const uchar2 *minmax_image,
+                                  uint8_t *thresholded_image, size_t width,
+                                  size_t height, size_t min_white_black_diff) {
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  while (i < width * height) {
+    const size_t x = i % width;
+    const size_t y = i / width;
+
+    const uchar2 minmax_val = minmax_image[x / 4 + (y / 4) * width / 4];
+
+    uint8_t result;
+    if (minmax_val.y - minmax_val.x < min_white_black_diff) {
+      result = 127;
+    } else {
+      uint8_t thresh = minmax_val.x + (minmax_val.y - minmax_val.x) / 2;
+      if (decimated_image[i] > thresh) {
+        result = 255;
+      } else {
+        result = 0;
+      }
+    }
+
+    thresholded_image[i] = result;
+    i += blockDim.x * gridDim.x;
+  }
+}
+
+}  // namespace
+
+void CudaToGreyscaleAndDecimateHalide(
+    const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+    uint8_t *unfiltered_minmax_image, uint8_t *minmax_image,
+    uint8_t *thresholded_image, size_t width, size_t height,
+    size_t min_white_black_diff, CudaStream *stream) {
+  CHECK((width % 8) == 0);
+  CHECK((height % 8) == 0);
+  constexpr size_t kThreads = 256;
+  {
+    // Step one, convert to gray and decimate.
+    size_t kBlocks = (width * height + kThreads - 1) / kThreads / 4;
+    InternalCudaToGreyscaleAndDecimateHalide<<<kBlocks, kThreads, 0,
+                                               stream->get()>>>(
+        color_image, gray_image, decimated_image, width, height);
+    MaybeCheckAndSynchronize();
+  }
+
+  size_t decimated_width = width / 2;
+  size_t decimated_height = height / 2;
+
+  {
+    // Step 2, compute a min/max for each block of 4x4 (16) pixels.
+    dim3 threads(16, 16, 1);
+    dim3 blocks((decimated_width / 4 + 15) / 16,
+                (decimated_height / 4 + 15) / 16, 1);
+
+    InternalBlockMinMax<<<blocks, threads, 0, stream->get()>>>(
+        decimated_image, reinterpret_cast<uchar2 *>(unfiltered_minmax_image),
+        decimated_width / 4, decimated_height / 4);
+    MaybeCheckAndSynchronize();
+
+    // Step 3, Blur those min/max's a further +- 1 block in each direction using
+    // min/max.
+    InternalBlockFilter<<<blocks, threads, 0, stream->get()>>>(
+        reinterpret_cast<uchar2 *>(unfiltered_minmax_image),
+        reinterpret_cast<uchar2 *>(minmax_image), decimated_width / 4,
+        decimated_height / 4);
+    MaybeCheckAndSynchronize();
+  }
+
+  {
+    // Now, write out 127 if the min/max are too close to each other, or 0/255
+    // if the pixels are above or below the average of the min/max.
+    size_t kBlocks = (width * height / 4 + kThreads - 1) / kThreads / 4;
+    InternalThreshold<<<kBlocks, kThreads, 0, stream->get()>>>(
+        decimated_image, reinterpret_cast<uchar2 *>(minmax_image),
+        thresholded_image, decimated_width, decimated_height,
+        min_white_black_diff);
+    MaybeCheckAndSynchronize();
+  }
+}
+
+}  // namespace apriltag
+}  // namespace frc971
diff --git a/frc971/orin/threshold.h b/frc971/orin/threshold.h
new file mode 100644
index 0000000..5cdc3a2
--- /dev/null
+++ b/frc971/orin/threshold.h
@@ -0,0 +1,22 @@
+#ifndef FRC971_ORIN_THRESHOLD_H_
+#define FRC971_ORIN_THRESHOLD_H_
+
+#include <stdint.h>
+
+#include "frc971/orin/cuda.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Converts to grayscale, decimates, and thresholds an image on the provided
+// stream.
+void CudaToGreyscaleAndDecimateHalide(
+    const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+    uint8_t *unfiltered_minmax_image, uint8_t *minmax_image,
+    uint8_t *thresholded_image, size_t width, size_t height,
+    size_t min_white_black_diff, CudaStream *stream);
+
+}  // namespace apriltag
+}  // namespace frc971
+
+#endif  // FRC971_ORIN_THRESHOLD_H_
diff --git a/third_party/apriltag/apriltag_quad_thresh.c b/third_party/apriltag/apriltag_quad_thresh.c
index 09dc202..e4c8b4e 100644
--- a/third_party/apriltag/apriltag_quad_thresh.c
+++ b/third_party/apriltag/apriltag_quad_thresh.c
@@ -991,7 +991,7 @@
     int y = 0;
     uint8_t v;
 
-    for (int x = 1; x < w - 1; x++) {
+    for (int x = 1; x < w; x++) {
         v = im->buf[y*s + x];
 
         if (v == 127)
@@ -1010,8 +1010,15 @@
     uint8_t v_1_m1 = im->buf[(y - 1)*s + 1];
     uint8_t v_m1_0;
     uint8_t v = im->buf[y*s];
+    {
+      int x = 0;
+      if (v != 127) {
+        DO_UNIONFIND2(0, -1);
+        DO_UNIONFIND2(1, -1);
+      }
+    }
 
-    for (int x = 1; x < w - 1; x++) {
+    for (int x = 1; x < w; x++) {
         v_m1_m1 = v_0_m1;
         v_0_m1 = v_1_m1;
         v_1_m1 = im->buf[(y - 1)*s + x + 1];
@@ -1034,7 +1041,7 @@
             if (x == 1 || !(v_m1_0 == v_m1_m1 || v_0_m1 == v_m1_m1) ) {
                 DO_UNIONFIND2(-1, -1);
             }
-            if (!(v_0_m1 == v_1_m1)) {
+            if ((x < w - 1) && !(v_0_m1 == v_1_m1)) {
                 DO_UNIONFIND2(1, -1);
             }
         }
@@ -1521,46 +1528,52 @@
             // A possible optimization would be to combine entries
             // within the same cluster.
 
-#define DO_CONN(dx, dy)                                                 \
-            if (1) {                                                    \
-                uint8_t v1 = threshim->buf[(y + dy)*ts + x + dx];       \
-                                                                        \
-                if (v0 + v1 == 255) {                                   \
-                    uint64_t rep1 = unionfind_get_representative(uf, (y + dy)*w + x + dx); \
-                    if (unionfind_get_set_size(uf, rep1) > 24) {        \
-                        uint64_t clusterid;                                 \
-                        if (rep0 < rep1)                                    \
-                            clusterid = (rep1 << 32) + rep0;                \
-                        else                                                \
-                            clusterid = (rep0 << 32) + rep1;                \
-                                                                            \
-                        /* XXX lousy hash function */                       \
-                        uint32_t clustermap_bucket = u64hash_2(clusterid) % nclustermap; \
-                        struct uint64_zarray_entry *entry = clustermap[clustermap_bucket]; \
-                        while (entry && entry->id != clusterid) {           \
-                            entry = entry->next;                            \
-                        }                                                   \
-                                                                            \
-                        if (!entry) {                                       \
-                            if (mem_pool_loc == mem_chunk_size) {           \
-                                mem_pool_loc = 0;                           \
-                                mem_pool_idx++;                             \
-                                mem_pools[mem_pool_idx] = calloc(mem_chunk_size, sizeof(struct uint64_zarray_entry)); \
-                            }                                               \
-                            entry = mem_pools[mem_pool_idx] + mem_pool_loc; \
-                            mem_pool_loc++;                                 \
-                                                                            \
-                            entry->id = clusterid;                          \
-                            entry->cluster = zarray_create(sizeof(struct pt)); \
-                            entry->next = clustermap[clustermap_bucket];    \
-                            clustermap[clustermap_bucket] = entry;          \
-                        }                                                   \
-                                                                            \
-                        struct pt p = { .x = 2*x + dx, .y = 2*y + dy, .gx = dx*((int) v1-v0), .gy = dy*((int) v1-v0)}; \
-                        zarray_add(entry->cluster, &p);                     \
-                    }                                                   \
-                }                                                       \
-            }
+#define DO_CONN(dx, dy)                                                        \
+  if (1) {                                                                     \
+    uint8_t v1 = threshim->buf[(y + dy) * ts + x + dx];                        \
+                                                                               \
+    if (v0 + v1 == 255) {                                                      \
+      uint64_t rep1 = unionfind_get_representative(uf, (y + dy) * w + x + dx); \
+      if (unionfind_get_set_size(uf, rep1) > 24) {                             \
+        uint64_t clusterid;                                                    \
+        if (rep0 < rep1)                                                       \
+          clusterid = (rep1 << 32) + rep0;                                     \
+        else                                                                   \
+          clusterid = (rep0 << 32) + rep1;                                     \
+                                                                               \
+        /* XXX lousy hash function */                                          \
+        uint32_t clustermap_bucket = u64hash_2(clusterid) % nclustermap;       \
+        struct uint64_zarray_entry *entry = clustermap[clustermap_bucket];     \
+        while (entry && entry->id != clusterid) {                              \
+          entry = entry->next;                                                 \
+        }                                                                      \
+                                                                               \
+        if (!entry) {                                                          \
+          if (mem_pool_loc == mem_chunk_size) {                                \
+            mem_pool_loc = 0;                                                  \
+            mem_pool_idx++;                                                    \
+            mem_pools[mem_pool_idx] =                                          \
+                calloc(mem_chunk_size, sizeof(struct uint64_zarray_entry));    \
+          }                                                                    \
+          entry = mem_pools[mem_pool_idx] + mem_pool_loc;                      \
+          mem_pool_loc++;                                                      \
+                                                                               \
+          entry->id = clusterid;                                               \
+          entry->cluster = zarray_create(sizeof(struct pt));                   \
+          entry->next = clustermap[clustermap_bucket];                         \
+          clustermap[clustermap_bucket] = entry;                               \
+        }                                                                      \
+                                                                               \
+        struct pt p = {.x = 2 * x + dx,                                        \
+                       .y = 2 * y + dy,                                        \
+                       .gx = dx * ((int)v1 - v0),                              \
+                       .gy = dy * ((int)v1 - v0)};                             \
+        /*fprintf(stderr, "Adding point %d+%d(%llx) -> (%d, %d)\n",              \
+                min(rep0, rep1), max(rep0, rep1), entry->id, p.x, p.y);       */ \
+        zarray_add(entry->cluster, &p);                                        \
+      }                                                                        \
+    }                                                                          \
+  }
 
             // do 4 connectivity. NB: Arguments must be [-1, 1] or we'll overflow .gx, .gy
             DO_CONN(1, 0);
@@ -1796,6 +1809,7 @@
 
     // make segmentation image.
     if (td->debug) {
+        srandom(0);
         image_u8x3_t *d = image_u8x3_create(w, h);
 
         uint32_t *colors = (uint32_t*) calloc(w*h, sizeof(*colors));
@@ -1804,9 +1818,6 @@
             for (int x = 0; x < w; x++) {
                 uint32_t v = unionfind_get_representative(uf, y*w+x);
 
-                if (unionfind_get_set_size(uf, v) < td->qtp.min_cluster_pixels)
-                    continue;
-
                 uint32_t color = colors[v];
                 uint8_t r = color >> 16,
                     g = color >> 8,
@@ -1820,6 +1831,10 @@
                     colors[v] = (r << 16) | (g << 8) | b;
                 }
 
+                if (unionfind_get_set_size(uf, v) < 1) //td->qtp.min_cluster_pixels)
+                    continue;
+
+
                 d->buf[y*d->stride + 3*x + 0] = r;
                 d->buf[y*d->stride + 3*x + 1] = g;
                 d->buf[y*d->stride + 3*x + 2] = b;
diff --git a/third_party/apriltag/common/timeprofile.h b/third_party/apriltag/common/timeprofile.h
index 8016386..40ab9e6 100644
--- a/third_party/apriltag/common/timeprofile.h
+++ b/third_party/apriltag/common/timeprofile.h
@@ -97,7 +97,7 @@
 
         double parttime = (stamp->utime - lastutime)/1000000.0;
 
-        printf("%2d %32s %15f ms %15f ms\n", i, stamp->name, parttime*1000, cumtime*1000);
+        fprintf(stderr, "%2d %32s %15f ms %15f ms\n", i, stamp->name, parttime*1000, cumtime*1000);
 
         lastutime = stamp->utime;
     }
diff --git a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
index b80b5ec..7b6265f 100755
--- a/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
+++ b/third_party/bazel-toolchain/bazel_tools_changes/tools/cpp/unix_cc_toolchain_config.bzl
@@ -1141,16 +1141,13 @@
     )
 
     cuda_flags = [
-        "--cuda-gpu-arch=sm_75",
-        "--cuda-gpu-arch=sm_80",
-        "--cuda-gpu-arch=sm_86",
-        "--cuda-gpu-arch=sm_87",
         "-x",
         "cuda",
     ]
 
     if ctx.attr.cpu == "aarch64":
         cuda_flags += [
+            "--cuda-gpu-arch=sm_87",
             "--cuda-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/",
             "--ptxas-path=external/arm64_debian_sysroot/usr/local/cuda-11.8/bin/ptxas",
             "-D__CUDACC_VER_MAJOR__=11",
@@ -1159,6 +1156,8 @@
         pass
     elif ctx.attr.cpu == "k8":
         cuda_flags += [
+            "--cuda-gpu-arch=sm_75",
+            "--cuda-gpu-arch=sm_86",
             "--cuda-path=external/amd64_debian_sysroot/usr/lib/cuda/",
             "--ptxas-path=external/amd64_debian_sysroot/usr/bin/ptxas",
             "-D__CUDACC_VER_MAJOR__=11",