Actually detect apriltags
This does the line fits along the blob borders, finds up to the 10 best
candidate corners, and picks the best quad from them. It then does the
final filtering to pick which quads to evaluate, evaluates them, and
uses aprilrobotics to refine and report out the solution.
There's a small (1e-3) discrepancy on some of the tags still to track
down, but overall it works pretty well. It takes 4ms to process imx296
images.
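The GPU pipeline is roughly: threshold -> union-find labeling -> blob
boundary diff -> compact -> sort -> blob extents -> filter -> sort by
angle -> line fit moment prefix sums -> error filter -> peak
compression -> peak sort -> peak extents -> FitQuads, with
UpdateFitQuads, AdjustPixelCenters, and DecodeTags finishing on the
host.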
Change-Id: I6cff5654252897f46def422c021b1dce74036476
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/WORKSPACE b/WORKSPACE
index d76d359..913d969 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1562,6 +1562,20 @@
url = "https://software.frc971.org/Build-Dependencies/2023_arducam_apriltag_test_images.tar.gz",
)
+http_file(
+ name = "orin_image_apriltag",
+ downloaded_file_path = "orin_image_apriltag.bfbs",
+ sha256 = "c86604fd0b1301b301e299b1bba2573af8c586413934a386a2bd28fd9b037b84",
+ url = "https://software.frc971.org/Build-Dependencies/orin_image_apriltag.bfbs",
+)
+
+http_file(
+ name = "orin_large_image_apriltag",
+ downloaded_file_path = "orin_large_gs_apriltag.bfbs",
+ sha256 = "d933adac0d6c205c574791060be73701ead05977ff5dd9f6f4eadb45817c3ccb",
+ url = "https://software.frc971.org/Build-Dependencies/orin_large_gs_apriltag.bfbs",
+)
+
http_archive(
name = "libedgetpu",
build_file = "//third_party:libedgetpu/libedgetpu.BUILD",
diff --git a/frc971/orin/BUILD b/frc971/orin/BUILD
index 61d9b8e..6075a22 100644
--- a/frc971/orin/BUILD
+++ b/frc971/orin/BUILD
@@ -17,11 +17,13 @@
)
cc_library(
- name = "cuda",
+ name = "apriltag",
srcs = [
"apriltag.cc",
+ "apriltag_detect.cc",
"cuda.cc",
"labeling_allegretti_2019_BKE.cc",
+ "line_fit_filter.cc",
"points.cc",
"threshold.cc",
],
@@ -30,6 +32,7 @@
"cuda.h",
"gpu_image.h",
"labeling_allegretti_2019_BKE.h",
+ "line_fit_filter.h",
"points.h",
"threshold.h",
"transform_output_iterator.h",
@@ -42,6 +45,7 @@
features = ["cuda"],
deps = [
"//aos/time",
+ "//third_party:cudart",
"//third_party/apriltag",
"@com_github_google_glog//:glog",
"@com_github_nvidia_cccl//:cccl",
@@ -56,10 +60,12 @@
],
data = [
"@apriltag_test_bfbs_images",
+ "@orin_image_apriltag//file",
+ "@orin_large_image_apriltag//file",
],
features = ["cuda"],
deps = [
- ":cuda",
+ ":apriltag",
":ycbcr",
"//aos:flatbuffer_merge",
"//aos:json_to_flatbuffer",
@@ -67,7 +73,6 @@
"//aos/testing:path",
"//aos/testing:random_seed",
"//frc971/vision:vision_fbs",
- "//third_party:cudart",
"//third_party:opencv",
"//third_party/apriltag",
"//y2023/vision:ThresholdHalide",
@@ -82,10 +87,9 @@
],
features = ["cuda"],
deps = [
- ":cuda",
+ ":apriltag",
"//aos/testing:googletest",
"//aos/testing:random_seed",
- "//third_party:cudart",
],
)
@@ -96,10 +100,9 @@
],
features = ["cuda"],
deps = [
- ":cuda",
+ ":apriltag",
"//aos/testing:googletest",
"//aos/testing:random_seed",
- "//third_party:cudart",
],
)
diff --git a/frc971/orin/apriltag.cc b/frc971/orin/apriltag.cc
index ff8ccec..263543d 100644
--- a/frc971/orin/apriltag.cc
+++ b/frc971/orin/apriltag.cc
@@ -15,6 +15,7 @@
#include <vector>
#include "glog/logging.h"
+#include "third_party/apriltag/common/g2d.h"
#include "aos/time/time.h"
#include "frc971/orin/labeling_allegretti_2019_BKE.h"
@@ -23,6 +24,7 @@
namespace frc971 {
namespace apriltag {
+namespace {
typedef std::chrono::duration<float, std::milli> float_milli;
@@ -98,12 +100,26 @@
return temp_storage_bytes;
}
+// Computes and returns the scratch space needed for
+// DeviceScan::InclusiveScanByKey with key K and value V for the provided
+// number of elements.
+template <typename K, typename V>
+static size_t DeviceScanInclusiveScanByKeyScratchSpace(size_t elements) {
+ size_t temp_storage_bytes = 0;
+ CHECK_CUDA(cub::DeviceScan::InclusiveScanByKey(
+ nullptr, temp_storage_bytes, (K *)(nullptr), (V *)(nullptr),
+ (V *)(nullptr), CustomFirst<V>(), elements));
+ return temp_storage_bytes;
+}
+
+} // namespace
+
GpuDetector::GpuDetector(size_t width, size_t height,
- const apriltag_detector_t *tag_detector)
+ apriltag_detector_t *tag_detector)
: width_(width),
height_(height),
tag_detector_(tag_detector),
color_image_host_(width * height * 2),
+ gray_image_host_(width * height),
color_image_device_(width * height * 2),
gray_image_device_(width * height),
decimated_image_device_(width / 2 * height / 2),
@@ -116,10 +132,17 @@
compressed_union_marker_pair_device_(union_marker_pair_device_.size()),
sorted_union_marker_pair_device_(union_marker_pair_device_.size()),
extents_device_(union_marker_pair_device_.size()),
- reduced_dot_blobs_pair_device_(kMaxBlobs),
selected_extents_device_(kMaxBlobs),
selected_blobs_device_(union_marker_pair_device_.size()),
sorted_selected_blobs_device_(selected_blobs_device_.size()),
+ line_fit_points_device_(selected_blobs_device_.size()),
+ errs_device_(line_fit_points_device_.size()),
+ filtered_errs_device_(line_fit_points_device_.size()),
+ filtered_is_local_peak_device_(line_fit_points_device_.size()),
+ compressed_peaks_device_(line_fit_points_device_.size()),
+ sorted_compressed_peaks_device_(line_fit_points_device_.size()),
+ peak_extents_device_(kMaxBlobs),
+ fit_quads_device_(kMaxBlobs),
radix_sort_tmpstorage_device_(RadixSortScratchSpace<QuadBoundaryPoint>(
sorted_union_marker_pair_device_.size())),
temp_storage_compressed_union_marker_pair_device_(
@@ -138,7 +161,13 @@
num_selected_blobs_device_.get())),
temp_storage_selected_extents_scan_device_(
DeviceScanInclusiveScanScratchSpace<
- cub::KeyValuePair<long, MinMaxExtents>>(kMaxBlobs)) {
+ cub::KeyValuePair<long, MinMaxExtents>>(kMaxBlobs)),
+ temp_storage_line_fit_scan_device_(
+ DeviceScanInclusiveScanByKeyScratchSpace<uint32_t, LineFitPoint>(
+ sorted_selected_blobs_device_.size())) {
+ fit_quads_host_.reserve(kMaxBlobs);
+ quad_corners_host_.reserve(kMaxBlobs);
+
CHECK_EQ(tag_detector_->quad_decimate, 2);
CHECK(!tag_detector_->qtp.deglitch);
@@ -155,9 +184,27 @@
if (min_tag_width_ < 3) {
min_tag_width_ = 3;
}
+
+ poly0_ = g2d_polygon_create_zeros(4);
+ poly1_ = g2d_polygon_create_zeros(4);
+
+ detections_ = zarray_create(sizeof(apriltag_detection_t *));
+ zarray_ensure_capacity(detections_, kMaxBlobs);
}
-GpuDetector::~GpuDetector() {}
+GpuDetector::~GpuDetector() {
+ for (int i = 0; i < zarray_size(detections_); ++i) {
+ apriltag_detection_t *det;
+ zarray_get(detections_, i, &det);
+ apriltag_detection_destroy(det);
+ }
+
+ zarray_destroy(detections_);
+ zarray_destroy(poly1_);
+ zarray_destroy(poly0_);
+}
+
+namespace {
// Computes a massive image of 4x QuadBoundaryPoint per pixel with a
// QuadBoundaryPoint for each pixel pair which crosses a blob boundary.
@@ -306,6 +353,14 @@
}
};
+// Masks out just the blob index of an IndexPoint.
+struct MaskBlobIndex {
+ __host__ __device__ __forceinline__ uint32_t
+ operator()(const IndexPoint &a) const {
+ return a.blob_index();
+ }
+};
+
// Rewrites a QuadBoundaryPoint to an IndexPoint, adding the angle to the
// center.
class RewriteToIndexPoint {
@@ -354,6 +409,11 @@
result.min_x = result.max_x = pt.value.x();
result.starting_offset = pt.key;
result.count = 1;
+ result.pxgx_plus_pygy_sum =
+ static_cast<int64_t>(pt.value.x()) * pt.value.gx() +
+ static_cast<int64_t>(pt.value.y()) * pt.value.gy();
+ result.gx_sum = pt.value.gx();
+ result.gy_sum = pt.value.gy();
return result;
}
};
@@ -372,6 +432,9 @@
result.starting_offset = std::min(a.starting_offset, b.starting_offset);
// We want to count everything.
result.count = a.count + b.count;
+ result.pxgx_plus_pygy_sum = a.pxgx_plus_pygy_sum + b.pxgx_plus_pygy_sum;
+ result.gx_sum = a.gx_sum + b.gx_sum;
+ result.gy_sum = a.gy_sum + b.gy_sum;
return result;
}
};
@@ -426,15 +489,29 @@
size_t max_cluster_pixels_;
};
+class NonzeroBlobs {
+ public:
+ __host__ __device__
+ NonzeroBlobs(const cub::KeyValuePair<long, MinMaxExtents> *extents_device)
+ : extents_device_(extents_device) {}
+
+ __host__ __device__ __forceinline__ bool operator()(
+ const IndexPoint &a) const {
+ return extents_device_[a.blob_index()].value.count > 0;
+ }
+
+ private:
+ const cub::KeyValuePair<long, MinMaxExtents> *extents_device_;
+};
+
// Selects blobs which are big enough, not too big, and have the right color in
// the middle.
class SelectBlobs {
public:
- SelectBlobs(const MinMaxExtents *extents_device, const float *dot_products,
- size_t tag_width, bool reversed_border, bool normal_border,
+ SelectBlobs(const MinMaxExtents *extents_device, size_t tag_width,
+ bool reversed_border, bool normal_border,
size_t min_cluster_pixels, size_t max_cluster_pixels)
: extents_device_(extents_device),
- dot_products_(dot_products),
tag_width_(tag_width),
reversed_border_(reversed_border),
normal_border_(normal_border),
@@ -444,7 +521,7 @@
// Returns true if the blob passes the size and dot product checks and is
// worth further consideration.
__host__ __device__ __forceinline__ bool operator()(
- const size_t blob_index, MinMaxExtents extents) const {
+ MinMaxExtents extents) const {
if (extents.count < min_cluster_pixels_) {
return false;
}
@@ -459,7 +536,7 @@
}
// And the right side must be inside.
- const bool quad_reversed_border = dot_products_[blob_index] < 0;
+ const bool quad_reversed_border = extents.dot() < 0.0;
if (!reversed_border_ && quad_reversed_border) {
return false;
}
@@ -472,13 +549,12 @@
__host__ __device__ __forceinline__ bool operator()(
const IndexPoint &a) const {
- bool result = (*this)(a.blob_index(), extents_device_[a.blob_index()]);
+ bool result = (*this)(extents_device_[a.blob_index()]);
return result;
}
const MinMaxExtents *extents_device_;
- const float *dot_products_;
size_t tag_width_;
bool reversed_border_;
@@ -492,16 +568,15 @@
// update zero sized blobs.
struct TransformZeroFilteredBlobSizes {
public:
- TransformZeroFilteredBlobSizes(const float *dot_products, size_t tag_width,
- bool reversed_border, bool normal_border,
- size_t min_cluster_pixels,
+ TransformZeroFilteredBlobSizes(size_t tag_width, bool reversed_border,
+ bool normal_border, size_t min_cluster_pixels,
size_t max_cluster_pixels)
- : select_blobs_(nullptr, dot_products, tag_width, reversed_border,
- normal_border, min_cluster_pixels, max_cluster_pixels) {}
+ : select_blobs_(nullptr, tag_width, reversed_border, normal_border,
+ min_cluster_pixels, max_cluster_pixels) {}
__host__ __device__ __forceinline__ cub::KeyValuePair<long, MinMaxExtents>
operator()(cub::KeyValuePair<long, MinMaxExtents> pt) const {
- pt.value.count *= select_blobs_(pt.key, pt.value);
+ pt.value.count *= select_blobs_(pt.value);
pt.value.starting_offset = 0;
return pt;
}
@@ -534,13 +609,112 @@
result.key = a.key;
}
- result.value.starting_offset = b.value.starting_offset + b.value.count;
+ result.value.starting_offset = a.value.starting_offset +
+ b.value.starting_offset + a.value.count +
+ b.value.count - result.value.count;
+
return result;
}
};
+struct TransformLineFitPoint {
+ __host__ __device__ __forceinline__ LineFitPoint
+ operator()(IndexPoint p) const {
+ LineFitPoint result;
+
+    // p.x() and p.y() are at 2x the decimated resolution. Add delta to
+    // adjust for the pixel center bias, but keep the 2x fixed-point scale
+    // for the moments so they stay integral. ix and iy are the decimated
+    // coordinates used for the gradient lookup.
+ constexpr int delta = 1;
+ int32_t ix2 = p.x() + delta;
+ int32_t iy2 = p.y() + delta;
+ int32_t ix = ix2 / 2;
+ int32_t iy = iy2 / 2;
+
+ int32_t W = 1;
+
+ if (ix > 0 && ix + 1 < decimated_width && iy > 0 &&
+ iy + 1 < decimated_height) {
+ int32_t grad_x = decimated_image_device_[iy * decimated_width + ix + 1] -
+ decimated_image_device_[iy * decimated_width + ix - 1];
+
+ int32_t grad_y =
+ decimated_image_device_[(iy + 1) * decimated_width + ix] -
+ decimated_image_device_[(iy - 1) * decimated_width + ix];
+
+ // XXX Tunable. How to shape the gradient magnitude?
+ W = hypotf(grad_x, grad_y) + 1;
+ }
+
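+    // The moments inherit the 2x scale: Mx and My are doubled, and Mxx,
+    // Mxy, and Myy are quadrupled relative to decimated pixel units. The
+    // host side divides the scale back out when it consumes the moments.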
+ result.Mx = W * ix2;
+ result.My = W * iy2;
+ result.Mxx = W * ix2 * ix2;
+ result.Mxy = W * ix2 * iy2;
+ result.Myy = W * iy2 * iy2;
+ result.W = W;
+ result.blob_index = p.blob_index();
+ return result;
+ }
+
+ const uint8_t *decimated_image_device_;
+ int decimated_width;
+ int decimated_height;
+};
+
+struct SumLineFitPoints {
+ __host__ __device__ __forceinline__ LineFitPoint
+ operator()(const LineFitPoint &a, const LineFitPoint &b) const {
+ LineFitPoint result;
+ result.Mx = a.Mx + b.Mx;
+ result.My = a.My + b.My;
+ result.Mxx = a.Mxx + b.Mxx;
+ result.Mxy = a.Mxy + b.Mxy;
+ result.Myy = a.Myy + b.Myy;
+ result.W = a.W + b.W;
+ result.blob_index = a.blob_index;
+ return result;
+ }
+};
+
+struct ValidPeaks {
+ __host__ __device__ __forceinline__ bool operator()(const Peak &a) const {
+ return a.blob_index != Peak::kNoPeak();
+ }
+};
+
+struct TransformToPeakExtents {
+ __host__ __device__ __forceinline__ PeakExtents
+ operator()(const cub::KeyValuePair<long, Peak> &a) const {
+ PeakExtents result;
+ result.blob_index = a.value.blob_index;
+ result.starting_offset = a.key;
+ result.count = 1;
+ return result;
+ }
+};
+
+struct MaskPeakExtentsByBlobId {
+ __host__ __device__ __forceinline__ uint32_t operator()(const Peak &a) const {
+ return a.blob_index;
+ }
+};
+
+struct MergePeakExtents {
+ __host__ __device__ __forceinline__ PeakExtents
+ operator()(const PeakExtents &a, const PeakExtents &b) const {
+ PeakExtents result;
+ result.blob_index = a.blob_index;
+ result.starting_offset = std::min(a.starting_offset, b.starting_offset);
+ result.count = a.count + b.count;
+ return result;
+ }
+};
+
+} // namespace
+
void GpuDetector::Detect(const uint8_t *image) {
color_image_host_.MemcpyFrom(image);
+ const aos::monotonic_clock::time_point start_time =
+ aos::monotonic_clock::now();
start_.Record(&stream_);
color_image_device_.MemcpyAsyncFrom(&color_image_host_, &stream_);
after_image_memcpy_to_device_.Record(&stream_);
@@ -553,6 +727,10 @@
height_, tag_detector_->qtp.min_white_black_diff, &stream_);
after_threshold_.Record(&stream_);
+ gray_image_device_.MemcpyAsyncTo(&gray_image_host_, &stream_);
+
+ after_memcpy_gray_.Record(&stream_);
+
union_markers_size_device_.MemsetAsync(0u, &stream_);
after_memset_.Record(&stream_);
@@ -589,7 +767,7 @@
thresholded_image_device_.get(), union_markers_device_.get(),
union_markers_size_device_.get(), union_marker_pair_device_.get(),
decimated_width, decimated_height);
- MaybeCheckAndSynchronize();
+ MaybeCheckAndSynchronize("BlobDiff");
}
// TODO(austin): Can I do the first step of the zero removal in the BlobDiff
@@ -610,7 +788,7 @@
num_compressed_union_marker_pair_device_.get(),
union_marker_pair_device_.size(), nz, stream_.get()));
- MaybeCheckAndSynchronize();
+ MaybeCheckAndSynchronize("cub::DeviceSelect::If");
}
after_compact_.Record(&stream_);
@@ -633,14 +811,14 @@
QuadBoundaryPoint::kRepEndBit, QuadBoundaryPoint::kBitsInKey,
stream_.get()));
- MaybeCheckAndSynchronize();
+ MaybeCheckAndSynchronize("cub::DeviceRadixSort::SortKeys");
}
after_sort_.Record(&stream_);
size_t num_quads_host = 0;
{
- // Our next step is to compute the extents of each blob so we can filter
+ // Our next step is to compute the extents and dot product so we can filter
// blobs.
cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
sorted_union_marker_pair_device_.get());
@@ -672,7 +850,6 @@
num_quads_device_.MemcpyTo(&num_quads_host);
}
- CHECK_LE(num_quads_host, reduced_dot_blobs_pair_device_.size());
// Longest april tag will be the full perimeter of the image. Each point
// results in 2 neighbor points, 1 straight, and one at 45 degrees. But, we
@@ -684,46 +861,6 @@
const size_t max_april_tag_perimeter = 2 * (width_ + height_);
{
- // Now that we have the extents, compute the dot product used to detect
- // which blobs are inside out or right side out.
-
- // Don't care about these quantities since the previous ReduceByKey will
- // ahve computed them too.
- cub::DiscardOutputIterator<size_t> length_discard_iterator;
- cub::DiscardOutputIterator<uint64_t> key_discard_iterator;
-
- size_t dot_reduce_temp_storage_bytes =
- temp_storage_dot_product_device_.size();
-
- // Provide a mask to detect keys by rep01()
- MaskRep01 mask;
- cub::TransformInputIterator<uint64_t, MaskRep01, QuadBoundaryPoint *>
- key_iterator(sorted_union_marker_pair_device_.get(), mask);
-
- // Create a transform iterator which calculates the dot product for each
- // point individually.
- cub::ArgIndexInputIterator<QuadBoundaryPoint *> value_index_input_iterator(
- sorted_union_marker_pair_device_.get());
- ComputeDotProductTransform dot_operator(
- extents_device_.get(), num_quads_host, min_tag_width_,
- tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
- cub::TransformInputIterator<float, ComputeDotProductTransform,
- cub::ArgIndexInputIterator<QuadBoundaryPoint *>>
- value_iterator(value_index_input_iterator, dot_operator);
-
- // And now sum per key.
- CHECK_CUDA(cub::DeviceReduce::ReduceByKey(
- temp_storage_dot_product_device_.get(), dot_reduce_temp_storage_bytes,
- key_iterator, key_discard_iterator, value_iterator,
- reduced_dot_blobs_pair_device_.get(), length_discard_iterator,
- cub::Sum(), num_compressed_union_marker_pair_host, stream_.get()));
-
- MaybeCheckAndSynchronize();
- }
-
- after_dot_.Record(&stream_);
-
- {
// Now that we have the dot products, we need to rewrite the extents for the
// post-thresholded world so we can find the start and end address of blobs
// for fitting lines.
@@ -733,9 +870,8 @@
cub::ArgIndexInputIterator<MinMaxExtents *> value_index_input_iterator(
extents_device_.get());
TransformZeroFilteredBlobSizes rewrite(
- reduced_dot_blobs_pair_device_.get(), min_tag_width_, reversed_border_,
- normal_border_, tag_detector_->qtp.min_cluster_pixels,
- max_april_tag_perimeter);
+ min_tag_width_, reversed_border_, normal_border_,
+ tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
cub::TransformInputIterator<cub::KeyValuePair<long, MinMaxExtents>,
TransformZeroFilteredBlobSizes,
cub::ArgIndexInputIterator<MinMaxExtents *>>
@@ -755,7 +891,7 @@
input_iterator, selected_extents_device_.get(), sum_points,
num_quads_host));
- MaybeCheckAndSynchronize();
+ MaybeCheckAndSynchronize("cub::DeviceScan::InclusiveScan");
}
after_transform_extents_.Record(&stream_);
@@ -776,10 +912,7 @@
TransformOutputIterator<IndexPoint, IndexPoint, AddThetaToIndexPoint>
output_iterator(selected_blobs_device_.get(), add_theta);
- SelectBlobs select_blobs(
- extents_device_.get(), reduced_dot_blobs_pair_device_.get(),
- min_tag_width_, reversed_border_, normal_border_,
- tag_detector_->qtp.min_cluster_pixels, max_april_tag_perimeter);
+ NonzeroBlobs select_blobs(selected_extents_device_.get());
size_t temp_storage_bytes =
temp_storage_compressed_filtered_blobs_device_.size();
@@ -789,13 +922,15 @@
temp_storage_bytes, input_iterator, output_iterator,
num_selected_blobs_device_.get(), num_compressed_union_marker_pair_host,
select_blobs, stream_.get()));
- MaybeCheckAndSynchronize();
- num_selected_blobs_device_.MemcpyTo(&num_selected_blobs_host);
+ MaybeCheckAndSynchronize("cub::DeviceSelect::If");
+
+ num_selected_blobs_device_.MemcpyAsyncTo(&num_selected_blobs_host,
+ &stream_);
+ after_filter_.Record(&stream_);
+ after_filter_.Synchronize();
}
- after_filter_.Record(&stream_);
-
{
// Sort based on the angle.
size_t temp_storage_bytes = radix_sort_tmpstorage_device_.size();
@@ -807,53 +942,213 @@
num_selected_blobs_host, decomposer, IndexPoint::kRepEndBit,
IndexPoint::kBitsInKey, stream_.get()));
- MaybeCheckAndSynchronize();
+ MaybeCheckAndSynchronize("cub::DeviceRadixSort::SortKeys");
}
after_filtered_sort_.Record(&stream_);
- // Report out how long things took.
- LOG(INFO) << "Found " << num_compressed_union_marker_pair_host << " items";
- LOG(INFO) << "Selected " << num_selected_blobs_host << " blobs";
+ {
+    // Now that the points are sorted by blob index and angle, compute the
+    // line fit moments for each point and prefix-sum them per blob with an
+    // inclusive scan by key. The prefix sums let us evaluate the moments of
+    // any contiguous run of points within a blob in O(1).
+ TransformLineFitPoint rewrite(decimated_image_device_.get(), width_ / 2,
+ height_ / 2);
+ cub::TransformInputIterator<LineFitPoint, TransformLineFitPoint,
+ IndexPoint *>
+ input_iterator(sorted_selected_blobs_device_.get(), rewrite);
+ MaskBlobIndex mask;
+ cub::TransformInputIterator<uint32_t, MaskBlobIndex, IndexPoint *>
+ key_iterator(sorted_selected_blobs_device_.get(), mask);
+
+    // Accumulate the moments of all the points before us within each blob.
+ SumLineFitPoints sum_points;
+
+    // Scratch space for the scan.
+ size_t temp_storage_bytes = temp_storage_line_fit_scan_device_.size();
+
+ CHECK_CUDA(cub::DeviceScan::InclusiveScanByKey(
+ temp_storage_line_fit_scan_device_.get(), temp_storage_bytes,
+ key_iterator, input_iterator, line_fit_points_device_.get(), sum_points,
+ num_selected_blobs_host));
+
+ MaybeCheckAndSynchronize("cub::DeviceScan::InclusiveScanByKey");
+ }
+ after_line_fit_.Record(&stream_);
+
+ {
+ FitLines(line_fit_points_device_.get(), num_selected_blobs_host,
+ selected_extents_device_.get(), num_quads_host, errs_device_.get(),
+ filtered_errs_device_.get(), filtered_is_local_peak_device_.get(),
+ &stream_);
+ }
+ after_line_filter_.Record(&stream_);
+
+ int num_compressed_peaks_host;
+ {
+    // Drop the non-peak entries before sorting to speed things up.
+ size_t temp_storage_bytes =
+ temp_storage_compressed_union_marker_pair_device_.size();
+ ValidPeaks peak_filter;
+ CHECK_CUDA(cub::DeviceSelect::If(
+ temp_storage_compressed_union_marker_pair_device_.get(),
+ temp_storage_bytes, filtered_is_local_peak_device_.get(),
+ compressed_peaks_device_.get(), num_compressed_peaks_device_.get(),
+ num_selected_blobs_host, peak_filter, stream_.get()));
+
+ after_peak_compression_.Record(&stream_);
+ MaybeCheckAndSynchronize("cub::DeviceSelect::If");
+ num_compressed_peaks_device_.MemcpyAsyncTo(&num_compressed_peaks_host,
+ &stream_);
+ after_peak_count_memcpy_.Record(&stream_);
+ after_peak_count_memcpy_.Synchronize();
+ }
+
+ {
+ // Sort based on the angle.
+ size_t temp_storage_bytes = radix_sort_tmpstorage_device_.size();
+ PeakDecomposer decomposer;
+
+ CHECK_CUDA(cub::DeviceRadixSort::SortKeys(
+ radix_sort_tmpstorage_device_.get(), temp_storage_bytes,
+ compressed_peaks_device_.get(), sorted_compressed_peaks_device_.get(),
+ num_compressed_peaks_host, decomposer, 0, PeakDecomposer::kBitsInKey,
+ stream_.get()));
+
+ MaybeCheckAndSynchronize("cub::DeviceRadixSort::SortKeys");
+ }
+
+ after_peak_sort_.Record(&stream_);
+
+ int num_quad_peaked_quads_host;
+  // Now that we have the peaks sorted, compute the extents of each blob's
+  // run of peaks so we can count them and easily pick out the best (up to
+  // max_nmaxima) peaks for quad fitting.
+  {
+ cub::ArgIndexInputIterator<Peak *> value_index_input_iterator(
+ sorted_compressed_peaks_device_.get());
+ TransformToPeakExtents transform_extents;
+ cub::TransformInputIterator<PeakExtents, TransformToPeakExtents,
+ cub::ArgIndexInputIterator<Peak *>>
+ value_input_iterator(value_index_input_iterator, transform_extents);
+
+ // Don't care about the output keys...
+ cub::DiscardOutputIterator<uint32_t> key_discard_iterator;
+
+    // Key each peak by its blob id.
+ MaskPeakExtentsByBlobId mask;
+ cub::TransformInputIterator<uint32_t, MaskPeakExtentsByBlobId, Peak *>
+ key_input_iterator(sorted_compressed_peaks_device_.get(), mask);
+
+ // Reduction operator.
+ MergePeakExtents reduce;
+
+ size_t temp_storage_bytes =
+ temp_storage_bounds_reduce_by_key_device_.size();
+    CHECK_CUDA(cub::DeviceReduce::ReduceByKey(
+        temp_storage_bounds_reduce_by_key_device_.get(), temp_storage_bytes,
+        key_input_iterator, key_discard_iterator, value_input_iterator,
+        peak_extents_device_.get(), num_quad_peaked_quads_device_.get(),
+        reduce, num_compressed_peaks_host, stream_.get()));
+ MaybeCheckAndSynchronize("cub::DeviceReduce::ReduceByKey");
+
+ after_filtered_peak_reduce_.Record(&stream_);
+
+ num_quad_peaked_quads_device_.MemcpyAsyncTo(&num_quad_peaked_quads_host,
+ &stream_);
+ MaybeCheckAndSynchronize("num_quad_peaked_quads_device_.MemcpyTo");
+ after_filtered_peak_host_memcpy_.Record(&stream_);
+ after_filtered_peak_host_memcpy_.Synchronize();
+ }
+
+ {
+ apriltag::FitQuads(
+ sorted_compressed_peaks_device_.get(), num_compressed_peaks_host,
+ peak_extents_device_.get(), num_quad_peaked_quads_host,
+ line_fit_points_device_.get(), tag_detector_->qtp.max_nmaxima,
+ selected_extents_device_.get(), tag_detector_->qtp.max_line_fit_mse,
+ tag_detector_->qtp.cos_critical_rad, fit_quads_device_.get(), &stream_);
+ MaybeCheckAndSynchronize("FitQuads");
+ }
+ after_quad_fit_.Record(&stream_);
+
+ {
+ fit_quads_host_.resize(num_quad_peaked_quads_host);
+ fit_quads_device_.MemcpyAsyncTo(fit_quads_host_.data(),
+ num_quad_peaked_quads_host, &stream_);
+ after_quad_fit_memcpy_.Record(&stream_);
+ after_quad_fit_memcpy_.Synchronize();
+ }
+
+ const aos::monotonic_clock::time_point before_fit_quads =
+ aos::monotonic_clock::now();
+ UpdateFitQuads();
+ AdjustPixelCenters();
+
+ DecodeTags();
+
+ const aos::monotonic_clock::time_point end_time = aos::monotonic_clock::now();
+
+ // TODO(austin): Bring it back to the CPU and see how good we did.
+
+ // Report out how long things took.
+
+ VLOG(1) << "Found " << num_compressed_union_marker_pair_host << " items";
+ VLOG(1) << "Selected " << num_selected_blobs_host << " right side out points";
+ VLOG(1) << "Found compressed runs: " << num_quads_host;
+ VLOG(1) << "Peaks " << num_compressed_peaks_host << " peaks";
+ VLOG(1) << "Peak Selected blobs " << num_quad_peaked_quads_host << " quads";
CudaEvent *previous_event = &start_;
for (auto name_event : std::vector<std::tuple<std::string_view, CudaEvent &>>{
{"Memcpy", after_image_memcpy_to_device_},
{"Threshold", after_threshold_},
+ {"Memcpy Gray", after_memcpy_gray_},
{"Memset", after_memset_},
{"Unionfinding", after_unionfinding_},
{"Diff", after_diff_},
{"Compact", after_compact_},
{"Sort", after_sort_},
{"Bounds", after_bounds_},
- {"Dot product", after_dot_},
{"Transform Extents", after_transform_extents_},
{"Filter by dot product", after_filter_},
{"Filtered sort", after_filtered_sort_},
+ {"Line Fit", after_line_fit_},
+ {"Error Filter", after_line_filter_},
+ {"Compress Peaks", after_peak_compression_},
+ {"Memcpy Peaks", after_peak_count_memcpy_},
+ {"Sort Peaks", after_peak_sort_},
+ {"Peak Extents", after_filtered_peak_reduce_},
+ {"Memcpy num Extents", after_filtered_peak_host_memcpy_},
+ {"FitQuads", after_quad_fit_},
}) {
std::get<1>(name_event).Synchronize();
- LOG(INFO) << std::get<0>(name_event) << " "
- << float_milli(
- std::get<1>(name_event).ElapsedTime(*previous_event))
- .count()
- << "ms";
+ VLOG(1) << " " << std::get<0>(name_event) << " "
+ << float_milli(std::get<1>(name_event).ElapsedTime(*previous_event))
+ .count()
+ << "ms";
previous_event = &std::get<1>(name_event);
}
+ VLOG(1) << " FitQuads " << float_milli(end_time - before_fit_quads).count()
+ << "ms on host";
- LOG(INFO) << "Overall "
- << float_milli(previous_event->ElapsedTime(start_)).count() << "ms";
-
+ VLOG(1) << "Overall "
+ << float_milli(previous_event->ElapsedTime(start_)).count() << "ms, "
+ << float_milli(end_time - start_time).count() << "ms on host";
// Average. Skip the first one as the kernel is warming up and is slower.
if (!first_) {
++execution_count_;
execution_duration_ += previous_event->ElapsedTime(start_);
- LOG(INFO) << "Average overall "
- << float_milli(execution_duration_ / execution_count_).count()
- << "ms";
+ VLOG(1) << "Average overall "
+ << float_milli(execution_duration_ / execution_count_).count()
+ << "ms";
}
- LOG(INFO) << "Found compressed runs: " << num_quads_host;
-
first_ = false;
}
diff --git a/frc971/orin/apriltag.h b/frc971/orin/apriltag.h
index 068c0c7..f87e107 100644
--- a/frc971/orin/apriltag.h
+++ b/frc971/orin/apriltag.h
@@ -9,37 +9,12 @@
#include "device_launch_parameters.h"
#include "frc971/orin/cuda.h"
#include "frc971/orin/gpu_image.h"
+#include "frc971/orin/line_fit_filter.h"
#include "frc971/orin/points.h"
namespace frc971 {
namespace apriltag {
-// Class to hold the extents of a blob of points.
-struct MinMaxExtents {
- // Min and max coordinates (in non-decimated coordinates)
- uint16_t min_x;
- uint16_t min_y;
- uint16_t max_x;
- uint16_t max_y;
-
- // The starting offset of this blob of points in the vector holding all the
- // points.
- uint32_t starting_offset;
-
- // The number of points in the blob.
- uint32_t count;
-
- // Center location of the blob using the aprilrobotics algorithm.
- __host__ __device__ float cx() const {
- return (min_x + max_x) * 0.5f + 0.05118;
- }
- __host__ __device__ float cy() const {
- return (min_y + max_y) * 0.5f + -0.028581;
- }
-};
-
-static_assert(sizeof(MinMaxExtents) == 16, "MinMaxExtents didn't pack right.");
-
// Class to find the blob index of a point in a point vector.
class BlobExtentsIndexFinder {
public:
@@ -58,7 +33,6 @@
}
size_t average = min + (max - min) / 2;
-
if (average < num_extents_ && extents_device_[average].starting_offset <=
static_cast<size_t>(point_index)) {
min = average;
@@ -80,6 +54,12 @@
// TODO(austin): Cache the last one?
};
+struct QuadCorners {
+ float corners[4][2];
+ bool reversed_border;
+ uint32_t blob_index;
+};
+
// GPU based april tag detector.
class GpuDetector {
public:
@@ -88,35 +68,40 @@
// Constructs a detector, reserving space for detecting tags of the provided
// with and height, using the provided detector options.
- GpuDetector(size_t width, size_t height,
- const apriltag_detector_t *tag_detector);
+ GpuDetector(size_t width, size_t height, apriltag_detector_t *tag_detector);
virtual ~GpuDetector();
// Detects april tags in the provided image.
void Detect(const uint8_t *image);
+ const std::vector<QuadCorners> &FitQuads() const;
+
+ const zarray_t *Detections() const { return detections_; }
+
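+  // Typical usage, as a sketch (error handling omitted; the image is the
+  // 2-bytes-per-pixel color image the constructor sized the detector for):
+  //   GpuDetector detector(width, height, tag_detector);
+  //   detector.Detect(image);
+  //   const zarray_t *detections = detector.Detections();
+  // The returned detections stay valid until the next Detect() call.
+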
// Debug methods to expose internal state for testing.
- void CopyGrayTo(uint8_t *output) { gray_image_device_.MemcpyTo(output); }
- void CopyDecimatedTo(uint8_t *output) {
+ void CopyGrayTo(uint8_t *output) const {
+ gray_image_device_.MemcpyTo(output);
+ }
+ void CopyDecimatedTo(uint8_t *output) const {
decimated_image_device_.MemcpyTo(output);
}
- void CopyThresholdedTo(uint8_t *output) {
+ void CopyThresholdedTo(uint8_t *output) const {
thresholded_image_device_.MemcpyTo(output);
}
- void CopyUnionMarkersTo(uint32_t *output) {
+ void CopyUnionMarkersTo(uint32_t *output) const {
union_markers_device_.MemcpyTo(output);
}
- void CopyUnionMarkerPairTo(QuadBoundaryPoint *output) {
+ void CopyUnionMarkerPairTo(QuadBoundaryPoint *output) const {
union_marker_pair_device_.MemcpyTo(output);
}
- void CopyCompressedUnionMarkerPairTo(QuadBoundaryPoint *output) {
+ void CopyCompressedUnionMarkerPairTo(QuadBoundaryPoint *output) const {
compressed_union_marker_pair_device_.MemcpyTo(output);
}
- std::vector<QuadBoundaryPoint> CopySortedUnionMarkerPair() {
+ std::vector<QuadBoundaryPoint> CopySortedUnionMarkerPair() const {
std::vector<QuadBoundaryPoint> result;
int size = NumCompressedUnionMarkerPairs();
result.resize(size);
@@ -124,39 +109,75 @@
return result;
}
- int NumCompressedUnionMarkerPairs() {
+ int NumCompressedUnionMarkerPairs() const {
return num_compressed_union_marker_pair_device_.Copy()[0];
}
- void CopyUnionMarkersSizeTo(uint32_t *output) {
+ void CopyUnionMarkersSizeTo(uint32_t *output) const {
union_markers_size_device_.MemcpyTo(output);
}
int NumQuads() const { return num_quads_device_.Copy()[0]; }
- std::vector<MinMaxExtents> CopyExtents() {
+ std::vector<MinMaxExtents> CopyExtents() const {
return extents_device_.Copy(NumQuads());
}
- std::vector<float> CopyReducedDotBlobs() {
- return reduced_dot_blobs_pair_device_.Copy(NumQuads());
- }
-
- std::vector<cub::KeyValuePair<long, MinMaxExtents>> CopySelectedExtents() {
+ std::vector<cub::KeyValuePair<long, MinMaxExtents>> CopySelectedExtents()
+ const {
return selected_extents_device_.Copy(NumQuads());
}
int NumSelectedPairs() const { return num_selected_blobs_device_.Copy()[0]; }
- std::vector<IndexPoint> CopySelectedBlobs() {
+ std::vector<IndexPoint> CopySelectedBlobs() const {
return selected_blobs_device_.Copy(NumSelectedPairs());
}
- std::vector<IndexPoint> CopySortedSelectedBlobs() {
+ std::vector<IndexPoint> CopySortedSelectedBlobs() const {
return sorted_selected_blobs_device_.Copy(NumSelectedPairs());
}
+ std::vector<LineFitPoint> CopyLineFitPoints() const {
+ return line_fit_points_device_.Copy(NumSelectedPairs());
+ }
+
+ std::vector<double> CopyErrors() const {
+ return errs_device_.Copy(NumSelectedPairs());
+ }
+
+ std::vector<double> CopyFilteredErrors() const {
+ return filtered_errs_device_.Copy(NumSelectedPairs());
+ }
+ std::vector<Peak> CopyPeaks() const {
+ return filtered_is_local_peak_device_.Copy(NumSelectedPairs());
+ }
+
+ int NumCompressedPeaks() const {
+ return num_compressed_peaks_device_.Copy()[0];
+ }
+
+ std::vector<Peak> CopyCompressedPeaks() const {
+ return compressed_peaks_device_.Copy(NumCompressedPeaks());
+ }
+
+ int NumFitQuads() const { return num_quad_peaked_quads_device_.Copy()[0]; }
+
+ std::vector<FitQuad> CopyFitQuads() const {
+ return fit_quads_device_.Copy(NumFitQuads());
+ }
+
+ void AdjustCenter(float corners[4][2]) const;
+
private:
+ void UpdateFitQuads();
+
+ void AdjustPixelCenters();
+
+ void DecodeTags();
+
+ static void QuadDecodeTask(void *_u);
+
// Creates a GPU image wrapped around the provided memory.
template <typename T>
GpuImage<T> ToGpuImage(GpuMemory<T> &memory) {
@@ -184,7 +205,7 @@
const size_t height_;
// Detector parameters.
- const apriltag_detector_t *tag_detector_;
+ apriltag_detector_t *tag_detector_;
// Stream to operate on.
CudaStream stream_;
@@ -193,19 +214,29 @@
CudaEvent start_;
CudaEvent after_image_memcpy_to_device_;
CudaEvent after_threshold_;
+ CudaEvent after_memcpy_gray_;
CudaEvent after_memset_;
CudaEvent after_unionfinding_;
CudaEvent after_diff_;
CudaEvent after_compact_;
CudaEvent after_sort_;
CudaEvent after_bounds_;
- CudaEvent after_dot_;
CudaEvent after_transform_extents_;
CudaEvent after_filter_;
CudaEvent after_filtered_sort_;
+ CudaEvent after_line_fit_;
+ CudaEvent after_line_filter_;
+ CudaEvent after_peak_compression_;
+ CudaEvent after_peak_count_memcpy_;
+ CudaEvent after_peak_sort_;
+ CudaEvent after_filtered_peak_reduce_;
+ CudaEvent after_filtered_peak_host_memcpy_;
+ CudaEvent after_quad_fit_;
+ CudaEvent after_quad_fit_memcpy_;
// TODO(austin): Remove this...
HostMemory<uint8_t> color_image_host_;
+ HostMemory<uint8_t> gray_image_host_;
// Starting color image.
GpuMemory<uint8_t> color_image_device_;
@@ -238,9 +269,6 @@
GpuMemory<size_t> num_quads_device_{/* allocate 1 integer...*/ 1};
// Bounds per blob, one blob per ID.
GpuMemory<MinMaxExtents> extents_device_;
- // Sum of the dot products between the vector from center to each point, with
- // the gradient at each point.
- GpuMemory<float> reduced_dot_blobs_pair_device_;
// Extents of all the blobs under consideration.
GpuMemory<cub::KeyValuePair<long, MinMaxExtents>> selected_extents_device_;
@@ -252,6 +280,24 @@
// Sorted list of those points.
GpuMemory<IndexPoint> sorted_selected_blobs_device_;
+ // TODO(austin): Can we bound this better? This is a lot of memory.
+ GpuMemory<LineFitPoint> line_fit_points_device_;
+
+ GpuMemory<double> errs_device_;
+ GpuMemory<double> filtered_errs_device_;
+ GpuMemory<Peak> filtered_is_local_peak_device_;
+ GpuMemory<int> num_compressed_peaks_device_{/* allocate 1 integer...*/ 1};
+ GpuMemory<Peak> compressed_peaks_device_;
+ GpuMemory<Peak> sorted_compressed_peaks_device_;
+
+ GpuMemory<int> num_quad_peaked_quads_device_{/* allocate 1 integer...*/ 1};
+ GpuMemory<PeakExtents> peak_extents_device_;
+
+ GpuMemory<FitQuad> fit_quads_device_;
+
+ std::vector<FitQuad> fit_quads_host_;
+ std::vector<QuadCorners> quad_corners_host_;
+
// Temporary storage for each of the steps.
// TODO(austin): Can we combine these and just use the max?
GpuMemory<uint32_t> radix_sort_tmpstorage_device_;
@@ -260,6 +306,7 @@
GpuMemory<uint8_t> temp_storage_dot_product_device_;
GpuMemory<uint8_t> temp_storage_compressed_filtered_blobs_device_;
GpuMemory<uint8_t> temp_storage_selected_extents_scan_device_;
+ GpuMemory<uint8_t> temp_storage_line_fit_scan_device_;
// Cumulative duration of april tag detection.
std::chrono::nanoseconds execution_duration_{0};
@@ -272,6 +319,11 @@
bool normal_border_ = false;
bool reversed_border_ = false;
int min_tag_width_ = 1000000;
+
+ zarray_t *poly0_;
+ zarray_t *poly1_;
+
+ zarray_t *detections_ = nullptr;
};
} // namespace apriltag
diff --git a/frc971/orin/apriltag_detect.cc b/frc971/orin/apriltag_detect.cc
new file mode 100644
index 0000000..eecc056
--- /dev/null
+++ b/frc971/orin/apriltag_detect.cc
@@ -0,0 +1,523 @@
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cub/device/device_copy.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/device/device_scan.cuh>
+#include <cub/device/device_segmented_sort.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "aos/time/time.h"
+#include "frc971/orin/apriltag.h"
+#include "frc971/orin/labeling_allegretti_2019_BKE.h"
+#include "frc971/orin/threshold.h"
+
+DEFINE_int32(debug_blob_index, 4096, "Blob index to print debug info for.");
+
+extern "C" {
+
+void quad_decode_index(apriltag_detector_t *td, struct quad *quad_original,
+ image_u8_t *im, image_u8_t *im_samples,
+ zarray_t *detections);
+
+void reconcile_detections(zarray_t *detections, zarray_t *poly0,
+ zarray_t *poly1);
+};
+
+namespace frc971 {
+namespace apriltag {
+namespace {
+
+void HostFitLine(LineFitMoments moments, double *lineparam01,
+ double *lineparam23, double *err, double *mse) {
+ int64_t Cxx = moments.Mxx * moments.W - static_cast<int64_t>(moments.Mx) *
+ static_cast<int64_t>(moments.Mx);
+ int64_t Cxy = moments.Mxy * moments.W - static_cast<int64_t>(moments.Mx) *
+ static_cast<int64_t>(moments.My);
+ int64_t Cyy = moments.Myy * moments.W - static_cast<int64_t>(moments.My) *
+ static_cast<int64_t>(moments.My);
+
+ // Pose it as an eigenvalue problem.
+ const float hypot_cached = std::hypotf((Cxx - Cyy), 2 * Cxy);
+ const float eight_w_squared = static_cast<float>(
+ static_cast<int64_t>(moments.W) * static_cast<int64_t>(moments.W) * 8.0);
+ const float eig_small = (Cxx + Cyy - hypot_cached) / eight_w_squared;
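+
+  // Scaling note: the moments are at 2x pixel scale, so Cxx/Cxy/Cyy above
+  // are (2 * W)^2 = 4 * W^2 times the normalized covariance. The smaller
+  // eigenvalue of the normalized covariance is
+  //   ((Cxx + Cyy) - hypot(Cxx - Cyy, 2 * Cxy)) / 2,
+  // which is where the 8 * W^2 divisor comes from.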
+
+ if (lineparam01) {
+ lineparam01[0] =
+ static_cast<float>(moments.Mx) / static_cast<float>(moments.W * 2);
+ lineparam01[1] =
+ static_cast<float>(moments.My) / static_cast<float>(moments.W * 2);
+ }
+ if (lineparam23) {
+ // These don't match the originals at all, but the math should come out
+ // right. n{xy}{12} end up being multiplied by 8 W^2, and we compare the
+ // square but use hypot on nx, ny directly. (and let the W^2 term come out
+ // as common to both the hypot and nxy terms.)
+ const float nx1 = static_cast<float>(Cxx - Cyy) - hypot_cached;
+ const float ny1 = Cxy * 2;
+ const float M1 = nx1 * nx1 + ny1 * ny1;
+ const float nx2 = Cxy * 2;
+ const float ny2 = static_cast<float>(Cyy - Cxx) - hypot_cached;
+ const float M2 = nx2 * nx2 + ny2 * ny2;
+
+ float nx, ny;
+ if (M1 > M2) {
+ nx = nx1;
+ ny = ny1;
+ } else {
+ nx = nx2;
+ ny = ny2;
+ }
+
+ float length = std::hypotf(nx, ny);
+ lineparam23[0] = nx / length;
+ lineparam23[1] = ny / length;
+ }
+
+ // sum of squared errors
+ *err = moments.N * eig_small;
+
+ // mean squared error
+ *mse = eig_small;
+}
+
+} // namespace
+
+const std::vector<QuadCorners> &GpuDetector::FitQuads() const {
+ return quad_corners_host_;
+}
+
+void GpuDetector::UpdateFitQuads() {
+ quad_corners_host_.resize(0);
+ VLOG(1) << "Considering " << fit_quads_host_.size();
+ for (const FitQuad &quad : fit_quads_host_) {
+ bool print = quad.blob_index == FLAGS_debug_blob_index;
+ if (!quad.valid) {
+ continue;
+ }
+ QuadCorners corners;
+ corners.blob_index = quad.blob_index;
+ CHECK(normal_border_ != reversed_border_);
+ corners.reversed_border = reversed_border_;
+
+ double lines[4][4];
+ for (int i = 0; i < 4; i++) {
+ double err;
+ double mse;
+ HostFitLine(quad.moments[i], lines[i], lines[i] + 2, &err, &mse);
+ if (print) {
+ LOG(INFO) << "Blob " << corners.blob_index << " mse -> " << mse
+ << " err " << err << " index " << quad.indices[i] << ", "
+ << quad.indices[(i + 1) % 4];
+ LOG(INFO) << " " << quad.moments[i];
+ }
+ }
+
+ bool bad_determinant = false;
+ for (int i = 0; i < 4; i++) {
+ // solve for the intersection of lines (i) and (i+1)&3.
+ // p0 + lambda0*u0 = p1 + lambda1*u1, where u0 and u1
+ // are the line directions.
+ //
+ // lambda0*u0 - lambda1*u1 = (p1 - p0)
+ //
+ // rearrange (solve for lambdas)
+ //
+ // [u0_x -u1_x ] [lambda0] = [ p1_x - p0_x ]
+ // [u0_y -u1_y ] [lambda1] [ p1_y - p0_y ]
+ //
+ // remember that lines[i][0,1] = p, lines[i][2,3] = NORMAL vector.
+ // We want the unit vector, so we need the perpendiculars. Thus, below
+ // we have swapped the x and y components and flipped the y components.
+
+ double A00 = lines[i][3], A01 = -lines[(i + 1) & 3][3];
+ double A10 = -lines[i][2], A11 = lines[(i + 1) & 3][2];
+ double B0 = -lines[i][0] + lines[(i + 1) & 3][0];
+ double B1 = -lines[i][1] + lines[(i + 1) & 3][1];
+
+      double det = A00 * A11 - A10 * A01;
+      if (fabs(det) < 0.001) {
+        bad_determinant = true;
+        break;
+      }
+
+      // inverse.
+      double W00 = A11 / det, W01 = -A01 / det;
+
+ // solve
+ double L0 = W00 * B0 + W01 * B1;
+
+ // compute intersection
+ corners.corners[i][0] = lines[i][0] + L0 * A00;
+ corners.corners[i][1] = lines[i][1] + L0 * A10;
+ if (print) {
+ LOG(INFO) << "Calculated corner[" << i << "] -> ("
+ << std::setprecision(20) << corners.corners[i][0] << ", "
+ << std::setprecision(20) << corners.corners[i][1] << ")";
+ }
+ }
+ if (bad_determinant) {
+ continue;
+ }
+
+ {
+ float area = 0;
+
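+      // Split the quad into two triangles and sum their areas using Heron's
+      // formula: area = sqrt(p * (p - a) * (p - b) * (p - c)), where p is
+      // the semi-perimeter (a + b + c) / 2.
+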
+ // get area of triangle formed by points 0, 1, 2, 0
+ float length[3], p;
+ for (int i = 0; i < 3; i++) {
+ int idxa = i; // 0, 1, 2,
+ int idxb = (i + 1) % 3; // 1, 2, 0
+ length[i] =
+ hypotf((corners.corners[idxb][0] - corners.corners[idxa][0]),
+ (corners.corners[idxb][1] - corners.corners[idxa][1]));
+ }
+ p = (length[0] + length[1] + length[2]) / 2;
+
+ area += sqrtf(p * (p - length[0]) * (p - length[1]) * (p - length[2]));
+
+ // get area of triangle formed by points 2, 3, 0, 2
+ for (int i = 0; i < 3; i++) {
+ int idxs[] = {2, 3, 0, 2};
+ int idxa = idxs[i];
+ int idxb = idxs[i + 1];
+ length[i] =
+ hypotf((corners.corners[idxb][0] - corners.corners[idxa][0]),
+ (corners.corners[idxb][1] - corners.corners[idxa][1]));
+ }
+ p = (length[0] + length[1] + length[2]) / 2;
+
+ area += sqrtf(p * (p - length[0]) * (p - length[1]) * (p - length[2]));
+
+ if (area < 0.95 * min_tag_width_ * min_tag_width_) {
+ if (print) {
+ LOG(INFO) << "Area of " << area << " smaller than "
+ << 0.95 * min_tag_width_ * min_tag_width_;
+ }
+ continue;
+ }
+ }
+
+ // reject quads whose cumulative angle change isn't equal to 2PI
+ {
+ bool reject_corner = false;
+ for (int i = 0; i < 4; i++) {
+ int i0 = i, i1 = (i + 1) & 3, i2 = (i + 2) & 3;
+
+ float dx1 = corners.corners[i1][0] - corners.corners[i0][0];
+ float dy1 = corners.corners[i1][1] - corners.corners[i0][1];
+ float dx2 = corners.corners[i2][0] - corners.corners[i1][0];
+ float dy2 = corners.corners[i2][1] - corners.corners[i1][1];
+ float cos_dtheta =
+ (dx1 * dx2 + dy1 * dy2) /
+ sqrtf((dx1 * dx1 + dy1 * dy1) * (dx2 * dx2 + dy2 * dy2));
+
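+        // cos_dtheta is the cosine of the angle between consecutive edges
+        // (dot product over the product of edge lengths). The
+        // dx1 * dy2 < dy1 * dx2 check below rejects corners with the wrong
+        // winding order.
+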
+ if (print) {
+ LOG(INFO) << "Cosdtheta -> for " << i0 << " " << i1 << " " << i2
+ << " -> " << cos_dtheta << " threshold "
+ << tag_detector_->qtp.cos_critical_rad;
+ }
+
+ if (std::abs(cos_dtheta) > tag_detector_->qtp.cos_critical_rad ||
+ dx1 * dy2 < dy1 * dx2) {
+ reject_corner = true;
+ break;
+ }
+ }
+ if (reject_corner) {
+ continue;
+ }
+ }
+ quad_corners_host_.push_back(corners);
+ }
+}
+
+void GpuDetector::AdjustCenter(float corners[4][2]) const {
+ const float quad_decimate = tag_detector_->quad_decimate;
+ if (tag_detector_->quad_decimate > 1) {
+ if (tag_detector_->quad_decimate == 1.5) {
+ for (int j = 0; j < 4; j++) {
+ corners[j][0] *= quad_decimate;
+ corners[j][1] *= quad_decimate;
+ }
+ } else {
+ for (int j = 0; j < 4; j++) {
+ corners[j][0] = (corners[j][0] - 0.5f) * quad_decimate + 0.5f;
+ corners[j][1] = (corners[j][1] - 0.5f) * quad_decimate + 0.5f;
+ }
+ }
+ }
+}
+
+void GpuDetector::AdjustPixelCenters() {
+ const float quad_decimate = tag_detector_->quad_decimate;
+
+ if (quad_decimate > 1) {
+ if (tag_detector_->quad_decimate == 1.5) {
+ for (QuadCorners &quad : quad_corners_host_) {
+ for (int j = 0; j < 4; j++) {
+ quad.corners[j][0] *= quad_decimate;
+ quad.corners[j][1] *= quad_decimate;
+ }
+ }
+ } else {
+ for (QuadCorners &quad : quad_corners_host_) {
+ for (int j = 0; j < 4; j++) {
+ quad.corners[j][0] =
+ (quad.corners[j][0] - 0.5f) * quad_decimate + 0.5f;
+ quad.corners[j][1] =
+ (quad.corners[j][1] - 0.5f) * quad_decimate + 0.5f;
+ }
+ }
+ }
+ }
+}
+
+static inline int detection_compare_function(const void *_a, const void *_b) {
+ apriltag_detection_t *a = *(apriltag_detection_t **)_a;
+ apriltag_detection_t *b = *(apriltag_detection_t **)_b;
+ return a->id - b->id;
+}
+
+struct QuadDecodeTaskStruct {
+ int i0, i1;
+ QuadCorners *quads;
+ apriltag_detector_t *td;
+
+ image_u8_t *im;
+ zarray_t *detections;
+
+ image_u8_t *im_samples;
+};
+
+// Mostly stolen from aprilrobotics, but modified to implement the dewarp.
+void RefineEdges(apriltag_detector_t *td, image_u8_t *im_orig,
+ struct quad *quad) {
+ double lines[4][4]; // for each line, [Ex Ey nx ny]
+
+ for (int edge = 0; edge < 4; edge++) {
+ int a = edge, b = (edge + 1) & 3; // indices of the end points.
+
+ // compute the normal to the current line estimate
+ float nx = quad->p[b][1] - quad->p[a][1];
+ float ny = -quad->p[b][0] + quad->p[a][0];
+ float mag = sqrtf(nx * nx + ny * ny);
+ nx /= mag;
+ ny /= mag;
+
+ if (quad->reversed_border) {
+ nx = -nx;
+ ny = -ny;
+ }
+
+ // we will now fit a NEW line by sampling points near
+ // our original line that have large gradients. On really big tags,
+ // we're willing to sample more to get an even better estimate.
+ int nsamples = std::max<int>(16, mag / 8); // XXX tunable
+
+ // stats for fitting a line...
+ double Mx = 0, My = 0, Mxx = 0, Mxy = 0, Myy = 0, N = 0;
+
+ for (int s = 0; s < nsamples; s++) {
+ // compute a point along the line... Note, we're avoiding
+ // sampling *right* at the corners, since those points are
+ // the least reliable.
+ double alpha = (1.0 + s) / (nsamples + 1);
+ double x0 = alpha * quad->p[a][0] + (1 - alpha) * quad->p[b][0];
+ double y0 = alpha * quad->p[a][1] + (1 - alpha) * quad->p[b][1];
+
+ // search along the normal to this line, looking at the
+ // gradients along the way. We're looking for a strong
+ // response.
+ double Mn = 0;
+ double Mcount = 0;
+
+ // XXX tunable: how far to search? We want to search far
+ // enough that we find the best edge, but not so far that
+ // we hit other edges that aren't part of the tag. We
+ // shouldn't ever have to search more than quad_decimate,
+ // since otherwise we would (ideally) have started our
+ // search on another pixel in the first place. Likewise,
+ // for very small tags, we don't want the range to be too
+ // big.
+ double range = td->quad_decimate + 1;
+
+ // XXX tunable step size.
+ for (double n = -range; n <= range; n += 0.25) {
+ // Because of the guaranteed winding order of the
+ // points in the quad, we will start inside the white
+ // portion of the quad and work our way outward.
+ //
+ // sample to points (x1,y1) and (x2,y2) XXX tunable:
+ // how far +/- to look? Small values compute the
+ // gradient more precisely, but are more sensitive to
+ // noise.
+ double grange = 1;
+ int x1 = x0 + (n + grange) * nx;
+ int y1 = y0 + (n + grange) * ny;
+ if (x1 < 0 || x1 >= im_orig->width || y1 < 0 || y1 >= im_orig->height)
+ continue;
+
+ int x2 = x0 + (n - grange) * nx;
+ int y2 = y0 + (n - grange) * ny;
+ if (x2 < 0 || x2 >= im_orig->width || y2 < 0 || y2 >= im_orig->height)
+ continue;
+
+ int g1 = im_orig->buf[y1 * im_orig->stride + x1];
+ int g2 = im_orig->buf[y2 * im_orig->stride + x2];
+
+ if (g1 < g2) // reject points whose gradient is "backwards". They can
+ // only hurt us.
+ continue;
+
+ double weight =
+ (g2 - g1) *
+ (g2 - g1); // XXX tunable. What shape for weight=f(g2-g1)?
+
+ // compute weighted average of the gradient at this point.
+ Mn += weight * n;
+ Mcount += weight;
+ }
+
+ // what was the average point along the line?
+ if (Mcount == 0) continue;
+
+ double n0 = Mn / Mcount;
+
+ // where is the point along the line?
+ double bestx = x0 + n0 * nx;
+ double besty = y0 + n0 * ny;
+
+ // update our line fit statistics
+ Mx += bestx;
+ My += besty;
+ Mxx += bestx * bestx;
+ Mxy += bestx * besty;
+ Myy += besty * besty;
+ N++;
+ }
+
+ // fit a line
+ double Ex = Mx / N, Ey = My / N;
+ double Cxx = Mxx / N - Ex * Ex;
+ double Cxy = Mxy / N - Ex * Ey;
+ double Cyy = Myy / N - Ey * Ey;
+
+ // TODO: Can replace this with same code as in fit_line.
+ double normal_theta = .5 * atan2f(-2 * Cxy, (Cyy - Cxx));
+ nx = cosf(normal_theta);
+ ny = sinf(normal_theta);
+ lines[edge][0] = Ex;
+ lines[edge][1] = Ey;
+ lines[edge][2] = nx;
+ lines[edge][3] = ny;
+ }
+
+ // now refit the corners of the quad
+ for (int i = 0; i < 4; i++) {
+ // solve for the intersection of lines (i) and (i+1)&3.
+ double A00 = lines[i][3], A01 = -lines[(i + 1) & 3][3];
+ double A10 = -lines[i][2], A11 = lines[(i + 1) & 3][2];
+ double B0 = -lines[i][0] + lines[(i + 1) & 3][0];
+ double B1 = -lines[i][1] + lines[(i + 1) & 3][1];
+
+ double det = A00 * A11 - A10 * A01;
+
+ // inverse.
+ if (fabs(det) > 0.001) {
+ // solve
+ double W00 = A11 / det, W01 = -A01 / det;
+
+ double L0 = W00 * B0 + W01 * B1;
+
+ // Compute intersection. Note that line i represents the line from corner
+ // i to (i+1)&3, so the intersection of line i with line (i+1)&3
+ // represents corner (i+1)&3.
+ quad->p[(i + 1) & 3][0] = lines[i][0] + L0 * A00;
+ quad->p[(i + 1) & 3][1] = lines[i][1] + L0 * A10;
+ } else {
+ // this is a bad sign. We'll just keep the corner we had.
+ // debug_print("bad det: %15f %15f %15f %15f %15f\n", A00, A11,
+ // A10, A01, det);
+ }
+ }
+}
+
+void GpuDetector::QuadDecodeTask(void *_u) {
+ QuadDecodeTaskStruct *task = reinterpret_cast<QuadDecodeTaskStruct *>(_u);
+ apriltag_detector_t *td = task->td;
+ image_u8_t *im = task->im;
+
+ for (int quadidx = task->i0; quadidx < task->i1; quadidx++) {
+ struct quad quad_original;
+ std::memcpy(quad_original.p, task->quads[quadidx].corners,
+ sizeof(task->quads[quadidx].corners));
+
+ quad_original.reversed_border = task->quads[quadidx].reversed_border;
+ quad_original.H = nullptr;
+ quad_original.Hinv = nullptr;
+
+ if (td->refine_edges) {
+ RefineEdges(td, im, &quad_original);
+ }
+
+ quad_decode_index(td, &quad_original, im, task->im_samples,
+ task->detections);
+ }
+}
+
+void GpuDetector::DecodeTags() {
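+  // Mirror the task partitioning in aprilrobotics' apriltag_detector_detect:
+  // aim for roughly APRILTAG_TASKS_PER_THREAD_TARGET chunks per worker
+  // thread.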
+ size_t chunksize =
+ 1 + quad_corners_host_.size() /
+ (APRILTAG_TASKS_PER_THREAD_TARGET * tag_detector_->nthreads);
+
+ std::vector<QuadDecodeTaskStruct> tasks(
+ (quad_corners_host_.size() / chunksize + 1));
+
+ for (int i = 0; i < zarray_size(detections_); ++i) {
+ apriltag_detection_t *det;
+ zarray_get(detections_, i, &det);
+ apriltag_detection_destroy(det);
+ }
+
+ zarray_truncate(detections_, 0);
+
+ image_u8_t im_orig{
+ .width = static_cast<int32_t>(width_),
+ .height = static_cast<int32_t>(height_),
+ .stride = static_cast<int32_t>(width_),
+ .buf = gray_image_host_.get(),
+ };
+
+ int ntasks = 0;
+ for (size_t i = 0; i < quad_corners_host_.size(); i += chunksize) {
+ tasks[ntasks].i0 = i;
+ tasks[ntasks].i1 = std::min(quad_corners_host_.size(), i + chunksize);
+ tasks[ntasks].quads = quad_corners_host_.data();
+ tasks[ntasks].td = tag_detector_;
+ tasks[ntasks].im = &im_orig;
+ tasks[ntasks].detections = detections_;
+
+ tasks[ntasks].im_samples = nullptr;
+
+ workerpool_add_task(tag_detector_->wp, QuadDecodeTask, &tasks[ntasks]);
+ ntasks++;
+ }
+
+ workerpool_run(tag_detector_->wp);
+
+ reconcile_detections(detections_, poly0_, poly1_);
+
+ zarray_sort(detections_, detection_compare_function);
+}
+
+} // namespace apriltag
+} // namespace frc971
diff --git a/frc971/orin/cuda.cc b/frc971/orin/cuda.cc
index 1173f73..89b1529 100644
--- a/frc971/orin/cuda.cc
+++ b/frc971/orin/cuda.cc
@@ -10,14 +10,20 @@
namespace frc971 {
namespace apriltag {
-void CheckAndSynchronize() {
- CHECK_CUDA(cudaDeviceSynchronize());
- CHECK_CUDA(cudaGetLastError());
+size_t overall_memory = 0;
+
+void CheckAndSynchronize(std::string_view message) {
+ CHECK_CUDA(cudaDeviceSynchronize()) << message;
+ CHECK_CUDA(cudaGetLastError()) << message;
}
void MaybeCheckAndSynchronize() {
if (FLAGS_sync) CheckAndSynchronize();
}
+void MaybeCheckAndSynchronize(std::string_view message) {
+ if (FLAGS_sync) CheckAndSynchronize(message);
+}
+
} // namespace apriltag
} // namespace frc971
diff --git a/frc971/orin/cuda.h b/frc971/orin/cuda.h
index e386aa6..2cc8f44 100644
--- a/frc971/orin/cuda.h
+++ b/frc971/orin/cuda.h
@@ -139,12 +139,15 @@
// Copies data to host memory from this memory asynchronously on the provided
// stream.
- void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
+ void MemcpyAsyncTo(T *host_memory, size_t size, CudaStream *stream) const {
CHECK_CUDA(cudaMemcpyAsync(reinterpret_cast<void *>(host_memory),
reinterpret_cast<void *>(memory_),
- sizeof(T) * size_, cudaMemcpyDeviceToHost,
+ sizeof(T) * size, cudaMemcpyDeviceToHost,
stream->get()));
}
+ void MemcpyAsyncTo(T *host_memory, CudaStream *stream) const {
+ MemcpyAsyncTo(host_memory, size_, stream);
+ }
void MemcpyAsyncTo(HostMemory<T> *host_memory, CudaStream *stream) const {
MemcpyAsyncTo(host_memory->get(), stream);
}
@@ -195,11 +198,12 @@
};
// Synchronizes and CHECKs for success the last CUDA operation.
-void CheckAndSynchronize();
+void CheckAndSynchronize(std::string_view message = "");
// Synchronizes and CHECKS iff --sync is passed on the command line. Makes it
// so we can leave debugging in the code.
void MaybeCheckAndSynchronize();
+void MaybeCheckAndSynchronize(std::string_view message);
} // namespace apriltag
} // namespace frc971
diff --git a/frc971/orin/cuda_april_tag_test.cc b/frc971/orin/cuda_april_tag_test.cc
index 776caf5..9e5eb5f 100644
--- a/frc971/orin/cuda_april_tag_test.cc
+++ b/frc971/orin/cuda_april_tag_test.cc
@@ -25,6 +25,8 @@
DEFINE_bool(debug, false, "If true, write debug images.");
+DECLARE_int32(debug_blob_index);
+
// Get access to the intermediates of aprilrobotics.
extern "C" {
@@ -42,8 +44,45 @@
float slope;
};
+
+struct line_fit_pt {
+ double Mx, My;
+ double Mxx, Myy, Mxy;
+ double W; // total weight
+};
+
+struct line_fit_pt *compute_lfps(int sz, zarray_t *cluster, image_u8_t *im);
+
+void fit_line(struct line_fit_pt *lfps, int sz, int i0, int i1,
+ double *lineparm, double *err, double *mse);
+
+int fit_quad(apriltag_detector_t *td, image_u8_t *im, zarray_t *cluster,
+ struct quad *quad, int tag_width, bool normal_border,
+ bool reversed_border);
+
} // extern C
+std::ostream &operator<<(std::ostream &os, const line_fit_pt &point) {
+ os << "{Mx:" << std::setprecision(20) << point.Mx
+ << ", My:" << std::setprecision(20) << point.My
+ << ", Mxx:" << std::setprecision(20) << point.Mxx
+ << ", Myy:" << std::setprecision(20) << point.Myy
+ << ", Mxy:" << std::setprecision(20) << point.Mxy << ", W:" << point.W
+ << "}";
+ return os;
+}
+
+namespace frc971::apriltag {
+std::ostream &operator<<(std::ostream &os,
+ const frc971::apriltag::LineFitPoint &point) {
+ os << "{Mx:" << point.Mx / 2.0 << ", My:" << point.My / 2.0
+ << ", Mxx:" << point.Mxx / 4.0 << ", Mxy:" << point.Mxy / 4.0
+ << ", Myy:" << point.Myy / 4.0 << ", W:" << point.W << "}";
+ return os;
+}
+
+} // namespace frc971::apriltag
+
// Converts a cv::Mat to an aprilrobotics image.
image_u8_t ToImageu8t(const cv::Mat &img) {
return image_u8_t{
@@ -196,7 +235,7 @@
apriltag_detector_add_family_bits(tag_detector, tag_family, 1);
- tag_detector->nthreads = 1;
+ tag_detector->nthreads = 6;
tag_detector->wp = workerpool_create(tag_detector->nthreads);
tag_detector->qtp.min_white_black_diff = 5;
tag_detector->debug = FLAGS_debug;
@@ -260,6 +299,9 @@
}
~CudaAprilTagDetector() {
+ if (aprilrobotics_detections_ != nullptr) {
+ apriltag_detections_destroy(aprilrobotics_detections_);
+ }
apriltag_detector_destroy(tag_detector_);
free(tag_family_);
}
@@ -286,11 +328,17 @@
num_compressed_union_marker_pair_ =
gpu_detector_.NumCompressedUnionMarkerPairs();
extents_cuda_ = gpu_detector_.CopyExtents();
- reduced_dot_blobs_pair_cuda_ = gpu_detector_.CopyReducedDotBlobs();
+ selected_extents_cuda_ = gpu_detector_.CopySelectedExtents();
selected_blobs_cuda_ = gpu_detector_.CopySelectedBlobs();
sorted_selected_blobs_cuda_ = gpu_detector_.CopySortedSelectedBlobs();
+ line_fit_points_cuda_ = gpu_detector_.CopyLineFitPoints();
+ errors_device_ = gpu_detector_.CopyErrors();
+ filtered_errors_device_ = gpu_detector_.CopyFilteredErrors();
+ peaks_device_ = gpu_detector_.CopyPeaks();
num_quads_ = gpu_detector_.NumQuads();
+ fit_quads_ = gpu_detector_.FitQuads();
+
LOG(INFO) << "num_compressed_union_marker_pair "
<< sorted_union_marker_pair_.size();
}
@@ -307,18 +355,21 @@
};
LOG(INFO) << "Starting CPU detect.";
- zarray_t *detections = apriltag_detector_detect(tag_detector_, &im);
+ if (aprilrobotics_detections_ != nullptr) {
+ apriltag_detections_destroy(aprilrobotics_detections_);
+ }
+ aprilrobotics_detections_ = apriltag_detector_detect(tag_detector_, &im);
timeprofile_display(tag_detector_->tp);
- for (int i = 0; i < zarray_size(detections); i++) {
+ for (int i = 0; i < zarray_size(aprilrobotics_detections_); i++) {
apriltag_detection_t *det;
- zarray_get(detections, i, &det);
+ zarray_get(aprilrobotics_detections_, i, &det);
if (det->decision_margin > FLAGS_min_decision_margin) {
- VLOG(1) << "Found tag number " << det->id
- << " hamming: " << det->hamming
- << " margin: " << det->decision_margin;
+ LOG(INFO) << "Found tag number " << det->id
+ << " hamming: " << det->hamming
+ << " margin: " << det->decision_margin;
}
}
}
@@ -725,12 +776,14 @@
[](std::tuple<float, uint64_t> a, std::tuple<float, uint64_t> b) {
return std::get<1>(a) < std::get<1>(b);
});
+ size_t before_size = pts.size();
pts.erase(std::unique(pts.begin(), pts.end(),
[](std::tuple<float, uint64_t> a,
std::tuple<float, uint64_t> b) {
return std::get<1>(a) == std::get<1>(b);
}),
pts.end());
+ CHECK_EQ(pts.size(), before_size);
std::sort(
pts.begin(), pts.end(),
@@ -927,8 +980,7 @@
const std::vector<std::vector<QuadBoundaryPoint>>
cuda_point_sorted_grouped_points = SortPoints(cuda_grouped_points);
const std::vector<std::vector<uint64_t>> april_sorted_grouped_points =
- SortAprilroboticsPoints(
- SortAprilroboticsPointSizes(april_grouped_points));
+ SortAprilroboticsPoints(april_grouped_points);
CHECK_EQ(april_sorted_grouped_points.size(),
cuda_point_sorted_grouped_points.size());
@@ -952,8 +1004,7 @@
// Tests that the extents and dot products match the cuda versions.
std::pair<size_t, std::vector<IndexPoint>> CheckCudaExtents(
const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points,
- const std::vector<MinMaxExtents> &extents_cuda,
- const std::vector<float> &reduced_dot_blobs_pair_cuda) const {
+ const std::vector<MinMaxExtents> &extents_cuda) const {
std::vector<IndexPoint> selected_blobs;
const size_t max_april_tag_perimeter = 2 * (width_ + height_);
size_t selected_quads = 0;
@@ -962,7 +1013,6 @@
size_t i = 0;
size_t starting_offset = 0;
CHECK_EQ(cuda_grouped_points.size(), extents_cuda.size());
- CHECK_EQ(extents_cuda.size(), reduced_dot_blobs_pair_cuda.size());
for (const std::vector<QuadBoundaryPoint> &points : cuda_grouped_points) {
size_t min_x, min_y, max_x, max_y;
min_x = max_x = points[0].x();
@@ -994,33 +1044,27 @@
bool good_blob_size =
points.size() >= 24 && points.size() <= max_april_tag_perimeter &&
points.size() >=
- static_cast<size_t>(tag_detector_->qtp.min_cluster_pixels);
+ static_cast<size_t>(tag_detector_->qtp.min_cluster_pixels) &&
+ static_cast<int>((max_x - min_x) * (max_y - min_y)) >=
+ min_tag_width_;
- if (good_blob_size) {
- for (size_t j = 0; j < points.size(); ++j) {
- dot += (static_cast<float>(points[j].x()) - cx) * points[j].gx() +
- (static_cast<float>(points[j].y()) - cy) * points[j].gy();
+ for (size_t j = 0; j < points.size(); ++j) {
+ dot += (static_cast<float>(points[j].x()) - cx) * points[j].gx() +
+ (static_cast<float>(points[j].y()) - cy) * points[j].gy();
- // Make sure our blob finder agrees.
- CHECK_EQ(i, finder.FindBlobIndex(starting_offset + j));
- }
- // Test that the summed dot product is right.
- CHECK_LT(
- std::abs(reduced_dot_blobs_pair_cuda[i] - dot) / std::abs(dot),
- 1e-4)
- << ": for point " << i << ", cuda -> "
- << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot;
- } else {
- CHECK_EQ(reduced_dot_blobs_pair_cuda[i], 0.0f)
- << ": for point " << i << ", cuda -> "
- << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot;
+ // Make sure our blob finder agrees.
+ CHECK_EQ(i, finder.FindBlobIndex(starting_offset + j));
}
+ // Test that the summed dot product is right.
+ CHECK_LT(std::abs(extents.dot() - dot) / std::abs(dot), 1e-2)
+ << ": for point " << i << ", cuda -> " << extents.dot()
+ << ", C++ -> " << dot;
+
const bool quad_reversed_border = dot < 0;
- VLOG(1) << "For point " << i << ", cuda -> "
- << reduced_dot_blobs_pair_cuda[i] << ", C++ -> " << dot
- << " size " << points.size() << " border "
+ VLOG(1) << "For point " << i << ", cuda -> " << extents.dot()
+ << ", C++ -> " << dot << " size " << points.size() << " border "
<< (!(!reversed_border_ && quad_reversed_border) &&
!(!normal_border_ && !quad_reversed_border))
<< " area: "
@@ -1167,13 +1211,14 @@
++missmatched_points;
++missmatched_runs;
// We shouldn't see a lot of points in a row which don't match.
- VLOG(1) << "Missmatched point in blob "
+ VLOG(0) << "Missmatched point in blob "
<< cuda_grouped_blob[0].blob_index() << ", point " << j
<< " (" << cuda_grouped_blob[j].x() << ", "
- << cuda_grouped_blob[j].y() << ") vs ("
+ << cuda_grouped_blob[j].y() << "), theta "
+ << cuda_grouped_blob[j].theta() << " vs ("
<< slope_sorted_points[j].x() << ", "
- << slope_sorted_points[j].y() << ")"
- << " in size " << cuda_grouped_points[i].size();
+ << slope_sorted_points[j].y() << "), in size "
+ << cuda_grouped_points[i].size();
CHECK_LE(missmatched_runs, 4u);
} else {
missmatched_runs = 0;
@@ -1183,7 +1228,540 @@
// Or a lot of points overall. The slope algo has duplicate points, and
// is occasionally wrong.
- CHECK_LE(missmatched_points, 50u);
+ CHECK_LE(missmatched_points, 25u);
+ }
+
+ void CheckCudaFilteredExtents(
+ const std::vector<cub::KeyValuePair<long, MinMaxExtents>>
+ &selected_extents_cuda,
+ const std::vector<IndexPoint> &sorted_selected_blobs_cuda) {
+ size_t start = 0;
+ size_t sorted_point_index = 0;
+ for (size_t i = 0; i < selected_extents_cuda.size(); ++i) {
+ VLOG(1) << "Extent " << i << " started at "
+ << selected_extents_cuda[i].value.starting_offset
+ << " with count " << selected_extents_cuda[i].value.count;
+ CHECK_EQ(selected_extents_cuda[i].value.starting_offset, start)
+ << " for extent " << i;
+ size_t found_blobs = 0;
+ CHECK_EQ(selected_extents_cuda[i].value.starting_offset,
+ sorted_point_index);
+ while (sorted_selected_blobs_cuda[sorted_point_index].blob_index() == i) {
+ ++found_blobs;
+ ++sorted_point_index;
+ }
+ CHECK_EQ(found_blobs, selected_extents_cuda[i].value.count);
+ start += selected_extents_cuda[i].value.count;
+ }
+ }
+
+ void CheckErrors(const std::vector<IndexPoint> &sorted_selected_blobs_cuda,
+ const std::vector<LineFitPoint> &line_fit_points_cuda,
+ const std::vector<double> &errors_device,
+ const std::vector<double> &filtered_errors_device,
+ const std::vector<Peak> &peaks_device) {
+ std::vector<std::vector<IndexPoint>> grouped_points =
+ CudaGroupedPoints(sorted_selected_blobs_cuda);
+ image_u8_t quad_im = ToImageu8t(decimated_cuda_);
+ size_t accumulated_size = 0u;
+ size_t blob_index = 0u;
+ size_t bad_errors = 0u;
+ size_t summed_cuda_pts = 0;
+ for (const std::vector<IndexPoint> &group : grouped_points) {
+ std::vector<double> errs;
+ errs.resize(group.size(), 0.0);
+ const size_t ksz = std::min<size_t>(20u, group.size() / 12);
+
+ zarray_t *cluster = zarray_create(sizeof(struct pt));
+
+ for (size_t i = 0; i < group.size(); i++) {
+ pt p{
+ .x = static_cast<uint16_t>(group[i].x()),
+ .y = static_cast<uint16_t>(group[i].y()),
+ };
+ zarray_add(cluster, &p);
+ }
+
+ CHECK_EQ(static_cast<size_t>(zarray_size(cluster)), group.size());
+
+ struct line_fit_pt *lfps = compute_lfps(group.size(), cluster, &quad_im);
+
+ VLOG(1) << "Inspecting blob of size " << group.size() << " global start "
+ << accumulated_size;
+ for (size_t i = 0; i < group.size(); i++) {
+ if (group[i].blob_index() == (size_t)FLAGS_debug_blob_index) {
+ LOG(INFO) << "For idx " << i << " global " << accumulated_size + i
+ << "(" << group[i].x() << ", " << group[i].y()
+ << "), cuda: " << line_fit_points_cuda[accumulated_size + i]
+ << " aprilrobotics " << lfps[i];
+ }
+ CHECK_EQ(line_fit_points_cuda[accumulated_size + i].Mx / 2.0,
+ lfps[i].Mx)
+ << ": for blob index " << group[i].blob_index();
+ CHECK_EQ(line_fit_points_cuda[accumulated_size + i].My / 2.0,
+ lfps[i].My)
+ << ": for blob index " << group[i].blob_index();
+ CHECK_EQ(line_fit_points_cuda[accumulated_size + i].Mxx / 4.0,
+ lfps[i].Mxx)
+ << ": for blob index " << group[i].blob_index();
+ CHECK_EQ(line_fit_points_cuda[accumulated_size + i].Mxy / 4.0,
+ lfps[i].Mxy)
+ << ": for blob index " << group[i].blob_index();
+ CHECK_EQ(line_fit_points_cuda[accumulated_size + i].Myy / 4.0,
+ lfps[i].Myy)
+ << ": for blob index " << group[i].blob_index();
+ CHECK_EQ(line_fit_points_cuda[accumulated_size + i].W, lfps[i].W)
+ << ": for blob index " << group[i].blob_index();
+ }
+
+ for (size_t i = 0; i < group.size(); i++) {
+ if (group[i].blob_index() == (size_t)FLAGS_debug_blob_index) {
+ LOG(INFO) << " Cuda error[" << i << "] -> "
+ << errors_device[accumulated_size + i] << ", filtered "
+ << filtered_errors_device[i + accumulated_size];
+ }
+ }
+
+ for (size_t i = 0; i < group.size(); i++) {
+ const size_t i0 = (i + group.size() - ksz) % group.size();
+ const size_t i1 = (i + ksz) % group.size();
+ const size_t gi0 = accumulated_size + i0;
+ const size_t gi1 = accumulated_size + i1;
+
+ fit_line(lfps, group.size(), i0, i1, NULL, &errs[i], NULL);
+
+ if (std::abs(errs[i] - errors_device[accumulated_size + i]) >=
+ 0.001 * std::max<double>(1.0, errs[i])) {
+ VLOG(1) << "i0 aprilrobotics " << lfps[i0];
+ VLOG(1) << "i0 cuda " << line_fit_points_cuda[gi0];
+ VLOG(1) << " dMx: "
+ << (lfps[i0].Mx * 2 - line_fit_points_cuda[gi0].Mx);
+ VLOG(1) << " dMy: "
+ << (lfps[i0].My * 2 - line_fit_points_cuda[gi0].My);
+ VLOG(1) << " dMxx: "
+ << (lfps[i0].Mxx * 4 - line_fit_points_cuda[gi0].Mxx);
+ VLOG(1) << " dMxy: "
+ << (lfps[i0].Mxy * 4 - line_fit_points_cuda[gi0].Mxy);
+ VLOG(1) << " dMyy: "
+ << (lfps[i0].Myy * 4 - line_fit_points_cuda[gi0].Myy);
+ VLOG(1) << " dW: " << (lfps[i0].W - line_fit_points_cuda[gi0].W);
+ VLOG(1) << "i1 aprilrobotics " << lfps[i1];
+ VLOG(1) << "i1 cuda " << line_fit_points_cuda[gi1];
+ VLOG(1) << " dMx: "
+ << (lfps[i1].Mx * 2 - line_fit_points_cuda[gi1].Mx);
+ VLOG(1) << " dMy: "
+ << (lfps[i1].My * 2 - line_fit_points_cuda[gi1].My);
+ VLOG(1) << " dMxx: "
+ << (lfps[i1].Mxx * 4 - line_fit_points_cuda[gi1].Mxx);
+ VLOG(1) << " dMxy: "
+ << (lfps[i1].Mxy * 4 - line_fit_points_cuda[gi1].Mxy);
+ VLOG(1) << " dMyy: "
+ << (lfps[i1].Myy * 4 - line_fit_points_cuda[gi1].Myy);
+ VLOG(1) << " dW: " << (lfps[i1].W - line_fit_points_cuda[gi1].W);
+ VLOG(1) << "derror: " << errs[i] << " - "
+ << errors_device[accumulated_size + i] << " = "
+ << errs[i] - errors_device[accumulated_size + i];
+
+ ++bad_errors;
+ }
+ }
+
+ std::vector<float> filter_coefficients;
+ {
+ // Stolen directly from aprilrobotics to test.
+
+ double sigma = 1; // was 3
+
+ // cutoff = exp(-j*j/(2*sigma*sigma));
+ // log(cutoff) = -j*j / (2*sigma*sigma)
+ // log(cutoff)*2*sigma*sigma = -j*j;
+
+ // how big a filter should we use? We make our kernel big
+ // enough such that we represent any values larger than
+ // 'cutoff'.
+
+ // XXX Tunable (though not super useful to change)
+ double cutoff = 0.05;
+ int fsz = sqrt(-log(cutoff) * 2 * sigma * sigma) + 1;
+ fsz = 2 * fsz + 1;
+
+ // For the aprilrobotics defaults of cutoff = 0.05, sigma = 3, fsz
+ // would be 17; with sigma = 1 here, fsz = 7.
+
+ for (int i = 0; i < fsz; i++) {
+ int j = i - fsz / 2;
+ filter_coefficients.push_back(exp(-j * j / (2 * sigma * sigma)));
+ CHECK_EQ(filter_coefficients[i], FilterCoefficients()[i]);
+ }
+ }
+
+ std::vector<double> filtered_errors;
+ filtered_errors.resize(group.size(), 0.0);
+
+ for (size_t i = 0; i < group.size(); i++) {
+ double accumulated_cuda = 0;
+ double accumulated = 0;
+ for (int filter_coefficient_index = 0;
+ filter_coefficient_index <
+ static_cast<int>(filter_coefficients.size());
+ filter_coefficient_index++) {
+ const double e =
+ errs[(i + filter_coefficient_index -
+ filter_coefficients.size() / 2 + group.size()) %
+ group.size()];
+ accumulated += e * filter_coefficients[filter_coefficient_index];
+
+ const double e_cuda =
+ errors_device[(i + filter_coefficient_index -
+ filter_coefficients.size() / 2 + group.size()) %
+ group.size() +
+ accumulated_size];
+ accumulated_cuda +=
+ e_cuda * filter_coefficients[filter_coefficient_index];
+ }
+ filtered_errors[i] = accumulated;
+
+ // Make sure the filter math works. This can be a lot tighter since the
+ // error calc appears to be less accurate.
+ CHECK_LT(std::abs(accumulated_cuda -
+ filtered_errors_device[accumulated_size + i]),
+ 1e-3)
+ << " Failed to match error at blob " << blob_index << " index " << i
+ << " global index " << accumulated_size + i
+ << " for a blob of size " << group.size() << " expected "
+ << filtered_errors[i] << " got "
+ << filtered_errors_device[accumulated_size + i];
+
+ // Make sure it hasn't walked too far off from the actual error.
+ /*
+ CHECK_LT(std::abs(filtered_errors[i] -
+ filtered_errors_device[accumulated_size + i]),
+ 1e-1)
+ << " Failed to match error at blob " << blob_index << " index " << i
+ << " global index " << accumulated_size + i
+ << " for a blob of size " << group.size() << " expected "
+ << filtered_errors[i] << " got "
+ << filtered_errors_device[accumulated_size + i];
+ */
+ }
+
+ // Now, check the peaks.
+ size_t num_peaks_cuda = 0;
+ size_t num_peaks_april = 0;
+ for (size_t i = 0; i < group.size(); i++) {
+ double before_aprilrobotics =
+ filtered_errors[(i + group.size() - 1) % group.size()];
+ double after_aprilrobotics = filtered_errors[(i + 1) % group.size()];
+ double us_aprilrobotics = filtered_errors[i];
+
+ double before_cuda =
+ filtered_errors_device[(i + group.size() - 1) % group.size() +
+ accumulated_size];
+ double after_cuda =
+ filtered_errors_device[(i + 1) % group.size() + accumulated_size];
+ double us_cuda = filtered_errors_device[i + accumulated_size];
+
+ const bool is_peak_aprilrobotics =
+ us_aprilrobotics > before_aprilrobotics &&
+ us_aprilrobotics > after_aprilrobotics;
+ const bool is_peak_cuda = us_cuda > before_cuda && us_cuda > after_cuda;
+
+ if (is_peak_aprilrobotics) {
+ ++num_peaks_april;
+ }
+ // We don't agree... But it is on little stupid stuff.
+ /*CHECK_EQ(is_peak_aprilrobotics, peaks_device[accumulated_size + i])
+ << " Failed to match peak at blob " << blob_index << " index " << i
+ << " global index " << accumulated_size + i
+ << " for a blob of size " << group.size() << " compared "
+ << before_aprilrobotics << " vs " << us_aprilrobotics << " vs "
+ << after_aprilrobotics;*/
+ CHECK_EQ(is_peak_cuda, peaks_device[accumulated_size + i].blob_index !=
+ Peak::kNoPeak())
+ << " Failed to match peak at blob " << blob_index << " index " << i
+ << " global index " << accumulated_size + i
+ << " for a blob of size " << group.size() << " compared "
+ << before_cuda << " vs " << us_cuda << " vs " << after_cuda;
+ CHECK_EQ(peaks_device[accumulated_size + i].filtered_point_index,
+ accumulated_size + i);
+ CHECK_NEAR(peaks_device[accumulated_size + i].error,
+ -filtered_errors_device[accumulated_size + i], 1e-3);
+ if (is_peak_cuda) {
+ CHECK_EQ(peaks_device[accumulated_size + i].blob_index,
+ group[0].blob_index())
+ << " Failed to match peak at blob " << blob_index << " index "
+ << i << " global index " << accumulated_size + i
+ << " for a blob of size " << group.size() << " compared "
+ << before_cuda << " vs " << us_cuda << " vs " << after_cuda;
+ }
+ if (is_peak_cuda) {
+ ++num_peaks_cuda;
+ ++summed_cuda_pts;
+ }
+ }
+
+ (void)num_peaks_cuda;
+ (void)num_peaks_april;
+ // CHECK_EQ(num_peaks_cuda, num_peaks_april);
+
+ zarray_destroy(cluster);
+ free(lfps);
+ accumulated_size += group.size();
+ ++blob_index;
+ }
+ LOG(INFO) << "Overall, found " << summed_cuda_pts << " peaks with "
+ << bad_errors << " bad errors.";
+ // CHECK_LT(bad_errors, 60u);
+ }
+
+ // Orders the clusters reported by AprilRobotics to match the inbound cuda
+ // cluster order.
+ zarray_t *OrderAprilroboticsLikeCuda(
+ image_u8_t *thresholded_im, unionfind_t *uf,
+ const std::vector<std::vector<QuadBoundaryPoint>> &cuda_grouped_points)
+ const {
+ // Step 1, label both the cuda and april robotics points with their original
+ // index. Step 2, sort both, retaining the indices. Step 3, reorder the
+ // april robotics points to match the cuda points using the indices.
+ zarray_t *clusters =
+ gradient_clusters(tag_detector_, thresholded_im, thresholded_im->width,
+ thresholded_im->height, thresholded_im->stride, uf);
+
+ // Reorder a copy of april robotics clusters to our format.
+ struct AprilClusterIndexPair {
+ size_t index;
+ std::vector<uint64_t> points;
+ };
+ std::vector<AprilClusterIndexPair> april_points;
+
+ for (int i = 0; i < zarray_size(clusters); i++) {
+ zarray_t *cluster;
+ zarray_get(clusters, i, &cluster);
+
+ AprilClusterIndexPair points;
+ points.index = i;
+
+ for (int j = 0; j < zarray_size(cluster); j++) {
+ struct pt *p;
+ zarray_get_volatile(cluster, j, &p);
+
+ points.points.push_back(p->x + p->y * width_);
+ }
+
+ std::sort(points.points.begin(), points.points.end());
+ points.points.erase(
+ std::unique(points.points.begin(), points.points.end()),
+ points.points.end());
+ april_points.emplace_back(std::move(points));
+ }
+
+ // And sort the blobs now.
+ std::sort(april_points.begin(), april_points.end(), [](auto &x, auto &y) {
+ if (x.points.size() == y.points.size()) {
+ for (size_t j = 0; j < x.points.size(); ++j) {
+ if (x.points[j] == y.points[j]) {
+ continue;
+ }
+ return x.points[j] < y.points[j];
+ }
+ LOG(FATAL) << "Equal";
+ }
+ return x.points.size() < y.points.size();
+ });
+
+ struct CudaClusterIndexPair {
+ size_t index;
+ std::vector<QuadBoundaryPoint> points;
+ };
+
+ // Do the same for CUDA.
+ std::vector<CudaClusterIndexPair> cuda_points;
+ for (size_t i = 0; i < cuda_grouped_points.size(); ++i) {
+ CudaClusterIndexPair points;
+ points.points = cuda_grouped_points[i];
+ points.index = i;
+
+ std::sort(points.points.begin(), points.points.end(),
+ [this](const QuadBoundaryPoint &a, const QuadBoundaryPoint &b) {
+ CHECK_EQ(a.rep01(), b.rep01());
+ return a.x() + a.y() * width_ < b.x() + b.y() * width_;
+ });
+ cuda_points.emplace_back(std::move(points));
+ }
+
+ // And sort the blobs.
+ std::sort(cuda_points.begin(), cuda_points.end(), [this](auto &x, auto &y) {
+ if (x.points.size() == y.points.size()) {
+ for (size_t j = 0; j < x.points.size(); ++j) {
+ if (x.points[j].x() + x.points[j].y() * width_ ==
+ y.points[j].x() + y.points[j].y() * width_) {
+ continue;
+ }
+ return x.points[j].x() + x.points[j].y() * width_ <
+ y.points[j].x() + y.points[j].y() * width_;
+ }
+ LOG(FATAL) << "Equal";
+ }
+ return x.points.size() < y.points.size();
+ });
+
+ // OK, we now have the cuda points in the same order as the aprilrobotics
+ // points. First CHECK that they match, just in case...
+ std::vector<zarray_t *> sorted_result;
+ sorted_result.resize(cuda_points.size());
+
+ CHECK_EQ(april_points.size(), cuda_points.size());
+ for (size_t i = 0; i < april_points.size(); ++i) {
+ CHECK_EQ(april_points[i].points.size(), cuda_points[i].points.size());
+ for (size_t j = 0; j < april_points[i].points.size(); ++j) {
+ CHECK_EQ(april_points[i].points[j],
+ cuda_points[i].points[j].x() +
+ cuda_points[i].points[j].y() * width_)
+ << ": " << i << " " << j;
+ }
+
+ // And, while we are iterating, make a vector with the clusters in the
+ // right order.
+ zarray_t *cluster;
+ zarray_get(clusters, april_points[i].index, &cluster);
+ sorted_result[cuda_points[i].index] = cluster;
+ }
+
+ // Finally, put it back in the clusters.
+ for (size_t i = 0; i < april_points.size(); ++i) {
+ zarray_set(clusters, i, &sorted_result[i], nullptr);
+ }
+
+ return clusters;
+ }
+
+ void CheckQuads(const zarray_t *clusters,
+ std::vector<IndexPoint> sorted_selected_blobs_cuda,
+ const std::vector<QuadCorners> &fit_quads) {
+ std::vector<std::vector<IndexPoint>> sorted_blobs;
+ for (auto blob : CudaGroupedPoints(sorted_selected_blobs_cuda)) {
+ while (blob[0].blob_index() > sorted_blobs.size()) {
+ sorted_blobs.emplace_back();
+ }
+ sorted_blobs.emplace_back(std::move(blob));
+ }
+
+ image_u8_t quad_im = ToImageu8t(decimated_cuda_);
+
+ std::vector<bool> aprilrobotics_valid;
+ std::vector<struct quad> aprilrobotics_quad;
+ aprilrobotics_valid.resize(zarray_size(clusters));
+ aprilrobotics_quad.resize(zarray_size(clusters));
+
+ for (int i = 0; i < zarray_size(clusters); ++i) {
+ struct quad quad_result;
+
+ zarray_t *cluster;
+ zarray_get(clusters, i, &cluster);
+
+ if (i == FLAGS_debug_blob_index) {
+ LOG(INFO) << "cuda points for blob " << i << " are";
+ for (size_t j = 0; j < sorted_blobs[i].size(); ++j) {
+ LOG(INFO) << " blob[" << j << "]: (" << sorted_blobs[i][j].x()
+ << ", " << sorted_blobs[i][j].y() << ")";
+ }
+ }
+ VLOG(1) << "Going to fit quad " << i << " of size "
+ << zarray_size(cluster);
+ int valid_blob =
+ fit_quad(tag_detector_, &quad_im, cluster, &quad_result,
+ min_tag_width_, normal_border_, reversed_border_);
+
+ auto quad_iterator = std::find_if(fit_quads.begin(), fit_quads.end(),
+ [&](const QuadCorners corner) {
+ return (int)corner.blob_index == i;
+ });
+
+ CHECK_EQ(quad_iterator != fit_quads.end(), valid_blob != 0)
+ << ": Missmatch on quad " << i;
+
+ if (!valid_blob) {
+ continue;
+ }
+
+ // Apply the center adjustment algorithm to the ground truth so we compare
+ // apples to apples.
+ gpu_detector_.AdjustCenter(quad_result.p);
+
+ QuadCorners cuda_corner = *quad_iterator;
+
+ for (size_t point = 0; point < 4; ++point) {
+ CHECK_NEAR(quad_result.p[point][0], cuda_corner.corners[point][0],
+ 1e-3);
+ CHECK_NEAR(quad_result.p[point][1], cuda_corner.corners[point][1],
+ 1e-3);
+ }
+ }
+ }
+
+ void CheckDetections(zarray_t *aprilrobotics_detections,
+ const zarray_t *gpu_detections) {
+ CHECK_EQ(zarray_size(aprilrobotics_detections),
+ zarray_size(gpu_detections));
+ LOG(INFO) << "Found " << zarray_size(gpu_detections) << " tags";
+
+ for (int i = 0; i < zarray_size(aprilrobotics_detections); ++i) {
+ const apriltag_detection_t *aprilrobotics_detection;
+ const apriltag_detection_t *gpu_detection;
+
+ zarray_get(aprilrobotics_detections, i, &aprilrobotics_detection);
+ zarray_get(gpu_detections, i, &gpu_detection);
+
+ bool valid = gpu_detection->decision_margin > FLAGS_min_decision_margin;
+
+ LOG(INFO) << "Found GPU " << (valid ? "valid" : "invalid")
+ << " tag number " << gpu_detection->id
+ << " hamming: " << gpu_detection->hamming
+ << " margin: " << gpu_detection->decision_margin;
+ LOG(INFO) << "Found CPU " << (valid ? "valid" : "invalid")
+ << " tag number " << aprilrobotics_detection->id
+ << " hamming: " << aprilrobotics_detection->hamming
+ << " margin: " << aprilrobotics_detection->decision_margin;
+ }
+
+ for (int i = 0; i < zarray_size(aprilrobotics_detections); ++i) {
+ const apriltag_detection_t *aprilrobotics_detection;
+ const apriltag_detection_t *gpu_detection;
+
+ zarray_get(aprilrobotics_detections, i, &aprilrobotics_detection);
+ zarray_get(gpu_detections, i, &gpu_detection);
+
+ const bool valid =
+ gpu_detection->decision_margin > FLAGS_min_decision_margin;
+
+ // TODO(austin): Crank down the thresholds and figure out why these
+ // deviate. It should be the same function for both at this point.
+ const double threshold = valid ? 2e-3 : 1e-1;
+
+ CHECK_EQ(aprilrobotics_detection->id, gpu_detection->id);
+ CHECK_EQ(aprilrobotics_detection->hamming, gpu_detection->hamming);
+ EXPECT_NEAR(aprilrobotics_detection->c[0], gpu_detection->c[0],
+ threshold);
+ EXPECT_NEAR(aprilrobotics_detection->c[1], gpu_detection->c[1],
+ threshold);
+
+ for (int j = 0; j < 4; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ EXPECT_NEAR(aprilrobotics_detection->p[j][k], gpu_detection->p[j][k],
+ threshold);
+ }
+ }
+
+ CHECK_EQ(aprilrobotics_detection->H->nrows, gpu_detection->H->nrows);
+ CHECK_EQ(aprilrobotics_detection->H->ncols, gpu_detection->H->ncols);
+
+ for (size_t j = 0; j < gpu_detection->H->ncols; ++j) {
+ for (size_t k = 0; k < gpu_detection->H->nrows; ++k) {
+ EXPECT_NEAR(matd_get(gpu_detection->H, k, j),
+ matd_get(aprilrobotics_detection->H, k, j), threshold);
+ }
+ }
+ }
}
// Checks that the GPU and CPU algorithms match.
@@ -1250,8 +1828,8 @@
// Test that the extents are reasonable.
size_t selected_quads;
std::vector<IndexPoint> selected_blobs;
- std::tie(selected_quads, selected_blobs) = CheckCudaExtents(
- cuda_grouped_points, extents_cuda_, reduced_dot_blobs_pair_cuda_);
+ std::tie(selected_quads, selected_blobs) =
+ CheckCudaExtents(cuda_grouped_points, extents_cuda_);
// And that the filtered list is reasonable.
CheckFilteredCudaPoints(selected_blobs, selected_blobs_cuda_);
@@ -1261,10 +1839,30 @@
selected_quads, selected_blobs,
sorted_selected_blobs_cuda_);
- // TODO(austin): Check the selected extents is right.
+ CheckCudaFilteredExtents(selected_extents_cuda_,
+ sorted_selected_blobs_cuda_);
+
+ CheckErrors(sorted_selected_blobs_cuda_, line_fit_points_cuda_,
+ errors_device_, filtered_errors_device_, peaks_device_);
+
+ zarray_t *april_clusters =
+ OrderAprilroboticsLikeCuda(thresholded_im, uf, cuda_grouped_points);
+
+ CheckQuads(april_clusters, sorted_selected_blobs_cuda_, fit_quads_);
+
+ const zarray_t *gpu_detections = gpu_detector_.Detections();
+ CheckDetections(aprilrobotics_detections_, gpu_detections);
+
LOG(INFO) << "Found slope sorted count: "
<< sorted_union_marker_pair_.size();
+ for (int i = 0; i < zarray_size(april_clusters); i++) {
+ zarray_t *cluster;
+ zarray_get(april_clusters, i, &cluster);
+ zarray_destroy(cluster);
+ }
+
+ zarray_destroy(april_clusters);
unionfind_destroy(uf);
image_u8_destroy(quad_im);
image_u8_destroy(thresholded_im);
@@ -1442,14 +2040,22 @@
std::vector<QuadBoundaryPoint> sorted_union_marker_pair_;
std::vector<uint32_t> quad_length_;
std::vector<MinMaxExtents> extents_cuda_;
- std::vector<float> reduced_dot_blobs_pair_cuda_;
+ std::vector<cub::KeyValuePair<long, MinMaxExtents>> selected_extents_cuda_;
std::vector<IndexPoint> selected_blobs_cuda_;
std::vector<IndexPoint> sorted_selected_blobs_cuda_;
+ std::vector<LineFitPoint> line_fit_points_cuda_;
+
+ std::vector<double> errors_device_;
+ std::vector<double> filtered_errors_device_;
+ std::vector<Peak> peaks_device_;
int num_compressed_union_marker_pair_ = 0;
int num_quads_ = 0;
+ std::vector<QuadCorners> fit_quads_;
GpuDetector gpu_detector_;
+ zarray_t *aprilrobotics_detections_ = nullptr;
+
size_t width_;
size_t height_;
@@ -1463,7 +2069,7 @@
aos::FlatbufferVector<frc971::vision::CameraImage> ReadImage(
std::string_view path) {
return aos::FileToFlatbuffer<frc971::vision::CameraImage>(
- "../apriltag_test_bfbs_images/" + std::string(path));
+ "../" + std::string(path));
}
cv::Mat ToMat(const frc971::vision::CameraImage *image) {
@@ -1474,7 +2080,7 @@
};
TEST_F(AprilDetectionTest, ImageRepeat) {
- auto image = ReadImage("bfbs-capture-2013-01-18_08-54-09.501047728.bfbs");
+ auto image = ReadImage("orin_image_apriltag/file/orin_image_apriltag.bfbs");
LOG(INFO) << "Image is: " << image.message().cols() << " x "
<< image.message().rows();
@@ -1484,14 +2090,14 @@
const cv::Mat color_image = ToMat(&image.message());
- for (size_t i = 0; i < 10; ++i) {
+ for (size_t i = 0; i < 4; ++i) {
LOG(INFO) << "Attempt " << i;
cuda_detector.DetectGPU(color_image.clone());
cuda_detector.DetectCPU(color_image.clone());
- cuda_detector.Check(color_image.clone());
- if (FLAGS_debug) {
- cuda_detector.WriteDebug(color_image);
- }
+ }
+ cuda_detector.Check(color_image.clone());
+ if (FLAGS_debug) {
+ cuda_detector.WriteDebug(color_image);
}
}
@@ -1522,16 +2128,91 @@
// Tests that both algorithms agree on a bunch of pixels.
INSTANTIATE_TEST_SUITE_P(
CapturedImages, SingleAprilDetectionTest,
- ::testing::Values("bfbs-capture-2013-01-18_08-54-16.869057537.bfbs",
- "bfbs-capture-2013-01-18_08-54-09.501047728.bfbs",
- "bfbs-capture-2013-01-18_08-51-24.861065764.bfbs",
- "bfbs-capture-2013-01-18_08-52-01.846912552.bfbs",
- "bfbs-capture-2013-01-18_08-52-33.462848049.bfbs",
- "bfbs-capture-2013-01-18_08-54-24.931661979.bfbs",
- "bfbs-capture-2013-01-18_09-29-16.806073486.bfbs",
- "bfbs-capture-2013-01-18_09-33-00.993756514.bfbs",
- "bfbs-capture-2013-01-18_08-57-00.171120695.bfbs",
- "bfbs-capture-2013-01-18_08-57-17.858752817.bfbs",
- "bfbs-capture-2013-01-18_08-57-08.096597542.bfbs"));
+ ::testing::Values(
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-54-16.869057537.bfbs",
+ "orin_image_apriltag/file/orin_image_apriltag.bfbs",
+ "orin_large_image_apriltag/file/orin_large_gs_apriltag.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-54-09.501047728.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-51-24.861065764.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-52-01.846912552.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-52-33.462848049.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-54-24.931661979.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_09-29-16.806073486.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_09-33-00.993756514.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-57-00.171120695.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-57-17.858752817.bfbs",
+ "apriltag_test_bfbs_images/"
+ "bfbs-capture-2013-01-18_08-57-08.096597542.bfbs"));
+
+// Tests that the filter coefficients produced by FilterCoefficients() match the
+// implementation in AprilRobotics.
+TEST(FilterTest, Matches) {
+ constexpr double sigma = 1; // was 3
+
+ // cutoff = exp(-j*j/(2*sigma*sigma));
+ // log(cutoff) = -j*j / (2*sigma*sigma)
+ // log(cutoff)*2*sigma*sigma = -j*j;
+
+ // how big a filter should we use? We make our kernel big
+ // enough such that we represent any values larger than
+ // 'cutoff'.
+
+ // XXX Tunable (though not super useful to change)
+ constexpr double cutoff = 0.05;
+ int fsz = sqrt(-log(cutoff) * 2 * sigma * sigma) + 1;
+ fsz = 2 * fsz + 1;
+
+ // For the aprilrobotics defaults of cutoff = 0.05, sigma = 3, fsz would be
+ // 17; with sigma = 1 here, fsz = 7, matching FilterCoefficients().
+ std::vector<float> f;
+
+ for (int i = 0; i < fsz; i++) {
+ const int j = i - fsz / 2;
+ f.emplace_back(exp(-j * j / (2 * sigma * sigma)));
+ }
+
+ constexpr std::array<float, 7> coefficients = FilterCoefficients();
+ ASSERT_EQ(coefficients.size(), f.size());
+
+ for (size_t i = 0; i < f.size(); ++i) {
+ EXPECT_EQ(coefficients[i], f[i]);
+ }
+}
+
+// Tests that Unrank returns the right value for each input.
+TEST(FilterTest, Unrank) {
+ const int nmaxima = 10;
+ int overall_count = 0;
+ for (int m0 = 0; m0 < nmaxima - 3; m0++) {
+ for (int m1 = m0 + 1; m1 < nmaxima - 2; m1++) {
+ for (int m2 = m1 + 1; m2 < nmaxima - 1; m2++) {
+ for (int m3 = m2 + 1; m3 < nmaxima; m3++) {
+ const std::tuple<int, int, int, int> unranked = Unrank(overall_count);
+
+ VLOG(1) << overall_count << " -> [" << m0 << ", " << m1 << ", " << m2
+ << ", " << m3 << "] -> [" << std::get<0>(unranked) << ", "
+ << std::get<1>(unranked) << ", " << std::get<2>(unranked)
+ << ", " << std::get<3>(unranked) << "]";
+ ASSERT_EQ(m0, std::get<0>(unranked));
+ ASSERT_EQ(m1, std::get<1>(unranked));
+ ASSERT_EQ(m2, std::get<2>(unranked));
+ ASSERT_EQ(m3, std::get<3>(unranked));
+ ++overall_count;
+ }
+ }
+ }
+ }
+ EXPECT_EQ(overall_count, MaxRankedIndex());
+}
} // namespace frc971::apriltag::testing
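
(For intuition: Unrank inverts the rank of a sorted 4-element subset of the
nmaxima = 10 corner candidates. The loop above visits all Binomial(10, 4) =
210 subsets in lexicographic order, so the mapping looks like the sketch
below, derived from the test rather than from additional API.)

  // Unrank(0)   -> (0, 1, 2, 3)
  // Unrank(1)   -> (0, 1, 2, 4)
  // ...
  // Unrank(6)   -> (0, 1, 2, 9)
  // Unrank(7)   -> (0, 1, 3, 4)
  // ...
  // Unrank(209) -> (6, 7, 8, 9), with MaxRankedIndex() == 210.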
diff --git a/frc971/orin/line_fit_filter.cc b/frc971/orin/line_fit_filter.cc
new file mode 100644
index 0000000..954976a
--- /dev/null
+++ b/frc971/orin/line_fit_filter.cc
@@ -0,0 +1,1235 @@
+#include "frc971/orin/line_fit_filter.h"
+
+#include <cub/block/block_reduce.cuh>
+#include <cub/warp/warp_merge_sort.cuh>
+#include <iomanip>
+
+#include "frc971/orin/cuda.h"
+
+// #define DEBUG_BLOB_NUMBER 401
+
+namespace frc971 {
+namespace apriltag {
+
+static_assert(sizeof(LineFitPoint) == 40, "Size of LineFitPoint changed");
+static_assert(sizeof(int4) == 16, "Size of int4 changed");
+
+constexpr size_t kPointsPerBlock = 256;
+// Number of errors to calculate before and after for the err filter and peak
+// finder.
+constexpr size_t kBeforeBuffer = 4;
+constexpr size_t kAfterBuffer = 4;
+constexpr size_t kErrorsBuffer = kBeforeBuffer + kAfterBuffer;
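+//
+// Each block caches kPointsPerBlock points in shared memory but only produces
+// kPointsPerBlock - 2 * kErrorsBuffer results. Adjacent blocks overlap by
+// kErrorsBuffer points on each side so the error filter and peak finder have
+// the neighboring errors they need without a second trip to global memory.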
+
+__device__ double FitLineError(size_t N, int64_t Mx, int64_t My, int64_t Mxx,
+ int64_t Myy, int64_t Mxy, int64_t W) {
+ int64_t Cxx = Mxx * W - Mx * Mx;
+ int64_t Cxy = Mxy * W - Mx * My;
+ int64_t Cyy = Myy * W - My * My;
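+
+ // Note: the moments are accumulated in doubled fixed-point coordinates
+ // (Mx/My carry a factor of 2, Mxx/Mxy/Myy a factor of 4), so Cxx, Cxy, and
+ // Cyy all carry an extra 4 * W^2 scale. The divide by 8 * W^2 below both
+ // removes that scale and applies the 1/2 in the closed-form smallest
+ // eigenvalue of the covariance matrix [[Cxx, Cxy], [Cxy, Cyy]].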
+
+ // Pose it as an eigenvalue problem.
+ //
+ // TODO(austin): Are floats good enough? Hard to tell what the "right answer"
+ // actually is...
+ float eig_small = ((Cxx + Cyy) - std::hypotf((Cxx - Cyy), 2 * Cxy)) /
+ static_cast<float>(W * W * 8.0);
+ return N * eig_small;
+}
+
+struct TempStorage {
+ // Block of points that we pre-load.
+ LineFitPoint tmp_storage[kPointsPerBlock];
+
+ // The errors loaded. The first kBeforeBuffer entries hold the errors for
+ // the indices (possibly duplicated) immediately before our range, and the
+ // last kAfterBuffer entries hold the errors for the indices immediately
+ // following it.
+ double tmp_errs[kPointsPerBlock - 2 * kAfterBuffer];
+
+ // The errors which are from the start of the first blob before the beginning
+ // of the buffer.
+ double tmp_errs_before[kAfterBuffer];
+ // The errors which are from the end of the last blob after the end of the
+ // buffer.
+ double tmp_errs_after[kAfterBuffer];
+
+ // The filtered errors in the primary block.
+ double tmp_filtered_errs[kPointsPerBlock - 4 * kAfterBuffer + 2];
+
+ // The filtered error corresponding to the first error in tmp_errs_before.
+ double tmp_filtered_err_before;
+ // The filtered error corresponding to the last error in tmp_errs_after.
+ double tmp_filtered_err_after;
+
+ static_assert(sizeof(tmp_storage) == sizeof(LineFitPoint) * kPointsPerBlock,
+ "Storage changed size.");
+};
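+
+// Indexing note: with kPointsPerBlock = 256 and kErrorsBuffer = 8, tmp_errs
+// holds the 248 errors for global indices [block_start - kBeforeBuffer,
+// block_end + kAfterBuffer), while tmp_errs_before / tmp_errs_after hold the
+// wrapped errors at the boundaries of the first and last blob this block
+// touches.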
+
+template <size_t kThreads>
+class ErrorCalculator {
+ public:
+ __host__ __device__ ErrorCalculator(
+ const LineFitPoint *line_fit_points_device, size_t points,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ TempStorage *storage)
+ : line_fit_points_device_(line_fit_points_device),
+ points_(points),
+ selected_extents_device_(selected_extents_device),
+ storage_(storage) {}
+
+ __host__ __device__ void Load() {
+ constexpr size_t kLoadSize = sizeof(int4);
+ static_assert(kLoadSize == 16, "kload");
+
+ // Global index in 128 bit loads.
+ const uint32_t global_index =
+ global_block_index_cache_start_ * sizeof(LineFitPoint) / kLoadSize +
+ threadIdx.x;
+
+ for (uint32_t i = 0; i < kPointsPerBlock * sizeof(LineFitPoint) / kLoadSize;
+ i += kThreads) {
+ // Load things in 128 bit increments strided across our threads in such a
+ // way our loads will coalesce nicely.
+ //
+ // Maybe over-read a little bit past the end, but we over-allocate
+ // things on the host too. Otherwise the end of the buffer can be
+ // corrupted. For arrays which are multiples of kPointsPerBlock long,
+ // this will have no issues.
+ if (i + global_index <= points_ * sizeof(LineFitPoint) / kLoadSize) {
+ *(reinterpret_cast<int4 *>(storage_->tmp_storage) + i + threadIdx.x) =
+ *(reinterpret_cast<const int4 *>(line_fit_points_device_) +
+ global_index + i);
+ }
+ }
+ }
+
+ // Loads the blob extents for the provided buffer index. This index is from
+ // the front of the range we are responsible for computing.
+ __host__ __device__ void LoadExtents(size_t index) {
+ const size_t current_blob_index =
+ storage_->tmp_storage[index + (blockIdx.x == 0 ? 0 : kErrorsBuffer)]
+ .blob_index;
+ if (current_blob_index != blob_index_) {
+ starting_offset_ =
+ selected_extents_device_[current_blob_index].value.starting_offset;
+ count_ = selected_extents_device_[current_blob_index].value.count;
+ blob_index_ = current_blob_index;
+ ksz_ = std::min<int>(20, count_ / 12);
+ }
+ }
+
+ // Loads the error from our shared memory. This follows the same model as
+ // BlobIndex and CalculateError below. Call LoadExtents first to get the
+ // right index, and proceed from there.
+ __host__ __device__ double LoadError(ssize_t index) const {
+ // Index into the current blob. Should be between 0 and count_.
+ const uint32_t positive_blob_index = (index + count_) % count_;
+
+ // This one is a normal one: its global index is inside our range or
+ // contiguously just before our range. Just do it.
+ //
+ // (note: kBeforeBuffer should be - on the right side of the >=, but due to
+ // unsigned arithmetic, it needs to be on the left.)
+ if (positive_blob_index + starting_offset_ + kBeforeBuffer >=
+ global_block_index_start()) {
+ // It is either inside our block, or just after our block. That is all
+ // contiguous in memory and easy to calculate, so just return it.
+ if (positive_blob_index + starting_offset_ <
+ global_block_index_end() + kAfterBuffer) {
+ return storage_->tmp_errs[positive_blob_index + starting_offset_ +
+ kBeforeBuffer - global_block_index_start()];
+ }
+
+ // OK, this is past the back of our range, at the tail end of the last
+ // blob. Those errors are packed into the 4 entries of tmp_errs_after.
+ const size_t buffer_end = starting_offset_ + count_;
+ const size_t distance_from_end =
+ buffer_end - (positive_blob_index + starting_offset_);
+ return storage_->tmp_errs_after[4 - distance_from_end];
+ } else {
+ // We must be inside the before bits.
+ return storage_->tmp_errs_before[positive_blob_index];
+ }
+ }
+
+ // Loads the filtered error from the shared memory. This follows the same
+ // model as BlobIndex and CalculateError below. Call LoadExtents first to get
+ // the right index, and proceed from there.
+ __host__ __device__ double LoadFilteredError(ssize_t index) const {
+ const uint32_t positive_blob_index =
+ (index + global_block_index_start_ + count_ - starting_offset_) %
+ count_;
+ // This one is a normal one: its global index is inside our range or
+ // contiguously just before our range. Just do it.
+ if (positive_blob_index + starting_offset_ + 1 >=
+ global_block_index_start()) {
+ // It is either inside our block, or just after our block. That is all
+ // contiguous in memory and easy to calculate, so just return it.
+ if (positive_blob_index + starting_offset_ <
+ global_block_index_end() + 1) {
+ return storage_
+ ->tmp_filtered_errs[positive_blob_index + starting_offset_ + 1 -
+ global_block_index_start()];
+ }
+
+ // OK, this is past the back of our range, at the tail end of the last
+ // blob. Return the one error we've got there.
+ return storage_->tmp_filtered_err_after;
+ } else {
+ // We must be inside the before bits.
+ return storage_->tmp_filtered_err_before;
+ }
+ }
+
+ // Gets the line fit point from either our local cache, or from main memory if
+ // it is too far away.
+ __host__ __device__ LineFitPoint GetPoint(size_t global_index,
+ bool print = false) const {
+ if (global_index >= global_block_index_cache_end_ ||
+ global_index < global_block_index_cache_start_) {
+ if (print) {
+#ifdef DEBUG_BLOB_NUMBER
+ printf(
+ "Block %d Thread %d Loading global %d, relative %d from global\n",
+ blockIdx.x, threadIdx.x, (int)global_index,
+ (int)(global_index - starting_offset_));
+#endif
+ }
+ return line_fit_points_device_[global_index];
+ } else {
+ if (print) {
+#ifdef DEBUG_BLOB_NUMBER
+ printf(
+ "Block %d Thread %d Loading global %d, relative %d from cache\n",
+ blockIdx.x, threadIdx.x, (int)global_index,
+ (int)(global_index - starting_offset_));
+#endif
+ }
+ return storage_
+ ->tmp_storage[global_index - global_block_index_cache_start_];
+ }
+ }
+
+ // Returns the index within the current blob for the provided buffer index,
+ // where the buffer index is relative to the start of the range we are
+ // responsible for computing.
+ __forceinline__ __host__ __device__ size_t
+ BlobIndex(size_t buffer_index) const {
+ return global_block_index_start_ + buffer_index - starting_offset_;
+ }
+
+ // Calculates the line fit error centered on the provided blob index in the
+ // current extent.
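+ //
+ // LineFitPoint entries are running prefix sums over the blob, so the
+ // moments for the window [i0, i1] come from two or three loads:
+ // GetPoint(i1) minus GetPoint(i0 - 1), with the wrap-around case stitched
+ // together through the last point in the blob.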
+ __host__ __device__ double CalculateError(ssize_t blob_index,
+ bool print = false) const {
+ // Index into the blob list for the current key.
+ const size_t i0 = (blob_index + 2 * count_ - ksz_) % count_;
+ const size_t i1 = (blob_index + count_ + ksz_) % count_;
+
+ const size_t global_i0 = i0 + starting_offset_;
+ const size_t global_i1 = i1 + starting_offset_;
+
+ int32_t Mx, My, W;
+ int64_t Mxx, Myy, Mxy;
+ int N; // how many points are included in the set?
+
+ if (i0 < i1) {
+ N = i1 - i0 + 1;
+
+ LineFitPoint lf1 = GetPoint(global_i1);
+
+ Mx = lf1.Mx;
+ My = lf1.My;
+ Mxx = lf1.Mxx;
+ Mxy = lf1.Mxy;
+ Myy = lf1.Myy;
+ W = lf1.W;
+
+ if (i0 > 0) {
+ LineFitPoint lf0 = GetPoint(global_i0 - 1);
+
+ Mx -= lf0.Mx;
+ My -= lf0.My;
+ Mxx -= lf0.Mxx;
+ Mxy -= lf0.Mxy;
+ Myy -= lf0.Myy;
+ W -= lf0.W;
+ }
+ } else {
+ // i0 > i1, e.g. [15, 2]. Wrap around.
+ LineFitPoint lf0 = GetPoint(global_i0 - 1, print);
+ LineFitPoint lfsz = GetPoint(starting_offset_ + count_ - 1, print);
+
+ Mx = lfsz.Mx - lf0.Mx;
+ My = lfsz.My - lf0.My;
+ Mxx = lfsz.Mxx - lf0.Mxx;
+ Mxy = lfsz.Mxy - lf0.Mxy;
+ Myy = lfsz.Myy - lf0.Myy;
+ W = lfsz.W - lf0.W;
+
+ LineFitPoint lf1 = GetPoint(global_i1, print);
+
+ Mx += lf1.Mx;
+ My += lf1.My;
+ Mxx += lf1.Mxx;
+ Mxy += lf1.Mxy;
+ Myy += lf1.Myy;
+ W += lf1.W;
+
+ N = count_ - i0 + i1 + 1;
+ }
+
+ // And now fit it.
+ return FitLineError(N, Mx, My, Mxx, Myy, Mxy, W);
+ }
+
+ // Returns the starting global index of the region we are responsible for
+ // computing.
+ __host__ __device__ __forceinline__ uint32_t
+ global_block_index_start() const {
+ return global_block_index_start_;
+ }
+ // Returns the ending global index of the region we are responsible for
+ // computing.
+ __host__ __device__ __forceinline__ uint32_t global_block_index_end() const {
+ return global_block_index_end_;
+ }
+ // Returns the size of the region we are responsible for computing.
+ __host__ __device__ __forceinline__ uint32_t global_block_index_size() const {
+ return global_block_index_end_ - global_block_index_start_;
+ }
+
+ // Returns the starting index in global indices of the current blob ID.
+ __host__ __device__ __forceinline__ uint32_t
+ global_current_blob_starting_index() const {
+ return starting_offset_;
+ }
+
+ // Returns the number of points in the current blob ID.
+ __host__ __device__ __forceinline__ uint32_t
+ global_current_blob_count() const {
+ return count_;
+ }
+
+ // Returns the current blob ID.
+ __host__ __device__ __forceinline__ uint32_t blob_index() const {
+ return blob_index_;
+ }
+
+ private:
+ const LineFitPoint *line_fit_points_device_;
+ size_t points_;
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device_;
+
+ TempStorage *storage_;
+
+ // Start and end location of the LineFitPoint cache in tmp_storage_.
+ const uint32_t global_block_index_cache_start_ =
+ blockIdx.x == 0
+ ? 0
+ : blockIdx.x * (kPointsPerBlock - 2 * kErrorsBuffer) - kErrorsBuffer;
+ const uint32_t global_block_index_cache_end_ = std::min<uint32_t>(
+ global_block_index_cache_start_ + kPointsPerBlock, points_);
+
+ const uint32_t global_block_index_start_ =
+ blockIdx.x * (kPointsPerBlock - 2 * kErrorsBuffer);
+ const uint32_t global_block_index_end_ = std::min<uint32_t>(
+ global_block_index_start_ + kPointsPerBlock - 2 * kErrorsBuffer, points_);
+
+ size_t blob_index_ = 0xffffffff;
+ size_t starting_offset_ = 0;
+ size_t count_ = 0;
+ size_t ksz_ = 0;
+};
+
+template <size_t kThreads>
+__global__ void DoFitLines(
+ const LineFitPoint *line_fit_points_device, size_t points,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ double *errs_device, double *filtered_errs_device, Peak *peaks_device) {
+ __shared__ TempStorage storage;
+
+ ErrorCalculator<kThreads> calculator(line_fit_points_device, points,
+ selected_extents_device, &storage);
+
+ calculator.Load();
+
+ // TODO(austin): Whichever warp loads the first ksz blobs should load the
+ // prior ksz blobs. Figure out if things are too slow first. __syncwarp()
+ // and make the first warp do it?
+
+ __syncthreads();
+
+ // We need to compute a couple of errs_device past each end in shared memory.
+ // This needs to be (FilterCoefficients().size() - 1) + 2. The first term is
+ // the number of extra errors we need for the filter. The second is the extra
+ // number of errors needed to implement the peak finder (1 on each side).
+
+ // OK, the way this all works now: the error calculation work for the first
+ // kErrorsBuffer buffer indices is done by the first kErrorsBuffer threads,
+ // which fill tmp_errs_before and the head of tmp_errs, even if this wraps
+ // back into the blob.
+
+ // Now that everything is loaded, have each thread process its points by
+ // strides.
+
+ // TODO(austin): We need 4 reserved ranges. Proposal is:
+ // [0, kErrorsBuffer) -> beginning points of the extent of blob 0, ie
+ // starting at starting_offset_ globally.
+ //
+ // [kErrorsBuffer, 2 * kErrorsBuffer) -> 4 points immediately before the
+ // starting point.
+ //
+ // And then symmetrically off the back end too. This will mean we process 16
+ // fewer points per block. Whatever.
+
+ for (size_t i = threadIdx.x; i < kPointsPerBlock; i += blockDim.x) {
+ // OK, now we need to do the actual line fit.
+ if (i < kErrorsBuffer) {
+ // Start by extending the errors before the blob in our buffer space. All
+ // of this will be done in blob0's address space.
+ //
+ // In the first block, this will load the 8th element. But, since blobs
+ // are 24 or longer, this will be the 0th blob. For following blocks,
+ // this is actually the right index.
+ calculator.LoadExtents(0);
+
+ // First 4 threads do the first 4 elements in the first blob.
+ if (i < kBeforeBuffer) {
+ storage.tmp_errs_before[i] = calculator.CalculateError(i);
+ } else {
+ // Then, compute the 4 errors right before the buffer.
+ storage.tmp_errs[i - kBeforeBuffer] = calculator.CalculateError(
+ calculator.BlobIndex(i - kBeforeBuffer) - kBeforeBuffer);
+ }
+ } else if (i - kErrorsBuffer + calculator.global_block_index_start() <
+ calculator.global_block_index_end() + kBeforeBuffer) {
+ // Now we are solidly in the middle of the block. Make sure anything
+ // computed after the end of the buffer uses the extents of the end.
+ calculator.LoadExtents(std::min<uint32_t>(
+ calculator.global_block_index_size() - 1, i - kErrorsBuffer));
+ size_t target_index = calculator.BlobIndex(i - kErrorsBuffer);
+
+#ifdef DEBUG_BLOB_NUMBER
+ const bool print =
+ (calculator.blob_index() == DEBUG_BLOB_NUMBER &&
+ ((484 - 4 <= target_index && target_index <= 484 + 4)));
+#endif
+
+ storage.tmp_errs[i - kBeforeBuffer] =
+ calculator.CalculateError(target_index);
+#ifdef DEBUG_BLOB_NUMBER
+ if (print) {
+ printf("Block %d Thread %d (idx %d) Err: %f\n", blockIdx.x,
+ threadIdx.x, (int)target_index,
+ calculator.CalculateError(target_index, true));
+ }
+#endif
+ } else {
+ // Past the end of the normal calcs, do the end buffering.
+ if (i < kPointsPerBlock - kBeforeBuffer) {
+ continue;
+ }
+
+ // We are just supposed to continue with the extents of the last index
+ // and keep going.
+ calculator.LoadExtents(calculator.global_block_index_size() - 1);
+
+ // Wrap around to the last kBeforeBuffer points of the last blob.
+ storage.tmp_errs_after[i - (kPointsPerBlock - kBeforeBuffer)] =
+ calculator.CalculateError(calculator.global_current_blob_count() -
+ kBeforeBuffer +
+ (i - (kPointsPerBlock - kBeforeBuffer)));
+ }
+ }
+
+ __syncthreads();
+
+#ifdef DEBUG_BLOB_NUMBER
+ /*
+ if (threadIdx.x == 0) {
+ for (int i = (int)calculator.global_block_index_cache_start_;
+ i < (int)calculator.global_block_index_cache_end_; ++ i) {
+ auto x = calculator.GetPoint(i);
+ if (x.blob_index == DEBUG_BLOB_NUMBER) {
+ printf("Block %d Thread %d Loading global %d, relative %d, Mx: %f\n",
+ blockIdx.x, threadIdx.x, i,
+ (int)(i - calculator.selected_extents_device_[x.blob_index]
+ .value.starting_offset),
+ x.Mx / 2.0);
+ }
+ }
+ }
+
+ __syncthreads();
+ */
+#endif
+
+ // We now have all the errors loaded! Box filter them.
+ for (size_t i = threadIdx.x; i < kPointsPerBlock; i += blockDim.x) {
+ // The peak finder needs 1 more filtered error in each direction.
+ ssize_t target_index;
+ double *destination;
+ if (i < kErrorsBuffer) {
+ if (i < kErrorsBuffer - 2) {
+ continue;
+ } else {
+ calculator.LoadExtents(0);
+ if (i < kErrorsBuffer - 1) {
+ // Target index is now 0.
+ target_index = 0;
+ destination = &storage.tmp_filtered_err_before;
+ } else {
+ target_index = calculator.BlobIndex(0) - 1;
+ destination = &storage.tmp_filtered_errs[i + 1 - kErrorsBuffer];
+ }
+ }
+ } else if (i - kErrorsBuffer + calculator.global_block_index_start() >=
+ calculator.global_block_index_end()) {
+ if (i - kErrorsBuffer + calculator.global_block_index_start() >=
+ calculator.global_block_index_end() + 2) {
+ // Past 1 past the end for the peak finder.
+ break;
+ }
+ calculator.LoadExtents(calculator.global_block_index_end() -
+ calculator.global_block_index_start() - 1);
+
+ if (i - kErrorsBuffer + calculator.global_block_index_start() >=
+ calculator.global_block_index_end() + 1) {
+ destination = &storage.tmp_filtered_err_after;
+ target_index = calculator.global_current_blob_count() - 1;
+ } else {
+ destination = &storage.tmp_filtered_errs[i + 1 - kErrorsBuffer];
+ target_index = calculator.BlobIndex(i - kErrorsBuffer);
+ }
+ } else {
+ calculator.LoadExtents(i - kErrorsBuffer);
+ target_index = calculator.BlobIndex(i - kErrorsBuffer);
+ destination = &storage.tmp_filtered_errs[i + 1 - kErrorsBuffer];
+ }
+
+ double accumulated_error = 0.0;
+
+#ifdef DEBUG_BLOB_NUMBER
+ const bool print = (calculator.blob_index() == DEBUG_BLOB_NUMBER &&
+ ((484 - 4 <= target_index && target_index <= 484 + 4)));
+#endif
+
+#pragma unroll
+ for (size_t j = 0; j < FilterCoefficients().size(); ++j) {
+ const double e = calculator.LoadError(target_index + j -
+ FilterCoefficients().size() / 2);
+ accumulated_error += e * FilterCoefficients()[j];
+#ifdef DEBUG_BLOB_NUMBER
+ if (print) {
+ printf("Block %d Thread %d (idx %d) + %f * %f (%f) -> %f\n",
+ blockIdx.x, threadIdx.x, (int)target_index, e,
+ FilterCoefficients()[j], e * FilterCoefficients()[j],
+ accumulated_error);
+ }
+#endif
+ }
+ *destination = accumulated_error;
+
+#ifdef DEBUG_BLOB_NUMBER
+ if (print) {
+ printf(
+ "Block %d Thread %d (idx %d) Blob %d of size %d, index %d "
+ "filtered_error = "
+ "%f, error = %f\n",
+ blockIdx.x, threadIdx.x, (int)target_index,
+ (int)calculator.blob_index(),
+ (int)calculator.global_current_blob_count(), (int)target_index,
+ accumulated_error, calculator.LoadError(target_index));
+ }
+#endif
+ }
+
+ __syncthreads();
+ // OK, errors are now saved to shared memory. Compute peaks and push the
+ // results out to global memory.
+
+ // TODO(austin): Don't save out the unfiltered errors if we aren't
+ // testing/debugging...
+ for (size_t i = threadIdx.x; i < kPointsPerBlock; i += blockDim.x) {
+ if (i < kErrorsBuffer) {
+ continue;
+ }
+
+ if (i - kErrorsBuffer + calculator.global_block_index_start() >=
+ calculator.global_block_index_end()) {
+ break;
+ }
+
+ errs_device[i - kErrorsBuffer + calculator.global_block_index_start()] =
+ storage.tmp_errs[i - kBeforeBuffer];
+ }
+
+ for (size_t i = threadIdx.x; i < kPointsPerBlock; i += blockDim.x) {
+ if (i < kErrorsBuffer) {
+ continue;
+ }
+
+ if (i - kErrorsBuffer + calculator.global_block_index_start() >=
+ calculator.global_block_index_end()) {
+ break;
+ }
+
+ calculator.LoadExtents(i - kErrorsBuffer);
+
+ const double before_error =
+ calculator.LoadFilteredError(i - kErrorsBuffer - 1);
+ const double my_error = calculator.LoadFilteredError(i - kErrorsBuffer);
+ const double after_error =
+ calculator.LoadFilteredError(i - kErrorsBuffer + 1);
+
+ // This gets us ready to sort.
+ filtered_errs_device[i - kErrorsBuffer +
+ calculator.global_block_index_start()] = my_error;
+
+ const bool is_peak = my_error > before_error && my_error > after_error;
+
+ Peak peak;
+ peak.error = -my_error;
+ peak.blob_index = is_peak ? calculator.blob_index() : Peak::kNoPeak();
+ peak.filtered_point_index =
+ i - kErrorsBuffer + calculator.global_block_index_start();
+
+ peaks_device[peak.filtered_point_index] = peak;
+ }
+}
+
+void FitLines(
+ const LineFitPoint *line_fit_points_device, size_t points,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ size_t num_extents, double *errs_device, double *filtered_errs_device,
+ Peak *peaks_device, CudaStream *stream) {
+ constexpr size_t kThreads = 128;
+ const size_t kBlocks = (points + kPointsPerBlock - 2 * kErrorsBuffer - 1) /
+ (kPointsPerBlock - 2 * kErrorsBuffer);
+
+ VLOG(1) << "Spawning with " << kThreads << " threads, and " << kBlocks
+ << " blocks for " << num_extents << " blob_ids and " << points
+ << " points";
+ DoFitLines<kThreads><<<kBlocks, kThreads, 0, stream->get()>>>(
+ line_fit_points_device, points, selected_extents_device, errs_device,
+ filtered_errs_device, peaks_device);
+}
+
+namespace {
+
+// Returns n choose k as a constexpr.
+constexpr uint Binomial(uint n, uint k) {
+ if (n < k) {
+ return 0;
+ }
+
+ uint m = std::min(k, n - k);
+ uint result = 1;
+
+ for (uint i = 0; i < m; ++i) {
+ result *= (n - i);
+ result /= (i + 1);
+ }
+
+ return result;
+}
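+
+// For kNMaxima = 10 candidate corners there are Binomial(10, 4) = 210
+// possible sorted 4-subsets, so Unrank() below maps a flat index in
+// [0, MaxRankedIndex()) back to its (m0, m1, m2, m3) combination.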
+
+constexpr int kNMaxima = 10;
+
+// Returns the sum of binomials for CumulativeSumsM0.
+template <size_t nmaxima, size_t N>
+constexpr uint SumBinomial(uint i, uint start = 0) {
+ size_t result = 0;
+ for (uint m0 = start; m0 <= i; ++m0) {
+ result += Binomial(nmaxima - m0 - 1, N);
+ }
+ return result;
+}
+
+// Returns the cumulative number of indices consumed by incrementing each m0
+// index.
+constexpr std::array<uint, 7> CumulativeSumsM0() {
+ constexpr size_t nmaxima = kNMaxima;
+ return {
+ SumBinomial<nmaxima, 3>(0, 0), SumBinomial<nmaxima, 3>(1, 0),
+ SumBinomial<nmaxima, 3>(2, 0), SumBinomial<nmaxima, 3>(3, 0),
+ SumBinomial<nmaxima, 3>(4, 0), SumBinomial<nmaxima, 3>(5, 0),
+ SumBinomial<nmaxima, 3>(6, 0),
+ };
+}
+
+// Returns the number of indices consumed by each m1 index. [0] is how many
+// you'd consume if m0 and m1 were both as small as they could be. To handle m0
+// > 0, increment your index.
+constexpr std::array<uint, 7> BinomialM1() {
+ constexpr size_t nmaxima = kNMaxima;
+ return {
+ Binomial(nmaxima - 2, 2), Binomial(nmaxima - 3, 2),
+ Binomial(nmaxima - 4, 2), Binomial(nmaxima - 5, 2),
+ Binomial(nmaxima - 6, 2), Binomial(nmaxima - 7, 2),
+ Binomial(nmaxima - 8, 2),
+ };
+}
+
+// Returns the m0 for the provided global index.
+__host__ __device__ const cuda::std::pair<uint, uint> FindM0(uint i) {
+ uint m0 = 0;
+ uint last = 0;
+ while (true) {
+ uint next = CumulativeSumsM0()[m0];
+ if (i < next) {
+ return cuda::std::make_pair(m0, i - last);
+ }
+ last = next;
+ ++m0;
+ }
+}
+
+// Returns the m1 for the provided m0 and global index remainder.
+__host__ __device__ const cuda::std::pair<uint, uint> FindM1(uint m0, uint i) {
+ uint m1 = m0;
+ while (true) {
+ uint next = BinomialM1()[m1];
+ if (i < next || m1 == kNMaxima - 4) {
+ return cuda::std::make_pair(m1, i);
+ }
+ i -= next;
+ ++m1;
+ }
+}
+
+// Returns the m2 for the provided m1 and global index remainder.
+__host__ __device__ const cuda::std::pair<uint, uint> FindM2(uint m1, uint i) {
+ uint m2 = m1;
+ while (true) {
+ uint next = kNMaxima - m2 - 1 - 1;
+ if (i < next || m2 == kNMaxima - 3) {
+ return cuda::std::make_pair(m2, i);
+ }
+ i -= next;
+ ++m2;
+ }
+}
+
+} // namespace
+
+__host__ __device__ std::tuple<uint, uint, uint, uint> Unrank(uint i) {
+ uint m0 = 0;
+ uint m1;
+ uint m2;
+ uint m3;
+
+ auto result0 = FindM0(i);
+ m0 = result0.first;
+ i = result0.second;
+
+ auto result1 = FindM1(m0, result0.second);
+ m1 = result1.first + 1;
+
+ i = result1.second;
+ auto result2 = FindM2(m1, result1.second);
+ m2 = result2.first + 1;
+
+ m3 = m2 + result2.second + 1;
+ return std::make_tuple(m0, m1, m2, m3);
+}
+
+// TODO(austin): Convert this into a constant lookup.
+__device__ __host__ cuda::std::pair<uint, uint> GetM0M1(uint tid) {
+ constexpr int nmaxima = kNMaxima;
+ uint count = 0;
+ for (uint m0 = 0; m0 < nmaxima - 3; m0++) {
+ for (uint m1 = m0 + 1; m1 < nmaxima - 2; m1++) {
+ if (count == tid) {
+ return cuda::std::make_pair(m0, m1);
+ }
+ ++count;
+ }
+ }
+ return cuda::std::make_pair(0, 0);
+}
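+
+// GetM0M1 linearizes the Binomial(8, 2) = 28 valid (m0, m1) pairs
+// (with m0 < m1 < kNMaxima - 2) so a flat thread id maps directly to a
+// candidate corner pair.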
+
+__device__ __forceinline__ LineFitMoments
+ReadMoments(const LineFitPoint *line_fit_points_device, size_t blob_point_count,
+ size_t index0, size_t index1) {
+ LineFitMoments result;
+
+ if (index0 < index1) {
+ result.N = index1 - index0 + 1;
+
+ LineFitPoint lf1 = line_fit_points_device[index1];
+
+ result.Mx = lf1.Mx;
+ result.My = lf1.My;
+ result.Mxx = lf1.Mxx;
+ result.Mxy = lf1.Mxy;
+ result.Myy = lf1.Myy;
+ result.W = lf1.W;
+
+ if (index0 > 0) {
+ LineFitPoint lf0 = line_fit_points_device[index0 - 1];
+
+ result.Mx -= lf0.Mx;
+ result.My -= lf0.My;
+ result.Mxx -= lf0.Mxx;
+ result.Mxy -= lf0.Mxy;
+ result.Myy -= lf0.Myy;
+ result.W -= lf0.W;
+ }
+ } else {
+ // index0 > index1, e.g. [15, 2]. Wrap around.
+ LineFitPoint lf0 = line_fit_points_device[index0 - 1];
+ LineFitPoint lfsz = line_fit_points_device[blob_point_count - 1];
+
+ result.Mx = lfsz.Mx - lf0.Mx;
+ result.My = lfsz.My - lf0.My;
+ result.Mxx = lfsz.Mxx - lf0.Mxx;
+ result.Mxy = lfsz.Mxy - lf0.Mxy;
+ result.Myy = lfsz.Myy - lf0.Myy;
+ result.W = lfsz.W - lf0.W;
+
+ LineFitPoint lf1 = line_fit_points_device[index1];
+
+ result.Mx += lf1.Mx;
+ result.My += lf1.My;
+ result.Mxx += lf1.Mxx;
+ result.Mxy += lf1.Mxy;
+ result.Myy += lf1.Myy;
+ result.W += lf1.W;
+
+ result.N = blob_point_count - index0 + index1 + 1;
+ }
+ return result;
+}
+
+__device__ void FitLine(LineFitMoments moments, double *lineparam01,
+ double *lineparam23, double *err, double *mse,
+ bool print = false) {
+ if (print) {
+ printf(
+ "Block %d Thread %d Mx: %.4f, My: %.4f, Mxx: %.4f, Mxy: %.4f, Myy: "
+ "%.4f, W: %d, N: %d\n",
+ blockIdx.x, threadIdx.x, moments.Mx / 2.0, moments.My / 2.0,
+ moments.Mxx / 4.0, moments.Mxy / 4.0, moments.Myy / 4.0, moments.W,
+ moments.N);
+ }
+ int64_t Cxx = moments.Mxx * moments.W - static_cast<int64_t>(moments.Mx) *
+ static_cast<int64_t>(moments.Mx);
+ int64_t Cxy = moments.Mxy * moments.W - static_cast<int64_t>(moments.Mx) *
+ static_cast<int64_t>(moments.My);
+ int64_t Cyy = moments.Myy * moments.W - static_cast<int64_t>(moments.My) *
+ static_cast<int64_t>(moments.My);
+
+ // Pose it as an eigenvalue problem.
+ //
+ // TODO(austin): Are floats good enough? Hard to tell what the "right answer"
+ // actually is...
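+ //
+ // The smaller eigenvalue of [[Cxx, Cxy], [Cxy, Cyy]] is
+ // ((Cxx + Cyy) - hypot(Cxx - Cyy, 2 * Cxy)) / 2. Our fixed point moments
+ // carry an extra factor of 4 W^2 relative to the true covariance, so the
+ // combined divisor works out to 8 W^2.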
+ const float hypot_cached = std::hypotf((Cxx - Cyy), 2 * Cxy);
+ const float eight_w_squared = static_cast<float>(
+ static_cast<int64_t>(moments.W) * static_cast<int64_t>(moments.W) * 8.0);
+ const float eig_small = (Cxx + Cyy - hypot_cached) / eight_w_squared;
+ if (print) {
+ printf("Block %d Thread %d eig_small: (%ld + %ld - %f) / %f -> %f\n",
+ blockIdx.x, threadIdx.x, Cxx, Cyy, hypot_cached, eight_w_squared,
+ eig_small);
+ }
+
+ if (lineparam01) {
+ lineparam01[0] =
+ static_cast<float>(moments.Mx) / static_cast<float>(moments.W * 2);
+ lineparam01[1] =
+ static_cast<float>(moments.My) / static_cast<float>(moments.W * 2);
+ }
+ if (lineparam23) {
+ // These don't match the aprilrobotics originals, but the math comes out
+ // the same. n{xy}{12} end up multiplied by 8 W^2; we compare the squared
+ // magnitudes but use hypot on nx, ny directly, and the common W^2 scale
+ // factor cancels in the final normalization.
+ const float nx1 = static_cast<float>(Cxx - Cyy) - hypot_cached;
+ const float ny1 = 2 * Cxy;
+ const float M1 = nx1 * nx1 + ny1 * ny1;
+ const float nx2 = 2 * Cxy;
+ const float ny2 = static_cast<float>(Cyy - Cxx) - hypot_cached;
+ const float M2 = nx2 * nx2 + ny2 * ny2;
+
+ float nx, ny;
+ if (M1 > M2) {
+ nx = nx1;
+ ny = ny1;
+ } else {
+ nx = nx2;
+ ny = ny2;
+ }
+
+ float length = std::hypotf(nx, ny);
+ lineparam23[0] = nx / length;
+ lineparam23[1] = ny / length;
+ }
+
+ // sum of squared errors
+ *err = moments.N * eig_small;
+
+ if (print) {
+ printf("Block %d Thread %d err: %f\n", blockIdx.x, threadIdx.x,
+ moments.N * eig_small);
+ }
+
+ // mean squared error
+ *mse = eig_small;
+}
+
+__device__ __forceinline__ void FitLine(
+ const LineFitPoint *line_fit_points_device, size_t blob_point_count,
+ size_t index0, size_t index1, double *lineparam01, double *lineparam23,
+ double *err, double *mse, bool print = false) {
+ LineFitMoments moments =
+ ReadMoments(line_fit_points_device, blob_point_count, index0, index1);
+ FitLine(moments, lineparam01, lineparam23, err, mse, print);
+}
+
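+// Per-block shared memory scratch for the quad fit: the cached (m0, m1) line
+// fit errors and line parameters, plus the maxima indices sorted back into
+// point order.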
+struct QuadFitStorage {
+ double errorm0m1[kNMaxima - 3][kNMaxima - 3];
+ double lineparams23m0m1[kNMaxima - 3][kNMaxima - 3][2];
+ uint16_t point_indices[16];
+};
+
+class QuadFitCalculator {
+ public:
+ __host__ __device__ QuadFitCalculator(
+ const Peak *peaks_device, const PeakExtents *peak_extents,
+ const LineFitPoint *line_fit_points_device,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ float max_line_fit_mse, double max_dot, QuadFitStorage *storage)
+ : peaks_device_(peaks_device),
+ extents_(peak_extents[blockIdx.x]),
+ selected_extent_(selected_extents_device[extents_.blob_index].value),
+ line_fit_points_device_(line_fit_points_device +
+ selected_extent_.starting_offset),
+ max_line_fit_mse_(max_line_fit_mse),
+ max_dot_(max_dot),
+ storage_(storage) {}
+
+ __host__ __device__ int RawIndexFromMaxima(int m) const {
+ return peaks_device_[m + extents_.starting_offset].filtered_point_index -
+ selected_extent_.starting_offset;
+ }
+
+ __host__ __device__ uint32_t IndexFromMaxima(int m) const {
+ return storage_->point_indices[m];
+ }
+
+ __device__ __forceinline__ uint32_t sz() const {
+ return selected_extent_.count;
+ }
+
+ __device__ __forceinline__ void FitLine(size_t index0, size_t index1,
+ double *lineparam01,
+ double *lineparam23, double *err,
+ double *mse,
+ bool print = false) const {
+ if (print) {
+ printf("Block %d Thread %d Fitting line %d, %d\n", blockIdx.x,
+ threadIdx.x, (int)index0, (int)index1);
+ }
+ ::frc971::apriltag::FitLine(line_fit_points_device_, selected_extent_.count,
+ index0, index1, lineparam01, lineparam23, err,
+ mse, print);
+ }
+
+ __device__ __forceinline__ LineFitMoments ReadMoments(size_t index0,
+ size_t index1) {
+ return ::frc971::apriltag::ReadMoments(
+ line_fit_points_device_, selected_extent_.count, index0, index1);
+ }
+
+ __host__ __device__ void ComputeM0M1Fit() {
+ if (extents_.count < 4) {
+ return;
+ }
+
+ cuda::std::pair<uint, uint> m0m1 = GetM0M1(threadIdx.x);
+ const uint m0 = m0m1.first;
+ const uint m1 = m0m1.second;
+ if (m0 >= m1) {
+ return;
+ }
+
+ // The first 28 threads compute the 28 (m0, m1) pair line fits.
+ // That lets us cache them in shared memory. Then, all the threads
+ // calculate the full depth of the tree.
+ //
+ // If we don't have the full kNMaxima maxima, skip any indices which are
+ // past the end.
+ if (m1 < kNMaxima && m1 < extents_.count) {
+ const uint i0 = IndexFromMaxima(m0);
+ const uint i1 = IndexFromMaxima(m1);
+
+ double err01;
+ double mse01;
+
+ FitLine(i0, i1, nullptr, storage_->lineparams23m0m1[m0][m1 - 1], &err01,
+ &mse01);
+
+ if (mse01 > max_line_fit_mse_) {
+ err01 = std::numeric_limits<double>::max();
+ }
+
+ storage_->errorm0m1[m0][m1 - 1] = err01;
+ } else {
+ storage_->errorm0m1[m0][m1 - 1] = std::numeric_limits<double>::max();
+ }
+ }
+
+ __host__ __device__ int blob_index() const { return extents_.blob_index; }
+
+ __host__ __device__ double FitLines(uint m0, uint m1, uint m2,
+ uint m3) const {
+ const bool print =
+#ifdef DEBUG_BLOB_NUMBER
+ (blob_index() == DEBUG_BLOB_NUMBER &&
+ ( //(m0 == 5 && m1 == 6 && m2 == 7 && m3 == 8) ||
+ (m0 == 0 && m1 == 4 && m2 == 8 && m3 == 9)));
+#else
+ false;
+#endif
+ double err = std::numeric_limits<double>::max();
+
+ if (extents_.count < 4) {
+ return err;
+ }
+
+ if (m3 >= kNMaxima || m3 >= extents_.count) {
+ return err;
+ }
+
+ const double errm0m1 = storage_->errorm0m1[m0][m1 - 1];
+ if (errm0m1 == std::numeric_limits<double>::max()) {
+ return err;
+ }
+
+ const double *paramsm0m123 = storage_->lineparams23m0m1[m0][m1 - 1];
+
+ double errm1m2;
+ double msem1m2;
+ double paramsm1m223[2];
+
+ const int i1 = IndexFromMaxima(m1);
+ const int i2 = IndexFromMaxima(m2);
+ FitLine(i1, i2, nullptr, paramsm1m223, &errm1m2, &msem1m2, print);
+ if (msem1m2 > max_line_fit_mse_) {
+ return std::numeric_limits<double>::max();
+ }
+
+ const double dot =
+ paramsm0m123[0] * paramsm1m223[0] + paramsm0m123[1] * paramsm1m223[1];
+ if (fabs(dot) > max_dot_) {
+ return std::numeric_limits<double>::max();
+ }
+
+ const int i3 = IndexFromMaxima(m3);
+
+ double errm2m3;
+ double msem2m3;
+
+ FitLine(i2, i3, nullptr, nullptr, &errm2m3, &msem2m3, print);
+ if (msem2m3 > max_line_fit_mse_) {
+ return std::numeric_limits<double>::max();
+ }
+
+ const int i0 = IndexFromMaxima(m0);
+ double errm3m0;
+ double msem3m0;
+ FitLine(i3, i0, nullptr, nullptr, &errm3m0, &msem3m0, print);
+ if (msem3m0 > max_line_fit_mse_) {
+ return std::numeric_limits<double>::max();
+ }
+
+ if (print) {
+ printf(
+ "Block %d Thread %d %d %d %d %d: errm0m1 %f errm1m2 %f errm2m3 %f "
+ "errm3m0 %f\n",
+ blockIdx.x, threadIdx.x, i0, i1, i2, i3, errm0m1, errm1m2, errm2m3,
+ errm3m0);
+ }
+
+ return errm0m1 + errm1m2 + errm2m3 + errm3m0;
+ }
+
+ __host__ __device__ size_t peaks_count() const { return extents_.count; }
+
+ private:
+ const Peak *peaks_device_;
+ const PeakExtents extents_;
+ const MinMaxExtents selected_extent_;
+
+ const LineFitPoint *line_fit_points_device_;
+ const float max_line_fit_mse_;
+ const double max_dot_;
+ QuadFitStorage *storage_;
+};
+
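+// A candidate quad hypothesis: the four maxima indices and the summed line
+// fit error.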
+struct QuadError {
+ uint8_t m0;
+ uint8_t m1;
+ uint8_t m2;
+ uint8_t m3;
+ double error;
+};
+
+struct MinQuadError {
+ __host__ __device__ QuadError operator()(const QuadError &a,
+ const QuadError &b) {
+ if (a.error <= b.error) {
+ return a;
+ } else {
+ return b;
+ }
+ }
+};
+
+struct CustomLess {
+ __device__ bool operator()(const uint16_t lhs, const uint16_t rhs) {
+ return lhs < rhs;
+ }
+};
+
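+// Fits quads to blobs, one block per candidate blob. Each thread evaluates
+// the ranked (m0, m1, m2, m3) combination for its index, and a block reduce
+// then picks the lowest error quad.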
+__global__ void DoFitQuads(
+ const Peak *peaks_device, const PeakExtents *peak_extents,
+ const LineFitPoint *line_fit_points_device,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ float max_line_fit_mse, double cos_critical_rad,
+ FitQuad *fit_quads_device) {
+ __shared__ QuadFitStorage storage;
+ // Specialize BlockReduce for a 1D block of MaxRankedIndex() threads of
+ // QuadError.
+ typedef cub::BlockReduce<QuadError, MaxRankedIndex()> BlockReduce;
+ // Allocate shared memory for BlockReduce
+ __shared__ typename BlockReduce::TempStorage temp_storage_reduce;
+
+ QuadFitCalculator calculator(peaks_device, peak_extents,
+ line_fit_points_device, selected_extents_device,
+ max_line_fit_mse, cos_critical_rad, &storage);
+
+ // Step 1: sort the maxima back into point index order.
+ constexpr size_t kItemsPerThread = 1;
+ constexpr size_t kItems = 16;
+ using WarpMergeSortT = cub::WarpMergeSort<uint16_t, kItemsPerThread, kItems>;
+
+ if (threadIdx.x < kItems) {
+ __shared__ typename WarpMergeSortT::TempStorage temp_storage_merge;
+ uint16_t index[1];
+ if (threadIdx.x >= calculator.peaks_count() || threadIdx.x >= kNMaxima) {
+ index[0] = std::numeric_limits<uint16_t>::max();
+ } else {
+ index[0] = calculator.RawIndexFromMaxima(threadIdx.x);
+ }
+ WarpMergeSortT(temp_storage_merge).Sort(index, CustomLess());
+ storage.point_indices[threadIdx.x] = index[0];
+ }
+
+ __syncthreads();
+
+#ifdef DEBUG_BLOB_NUMBER
+ if (threadIdx.x == 0) {
+ if (calculator.blob_index() == DEBUG_BLOB_NUMBER) {
+ for (size_t i = 0; i < std::min(calculator.peaks_count(), kItems); ++i) {
+ printf(" %d: %d, now %d\n", (int)i, calculator.RawIndexFromMaxima(i),
+ storage.point_indices[i]);
+ }
+ }
+ }
+#endif
+
+ // TODO(austin): Where do we handle blobs with less than 4 extents?
+ calculator.ComputeM0M1Fit();
+
+ __syncthreads();
+
+ const std::tuple<uint, uint, uint, uint> ms = Unrank(threadIdx.x);
+ QuadError quad_error;
+ quad_error.m0 = std::get<0>(ms);
+ quad_error.m1 = std::get<1>(ms);
+ quad_error.m2 = std::get<2>(ms);
+ quad_error.m3 = std::get<3>(ms);
+ quad_error.error = calculator.FitLines(quad_error.m0, quad_error.m1,
+ quad_error.m2, quad_error.m3);
+
+#ifdef DEBUG_BLOB_NUMBER
+ if (calculator.blob_index() == DEBUG_BLOB_NUMBER) {
+ if (quad_error.error < 0.0) {
+ printf("blob index %d maxima: %d %d %d %d, error negative of: %f\n",
+ (int)calculator.blob_index(), (int)quad_error.m0,
+ (int)quad_error.m1, (int)quad_error.m2, (int)quad_error.m3,
+ quad_error.error);
+ }
+ }
+#endif
+
+ // Each thread contributes its quad error; compute the block-wide minimum,
+ // which is only valid on thread 0.
+ QuadError min_error =
+ BlockReduce(temp_storage_reduce).Reduce(quad_error, MinQuadError());
+
+ if (threadIdx.x == 0) {
+ const bool valid = min_error.error < max_line_fit_mse * calculator.sz();
+ fit_quads_device[blockIdx.x].valid = valid;
+ fit_quads_device[blockIdx.x].blob_index = calculator.blob_index();
+ uint32_t i0 = calculator.IndexFromMaxima(min_error.m0);
+ uint32_t i1 = calculator.IndexFromMaxima(min_error.m1);
+ uint32_t i2 = calculator.IndexFromMaxima(min_error.m2);
+ uint32_t i3 = calculator.IndexFromMaxima(min_error.m3);
+#ifdef DEBUG_BLOB_NUMBER
+ if (calculator.blob_index() == DEBUG_BLOB_NUMBER) {
+ printf("blob index %d maxima: %d\n", (int)calculator.blob_index(),
+ (int)calculator.peaks_count());
+ for (size_t i = 0; i < std::min(kItems, calculator.peaks_count()); ++i) {
+ printf(" %d: %d\n", (int)i, calculator.IndexFromMaxima(i));
+ }
+ printf("Best fit is %d(%d) %d(%d) %d(%d) %d(%d) -> %f, valid? %d\n", i0,
+ (int)min_error.m0, i1, (int)min_error.m1, i2, (int)min_error.m2,
+ i3, (int)min_error.m3, min_error.error, valid);
+ }
+#endif
+ fit_quads_device[blockIdx.x].indices[0] = i0;
+ fit_quads_device[blockIdx.x].indices[1] = i1;
+ fit_quads_device[blockIdx.x].indices[2] = i2;
+ fit_quads_device[blockIdx.x].indices[3] = i3;
+ fit_quads_device[blockIdx.x].moments[0] = calculator.ReadMoments(i0, i1);
+ fit_quads_device[blockIdx.x].moments[1] = calculator.ReadMoments(i1, i2);
+ fit_quads_device[blockIdx.x].moments[2] = calculator.ReadMoments(i2, i3);
+ fit_quads_device[blockIdx.x].moments[3] = calculator.ReadMoments(i3, i0);
+ }
+}
+
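+// Launches DoFitQuads with one block per blob and MaxRankedIndex() threads,
+// one per ranked (m0, m1, m2, m3) combination.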
+void FitQuads(
+ const Peak *peaks_device, size_t /*peaks*/, const PeakExtents *peak_extents,
+ size_t num_extents, const LineFitPoint *line_fit_points_device, int nmaxima,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ float max_line_fit_mse, double cos_critical_rad, FitQuad *fit_quads_device,
+ CudaStream *stream) {
+ constexpr size_t kThreads = MaxRankedIndex();
+ const size_t kBlocks = num_extents;
+ VLOG(1) << "Spawning with " << kThreads << " threads, and " << kBlocks
+ << " blocks for " << num_extents << " blob_ids";
+ CHECK_EQ(nmaxima, kNMaxima)
+ << ": Kernel is compiled and optimized for a fixed nmaxima, please "
+ "recompile if you want to change it.";
+ DoFitQuads<<<kBlocks, kThreads, 0, stream->get()>>>(
+ peaks_device, peak_extents, line_fit_points_device,
+ selected_extents_device, max_line_fit_mse, cos_critical_rad,
+ fit_quads_device);
+}
+
+std::ostream &operator<<(std::ostream &os,
+ const frc971::apriltag::LineFitMoments &moments) {
+ os << "{Mx:" << std::setprecision(20) << moments.Mx / 2.
+ << ", My:" << std::setprecision(20) << moments.My / 2.
+ << ", Mxx:" << std::setprecision(20) << moments.Mxx / 4.
+ << ", Mxy:" << std::setprecision(20) << moments.Mxy / 4.
+ << ", Myy:" << std::setprecision(20) << moments.Myy / 4.
+ << ", W:" << std::setprecision(20) << moments.W << ", N:" << moments.N
+ << "}";
+ return os;
+}
+
+} // namespace apriltag
+} // namespace frc971
diff --git a/frc971/orin/line_fit_filter.h b/frc971/orin/line_fit_filter.h
new file mode 100644
index 0000000..0c37f0b
--- /dev/null
+++ b/frc971/orin/line_fit_filter.h
@@ -0,0 +1,166 @@
+#ifndef FRC971_ORIN_LINE_FIT_FILTER_H_
+#define FRC971_ORIN_LINE_FIT_FILTER_H_
+
+#include <cub/iterator/transform_input_iterator.cuh>
+#include <cuda/std/tuple>
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "frc971/orin/cuda.h"
+
+namespace frc971 {
+namespace apriltag {
+
+// Class to hold the extents of a blob of points.
+struct MinMaxExtents {
+ // Min and max coordinates (in non-decimated coordinates)
+ uint16_t min_x;
+ uint16_t min_y;
+ uint16_t max_x;
+ uint16_t max_y;
+
+ // The starting offset of this blob of points in the vector holding all the
+ // points.
+ uint32_t starting_offset;
+
+ // The number of points in the blob.
+ uint32_t count;
+
+ // The dot product is:
+ // dot = sum( (px - cx) * gx) + (py - cy) * gy )
+ //
+ // We can split this up into:
+ // dot = sum(px * gx + py * gy) - cx * sum(gx) - cy * sum(gy)
+ //
+ // Which can be calculated without knowing the center.
+ //
+ // Since p and g are all integers, we can sum them into integers too so we
+ // don't lose precision.
+ int32_t gx_sum;
+ int32_t gy_sum;
+
+ int64_t pxgx_plus_pygy_sum;
+
+ // Center location of the blob using the aprilrobotics algorithm.
+ __host__ __device__ double cx() const {
+ return (min_x + max_x) * 0.5f + 0.05118;
+ }
+ __host__ __device__ double cy() const {
+ return (min_y + max_y) * 0.5f - 0.028581;
+ }
+
+ __host__ __device__ float dot() const {
+ return static_cast<double>(pxgx_plus_pygy_sum * 2 -
+ (min_x + max_x) * gx_sum -
+ (min_y + max_y) * gy_sum) *
+ 0.5 -
+ 0.05118 * static_cast<double>(gx_sum) +
+ 0.028581 * static_cast<double>(gy_sum);
+ }
+};
+
+__align__(16) struct LineFitPoint {
+ // TODO(austin): How much precision do we actually need? The less, the
+ // faster... The less memory too, the faster.
+ //
+ // Is it the doubles or the memory which makes us slow? Could probably bit
+ // pack ints in here...
+ int64_t Mxx;
+ int64_t Myy;
+ int64_t Mxy;
+ // TODO(austin): These both fit in 4 byte numbers :)
+ // A blob has at most 2 * (width + height) points -> 13 bits.
+ // The weight can be at most 8 bits (probably 9 because it has a sign).
+ // A point has 11 bits of position data in it.
+ //
+ // To do this with sub bit precision:
+ // 2 * (1088 + 1456) * 1456 * 255 -> 0x7098f600
+ //
+ // Which is < 31 bits, so we can hold a sign bit too!
+ int32_t Mx;
+ int32_t My;
+ int32_t W;
+ uint32_t blob_index;
+};
+
+struct LineFitMoments {
+ // See LineFitPoint for more info.
+ int32_t Mx;
+ int32_t My;
+ int32_t W;
+ int64_t Mxx;
+ int64_t Myy;
+ int64_t Mxy;
+ int N; // how many points are included in the set?
+};
+
+std::ostream &operator<<(std::ostream &os,
+ const frc971::apriltag::LineFitMoments &moments);
+
+struct Peak {
+ static constexpr uint16_t kNoPeak() { return 0xffff; }
+ float error;
+ // Point index.
+ uint32_t filtered_point_index;
+ // 0xffff if this isn't a peak, otherwise the blob index.
+ uint16_t blob_index;
+};
+
+struct PeakExtents {
+ uint16_t blob_index;
+ uint32_t starting_offset;
+ uint32_t count;
+};
+
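+// Radix sort key decomposition for Peak. blob_index forms the most
+// significant bits of the key, so peaks group by blob and order by filtered
+// error within each blob.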
+struct PeakDecomposer {
+ static constexpr size_t kBitsInKey = 16 + 32;
+ __host__ __device__ ::cuda::std::tuple<uint16_t &, float &> operator()(
+ Peak &key) const {
+ return {key.blob_index, key.error};
+ }
+};
+
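+// Gaussian filter coefficients: exp(-x^2 / 2) evaluated at x = -3 ... 3.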
+constexpr std::array<float, 7> FilterCoefficients() {
+ return std::array<float, 7>{
+ 0.01110899634659290314, 0.13533528149127960205, 0.60653066635131835938,
+ 1.00000000000000000000, 0.60653066635131835938, 0.13533528149127960205,
+ 0.01110899634659290314,
+ };
+}
+
+struct FitQuad {
+ uint16_t blob_index;
+ bool valid;
+ uint16_t indices[4];
+ LineFitMoments moments[4];
+};
+
+__host__ __device__ void FitLine(LineFitMoments moments, double *lineparam01,
+ double *lineparam23, double *err, double *mse);
+
+void FitLines(
+ const LineFitPoint *line_fit_points_device, size_t points,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ size_t num_extents, double *errs_device, double *filtered_errs_device,
+ Peak *peaks_device, CudaStream *stream);
+
+void FitQuads(
+ const Peak *peaks_device, size_t peaks, const PeakExtents *peak_extents,
+ size_t num_extents, const LineFitPoint *line_fit_points_device, int nmaxima,
+ const cub::KeyValuePair<long, MinMaxExtents> *selected_extents_device,
+ float max_line_fit_mse, double cos_critical_rad, FitQuad *fit_quads_device,
+ CudaStream *stream);
+
+// Returns the m0, m1, m2, m3 indices for the provided index. The index is the
+// flattened inner loop count when the 4 nested for loops are processed in
+// order. See FilterTest.Unrank for an example.
+//
+// This lets us distribute work among the cuda threads and get back the
+// indices.
+__host__ __device__ std::tuple<uint, uint, uint, uint> Unrank(uint i);
+// The max number of work elements for an nmaxima of 10: C(10, 4) = 210.
+constexpr size_t MaxRankedIndex() { return 210; }
+
+} // namespace apriltag
+} // namespace frc971
+
+#endif // FRC971_ORIN_LINE_FIT_FILTER_H_
diff --git a/third_party/apriltag/apriltag.c b/third_party/apriltag/apriltag.c
index 3086228..a2170fd 100644
--- a/third_party/apriltag/apriltag.c
+++ b/third_party/apriltag/apriltag.c
@@ -888,6 +888,77 @@
}
}
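+// Decodes a single candidate quad against every registered tag family,
+// appending any successful detections. Factored out of quad_decode_task so
+// other quad sources can reuse it.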
+void quad_decode_index(apriltag_detector_t *td, struct quad *quad_original, image_u8_t *im, image_u8_t *im_samples, zarray_t *detections) {
+ // make sure the homographies are computed...
+ if (quad_update_homographies(quad_original) != 0)
+ return;
+
+ for (int famidx = 0; famidx < zarray_size(td->tag_families); famidx++) {
+ apriltag_family_t *family;
+ zarray_get(td->tag_families, famidx, &family);
+
+ if (family->reversed_border != quad_original->reversed_border) {
+ continue;
+ }
+
+ // since the geometry of tag families can vary, start any
+ // optimization process over with the original quad.
+ struct quad *quad = quad_copy(quad_original);
+
+ struct quick_decode_entry entry;
+
+ float decision_margin = quad_decode(td, family, im, quad, &entry, im_samples);
+
+ if (decision_margin >= 0 && entry.hamming < 255) {
+ apriltag_detection_t *det = calloc(1, sizeof(apriltag_detection_t));
+
+ det->family = family;
+ det->id = entry.id;
+ det->hamming = entry.hamming;
+ det->decision_margin = decision_margin;
+
+ double theta = entry.rotation * M_PI / 2.0;
+ double c = cos(theta), s = sin(theta);
+
+ // Fix the rotation of our homography to properly orient the tag
+ matd_t *R = matd_create(3,3);
+ MATD_EL(R, 0, 0) = c;
+ MATD_EL(R, 0, 1) = -s;
+ MATD_EL(R, 1, 0) = s;
+ MATD_EL(R, 1, 1) = c;
+ MATD_EL(R, 2, 2) = 1;
+
+ det->H = matd_op("M*M", quad->H, R);
+
+ matd_destroy(R);
+
+ homography_project(det->H, 0, 0, &det->c[0], &det->c[1]);
+
+ // [-1, -1], [1, -1], [1, 1], [-1, 1], Desired points
+ // [-1, 1], [1, 1], [1, -1], [-1, -1], FLIP Y
+ // adjust the points in det->p so that they correspond to
+ // counter-clockwise around the quad, starting at -1,-1.
+ for (int i = 0; i < 4; i++) {
+ int tcx = (i == 1 || i == 2) ? 1 : -1;
+ int tcy = (i < 2) ? 1 : -1;
+
+ double p[2];
+
+ homography_project(det->H, tcx, tcy, &p[0], &p[1]);
+
+ det->p[i][0] = p[0];
+ det->p[i][1] = p[1];
+ }
+
+ pthread_mutex_lock(&td->mutex);
+ zarray_add(detections, &det);
+ pthread_mutex_unlock(&td->mutex);
+ }
+
+ quad_destroy(quad);
+ }
+}
+
static void quad_decode_task(void *_u)
{
struct quad_decode_task *task = (struct quad_decode_task*) _u;
@@ -905,74 +976,7 @@
refine_edges(td, im, quad_original);
}
- // make sure the homographies are computed...
- if (quad_update_homographies(quad_original) != 0)
- continue;
-
- for (int famidx = 0; famidx < zarray_size(td->tag_families); famidx++) {
- apriltag_family_t *family;
- zarray_get(td->tag_families, famidx, &family);
-
- if (family->reversed_border != quad_original->reversed_border) {
- continue;
- }
-
- // since the geometry of tag families can vary, start any
- // optimization process over with the original quad.
- struct quad *quad = quad_copy(quad_original);
-
- struct quick_decode_entry entry;
-
- float decision_margin = quad_decode(td, family, im, quad, &entry, task->im_samples);
-
- if (decision_margin >= 0 && entry.hamming < 255) {
- apriltag_detection_t *det = calloc(1, sizeof(apriltag_detection_t));
-
- det->family = family;
- det->id = entry.id;
- det->hamming = entry.hamming;
- det->decision_margin = decision_margin;
-
- double theta = entry.rotation * M_PI / 2.0;
- double c = cos(theta), s = sin(theta);
-
- // Fix the rotation of our homography to properly orient the tag
- matd_t *R = matd_create(3,3);
- MATD_EL(R, 0, 0) = c;
- MATD_EL(R, 0, 1) = -s;
- MATD_EL(R, 1, 0) = s;
- MATD_EL(R, 1, 1) = c;
- MATD_EL(R, 2, 2) = 1;
-
- det->H = matd_op("M*M", quad->H, R);
-
- matd_destroy(R);
-
- homography_project(det->H, 0, 0, &det->c[0], &det->c[1]);
-
- // [-1, -1], [1, -1], [1, 1], [-1, 1], Desired points
- // [-1, 1], [1, 1], [1, -1], [-1, -1], FLIP Y
- // adjust the points in det->p so that they correspond to
- // counter-clockwise around the quad, starting at -1,-1.
- for (int i = 0; i < 4; i++) {
- int tcx = (i == 1 || i == 2) ? 1 : -1;
- int tcy = (i < 2) ? 1 : -1;
-
- double p[2];
-
- homography_project(det->H, tcx, tcy, &p[0], &p[1]);
-
- det->p[i][0] = p[0];
- det->p[i][1] = p[1];
- }
-
- pthread_mutex_lock(&td->mutex);
- zarray_add(task->detections, &det);
- pthread_mutex_unlock(&td->mutex);
- }
-
- quad_destroy(quad);
- }
+ quad_decode_index(td, quad_original, im, task->im_samples, task->detections);
}
}
@@ -999,6 +1003,69 @@
return 0;
}
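+// Removes overlapping detections of the same tag, preferring the detection
+// with the smaller hamming distance, then the larger decision margin, then a
+// deterministic coordinate tie-break. Factored out of
+// apriltag_detector_detect.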
+void reconcile_detections(zarray_t *detections, zarray_t *poly0, zarray_t *poly1)
+{
+ for (int i0 = 0; i0 < zarray_size(detections); i0++) {
+
+ apriltag_detection_t *det0;
+ zarray_get(detections, i0, &det0);
+
+ for (int k = 0; k < 4; k++)
+ zarray_set(poly0, k, det0->p[k], NULL);
+
+ for (int i1 = i0+1; i1 < zarray_size(detections); i1++) {
+
+ apriltag_detection_t *det1;
+ zarray_get(detections, i1, &det1);
+
+ if (det0->id != det1->id || det0->family != det1->family)
+ continue;
+
+ for (int k = 0; k < 4; k++)
+ zarray_set(poly1, k, det1->p[k], NULL);
+
+ if (g2d_polygon_overlaps_polygon(poly0, poly1)) {
+ // the tags overlap. Delete one, keep the other.
+
+ int pref = 0; // 0 means undecided which one we'll keep.
+ pref = prefer_smaller(pref, det0->hamming, det1->hamming); // want small hamming
+ pref = prefer_smaller(pref, -det0->decision_margin, -det1->decision_margin); // want bigger margins
+
+ // if we STILL don't prefer one detection over the other, then pick
+ // any deterministic criterion.
+ for (int i = 0; i < 4; i++) {
+ pref = prefer_smaller(pref, det0->p[i][0], det1->p[i][0]);
+ pref = prefer_smaller(pref, det0->p[i][1], det1->p[i][1]);
+ }
+
+ if (pref == 0) {
+ // at this point, we should only be undecided if the tag detections
+ // are *exactly* the same. How would that happen?
+ debug_print("uh oh, no preference for overlapping detection\n");
+ }
+
+ if (pref < 0) {
+ // keep det0, destroy det1
+ apriltag_detection_destroy(det1);
+ zarray_remove_index(detections, i1, 1);
+ i1--; // retry the same index
+ goto retry1;
+ } else {
+ // keep det1, destroy det0
+ apriltag_detection_destroy(det0);
+ zarray_remove_index(detections, i0, 1);
+ i0--; // retry the same index.
+ goto retry0;
+ }
+ }
+
+ retry1: ;
+ }
+
+ retry0: ;
+ }
+}
+
zarray_t *apriltag_detector_detect(apriltag_detector_t *td, image_u8_t *im_orig)
{
if (zarray_size(td->tag_families) == 0) {
@@ -1201,65 +1268,7 @@
zarray_t *poly0 = g2d_polygon_create_zeros(4);
zarray_t *poly1 = g2d_polygon_create_zeros(4);
- for (int i0 = 0; i0 < zarray_size(detections); i0++) {
-
- apriltag_detection_t *det0;
- zarray_get(detections, i0, &det0);
-
- for (int k = 0; k < 4; k++)
- zarray_set(poly0, k, det0->p[k], NULL);
-
- for (int i1 = i0+1; i1 < zarray_size(detections); i1++) {
-
- apriltag_detection_t *det1;
- zarray_get(detections, i1, &det1);
-
- if (det0->id != det1->id || det0->family != det1->family)
- continue;
-
- for (int k = 0; k < 4; k++)
- zarray_set(poly1, k, det1->p[k], NULL);
-
- if (g2d_polygon_overlaps_polygon(poly0, poly1)) {
- // the tags overlap. Delete one, keep the other.
-
- int pref = 0; // 0 means undecided which one we'll keep.
- pref = prefer_smaller(pref, det0->hamming, det1->hamming); // want small hamming
- pref = prefer_smaller(pref, -det0->decision_margin, -det1->decision_margin); // want bigger margins
-
- // if we STILL don't prefer one detection over the other, then pick
- // any deterministic criterion.
- for (int i = 0; i < 4; i++) {
- pref = prefer_smaller(pref, det0->p[i][0], det1->p[i][0]);
- pref = prefer_smaller(pref, det0->p[i][1], det1->p[i][1]);
- }
-
- if (pref == 0) {
- // at this point, we should only be undecided if the tag detections
- // are *exactly* the same. How would that happen?
- debug_print("uh oh, no preference for overlappingdetection\n");
- }
-
- if (pref < 0) {
- // keep det0, destroy det1
- apriltag_detection_destroy(det1);
- zarray_remove_index(detections, i1, 1);
- i1--; // retry the same index
- goto retry1;
- } else {
- // keep det1, destroy det0
- apriltag_detection_destroy(det0);
- zarray_remove_index(detections, i0, 1);
- i0--; // retry the same index.
- goto retry0;
- }
- }
-
- retry1: ;
- }
-
- retry0: ;
- }
+ reconcile_detections(detections, poly0, poly1);
zarray_destroy(poly0);
zarray_destroy(poly1);
diff --git a/third_party/apriltag/apriltag_quad_thresh.c b/third_party/apriltag/apriltag_quad_thresh.c
index 1ce73ad..5811770 100644
--- a/third_party/apriltag/apriltag_quad_thresh.c
+++ b/third_party/apriltag/apriltag_quad_thresh.c
@@ -262,7 +262,15 @@
*mse = eig_small;
}
-float pt_compare_angle(struct pt *a, struct pt *b) {
+float pt_compare_angle(float cx, float cy, struct pt *a, struct pt *b) {
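+ // Boundary points can share a slope; break the tie with the cross product
+ // of their offsets from the blob center so the comparison still yields a
+ // consistent ordering.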
+ if (a->slope == b->slope) {
+ float dxa = a->x - cx;
+ float dya = a->y - cy;
+ float dxb = b->x - cx;
+ float dyb = b->y - cy;
+ return dya * dxb - dxa * dyb;
+ }
+
return a->slope - b->slope;
}
@@ -354,7 +362,7 @@
for (int i = 0; i < fsz; i++) {
acc += errs[(iy + i - fsz / 2 + sz) % sz] * f[i];
}
- y[iy] = acc;
+ y[iy] = max(0.0, acc);
}
memcpy(errs, y, sizeof(double)*sz);
@@ -367,7 +375,7 @@
int nmaxima = 0;
for (int i = 0; i < sz; i++) {
- if (errs[i] > errs[(i+1)%sz] && errs[i] > errs[(i+sz-1)%sz]) {
+ if (errs[i] > errs[(i+1)%sz] && errs[i] > errs[(i+sz-1)%sz] && errs[i] > 1e-5) {
maxima[nmaxima] = i;
maxima_errs[nmaxima] = errs[i];
nmaxima++;
@@ -409,7 +417,7 @@
double err01, err12, err23, err30;
double mse01, mse12, mse23, mse30;
- double params01[4], params12[4], params23[4], params30[4];
+ double params01[4], params12[4];
// disallow quads where the angle is less than a critical value.
double max_dot = td->qtp.cos_critical_rad; //25*M_PI/180);
@@ -439,11 +447,11 @@
for (int m3 = m2+1; m3 < nmaxima; m3++) {
int i3 = maxima[m3];
- fit_line(lfps, sz, i2, i3, params23, &err23, &mse23);
+ fit_line(lfps, sz, i2, i3, NULL, &err23, &mse23);
if (mse23 > td->qtp.max_line_fit_mse)
continue;
- fit_line(lfps, sz, i3, i0, params30, &err30, &mse30);
+ fit_line(lfps, sz, i3, i0, NULL, &err30, &mse30);
if (mse30 > td->qtp.max_line_fit_mse)
continue;
@@ -605,7 +613,7 @@
double x = p->x * .5 + delta;
double y = p->y * .5 + delta;
int ix = x, iy = y;
- double W = 1;
+ int W = 1;
if (ix > 0 && ix+1 < im->width && iy > 0 && iy+1 < im->height) {
int grad_x = im->buf[iy * im->stride + ix + 1] -
@@ -615,7 +623,7 @@
im->buf[(iy-1) * im->stride + ix];
// XXX Tunable. How to shape the gradient magnitude?
- W = sqrt(grad_x*grad_x + grad_y*grad_y) + 1;
+ W = hypotf(grad_x, grad_y) + 1;
}
double fx = x, fy = y;
@@ -630,10 +638,10 @@
return lfps;
}
-static inline void ptsort(struct pt *pts, int sz)
+static inline void ptsort(float cx, float cy, struct pt *pts, int sz)
{
#define MAYBE_SWAP(arr,apos,bpos) \
- if (pt_compare_angle(&(arr[apos]), &(arr[bpos])) > 0) { \
+ if (pt_compare_angle(cx, cy, &(arr[apos]), &(arr[bpos])) > 0) { \
tmp = arr[apos]; arr[apos] = arr[bpos]; arr[bpos] = tmp; \
};
@@ -695,11 +703,11 @@
struct pt *as = &tmp[0];
struct pt *bs = &tmp[asz];
- ptsort(as, asz);
- ptsort(bs, bsz);
+ ptsort(cx, cy, as, asz);
+ ptsort(cx, cy, bs, bsz);
#define MERGE(apos,bpos) \
- if (pt_compare_angle(&(as[apos]), &(bs[bpos])) < 0) \
+ if (pt_compare_angle(cx, cy, &(as[apos]), &(bs[bpos])) < 0) \
pts[outpos++] = as[apos++]; \
else \
pts[outpos++] = bs[bpos++];
@@ -820,38 +828,7 @@
// we now sort the points according to theta. This is a prepatory
// step for segmenting them into four lines.
if (1) {
- ptsort((struct pt*) cluster->data, zarray_size(cluster));
-
- // remove duplicate points. (A byproduct of our segmentation system.)
- if (1) {
- int outpos = 1;
-
- struct pt *last;
- zarray_get_volatile(cluster, 0, &last);
-
- for (int i = 1; i < sz; i++) {
-
- struct pt *p;
- zarray_get_volatile(cluster, i, &p);
-
- if (p->x != last->x || p->y != last->y) {
-
- if (i != outpos) {
- struct pt *out;
- zarray_get_volatile(cluster, outpos, &out);
- memcpy(out, p, sizeof(struct pt));
- }
-
- outpos++;
- }
-
- last = p;
- }
-
- cluster->size = outpos;
- sz = outpos;
- }
-
+ ptsort(cx, cy, (struct pt*) cluster->data, zarray_size(cluster));
}
if (sz < 24)
@@ -877,9 +854,10 @@
int i1 = indices[(i+1)&3];
double err;
- fit_line(lfps, sz, i0, i1, lines[i], NULL, &err);
+ double mse;
+ fit_line(lfps, sz, i0, i1, lines[i], &err, &mse);
- if (err > td->qtp.max_line_fit_mse) {
+ if (mse > td->qtp.max_line_fit_mse) {
res = 0;
goto finish;
}
@@ -1083,7 +1061,7 @@
// fit quads to.) A typical point along an edge is added three
// times (because it has 3 neighbors). The maximum perimeter
// is 2w+2h.
- if (zarray_size(cluster) > 3*(2*w+2*h)) {
+ if (zarray_size(cluster) > 2*(2*w+2*h)) {
continue;
}
@@ -1498,15 +1476,20 @@
mem_pools[mem_pool_idx] = calloc(mem_chunk_size, sizeof(struct uint64_zarray_entry));
for (int y = y0; y < y1; y++) {
+ bool connected_last = false;
for (int x = 1; x < w-1; x++) {
+ bool connected = false;
uint8_t v0 = threshim->buf[y*ts + x];
- if (v0 == 127)
+ if (v0 == 127) {
+ connected_last = false;
continue;
+ }
// XXX don't query this until we know we need it?
uint64_t rep0 = unionfind_get_representative(uf, y*w + x);
if (unionfind_get_set_size(uf, rep0) < 25) {
+ connected_last = false;
continue;
}
@@ -1570,9 +1553,8 @@
.y = 2 * y + dy, \
.gx = dx * ((int)v1 - v0), \
.gy = dy * ((int)v1 - v0)}; \
- /*fprintf(stderr, "Adding point %d+%d(%llx) -> (%d, %d)\n", \
- min(rep0, rep1), max(rep0, rep1), entry->id, p.x, p.y); */ \
zarray_add(entry->cluster, &p); \
+ connected = true; \
} \
} \
}
@@ -1582,8 +1564,12 @@
DO_CONN(0, 1);
// do 8 connectivity
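+ // The (-1, 1) diagonal at this pixel doubles up on the same boundary point
+ // as the previous pixel's (1, 1) diagonal, so skip it when that point was
+ // already added. This replaces the duplicate point removal pass that
+ // fit_quad used to run.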
+ if (!connected_last) {
DO_CONN(-1, 1);
+ }
+ connected = false;
DO_CONN(1, 1);
+ connected_last = connected;
}
}
#undef DO_CONN