Add first half of GPU based april tag detector
This detects blob boundaries, filters them, and then orders the points
in a circle in preparation for line fitting. It takes 2ms for a 720p
image on the Orin NX 8gb.
Future commits will do the quad fitting and merge back with the original
algorithm.
Change-Id: Idf2869b3521e50a0056a352138d864b409dab6f1
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/frc971/orin/threshold.cc b/frc971/orin/threshold.cc
new file mode 100644
index 0000000..79ccced
--- /dev/null
+++ b/frc971/orin/threshold.cc
@@ -0,0 +1,206 @@
+#include "frc971/orin/threshold.h"
+
+#include <stdint.h>
+
+#include "frc971/orin/cuda.h"
+
+namespace frc971 {
+namespace apriltag {
+namespace {
+
+// 1280 -> 2 * 128 * 5
+// 720 -> 2 * 8 * 5 * 9
+//
+// 1456 -> 2 * 8 * 7 * 13
+// 1088 -> 2 * 32 * 17
+
+// Writes out the grayscale image and decimated image.
+__global__ void InternalCudaToGreyscaleAndDecimateHalide(
+ const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+ size_t width, size_t height) {
+ size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+ while (i < width * height) {
+ uint8_t pixel = gray_image[i] = color_image[i * 2];
+
+ const size_t row = i / width;
+ const size_t col = i - width * row;
+
+ // Copy over every other pixel.
+ if (row % 2 == 0 && col % 2 == 0) {
+ size_t decimated_row = row / 2;
+ size_t decimated_col = col / 2;
+ decimated_image[decimated_row * width / 2 + decimated_col] = pixel;
+ }
+ i += blockDim.x * gridDim.x;
+ }
+
+ // TODO(austin): Figure out how to load contiguous memory reasonably
+ // efficiently and max/min over it.
+
+ // TODO(austin): Can we do the threshold here too? That would be less memory
+ // bandwidth consumed...
+}
+
+// Returns the min and max for a row of 4 pixels.
+__forceinline__ __device__ uchar2 minmax(uchar4 row) {
+ uint8_t min_val = std::min(std::min(row.x, row.y), std::min(row.z, row.w));
+ uint8_t max_val = std::max(std::max(row.x, row.y), std::max(row.z, row.w));
+ return make_uchar2(min_val, max_val);
+}
+
+// Returns the min and max for a set of min and maxes.
+__forceinline__ __device__ uchar2 minmax(uchar2 val0, uchar2 val1) {
+ return make_uchar2(std::min(val0.x, val1.x), std::max(val0.y, val1.y));
+}
+
+// Returns the pixel index of a pixel at the provided x and y location.
+__forceinline__ __device__ size_t XYToIndex(size_t width, size_t x, size_t y) {
+ return width * y + x;
+}
+
+// Computes the min and max pixel value for each block of 4 pixels.
+__global__ void InternalBlockMinMax(const uint8_t *decimated_image,
+ uchar2 *unfiltered_minmax_image,
+ size_t width, size_t height) {
+ uchar2 vals[4];
+ const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+ const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ const uchar4 decimated_block = *reinterpret_cast<const uchar4 *>(
+ decimated_image + XYToIndex(width * 4, x * 4, y * 4 + i));
+
+ vals[i] = minmax(decimated_block);
+ }
+
+ unfiltered_minmax_image[XYToIndex(width, x, y)] =
+ minmax(minmax(vals[0], vals[1]), minmax(vals[2], vals[3]));
+}
+
+// Filters the min/max for the surrounding block of 9 pixels centered on our
+// location using min/max and writes the result back out.
+__global__ void InternalBlockFilter(const uchar2 *unfiltered_minmax_image,
+ uchar2 *minmax_image, size_t width,
+ size_t height) {
+ uchar2 result = make_uchar2(255, 0);
+
+ const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+ const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ // Iterate through the 3x3 set of points centered on the point this image is
+ // responsible for, and compute the overall min/max.
+#pragma unroll
+ for (int i = -1; i <= 1; ++i) {
+#pragma unroll
+ for (int j = -1; j <= 1; ++j) {
+ const ssize_t read_x = x + i;
+ const ssize_t read_y = y + j;
+
+ if (read_x < 0 || read_x >= static_cast<ssize_t>(width)) {
+ continue;
+ }
+ if (read_y < 0 || read_y >= static_cast<ssize_t>(height)) {
+ continue;
+ }
+
+ result = minmax(
+ result, unfiltered_minmax_image[XYToIndex(width, read_x, read_y)]);
+ }
+ }
+
+ minmax_image[XYToIndex(width, x, y)] = result;
+}
+
+// Thresholds the image based on the filtered thresholds.
+__global__ void InternalThreshold(const uint8_t *decimated_image,
+ const uchar2 *minmax_image,
+ uint8_t *thresholded_image, size_t width,
+ size_t height, size_t min_white_black_diff) {
+ size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+ while (i < width * height) {
+ const size_t x = i % width;
+ const size_t y = i / width;
+
+ const uchar2 minmax_val = minmax_image[x / 4 + (y / 4) * width / 4];
+
+ uint8_t result;
+ if (minmax_val.y - minmax_val.x < min_white_black_diff) {
+ result = 127;
+ } else {
+ uint8_t thresh = minmax_val.x + (minmax_val.y - minmax_val.x) / 2;
+ if (decimated_image[i] > thresh) {
+ result = 255;
+ } else {
+ result = 0;
+ }
+ }
+
+ thresholded_image[i] = result;
+ i += blockDim.x * gridDim.x;
+ }
+}
+
+} // namespace
+
+void CudaToGreyscaleAndDecimateHalide(
+ const uint8_t *color_image, uint8_t *gray_image, uint8_t *decimated_image,
+ uint8_t *unfiltered_minmax_image, uint8_t *minmax_image,
+ uint8_t *thresholded_image, size_t width, size_t height,
+ size_t min_white_black_diff, CudaStream *stream) {
+ CHECK((width % 8) == 0);
+ CHECK((height % 8) == 0);
+ constexpr size_t kThreads = 256;
+ {
+ // Step one, convert to gray and decimate.
+ size_t kBlocks = (width * height + kThreads - 1) / kThreads / 4;
+ InternalCudaToGreyscaleAndDecimateHalide<<<kBlocks, kThreads, 0,
+ stream->get()>>>(
+ color_image, gray_image, decimated_image, width, height);
+ MaybeCheckAndSynchronize();
+ }
+
+ size_t decimated_width = width / 2;
+ size_t decimated_height = height / 2;
+
+ {
+ // Step 2, compute a min/max for each block of 4x4 (16) pixels.
+ dim3 threads(16, 16, 1);
+ dim3 blocks((decimated_width / 4 + 15) / 16,
+ (decimated_height / 4 + 15) / 16, 1);
+
+ InternalBlockMinMax<<<blocks, threads, 0, stream->get()>>>(
+ decimated_image, reinterpret_cast<uchar2 *>(unfiltered_minmax_image),
+ decimated_width / 4, decimated_height / 4);
+ MaybeCheckAndSynchronize();
+
+ // Step 3, Blur those min/max's a further +- 1 block in each direction using
+ // min/max.
+ InternalBlockFilter<<<blocks, threads, 0, stream->get()>>>(
+ reinterpret_cast<uchar2 *>(unfiltered_minmax_image),
+ reinterpret_cast<uchar2 *>(minmax_image), decimated_width / 4,
+ decimated_height / 4);
+ MaybeCheckAndSynchronize();
+ }
+
+ {
+ // Now, write out 127 if the min/max are too close to each other, or 0/255
+ // if the pixels are above or below the average of the min/max.
+ size_t kBlocks = (width * height / 4 + kThreads - 1) / kThreads / 4;
+ InternalThreshold<<<kBlocks, kThreads, 0, stream->get()>>>(
+ decimated_image, reinterpret_cast<uchar2 *>(minmax_image),
+ thresholded_image, decimated_width, decimated_height,
+ min_white_black_diff);
+ MaybeCheckAndSynchronize();
+ }
+}
+
+} // namespace apriltag
+} // namespace frc971