Make SIFT faster

This uses various halide-optimized functions to do the actual image
processing. It still finds around the same number of features, but much
faster.

Change-Id: I9d7f7093b0ec41acf7ed16b2c91cdadada2f9a22
diff --git a/y2020/vision/sift/BUILD b/y2020/vision/sift/BUILD
index 0a2cb3a..d3bafc6 100644
--- a/y2020/vision/sift/BUILD
+++ b/y2020/vision/sift/BUILD
@@ -1,3 +1,116 @@
+load(":fast_gaussian.bzl", "fast_gaussian")
+
+cc_binary(
+    name = "fast_gaussian_generator",
+    srcs = [
+        "fast_gaussian_generator.cc",
+    ],
+    restricted_to = [
+        "//tools:k8",
+        "//tools:armhf-debian",
+    ],
+    deps = [
+        "//third_party:halide",
+        "//third_party:halide_gengen",
+        "//third_party:opencv",
+        "@com_github_google_glog//:glog",
+    ],
+)
+
+py_binary(
+    name = "fast_gaussian_runner",
+    srcs = [
+        "fast_gaussian_runner.py",
+    ],
+    data = [
+        ":fast_gaussian_generator",
+        # TODO(Brian): Replace this with something more fine-grained from the
+        # configuration fragment or something.
+        "//tools/cpp:toolchain",
+    ],
+    default_python_version = "PY3",
+    main = "fast_gaussian_runner.py",
+    restricted_to = [
+        "//tools:k8",
+        "//tools:armhf-debian",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "@bazel_tools//tools/python/runfiles",
+    ],
+)
+
+# Each element is [sigma, sigma_name, filter_width].
+# opencv's default width is calculated as:
+#   cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1
+# Pulling that in helps a lot with making it faster (less data to read, and less
+# math to do), but if you make it too narrow SIFT quickly freaks out.
+sigmas = [
+    [
+        "1.2262734984654078",
+        "1p2",
+        "9",
+    ],
+    [
+        "1.5450077936447955",
+        "1p5",
+        "11",
+    ],
+    [
+        "1.9465878414647133",
+        "1p9",
+        "13",
+    ],
+    [
+        "2.4525469969308156",
+        "2p4",
+        "15",
+    ],
+    [
+        "3.0900155872895909",
+        "3p1",
+        "19",
+    ],
+    # TODO(Brian): We only need one of these two for 1280x720. Don't generate
+    # all the redundant versions for other sizes, and maybe stop doing the one
+    # we don't actually use.
+    [
+        "1.2489997148513794",
+        "1p24",
+        "11",
+    ],
+    [
+        "1.5198683738708496",
+        "1p52",
+        "15",
+    ],
+]
+
+sizes = [
+    [
+        1280,
+        720,
+    ],
+    [
+        640,
+        360,
+    ],
+    [
+        320,
+        180,
+    ],
+    [
+        160,
+        90,
+    ],
+    [
+        80,
+        45,
+    ],
+]
+
+fast_gaussian(sigmas, sizes)
+
 cc_library(
     name = "sift971",
     srcs = [
@@ -12,6 +125,47 @@
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":fast_gaussian",
         "//third_party:opencv",
+        "@com_github_google_glog//:glog",
+    ],
+)
+
+cc_library(
+    name = "fast_gaussian",
+    srcs = [
+        "fast_gaussian.cc",
+    ],
+    hdrs = [
+        "fast_gaussian.h",
+    ],
+    restricted_to = [
+        "//tools:k8",
+        "//tools:armhf-debian",
+    ],
+    deps = [
+        ":fast_gaussian_all",
+        "//third_party:halide_runtime",
+        "//third_party:opencv",
+        "@com_github_google_glog//:glog",
+    ],
+)
+
+cc_binary(
+    name = "testing_sift",
+    srcs = [
+        "testing_sift.cc",
+    ],
+    restricted_to = [
+        "//tools:k8",
+        "//tools:armhf-debian",
+    ],
+    deps = [
+        ":fast_gaussian",
+        "//aos:init",
+        "//aos/time",
+        "//third_party:opencv",
+        "//y2020/vision/sift:sift971",
+        "@com_github_google_glog//:glog",
     ],
 )
diff --git a/y2020/vision/sift/fast_gaussian.bzl b/y2020/vision/sift/fast_gaussian.bzl
new file mode 100644
index 0000000..a1c3173
--- /dev/null
+++ b/y2020/vision/sift/fast_gaussian.bzl
@@ -0,0 +1,55 @@
+def fast_gaussian(sigmas, sizes):
+  files = []
+  for _, sigma_name, _ in sigmas:
+    for cols, rows in sizes:
+      files.append("fast_gaussian_%dx%d_%s" % (cols, rows, sigma_name))
+  for _, sigma_name, _ in sigmas:
+    for cols, rows in sizes:
+      files.append("fast_gaussian_subtract_%dx%d_%s" % (cols, rows, sigma_name))
+  for cols, rows in sizes:
+    files.append('fast_subtract_%dx%d' % (cols, rows))
+
+  params = struct(
+    sigmas = sigmas,
+    sizes = sizes,
+  )
+
+  headers = [f + '.h' for f in files] + [
+    'fast_gaussian_all.h',
+  ]
+  objects = [f + '.o' for f in files] + [
+    'fast_gaussian_runtime.o',
+  ]
+  htmls = [f + '.html' for f in files]
+
+  native.genrule(
+    name = "generate_fast_gaussian",
+    tools = [
+        ":fast_gaussian_runner",
+    ],
+    cmd = ' '.join([
+      '$(location fast_gaussian_runner)',
+      "'" + params.to_json() + "'",
+      # TODO(Brian): This should be RULEDIR once we have support for that.
+      '$(@D)',
+      '$(TARGET_CPU)',
+    ]),
+    outs = headers + objects + htmls,
+    restricted_to = [
+      "//tools:k8",
+      "//tools:armhf-debian",
+    ],
+  )
+
+  native.cc_library(
+    name = 'fast_gaussian_all',
+    hdrs = ['fast_gaussian_all.h'],
+    srcs = headers + objects,
+    deps = [
+      '//third_party:halide_runtime',
+    ],
+    restricted_to = [
+      "//tools:k8",
+      "//tools:armhf-debian",
+    ],
+  )
diff --git a/y2020/vision/sift/fast_gaussian.cc b/y2020/vision/sift/fast_gaussian.cc
new file mode 100644
index 0000000..22549ac
--- /dev/null
+++ b/y2020/vision/sift/fast_gaussian.cc
@@ -0,0 +1,153 @@
+#include "y2020/vision/sift/fast_gaussian.h"
+
+#include <iomanip>
+
+#include <opencv2/imgproc.hpp>
+
+#include "y2020/vision/sift/fast_gaussian_all.h"
+
+namespace frc971 {
+namespace vision {
+namespace {
+
+// Dies unless the pixel data of a and b occupy disjoint byte ranges. Each
+// range is taken to be [data, data + total() * elemSize()), which assumes the
+// matrix is continuous (MatToHalide verifies that separately).
+//
+// Ranges which merely touch, i.e. one starts exactly where the other ends,
+// share no bytes and are therefore accepted.
+void CheckNonOverlapping(const cv::Mat &a, const cv::Mat &b) {
+  // a must not start inside of b.
+  CHECK(a.data >= b.data + b.total() * b.elemSize() || a.data < b.data)
+      << ": images may not overlap";
+  // b must not start inside of a.
+  CHECK(b.data >= a.data + a.total() * a.elemSize() || b.data < a.data)
+      << ": images may not overlap";
+}
+
+// An easy toggle to always fall back to the slow implementations, to verify the
+// results are the same.
+constexpr bool kUseFast = true;
+
+// An easy toggle to print the result of all operations, for verifying that the
+// halide code is doing what we expect.
+constexpr bool kPrintAll = false;
+
+// We deliberately don't generate code for images smaller than this, so don't
+// print warnings about them.
+//
+// The opencv implementations are so fast below this size, the build time to
+// generate halide versions isn't worthwhile.
+constexpr int kMinWarnSize = 80;
+
+// Returns true if both dimensions of size are strictly below kMinWarnSize,
+// meaning a fallback to the slow path is expected and should not be reported.
+// The comparison is strict so that an image which is kMinWarnSize wide (the
+// smallest size listed in BUILD is 80x45) still warns if it falls back.
+bool IsSmall(cv::Size size) {
+  return size.height < kMinWarnSize && size.width < kMinWarnSize;
+}
+
+}  // namespace
+
+// Blurs source into *destination, preferring a generated Halide
+// implementation and falling back to cv::GaussianBlur when none matches.
+void FastGaussian(const cv::Mat &source, cv::Mat *destination, double sigma) {
+  CHECK_EQ(source.type(), CV_16SC1);
+
+  destination->create(source.size(), source.type());
+  CheckNonOverlapping(source, *destination);
+
+  // 0 means a generated implementation ran; 1 means none matched.
+  int result = 1;
+  if (kUseFast) {
+    result = DoGeneratedFastGaussian(MatToHalide<const int16_t>(source),
+                                     MatToHalide<int16_t>(*destination), sigma);
+  }
+  if (kPrintAll) {
+    LOG(INFO) << result << ": " << source.rows << " " << source.cols << " "
+              << std::setprecision(17) << sigma;
+  }
+  if (result == 0) {
+    return;
+  }
+  if (!IsSmall(source.size())) {
+    LOG(WARNING) << "slow gaussian blur: " << source.rows << " " << source.cols
+                 << " " << std::setprecision(17) << sigma;
+  }
+  // Anything other than "not implemented" is a Halide error.
+  CHECK_EQ(result, 1);
+
+  cv::GaussianBlur(source, *destination, cv::Size(), sigma, sigma,
+                   cv::BORDER_REPLICATE);
+}
+
+// Computes a - b into *destination, preferring a generated Halide
+// implementation and falling back to cv::subtract when none matches.
+// destination may not overlap a or b.
+void FastSubtract(const cv::Mat &a, const cv::Mat &b, cv::Mat *destination) {
+  CHECK(a.size() == b.size());
+  destination->create(a.size(), a.type());
+  CheckNonOverlapping(a, *destination);
+  CheckNonOverlapping(b, *destination);
+
+  int result = 1;
+  if (kUseFast) {
+    result = DoGeneratedFastSubtract(MatToHalide<const int16_t>(a),
+                                     MatToHalide<const int16_t>(b),
+                                     MatToHalide<int16_t>(*destination));
+  }
+  if (kPrintAll) {
+    LOG(INFO) << result << ": " << a.rows << " " << a.cols;
+  }
+  if (result == 0) {
+    return;
+  }
+  if (!IsSmall(a.size())) {
+    LOG(WARNING) << "slow subtract: " << a.rows << " " << a.cols;
+  }
+  CHECK_EQ(result, 1);
+
+  cv::subtract(a, b, *destination);
+}
+
+// Blurs source into *blurred and stores (blurred - source) into *difference,
+// preferring a generated Halide implementation and falling back to opencv
+// when none matches. None of the three images may overlap.
+void FastGaussianAndSubtract(const cv::Mat &source, cv::Mat *blurred,
+                             cv::Mat *difference, double sigma) {
+  CHECK_EQ(source.type(), CV_16SC1);
+  blurred->create(source.size(), source.type());
+  difference->create(source.size(), source.type());
+  // create() leaves a matching Mat untouched, so a caller passing aliased
+  // images would otherwise have the generated code read what it is writing.
+  CheckNonOverlapping(source, *blurred);
+  CheckNonOverlapping(source, *difference);
+  CheckNonOverlapping(*blurred, *difference);
+
+  int result = 1;
+  if (kUseFast) {
+    result = DoGeneratedFastGaussianAndSubtract(
+        MatToHalide<const int16_t>(source), MatToHalide<int16_t>(*blurred),
+        MatToHalide<int16_t>(*difference), sigma);
+  }
+  if (kPrintAll) {
+    LOG(INFO) << result << ": " << source.rows << " " << source.cols << " "
+              << std::setprecision(17) << sigma;
+  }
+  if (result == 0) {
+    return;
+  }
+  if (!IsSmall(source.size())) {
+    LOG(WARNING) << "slow gaussian blur: " << source.rows << " " << source.cols
+                 << " " << std::setprecision(17) << sigma;
+  }
+  CHECK_EQ(result, 1);
+
+  cv::GaussianBlur(source, *blurred, cv::Size(), sigma, sigma,
+                   cv::BORDER_REPLICATE);
+  cv::subtract(*blurred, source, *difference);
+}
+
+}  // namespace vision
+}  // namespace frc971
diff --git a/y2020/vision/sift/fast_gaussian.h b/y2020/vision/sift/fast_gaussian.h
new file mode 100644
index 0000000..580083f
--- /dev/null
+++ b/y2020/vision/sift/fast_gaussian.h
@@ -0,0 +1,44 @@
+#ifndef Y2020_VISION_SIFT_FAST_GAUSSIAN_H_
+#define Y2020_VISION_SIFT_FAST_GAUSSIAN_H_
+
+#include <type_traits>
+
+#include <opencv2/core/mat.hpp>
+#include "HalideBuffer.h"
+#include "glog/logging.h"
+
+namespace frc971 {
+namespace vision {
+
+// Returns a Halide buffer representing the data in mat.
+template <typename T>
+inline Halide::Runtime::Buffer<T, 2> MatToHalide(const cv::Mat &mat) {
+  CHECK_EQ(cv::DataType<typename std::remove_const<T>::type>::type, mat.type());
+  // Verify that at<T>(row, col) accesses this address:
+  //   data + sizeof(T) * (row * cols + col)
+  CHECK_EQ(mat.elemSize(), sizeof(T));
+  CHECK_EQ(mat.elemSize1(), sizeof(T));
+  CHECK_EQ(mat.step1(0), static_cast<size_t>(mat.cols));
+  CHECK_EQ(mat.step1(1), 1u);
+  CHECK_EQ(mat.dims, 2);
+  CHECK(mat.isContinuous());
+  return Halide::Runtime::Buffer<T, 2>(reinterpret_cast<T *>(mat.data),
+                                       mat.cols, mat.rows);
+}
+
+// Performs a gaussian blur with the specified sigma, truncated to a reasonable
+// width. Attempts to use faster implementations, but will fall back to
+// cv::GaussianBlur otherwise. Only handles a limited set of Mat formats.
+//
+// source and destination may not overlap.
+//
+// Always uses BORDER_REPLICATE mode.
+void FastGaussian(const cv::Mat &source, cv::Mat *destination, double sigma);
+void FastSubtract(const cv::Mat &a, const cv::Mat &b, cv::Mat *destination);
+void FastGaussianAndSubtract(const cv::Mat &source, cv::Mat *blurred,
+                             cv::Mat *difference, double sigma);
+
+}  // namespace vision
+}  // namespace frc971
+
+#endif  // Y2020_VISION_SIFT_FAST_GAUSSIAN_H_
diff --git a/y2020/vision/sift/fast_gaussian_generator.cc b/y2020/vision/sift/fast_gaussian_generator.cc
new file mode 100644
index 0000000..6418618
--- /dev/null
+++ b/y2020/vision/sift/fast_gaussian_generator.cc
@@ -0,0 +1,206 @@
+#include <opencv2/core/mat.hpp>
+#include <opencv2/imgproc.hpp>
+#include "Halide.h"
+#include "glog/logging.h"
+
+// This is a Halide "generator". This means it is a binary which generates
+// ahead-of-time optimized functions as directed by command-line arguments.
+// https://halide-lang.org/tutorials/tutorial_lesson_15_generators.html has an
+// introduction to much of the magic in this file.
+
+namespace frc971 {
+namespace vision {
+namespace {
+
+// Returns a function implementing a 1-dimensional gaussian blur convolution.
+Halide::Func GenerateBlur(std::string name, Halide::Func in, int col_step,
+                          int row_step, int radius, cv::Mat kernel,
+                          Halide::Var col, Halide::Var row) {
+  Halide::Expr expr = kernel.at<float>(0) * in(col, row);
+  for (int i = 1; i <= radius; ++i) {
+    expr += kernel.at<float>(i) * (in(col - i * col_step, row - i * row_step) +
+                                   in(col + i * col_step, row + i * row_step));
+  }
+  Halide::Func func(name);
+  func(col, row) = expr;
+  return func;
+}
+
+template <typename T>
+void SetRowMajor(T *buffer_parameter, int cols, int rows) {
+  buffer_parameter->dim(0).set_stride(1);
+  buffer_parameter->dim(0).set_extent(cols);
+  buffer_parameter->dim(0).set_min(0);
+  buffer_parameter->dim(1).set_stride(cols);
+  buffer_parameter->dim(1).set_extent(rows);
+  buffer_parameter->dim(1).set_min(0);
+}
+
+}  // namespace
+
+class GaussianGenerator : public Halide::Generator<GaussianGenerator> {
+ public:
+  GeneratorParam<int> cols{"cols", 0};
+  GeneratorParam<int> rows{"rows", 0};
+  GeneratorParam<double> sigma{"sigma", -1};
+  GeneratorParam<int> filter_width{"filter_width", 0};
+
+  Input<Buffer<int16_t>> input{"input", 2};
+  Output<Buffer<int16_t>> output{"output", 2};
+
+  // We use opencv's naming convention, instead of the (x, y) which most of the
+  // halide examples use. This is easier to keep straight with the row-major
+  // storage order we're using though.
+  // col is first because incrementing the data index by 1 moves over 1 column.
+  Var col{"col"}, row{"row"};
+
+  void generate() {
+    CHECK(cols > 0) << ": Must specify a cols";
+    CHECK(rows > 0) << ": Must specify a rows";
+    CHECK(sigma > 0) << ": Must specify a sigma";
+    CHECK(filter_width > 0) << ": Must specify a filter_width";
+    CHECK((filter_width % 2) == 1)
+        << ": Invalid filter_width: " << static_cast<int>(filter_width);
+
+    SetRowMajor(&input, cols, rows);
+
+    const int radius = (filter_width - 1) / 2;
+    const cv::Mat kernel =
+        cv::getGaussianKernel(filter_width, sigma, CV_32F)
+            .rowRange(radius, filter_width);
+
+    Halide::Func in_bounded = Halide::BoundaryConditions::repeat_edge(input);
+    Halide::Func blur_col =
+        GenerateBlur("blur_col", in_bounded, 1, 0, radius, kernel, col, row);
+    output(col, row) = Halide::cast<int16_t>(
+        GenerateBlur("blur_row", blur_col, 0, 1, radius, kernel, col, row)(col, row));
+
+    // Vectorize along the col dimension. Most of the data needed by each lane
+    // overlaps this way. This also has the advantage of being the first
+    // dimension, so as we move along it we will have good data locality.
+    blur_col.vectorize(col, 8);
+
+    // The fun part: we tile the algorithm. This tile size is designed to fit
+    // within each CPU core's L1 cache. On the boundaries of the tiles, we end
+    // up re-computing the first blur, but fitting in the L1 cache is worth it.
+    Halide::Var col_inner("col_inner"), row_inner("row_inner");
+    output.tile(col, row, col_inner, row_inner, 64, 32);
+    Halide::Var tile_index("tile_index");
+    output.fuse(col, row, tile_index);
+
+    // Compute the first blur as needed for the second one, within each tile.
+    blur_col.compute_at(output, tile_index);
+    // And then vectorize the second blur within each tile.
+    output.vectorize(col_inner, 8);
+
+    // Lastly, compute all the tiles in parallel.
+    output.parallel(tile_index);
+
+    SetRowMajor(&output, cols, rows);
+  }
+};
+
+class SubtractGenerator : public Halide::Generator<SubtractGenerator> {
+ public:
+  GeneratorParam<int> cols{"cols", 0};
+  GeneratorParam<int> rows{"rows", 0};
+
+  Input<Buffer<int16_t>> input_a{"input_a", 2};
+  Input<Buffer<int16_t>> input_b{"input_b", 2};
+  Output<Buffer<int16_t>> output{"output", 2};
+
+  Var col{"col"}, row{"row"};
+
+  void generate() {
+    CHECK(cols > 0) << ": Must specify a cols";
+    CHECK(rows > 0) << ": Must specify a rows";
+
+    SetRowMajor(&input_a, cols, rows);
+    SetRowMajor(&input_b, cols, rows);
+
+    output(col, row) = Halide::saturating_cast<int16_t>(
+        Halide::cast<int32_t>(input_a(col, row)) - input_b(col, row));
+    output.vectorize(col, 16);
+
+    SetRowMajor(&output, cols, rows);
+  }
+};
+
+class GaussianAndSubtractGenerator
+    : public Halide::Generator<GaussianAndSubtractGenerator> {
+ public:
+  GeneratorParam<int> cols{"cols", 0};
+  GeneratorParam<int> rows{"rows", 0};
+  GeneratorParam<double> sigma{"sigma", -1};
+  GeneratorParam<int> filter_width{"filter_width", 0};
+
+  Input<Buffer<int16_t>> input{"input", 2};
+  Output<Buffer<int16_t>> blurred{"blurred", 2};
+  Output<Buffer<int16_t>> difference{"difference", 2};
+
+  // We use opencv's naming convention, instead of the (x, y) which most of the
+  // halide examples use. This is easier to keep straight with the row-major
+  // storage order we're using though.
+  // col is first because incrementing the data index by 1 moves over 1 column.
+  Var col{"col"}, row{"row"};
+
+  void generate() {
+    CHECK(cols > 0) << ": Must specify a cols";
+    CHECK(rows > 0) << ": Must specify a rows";
+    CHECK(sigma > 0) << ": Must specify a sigma";
+    CHECK(filter_width > 0) << ": Must specify a filter_width";
+    CHECK((filter_width % 2) == 1)
+        << ": Invalid filter_width: " << static_cast<int>(filter_width);
+
+    SetRowMajor(&input, cols, rows);
+
+    const int radius = (filter_width - 1) / 2;
+    const cv::Mat kernel =
+        cv::getGaussianKernel(filter_width, sigma, CV_32F)
+            .rowRange(radius, filter_width);
+
+    Halide::Func in_bounded = Halide::BoundaryConditions::repeat_edge(input);
+    Halide::Func blur_col =
+        GenerateBlur("blur_col", in_bounded, 1, 0, radius, kernel, col, row);
+    blurred(col, row) = Halide::cast<int16_t>(
+        GenerateBlur("blur_row", blur_col, 0, 1, radius, kernel, col, row)(col, row));
+    difference(col, row) = Halide::saturating_cast<int16_t>(
+        Halide::cast<int32_t>(blurred(col, row)) - input(col, row));
+
+    // Vectorize along the col dimension. Most of the data needed by each lane
+    // overlaps this way. This also has the advantage of being the first
+    // dimension, so as we move along it we will have good data locality.
+    blur_col.vectorize(col, 8);
+
+    // The fun part: we tile the algorithm. This tile size is designed to fit
+    // within each CPU core's L1 cache. On the boundaries of the tiles, we end
+    // up re-computing the first blur, but fitting in the L1 cache is worth it.
+    Halide::Var col_inner("col_inner"), row_inner("row_inner");
+    blurred.tile(col, row, col_inner, row_inner, 64, 32);
+    Halide::Var tile_index("tile_index");
+    blurred.fuse(col, row, tile_index);
+
+    // Compute the first blur as needed for the second one, within each tile.
+    blur_col.compute_at(blurred, tile_index);
+    // And then vectorize the second blur within each tile.
+    blurred.vectorize(col_inner, 8);
+
+    // Lastly, compute all the tiles in parallel.
+    blurred.parallel(tile_index);
+    blurred.compute_root();
+
+    // TODO(Brian): Calculate difference within each of the tiles to speed
+    // things up.
+
+    SetRowMajor(&blurred, cols, rows);
+    SetRowMajor(&difference, cols, rows);
+  }
+};
+
+}  // namespace vision
+}  // namespace frc971
+
+HALIDE_REGISTER_GENERATOR(frc971::vision::GaussianGenerator, gaussian_generator)
+HALIDE_REGISTER_GENERATOR(frc971::vision::SubtractGenerator, subtract_generator)
+HALIDE_REGISTER_GENERATOR(frc971::vision::GaussianAndSubtractGenerator,
+                          gaussian_and_subtract_generator)
diff --git a/y2020/vision/sift/fast_gaussian_runner.py b/y2020/vision/sift/fast_gaussian_runner.py
new file mode 100755
index 0000000..9699fef
--- /dev/null
+++ b/y2020/vision/sift/fast_gaussian_runner.py
@@ -0,0 +1,214 @@
+"""Generates the Halide fast gaussian code for every configured size/sigma.
+
+Usage: fast_gaussian_runner.py PARAMS_JSON RULEDIR TARGET_CPU
+
+PARAMS_JSON has a 'sizes' list of [cols, rows] pairs and a 'sigmas' list of
+[sigma, sigma_name, filter_width] triples. The generator outputs are written
+into RULEDIR, along with a fast_gaussian_all.h which dispatches to them.
+"""
+
+from __future__ import print_function
+
+import json
+import sys
+import subprocess
+import os
+import threading
+
+from bazel_tools.tools.python.runfiles import runfiles
+
+def main(params):
+  """Writes fast_gaussian_all.h and runs the generator for each function.
+
+  Args:
+    params: The decoded PARAMS_JSON dictionary. RULEDIR and TARGET_CPU are
+        read from sys.argv directly.
+  """
+  r = runfiles.Create()
+  generator = r.Rlocation('org_frc971/y2020/vision/sift/fast_gaussian_generator')
+
+  ruledir = sys.argv[2]
+  target_cpu = sys.argv[3]
+
+  target = {
+      'armhf-debian': 'arm-32-linux-no_asserts',
+      'k8': 'x86-64-linux-no_asserts',
+  }[target_cpu]
+
+  commands = []
+
+  env = os.environ.copy()
+  env['LD_LIBRARY_PATH'] = ':'.join([
+      'debian_amd64_sysroot/lib/x86_64-linux-gnu',
+      'debian_amd64_sysroot/lib',
+      'debian_amd64_sysroot/usr/lib/x86_64-linux-gnu',
+      'debian_amd64_sysroot/usr/lib',
+  ])
+
+  all_header = [
+      '#ifndef Y2020_VISION_SIFT_FAST_GAUSSIAN_ALL_H_',
+      '#define Y2020_VISION_SIFT_FAST_GAUSSIAN_ALL_H_',
+      '#include "HalideBuffer.h"',
+  ]
+
+  for cols, rows in params['sizes']:
+    for sigma, sigma_name, filter_width in params['sigmas']:
+      name = "fast_gaussian_%dx%d_%s" % (cols, rows, sigma_name)
+
+      commands.append([
+          generator,
+          '-g', 'gaussian_generator',
+          '-o', ruledir,
+          '-f', name,
+          '-e', 'o,h,html',
+          'target=%s-no_runtime' % target,
+          'cols=%s' % cols,
+          'rows=%s' % rows,
+          'sigma=%s' % sigma,
+          'filter_width=%s' % filter_width,
+      ])
+      all_header += [
+          '#include "y2020/vision/sift/%s.h"' % name,
+      ]
+
+      name = "fast_gaussian_subtract_%dx%d_%s" % (cols, rows, sigma_name)
+
+      commands.append([
+          generator,
+          '-g', 'gaussian_and_subtract_generator',
+          '-o', ruledir,
+          '-f', name,
+          '-e', 'o,h,html',
+          'target=%s-no_runtime' % target,
+          'cols=%s' % cols,
+          'rows=%s' % rows,
+          'sigma=%s' % sigma,
+          'filter_width=%s' % filter_width,
+      ])
+      all_header += [
+          '#include "y2020/vision/sift/%s.h"' % name,
+      ]
+
+    name = 'fast_subtract_%dx%d' % (cols, rows)
+    commands.append([
+        generator,
+        '-g', 'subtract_generator',
+        '-o', ruledir,
+        '-f', name,
+        '-e', 'o,h,html',
+        'target=%s-no_runtime' % target,
+        'cols=%s' % cols,
+        'rows=%s' % rows,
+    ])
+    all_header += [
+        '#include "y2020/vision/sift/%s.h"' % name,
+    ]
+  # The Halide runtime is generated once and shared by every function above,
+  # which are all built with no_runtime.
+  commands.append([
+      generator,
+      '-r', 'fast_gaussian_runtime',
+      '-o', ruledir,
+      '-e', 'o',
+      'target=%s' % target,
+  ])
+
+  all_header += [
+      'namespace frc971 {',
+      'namespace vision {',
+      '// 0 is success. 1 is non-implemented size. Negative is a Halide error.',
+      'inline int DoGeneratedFastGaussian(',
+      '    Halide::Runtime::Buffer<const int16_t, 2> input,',
+      '    Halide::Runtime::Buffer<int16_t, 2> output,',
+      '    double sigma) {',
+  ]
+
+  for sigma, sigma_name, filter_width in params['sigmas']:
+    for cols, rows in params['sizes']:
+      name = "fast_gaussian_%dx%d_%s" % (cols, rows, sigma_name)
+      all_header += [
+          '  if (input.dim(0).extent() == %s' % cols,
+          '      && input.dim(1).extent() == %s' % rows,
+          '      && sigma == %s) {' % sigma,
+          '    return %s(input, output);' % name,
+          '  }',
+      ]
+
+  all_header += [
+      '  return 1;',
+      '}',
+      'inline int DoGeneratedFastGaussianAndSubtract(',
+      '    Halide::Runtime::Buffer<const int16_t, 2> input,',
+      '    Halide::Runtime::Buffer<int16_t, 2> blurred,',
+      '    Halide::Runtime::Buffer<int16_t, 2> difference,',
+      '    double sigma) {',
+  ]
+
+  for sigma, sigma_name, filter_width in params['sigmas']:
+    for cols, rows in params['sizes']:
+      name = "fast_gaussian_subtract_%dx%d_%s" % (cols, rows, sigma_name)
+      all_header += [
+          '  if (input.dim(0).extent() == %s' % cols,
+          '      && input.dim(1).extent() == %s' % rows,
+          '      && sigma == %s) {' % sigma,
+          '    return %s(input, blurred, difference);' % name,
+          '  }',
+      ]
+
+  all_header += [
+      '  return 1;',
+      '}',
+      'inline int DoGeneratedFastSubtract(',
+      '    Halide::Runtime::Buffer<const int16_t, 2> input_a,',
+      '    Halide::Runtime::Buffer<const int16_t, 2> input_b,',
+      '    Halide::Runtime::Buffer<int16_t, 2> output) {',
+  ]
+  for cols, rows in params['sizes']:
+    name = 'fast_subtract_%dx%d' % (cols, rows)
+    all_header += [
+        '  if (input_a.dim(0).extent() == %s' % cols,
+        '      && input_a.dim(1).extent() == %s) {' % rows,
+        '    return %s(input_a, input_b, output);' % name,
+        '  }',
+    ]
+  all_header += [
+      '  return 1;',
+      '}',
+      '}  // namespace vision',
+      '}  // namespace frc971',
+      '#endif  // Y2020_VISION_SIFT_FAST_GAUSSIAN_ALL_H_',
+  ]
+
+  with open(os.path.join(ruledir, 'fast_gaussian_all.h'), 'w') as f:
+    f.writelines([line + '\n' for line in all_header])
+
+  commands_lock = threading.Lock()
+  # Wrapped in a list so the nested run_commands can modify it in place.
+  success = [True]
+
+  def run_commands():
+    """Runs commands until none are left or one of them has failed."""
+    while True:
+      with commands_lock:
+        if not commands:
+          return
+        if not success[0]:
+          return
+        command = commands.pop()
+      try:
+        subprocess.check_call(command, env=env)
+      except:
+        # Tell the other workers to stop picking up new commands.
+        with commands_lock:
+          success[0] = False
+        raise
+  threads = [threading.Thread(target=run_commands) for _ in range(4)]
+  for thread in threads:
+    thread.start()
+  for thread in threads:
+    thread.join()
+  if not success[0]:
+    sys.exit(1)
+
+if __name__ == '__main__':
+  main(json.loads(sys.argv[1]))
diff --git a/y2020/vision/sift/sift971.cc b/y2020/vision/sift/sift971.cc
index 6223f77..7152906 100644
--- a/y2020/vision/sift/sift971.cc
+++ b/y2020/vision/sift/sift971.cc
@@ -111,6 +111,9 @@
 #include <stdarg.h>
 #include <opencv2/core/hal/hal.hpp>
 #include <opencv2/imgproc.hpp>
+#include "glog/logging.h"
+
+#include "y2020/vision/sift/fast_gaussian.h"
 
 using namespace cv;
 
@@ -158,7 +161,7 @@
 // factor used to convert floating-point descriptor to unsigned char
 static const float SIFT_INT_DESCR_FCTR = 512.f;
 
-#define DoG_TYPE_SHORT 0
+#define DoG_TYPE_SHORT 1
 #if DoG_TYPE_SHORT
 // intermediate type used for DoG pyramids
 typedef short sift_wt;
@@ -177,37 +180,7 @@
   scale = octave >= 0 ? 1.f / (1 << octave) : (float)(1 << -octave);
 }
 
-static Mat createInitialImage(const Mat &img, bool doubleImageSize,
-                              float sigma) {
-  Mat gray, gray_fpt;
-  if (img.channels() == 3 || img.channels() == 4) {
-    cvtColor(img, gray, COLOR_BGR2GRAY);
-    gray.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
-  } else
-    img.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
-
-  float sig_diff;
-
-  if (doubleImageSize) {
-    sig_diff = sqrtf(
-        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4, 0.01f));
-    Mat dbl;
-#if DoG_TYPE_SHORT
-    resize(gray_fpt, dbl, Size(gray_fpt.cols * 2, gray_fpt.rows * 2), 0, 0,
-           INTER_LINEAR_EXACT);
-#else
-    resize(gray_fpt, dbl, Size(gray_fpt.cols * 2, gray_fpt.rows * 2), 0, 0,
-           INTER_LINEAR);
-#endif
-    GaussianBlur(dbl, dbl, Size(), sig_diff, sig_diff);
-    return dbl;
-  } else {
-    sig_diff = sqrtf(
-        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA, 0.01f));
-    GaussianBlur(gray_fpt, gray_fpt, Size(), sig_diff, sig_diff);
-    return gray_fpt;
-  }
-}
+constexpr bool kLogTiming = false;
 
 }  // namespace
 
@@ -229,14 +202,19 @@
   for (int o = 0; o < nOctaves; o++) {
     for (int i = 0; i < nOctaveLayers + 3; i++) {
       Mat &dst = pyr[o * (nOctaveLayers + 3) + i];
-      if (o == 0 && i == 0) dst = base;
-      // base of new octave is halved image from end of previous octave
-      else if (i == 0) {
+      if (o == 0 && i == 0) {
+        dst = base;
+      } else if (i == 0) {
+        // base of new octave is halved image from end of previous octave
         const Mat &src = pyr[(o - 1) * (nOctaveLayers + 3) + nOctaveLayers];
         resize(src, dst, Size(src.cols / 2, src.rows / 2), 0, 0, INTER_NEAREST);
       } else {
         const Mat &src = pyr[o * (nOctaveLayers + 3) + i - 1];
-        GaussianBlur(src, dst, Size(), sig[i], sig[i]);
+        if (use_fast_gaussian_pyramid_) {
+          FastGaussian(src, &dst, sig[i]);
+        } else {
+          GaussianBlur(src, dst, Size(), sig[i], sig[i]);
+        }
       }
     }
   }
@@ -247,8 +225,12 @@
 class buildDoGPyramidComputer : public ParallelLoopBody {
  public:
   buildDoGPyramidComputer(int _nOctaveLayers, const std::vector<Mat> &_gpyr,
-                          std::vector<Mat> &_dogpyr)
-      : nOctaveLayers(_nOctaveLayers), gpyr(_gpyr), dogpyr(_dogpyr) {}
+                          std::vector<Mat> &_dogpyr,
+                          bool use_fast_subtract_dogpyr)
+      : nOctaveLayers(_nOctaveLayers),
+        gpyr(_gpyr),
+        dogpyr(_dogpyr),
+        use_fast_subtract_dogpyr_(use_fast_subtract_dogpyr) {}
 
   void operator()(const cv::Range &range) const override {
     const int begin = range.start;
@@ -260,15 +242,21 @@
 
       const Mat &src1 = gpyr[o * (nOctaveLayers + 3) + i];
       const Mat &src2 = gpyr[o * (nOctaveLayers + 3) + i + 1];
+      CHECK_EQ(a, o * (nOctaveLayers + 2) + i);
       Mat &dst = dogpyr[o * (nOctaveLayers + 2) + i];
-      subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
+      if (use_fast_subtract_dogpyr_) {
+        FastSubtract(src2, src1, &dst);
+      } else {
+        subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
+      }
     }
   }
 
  private:
-  int nOctaveLayers;
+  const int nOctaveLayers;
   const std::vector<Mat> &gpyr;
   std::vector<Mat> &dogpyr;
+  const bool use_fast_subtract_dogpyr_;
 };
 
 }  // namespace
@@ -278,8 +266,97 @@
   int nOctaves = (int)gpyr.size() / (nOctaveLayers + 3);
   dogpyr.resize(nOctaves * (nOctaveLayers + 2));
 
+#if 0
   parallel_for_(Range(0, nOctaves * (nOctaveLayers + 2)),
-                buildDoGPyramidComputer(nOctaveLayers, gpyr, dogpyr));
+                buildDoGPyramidComputer(nOctaveLayers, gpyr, dogpyr, use_fast_subtract_dogpyr_));
+#else
+  buildDoGPyramidComputer(
+      nOctaveLayers, gpyr, dogpyr,
+      use_fast_subtract_dogpyr_)(Range(0, nOctaves * (nOctaveLayers + 2)));
+#endif
+}
+
+// Builds the Gaussian pyramid and the difference-of-Gaussians pyramid.
+// base is the starting image; it becomes the first layer of the first octave.
+// gpyr receives the blurred images (nOctaveLayers + 3 per octave); each blur
+// is computed from the previous layer. dogpyr receives the differences of
+// adjacent gpyr layers. number_octaves is the number of octaves to calculate.
+void SIFT971_Impl::buildGaussianAndDifferencePyramid(
+    const cv::Mat &base, std::vector<cv::Mat> &gpyr,
+    std::vector<cv::Mat> &dogpyr, int number_octaves) const {
+  const int layers_per_octave = nOctaveLayers;
+  // We use the base (possibly after downscaling) as the first "blurred" image.
+  // Then we calculate 2 more blurs than the number of layers per octave.
+  // TODO(Brian): Why are there 2 extra?
+  const int gpyr_layers_per_octave = layers_per_octave + 3;
+  // There is 1 less difference than the number of blurs.
+  const int dogpyr_layers_per_octave = gpyr_layers_per_octave - 1;
+  gpyr.resize(number_octaves * gpyr_layers_per_octave);
+  dogpyr.resize(number_octaves * dogpyr_layers_per_octave);
+
+  std::vector<double> sig(gpyr_layers_per_octave);
+  // Precompute the incremental sigma needed to go from each layer to the next:
+  //  \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
+  sig[0] = sigma;
+  double k = std::pow(2., 1. / layers_per_octave);
+  for (int i = 1; i < gpyr_layers_per_octave; i++) {
+    double sig_prev = std::pow<double>(k, i - 1) * sigma;
+    double sig_total = sig_prev * k;
+    sig[i] = std::sqrt(sig_total * sig_total - sig_prev * sig_prev);
+  }
+
+  for (int octave = 0; octave < number_octaves; octave++) {
+    // At the beginning of each octave, calculate the new base image.
+    {
+      Mat &dst = gpyr[octave * gpyr_layers_per_octave];
+      if (octave == 0) {
+        // For the first octave, it's just the base image.
+        dst = base;
+      } else {
+        // For the other octaves, halve the previous octave's layer whose total
+        // blur is 2 * sigma, matching buildGaussianPyramid.
+        const Mat &src = gpyr[(octave - 1) * gpyr_layers_per_octave +
+                              layers_per_octave];
+        resize(src, dst, Size(src.cols / 2, src.rows / 2), 0, 0, INTER_NEAREST);
+      }
+    }
+    // We start with layer==1 because the "first layer" is just the base image
+    // (or a downscaled version of it).
+    for (int layer = 1; layer < gpyr_layers_per_octave; layer++) {
+      // The index of the current layer within gpyr.
+      const int layer_index = octave * gpyr_layers_per_octave + layer;
+      if (use_fast_pyramid_difference_) {
+        const Mat &input = gpyr[layer_index - 1];
+        Mat &blurred = gpyr[layer_index];
+        Mat &difference =
+            dogpyr[octave * dogpyr_layers_per_octave + (layer - 1)];
+        FastGaussianAndSubtract(input, &blurred, &difference, sig[layer]);
+      } else {
+        // First, calculate the new gaussian blur.
+        {
+          const Mat &src = gpyr[layer_index - 1];
+          Mat &dst = gpyr[layer_index];
+          if (use_fast_gaussian_pyramid_) {
+            FastGaussian(src, &dst, sig[layer]);
+          } else {
+            GaussianBlur(src, dst, Size(), sig[layer], sig[layer]);
+          }
+        }
+
+        // Then, calculate the difference from the previous one.
+        {
+          const Mat &src1 = gpyr[layer_index - 1];
+          const Mat &src2 = gpyr[layer_index];
+          Mat &dst = dogpyr[octave * dogpyr_layers_per_octave + (layer - 1)];
+          if (use_fast_subtract_dogpyr_) {
+            FastSubtract(src2, src1, &dst);
+          } else {
+            subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
+          }
+        }
+      }
+    }
+  }
 }
 
 namespace {
@@ -1073,7 +1150,7 @@
                                     std::vector<KeyPoint> &keypoints,
                                     OutputArray _descriptors,
                                     bool useProvidedKeypoints) {
-  int firstOctave = -1, actualNOctaves = 0, actualNLayers = 0;
+  int firstOctave = 0, actualNOctaves = 0, actualNLayers = 0;
   Mat image = _image.getMat(), mask = _mask.getMat();
 
   if (image.empty() || image.depth() != CV_8U)
@@ -1084,6 +1161,7 @@
     CV_Error(Error::StsBadArg, "mask has incorrect type (!=CV_8UC1)");
 
   if (useProvidedKeypoints) {
+    LOG_IF(INFO, kLogTiming);
     firstOctave = 0;
     int maxOctave = INT_MIN;
     for (size_t i = 0; i < keypoints.size(); i++) {
@@ -1100,35 +1178,39 @@
     actualNOctaves = maxOctave - firstOctave + 1;
   }
 
-  Mat base = createInitialImage(image, firstOctave < 0, (float)sigma);
+  LOG_IF(INFO, kLogTiming);
+  Mat base = createInitialImage(image, firstOctave < 0);
+  LOG_IF(INFO, kLogTiming);
   std::vector<Mat> gpyr;
-  int nOctaves =
-      actualNOctaves > 0
-          ? actualNOctaves
-          : cvRound(std::log((double)std::min(base.cols, base.rows)) /
-                        std::log(2.) -
-                    2) -
-                firstOctave;
-
-  // double t, tf = getTickFrequency();
-  // t = (double)getTickCount();
-  buildGaussianPyramid(base, gpyr, nOctaves);
-
-  // t = (double)getTickCount() - t;
-  // printf("pyramid construction time: %g\n", t*1000./tf);
+  int nOctaves;
+  if (actualNOctaves > 0) {
+    nOctaves = actualNOctaves;
+  } else {
+    nOctaves = cvRound(std::log((double)std::min(base.cols, base.rows)) /
+                           std::log(2.) -
+                       2) -
+               firstOctave;
+  }
 
   if (!useProvidedKeypoints) {
     std::vector<Mat> dogpyr;
-    buildDoGPyramid(gpyr, dogpyr);
-    // t = (double)getTickCount();
+    if (use_fused_pyramid_difference_) {
+      buildGaussianAndDifferencePyramid(base, gpyr, dogpyr, nOctaves);
+      LOG_IF(INFO, kLogTiming);
+    } else {
+      buildGaussianPyramid(base, gpyr, nOctaves);
+      LOG_IF(INFO, kLogTiming);
+
+      buildDoGPyramid(gpyr, dogpyr);
+      LOG_IF(INFO, kLogTiming);
+    }
+
     findScaleSpaceExtrema(gpyr, dogpyr, keypoints);
     // TODO(Brian): I think it can go faster by knowing they're sorted?
     // KeyPointsFilter::removeDuplicatedSorted( keypoints );
     KeyPointsFilter::removeDuplicated(keypoints);
 
     if (nfeatures > 0) KeyPointsFilter::retainBest(keypoints, nfeatures);
-    // t = (double)getTickCount() - t;
-    // printf("keypoint detection time: %g\n", t*1000./tf);
 
     if (firstOctave < 0)
       for (size_t i = 0; i < keypoints.size(); i++) {
@@ -1140,20 +1222,54 @@
       }
 
     if (!mask.empty()) KeyPointsFilter::runByPixelsMask(keypoints, mask);
+    LOG_IF(INFO, kLogTiming);
   } else {
+    buildGaussianPyramid(base, gpyr, nOctaves);
+    LOG_IF(INFO, kLogTiming);
     // filter keypoints by mask
     // KeyPointsFilter::runByPixelsMask( keypoints, mask );
   }
 
   if (_descriptors.needed()) {
-    // t = (double)getTickCount();
     int dsize = descriptorSize();
     _descriptors.create((int)keypoints.size(), dsize, CV_32F);
     Mat descriptors = _descriptors.getMat();
 
     calcDescriptors(gpyr, keypoints, descriptors, nOctaveLayers, firstOctave);
-    // t = (double)getTickCount() - t;
-    // printf("descriptor extraction time: %g\n", t*1000./tf);
+    LOG_IF(INFO, kLogTiming);
+  }
+}
+
+Mat SIFT971_Impl::createInitialImage(const Mat &img,
+                                     bool doubleImageSize) const {
+  Mat gray, gray_fpt;  // gray_fpt is the input converted to the sift_wt type.
+  if (img.channels() == 3 || img.channels() == 4) {
+    cvtColor(img, gray, COLOR_BGR2GRAY);
+    gray.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
+  } else {
+    img.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
+  }
+
+  float sig_diff;  // Sigma for the blur applied to the initial image below.
+
+  Mat maybe_doubled;  // gray_fpt, upscaled 2x iff doubleImageSize.
+  if (doubleImageSize) {
+    sig_diff = std::sqrt(
+        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4, 0.01));
+    resize(gray_fpt, maybe_doubled, Size(gray_fpt.cols * 2, gray_fpt.rows * 2),
+           0, 0, INTER_LINEAR);
+  } else {
+    sig_diff = std::sqrt(
+        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA, 0.01));
+    maybe_doubled = gray_fpt;
+  }
+  if (use_fast_guassian_initial_) {
+    Mat temp;  // FastGaussian is given a separate output Mat.
+    FastGaussian(maybe_doubled, &temp, sig_diff);
+    return temp;
+  } else {
+    GaussianBlur(maybe_doubled, maybe_doubled, Size(), sig_diff, sig_diff);
+    return maybe_doubled;
   }
 }
 
diff --git a/y2020/vision/sift/sift971.h b/y2020/vision/sift/sift971.h
index d58dec8..b351d70 100644
--- a/y2020/vision/sift/sift971.h
+++ b/y2020/vision/sift/sift971.h
@@ -41,6 +41,10 @@
                             int nOctaves) const;
   void buildDoGPyramid(const std::vector<cv::Mat> &pyr,
                        std::vector<cv::Mat> &dogpyr) const;
+  void buildGaussianAndDifferencePyramid(const cv::Mat &base,
+                                         std::vector<cv::Mat> &pyr,
+                                         std::vector<cv::Mat> &dogpyr,
+                                         int nOctaves) const;
   void findScaleSpaceExtrema(const std::vector<cv::Mat> &gauss_pyr,
                              const std::vector<cv::Mat> &dog_pyr,
                              std::vector<cv::KeyPoint> &keypoints) const;
@@ -51,6 +55,15 @@
   CV_PROP_RW double contrastThreshold;
   CV_PROP_RW double edgeThreshold;
   CV_PROP_RW double sigma;
+
+ private:
+  cv::Mat createInitialImage(const cv::Mat &img, bool doubleImageSize) const;
+
+  bool use_fast_gaussian_pyramid_ = true;
+  bool use_fast_subtract_dogpyr_ = true;
+  bool use_fast_guassian_initial_ = true;
+  bool use_fused_pyramid_difference_ = true;
+  bool use_fast_pyramid_difference_ = true;
 };
 
 }  // namespace vision
diff --git a/y2020/vision/sift/testing_sift.cc b/y2020/vision/sift/testing_sift.cc
new file mode 100644
index 0000000..d4b1306
--- /dev/null
+++ b/y2020/vision/sift/testing_sift.cc
@@ -0,0 +1,87 @@
+#include <memory>
+
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include "aos/init.h"
+#include "aos/time/time.h"
+#include "y2020/vision/sift/fast_gaussian.h"
+#include "glog/logging.h"
+#include "y2020/vision/sift/sift971.h"
+
+DEFINE_string(image, "", "Image to test with");
+
+int main(int argc, char **argv) {
+  aos::InitGoogle(&argc, &argv);
+  // Benchmarks SIFT971_Impl on --image; OpenCV is limited to 4 threads.
+  cv::setNumThreads(4);
+
+  const cv::Mat raw_image = cv::imread(FLAGS_image);
+  CHECK(!raw_image.empty()) << ": Failed to read: " << FLAGS_image;
+  CHECK_EQ(CV_8UC3, raw_image.type());
+#if 0
+  cv::Mat color_image;
+  raw_image.convertTo(color_image, CV_32F, 1.0/255.0);
+  cv::Mat image;
+  cv::cvtColor(color_image, image, cv::COLOR_BGR2GRAY);
+#else
+  cv::Mat gray_image;
+  cv::cvtColor(raw_image, gray_image, cv::COLOR_BGR2GRAY);
+  cv::Mat float_image;
+#if 0
+  gray_image.convertTo(float_image, CV_32F, 0.00390625);
+#else
+  float_image = gray_image;  // Stays 8-bit: detectAndCompute checks CV_8U.
+#endif
+  cv::Mat image;
+  cv::resize(float_image, image, cv::Size(1280, 720), 0, 0, cv::INTER_AREA);
+#endif
+#if 0
+#if 0
+  cv::namedWindow("source", cv::WINDOW_AUTOSIZE);
+  cv::imshow("source", raw_image);
+  cv::namedWindow("converted", cv::WINDOW_AUTOSIZE);
+  cv::imshow("converted", image);
+#endif
+
+  cv::Mat slow_blurred, fast_blurred;
+  const double sigma = 3.0900155872895909;
+  cv::GaussianBlur(image, slow_blurred, cv::Size(9, 9), sigma, sigma);
+  frc971::vision::FastGaussian(image, &fast_blurred, sigma);
+  cv::namedWindow("slow", cv::WINDOW_AUTOSIZE);
+  cv::imshow("slow", slow_blurred);
+  cv::namedWindow("fast", cv::WINDOW_AUTOSIZE);
+  cv::imshow("fast", fast_blurred);
+  cv::waitKey(0);
+  return 0;
+#endif
+  // Untimed first run; the bare LOG(INFO)s presumably just mark timestamps.
+  LOG(INFO);
+  auto sift = std::make_unique<frc971::vision::SIFT971_Impl>();
+  std::vector<cv::KeyPoint> keypoints;
+  cv::Mat descriptors;
+  LOG(INFO) << "detectAndCompute on " << image.cols << "x" << image.rows;
+  sift->detectAndCompute(image, cv::noArray(), keypoints, descriptors);
+  LOG(INFO);
+
+#if 0
+  return 0;
+#endif
+  // Timed runs: logs the mean seconds per detectAndCompute call.
+  static constexpr int kIterations = 40;
+  const auto start = aos::monotonic_clock::now();
+  for (int i = 0; i < kIterations; ++i) {
+    keypoints.clear();
+    descriptors.release();
+    sift->detectAndCompute(image, cv::noArray(), keypoints, descriptors);
+  }
+  const auto end = aos::monotonic_clock::now();
+  LOG(INFO)
+      << "Took: "
+      << (std::chrono::duration<double>(end - start) / kIterations).count();
+  // Should be ~352 for FRC-Image4-cleaned.png downscaled to 640x360.
+  // 376 in DoG_TYPE_SHORT mode.
+  // 344 now with 1280x720 non-upscaled.
+  LOG(INFO) << "found " << keypoints.size() << " and " << descriptors.size();
+}