Make SIFT faster

This uses various Halide-optimized functions to do the actual image
processing. It still finds around the same number of features, but runs
much faster.

Change-Id: I9d7f7093b0ec41acf7ed16b2c91cdadada2f9a22
diff --git a/y2020/vision/sift/sift971.cc b/y2020/vision/sift/sift971.cc
index 6223f77..7152906 100644
--- a/y2020/vision/sift/sift971.cc
+++ b/y2020/vision/sift/sift971.cc
@@ -111,6 +111,9 @@
 #include <stdarg.h>
 #include <opencv2/core/hal/hal.hpp>
 #include <opencv2/imgproc.hpp>
+#include "glog/logging.h"
+
+#include "y2020/vision/sift/fast_gaussian.h"
 
 using namespace cv;
 
@@ -158,7 +161,7 @@
 // factor used to convert floating-point descriptor to unsigned char
 static const float SIFT_INT_DESCR_FCTR = 512.f;
 
-#define DoG_TYPE_SHORT 0
+#define DoG_TYPE_SHORT 1
 #if DoG_TYPE_SHORT
 // intermediate type used for DoG pyramids
 typedef short sift_wt;
@@ -177,37 +180,7 @@
   scale = octave >= 0 ? 1.f / (1 << octave) : (float)(1 << -octave);
 }
 
-static Mat createInitialImage(const Mat &img, bool doubleImageSize,
-                              float sigma) {
-  Mat gray, gray_fpt;
-  if (img.channels() == 3 || img.channels() == 4) {
-    cvtColor(img, gray, COLOR_BGR2GRAY);
-    gray.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
-  } else
-    img.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
-
-  float sig_diff;
-
-  if (doubleImageSize) {
-    sig_diff = sqrtf(
-        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4, 0.01f));
-    Mat dbl;
-#if DoG_TYPE_SHORT
-    resize(gray_fpt, dbl, Size(gray_fpt.cols * 2, gray_fpt.rows * 2), 0, 0,
-           INTER_LINEAR_EXACT);
-#else
-    resize(gray_fpt, dbl, Size(gray_fpt.cols * 2, gray_fpt.rows * 2), 0, 0,
-           INTER_LINEAR);
-#endif
-    GaussianBlur(dbl, dbl, Size(), sig_diff, sig_diff);
-    return dbl;
-  } else {
-    sig_diff = sqrtf(
-        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA, 0.01f));
-    GaussianBlur(gray_fpt, gray_fpt, Size(), sig_diff, sig_diff);
-    return gray_fpt;
-  }
-}
+constexpr bool kLogTiming = false;
 
 }  // namespace
 
@@ -229,14 +202,19 @@
   for (int o = 0; o < nOctaves; o++) {
     for (int i = 0; i < nOctaveLayers + 3; i++) {
       Mat &dst = pyr[o * (nOctaveLayers + 3) + i];
-      if (o == 0 && i == 0) dst = base;
-      // base of new octave is halved image from end of previous octave
-      else if (i == 0) {
+      if (o == 0 && i == 0) {
+        dst = base;
+      } else if (i == 0) {
+        // base of new octave is halved image from end of previous octave
         const Mat &src = pyr[(o - 1) * (nOctaveLayers + 3) + nOctaveLayers];
         resize(src, dst, Size(src.cols / 2, src.rows / 2), 0, 0, INTER_NEAREST);
       } else {
         const Mat &src = pyr[o * (nOctaveLayers + 3) + i - 1];
-        GaussianBlur(src, dst, Size(), sig[i], sig[i]);
+        if (use_fast_gaussian_pyramid_) {
+          FastGaussian(src, &dst, sig[i]);
+        } else {
+          GaussianBlur(src, dst, Size(), sig[i], sig[i]);
+        }
       }
     }
   }
@@ -247,8 +225,12 @@
 class buildDoGPyramidComputer : public ParallelLoopBody {
  public:
   buildDoGPyramidComputer(int _nOctaveLayers, const std::vector<Mat> &_gpyr,
-                          std::vector<Mat> &_dogpyr)
-      : nOctaveLayers(_nOctaveLayers), gpyr(_gpyr), dogpyr(_dogpyr) {}
+                          std::vector<Mat> &_dogpyr,
+                          bool use_fast_subtract_dogpyr)
+      : nOctaveLayers(_nOctaveLayers),
+        gpyr(_gpyr),
+        dogpyr(_dogpyr),
+        use_fast_subtract_dogpyr_(use_fast_subtract_dogpyr) {}
 
   void operator()(const cv::Range &range) const override {
     const int begin = range.start;
@@ -260,15 +242,21 @@
 
       const Mat &src1 = gpyr[o * (nOctaveLayers + 3) + i];
       const Mat &src2 = gpyr[o * (nOctaveLayers + 3) + i + 1];
+      CHECK_EQ(a, o * (nOctaveLayers + 2) + i);
       Mat &dst = dogpyr[o * (nOctaveLayers + 2) + i];
-      subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
+      if (use_fast_subtract_dogpyr_) {
+        FastSubtract(src2, src1, &dst);
+      } else {
+        subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
+      }
     }
   }
 
  private:
-  int nOctaveLayers;
+  const int nOctaveLayers;
   const std::vector<Mat> &gpyr;
   std::vector<Mat> &dogpyr;
+  const bool use_fast_subtract_dogpyr_;
 };
 
 }  // namespace
@@ -278,8 +266,97 @@
   int nOctaves = (int)gpyr.size() / (nOctaveLayers + 3);
   dogpyr.resize(nOctaves * (nOctaveLayers + 2));
 
+#if 0
   parallel_for_(Range(0, nOctaves * (nOctaveLayers + 2)),
-                buildDoGPyramidComputer(nOctaveLayers, gpyr, dogpyr));
+                buildDoGPyramidComputer(nOctaveLayers, gpyr, dogpyr, use_fast_subtract_dogpyr_));
+#else
+  buildDoGPyramidComputer(
+      nOctaveLayers, gpyr, dogpyr,
+      use_fast_subtract_dogpyr_)(Range(0, nOctaves * (nOctaveLayers + 2)));
+#endif
+}
+
+// base is the image to start with.
+// gpyr is the pyramid of gaussian blurs. This is both an output and a place
+// where we store intermediates.
+// dogpyr is the pyramid of gaussian differences which we fill out.
+// number_octaves is the number of octaves to calculate.
+void SIFT971_Impl::buildGaussianAndDifferencePyramid(
+    const cv::Mat &base, std::vector<cv::Mat> &gpyr,
+    std::vector<cv::Mat> &dogpyr, int number_octaves) const {
+  const int layers_per_octave = nOctaveLayers;
+  // We use the base (possibly after downscaling) as the first "blurred" image.
+  // Then we calculate 2 more blurred images than the number of layers.
+  // TODO(Brian): Why are there 2 extra?
+  const int gpyr_layers_per_octave = layers_per_octave + 3;
+  // There is 1 less difference than the number of blurs.
+  const int dogpyr_layers_per_octave = gpyr_layers_per_octave - 1;
+  gpyr.resize(number_octaves * gpyr_layers_per_octave);
+  dogpyr.resize(number_octaves * dogpyr_layers_per_octave);
+
+  std::vector<double> sig(gpyr_layers_per_octave);
+  // precompute Gaussian sigmas using the following formula:
+  //  \sigma_{total}^2 = \sigma_{i}^2 + \sigma_{i-1}^2
+  sig[0] = sigma;
+  double k = std::pow(2., 1. / layers_per_octave);
+  for (int i = 1; i < gpyr_layers_per_octave; i++) {
+    double sig_prev = std::pow<double>(k, i - 1) * sigma;
+    double sig_total = sig_prev * k;
+    sig[i] = std::sqrt(sig_total * sig_total - sig_prev * sig_prev);
+  }
+
+  for (int octave = 0; octave < number_octaves; octave++) {
+    // At the beginning of each octave, calculate the new base image.
+    {
+      Mat &dst = gpyr[octave * gpyr_layers_per_octave];
+      if (octave == 0) {
+        // For the first octave, it's just the base image.
+        dst = base;
+      } else {
+        // For the other octaves, it's a halved version of the end of the
+        // previous octave.
+        const Mat &src = gpyr[(octave - 1) * gpyr_layers_per_octave +
+                              gpyr_layers_per_octave - 1];
+        resize(src, dst, Size(src.cols / 2, src.rows / 2), 0, 0, INTER_NEAREST);
+      }
+    }
+    // We start with layer==1 because the "first layer" is just the base image
+    // (or a downscaled version of it).
+    for (int layer = 1; layer < gpyr_layers_per_octave; layer++) {
+      // The index where the current layer starts.
+      const int layer_index = octave * gpyr_layers_per_octave + layer;
+      if (use_fast_pyramid_difference_) {
+        const Mat &input = gpyr[layer_index - 1];
+        Mat &blurred = gpyr[layer_index];
+        Mat &difference =
+            dogpyr[octave * dogpyr_layers_per_octave + (layer - 1)];
+        FastGaussianAndSubtract(input, &blurred, &difference, sig[layer]);
+      } else {
+        // First, calculate the new gaussian blur.
+        {
+          const Mat &src = gpyr[layer_index - 1];
+          Mat &dst = gpyr[layer_index];
+          if (use_fast_gaussian_pyramid_) {
+            FastGaussian(src, &dst, sig[layer]);
+          } else {
+            GaussianBlur(src, dst, Size(), sig[layer], sig[layer]);
+          }
+        }
+
+        // Then, calculate the difference from the previous one.
+        {
+          const Mat &src1 = gpyr[layer_index - 1];
+          const Mat &src2 = gpyr[layer_index];
+          Mat &dst = dogpyr[octave * dogpyr_layers_per_octave + (layer - 1)];
+          if (use_fast_subtract_dogpyr_) {
+            FastSubtract(src2, src1, &dst);
+          } else {
+            subtract(src2, src1, dst, noArray(), DataType<sift_wt>::type);
+          }
+        }
+      }
+    }
+  }
 }
 
 namespace {
@@ -1073,7 +1150,7 @@
                                     std::vector<KeyPoint> &keypoints,
                                     OutputArray _descriptors,
                                     bool useProvidedKeypoints) {
-  int firstOctave = -1, actualNOctaves = 0, actualNLayers = 0;
+  int firstOctave = 0, actualNOctaves = 0, actualNLayers = 0;
   Mat image = _image.getMat(), mask = _mask.getMat();
 
   if (image.empty() || image.depth() != CV_8U)
@@ -1084,6 +1161,7 @@
     CV_Error(Error::StsBadArg, "mask has incorrect type (!=CV_8UC1)");
 
   if (useProvidedKeypoints) {
+    LOG_IF(INFO, kLogTiming);
     firstOctave = 0;
     int maxOctave = INT_MIN;
     for (size_t i = 0; i < keypoints.size(); i++) {
@@ -1100,35 +1178,39 @@
     actualNOctaves = maxOctave - firstOctave + 1;
   }
 
-  Mat base = createInitialImage(image, firstOctave < 0, (float)sigma);
+  LOG_IF(INFO, kLogTiming);
+  Mat base = createInitialImage(image, firstOctave < 0);
+  LOG_IF(INFO, kLogTiming);
   std::vector<Mat> gpyr;
-  int nOctaves =
-      actualNOctaves > 0
-          ? actualNOctaves
-          : cvRound(std::log((double)std::min(base.cols, base.rows)) /
-                        std::log(2.) -
-                    2) -
-                firstOctave;
-
-  // double t, tf = getTickFrequency();
-  // t = (double)getTickCount();
-  buildGaussianPyramid(base, gpyr, nOctaves);
-
-  // t = (double)getTickCount() - t;
-  // printf("pyramid construction time: %g\n", t*1000./tf);
+  int nOctaves;
+  if (actualNOctaves > 0) {
+    nOctaves = actualNOctaves;
+  } else {
+    nOctaves = cvRound(std::log((double)std::min(base.cols, base.rows)) /
+                           std::log(2.) -
+                       2) -
+               firstOctave;
+  }
 
   if (!useProvidedKeypoints) {
     std::vector<Mat> dogpyr;
-    buildDoGPyramid(gpyr, dogpyr);
-    // t = (double)getTickCount();
+    if (use_fused_pyramid_difference_) {
+      buildGaussianAndDifferencePyramid(base, gpyr, dogpyr, nOctaves);
+      LOG_IF(INFO, kLogTiming);
+    } else {
+      buildGaussianPyramid(base, gpyr, nOctaves);
+      LOG_IF(INFO, kLogTiming);
+
+      buildDoGPyramid(gpyr, dogpyr);
+      LOG_IF(INFO, kLogTiming);
+    }
+
     findScaleSpaceExtrema(gpyr, dogpyr, keypoints);
     // TODO(Brian): I think it can go faster by knowing they're sorted?
     // KeyPointsFilter::removeDuplicatedSorted( keypoints );
     KeyPointsFilter::removeDuplicated(keypoints);
 
     if (nfeatures > 0) KeyPointsFilter::retainBest(keypoints, nfeatures);
-    // t = (double)getTickCount() - t;
-    // printf("keypoint detection time: %g\n", t*1000./tf);
 
     if (firstOctave < 0)
       for (size_t i = 0; i < keypoints.size(); i++) {
@@ -1140,20 +1222,54 @@
       }
 
     if (!mask.empty()) KeyPointsFilter::runByPixelsMask(keypoints, mask);
+    LOG_IF(INFO, kLogTiming);
   } else {
+    buildGaussianPyramid(base, gpyr, nOctaves);
+    LOG_IF(INFO, kLogTiming);
     // filter keypoints by mask
     // KeyPointsFilter::runByPixelsMask( keypoints, mask );
   }
 
   if (_descriptors.needed()) {
-    // t = (double)getTickCount();
     int dsize = descriptorSize();
     _descriptors.create((int)keypoints.size(), dsize, CV_32F);
     Mat descriptors = _descriptors.getMat();
 
     calcDescriptors(gpyr, keypoints, descriptors, nOctaveLayers, firstOctave);
-    // t = (double)getTickCount() - t;
-    // printf("descriptor extraction time: %g\n", t*1000./tf);
+    LOG_IF(INFO, kLogTiming);
+  }
+}
+
+Mat SIFT971_Impl::createInitialImage(const Mat &img,
+                                     bool doubleImageSize) const {
+  Mat gray, gray_fpt;
+  if (img.channels() == 3 || img.channels() == 4) {
+    cvtColor(img, gray, COLOR_BGR2GRAY);
+    gray.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
+  } else {
+    img.convertTo(gray_fpt, DataType<sift_wt>::type, SIFT_FIXPT_SCALE, 0);
+  }
+
+  float sig_diff;
+
+  Mat maybe_doubled;
+  if (doubleImageSize) {
+    sig_diff = std::sqrt(
+        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA * 4, 0.01));
+    resize(gray_fpt, maybe_doubled, Size(gray_fpt.cols * 2, gray_fpt.rows * 2),
+           0, 0, INTER_LINEAR);
+  } else {
+    sig_diff = std::sqrt(
+        std::max(sigma * sigma - SIFT_INIT_SIGMA * SIFT_INIT_SIGMA, 0.01));
+    maybe_doubled = gray_fpt;
+  }
+  if (use_fast_guassian_initial_) {
+    Mat temp;
+    FastGaussian(maybe_doubled, &temp, sig_diff);
+    return temp;
+  } else {
+    GaussianBlur(maybe_doubled, maybe_doubled, Size(), sig_diff, sig_diff);
+    return maybe_doubled;
   }
 }