Use a solver for hub estimation

Uses ceres to estimate the rotation and distance
from the hub to the camera.
Ready for reviewing

Signed-off-by: Milind Upadhyay <milind.upadhyay@gmail.com>
Change-Id: Idd4fb3334146a0587c713887626cb0abe43cdd4e
diff --git a/y2022/vision/target_estimator.cc b/y2022/vision/target_estimator.cc
index 1ac3d55..3ebec95 100644
--- a/y2022/vision/target_estimator.cc
+++ b/y2022/vision/target_estimator.cc
@@ -1,31 +1,339 @@
 #include "y2022/vision/target_estimator.h"
 
+#include "absl/strings/str_format.h"
+#include "aos/time/time.h"
+#include "ceres/ceres.h"
+#include "frc971/control_loops/quaternion_utils.h"
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/eigen.hpp"
+#include "opencv2/features2d.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgproc.hpp"
+
+DEFINE_bool(freeze_roll, true, "If true, don't solve for roll");
+DEFINE_bool(freeze_pitch, false, "If true, don't solve for pitch");
+DEFINE_bool(freeze_yaw, false, "If true, don't solve for yaw");
+DEFINE_bool(freeze_camera_height, true,
+            "If true, don't solve for camera height");
+DEFINE_bool(freeze_angle_to_camera, true,
+            "If true, don't solve for polar angle to camera");
+
+DEFINE_uint64(max_num_iterations, 200,
+              "Maximum number of iterations for the ceres solver");
+DEFINE_bool(solver_output, false,
+            "If true, log the solver progress and results");
+
 namespace y2022::vision {
 
-void TargetEstimator::EstimateTargetLocation(cv::Point2i centroid,
-                                             const cv::Mat &intrinsics,
-                                             const cv::Mat &extrinsics,
-                                             TargetEstimate::Builder *builder) {
-  const cv::Point2d focal_length(intrinsics.at<double>(0, 0),
-                                 intrinsics.at<double>(1, 1));
-  const cv::Point2d offset(intrinsics.at<double>(0, 2),
-                           intrinsics.at<double>(1, 2));
+std::vector<cv::Point3d> TargetEstimator::ComputeTapePoints() {
+  std::vector<cv::Point3d> tape_points;
+  tape_points.reserve(kNumPiecesOfTape);
 
-  // Blob pitch in camera reference frame
-  const double blob_pitch =
-      std::atan(static_cast<double>(-(centroid.y - offset.y)) /
-                static_cast<double>(focal_length.y));
-  const double camera_height = extrinsics.at<double>(2, 3);
-  // Depth from camera to blob
-  const double depth = (kTapeHeight - camera_height) / std::tan(blob_pitch);
+  constexpr size_t kNumVisiblePiecesOfTape = 5;
+  for (size_t i = 0; i < kNumVisiblePiecesOfTape; i++) {
+    // The center piece of tape is at 0 rad, so the angle indices are offset
+    // by the number of pieces of tape on each side of it
+    const double theta_index =
+        static_cast<double>(i) - ((kNumVisiblePiecesOfTape - 1) / 2);
+    // The polar angle is a multiple of the angle between tape centers
+    double theta = theta_index * ((2.0 * M_PI) / kNumPiecesOfTape);
+    tape_points.emplace_back(kUpperHubRadius * std::cos(theta),
+                             kUpperHubRadius * std::sin(theta), kTapeHeight);
+  }
 
-  double angle_to_target =
-      std::atan2(static_cast<double>(centroid.x - offset.x),
-                 static_cast<double>(focal_length.x));
-  double distance = (depth / std::cos(angle_to_target)) + kUpperHubRadius;
+  return tape_points;
+}
 
-  builder->add_angle_to_target(angle_to_target);
-  builder->add_distance(distance);
+const std::vector<cv::Point3d> TargetEstimator::kTapePoints =
+    ComputeTapePoints();
+
+TargetEstimator::TargetEstimator(cv::Mat intrinsics, cv::Mat extrinsics)
+    : centroids_(),
+      image_(std::nullopt),
+      roll_(0.0),
+      pitch_(0.0),
+      yaw_(M_PI),
+      distance_(3.0),
+      angle_to_camera_(0.0),
+      // TODO(milind): add IMU height
+      camera_height_(extrinsics.at<double>(2, 3)) {
+  cv::cv2eigen(intrinsics, intrinsics_);
+  cv::cv2eigen(extrinsics, extrinsics_);
+}
+
+namespace {
+void SetBoundsOrFreeze(double *param, bool freeze, double min, double max,
+                       ceres::Problem *problem) {
+  if (freeze) {
+    problem->SetParameterization(
+        param, new ceres::SubsetParameterization(1, std::vector<int>{0}));
+  } else {
+    problem->SetParameterLowerBound(param, 0, min);
+    problem->SetParameterUpperBound(param, 0, max);
+  }
+}
+}  // namespace
+
+void TargetEstimator::Solve(const std::vector<cv::Point> &centroids,
+                            std::optional<cv::Mat> image) {
+  auto start = aos::monotonic_clock::now();
+
+  centroids_ = centroids;
+  image_ = image;
+
+  ceres::Problem problem;
+
+  // Set up the only cost function (also known as residual). This uses
+  // auto-differentiation to obtain the derivative (jacobian).
+  ceres::CostFunction *cost_function =
+      new ceres::AutoDiffCostFunction<TargetEstimator, ceres::DYNAMIC, 1, 1, 1,
+                                      1, 1, 1>(this, centroids_.size() * 2,
+                                               ceres::DO_NOT_TAKE_OWNERSHIP);
+
+  // TODO(milind): add loss function when we get more noisy data
+  problem.AddResidualBlock(cost_function, nullptr, &roll_, &pitch_, &yaw_,
+                           &distance_, &angle_to_camera_, &camera_height_);
+
+  // TODO(milind): seed values at localizer output, and constrain to be close to
+  // that.
+
+  // Constrain the rotation, otherwise there can be multiple solutions.
+  // There shouldn't be too much roll or pitch
+  constexpr double kMaxRoll = 0.1;
+  SetBoundsOrFreeze(&roll_, FLAGS_freeze_roll, -kMaxRoll, kMaxRoll, &problem);
+
+  constexpr double kPitch = -35.0 * M_PI / 180.0;
+  constexpr double kMaxPitchDelta = 0.15;
+  SetBoundsOrFreeze(&pitch_, FLAGS_freeze_pitch, kPitch - kMaxPitchDelta,
+                    kPitch + kMaxPitchDelta, &problem);
+  // Constrain the yaw to where the target would be visible
+  constexpr double kMaxYawDelta = M_PI / 4.0;
+  SetBoundsOrFreeze(&yaw_, FLAGS_freeze_yaw, M_PI - kMaxYawDelta,
+                    M_PI + kMaxYawDelta, &problem);
+
+  constexpr double kMaxHeightDelta = 0.1;
+  SetBoundsOrFreeze(&camera_height_, FLAGS_freeze_camera_height,
+                    camera_height_ - kMaxHeightDelta,
+                    camera_height_ + kMaxHeightDelta, &problem);
+
+  // Distances shouldn't be too close to the target or too far
+  constexpr double kMinDistance = 1.0;
+  constexpr double kMaxDistance = 10.0;
+  SetBoundsOrFreeze(&distance_, false, kMinDistance, kMaxDistance, &problem);
+
+  // Keep the angle between +/- half of the angle between piece of tape
+  constexpr double kMaxAngle = ((2.0 * M_PI) / kNumPiecesOfTape) / 2.0;
+  SetBoundsOrFreeze(&angle_to_camera_, FLAGS_freeze_angle_to_camera, -kMaxAngle,
+                    kMaxAngle, &problem);
+
+  ceres::Solver::Options options;
+  options.minimizer_progress_to_stdout = FLAGS_solver_output;
+  options.gradient_tolerance = 1e-12;
+  options.function_tolerance = 1e-16;
+  options.parameter_tolerance = 1e-12;
+  options.max_num_iterations = FLAGS_max_num_iterations;
+  ceres::Solver::Summary summary;
+  ceres::Solve(options, &problem, &summary);
+
+  auto end = aos::monotonic_clock::now();
+  LOG(INFO) << "Target estimation elapsed time: "
+            << std::chrono::duration<double, std::milli>(end - start).count()
+            << " ms";
+
+  if (FLAGS_solver_output) {
+    LOG(INFO) << summary.FullReport();
+
+    LOG(INFO) << "roll: " << roll_;
+    LOG(INFO) << "pitch: " << pitch_;
+    LOG(INFO) << "yaw: " << yaw_;
+    LOG(INFO) << "angle to target (based on yaw): " << angle_to_target();
+    LOG(INFO) << "angle to camera (polar): " << angle_to_camera_;
+    LOG(INFO) << "distance (polar): " << distance_;
+    LOG(INFO) << "camera height: " << camera_height_;
+  }
+}
+
+namespace {
+// Hacks to extract a double from a scalar, which is either a ceres jet or a
+// double. Only used for debugging and displaying.
+template <typename S>
+double ScalarToDouble(S s) {
+  const double *ptr = reinterpret_cast<double *>(&s);
+  return *ptr;
+}
+
+template <typename S>
+cv::Point2d ScalarPointToDouble(cv::Point_<S> p) {
+  return cv::Point2d(ScalarToDouble(p.x), ScalarToDouble(p.y));
+}
+}  // namespace
+
+template <typename S>
+bool TargetEstimator::operator()(const S *const roll, const S *const pitch,
+                                 const S *const yaw, const S *const distance,
+                                 const S *const theta,
+                                 const S *const camera_height,
+                                 S *residual) const {
+  using Vector3s = Eigen::Matrix<S, 3, 1>;
+  using Affine3s = Eigen::Transform<S, 3, Eigen::Affine>;
+
+  Eigen::AngleAxis<S> roll_angle(*roll, Vector3s::UnitX());
+  Eigen::AngleAxis<S> pitch_angle(*pitch, Vector3s::UnitY());
+  Eigen::AngleAxis<S> yaw_angle(*yaw, Vector3s::UnitZ());
+  // Construct the rotation and translation of the camera in the hub's frame
+  Eigen::Quaternion<S> R_camera_hub = yaw_angle * pitch_angle * roll_angle;
+  Vector3s T_camera_hub(*distance * ceres::cos(*theta),
+                        *distance * ceres::sin(*theta), *camera_height);
+
+  Affine3s H_camera_hub = Eigen::Translation<S, 3>(T_camera_hub) * R_camera_hub;
+
+  std::vector<cv::Point_<S>> tape_points_proj;
+  for (cv::Point3d tape_point_hub : kTapePoints) {
+    Vector3s tape_point_hub_eigen(S(tape_point_hub.x), S(tape_point_hub.y),
+                                  S(tape_point_hub.z));
+
+    // With X, Y, Z being world axes and x, y, z being camera axes,
+    // x = Y, y = Z, z = -X
+    static const Eigen::Matrix3d kCameraAxisConversion =
+        (Eigen::Matrix3d() << 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0)
+            .finished();
+    // Project the 3d tape point onto the image using the transformation and
+    // intrinsics
+    Vector3s tape_point_proj =
+        intrinsics_ * (kCameraAxisConversion *
+                       (H_camera_hub.inverse() * tape_point_hub_eigen));
+
+    // Normalize the projected point
+    tape_points_proj.emplace_back(tape_point_proj.x() / tape_point_proj.z(),
+                                  tape_point_proj.y() / tape_point_proj.z());
+    VLOG(1) << "Projected tape point: "
+            << ScalarPointToDouble(
+                   tape_points_proj[tape_points_proj.size() - 1]);
+  }
+
+  for (size_t i = 0; i < centroids_.size(); i++) {
+    const auto distance = DistanceFromTape(i, tape_points_proj);
+    // Set the residual to the (x, y) distance of the centroid from the
+    // nearest projected piece of tape
+    residual[i * 2] = distance.x;
+    residual[(i * 2) + 1] = distance.y;
+  }
+
+  if (image_.has_value()) {
+    // Draw the current stage of the solving
+    cv::Mat image = image_->clone();
+    for (size_t i = 0; i < tape_points_proj.size() - 1; i++) {
+      cv::line(image, ScalarPointToDouble(tape_points_proj[i]),
+               ScalarPointToDouble(tape_points_proj[i + 1]),
+               cv::Scalar(255, 255, 255));
+      cv::circle(image, ScalarPointToDouble(tape_points_proj[i]), 2,
+                 cv::Scalar(255, 20, 147), cv::FILLED);
+      cv::circle(image, ScalarPointToDouble(tape_points_proj[i + 1]), 2,
+                 cv::Scalar(255, 20, 147), cv::FILLED);
+    }
+    cv::imshow("image", image);
+    cv::waitKey(10);
+  }
+
+  return true;
+}
+
+namespace {
+template <typename S>
+cv::Point_<S> Distance(cv::Point p, cv::Point_<S> q) {
+  return cv::Point_<S>(S(p.x) - q.x, S(p.y) - q.y);
+}
+
+template <typename S>
+bool Less(cv::Point_<S> distance_1, cv::Point_<S> distance_2) {
+  return (ceres::pow(distance_1.x, 2) + ceres::pow(distance_1.y, 2) <
+          ceres::pow(distance_2.x, 2) + ceres::pow(distance_2.y, 2));
+}
+}  // namespace
+
+template <typename S>
+cv::Point_<S> TargetEstimator::DistanceFromTape(
+    size_t centroid_index,
+    const std::vector<cv::Point_<S>> &tape_points) const {
+  // Figure out the middle index in the tape points
+  size_t middle_index = centroids_.size() / 2;
+  if (centroids_.size() % 2 == 0) {
+    // There are two possible middles in this case. Figure out which one fits
+    // the current better
+    const auto tape_middle = tape_points[tape_points.size() / 2];
+    const auto middle_distance_1 =
+        Distance(centroids_[(centroids_.size() / 2) - 1], tape_middle);
+    const auto middle_distance_2 =
+        Distance(centroids_[centroids_.size() / 2], tape_middle);
+    if (Less(middle_distance_1, middle_distance_2)) {
+      middle_index--;
+    }
+  }
+
+  auto distance = cv::Point_<S>(std::numeric_limits<S>::infinity(),
+                                std::numeric_limits<S>::infinity());
+  if (centroid_index == middle_index) {
+    // Fix the middle centroid so the solver can't go too far off
+    distance =
+        Distance(centroids_[middle_index], tape_points[tape_points.size() / 2]);
+  } else {
+    // Give the other centroids some freedom in case some are split into pieces
+    for (auto tape_point : tape_points) {
+      const auto current_distance =
+          Distance(centroids_[centroid_index], tape_point);
+      if (Less(current_distance, distance)) {
+        distance = current_distance;
+      }
+    }
+  }
+
+  return distance;
+}
+
+namespace {
+void DrawEstimateValues(double distance, double angle_to_target,
+                        double angle_to_camera, double roll, double pitch,
+                        double yaw, cv::Mat view_image) {
+  constexpr int kTextX = 10;
+  int text_y = 330;
+  constexpr int kTextSpacing = 35;
+
+  const auto kTextColor = cv::Scalar(0, 255, 255);
+  constexpr double kFontScale = 1.0;
+
+  cv::putText(view_image, absl::StrFormat("Distance: %.3f", distance),
+              cv::Point(kTextX, text_y += kTextSpacing),
+              cv::FONT_HERSHEY_DUPLEX, kFontScale, kTextColor, 2);
+  cv::putText(view_image,
+              absl::StrFormat("Angle to target: %.3f", angle_to_target),
+              cv::Point(kTextX, text_y += kTextSpacing),
+              cv::FONT_HERSHEY_DUPLEX, kFontScale, kTextColor, 2);
+  cv::putText(view_image,
+              absl::StrFormat("Angle to camera: %.3f", angle_to_camera),
+              cv::Point(kTextX, text_y += kTextSpacing),
+              cv::FONT_HERSHEY_DUPLEX, kFontScale, kTextColor, 2);
+
+  cv::putText(
+      view_image,
+      absl::StrFormat("Roll: %.3f, pitch: %.3f, yaw: %.3f", roll, pitch, yaw),
+      cv::Point(kTextX, text_y += kTextSpacing), cv::FONT_HERSHEY_DUPLEX,
+      kFontScale, kTextColor, 2);
+}
+}  // namespace
+
+void TargetEstimator::DrawEstimate(const TargetEstimate &target_estimate,
+                                   cv::Mat view_image) {
+  DrawEstimateValues(target_estimate.distance(),
+                     target_estimate.angle_to_target(),
+                     target_estimate.angle_to_camera(),
+                     target_estimate.rotation_camera_hub()->roll(),
+                     target_estimate.rotation_camera_hub()->pitch(),
+                     target_estimate.rotation_camera_hub()->yaw(), view_image);
+}
+
+void TargetEstimator::DrawEstimate(cv::Mat view_image) const {
+  DrawEstimateValues(distance_, angle_to_target(), angle_to_camera_, roll_,
+                     pitch_, yaw_, view_image);
 }
 
 }  // namespace y2022::vision