Optimize discretization of A/Q for EKF

Because we are recomputing the discretization of A/Q for every single
step in the EKF, it had been taking up ~70-80% of the total computation
time of the entire EKF. This knocks it down to something like 50% of the
total EKF computation time (for ~3-4x speed improvements overall).

Because the discretizations of A/Q are dependent on both dt and on the
heading of the robot, they are not trivially cacheable. Future
improvements would be:
-Only recalculate discretization when the timestep changes and assume
 that the heading estimate will only change by minute amounts and
 so we don't need to redescritize unless dt has changed.
-Figure out how to cache 7x7 components of each that should stay
 constant for constant dt.
-Determine whether any convenient relationships hold with respect to
 dt/heading that would let us make a reasonably small interpolation
 table.

Change-Id: I8838d4837f380a2411ab9da99d542cdbe9999e41
diff --git a/frc971/control_loops/c2d.h b/frc971/control_loops/c2d.h
index 1bf99a2..b6f4198 100644
--- a/frc971/control_loops/c2d.h
+++ b/frc971/control_loops/c2d.h
@@ -64,8 +64,60 @@
   Eigen::Matrix<Scalar, num_states, num_states> phi22 =
       phi.block(num_states, num_states, num_states, num_states);
 
-  *Q_d = phi22.transpose() * phi12;
-  *Q_d = (*Q_d + Q_d->transpose()) / static_cast<Scalar>(2.0);
+  Qtemp = phi22.transpose() * phi12;
+  *Q_d = (Qtemp + Qtemp.transpose()) / static_cast<Scalar>(2.0);
+}
+
+// Does a faster approximation for the discretizing A/Q, for if solving a 2Nx2N
+// matrix exponential is too expensive.
+// Basic reasoning/method:
+// The original algorithm does a matrix exponential on a 2N x 2N matrix (the
+// block matrix made of of A and Q). This is extremely expensive for larg-ish
+// matrices. This function takes advantage of the structure of the matrix
+// we are taking the exponential and notes that we care about two things:
+// 1) The exponential of A*t, which is only NxN and so is relatively cheap.
+// 2) The upper-right quarter of the 2Nx2N matrix, which we can approximate
+//    using a taylor series to several terms and still be substantially cheaper
+//    than taking the big exponential.
+template <typename Scalar, int num_states>
+void DiscretizeQAFast(
+    const Eigen::Matrix<Scalar, num_states, num_states> &Q_continuous,
+    const Eigen::Matrix<Scalar, num_states, num_states> &A_continuous,
+    ::std::chrono::nanoseconds dt,
+    Eigen::Matrix<Scalar, num_states, num_states> *Q_d,
+    Eigen::Matrix<Scalar, num_states, num_states> *A_d) {
+  Eigen::Matrix<Scalar, num_states, num_states> Qtemp =
+      (Q_continuous + Q_continuous.transpose()) / static_cast<Scalar>(2.0);
+  Scalar dt_d =
+      ::std::chrono::duration_cast<::std::chrono::duration<Scalar>>(dt).count();
+  Eigen::Matrix<Scalar, num_states, num_states> last_term = Qtemp;
+  double last_coeff = dt_d;
+  const Eigen::Matrix<Scalar, num_states, num_states> At =
+      A_continuous.transpose();
+  Eigen::Matrix<Scalar, num_states, num_states> Atn = At;
+  Eigen::Matrix<Scalar, num_states, num_states> phi12 = last_term * last_coeff;
+  Eigen::Matrix<Scalar, num_states, num_states> phi22 =
+      At.Identity() + Atn * last_coeff;
+  // TODO(james): Tune this once we have the robot up; ii < 6 is enough to get
+  // beyond any real numerical issues. 5 should be fine, but just happened to
+  // kick a test over to failing. 4 would probably work.
+  for (int ii = 2; ii < 6; ++ii) {
+    Eigen::Matrix<Scalar, num_states, num_states> next_term =
+        -A_continuous * last_term + Qtemp * Atn;
+    last_coeff *= dt_d / static_cast<Scalar>(ii);
+    phi12 += next_term * last_coeff;
+
+    last_term = next_term;
+
+    Atn *= At;
+    phi22 += last_coeff * Atn;
+  }
+  Eigen::Matrix<Scalar, num_states, num_states> phi22t =
+    phi22.transpose();
+
+  Qtemp = phi22t * phi12;
+  *Q_d = (Qtemp + Qtemp.transpose()) / static_cast<Scalar>(2.0);
+  *A_d = phi22t;
 }
 
 }  // namespace controls
diff --git a/frc971/control_loops/c2d_test.cc b/frc971/control_loops/c2d_test.cc
index dcdb048..a0c4deb 100644
--- a/frc971/control_loops/c2d_test.cc
+++ b/frc971/control_loops/c2d_test.cc
@@ -69,6 +69,21 @@
       << "\nQ_d_integrated:\n" << Q_d_integrated;
 }
 
+// Tests that the "fast" discretization produces nearly identical results.
+TEST_F(C2DTest, DiscretizeQAFast) {
+  Eigen::Matrix<double, 2, 2> Q_d;
+  Eigen::Matrix<double, 2, 2> Q_d_fast;
+  Eigen::Matrix<double, 2, 2> A_d;
+  Eigen::Matrix<double, 2, 2> A_d_fast;
+  Eigen::Matrix<double, 2, 1> B_d;
+  const auto dt = ::std::chrono::seconds(1);
+  DiscretizeQ(Q_continuous, A_continuous, dt, &Q_d);
+  C2D(A_continuous, B_continuous, dt, &A_d, &B_d);
+  DiscretizeQAFast(Q_continuous, A_continuous, dt, &Q_d_fast, &A_d_fast);
+  EXPECT_LT((Q_d - Q_d_fast).norm(), 1e-20);
+  EXPECT_LT((A_d - A_d_fast).norm(), 1e-20);
+}
+
 }  // namespace testing
 }  // namespace controls
 }  // namespace frc971
diff --git a/frc971/control_loops/drivetrain/hybrid_ekf.h b/frc971/control_loops/drivetrain/hybrid_ekf.h
index 01a49a9..82f409c 100644
--- a/frc971/control_loops/drivetrain/hybrid_ekf.h
+++ b/frc971/control_loops/drivetrain/hybrid_ekf.h
@@ -251,12 +251,8 @@
                    StateSquare *P) {
     StateSquare A_c = AForState(*state);
     StateSquare A_d;
-    Eigen::Matrix<Scalar, kNStates, 0> dummy_B;
-    controls::C2D<Scalar, kNStates, 0>(
-        A_c, Eigen::Matrix<Scalar, kNStates, 0>::Zero(), dt, &A_d, &dummy_B);
     StateSquare Q_d;
-    controls::DiscretizeQ(Q_continuous_, A_c, dt, &Q_d);
-    *P = A_d * *P * A_d.transpose() + Q_d;
+    controls::DiscretizeQAFast(Q_continuous_, A_c, dt, &Q_d, &A_d);
 
     *state = RungeKuttaU(
         [this](const State &X,
@@ -264,6 +260,9 @@
         *state, U,
         ::std::chrono::duration_cast<::std::chrono::duration<double>>(dt)
             .count());
+
+    StateSquare Ptemp = A_d * *P * A_d.transpose() + Q_d;
+    *P = Ptemp;
   }
 
   void CorrectImpl(const Eigen::Matrix<Scalar, kNOutputs, kNOutputs> &R,
@@ -274,8 +273,9 @@
     Eigen::Matrix<Scalar, kNStates, kNOutputs> PH = *P * H.transpose();
     Eigen::Matrix<Scalar, kNOutputs, kNOutputs> S = H * PH + R;
     Eigen::Matrix<Scalar, kNStates, kNOutputs> K = PH * S.inverse();
-    *state = *state + K * err;
-    *P = (StateSquare::Identity() - K * H) * *P;
+    *state += K * err;
+    StateSquare Ptemp = (StateSquare::Identity() - K * H) * *P;
+    *P = Ptemp;
   }
 
   void ProcessObservation(Observation *obs, const std::chrono::nanoseconds dt,