Make starterd less likely to crash when under stress

When I added information to the starter daemon to allow it to more
easily print out the versions of applications that had crashed, this
inadvertently created a scenario where starterd would crash if the
system was under heavy load and the starter fell behind on reading
timing reports. starterd itself should not crash in that situation, so
alter it so that if the code that looks for timing reports observes that
we have fallen behind significantly, it skips over all the old data
rather than trying to read every single old message.

Change-Id: Ia3318d347a528f1f1539ce9fcad741c3268f4f44
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/starter/starterd_lib.cc b/aos/starter/starterd_lib.cc
index e3c02d2..bf0fc31 100644
--- a/aos/starter/starterd_lib.cc
+++ b/aos/starter/starterd_lib.cc
@@ -36,11 +36,13 @@
     : config_msg_(event_loop_config),
       event_loop_(event_loop_config),
       status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
-      status_timer_(event_loop_.AddTimer([this] {
-        ServiceTimingReportFetcher();
-        SendStatus();
-        status_count_ = 0;
-      })),
+      status_timer_(event_loop_.AddPhasedLoop(
+          [this](int elapsed_cycles) {
+            ServiceTimingReportFetcher(elapsed_cycles);
+            SendStatus();
+            status_count_ = 0;
+          },
+          std::chrono::milliseconds(1000))),
       cleanup_timer_(event_loop_.AddTimer([this] {
         event_loop_.Exit();
         LOG(INFO) << "Starter event loop exit finished.";
@@ -56,11 +58,6 @@
       top_(&event_loop_) {
   event_loop_.SkipAosLog();
 
-  event_loop_.OnRun([this] {
-    status_timer_->Schedule(event_loop_.monotonic_now(),
-                            std::chrono::milliseconds(1000));
-  });
-
   if (!aos::configuration::MultiNode(config_msg_)) {
     event_loop_.MakeWatcher(
         "/aos",
@@ -269,7 +266,14 @@
   event_loop_.Run();
 }
 
-void Starter::ServiceTimingReportFetcher() {
+void Starter::ServiceTimingReportFetcher(int elapsed_cycles) {
+  // If there is any chance that it has been longer than one cycle since we last
+  // serviced the fetcher, call Fetch(). This reduces the chances that the
+  // fetcher falls behind when the system is under heavy load. Dropping a few
+  // timing report messages when the system is under stress is fine.
+  if (timing_report_fetcher_.get() == nullptr || elapsed_cycles > 1) {
+    timing_report_fetcher_.Fetch();
+  }
   while (timing_report_fetcher_.FetchNext()) {
     for (auto &application : applications_) {
       application.second.ObserveTimingReport(
diff --git a/aos/starter/starterd_lib.h b/aos/starter/starterd_lib.h
index 81a4deb..92b20fa 100644
--- a/aos/starter/starterd_lib.h
+++ b/aos/starter/starterd_lib.h
@@ -61,7 +61,7 @@
 
   // Called periodically to run through the timing report fetcher and alert all
   // the Application's to the new messages.
-  void ServiceTimingReportFetcher();
+  void ServiceTimingReportFetcher(int elapsed_cycles);
 
   void SendStatus();
 
@@ -75,7 +75,7 @@
 
   aos::ShmEventLoop event_loop_;
   aos::Sender<aos::starter::Status> status_sender_;
-  aos::TimerHandler *status_timer_;
+  aos::PhasedLoopHandler *status_timer_;
   aos::TimerHandler *cleanup_timer_;
 
   int status_count_ = 0;