Make starterd less likely to crash when under stress

When I added information to the starter daemon to allow it to more
easily print out the versions of applications that had crashed, this
inadvertently created a scenario where the starterd would crash if the
system was under heavy load and the starter fell behind on watching
timing reports.

We really shouldn't do this in starterd, so alter it so that if the
code that looks for timing reports observes that we have fallen behind
significantly, it skips over all the old data rather than trying to
read every single old message.

Change-Id: Ia3318d347a528f1f1539ce9fcad741c3268f4f44
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>

commit: 2c10e05a9b75d872542a2805a8b285ba1e79c77e [log] [tgz]
author: James Kuszmaul <james.kuszmaul@bluerivertech.com> Wed Aug 09 10:22:36 2023 -0700
committer: James Kuszmaul <james.kuszmaul@bluerivertech.com> Wed Sep 13 10:24:04 2023 -0700
tree: 9ac2174328138d57e7734fbeb1ad2c18beb2f445
parent: 1124c51089c50813da06a3894ef533812b4c03c0 [diff]
diff --git a/aos/starter/starterd_lib.cc b/aos/starter/starterd_lib.cc
index e3c02d2..bf0fc31 100644
--- a/aos/starter/starterd_lib.cc
+++ b/aos/starter/starterd_lib.cc

@@ -36,11 +36,13 @@
     : config_msg_(event_loop_config),
       event_loop_(event_loop_config),
       status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
-      status_timer_(event_loop_.AddTimer([this] {
-        ServiceTimingReportFetcher();
-        SendStatus();
-        status_count_ = 0;
-      })),
+      status_timer_(event_loop_.AddPhasedLoop(
+          [this](int elapsed_cycles) {
+            ServiceTimingReportFetcher(elapsed_cycles);
+            SendStatus();
+            status_count_ = 0;
+          },
+          std::chrono::milliseconds(1000))),
       cleanup_timer_(event_loop_.AddTimer([this] {
         event_loop_.Exit();
         LOG(INFO) << "Starter event loop exit finished.";
@@ -56,11 +58,6 @@
       top_(&event_loop_) {
   event_loop_.SkipAosLog();
 
-  event_loop_.OnRun([this] {
-    status_timer_->Schedule(event_loop_.monotonic_now(),
-                            std::chrono::milliseconds(1000));
-  });
-
   if (!aos::configuration::MultiNode(config_msg_)) {
     event_loop_.MakeWatcher(
         "/aos",
@@ -269,7 +266,14 @@
   event_loop_.Run();
 }
 
-void Starter::ServiceTimingReportFetcher() {
+void Starter::ServiceTimingReportFetcher(int elapsed_cycles) {
+  // If there is any chance that it has been longer than one cycle since we last
+  // serviced the fetcher, call Fetch(). This reduces the chances that the
+  // fetcher falls behind when the system is under heavy load. Dropping a few
+  // timing report messages when the system is under stress is fine.
+  if (timing_report_fetcher_.get() == nullptr || elapsed_cycles > 1) {
+    timing_report_fetcher_.Fetch();
+  }
   while (timing_report_fetcher_.FetchNext()) {
     for (auto &application : applications_) {
       application.second.ObserveTimingReport(

diff --git a/aos/starter/starterd_lib.h b/aos/starter/starterd_lib.h
index 81a4deb..92b20fa 100644
--- a/aos/starter/starterd_lib.h
+++ b/aos/starter/starterd_lib.h

@@ -61,7 +61,7 @@
 
   // Called periodically to run through the timing report fetcher and alert all
   // the Application's to the new messages.
-  void ServiceTimingReportFetcher();
+  void ServiceTimingReportFetcher(int elapsed_cycles);
 
   void SendStatus();
 
@@ -75,7 +75,7 @@
 
   aos::ShmEventLoop event_loop_;
   aos::Sender<aos::starter::Status> status_sender_;
-  aos::TimerHandler *status_timer_;
+  aos::PhasedLoopHandler *status_timer_;
   aos::TimerHandler *cleanup_timer_;
 
   int status_count_ = 0;
commit	2c10e05a9b75d872542a2805a8b285ba1e79c77e	[log] [tgz]
author	James Kuszmaul <james.kuszmaul@bluerivertech.com>	Wed Aug 09 10:22:36 2023 -0700
committer	James Kuszmaul <james.kuszmaul@bluerivertech.com>	Wed Sep 13 10:24:04 2023 -0700
tree	9ac2174328138d57e7734fbeb1ad2c18beb2f445
parent	1124c51089c50813da06a3894ef533812b4c03c0 [diff]