Make AOS starter print out application version on crashes

This way a journal log parser can unambiguously know what application
crahsed. Note that when an application crashes before it can send a
timing report out (e.g., if it crashes during initialization), this will
not end up helping.

Change-Id: I5217aecc395d2419ed7b622358f213cfd8ecd30f
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/starter/starter.fbs b/aos/starter/starter.fbs
index 7285281..d2cd573 100644
--- a/aos/starter/starter.fbs
+++ b/aos/starter/starter.fbs
@@ -82,6 +82,12 @@
   // actually stopping and the parent process receiving the signal indicating that the application
   // finished stopping.
   process_info: util.ProcessInfo (id: 7);
+
+  // Indicates whether we have observed a recent AOS timing report from
+  // the application. Staleness is calculated based on the timing report period
+  // specified for the starterd application (defaults to 1 Hz, can be overridden
+  // by --timing_report_ms).
+  has_active_timing_report: bool (id: 8);
 }
 
 root_type Status;
diff --git a/aos/starter/starter_test.cc b/aos/starter/starter_test.cc
index 3070f8e..cbb83f6 100644
--- a/aos/starter/starter_test.cc
+++ b/aos/starter/starter_test.cc
@@ -358,6 +358,8 @@
       ASSERT_EQ(pong_app_status->pid(), pong_app_status->process_info()->pid());
       ASSERT_TRUE(pong_app_status->process_info()->has_cpu_usage());
       ASSERT_LE(0.0, pong_app_status->process_info()->cpu_usage());
+      ASSERT_TRUE(pong_app_status->has_has_active_timing_report());
+      ASSERT_TRUE(pong_app_status->has_active_timing_report());
       watcher_loop.Exit();
       SUCCEED();
     }
diff --git a/aos/starter/starterd_lib.cc b/aos/starter/starterd_lib.cc
index f08ecc2..e3c02d2 100644
--- a/aos/starter/starterd_lib.cc
+++ b/aos/starter/starterd_lib.cc
@@ -37,6 +37,7 @@
       event_loop_(event_loop_config),
       status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
       status_timer_(event_loop_.AddTimer([this] {
+        ServiceTimingReportFetcher();
         SendStatus();
         status_count_ = 0;
       })),
@@ -47,6 +48,8 @@
       max_status_count_(
           event_loop_.GetChannel<aos::starter::Status>("/aos")->frequency() -
           1),
+      timing_report_fetcher_(
+          event_loop_.MakeFetcher<aos::timing::Report>("/aos")),
       shm_base_(FLAGS_shm_base),
       listener_(&event_loop_,
                 [this](signalfd_siginfo signal) { OnSignal(signal); }),
@@ -266,6 +269,16 @@
   event_loop_.Run();
 }
 
+void Starter::ServiceTimingReportFetcher() {
+  while (timing_report_fetcher_.FetchNext()) {
+    for (auto &application : applications_) {
+      application.second.ObserveTimingReport(
+          timing_report_fetcher_.context().monotonic_event_time,
+          timing_report_fetcher_.get());
+    }
+  }
+}
+
 void Starter::SendStatus() {
   aos::Sender<aos::starter::Status>::Builder builder =
       status_sender_.MakeBuilder();
diff --git a/aos/starter/starterd_lib.h b/aos/starter/starterd_lib.h
index 775cdbc..81a4deb 100644
--- a/aos/starter/starterd_lib.h
+++ b/aos/starter/starterd_lib.h
@@ -59,6 +59,10 @@
   // limit.
   void HandleStateChange();
 
+  // Called periodically to run through the timing report fetcher and alert all
+  // the Application's to the new messages.
+  void ServiceTimingReportFetcher();
+
   void SendStatus();
 
   // Creates a MemoryMappedQueue for the given channel, to pre-allocate shared
@@ -77,6 +81,8 @@
   int status_count_ = 0;
   const int max_status_count_;
 
+  aos::Fetcher<aos::timing::Report> timing_report_fetcher_;
+
   std::unordered_map<std::string, Application> applications_;
 
   // Lock and list of all the queues.  This makes it so we can initialize the
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index b1320f4..a518c0b 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -8,6 +8,8 @@
 
 #include "glog/logging.h"
 
+#include "aos/flatbuffer_merge.h"
+
 namespace aos::starter {
 
 // RAII class to become root and restore back to the original user and group
@@ -202,6 +204,7 @@
       id_ = next_id_++;
       start_time_ = event_loop_->monotonic_now();
       status_ = aos::starter::State::STARTING;
+      latest_timing_report_version_.reset();
       LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
           << "Starting '" << name_ << "' pid " << pid_;
 
@@ -308,6 +311,16 @@
   _exit(EXIT_FAILURE);
 }
 
+void Application::ObserveTimingReport(
+    const aos::monotonic_clock::time_point send_time,
+    const aos::timing::Report *msg) {
+  if (msg->name()->string_view() == name_ && msg->pid() == pid_ &&
+      msg->has_version()) {
+    latest_timing_report_version_ = msg->version()->str();
+    last_timing_report_ = send_time;
+  }
+}
+
 void Application::FetchOutputs() {
   if (capture_stdout_) {
     stdout_pipes_.read->Read(&stdout_);
@@ -462,6 +475,12 @@
   if (exit_code_.has_value()) {
     status_builder.add_last_exit_code(exit_code_.value());
   }
+  status_builder.add_has_active_timing_report(
+      last_timing_report_ +
+          // Leave a bit of margin on the timing report receipt time, to allow
+          // for timing errors.
+          3 * std::chrono::milliseconds(FLAGS_timing_report_ms) >
+      event_loop_->monotonic_now());
   status_builder.add_last_stop_reason(stop_reason_);
   if (pid_ != -1) {
     status_builder.add_pid(pid_);
@@ -572,9 +591,17 @@
             << "Application '" << name_ << "' pid " << pid_
             << " exited with status " << exit_code_.value();
       } else {
-        LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
-            << "Application '" << name_ << "' pid " << pid_
-            << " exited unexpectedly with status " << exit_code_.value();
+        if (quiet_flag_ == QuietLogging::kNo) {
+          std::string version_string =
+              latest_timing_report_version_.has_value()
+                  ? absl::StrCat("'", latest_timing_report_version_.value(),
+                                 "'")
+                  : "unknown";
+          LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
+              << "Application '" << name_ << "' pid " << pid_ << " version "
+              << version_string << " exited unexpectedly with status "
+              << exit_code_.value();
+        }
       }
       if (autorestart()) {
         QueueStart();
diff --git a/aos/starter/subprocess.h b/aos/starter/subprocess.h
index ff62117..1efd56d 100644
--- a/aos/starter/subprocess.h
+++ b/aos/starter/subprocess.h
@@ -124,6 +124,12 @@
     memory_cgroup_->SetLimit("memory.limit_in_bytes", limit);
   }
 
+  // Observe a timing report message, and save it if it is relevant to us.
+  // It is the responsibility of the caller to manage this, because the lifetime
+  // of the Application itself is such that it cannot own Fetchers readily.
+  void ObserveTimingReport(const aos::monotonic_clock::time_point send_time,
+                           const aos::timing::Report *msg);
+
  private:
   typedef aos::util::ScopedPipe::PipePair PipePair;
 
@@ -192,6 +198,12 @@
   aos::TimerHandler *start_timer_, *restart_timer_, *stop_timer_, *pipe_timer_,
       *child_status_handler_;
 
+  // Version string from the most recent valid timing report for this
+  // application. Cleared when the application restarts.
+  std::optional<std::string> latest_timing_report_version_;
+  aos::monotonic_clock::time_point last_timing_report_ =
+      aos::monotonic_clock::min_time;
+
   std::vector<std::function<void()>> on_change_;
 
   std::unique_ptr<MemoryCGroup> memory_cgroup_;