Make AOS starter print out application version on crashes
This way a journal log parser can unambiguously know what application
crahsed. Note that when an application crashes before it can send a
timing report out (e.g., if it crashes during initialization), this will
not end up helping.
Change-Id: I5217aecc395d2419ed7b622358f213cfd8ecd30f
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/starter/starter.fbs b/aos/starter/starter.fbs
index 7285281..d2cd573 100644
--- a/aos/starter/starter.fbs
+++ b/aos/starter/starter.fbs
@@ -82,6 +82,12 @@
// actually stopping and the parent process receiving the signal indicating that the application
// finished stopping.
process_info: util.ProcessInfo (id: 7);
+
+ // Indicates whether we have observed a recent AOS timing report from
+ // the application. Staleness is calculated based on the timing report period
+ // specified for the starterd application (defaults to 1 Hz, can be overridden
+ // by --timing_report_ms).
+ has_active_timing_report: bool (id: 8);
}
root_type Status;
diff --git a/aos/starter/starter_test.cc b/aos/starter/starter_test.cc
index 3070f8e..cbb83f6 100644
--- a/aos/starter/starter_test.cc
+++ b/aos/starter/starter_test.cc
@@ -358,6 +358,8 @@
ASSERT_EQ(pong_app_status->pid(), pong_app_status->process_info()->pid());
ASSERT_TRUE(pong_app_status->process_info()->has_cpu_usage());
ASSERT_LE(0.0, pong_app_status->process_info()->cpu_usage());
+ ASSERT_TRUE(pong_app_status->has_has_active_timing_report());
+ ASSERT_TRUE(pong_app_status->has_active_timing_report());
watcher_loop.Exit();
SUCCEED();
}
diff --git a/aos/starter/starterd_lib.cc b/aos/starter/starterd_lib.cc
index f08ecc2..e3c02d2 100644
--- a/aos/starter/starterd_lib.cc
+++ b/aos/starter/starterd_lib.cc
@@ -37,6 +37,7 @@
event_loop_(event_loop_config),
status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
status_timer_(event_loop_.AddTimer([this] {
+ ServiceTimingReportFetcher();
SendStatus();
status_count_ = 0;
})),
@@ -47,6 +48,8 @@
max_status_count_(
event_loop_.GetChannel<aos::starter::Status>("/aos")->frequency() -
1),
+ timing_report_fetcher_(
+ event_loop_.MakeFetcher<aos::timing::Report>("/aos")),
shm_base_(FLAGS_shm_base),
listener_(&event_loop_,
[this](signalfd_siginfo signal) { OnSignal(signal); }),
@@ -266,6 +269,16 @@
event_loop_.Run();
}
+void Starter::ServiceTimingReportFetcher() {
+ while (timing_report_fetcher_.FetchNext()) {
+ for (auto &application : applications_) {
+ application.second.ObserveTimingReport(
+ timing_report_fetcher_.context().monotonic_event_time,
+ timing_report_fetcher_.get());
+ }
+ }
+}
+
void Starter::SendStatus() {
aos::Sender<aos::starter::Status>::Builder builder =
status_sender_.MakeBuilder();
diff --git a/aos/starter/starterd_lib.h b/aos/starter/starterd_lib.h
index 775cdbc..81a4deb 100644
--- a/aos/starter/starterd_lib.h
+++ b/aos/starter/starterd_lib.h
@@ -59,6 +59,10 @@
// limit.
void HandleStateChange();
+ // Called periodically to run through the timing report fetcher and alert all
+ // the Application's to the new messages.
+ void ServiceTimingReportFetcher();
+
void SendStatus();
// Creates a MemoryMappedQueue for the given channel, to pre-allocate shared
@@ -77,6 +81,8 @@
int status_count_ = 0;
const int max_status_count_;
+ aos::Fetcher<aos::timing::Report> timing_report_fetcher_;
+
std::unordered_map<std::string, Application> applications_;
// Lock and list of all the queues. This makes it so we can initialize the
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index b1320f4..a518c0b 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -8,6 +8,8 @@
#include "glog/logging.h"
+#include "aos/flatbuffer_merge.h"
+
namespace aos::starter {
// RAII class to become root and restore back to the original user and group
@@ -202,6 +204,7 @@
id_ = next_id_++;
start_time_ = event_loop_->monotonic_now();
status_ = aos::starter::State::STARTING;
+ latest_timing_report_version_.reset();
LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
<< "Starting '" << name_ << "' pid " << pid_;
@@ -308,6 +311,16 @@
_exit(EXIT_FAILURE);
}
+void Application::ObserveTimingReport(
+ const aos::monotonic_clock::time_point send_time,
+ const aos::timing::Report *msg) {
+ if (msg->name()->string_view() == name_ && msg->pid() == pid_ &&
+ msg->has_version()) {
+ latest_timing_report_version_ = msg->version()->str();
+ last_timing_report_ = send_time;
+ }
+}
+
void Application::FetchOutputs() {
if (capture_stdout_) {
stdout_pipes_.read->Read(&stdout_);
@@ -462,6 +475,12 @@
if (exit_code_.has_value()) {
status_builder.add_last_exit_code(exit_code_.value());
}
+ status_builder.add_has_active_timing_report(
+ last_timing_report_ +
+ // Leave a bit of margin on the timing report receipt time, to allow
+ // for timing errors.
+ 3 * std::chrono::milliseconds(FLAGS_timing_report_ms) >
+ event_loop_->monotonic_now());
status_builder.add_last_stop_reason(stop_reason_);
if (pid_ != -1) {
status_builder.add_pid(pid_);
@@ -572,9 +591,17 @@
<< "Application '" << name_ << "' pid " << pid_
<< " exited with status " << exit_code_.value();
} else {
- LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
- << "Application '" << name_ << "' pid " << pid_
- << " exited unexpectedly with status " << exit_code_.value();
+ if (quiet_flag_ == QuietLogging::kNo) {
+ std::string version_string =
+ latest_timing_report_version_.has_value()
+ ? absl::StrCat("'", latest_timing_report_version_.value(),
+ "'")
+ : "unknown";
+ LOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo)
+ << "Application '" << name_ << "' pid " << pid_ << " version "
+ << version_string << " exited unexpectedly with status "
+ << exit_code_.value();
+ }
}
if (autorestart()) {
QueueStart();
diff --git a/aos/starter/subprocess.h b/aos/starter/subprocess.h
index ff62117..1efd56d 100644
--- a/aos/starter/subprocess.h
+++ b/aos/starter/subprocess.h
@@ -124,6 +124,12 @@
memory_cgroup_->SetLimit("memory.limit_in_bytes", limit);
}
+ // Observe a timing report message, and save it if it is relevant to us.
+ // It is the responsibility of the caller to manage this, because the lifetime
+ // of the Application itself is such that it cannot own Fetchers readily.
+ void ObserveTimingReport(const aos::monotonic_clock::time_point send_time,
+ const aos::timing::Report *msg);
+
private:
typedef aos::util::ScopedPipe::PipePair PipePair;
@@ -192,6 +198,12 @@
aos::TimerHandler *start_timer_, *restart_timer_, *stop_timer_, *pipe_timer_,
*child_status_handler_;
+ // Version string from the most recent valid timing report for this
+ // application. Cleared when the application restarts.
+ std::optional<std::string> latest_timing_report_version_;
+ aos::monotonic_clock::time_point last_timing_report_ =
+ aos::monotonic_clock::min_time;
+
std::vector<std::function<void()>> on_change_;
std::unique_ptr<MemoryCGroup> memory_cgroup_;