aos: Optionally give subprocesses more than 1 second to quit

Right now our application manager is hard-coded to send SIGKILL to its
child if it hasn't shut down within 1 second. This is suboptimal
because some applications simply take longer than 1 second to shut
down.

The goal here is to eventually add a "stop duration" field in the AOS
config.

Change-Id: I885f39d4503519bb75598a1b7e595b56e3f21348
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/configuration.fbs b/aos/configuration.fbs
index 0b42f5b..b2b34c1 100644
--- a/aos/configuration.fbs
+++ b/aos/configuration.fbs
@@ -169,6 +169,10 @@
   // If set, this is the memory limit to enforce in bytes for the application
   // (and it's children)
   memory_limit:uint64 = 0 (id: 8);
+
+  // If set, this is the number of nanoseconds the application has to stop. If the application
+  // doesn't stop within the specified time, then it is killed.
+  stop_time:int64 = 1000000000 (id: 9);
 }
 
 // Per node data and connection information.
diff --git a/aos/events/logging/multinode_logger_test_lib.h b/aos/events/logging/multinode_logger_test_lib.h
index 63604d6..8f64f66 100644
--- a/aos/events/logging/multinode_logger_test_lib.h
+++ b/aos/events/logging/multinode_logger_test_lib.h
@@ -76,13 +76,13 @@
 };
 
 constexpr std::string_view kCombinedConfigSha1() {
-  return "32514f3a686e5f8936cc4651e7c81350112f7be8d80dcb8d4afaa29d233c5619";
+  return "71eb8341221fbabefb4ddde43bcebf794fd5855e3ad77786a1db0f9e27a39091";
 }
 constexpr std::string_view kSplitConfigSha1() {
-  return "416da222c09d83325c6f453591d34c7ef12c12c2dd129ddeea657c4bec61b7fd";
+  return "f61d45dc0bda026e852e2da9b3e5c2c7f1c89c9f7958cfba3d02e2c960416f04";
 }
 constexpr std::string_view kReloggedSplitConfigSha1() {
-  return "3fe428684a38298d3323ef087f44517574da3f07dd84b3740829156d6d870108";
+  return "3d8fd3d13955b517ee3d66a50b5e4dd7a13fd648f469d16910990418bcfc6beb";
 }
 
 LoggerState MakeLoggerState(NodeEventLoopFactory *node,
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index 9606adc..d677922 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -266,6 +266,8 @@
   if (application->has_memory_limit() && application->memory_limit() > 0) {
     SetMemoryLimit(application->memory_limit());
   }
+
+  set_stop_grace_period(std::chrono::nanoseconds(application->stop_time()));
 }
 
 void Application::DoStart() {
@@ -496,8 +498,7 @@
 
       // Watchdog timer to SIGKILL application if it is still running 1 second
       // after SIGINT
-      stop_timer_->Schedule(event_loop_->monotonic_now() +
-                            std::chrono::seconds(1));
+      stop_timer_->Schedule(event_loop_->monotonic_now() + stop_grace_period_);
       queue_restart_ = restart;
       OnChange();
       break;
diff --git a/aos/starter/subprocess.h b/aos/starter/subprocess.h
index ca7ef9d..eb36874 100644
--- a/aos/starter/subprocess.h
+++ b/aos/starter/subprocess.h
@@ -139,6 +139,14 @@
   void set_capture_stderr(bool capture);
   void set_run_as_sudo(bool value) { run_as_sudo_ = value; }
 
+  // Sets the time for a process to stop gracefully. If an application is asked
+  // to stop, but doesn't stop within the specified time limit, then it is
+  // forcefully killed. Defaults to 1 second unless overridden by the
+  // aos::Application instance in the constructor.
+  void set_stop_grace_period(std::chrono::nanoseconds stop_grace_period) {
+    stop_grace_period_ = stop_grace_period;
+  }
+
   bool autostart() const { return autostart_; }
 
   bool autorestart() const { return autorestart_; }
@@ -222,6 +230,7 @@
   std::optional<uid_t> user_;
   std::optional<gid_t> group_;
   bool run_as_sudo_ = false;
+  std::chrono::nanoseconds stop_grace_period_ = std::chrono::seconds(1);
 
   bool capture_stdout_ = false;
   PipePair stdout_pipes_;
diff --git a/aos/starter/subprocess_reliable_test.cc b/aos/starter/subprocess_reliable_test.cc
index 0460bc3..701de11 100644
--- a/aos/starter/subprocess_reliable_test.cc
+++ b/aos/starter/subprocess_reliable_test.cc
@@ -3,6 +3,7 @@
 
 #include <filesystem>
 
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 #include "aos/events/shm_event_loop.h"
@@ -124,4 +125,55 @@
   ASSERT_TRUE(std::filesystem::exists(shutdown_signal_file));
 }
 
+// Validates that a process that is known to take a while to stop can shut down
+// gracefully without being killed.
+TEST(SubprocessTest, CanSlowlyStopGracefully) {
+  const std::string config_file =
+      ::aos::testing::ArtifactPath("aos/events/pingpong_config.json");
+  aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+      aos::configuration::ReadConfig(config_file);
+  aos::ShmEventLoop event_loop(&config.message());
+
+  // Use a file to signal that the subprocess has started up properly and that
+  // the exit handler has been installed. Otherwise we risk killing the process
+  // uncleanly before the signal handler got installed.
+  auto signal_dir = std::filesystem::path(aos::testing::TestTmpDir()) /
+                    "slow_death_startup_file_signals";
+  ASSERT_TRUE(std::filesystem::create_directory(signal_dir));
+  auto startup_signal_file = signal_dir / "startup";
+
+  // Create an application that should never get killed automatically. It should
+  // have plenty of time to shut down on its own. In this case, we use 2 seconds
+  // to mean "plenty of time".
+  auto application = std::make_unique<Application>("/bin/bash", "/bin/bash",
+                                                   &event_loop, [] {});
+  application->set_args(
+      {"-c",
+       absl::StrCat(
+           "trap 'echo got int; sleep 2; echo shutting down; exit 0' SIGINT; "
+           "while true; do sleep 0.1; touch ",
+           startup_signal_file.string(), "; done;")});
+  application->set_capture_stdout(true);
+  application->set_stop_grace_period(std::chrono::seconds(999));
+  application->AddOnChange([&] {
+    if (application->status() == aos::starter::State::STOPPED) {
+      event_loop.Exit();
+    }
+  });
+  application->Start();
+  event_loop
+      .AddTimer([&] {
+        if (std::filesystem::exists(startup_signal_file)) {
+          // Now that the subprocess has properly started up, let's kill it.
+          application->Stop();
+        }
+      })
+      ->Schedule(event_loop.monotonic_now(), std::chrono::milliseconds(100));
+  event_loop.Run();
+
+  EXPECT_EQ(application->exit_code(), 0);
+  EXPECT_THAT(application->GetStdout(), ::testing::HasSubstr("got int"));
+  EXPECT_THAT(application->GetStdout(), ::testing::HasSubstr("shutting down"));
+}
+
 }  // namespace aos::starter::testing