aos: Optionally give subprocesses more than 1 second to quit
Right now our application manager is hard-coded to send SIGKILL to its
child if it hasn't shut down within 1 second. This is suboptimal
because some applications simply take longer than 1 second to shut
down.
The goal here is to eventually add a "stop duration" field in the AOS
config.
Change-Id: I885f39d4503519bb75598a1b7e595b56e3f21348
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/configuration.fbs b/aos/configuration.fbs
index 0b42f5b..b2b34c1 100644
--- a/aos/configuration.fbs
+++ b/aos/configuration.fbs
@@ -169,6 +169,10 @@
// If set, this is the memory limit to enforce in bytes for the application
// (and it's children)
memory_limit:uint64 = 0 (id: 8);
+
+ // If set, this is the number of nanoseconds the application has to stop. If the application
+ // doesn't stop within the specified time, then it is killed.
+ stop_time:int64 = 1000000000 (id: 9);
}
// Per node data and connection information.
diff --git a/aos/events/logging/multinode_logger_test_lib.h b/aos/events/logging/multinode_logger_test_lib.h
index 63604d6..8f64f66 100644
--- a/aos/events/logging/multinode_logger_test_lib.h
+++ b/aos/events/logging/multinode_logger_test_lib.h
@@ -76,13 +76,13 @@
};
constexpr std::string_view kCombinedConfigSha1() {
- return "32514f3a686e5f8936cc4651e7c81350112f7be8d80dcb8d4afaa29d233c5619";
+ return "71eb8341221fbabefb4ddde43bcebf794fd5855e3ad77786a1db0f9e27a39091";
}
constexpr std::string_view kSplitConfigSha1() {
- return "416da222c09d83325c6f453591d34c7ef12c12c2dd129ddeea657c4bec61b7fd";
+ return "f61d45dc0bda026e852e2da9b3e5c2c7f1c89c9f7958cfba3d02e2c960416f04";
}
constexpr std::string_view kReloggedSplitConfigSha1() {
- return "3fe428684a38298d3323ef087f44517574da3f07dd84b3740829156d6d870108";
+ return "3d8fd3d13955b517ee3d66a50b5e4dd7a13fd648f469d16910990418bcfc6beb";
}
LoggerState MakeLoggerState(NodeEventLoopFactory *node,
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index 9606adc..d677922 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -266,6 +266,8 @@
if (application->has_memory_limit() && application->memory_limit() > 0) {
SetMemoryLimit(application->memory_limit());
}
+
+ set_stop_grace_period(std::chrono::nanoseconds(application->stop_time()));
}
void Application::DoStart() {
@@ -496,8 +498,7 @@
// Watchdog timer to SIGKILL application if it is still running 1 second
// after SIGINT
- stop_timer_->Schedule(event_loop_->monotonic_now() +
- std::chrono::seconds(1));
+ stop_timer_->Schedule(event_loop_->monotonic_now() + stop_grace_period_);
queue_restart_ = restart;
OnChange();
break;
diff --git a/aos/starter/subprocess.h b/aos/starter/subprocess.h
index ca7ef9d..eb36874 100644
--- a/aos/starter/subprocess.h
+++ b/aos/starter/subprocess.h
@@ -139,6 +139,14 @@
void set_capture_stderr(bool capture);
void set_run_as_sudo(bool value) { run_as_sudo_ = value; }
+ // Sets the time for a process to stop gracefully. If an application is asked
+ // to stop, but doesn't stop within the specified time limit, then it is
+ // forcefully killed. Defaults to 1 second unless overridden by the
+ // aos::Application instance in the constructor.
+ void set_stop_grace_period(std::chrono::nanoseconds stop_grace_period) {
+ stop_grace_period_ = stop_grace_period;
+ }
+
bool autostart() const { return autostart_; }
bool autorestart() const { return autorestart_; }
@@ -222,6 +230,7 @@
std::optional<uid_t> user_;
std::optional<gid_t> group_;
bool run_as_sudo_ = false;
+ std::chrono::nanoseconds stop_grace_period_ = std::chrono::seconds(1);
bool capture_stdout_ = false;
PipePair stdout_pipes_;
diff --git a/aos/starter/subprocess_reliable_test.cc b/aos/starter/subprocess_reliable_test.cc
index 0460bc3..701de11 100644
--- a/aos/starter/subprocess_reliable_test.cc
+++ b/aos/starter/subprocess_reliable_test.cc
@@ -3,6 +3,7 @@
#include <filesystem>
+#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "aos/events/shm_event_loop.h"
@@ -124,4 +125,55 @@
ASSERT_TRUE(std::filesystem::exists(shutdown_signal_file));
}
+// Validates that a process that is known to take a while to stop can shut down
+// gracefully without being killed.
+TEST(SubprocessTest, CanSlowlyStopGracefully) {
+ const std::string config_file =
+ ::aos::testing::ArtifactPath("aos/events/pingpong_config.json");
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(config_file);
+ aos::ShmEventLoop event_loop(&config.message());
+
+ // Use a file to signal that the subprocess has started up properly and that
+ // the exit handler has been installed. Otherwise we risk killing the process
+ // uncleanly before the signal handler got installed.
+ auto signal_dir = std::filesystem::path(aos::testing::TestTmpDir()) /
+ "slow_death_startup_file_signals";
+ ASSERT_TRUE(std::filesystem::create_directory(signal_dir));
+ auto startup_signal_file = signal_dir / "startup";
+
+ // Create an application that should never get killed automatically. It should
+ // have plenty of time to shut down on its own. In this case, we use 2 seconds
+ // to mean "plenty of time".
+ auto application = std::make_unique<Application>("/bin/bash", "/bin/bash",
+ &event_loop, [] {});
+ application->set_args(
+ {"-c",
+ absl::StrCat(
+ "trap 'echo got int; sleep 2; echo shutting down; exit 0' SIGINT; "
+ "while true; do sleep 0.1; touch ",
+ startup_signal_file.string(), "; done;")});
+ application->set_capture_stdout(true);
+ application->set_stop_grace_period(std::chrono::seconds(999));
+ application->AddOnChange([&] {
+ if (application->status() == aos::starter::State::STOPPED) {
+ event_loop.Exit();
+ }
+ });
+ application->Start();
+ event_loop
+ .AddTimer([&] {
+ if (std::filesystem::exists(startup_signal_file)) {
+ // Now that the subprocess has properly started up, let's kill it.
+ application->Stop();
+ }
+ })
+ ->Schedule(event_loop.monotonic_now(), std::chrono::milliseconds(100));
+ event_loop.Run();
+
+ EXPECT_EQ(application->exit_code(), 0);
+ EXPECT_THAT(application->GetStdout(), ::testing::HasSubstr("got int"));
+ EXPECT_THAT(application->GetStdout(), ::testing::HasSubstr("shutting down"));
+}
+
} // namespace aos::starter::testing