Prevent starter from killing itself accidentally
This patch aims to fix a race condition where a user of subprocess.cc
(e.g. starter) and `shm_event_loop` can kill itself accidentally. This
generally happens when the user attempts to kill the child before the
`exec()` call has happened. In that case, the
`SignalHandler::DoHandleSignal()` function from `shm_event_loop.cc`
runs inside the child, but actually writes the quit event to the pipe
that the parent is listening to.
The fix here is to block the signals before the fork. The parent
process will unblock the signals before exiting from the `DoStart()`
function. The child will first restore the default signal handlers and
_then_ unblock the signals. This will cause the child process to
terminate as expected when it sees a relevant signal.
If I undo the changes in `subprocess.cc`, the test becomes flaky.
$ bazel test //aos/starter:subprocess_reliable_test --runs_per_test=1000
...
FAILED in 986 out of 1000 in 100.6s
References: PRO-23549
Change-Id: I958071df561bcdef78088740136e52a8956dfffe
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/starter/BUILD b/aos/starter/BUILD
index 732cfe3..1873166 100644
--- a/aos/starter/BUILD
+++ b/aos/starter/BUILD
@@ -93,6 +93,24 @@
],
)
+# Similar to subprocess_test, but here are all the tests that are not flaky.
+cc_test(
+ name = "subprocess_reliable_test",
+ srcs = ["subprocess_reliable_test.cc"],
+ data = [
+ "//aos/events:pingpong_config",
+ ],
+ # The roborio compiler doesn't support <filesystem>.
+ target_compatible_with = ["@platforms//os:linux"],
+ deps = [
+ ":subprocess",
+ "//aos/events:shm_event_loop",
+ "//aos/testing:googletest",
+ "//aos/testing:path",
+ "//aos/testing:tmpdir",
+ ],
+)
+
cc_test(
name = "starter_test",
srcs = ["starter_test.cc"],
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index 039f3d8..4d21c61 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -12,6 +12,25 @@
namespace aos::starter {
+// Blocks all signals while an instance of this class is in scope.
+class ScopedCompleteSignalBlocker {
+ public:
+ ScopedCompleteSignalBlocker() {
+ sigset_t mask;
+ sigfillset(&mask);
+ // Remember the current mask.
+ PCHECK(sigprocmask(SIG_SETMASK, &mask, &old_mask_) == 0);
+ }
+
+ ~ScopedCompleteSignalBlocker() {
+ // Restore the remembered mask.
+ PCHECK(sigprocmask(SIG_SETMASK, &old_mask_, nullptr) == 0);
+ }
+
+ private:
+ sigset_t old_mask_;
+};
+
// RAII class to become root and restore back to the original user and group
// afterwards.
class Sudo {
@@ -211,35 +230,56 @@
pipe_timer_->Schedule(event_loop_->monotonic_now(),
std::chrono::milliseconds(100));
- const pid_t pid = fork();
+ {
+ // Block all signals during the fork() call. Together with the default
+ // signal handler restoration below, This prevents signal handlers from
+ // getting called in the child and accidentally affecting the parent. In
+ // particular, the exit handler for shm_event_loop could be called here if
+ // we don't exec() quickly enough.
+ ScopedCompleteSignalBlocker signal_blocker;
- if (pid != 0) {
- if (pid == -1) {
- PLOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo ||
- quiet_flag_ == QuietLogging::kNotForDebugging)
- << "Failed to fork '" << name_ << "'";
- stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
- status_ = aos::starter::State::STOPPED;
- } else {
- pid_ = pid;
- id_ = next_id_++;
- start_time_ = event_loop_->monotonic_now();
- status_ = aos::starter::State::STARTING;
- latest_timing_report_version_.reset();
- LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
- << "Starting '" << name_ << "' pid " << pid_;
+ const pid_t pid = fork();
- // Set up timer which moves application to RUNNING state if it is still
- // alive in 1 second.
- start_timer_->Schedule(event_loop_->monotonic_now() +
- std::chrono::seconds(1));
- // Since we are the parent process, clear our write-side of all the pipes.
- status_pipes_.write.reset();
- stdout_pipes_.write.reset();
- stderr_pipes_.write.reset();
+ if (pid != 0) {
+ if (pid == -1) {
+ PLOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo ||
+ quiet_flag_ == QuietLogging::kNotForDebugging)
+ << "Failed to fork '" << name_ << "'";
+ stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
+ status_ = aos::starter::State::STOPPED;
+ } else {
+ pid_ = pid;
+ id_ = next_id_++;
+ start_time_ = event_loop_->monotonic_now();
+ status_ = aos::starter::State::STARTING;
+ latest_timing_report_version_.reset();
+ LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
+ << "Starting '" << name_ << "' pid " << pid_;
+
+ // Set up timer which moves application to RUNNING state if it is still
+ // alive in 1 second.
+ start_timer_->Schedule(event_loop_->monotonic_now() +
+ std::chrono::seconds(1));
+ // Since we are the parent process, clear our write-side of all the
+ // pipes.
+ status_pipes_.write.reset();
+ stdout_pipes_.write.reset();
+ stderr_pipes_.write.reset();
+ }
+ OnChange();
+ return;
}
- OnChange();
- return;
+
+ // Clear any signal handlers so that they don't accidentally interfere with
+ // the parent process. Is there a better way to iterate over all the
+ // signals? Right now we're just dealing with the most common ones.
+ for (int signal : {SIGINT, SIGHUP, SIGTERM}) {
+ struct sigaction action;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ action.sa_handler = SIG_DFL;
+ PCHECK(sigaction(signal, &action, nullptr) == 0);
+ }
}
if (memory_cgroup_) {
diff --git a/aos/starter/subprocess_reliable_test.cc b/aos/starter/subprocess_reliable_test.cc
new file mode 100644
index 0000000..0460bc3
--- /dev/null
+++ b/aos/starter/subprocess_reliable_test.cc
@@ -0,0 +1,127 @@
+#include <signal.h>
+#include <sys/types.h>
+
+#include <filesystem>
+
+#include "gtest/gtest.h"
+
+#include "aos/events/shm_event_loop.h"
+#include "aos/starter/subprocess.h"
+#include "aos/testing/path.h"
+#include "aos/testing/tmpdir.h"
+#include "aos/util/file.h"
+
+namespace aos::starter::testing {
+
+namespace {
+void Wait(pid_t pid) {
+ int status;
+ if (waitpid(pid, &status, 0) != pid) {
+ if (errno != ECHILD) {
+ PLOG(ERROR) << "Failed to wait for PID " << pid << ": " << status;
+ FAIL();
+ }
+ }
+ LOG(INFO) << "Succesfully waited for PID " << pid;
+}
+
+} // namespace
+
+// Validates that killing a child process right after startup doesn't have any
+// unexpected consequences. The child process should exit even if it hasn't
+// `exec()`d yet.
+TEST(SubprocessTest, KillDuringStartup) {
+ const std::string config_file =
+ ::aos::testing::ArtifactPath("aos/events/pingpong_config.json");
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(config_file);
+ aos::ShmEventLoop event_loop(&config.message());
+
+ // Run an application that takes a really long time to exit. The exact details
+ // here don't matter. We just need to to survive long enough until we can call
+ // Terminate() below.
+ auto application =
+ std::make_unique<Application>("sleep", "sleep", &event_loop, []() {});
+ application->set_args({"100"});
+
+ // Track whether we exit via our own timer callback. We don't want to exit
+ // because of any strange interactions with the child process.
+ bool exited_as_expected = false;
+
+ // Here's the sequence of events that we expect to see:
+ // 1. Start child process.
+ // 2. Stop child process (via `Terminate()`).
+ // 3. Wait 1 second.
+ // 4. Set `exited_as_expected` to `true`.
+ // 5. Exit the event loop.
+ //
+ // At the end, if `exited_as_expected` is `false`, then something unexpected
+ // happened and we failed the test here.
+ aos::TimerHandler *shutdown_timer = event_loop.AddTimer([&]() {
+ exited_as_expected = true;
+ event_loop.Exit();
+ });
+ aos::TimerHandler *trigger_timer = event_loop.AddTimer([&]() {
+ application->Start();
+ application->Terminate();
+ shutdown_timer->Schedule(event_loop.monotonic_now() +
+ std::chrono::seconds(1));
+ });
+ trigger_timer->Schedule(event_loop.monotonic_now());
+ event_loop.Run();
+ application->Stop();
+ Wait(application->get_pid());
+
+ EXPECT_TRUE(exited_as_expected) << "It looks like we killed ourselves.";
+}
+
+// Validates that the code in subprocess.cc doesn't accidentally block signals
+// in the child process.
+TEST(SubprocessTest, CanKillAfterStartup) {
+ const std::string config_file =
+ ::aos::testing::ArtifactPath("aos/events/pingpong_config.json");
+ aos::FlatbufferDetachedBuffer<aos::Configuration> config =
+ aos::configuration::ReadConfig(config_file);
+ aos::ShmEventLoop event_loop(&config.message());
+
+ // We create a directory in which we create some files so this test here and
+ // the subsequently created child process can "signal" one another. We roughly
+ // expect the following sequence of events:
+ // 1. Start the child process.
+ // 2. Test waits for "startup" file to be created by child.
+ // 3. Child process creates the "startup" file.
+ // 4. Test sees "startup" file being created, sends SIGTERM to child.
+ // 5. Child sees SIGTERM, creates "shutdown" file, and exits.
+ // 6. Test waits for child to exit.
+ // 7. Test validates that the "shutdown" file was created by the child.
+ auto signal_dir = std::filesystem::path(aos::testing::TestTmpDir()) /
+ "startup_file_signals";
+ ASSERT_TRUE(std::filesystem::create_directory(signal_dir));
+ auto startup_signal_file = signal_dir / "startup";
+ auto shutdown_signal_file = signal_dir / "shutdown";
+
+ auto application = std::make_unique<Application>("/bin/bash", "/bin/bash",
+ &event_loop, []() {});
+ application->set_args(
+ {"-c", absl::StrCat("cleanup() { touch ", shutdown_signal_file.string(),
+ "; exit 0; }; trap cleanup SIGTERM; touch ",
+ startup_signal_file.string(),
+ "; while true; do sleep 0.1; done;")});
+
+ // Wait for the child process to create the "startup" file.
+ ASSERT_FALSE(std::filesystem::exists(startup_signal_file));
+ application->Start();
+ while (!std::filesystem::exists(startup_signal_file)) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(50));
+ }
+ ASSERT_FALSE(std::filesystem::exists(shutdown_signal_file));
+
+ // Manually kill the application here. The Stop() and Terminate() helpers
+ // trigger some timeout behaviour that interferes with the test here. This
+ // should cause the child to exit and create the "shutdown" file.
+ PCHECK(kill(application->get_pid(), SIGTERM) == 0);
+ Wait(application->get_pid());
+ ASSERT_TRUE(std::filesystem::exists(shutdown_signal_file));
+}
+
+} // namespace aos::starter::testing