Prevent starter from killing itself accidentally
This patch aims to fix a race condition where a user of subprocess.cc
(e.g. starter) and `shm_event_loop` can kill itself accidentally. This
generally happens when the user attempts to kill the child before the
`exec()` call has happened. In that case, the
`SignalHandler::DoHandleSignal()` function from `shm_event_loop.cc`
runs inside the child, but actually writes the quit event to the pipe
that the parent is listening to.
The fix here is to block the signals before the fork. The parent
process will unblock the signals before exiting from the `DoStart()`
function. The child will first restore the default signal handlers and
_then_ unblock the signals. This will cause the child process to
terminate as expected when it sees a relevant signal.
If I undo the changes in `subprocess.cc`, the test becomes flaky.
$ bazel test //aos/starter:subprocess_reliable_test --runs_per_test=1000
...
FAILED in 986 out of 1000 in 100.6s
References: PRO-23549
Change-Id: I958071df561bcdef78088740136e52a8956dfffe
Signed-off-by: James Kuszmaul <james.kuszmaul@bluerivertech.com>
diff --git a/aos/starter/subprocess.cc b/aos/starter/subprocess.cc
index 039f3d8..4d21c61 100644
--- a/aos/starter/subprocess.cc
+++ b/aos/starter/subprocess.cc
@@ -12,6 +12,25 @@
namespace aos::starter {
+// Blocks all signals while an instance of this class is in scope.
+class ScopedCompleteSignalBlocker {
+ public:
+ ScopedCompleteSignalBlocker() {
+ sigset_t mask;
+ sigfillset(&mask);
+ // Remember the current mask.
+ PCHECK(sigprocmask(SIG_SETMASK, &mask, &old_mask_) == 0);
+ }
+
+ ~ScopedCompleteSignalBlocker() {
+ // Restore the remembered mask.
+ PCHECK(sigprocmask(SIG_SETMASK, &old_mask_, nullptr) == 0);
+ }
+
+ private:
+ sigset_t old_mask_;
+};
+
// RAII class to become root and restore back to the original user and group
// afterwards.
class Sudo {
@@ -211,35 +230,56 @@
pipe_timer_->Schedule(event_loop_->monotonic_now(),
std::chrono::milliseconds(100));
- const pid_t pid = fork();
+ {
+ // Block all signals during the fork() call. Together with the default
+ // signal handler restoration below, This prevents signal handlers from
+ // getting called in the child and accidentally affecting the parent. In
+ // particular, the exit handler for shm_event_loop could be called here if
+ // we don't exec() quickly enough.
+ ScopedCompleteSignalBlocker signal_blocker;
- if (pid != 0) {
- if (pid == -1) {
- PLOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo ||
- quiet_flag_ == QuietLogging::kNotForDebugging)
- << "Failed to fork '" << name_ << "'";
- stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
- status_ = aos::starter::State::STOPPED;
- } else {
- pid_ = pid;
- id_ = next_id_++;
- start_time_ = event_loop_->monotonic_now();
- status_ = aos::starter::State::STARTING;
- latest_timing_report_version_.reset();
- LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
- << "Starting '" << name_ << "' pid " << pid_;
+ const pid_t pid = fork();
- // Set up timer which moves application to RUNNING state if it is still
- // alive in 1 second.
- start_timer_->Schedule(event_loop_->monotonic_now() +
- std::chrono::seconds(1));
- // Since we are the parent process, clear our write-side of all the pipes.
- status_pipes_.write.reset();
- stdout_pipes_.write.reset();
- stderr_pipes_.write.reset();
+ if (pid != 0) {
+ if (pid == -1) {
+ PLOG_IF(WARNING, quiet_flag_ == QuietLogging::kNo ||
+ quiet_flag_ == QuietLogging::kNotForDebugging)
+ << "Failed to fork '" << name_ << "'";
+ stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
+ status_ = aos::starter::State::STOPPED;
+ } else {
+ pid_ = pid;
+ id_ = next_id_++;
+ start_time_ = event_loop_->monotonic_now();
+ status_ = aos::starter::State::STARTING;
+ latest_timing_report_version_.reset();
+ LOG_IF(INFO, quiet_flag_ == QuietLogging::kNo)
+ << "Starting '" << name_ << "' pid " << pid_;
+
+ // Set up timer which moves application to RUNNING state if it is still
+ // alive in 1 second.
+ start_timer_->Schedule(event_loop_->monotonic_now() +
+ std::chrono::seconds(1));
+ // Since we are the parent process, clear our write-side of all the
+ // pipes.
+ status_pipes_.write.reset();
+ stdout_pipes_.write.reset();
+ stderr_pipes_.write.reset();
+ }
+ OnChange();
+ return;
}
- OnChange();
- return;
+
+ // Clear any signal handlers so that they don't accidentally interfere with
+ // the parent process. Is there a better way to iterate over all the
+ // signals? Right now we're just dealing with the most common ones.
+ for (int signal : {SIGINT, SIGHUP, SIGTERM}) {
+ struct sigaction action;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ action.sa_handler = SIG_DFL;
+ PCHECK(sigaction(signal, &action, nullptr) == 0);
+ }
}
if (memory_cgroup_) {