blob: c1eb618da57a05d63c93329a256687d98b721032 [file] [log] [blame]
James Kuszmaul3224b8e2022-01-07 19:00:39 -08001#include "aos/starter/subprocess.h"
2
3#include <grp.h>
4#include <pwd.h>
5#include <sys/prctl.h>
6#include <sys/types.h>
7#include <sys/wait.h>
8
9#include "glog/logging.h"
10
11namespace aos::starter {
12
13SignalListener::SignalListener(aos::ShmEventLoop *loop,
14 std::function<void(signalfd_siginfo)> callback)
15 : SignalListener(loop, callback,
16 {SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV,
17 SIGPIPE, SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {}
18
19SignalListener::SignalListener(aos::ShmEventLoop *loop,
20 std::function<void(signalfd_siginfo)> callback,
21 std::initializer_list<unsigned int> signals)
22 : loop_(loop), callback_(std::move(callback)), signalfd_(signals) {
23 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
24 signalfd_siginfo info = signalfd_.Read();
25
26 if (info.ssi_signo == 0) {
27 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
28 return;
29 }
30
31 callback_(info);
32 });
33}
34
35SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
36
James Kuszmauld42edb42022-01-07 18:00:16 -080037Application::Application(std::string_view name,
38 std::string_view executable_name,
James Kuszmaul3224b8e2022-01-07 19:00:39 -080039 aos::EventLoop *event_loop,
40 std::function<void()> on_change)
James Kuszmauld42edb42022-01-07 18:00:16 -080041 : name_(name),
42 path_(executable_name),
James Kuszmaul3224b8e2022-01-07 19:00:39 -080043 event_loop_(event_loop),
44 start_timer_(event_loop_->AddTimer([this] {
45 status_ = aos::starter::State::RUNNING;
46 LOG(INFO) << "Started '" << name_ << "' pid: " << pid_;
47 })),
48 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
49 stop_timer_(event_loop_->AddTimer([this] {
50 if (kill(pid_, SIGKILL) == 0) {
51 LOG(WARNING) << "Failed to stop, sending SIGKILL to '" << name_
52 << "' pid: " << pid_;
53 }
54 })),
James Kuszmauld42edb42022-01-07 18:00:16 -080055 pipe_timer_(event_loop_->AddTimer([this]() { FetchOutputs(); })),
56 child_status_handler_(
57 event_loop_->AddTimer([this]() { MaybeHandleSignal(); })),
58 on_change_(on_change) {
59 event_loop_->OnRun([this]() {
60 // Every second poll to check if the child is dead. This is used as a
61 // default for the case where the user is not directly catching SIGCHLD and
62 // calling MaybeHandleSignal for us.
63 child_status_handler_->Setup(event_loop_->monotonic_now(),
64 std::chrono::seconds(1));
65 });
66}
67
68Application::Application(const aos::Application *application,
69 aos::EventLoop *event_loop,
70 std::function<void()> on_change)
71 : Application(application->name()->string_view(),
72 application->has_executable_name()
73 ? application->executable_name()->string_view()
74 : application->name()->string_view(),
75 event_loop, on_change) {
76 user_name_ = application->has_user() ? application->user()->str() : "";
77 user_ = application->has_user() ? FindUid(user_name_.c_str()) : std::nullopt;
78 group_ = application->has_user() ? FindPrimaryGidForUser(user_name_.c_str())
79 : std::nullopt;
80 autostart_ = application->autostart();
81 autorestart_ = application->autorestart();
82 if (application->has_args()) {
83 set_args(*application->args());
84 }
85}
James Kuszmaul3224b8e2022-01-07 19:00:39 -080086
87void Application::DoStart() {
88 if (status_ != aos::starter::State::WAITING) {
89 return;
90 }
91
92 start_timer_->Disable();
93 restart_timer_->Disable();
94
James Kuszmauld42edb42022-01-07 18:00:16 -080095 status_pipes_ = util::ScopedPipe::MakePipe();
96
97 if (capture_stdout_) {
98 stdout_pipes_ = util::ScopedPipe::MakePipe();
99 stdout_.clear();
100 }
101 if (capture_stderr_) {
102 stderr_pipes_ = util::ScopedPipe::MakePipe();
103 stderr_.clear();
104 }
105
106 pipe_timer_->Setup(event_loop_->monotonic_now(),
107 std::chrono::milliseconds(100));
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800108
109 const pid_t pid = fork();
110
111 if (pid != 0) {
112 if (pid == -1) {
113 PLOG(WARNING) << "Failed to fork '" << name_ << "'";
114 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
115 status_ = aos::starter::State::STOPPED;
116 } else {
117 pid_ = pid;
118 id_ = next_id_++;
119 start_time_ = event_loop_->monotonic_now();
120 status_ = aos::starter::State::STARTING;
121 LOG(INFO) << "Starting '" << name_ << "' pid " << pid_;
122
123 // Setup timer which moves application to RUNNING state if it is still
124 // alive in 1 second.
125 start_timer_->Setup(event_loop_->monotonic_now() +
126 std::chrono::seconds(1));
James Kuszmauld42edb42022-01-07 18:00:16 -0800127 // Since we are the parent process, clear our write-side of all the pipes.
128 status_pipes_.write.reset();
129 stdout_pipes_.write.reset();
130 stderr_pipes_.write.reset();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800131 }
132 on_change_();
133 return;
134 }
135
James Kuszmauld42edb42022-01-07 18:00:16 -0800136 // Since we are the child process, clear our read-side of all the pipes.
137 status_pipes_.read.reset();
138 stdout_pipes_.read.reset();
139 stderr_pipes_.read.reset();
140
141 // The status pipe will not be needed if the execve succeeds.
142 status_pipes_.write->SetCloexec();
143
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800144 // Clear out signal mask of parent so forked process receives all signals
145 // normally.
146 sigset_t empty_mask;
147 sigemptyset(&empty_mask);
148 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
149
150 // Cleanup children if starter dies in a way that is not handled gracefully.
151 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800152 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800153 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
154 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
155 }
156
157 if (group_) {
158 CHECK(!user_name_.empty());
159 // The manpage for setgroups says we just need CAP_SETGID, but empirically
160 // we also need the effective UID to be 0 to make it work. user_ must also
161 // be set so we change this effective UID back later.
162 CHECK(user_);
163 if (seteuid(0) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800164 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800165 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
166 PLOG(FATAL) << "Could not seteuid(0) for " << name_
167 << " in preparation for setting groups";
168 }
169 if (initgroups(user_name_.c_str(), *group_) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800170 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800171 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
172 PLOG(FATAL) << "Could not initialize normal groups for " << name_
173 << " as " << user_name_ << " with " << *group_;
174 }
175 if (setgid(*group_) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800176 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800177 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
178 PLOG(FATAL) << "Could not set group for " << name_ << " to " << *group_;
179 }
180 }
181
182 if (user_) {
183 if (setuid(*user_) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800184 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800185 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
186 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
187 }
188 }
189
James Kuszmauld42edb42022-01-07 18:00:16 -0800190 if (capture_stdout_) {
191 PCHECK(STDOUT_FILENO == dup2(stdout_pipes_.write->fd(), STDOUT_FILENO));
192 stdout_pipes_.write.reset();
193 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800194
James Kuszmauld42edb42022-01-07 18:00:16 -0800195 if (capture_stderr_) {
196 PCHECK(STDERR_FILENO == dup2(stderr_pipes_.write->fd(), STDERR_FILENO));
197 stderr_pipes_.write.reset();
198 }
199
200 // argv[0] should be the program name
201 args_.insert(args_.begin(), path_);
202
203 std::vector<char *> cargs = CArgs();
204 execvp(path_.c_str(), cargs.data());
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800205
206 // If we got here, something went wrong
James Kuszmauld42edb42022-01-07 18:00:16 -0800207 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800208 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
209 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
210
211 _exit(EXIT_FAILURE);
212}
213
James Kuszmauld42edb42022-01-07 18:00:16 -0800214void Application::FetchOutputs() {
215 if (capture_stdout_) {
216 stdout_pipes_.read->Read(&stdout_);
217 }
218 if (capture_stderr_) {
219 stderr_pipes_.read->Read(&stderr_);
220 }
221}
222
223const std::string &Application::GetStdout() {
224 CHECK(capture_stdout_);
225 FetchOutputs();
226 return stdout_;
227}
228
229const std::string &Application::GetStderr() {
230 CHECK(capture_stderr_);
231 FetchOutputs();
232 return stderr_;
233}
234
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800235void Application::DoStop(bool restart) {
236 // If stop or restart received, the old state of these is no longer applicable
237 // so cancel both.
238 restart_timer_->Disable();
239 start_timer_->Disable();
240
James Kuszmauld42edb42022-01-07 18:00:16 -0800241 FetchOutputs();
242
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800243 switch (status_) {
244 case aos::starter::State::STARTING:
245 case aos::starter::State::RUNNING: {
246 LOG(INFO) << "Stopping '" << name_ << "' pid: " << pid_ << " with signal "
247 << SIGINT;
248 status_ = aos::starter::State::STOPPING;
249
250 kill(pid_, SIGINT);
251
252 // Watchdog timer to SIGKILL application if it is still running 1 second
253 // after SIGINT
254 stop_timer_->Setup(event_loop_->monotonic_now() +
255 std::chrono::seconds(1));
256 queue_restart_ = restart;
257 on_change_();
258 break;
259 }
260 case aos::starter::State::WAITING: {
261 // If waiting to restart, and receives restart, skip the waiting period
262 // and restart immediately. If stop received, all we have to do is move
263 // to the STOPPED state.
264 if (restart) {
265 DoStart();
266 } else {
267 status_ = aos::starter::State::STOPPED;
268 on_change_();
269 }
270 break;
271 }
272 case aos::starter::State::STOPPING: {
273 // If the application is already stopping, then we just need to update the
274 // restart flag to the most recent status.
275 queue_restart_ = restart;
276 break;
277 }
278 case aos::starter::State::STOPPED: {
279 // Restart immediately if the application is already stopped
280 if (restart) {
281 status_ = aos::starter::State::WAITING;
282 DoStart();
283 }
284 break;
285 }
286 }
287}
288
289void Application::QueueStart() {
290 status_ = aos::starter::State::WAITING;
291
292 LOG(INFO) << "Restarting " << name_ << " in 3 seconds";
293 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(3));
294 start_timer_->Disable();
295 stop_timer_->Disable();
296 on_change_();
297}
298
James Kuszmauld42edb42022-01-07 18:00:16 -0800299std::vector<char *> Application::CArgs() {
300 std::vector<char *> cargs;
301 std::transform(args_.begin(), args_.end(), std::back_inserter(cargs),
302 [](std::string &str) { return str.data(); });
303 cargs.push_back(nullptr);
304 return cargs;
305}
306
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800307void Application::set_args(
308 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
309 args_.clear();
310 std::transform(v.begin(), v.end(), std::back_inserter(args_),
James Kuszmauld42edb42022-01-07 18:00:16 -0800311 [](const flatbuffers::String *str) { return str->str(); });
312}
313
314void Application::set_args(std::vector<std::string> args) {
315 args_ = std::move(args);
316}
317
318void Application::set_capture_stdout(bool capture) {
319 capture_stdout_ = capture;
320}
321
322void Application::set_capture_stderr(bool capture) {
323 capture_stderr_ = capture;
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800324}
325
326std::optional<uid_t> Application::FindUid(const char *name) {
327 // TODO(austin): Use the reentrant version. This should be safe.
328 struct passwd *user_data = getpwnam(name);
329 if (user_data != nullptr) {
330 return user_data->pw_uid;
331 } else {
332 LOG(FATAL) << "Could not find user " << name;
333 return std::nullopt;
334 }
335}
336
337std::optional<gid_t> Application::FindPrimaryGidForUser(const char *name) {
338 // TODO(austin): Use the reentrant version. This should be safe.
339 struct passwd *user_data = getpwnam(name);
340 if (user_data != nullptr) {
341 return user_data->pw_gid;
342 } else {
343 LOG(FATAL) << "Could not find user " << name;
344 return std::nullopt;
345 }
346}
347
348flatbuffers::Offset<aos::starter::ApplicationStatus>
349Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
350 CHECK_NOTNULL(builder);
351 auto name_fbs = builder->CreateString(name_);
352
353 aos::starter::ApplicationStatus::Builder status_builder(*builder);
354 status_builder.add_name(name_fbs);
355 status_builder.add_state(status_);
James Kuszmauld42edb42022-01-07 18:00:16 -0800356 if (exit_code_.has_value()) {
357 status_builder.add_last_exit_code(exit_code_.value());
358 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800359 status_builder.add_last_stop_reason(stop_reason_);
360 if (pid_ != -1) {
361 status_builder.add_pid(pid_);
362 status_builder.add_id(id_);
363 }
364 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
365 return status_builder.Finish();
366}
367
368void Application::Terminate() {
369 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
370 DoStop(false);
371 terminating_ = true;
372}
373
374void Application::HandleCommand(aos::starter::Command cmd) {
375 switch (cmd) {
376 case aos::starter::Command::START: {
377 switch (status_) {
378 case aos::starter::State::WAITING: {
379 restart_timer_->Disable();
380 DoStart();
381 break;
382 }
383 case aos::starter::State::STARTING: {
384 break;
385 }
386 case aos::starter::State::RUNNING: {
387 break;
388 }
389 case aos::starter::State::STOPPING: {
390 queue_restart_ = true;
391 break;
392 }
393 case aos::starter::State::STOPPED: {
394 status_ = aos::starter::State::WAITING;
395 DoStart();
396 break;
397 }
398 }
399 break;
400 }
401 case aos::starter::Command::STOP: {
402 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
403 DoStop(false);
404 break;
405 }
406 case aos::starter::Command::RESTART: {
407 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
408 DoStop(true);
409 break;
410 }
411 }
412}
413
414bool Application::MaybeHandleSignal() {
415 int status;
416
417 // Check if the status of this process has changed
418 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
419 return false;
420 }
421
422 // Check that the event was the process exiting
423 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
424 return false;
425 }
426
James Kuszmauld42edb42022-01-07 18:00:16 -0800427 start_timer_->Disable();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800428 exit_time_ = event_loop_->monotonic_now();
429 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
430
James Kuszmauld42edb42022-01-07 18:00:16 -0800431 if (auto read_result = status_pipes_.read->Read()) {
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800432 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
433 }
434
435 switch (status_) {
436 case aos::starter::State::STARTING: {
James Kuszmauld42edb42022-01-07 18:00:16 -0800437 if (exit_code_.value() == 0) {
438 LOG(INFO) << "Application '" << name_ << "' pid " << pid_
439 << " exited with status " << exit_code_.value();
440 } else {
441 LOG(WARNING) << "Failed to start '" << name_ << "' on pid " << pid_
442 << " : Exited with status " << exit_code_.value();
443 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800444 if (autorestart()) {
445 QueueStart();
James Kuszmauld42edb42022-01-07 18:00:16 -0800446 } else {
447 status_ = aos::starter::State::STOPPED;
448 on_change_();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800449 }
450 break;
451 }
452 case aos::starter::State::RUNNING: {
James Kuszmauld42edb42022-01-07 18:00:16 -0800453 if (exit_code_.value() == 0) {
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800454 LOG(INFO) << "Application '" << name_ << "' pid " << pid_
James Kuszmauld42edb42022-01-07 18:00:16 -0800455 << " exited with status " << exit_code_.value();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800456 } else {
457 LOG(WARNING) << "Application '" << name_ << "' pid " << pid_
James Kuszmauld42edb42022-01-07 18:00:16 -0800458 << " exited unexpectedly with status "
459 << exit_code_.value();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800460 }
461 if (autorestart()) {
462 QueueStart();
James Kuszmauld42edb42022-01-07 18:00:16 -0800463 } else {
464 status_ = aos::starter::State::STOPPED;
465 on_change_();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800466 }
467 break;
468 }
469 case aos::starter::State::STOPPING: {
470 LOG(INFO) << "Successfully stopped '" << name_ << "' pid: " << pid_
James Kuszmauld42edb42022-01-07 18:00:16 -0800471 << " with status " << exit_code_.value();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800472 status_ = aos::starter::State::STOPPED;
473
474 // Disable force stop timer since the process already died
475 stop_timer_->Disable();
476
477 on_change_();
478 if (terminating_) {
479 return true;
480 }
481
482 if (queue_restart_) {
483 queue_restart_ = false;
484 status_ = aos::starter::State::WAITING;
485 DoStart();
486 }
487 break;
488 }
489 case aos::starter::State::WAITING:
490 case aos::starter::State::STOPPED: {
491 LOG(FATAL)
492 << "Received signal on process that was already stopped : name: '"
493 << name_ << "' pid: " << pid_;
494 break;
495 }
496 }
497
498 return false;
499}
500
501} // namespace aos::starter