blob: f0c8f851ff42995b231c5c71389ee4015dcb42ef [file] [log] [blame]
#include "aos/starter/subprocess.h"

#include <grp.h>
#include <pwd.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <cerrno>

#include "glog/logging.h"
10
11namespace aos::starter {
12
13SignalListener::SignalListener(aos::ShmEventLoop *loop,
14 std::function<void(signalfd_siginfo)> callback)
15 : SignalListener(loop, callback,
16 {SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV,
17 SIGPIPE, SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {}
18
19SignalListener::SignalListener(aos::ShmEventLoop *loop,
20 std::function<void(signalfd_siginfo)> callback,
21 std::initializer_list<unsigned int> signals)
22 : loop_(loop), callback_(std::move(callback)), signalfd_(signals) {
23 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
24 signalfd_siginfo info = signalfd_.Read();
25
26 if (info.ssi_signo == 0) {
27 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
28 return;
29 }
30
31 callback_(info);
32 });
33}
34
35SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
36
James Kuszmauld42edb42022-01-07 18:00:16 -080037Application::Application(std::string_view name,
38 std::string_view executable_name,
James Kuszmaul3224b8e2022-01-07 19:00:39 -080039 aos::EventLoop *event_loop,
40 std::function<void()> on_change)
James Kuszmauld42edb42022-01-07 18:00:16 -080041 : name_(name),
42 path_(executable_name),
James Kuszmaul3224b8e2022-01-07 19:00:39 -080043 event_loop_(event_loop),
44 start_timer_(event_loop_->AddTimer([this] {
45 status_ = aos::starter::State::RUNNING;
46 LOG(INFO) << "Started '" << name_ << "' pid: " << pid_;
47 })),
48 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
49 stop_timer_(event_loop_->AddTimer([this] {
50 if (kill(pid_, SIGKILL) == 0) {
51 LOG(WARNING) << "Failed to stop, sending SIGKILL to '" << name_
52 << "' pid: " << pid_;
53 }
54 })),
James Kuszmauld42edb42022-01-07 18:00:16 -080055 pipe_timer_(event_loop_->AddTimer([this]() { FetchOutputs(); })),
56 child_status_handler_(
57 event_loop_->AddTimer([this]() { MaybeHandleSignal(); })),
58 on_change_(on_change) {
59 event_loop_->OnRun([this]() {
60 // Every second poll to check if the child is dead. This is used as a
61 // default for the case where the user is not directly catching SIGCHLD and
62 // calling MaybeHandleSignal for us.
63 child_status_handler_->Setup(event_loop_->monotonic_now(),
64 std::chrono::seconds(1));
65 });
66}
67
68Application::Application(const aos::Application *application,
69 aos::EventLoop *event_loop,
70 std::function<void()> on_change)
71 : Application(application->name()->string_view(),
72 application->has_executable_name()
73 ? application->executable_name()->string_view()
74 : application->name()->string_view(),
75 event_loop, on_change) {
76 user_name_ = application->has_user() ? application->user()->str() : "";
77 user_ = application->has_user() ? FindUid(user_name_.c_str()) : std::nullopt;
78 group_ = application->has_user() ? FindPrimaryGidForUser(user_name_.c_str())
79 : std::nullopt;
80 autostart_ = application->autostart();
81 autorestart_ = application->autorestart();
82 if (application->has_args()) {
83 set_args(*application->args());
84 }
85}
James Kuszmaul3224b8e2022-01-07 19:00:39 -080086
87void Application::DoStart() {
88 if (status_ != aos::starter::State::WAITING) {
89 return;
90 }
91
92 start_timer_->Disable();
93 restart_timer_->Disable();
94
James Kuszmauld42edb42022-01-07 18:00:16 -080095 status_pipes_ = util::ScopedPipe::MakePipe();
96
97 if (capture_stdout_) {
98 stdout_pipes_ = util::ScopedPipe::MakePipe();
99 stdout_.clear();
100 }
101 if (capture_stderr_) {
102 stderr_pipes_ = util::ScopedPipe::MakePipe();
103 stderr_.clear();
104 }
105
106 pipe_timer_->Setup(event_loop_->monotonic_now(),
107 std::chrono::milliseconds(100));
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800108
109 const pid_t pid = fork();
110
111 if (pid != 0) {
112 if (pid == -1) {
113 PLOG(WARNING) << "Failed to fork '" << name_ << "'";
114 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
115 status_ = aos::starter::State::STOPPED;
116 } else {
117 pid_ = pid;
118 id_ = next_id_++;
119 start_time_ = event_loop_->monotonic_now();
120 status_ = aos::starter::State::STARTING;
121 LOG(INFO) << "Starting '" << name_ << "' pid " << pid_;
122
123 // Setup timer which moves application to RUNNING state if it is still
124 // alive in 1 second.
125 start_timer_->Setup(event_loop_->monotonic_now() +
126 std::chrono::seconds(1));
James Kuszmauld42edb42022-01-07 18:00:16 -0800127 // Since we are the parent process, clear our write-side of all the pipes.
128 status_pipes_.write.reset();
129 stdout_pipes_.write.reset();
130 stderr_pipes_.write.reset();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800131 }
132 on_change_();
133 return;
134 }
135
James Kuszmauld42edb42022-01-07 18:00:16 -0800136 // Since we are the child process, clear our read-side of all the pipes.
137 status_pipes_.read.reset();
138 stdout_pipes_.read.reset();
139 stderr_pipes_.read.reset();
140
141 // The status pipe will not be needed if the execve succeeds.
142 status_pipes_.write->SetCloexec();
143
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800144 // Clear out signal mask of parent so forked process receives all signals
145 // normally.
146 sigset_t empty_mask;
147 sigemptyset(&empty_mask);
148 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
149
150 // Cleanup children if starter dies in a way that is not handled gracefully.
151 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800152 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800153 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
154 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
155 }
156
157 if (group_) {
158 CHECK(!user_name_.empty());
159 // The manpage for setgroups says we just need CAP_SETGID, but empirically
160 // we also need the effective UID to be 0 to make it work. user_ must also
161 // be set so we change this effective UID back later.
162 CHECK(user_);
163 if (seteuid(0) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800164 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800165 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
166 PLOG(FATAL) << "Could not seteuid(0) for " << name_
167 << " in preparation for setting groups";
168 }
169 if (initgroups(user_name_.c_str(), *group_) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800170 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800171 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
172 PLOG(FATAL) << "Could not initialize normal groups for " << name_
173 << " as " << user_name_ << " with " << *group_;
174 }
175 if (setgid(*group_) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800176 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800177 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
178 PLOG(FATAL) << "Could not set group for " << name_ << " to " << *group_;
179 }
180 }
181
182 if (user_) {
183 if (setuid(*user_) == -1) {
James Kuszmauld42edb42022-01-07 18:00:16 -0800184 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800185 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
186 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
187 }
188 }
189
James Kuszmauld42edb42022-01-07 18:00:16 -0800190 if (capture_stdout_) {
191 PCHECK(STDOUT_FILENO == dup2(stdout_pipes_.write->fd(), STDOUT_FILENO));
192 stdout_pipes_.write.reset();
193 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800194
James Kuszmauld42edb42022-01-07 18:00:16 -0800195 if (capture_stderr_) {
196 PCHECK(STDERR_FILENO == dup2(stderr_pipes_.write->fd(), STDERR_FILENO));
197 stderr_pipes_.write.reset();
198 }
199
200 // argv[0] should be the program name
James Kuszmaul6f10b382022-03-11 22:31:38 -0800201 args_.insert(args_.begin(), path_);
James Kuszmauld42edb42022-01-07 18:00:16 -0800202
203 std::vector<char *> cargs = CArgs();
James Kuszmaul6f10b382022-03-11 22:31:38 -0800204 execvp(path_.c_str(), cargs.data());
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800205
206 // If we got here, something went wrong
James Kuszmauld42edb42022-01-07 18:00:16 -0800207 status_pipes_.write->Write(
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800208 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
James Kuszmaul6f10b382022-03-11 22:31:38 -0800209 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800210
211 _exit(EXIT_FAILURE);
212}
213
James Kuszmauld42edb42022-01-07 18:00:16 -0800214void Application::FetchOutputs() {
215 if (capture_stdout_) {
216 stdout_pipes_.read->Read(&stdout_);
217 }
218 if (capture_stderr_) {
219 stderr_pipes_.read->Read(&stderr_);
220 }
221}
222
223const std::string &Application::GetStdout() {
224 CHECK(capture_stdout_);
225 FetchOutputs();
226 return stdout_;
227}
228
229const std::string &Application::GetStderr() {
230 CHECK(capture_stderr_);
231 FetchOutputs();
232 return stderr_;
233}
234
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800235void Application::DoStop(bool restart) {
236 // If stop or restart received, the old state of these is no longer applicable
237 // so cancel both.
238 restart_timer_->Disable();
239 start_timer_->Disable();
240
James Kuszmauld42edb42022-01-07 18:00:16 -0800241 FetchOutputs();
242
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800243 switch (status_) {
244 case aos::starter::State::STARTING:
245 case aos::starter::State::RUNNING: {
246 LOG(INFO) << "Stopping '" << name_ << "' pid: " << pid_ << " with signal "
247 << SIGINT;
248 status_ = aos::starter::State::STOPPING;
249
250 kill(pid_, SIGINT);
251
252 // Watchdog timer to SIGKILL application if it is still running 1 second
253 // after SIGINT
254 stop_timer_->Setup(event_loop_->monotonic_now() +
255 std::chrono::seconds(1));
256 queue_restart_ = restart;
257 on_change_();
258 break;
259 }
260 case aos::starter::State::WAITING: {
261 // If waiting to restart, and receives restart, skip the waiting period
262 // and restart immediately. If stop received, all we have to do is move
263 // to the STOPPED state.
264 if (restart) {
265 DoStart();
266 } else {
267 status_ = aos::starter::State::STOPPED;
268 on_change_();
269 }
270 break;
271 }
272 case aos::starter::State::STOPPING: {
273 // If the application is already stopping, then we just need to update the
274 // restart flag to the most recent status.
275 queue_restart_ = restart;
276 break;
277 }
278 case aos::starter::State::STOPPED: {
279 // Restart immediately if the application is already stopped
280 if (restart) {
281 status_ = aos::starter::State::WAITING;
282 DoStart();
283 }
284 break;
285 }
286 }
287}
288
289void Application::QueueStart() {
290 status_ = aos::starter::State::WAITING;
291
292 LOG(INFO) << "Restarting " << name_ << " in 3 seconds";
293 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(3));
294 start_timer_->Disable();
295 stop_timer_->Disable();
296 on_change_();
297}
298
James Kuszmauld42edb42022-01-07 18:00:16 -0800299std::vector<char *> Application::CArgs() {
300 std::vector<char *> cargs;
301 std::transform(args_.begin(), args_.end(), std::back_inserter(cargs),
302 [](std::string &str) { return str.data(); });
303 cargs.push_back(nullptr);
304 return cargs;
305}
306
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800307void Application::set_args(
308 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
309 args_.clear();
310 std::transform(v.begin(), v.end(), std::back_inserter(args_),
James Kuszmauld42edb42022-01-07 18:00:16 -0800311 [](const flatbuffers::String *str) { return str->str(); });
312}
313
314void Application::set_args(std::vector<std::string> args) {
315 args_ = std::move(args);
316}
317
318void Application::set_capture_stdout(bool capture) {
319 capture_stdout_ = capture;
320}
321
322void Application::set_capture_stderr(bool capture) {
323 capture_stderr_ = capture;
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800324}
325
326std::optional<uid_t> Application::FindUid(const char *name) {
327 // TODO(austin): Use the reentrant version. This should be safe.
328 struct passwd *user_data = getpwnam(name);
329 if (user_data != nullptr) {
330 return user_data->pw_uid;
331 } else {
332 LOG(FATAL) << "Could not find user " << name;
333 return std::nullopt;
334 }
335}
336
337std::optional<gid_t> Application::FindPrimaryGidForUser(const char *name) {
338 // TODO(austin): Use the reentrant version. This should be safe.
339 struct passwd *user_data = getpwnam(name);
340 if (user_data != nullptr) {
341 return user_data->pw_gid;
342 } else {
343 LOG(FATAL) << "Could not find user " << name;
344 return std::nullopt;
345 }
346}
347
348flatbuffers::Offset<aos::starter::ApplicationStatus>
James Kuszmaul6295a642022-03-22 15:23:59 -0700349Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder,
350 util::Top *top) {
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800351 CHECK_NOTNULL(builder);
352 auto name_fbs = builder->CreateString(name_);
353
James Kuszmaul6295a642022-03-22 15:23:59 -0700354 const bool valid_pid = pid_ > 0 && status_ != aos::starter::State::STOPPED;
355 const flatbuffers::Offset<util::ProcessInfo> process_info =
356 valid_pid ? top->InfoForProcess(builder, pid_)
357 : flatbuffers::Offset<util::ProcessInfo>();
358
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800359 aos::starter::ApplicationStatus::Builder status_builder(*builder);
360 status_builder.add_name(name_fbs);
361 status_builder.add_state(status_);
James Kuszmauld42edb42022-01-07 18:00:16 -0800362 if (exit_code_.has_value()) {
363 status_builder.add_last_exit_code(exit_code_.value());
364 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800365 status_builder.add_last_stop_reason(stop_reason_);
366 if (pid_ != -1) {
367 status_builder.add_pid(pid_);
368 status_builder.add_id(id_);
369 }
James Kuszmaul6295a642022-03-22 15:23:59 -0700370 // Note that even if process_info is null, calling add_process_info is fine.
371 status_builder.add_process_info(process_info);
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800372 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
373 return status_builder.Finish();
374}
375
376void Application::Terminate() {
377 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
378 DoStop(false);
379 terminating_ = true;
380}
381
382void Application::HandleCommand(aos::starter::Command cmd) {
383 switch (cmd) {
384 case aos::starter::Command::START: {
385 switch (status_) {
386 case aos::starter::State::WAITING: {
387 restart_timer_->Disable();
388 DoStart();
389 break;
390 }
391 case aos::starter::State::STARTING: {
392 break;
393 }
394 case aos::starter::State::RUNNING: {
395 break;
396 }
397 case aos::starter::State::STOPPING: {
398 queue_restart_ = true;
399 break;
400 }
401 case aos::starter::State::STOPPED: {
402 status_ = aos::starter::State::WAITING;
403 DoStart();
404 break;
405 }
406 }
407 break;
408 }
409 case aos::starter::Command::STOP: {
410 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
411 DoStop(false);
412 break;
413 }
414 case aos::starter::Command::RESTART: {
415 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
416 DoStop(true);
417 break;
418 }
419 }
420}
421
422bool Application::MaybeHandleSignal() {
423 int status;
424
425 // Check if the status of this process has changed
426 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
427 return false;
428 }
429
430 // Check that the event was the process exiting
431 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
432 return false;
433 }
434
James Kuszmauld42edb42022-01-07 18:00:16 -0800435 start_timer_->Disable();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800436 exit_time_ = event_loop_->monotonic_now();
437 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
438
James Kuszmauld42edb42022-01-07 18:00:16 -0800439 if (auto read_result = status_pipes_.read->Read()) {
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800440 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
441 }
442
443 switch (status_) {
444 case aos::starter::State::STARTING: {
James Kuszmauld42edb42022-01-07 18:00:16 -0800445 if (exit_code_.value() == 0) {
446 LOG(INFO) << "Application '" << name_ << "' pid " << pid_
447 << " exited with status " << exit_code_.value();
448 } else {
449 LOG(WARNING) << "Failed to start '" << name_ << "' on pid " << pid_
450 << " : Exited with status " << exit_code_.value();
451 }
James Kuszmaul6f10b382022-03-11 22:31:38 -0800452 if (autorestart()) {
453 QueueStart();
454 } else {
455 status_ = aos::starter::State::STOPPED;
456 on_change_();
457 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800458 break;
459 }
460 case aos::starter::State::RUNNING: {
James Kuszmauld42edb42022-01-07 18:00:16 -0800461 if (exit_code_.value() == 0) {
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800462 LOG(INFO) << "Application '" << name_ << "' pid " << pid_
James Kuszmauld42edb42022-01-07 18:00:16 -0800463 << " exited with status " << exit_code_.value();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800464 } else {
465 LOG(WARNING) << "Application '" << name_ << "' pid " << pid_
James Kuszmauld42edb42022-01-07 18:00:16 -0800466 << " exited unexpectedly with status "
467 << exit_code_.value();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800468 }
James Kuszmaul6f10b382022-03-11 22:31:38 -0800469 if (autorestart()) {
470 QueueStart();
471 } else {
472 status_ = aos::starter::State::STOPPED;
473 on_change_();
474 }
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800475 break;
476 }
477 case aos::starter::State::STOPPING: {
478 LOG(INFO) << "Successfully stopped '" << name_ << "' pid: " << pid_
James Kuszmauld42edb42022-01-07 18:00:16 -0800479 << " with status " << exit_code_.value();
James Kuszmaul3224b8e2022-01-07 19:00:39 -0800480 status_ = aos::starter::State::STOPPED;
481
482 // Disable force stop timer since the process already died
483 stop_timer_->Disable();
484
485 on_change_();
486 if (terminating_) {
487 return true;
488 }
489
490 if (queue_restart_) {
491 queue_restart_ = false;
492 status_ = aos::starter::State::WAITING;
493 DoStart();
494 }
495 break;
496 }
497 case aos::starter::State::WAITING:
498 case aos::starter::State::STOPPED: {
499 LOG(FATAL)
500 << "Received signal on process that was already stopped : name: '"
501 << name_ << "' pid: " << pid_;
502 break;
503 }
504 }
505
506 return false;
507}
508
509} // namespace aos::starter