blob: 9066a7817376517b1b728dc4ec768d4ebf03a76a [file] [log] [blame]
Tyler Chatowa79419d2020-08-12 20:12:11 -07001#include "starterd_lib.h"
2
3#include <fcntl.h>
4#include <pwd.h>
5#include <sys/fsuid.h>
6#include <sys/prctl.h>
7
8#include <algorithm>
9#include <utility>
10
11#include "glog/logging.h"
12#include "glog/stl_logging.h"
13
14namespace aos {
15namespace starter {
16
17Application::Application(const aos::Application *application,
18 aos::ShmEventLoop *event_loop)
19 : name_(application->name()->string_view()),
20 path_(application->has_executable_name()
21 ? application->executable_name()->string_view()
22 : application->name()->string_view()),
Tyler Chatow2acff482020-12-19 22:29:04 -080023 args_(1),
Tyler Chatowa79419d2020-08-12 20:12:11 -070024 user_(application->has_user() ? FindUid(application->user()->c_str())
25 : std::nullopt),
Austin Schuh529ac592021-10-14 16:11:13 -070026 group_(application->has_user()
27 ? FindPrimaryGidForUser(application->user()->c_str())
28 : std::nullopt),
Austin Schuh5f79a5a2021-10-12 17:46:50 -070029 autostart_(application->autostart()),
Tyler Chatowa79419d2020-08-12 20:12:11 -070030 event_loop_(event_loop),
31 start_timer_(event_loop_->AddTimer([this] {
32 status_ = aos::starter::State::RUNNING;
Austin Schuh3204b332021-10-16 14:20:10 -070033 LOG(INFO) << "Started '" << name_ << "' pid: " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -070034 })),
35 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
36 stop_timer_(event_loop_->AddTimer([this] {
37 if (kill(pid_, SIGKILL) == 0) {
Austin Schuh3204b332021-10-16 14:20:10 -070038 LOG(WARNING) << "Failed to stop, sending SIGKILL to '" << name_
39 << "' pid: " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -070040 }
Austin Schuh5f79a5a2021-10-12 17:46:50 -070041 })) {}
Tyler Chatowa79419d2020-08-12 20:12:11 -070042
43void Application::DoStart() {
44 if (status_ != aos::starter::State::WAITING) {
45 return;
46 }
47
48 start_timer_->Disable();
49 restart_timer_->Disable();
50
Tyler Chatowa79419d2020-08-12 20:12:11 -070051 std::tie(read_pipe_, write_pipe_) = ScopedPipe::MakePipe();
52
53 const pid_t pid = fork();
54
55 if (pid != 0) {
56 if (pid == -1) {
Austin Schuh3204b332021-10-16 14:20:10 -070057 PLOG(WARNING) << "Failed to fork '" << name_ << "'";
Tyler Chatowa79419d2020-08-12 20:12:11 -070058 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
59 status_ = aos::starter::State::STOPPED;
60 } else {
61 pid_ = pid;
62 id_ = next_id_++;
63 start_time_ = event_loop_->monotonic_now();
64 status_ = aos::starter::State::STARTING;
Austin Schuh3204b332021-10-16 14:20:10 -070065 LOG(INFO) << "Starting '" << name_ << "' pid " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -070066
67 // Setup timer which moves application to RUNNING state if it is still
68 // alive in 1 second.
69 start_timer_->Setup(event_loop_->monotonic_now() +
70 std::chrono::seconds(1));
71 }
72 return;
73 }
74
75 // Clear out signal mask of parent so forked process receives all signals
76 // normally.
77 sigset_t empty_mask;
78 sigemptyset(&empty_mask);
79 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
80
81 // Cleanup children if starter dies in a way that is not handled gracefully.
82 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
83 write_pipe_.Write(
84 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
85 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
86 }
87
Austin Schuh529ac592021-10-14 16:11:13 -070088 if (group_) {
89 if (setgid(*group_) == -1) {
90 write_pipe_.Write(
91 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
92 PLOG(FATAL) << "Could not set group for " << name_ << " to " << *group_;
93 }
94 }
95
Tyler Chatowa79419d2020-08-12 20:12:11 -070096 if (user_) {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -080097 if (setuid(*user_) == -1) {
Tyler Chatowa79419d2020-08-12 20:12:11 -070098 write_pipe_.Write(
99 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
100 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
101 }
102 }
103
104 // argv[0] should be the program name
105 args_.insert(args_.begin(), path_.data());
106
Austin Schuh529ac592021-10-14 16:11:13 -0700107 execvp(path_.c_str(), args_.data());
Tyler Chatowa79419d2020-08-12 20:12:11 -0700108
109 // If we got here, something went wrong
110 write_pipe_.Write(
111 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
112 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
113
114 _exit(EXIT_FAILURE);
115}
116
117void Application::DoStop(bool restart) {
118 // If stop or restart received, the old state of these is no longer applicable
119 // so cancel both.
120 restart_timer_->Disable();
121 start_timer_->Disable();
122
123 switch (status_) {
124 case aos::starter::State::STARTING:
125 case aos::starter::State::RUNNING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700126 LOG(INFO) << "Stopping '" << name_ << "' pid: " << pid_ << " with signal "
127 << SIGINT;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700128 status_ = aos::starter::State::STOPPING;
129
130 kill(pid_, SIGINT);
131
132 // Watchdog timer to SIGKILL application if it is still running 1 second
133 // after SIGINT
134 stop_timer_->Setup(event_loop_->monotonic_now() +
135 std::chrono::seconds(1));
136 queue_restart_ = restart;
137 break;
138 }
139 case aos::starter::State::WAITING: {
140 // If waiting to restart, and receives restart, skip the waiting period
141 // and restart immediately. If stop received, all we have to do is move
142 // to the STOPPED state.
143 if (restart) {
144 DoStart();
145 } else {
146 status_ = aos::starter::State::STOPPED;
147 }
148 break;
149 }
150 case aos::starter::State::STOPPING: {
151 // If the application is already stopping, then we just need to update the
152 // restart flag to the most recent status.
153 queue_restart_ = restart;
154 break;
155 }
156 case aos::starter::State::STOPPED: {
157 // Restart immediately if the application is already stopped
158 if (restart) {
159 status_ = aos::starter::State::WAITING;
160 DoStart();
161 }
162 break;
163 }
164 }
165}
166
167void Application::QueueStart() {
168 status_ = aos::starter::State::WAITING;
169
Austin Schuha07b3ce2021-10-10 12:33:21 -0700170 LOG(INFO) << "Restarting " << name_ << " in 3 seconds";
171 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(3));
Tyler Chatowa79419d2020-08-12 20:12:11 -0700172 start_timer_->Disable();
173 stop_timer_->Disable();
174}
175
176void Application::set_args(
177 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
178 args_.clear();
179 std::transform(v.begin(), v.end(), std::back_inserter(args_),
180 [](const flatbuffers::String *str) {
181 return const_cast<char *>(str->c_str());
182 });
183 args_.push_back(nullptr);
184}
185
186std::optional<uid_t> Application::FindUid(const char *name) {
Austin Schuh529ac592021-10-14 16:11:13 -0700187 // TODO(austin): Use the reentrant version. This should be safe.
Tyler Chatowa79419d2020-08-12 20:12:11 -0700188 struct passwd *user_data = getpwnam(name);
189 if (user_data != nullptr) {
190 return user_data->pw_uid;
191 } else {
192 LOG(FATAL) << "Could not find user " << name;
193 return std::nullopt;
194 }
195}
196
Austin Schuh529ac592021-10-14 16:11:13 -0700197std::optional<gid_t> Application::FindPrimaryGidForUser(const char *name) {
198 // TODO(austin): Use the reentrant version. This should be safe.
199 struct passwd *user_data = getpwnam(name);
200 if (user_data != nullptr) {
201 return user_data->pw_gid;
202 } else {
203 LOG(FATAL) << "Could not find user " << name;
204 return std::nullopt;
205 }
206}
207
Tyler Chatowa79419d2020-08-12 20:12:11 -0700208flatbuffers::Offset<aos::starter::ApplicationStatus>
209Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
210 CHECK_NOTNULL(builder);
211 auto name_fbs = builder->CreateString(name_);
212
213 aos::starter::ApplicationStatus::Builder status_builder(*builder);
214 status_builder.add_name(name_fbs);
215 status_builder.add_state(status_);
216 status_builder.add_last_exit_code(exit_code_);
217 status_builder.add_last_stop_reason(stop_reason_);
218 if (pid_ != -1) {
219 status_builder.add_pid(pid_);
220 status_builder.add_id(id_);
221 }
222 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
223 return status_builder.Finish();
224}
225
226void Application::Terminate() {
227 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
228 DoStop(false);
229 terminating_ = true;
230}
231
232void Application::HandleCommand(aos::starter::Command cmd) {
233 switch (cmd) {
234 case aos::starter::Command::START: {
235 switch (status_) {
236 case aos::starter::State::WAITING: {
237 restart_timer_->Disable();
238 DoStart();
239 break;
240 }
241 case aos::starter::State::STARTING: {
242 break;
243 }
244 case aos::starter::State::RUNNING: {
245 break;
246 }
247 case aos::starter::State::STOPPING: {
248 queue_restart_ = true;
249 break;
250 }
251 case aos::starter::State::STOPPED: {
252 status_ = aos::starter::State::WAITING;
253 DoStart();
254 break;
255 }
256 }
257 break;
258 }
259 case aos::starter::Command::STOP: {
260 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
261 DoStop(false);
262 break;
263 }
264 case aos::starter::Command::RESTART: {
265 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
266 DoStop(true);
267 break;
268 }
269 }
270}
271
272bool Application::MaybeHandleSignal() {
273 int status;
274
275 // Check if the status of this process has changed
276 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
277 return false;
278 }
279
280 // Check that the event was the process exiting
281 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
282 return false;
283 }
284
285 exit_time_ = event_loop_->monotonic_now();
286 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
287
288 if (auto read_result = read_pipe_.Read()) {
289 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
290 }
291
292 switch (status_) {
293 case aos::starter::State::STARTING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700294 LOG(WARNING) << "Failed to start '" << name_ << "' on pid " << pid_
Tyler Chatowa79419d2020-08-12 20:12:11 -0700295 << " : Exited with status " << exit_code_;
296 QueueStart();
297 break;
298 }
299 case aos::starter::State::RUNNING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700300 LOG(WARNING) << "Application '" << name_ << "' pid " << pid_
301 << " exited unexpectedly with status " << exit_code_;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700302 QueueStart();
303 break;
304 }
305 case aos::starter::State::STOPPING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700306 LOG(INFO) << "Successfully stopped '" << name_ << "' pid: " << pid_
307 << " with status " << exit_code_;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700308 status_ = aos::starter::State::STOPPED;
309
310 // Disable force stop timer since the process already died
311 stop_timer_->Disable();
312
313 if (terminating_) {
314 return true;
315 }
316
317 if (queue_restart_) {
318 queue_restart_ = false;
319 status_ = aos::starter::State::WAITING;
320 DoStart();
321 }
322 break;
323 }
324 case aos::starter::State::WAITING:
325 case aos::starter::State::STOPPED: {
326 LOG(FATAL)
Austin Schuh3204b332021-10-16 14:20:10 -0700327 << "Received signal on process that was already stopped : name: '"
328 << name_ << "' pid: " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700329 break;
330 }
331 }
332
333 return false;
334}
335
336ScopedPipe::ScopedPipe(int fd) : fd_(fd) {}
337
338ScopedPipe::~ScopedPipe() {
339 if (fd_ != -1) {
340 PCHECK(close(fd_) != -1);
341 }
342}
343
344ScopedPipe::ScopedPipe(ScopedPipe &&scoped_pipe) : fd_(scoped_pipe.fd_) {
345 scoped_pipe.fd_ = -1;
346}
347
348ScopedPipe &ScopedPipe::operator=(ScopedPipe &&scoped_pipe) {
349 if (fd_ != -1) {
350 PCHECK(close(fd_) != -1);
351 }
352 fd_ = scoped_pipe.fd_;
353 scoped_pipe.fd_ = -1;
354 return *this;
355}
356
357std::tuple<ScopedPipe::ScopedReadPipe, ScopedPipe::ScopedWritePipe>
358ScopedPipe::MakePipe() {
359 int fds[2];
360 PCHECK(pipe(fds) != -1);
361 PCHECK(fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK) != -1);
362 PCHECK(fcntl(fds[1], F_SETFL, fcntl(fds[1], F_GETFL) | O_NONBLOCK) != -1);
363 return {ScopedReadPipe(fds[0]), ScopedWritePipe(fds[1])};
364}
365
366std::optional<uint32_t> ScopedPipe::ScopedReadPipe::Read() {
367 uint32_t buf;
368 ssize_t result = read(fd(), &buf, sizeof(buf));
369 if (result == sizeof(buf)) {
370 return buf;
371 } else {
372 return std::nullopt;
373 }
374}
375
376void ScopedPipe::ScopedWritePipe::Write(uint32_t data) {
377 ssize_t result = write(fd(), &data, sizeof(data));
378 PCHECK(result != -1);
379 CHECK(result == sizeof(data));
380}
381
382SignalListener::SignalListener(aos::ShmEventLoop *loop,
383 std::function<void(signalfd_siginfo)> callback)
384 : loop_(loop),
385 callback_(std::move(callback)),
386 signalfd_({SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE,
387 SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {
388 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
389 signalfd_siginfo info = signalfd_.Read();
390
391 if (info.ssi_signo == 0) {
392 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
393 return;
394 }
395
396 callback_(info);
397 });
398}
399
400SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
401
402Starter::Starter(const aos::Configuration *event_loop_config)
403 : config_msg_(event_loop_config),
404 event_loop_(event_loop_config),
405 status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
406 status_timer_(event_loop_.AddTimer([this] { SendStatus(); })),
407 cleanup_timer_(event_loop_.AddTimer([this] { event_loop_.Exit(); })),
408 listener_(&event_loop_,
409 [this](signalfd_siginfo signal) { OnSignal(signal); }) {
Tyler Chatowa79419d2020-08-12 20:12:11 -0700410 event_loop_.SkipAosLog();
411
412 event_loop_.OnRun([this] {
413 status_timer_->Setup(event_loop_.monotonic_now(),
414 std::chrono::milliseconds(500));
415 });
416
417 event_loop_.MakeWatcher("/aos", [this](const aos::starter::StarterRpc &cmd) {
418 if (!cmd.has_command() || !cmd.has_name() || exiting_) {
419 return;
420 }
421 LOG(INFO) << "Received command "
422 << aos::starter::EnumNameCommand(cmd.command()) << ' '
423 << cmd.name()->string_view();
424
425 auto search = applications_.find(cmd.name()->str());
426 if (search != applications_.end()) {
427 // If an applicatione exists by the given name, dispatch the command
428 search->second.HandleCommand(cmd.command());
429 }
430 });
431
432 if (config_msg_->has_applications()) {
433 const flatbuffers::Vector<flatbuffers::Offset<aos::Application>>
434 *applications = config_msg_->applications();
Ravago Jones7e2dd322020-11-21 15:58:58 -0800435
436 if (aos::configuration::MultiNode(config_msg_)) {
437 std::string_view current_node = event_loop_.node()->name()->string_view();
438 for (const aos::Application *application : *applications) {
439 CHECK(application->has_nodes());
440 for (const flatbuffers::String *node : *application->nodes()) {
441 if (node->string_view() == current_node) {
442 AddApplication(application);
443 break;
444 }
445 }
446 }
447 } else {
448 for (const aos::Application *application : *applications) {
449 AddApplication(application);
450 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700451 }
452 }
453}
454
455void Starter::Cleanup() {
456 if (exiting_) {
457 return;
458 }
459 exiting_ = true;
460 for (auto &application : applications_) {
461 application.second.Terminate();
462 }
463 cleanup_timer_->Setup(event_loop_.monotonic_now() +
464 std::chrono::milliseconds(1500));
465}
466
467void Starter::OnSignal(signalfd_siginfo info) {
Tyler Chatowa79419d2020-08-12 20:12:11 -0700468 if (info.ssi_signo == SIGCHLD) {
469 // SIGCHLD messages can be collapsed if multiple are received, so all
470 // applications must check their status.
471 for (auto iter = applications_.begin(); iter != applications_.end();) {
472 if (iter->second.MaybeHandleSignal()) {
473 iter = applications_.erase(iter);
474 } else {
475 ++iter;
476 }
477 }
478
479 if (exiting_ && applications_.empty()) {
480 event_loop_.Exit();
481 }
Austin Schuh3204b332021-10-16 14:20:10 -0700482 } else {
483 LOG(INFO) << "Received signal '" << strsignal(info.ssi_signo) << "'";
484
485 if (std::find(kStarterDeath.begin(), kStarterDeath.end(), info.ssi_signo) !=
486 kStarterDeath.end()) {
487 LOG(WARNING) << "Starter shutting down";
488 Cleanup();
489 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700490 }
491}
492
493Application *Starter::AddApplication(const aos::Application *application) {
494 auto [iter, success] = applications_.try_emplace(application->name()->str(),
495 application, &event_loop_);
496 if (success) {
497 if (application->has_args()) {
498 iter->second.set_args(*application->args());
499 }
500 return &(iter->second);
501 }
502 return nullptr;
503}
504
505void Starter::Run() {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -0800506#ifdef AOS_ARCHITECTURE_arm_frc
507 PCHECK(setuid(0) == 0) << "Failed to change user to root";
508#endif
509
Tyler Chatowa79419d2020-08-12 20:12:11 -0700510 for (auto &application : applications_) {
Austin Schuh5f79a5a2021-10-12 17:46:50 -0700511 if (application.second.autostart()) {
512 application.second.Start();
513 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700514 }
515
516 event_loop_.Run();
517}
518
519void Starter::SendStatus() {
520 aos::Sender<aos::starter::Status>::Builder builder =
521 status_sender_.MakeBuilder();
522
523 std::vector<flatbuffers::Offset<aos::starter::ApplicationStatus>> statuses;
524
525 for (auto &application : applications_) {
526 statuses.push_back(application.second.PopulateStatus(builder.fbb()));
527 }
528
529 auto statuses_fbs = builder.fbb()->CreateVector(statuses);
530
531 aos::starter::Status::Builder status_builder(*builder.fbb());
532 status_builder.add_statuses(statuses_fbs);
533 CHECK(builder.Send(status_builder.Finish()));
534}
535
536} // namespace starter
537} // namespace aos