blob: 19051bbdf9ab2ea8a34f72e24a39350624c9eb9d [file] [log] [blame]
#include "starterd_lib.h"

#include <fcntl.h>
#include <pwd.h>
#include <sys/fsuid.h>
#include <sys/prctl.h>
#include <unistd.h>

#include <algorithm>
#include <cerrno>
#include <optional>
#include <utility>
#include <vector>

#include "glog/logging.h"
#include "glog/stl_logging.h"
13
14namespace aos {
15namespace starter {
16
17Application::Application(const aos::Application *application,
Austin Schuhfc304942021-10-16 14:20:05 -070018 aos::ShmEventLoop *event_loop,
19 std::function<void()> on_change)
Tyler Chatowa79419d2020-08-12 20:12:11 -070020 : name_(application->name()->string_view()),
21 path_(application->has_executable_name()
22 ? application->executable_name()->string_view()
23 : application->name()->string_view()),
Tyler Chatow2acff482020-12-19 22:29:04 -080024 args_(1),
Tyler Chatowa79419d2020-08-12 20:12:11 -070025 user_(application->has_user() ? FindUid(application->user()->c_str())
26 : std::nullopt),
Austin Schuh529ac592021-10-14 16:11:13 -070027 group_(application->has_user()
28 ? FindPrimaryGidForUser(application->user()->c_str())
29 : std::nullopt),
Austin Schuh5f79a5a2021-10-12 17:46:50 -070030 autostart_(application->autostart()),
Tyler Chatowa79419d2020-08-12 20:12:11 -070031 event_loop_(event_loop),
32 start_timer_(event_loop_->AddTimer([this] {
33 status_ = aos::starter::State::RUNNING;
Austin Schuh3204b332021-10-16 14:20:10 -070034 LOG(INFO) << "Started '" << name_ << "' pid: " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -070035 })),
36 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
37 stop_timer_(event_loop_->AddTimer([this] {
38 if (kill(pid_, SIGKILL) == 0) {
Austin Schuh3204b332021-10-16 14:20:10 -070039 LOG(WARNING) << "Failed to stop, sending SIGKILL to '" << name_
40 << "' pid: " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -070041 }
Austin Schuhfc304942021-10-16 14:20:05 -070042 })),
43 on_change_(on_change) {}
Tyler Chatowa79419d2020-08-12 20:12:11 -070044
45void Application::DoStart() {
46 if (status_ != aos::starter::State::WAITING) {
47 return;
48 }
49
50 start_timer_->Disable();
51 restart_timer_->Disable();
52
Tyler Chatowa79419d2020-08-12 20:12:11 -070053 std::tie(read_pipe_, write_pipe_) = ScopedPipe::MakePipe();
54
55 const pid_t pid = fork();
56
57 if (pid != 0) {
58 if (pid == -1) {
Austin Schuh3204b332021-10-16 14:20:10 -070059 PLOG(WARNING) << "Failed to fork '" << name_ << "'";
Tyler Chatowa79419d2020-08-12 20:12:11 -070060 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
61 status_ = aos::starter::State::STOPPED;
62 } else {
63 pid_ = pid;
64 id_ = next_id_++;
65 start_time_ = event_loop_->monotonic_now();
66 status_ = aos::starter::State::STARTING;
Austin Schuh3204b332021-10-16 14:20:10 -070067 LOG(INFO) << "Starting '" << name_ << "' pid " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -070068
69 // Setup timer which moves application to RUNNING state if it is still
70 // alive in 1 second.
71 start_timer_->Setup(event_loop_->monotonic_now() +
72 std::chrono::seconds(1));
73 }
Austin Schuhfc304942021-10-16 14:20:05 -070074 on_change_();
Tyler Chatowa79419d2020-08-12 20:12:11 -070075 return;
76 }
77
78 // Clear out signal mask of parent so forked process receives all signals
79 // normally.
80 sigset_t empty_mask;
81 sigemptyset(&empty_mask);
82 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
83
84 // Cleanup children if starter dies in a way that is not handled gracefully.
85 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
86 write_pipe_.Write(
87 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
88 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
89 }
90
Austin Schuh529ac592021-10-14 16:11:13 -070091 if (group_) {
92 if (setgid(*group_) == -1) {
93 write_pipe_.Write(
94 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
95 PLOG(FATAL) << "Could not set group for " << name_ << " to " << *group_;
96 }
97 }
98
Tyler Chatowa79419d2020-08-12 20:12:11 -070099 if (user_) {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -0800100 if (setuid(*user_) == -1) {
Tyler Chatowa79419d2020-08-12 20:12:11 -0700101 write_pipe_.Write(
102 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
103 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
104 }
105 }
106
107 // argv[0] should be the program name
108 args_.insert(args_.begin(), path_.data());
109
Austin Schuh529ac592021-10-14 16:11:13 -0700110 execvp(path_.c_str(), args_.data());
Tyler Chatowa79419d2020-08-12 20:12:11 -0700111
112 // If we got here, something went wrong
113 write_pipe_.Write(
114 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
115 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
116
117 _exit(EXIT_FAILURE);
118}
119
120void Application::DoStop(bool restart) {
121 // If stop or restart received, the old state of these is no longer applicable
122 // so cancel both.
123 restart_timer_->Disable();
124 start_timer_->Disable();
125
126 switch (status_) {
127 case aos::starter::State::STARTING:
128 case aos::starter::State::RUNNING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700129 LOG(INFO) << "Stopping '" << name_ << "' pid: " << pid_ << " with signal "
130 << SIGINT;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700131 status_ = aos::starter::State::STOPPING;
132
133 kill(pid_, SIGINT);
134
135 // Watchdog timer to SIGKILL application if it is still running 1 second
136 // after SIGINT
137 stop_timer_->Setup(event_loop_->monotonic_now() +
138 std::chrono::seconds(1));
139 queue_restart_ = restart;
Austin Schuhfc304942021-10-16 14:20:05 -0700140 on_change_();
Tyler Chatowa79419d2020-08-12 20:12:11 -0700141 break;
142 }
143 case aos::starter::State::WAITING: {
144 // If waiting to restart, and receives restart, skip the waiting period
145 // and restart immediately. If stop received, all we have to do is move
146 // to the STOPPED state.
147 if (restart) {
148 DoStart();
149 } else {
150 status_ = aos::starter::State::STOPPED;
Austin Schuhfc304942021-10-16 14:20:05 -0700151 on_change_();
Tyler Chatowa79419d2020-08-12 20:12:11 -0700152 }
153 break;
154 }
155 case aos::starter::State::STOPPING: {
156 // If the application is already stopping, then we just need to update the
157 // restart flag to the most recent status.
158 queue_restart_ = restart;
159 break;
160 }
161 case aos::starter::State::STOPPED: {
162 // Restart immediately if the application is already stopped
163 if (restart) {
164 status_ = aos::starter::State::WAITING;
165 DoStart();
166 }
167 break;
168 }
169 }
170}
171
172void Application::QueueStart() {
173 status_ = aos::starter::State::WAITING;
174
Austin Schuha07b3ce2021-10-10 12:33:21 -0700175 LOG(INFO) << "Restarting " << name_ << " in 3 seconds";
176 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(3));
Tyler Chatowa79419d2020-08-12 20:12:11 -0700177 start_timer_->Disable();
178 stop_timer_->Disable();
Austin Schuhfc304942021-10-16 14:20:05 -0700179 on_change_();
Tyler Chatowa79419d2020-08-12 20:12:11 -0700180}
181
182void Application::set_args(
183 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
184 args_.clear();
185 std::transform(v.begin(), v.end(), std::back_inserter(args_),
186 [](const flatbuffers::String *str) {
187 return const_cast<char *>(str->c_str());
188 });
189 args_.push_back(nullptr);
190}
191
192std::optional<uid_t> Application::FindUid(const char *name) {
Austin Schuh529ac592021-10-14 16:11:13 -0700193 // TODO(austin): Use the reentrant version. This should be safe.
Tyler Chatowa79419d2020-08-12 20:12:11 -0700194 struct passwd *user_data = getpwnam(name);
195 if (user_data != nullptr) {
196 return user_data->pw_uid;
197 } else {
198 LOG(FATAL) << "Could not find user " << name;
199 return std::nullopt;
200 }
201}
202
Austin Schuh529ac592021-10-14 16:11:13 -0700203std::optional<gid_t> Application::FindPrimaryGidForUser(const char *name) {
204 // TODO(austin): Use the reentrant version. This should be safe.
205 struct passwd *user_data = getpwnam(name);
206 if (user_data != nullptr) {
207 return user_data->pw_gid;
208 } else {
209 LOG(FATAL) << "Could not find user " << name;
210 return std::nullopt;
211 }
212}
213
Tyler Chatowa79419d2020-08-12 20:12:11 -0700214flatbuffers::Offset<aos::starter::ApplicationStatus>
215Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
216 CHECK_NOTNULL(builder);
217 auto name_fbs = builder->CreateString(name_);
218
219 aos::starter::ApplicationStatus::Builder status_builder(*builder);
220 status_builder.add_name(name_fbs);
221 status_builder.add_state(status_);
222 status_builder.add_last_exit_code(exit_code_);
223 status_builder.add_last_stop_reason(stop_reason_);
224 if (pid_ != -1) {
225 status_builder.add_pid(pid_);
226 status_builder.add_id(id_);
227 }
228 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
229 return status_builder.Finish();
230}
231
232void Application::Terminate() {
233 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
234 DoStop(false);
235 terminating_ = true;
236}
237
238void Application::HandleCommand(aos::starter::Command cmd) {
239 switch (cmd) {
240 case aos::starter::Command::START: {
241 switch (status_) {
242 case aos::starter::State::WAITING: {
243 restart_timer_->Disable();
244 DoStart();
245 break;
246 }
247 case aos::starter::State::STARTING: {
248 break;
249 }
250 case aos::starter::State::RUNNING: {
251 break;
252 }
253 case aos::starter::State::STOPPING: {
254 queue_restart_ = true;
255 break;
256 }
257 case aos::starter::State::STOPPED: {
258 status_ = aos::starter::State::WAITING;
259 DoStart();
260 break;
261 }
262 }
263 break;
264 }
265 case aos::starter::Command::STOP: {
266 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
267 DoStop(false);
268 break;
269 }
270 case aos::starter::Command::RESTART: {
271 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
272 DoStop(true);
273 break;
274 }
275 }
276}
277
278bool Application::MaybeHandleSignal() {
279 int status;
280
281 // Check if the status of this process has changed
282 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
283 return false;
284 }
285
286 // Check that the event was the process exiting
287 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
288 return false;
289 }
290
291 exit_time_ = event_loop_->monotonic_now();
292 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
293
294 if (auto read_result = read_pipe_.Read()) {
295 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
296 }
297
298 switch (status_) {
299 case aos::starter::State::STARTING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700300 LOG(WARNING) << "Failed to start '" << name_ << "' on pid " << pid_
Tyler Chatowa79419d2020-08-12 20:12:11 -0700301 << " : Exited with status " << exit_code_;
302 QueueStart();
303 break;
304 }
305 case aos::starter::State::RUNNING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700306 LOG(WARNING) << "Application '" << name_ << "' pid " << pid_
307 << " exited unexpectedly with status " << exit_code_;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700308 QueueStart();
309 break;
310 }
311 case aos::starter::State::STOPPING: {
Austin Schuh3204b332021-10-16 14:20:10 -0700312 LOG(INFO) << "Successfully stopped '" << name_ << "' pid: " << pid_
313 << " with status " << exit_code_;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700314 status_ = aos::starter::State::STOPPED;
315
316 // Disable force stop timer since the process already died
317 stop_timer_->Disable();
318
Austin Schuhfc304942021-10-16 14:20:05 -0700319 on_change_();
Tyler Chatowa79419d2020-08-12 20:12:11 -0700320 if (terminating_) {
321 return true;
322 }
323
324 if (queue_restart_) {
325 queue_restart_ = false;
326 status_ = aos::starter::State::WAITING;
327 DoStart();
328 }
329 break;
330 }
331 case aos::starter::State::WAITING:
332 case aos::starter::State::STOPPED: {
333 LOG(FATAL)
Austin Schuh3204b332021-10-16 14:20:10 -0700334 << "Received signal on process that was already stopped : name: '"
335 << name_ << "' pid: " << pid_;
Tyler Chatowa79419d2020-08-12 20:12:11 -0700336 break;
337 }
338 }
339
340 return false;
341}
342
343ScopedPipe::ScopedPipe(int fd) : fd_(fd) {}
344
345ScopedPipe::~ScopedPipe() {
346 if (fd_ != -1) {
347 PCHECK(close(fd_) != -1);
348 }
349}
350
351ScopedPipe::ScopedPipe(ScopedPipe &&scoped_pipe) : fd_(scoped_pipe.fd_) {
352 scoped_pipe.fd_ = -1;
353}
354
355ScopedPipe &ScopedPipe::operator=(ScopedPipe &&scoped_pipe) {
356 if (fd_ != -1) {
357 PCHECK(close(fd_) != -1);
358 }
359 fd_ = scoped_pipe.fd_;
360 scoped_pipe.fd_ = -1;
361 return *this;
362}
363
364std::tuple<ScopedPipe::ScopedReadPipe, ScopedPipe::ScopedWritePipe>
365ScopedPipe::MakePipe() {
366 int fds[2];
367 PCHECK(pipe(fds) != -1);
368 PCHECK(fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK) != -1);
369 PCHECK(fcntl(fds[1], F_SETFL, fcntl(fds[1], F_GETFL) | O_NONBLOCK) != -1);
370 return {ScopedReadPipe(fds[0]), ScopedWritePipe(fds[1])};
371}
372
373std::optional<uint32_t> ScopedPipe::ScopedReadPipe::Read() {
374 uint32_t buf;
375 ssize_t result = read(fd(), &buf, sizeof(buf));
376 if (result == sizeof(buf)) {
377 return buf;
378 } else {
379 return std::nullopt;
380 }
381}
382
383void ScopedPipe::ScopedWritePipe::Write(uint32_t data) {
384 ssize_t result = write(fd(), &data, sizeof(data));
385 PCHECK(result != -1);
386 CHECK(result == sizeof(data));
387}
388
389SignalListener::SignalListener(aos::ShmEventLoop *loop,
390 std::function<void(signalfd_siginfo)> callback)
391 : loop_(loop),
392 callback_(std::move(callback)),
393 signalfd_({SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE,
394 SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {
395 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
396 signalfd_siginfo info = signalfd_.Read();
397
398 if (info.ssi_signo == 0) {
399 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
400 return;
401 }
402
403 callback_(info);
404 });
405}
406
407SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
408
409Starter::Starter(const aos::Configuration *event_loop_config)
410 : config_msg_(event_loop_config),
411 event_loop_(event_loop_config),
412 status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
Austin Schuhfc304942021-10-16 14:20:05 -0700413 status_timer_(event_loop_.AddTimer([this] {
414 SendStatus();
415 status_count_ = 0;
416 })),
Tyler Chatowa79419d2020-08-12 20:12:11 -0700417 cleanup_timer_(event_loop_.AddTimer([this] { event_loop_.Exit(); })),
Austin Schuhfc304942021-10-16 14:20:05 -0700418 max_status_count_(
419 event_loop_.GetChannel<aos::starter::Status>("/aos")->frequency() -
420 1),
Tyler Chatowa79419d2020-08-12 20:12:11 -0700421 listener_(&event_loop_,
422 [this](signalfd_siginfo signal) { OnSignal(signal); }) {
Tyler Chatowa79419d2020-08-12 20:12:11 -0700423 event_loop_.SkipAosLog();
424
425 event_loop_.OnRun([this] {
426 status_timer_->Setup(event_loop_.monotonic_now(),
Austin Schuhfc304942021-10-16 14:20:05 -0700427 std::chrono::milliseconds(1000));
Tyler Chatowa79419d2020-08-12 20:12:11 -0700428 });
429
430 event_loop_.MakeWatcher("/aos", [this](const aos::starter::StarterRpc &cmd) {
431 if (!cmd.has_command() || !cmd.has_name() || exiting_) {
432 return;
433 }
434 LOG(INFO) << "Received command "
435 << aos::starter::EnumNameCommand(cmd.command()) << ' '
436 << cmd.name()->string_view();
437
438 auto search = applications_.find(cmd.name()->str());
439 if (search != applications_.end()) {
440 // If an applicatione exists by the given name, dispatch the command
441 search->second.HandleCommand(cmd.command());
442 }
443 });
444
445 if (config_msg_->has_applications()) {
446 const flatbuffers::Vector<flatbuffers::Offset<aos::Application>>
447 *applications = config_msg_->applications();
Ravago Jones7e2dd322020-11-21 15:58:58 -0800448
449 if (aos::configuration::MultiNode(config_msg_)) {
450 std::string_view current_node = event_loop_.node()->name()->string_view();
451 for (const aos::Application *application : *applications) {
452 CHECK(application->has_nodes());
453 for (const flatbuffers::String *node : *application->nodes()) {
454 if (node->string_view() == current_node) {
455 AddApplication(application);
456 break;
457 }
458 }
459 }
460 } else {
461 for (const aos::Application *application : *applications) {
462 AddApplication(application);
463 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700464 }
465 }
466}
467
Austin Schuhfc304942021-10-16 14:20:05 -0700468void Starter::MaybeSendStatus() {
469 if (status_count_ < max_status_count_) {
470 SendStatus();
471 ++status_count_;
472 } else {
473 VLOG(1) << "That's enough " << status_count_ << " " << max_status_count_;
474 }
475}
476
Tyler Chatowa79419d2020-08-12 20:12:11 -0700477void Starter::Cleanup() {
478 if (exiting_) {
479 return;
480 }
481 exiting_ = true;
482 for (auto &application : applications_) {
483 application.second.Terminate();
484 }
485 cleanup_timer_->Setup(event_loop_.monotonic_now() +
486 std::chrono::milliseconds(1500));
487}
488
489void Starter::OnSignal(signalfd_siginfo info) {
Tyler Chatowa79419d2020-08-12 20:12:11 -0700490 if (info.ssi_signo == SIGCHLD) {
491 // SIGCHLD messages can be collapsed if multiple are received, so all
492 // applications must check their status.
493 for (auto iter = applications_.begin(); iter != applications_.end();) {
494 if (iter->second.MaybeHandleSignal()) {
495 iter = applications_.erase(iter);
496 } else {
497 ++iter;
498 }
499 }
500
501 if (exiting_ && applications_.empty()) {
502 event_loop_.Exit();
503 }
Austin Schuh3204b332021-10-16 14:20:10 -0700504 } else {
505 LOG(INFO) << "Received signal '" << strsignal(info.ssi_signo) << "'";
506
507 if (std::find(kStarterDeath.begin(), kStarterDeath.end(), info.ssi_signo) !=
508 kStarterDeath.end()) {
509 LOG(WARNING) << "Starter shutting down";
510 Cleanup();
511 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700512 }
513}
514
515Application *Starter::AddApplication(const aos::Application *application) {
Austin Schuhfc304942021-10-16 14:20:05 -0700516 auto [iter, success] =
517 applications_.try_emplace(application->name()->str(), application,
518 &event_loop_, [this]() { MaybeSendStatus(); });
Tyler Chatowa79419d2020-08-12 20:12:11 -0700519 if (success) {
520 if (application->has_args()) {
521 iter->second.set_args(*application->args());
522 }
523 return &(iter->second);
524 }
525 return nullptr;
526}
527
528void Starter::Run() {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -0800529#ifdef AOS_ARCHITECTURE_arm_frc
530 PCHECK(setuid(0) == 0) << "Failed to change user to root";
531#endif
532
Tyler Chatowa79419d2020-08-12 20:12:11 -0700533 for (auto &application : applications_) {
Austin Schuh5f79a5a2021-10-12 17:46:50 -0700534 if (application.second.autostart()) {
535 application.second.Start();
536 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700537 }
538
539 event_loop_.Run();
540}
541
542void Starter::SendStatus() {
543 aos::Sender<aos::starter::Status>::Builder builder =
544 status_sender_.MakeBuilder();
545
546 std::vector<flatbuffers::Offset<aos::starter::ApplicationStatus>> statuses;
547
548 for (auto &application : applications_) {
549 statuses.push_back(application.second.PopulateStatus(builder.fbb()));
550 }
551
552 auto statuses_fbs = builder.fbb()->CreateVector(statuses);
553
554 aos::starter::Status::Builder status_builder(*builder.fbb());
555 status_builder.add_statuses(statuses_fbs);
556 CHECK(builder.Send(status_builder.Finish()));
557}
558
559} // namespace starter
560} // namespace aos