blob: 8193d0f7e03b8205ca0312d611752b67006dfdc2 [file] [log] [blame]
#include "starterd_lib.h"

#include <fcntl.h>
#include <pwd.h>
#include <sys/fsuid.h>
#include <sys/prctl.h>

#include <algorithm>
#include <cerrno>
#include <utility>

#include "glog/logging.h"
#include "glog/stl_logging.h"
13
14namespace aos {
15namespace starter {
16
17Application::Application(const aos::Application *application,
18 aos::ShmEventLoop *event_loop)
19 : name_(application->name()->string_view()),
20 path_(application->has_executable_name()
21 ? application->executable_name()->string_view()
22 : application->name()->string_view()),
Tyler Chatow2acff482020-12-19 22:29:04 -080023 args_(1),
Tyler Chatowa79419d2020-08-12 20:12:11 -070024 user_(application->has_user() ? FindUid(application->user()->c_str())
25 : std::nullopt),
Austin Schuh529ac592021-10-14 16:11:13 -070026 group_(application->has_user()
27 ? FindPrimaryGidForUser(application->user()->c_str())
28 : std::nullopt),
Austin Schuh5f79a5a2021-10-12 17:46:50 -070029 autostart_(application->autostart()),
Tyler Chatowa79419d2020-08-12 20:12:11 -070030 event_loop_(event_loop),
31 start_timer_(event_loop_->AddTimer([this] {
32 status_ = aos::starter::State::RUNNING;
33 LOG(INFO) << "Started " << name_;
34 })),
35 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
36 stop_timer_(event_loop_->AddTimer([this] {
37 if (kill(pid_, SIGKILL) == 0) {
38 LOG(WARNING) << "Sent SIGKILL to " << name_ << " pid: " << pid_;
39 }
Austin Schuh5f79a5a2021-10-12 17:46:50 -070040 })) {}
Tyler Chatowa79419d2020-08-12 20:12:11 -070041
42void Application::DoStart() {
43 if (status_ != aos::starter::State::WAITING) {
44 return;
45 }
46
47 start_timer_->Disable();
48 restart_timer_->Disable();
49
50 LOG(INFO) << "Starting " << name_;
51
52 std::tie(read_pipe_, write_pipe_) = ScopedPipe::MakePipe();
53
54 const pid_t pid = fork();
55
56 if (pid != 0) {
57 if (pid == -1) {
58 PLOG(WARNING) << "Failed to fork";
59 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
60 status_ = aos::starter::State::STOPPED;
61 } else {
62 pid_ = pid;
63 id_ = next_id_++;
64 start_time_ = event_loop_->monotonic_now();
65 status_ = aos::starter::State::STARTING;
66
67 // Setup timer which moves application to RUNNING state if it is still
68 // alive in 1 second.
69 start_timer_->Setup(event_loop_->monotonic_now() +
70 std::chrono::seconds(1));
71 }
72 return;
73 }
74
75 // Clear out signal mask of parent so forked process receives all signals
76 // normally.
77 sigset_t empty_mask;
78 sigemptyset(&empty_mask);
79 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
80
81 // Cleanup children if starter dies in a way that is not handled gracefully.
82 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
83 write_pipe_.Write(
84 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
85 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
86 }
87
Austin Schuh529ac592021-10-14 16:11:13 -070088 if (group_) {
89 if (setgid(*group_) == -1) {
90 write_pipe_.Write(
91 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
92 PLOG(FATAL) << "Could not set group for " << name_ << " to " << *group_;
93 }
94 }
95
Tyler Chatowa79419d2020-08-12 20:12:11 -070096 if (user_) {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -080097 if (setuid(*user_) == -1) {
Tyler Chatowa79419d2020-08-12 20:12:11 -070098 write_pipe_.Write(
99 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
100 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
101 }
102 }
103
104 // argv[0] should be the program name
105 args_.insert(args_.begin(), path_.data());
106
Austin Schuh529ac592021-10-14 16:11:13 -0700107 execvp(path_.c_str(), args_.data());
Tyler Chatowa79419d2020-08-12 20:12:11 -0700108
109 // If we got here, something went wrong
110 write_pipe_.Write(
111 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
112 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
113
114 _exit(EXIT_FAILURE);
115}
116
117void Application::DoStop(bool restart) {
118 // If stop or restart received, the old state of these is no longer applicable
119 // so cancel both.
120 restart_timer_->Disable();
121 start_timer_->Disable();
122
123 switch (status_) {
124 case aos::starter::State::STARTING:
125 case aos::starter::State::RUNNING: {
126 LOG(INFO) << "Killing " << name_ << " pid: " << pid_;
127 status_ = aos::starter::State::STOPPING;
128
129 kill(pid_, SIGINT);
130
131 // Watchdog timer to SIGKILL application if it is still running 1 second
132 // after SIGINT
133 stop_timer_->Setup(event_loop_->monotonic_now() +
134 std::chrono::seconds(1));
135 queue_restart_ = restart;
136 break;
137 }
138 case aos::starter::State::WAITING: {
139 // If waiting to restart, and receives restart, skip the waiting period
140 // and restart immediately. If stop received, all we have to do is move
141 // to the STOPPED state.
142 if (restart) {
143 DoStart();
144 } else {
145 status_ = aos::starter::State::STOPPED;
146 }
147 break;
148 }
149 case aos::starter::State::STOPPING: {
150 // If the application is already stopping, then we just need to update the
151 // restart flag to the most recent status.
152 queue_restart_ = restart;
153 break;
154 }
155 case aos::starter::State::STOPPED: {
156 // Restart immediately if the application is already stopped
157 if (restart) {
158 status_ = aos::starter::State::WAITING;
159 DoStart();
160 }
161 break;
162 }
163 }
164}
165
166void Application::QueueStart() {
167 status_ = aos::starter::State::WAITING;
168
Austin Schuha07b3ce2021-10-10 12:33:21 -0700169 LOG(INFO) << "Restarting " << name_ << " in 3 seconds";
170 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(3));
Tyler Chatowa79419d2020-08-12 20:12:11 -0700171 start_timer_->Disable();
172 stop_timer_->Disable();
173}
174
175void Application::set_args(
176 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
177 args_.clear();
178 std::transform(v.begin(), v.end(), std::back_inserter(args_),
179 [](const flatbuffers::String *str) {
180 return const_cast<char *>(str->c_str());
181 });
182 args_.push_back(nullptr);
183}
184
185std::optional<uid_t> Application::FindUid(const char *name) {
Austin Schuh529ac592021-10-14 16:11:13 -0700186 // TODO(austin): Use the reentrant version. This should be safe.
Tyler Chatowa79419d2020-08-12 20:12:11 -0700187 struct passwd *user_data = getpwnam(name);
188 if (user_data != nullptr) {
189 return user_data->pw_uid;
190 } else {
191 LOG(FATAL) << "Could not find user " << name;
192 return std::nullopt;
193 }
194}
195
Austin Schuh529ac592021-10-14 16:11:13 -0700196std::optional<gid_t> Application::FindPrimaryGidForUser(const char *name) {
197 // TODO(austin): Use the reentrant version. This should be safe.
198 struct passwd *user_data = getpwnam(name);
199 if (user_data != nullptr) {
200 return user_data->pw_gid;
201 } else {
202 LOG(FATAL) << "Could not find user " << name;
203 return std::nullopt;
204 }
205}
206
Tyler Chatowa79419d2020-08-12 20:12:11 -0700207flatbuffers::Offset<aos::starter::ApplicationStatus>
208Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
209 CHECK_NOTNULL(builder);
210 auto name_fbs = builder->CreateString(name_);
211
212 aos::starter::ApplicationStatus::Builder status_builder(*builder);
213 status_builder.add_name(name_fbs);
214 status_builder.add_state(status_);
215 status_builder.add_last_exit_code(exit_code_);
216 status_builder.add_last_stop_reason(stop_reason_);
217 if (pid_ != -1) {
218 status_builder.add_pid(pid_);
219 status_builder.add_id(id_);
220 }
221 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
222 return status_builder.Finish();
223}
224
225void Application::Terminate() {
226 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
227 DoStop(false);
228 terminating_ = true;
229}
230
231void Application::HandleCommand(aos::starter::Command cmd) {
232 switch (cmd) {
233 case aos::starter::Command::START: {
234 switch (status_) {
235 case aos::starter::State::WAITING: {
236 restart_timer_->Disable();
237 DoStart();
238 break;
239 }
240 case aos::starter::State::STARTING: {
241 break;
242 }
243 case aos::starter::State::RUNNING: {
244 break;
245 }
246 case aos::starter::State::STOPPING: {
247 queue_restart_ = true;
248 break;
249 }
250 case aos::starter::State::STOPPED: {
251 status_ = aos::starter::State::WAITING;
252 DoStart();
253 break;
254 }
255 }
256 break;
257 }
258 case aos::starter::Command::STOP: {
259 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
260 DoStop(false);
261 break;
262 }
263 case aos::starter::Command::RESTART: {
264 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
265 DoStop(true);
266 break;
267 }
268 }
269}
270
271bool Application::MaybeHandleSignal() {
272 int status;
273
274 // Check if the status of this process has changed
275 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
276 return false;
277 }
278
279 // Check that the event was the process exiting
280 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
281 return false;
282 }
283
284 exit_time_ = event_loop_->monotonic_now();
285 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
286
287 if (auto read_result = read_pipe_.Read()) {
288 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
289 }
290
291 switch (status_) {
292 case aos::starter::State::STARTING: {
293 LOG(WARNING) << "Failed to start " << name_ << " on pid " << pid_
294 << " : Exited with status " << exit_code_;
295 QueueStart();
296 break;
297 }
298 case aos::starter::State::RUNNING: {
299 QueueStart();
300 break;
301 }
302 case aos::starter::State::STOPPING: {
303 LOG(INFO) << "Successfully stopped " << name_;
304 status_ = aos::starter::State::STOPPED;
305
306 // Disable force stop timer since the process already died
307 stop_timer_->Disable();
308
309 if (terminating_) {
310 return true;
311 }
312
313 if (queue_restart_) {
314 queue_restart_ = false;
315 status_ = aos::starter::State::WAITING;
316 DoStart();
317 }
318 break;
319 }
320 case aos::starter::State::WAITING:
321 case aos::starter::State::STOPPED: {
322 LOG(FATAL)
323 << "Received signal on process that was already stopped : name: "
324 << name_ << " pid: " << pid_;
325 break;
326 }
327 }
328
329 return false;
330}
331
332ScopedPipe::ScopedPipe(int fd) : fd_(fd) {}
333
334ScopedPipe::~ScopedPipe() {
335 if (fd_ != -1) {
336 PCHECK(close(fd_) != -1);
337 }
338}
339
340ScopedPipe::ScopedPipe(ScopedPipe &&scoped_pipe) : fd_(scoped_pipe.fd_) {
341 scoped_pipe.fd_ = -1;
342}
343
344ScopedPipe &ScopedPipe::operator=(ScopedPipe &&scoped_pipe) {
345 if (fd_ != -1) {
346 PCHECK(close(fd_) != -1);
347 }
348 fd_ = scoped_pipe.fd_;
349 scoped_pipe.fd_ = -1;
350 return *this;
351}
352
353std::tuple<ScopedPipe::ScopedReadPipe, ScopedPipe::ScopedWritePipe>
354ScopedPipe::MakePipe() {
355 int fds[2];
356 PCHECK(pipe(fds) != -1);
357 PCHECK(fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK) != -1);
358 PCHECK(fcntl(fds[1], F_SETFL, fcntl(fds[1], F_GETFL) | O_NONBLOCK) != -1);
359 return {ScopedReadPipe(fds[0]), ScopedWritePipe(fds[1])};
360}
361
362std::optional<uint32_t> ScopedPipe::ScopedReadPipe::Read() {
363 uint32_t buf;
364 ssize_t result = read(fd(), &buf, sizeof(buf));
365 if (result == sizeof(buf)) {
366 return buf;
367 } else {
368 return std::nullopt;
369 }
370}
371
372void ScopedPipe::ScopedWritePipe::Write(uint32_t data) {
373 ssize_t result = write(fd(), &data, sizeof(data));
374 PCHECK(result != -1);
375 CHECK(result == sizeof(data));
376}
377
378SignalListener::SignalListener(aos::ShmEventLoop *loop,
379 std::function<void(signalfd_siginfo)> callback)
380 : loop_(loop),
381 callback_(std::move(callback)),
382 signalfd_({SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE,
383 SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {
384 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
385 signalfd_siginfo info = signalfd_.Read();
386
387 if (info.ssi_signo == 0) {
388 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
389 return;
390 }
391
392 callback_(info);
393 });
394}
395
396SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
397
398Starter::Starter(const aos::Configuration *event_loop_config)
399 : config_msg_(event_loop_config),
400 event_loop_(event_loop_config),
401 status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
402 status_timer_(event_loop_.AddTimer([this] { SendStatus(); })),
403 cleanup_timer_(event_loop_.AddTimer([this] { event_loop_.Exit(); })),
404 listener_(&event_loop_,
405 [this](signalfd_siginfo signal) { OnSignal(signal); }) {
Tyler Chatowa79419d2020-08-12 20:12:11 -0700406 event_loop_.SkipAosLog();
407
408 event_loop_.OnRun([this] {
409 status_timer_->Setup(event_loop_.monotonic_now(),
410 std::chrono::milliseconds(500));
411 });
412
413 event_loop_.MakeWatcher("/aos", [this](const aos::starter::StarterRpc &cmd) {
414 if (!cmd.has_command() || !cmd.has_name() || exiting_) {
415 return;
416 }
417 LOG(INFO) << "Received command "
418 << aos::starter::EnumNameCommand(cmd.command()) << ' '
419 << cmd.name()->string_view();
420
421 auto search = applications_.find(cmd.name()->str());
422 if (search != applications_.end()) {
423 // If an applicatione exists by the given name, dispatch the command
424 search->second.HandleCommand(cmd.command());
425 }
426 });
427
428 if (config_msg_->has_applications()) {
429 const flatbuffers::Vector<flatbuffers::Offset<aos::Application>>
430 *applications = config_msg_->applications();
Ravago Jones7e2dd322020-11-21 15:58:58 -0800431
432 if (aos::configuration::MultiNode(config_msg_)) {
433 std::string_view current_node = event_loop_.node()->name()->string_view();
434 for (const aos::Application *application : *applications) {
435 CHECK(application->has_nodes());
436 for (const flatbuffers::String *node : *application->nodes()) {
437 if (node->string_view() == current_node) {
438 AddApplication(application);
439 break;
440 }
441 }
442 }
443 } else {
444 for (const aos::Application *application : *applications) {
445 AddApplication(application);
446 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700447 }
448 }
449}
450
451void Starter::Cleanup() {
452 if (exiting_) {
453 return;
454 }
455 exiting_ = true;
456 for (auto &application : applications_) {
457 application.second.Terminate();
458 }
459 cleanup_timer_->Setup(event_loop_.monotonic_now() +
460 std::chrono::milliseconds(1500));
461}
462
463void Starter::OnSignal(signalfd_siginfo info) {
464 LOG(INFO) << "Received signal " << strsignal(info.ssi_signo);
465
466 if (info.ssi_signo == SIGCHLD) {
467 // SIGCHLD messages can be collapsed if multiple are received, so all
468 // applications must check their status.
469 for (auto iter = applications_.begin(); iter != applications_.end();) {
470 if (iter->second.MaybeHandleSignal()) {
471 iter = applications_.erase(iter);
472 } else {
473 ++iter;
474 }
475 }
476
477 if (exiting_ && applications_.empty()) {
478 event_loop_.Exit();
479 }
480 } else if (std::find(kStarterDeath.begin(), kStarterDeath.end(),
481 info.ssi_signo) != kStarterDeath.end()) {
482 LOG(WARNING) << "Starter shutting down";
483 Cleanup();
484 }
485}
486
487Application *Starter::AddApplication(const aos::Application *application) {
488 auto [iter, success] = applications_.try_emplace(application->name()->str(),
489 application, &event_loop_);
490 if (success) {
491 if (application->has_args()) {
492 iter->second.set_args(*application->args());
493 }
494 return &(iter->second);
495 }
496 return nullptr;
497}
498
499void Starter::Run() {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -0800500#ifdef AOS_ARCHITECTURE_arm_frc
501 PCHECK(setuid(0) == 0) << "Failed to change user to root";
502#endif
503
Tyler Chatowa79419d2020-08-12 20:12:11 -0700504 for (auto &application : applications_) {
Austin Schuh5f79a5a2021-10-12 17:46:50 -0700505 if (application.second.autostart()) {
506 application.second.Start();
507 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700508 }
509
510 event_loop_.Run();
511}
512
513void Starter::SendStatus() {
514 aos::Sender<aos::starter::Status>::Builder builder =
515 status_sender_.MakeBuilder();
516
517 std::vector<flatbuffers::Offset<aos::starter::ApplicationStatus>> statuses;
518
519 for (auto &application : applications_) {
520 statuses.push_back(application.second.PopulateStatus(builder.fbb()));
521 }
522
523 auto statuses_fbs = builder.fbb()->CreateVector(statuses);
524
525 aos::starter::Status::Builder status_builder(*builder.fbb());
526 status_builder.add_statuses(statuses_fbs);
527 CHECK(builder.Send(status_builder.Finish()));
528}
529
530} // namespace starter
531} // namespace aos