blob: ff8bff9a6c5e1929e1af47fd7ef6457180d4a353 [file] [log] [blame]
#include "starterd_lib.h"

#include <fcntl.h>
#include <pwd.h>
#include <sys/fsuid.h>
#include <sys/prctl.h>
#include <unistd.h>

#include <algorithm>
#include <utility>

#include "glog/logging.h"
#include "glog/stl_logging.h"
13
14namespace aos {
15namespace starter {
16
17Application::Application(const aos::Application *application,
18 aos::ShmEventLoop *event_loop)
19 : name_(application->name()->string_view()),
20 path_(application->has_executable_name()
21 ? application->executable_name()->string_view()
22 : application->name()->string_view()),
Tyler Chatow2acff482020-12-19 22:29:04 -080023 args_(1),
Tyler Chatowa79419d2020-08-12 20:12:11 -070024 user_(application->has_user() ? FindUid(application->user()->c_str())
25 : std::nullopt),
Austin Schuh5f79a5a2021-10-12 17:46:50 -070026 autostart_(application->autostart()),
Tyler Chatowa79419d2020-08-12 20:12:11 -070027 event_loop_(event_loop),
28 start_timer_(event_loop_->AddTimer([this] {
29 status_ = aos::starter::State::RUNNING;
30 LOG(INFO) << "Started " << name_;
31 })),
32 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
33 stop_timer_(event_loop_->AddTimer([this] {
34 if (kill(pid_, SIGKILL) == 0) {
35 LOG(WARNING) << "Sent SIGKILL to " << name_ << " pid: " << pid_;
36 }
Austin Schuh5f79a5a2021-10-12 17:46:50 -070037 })) {}
Tyler Chatowa79419d2020-08-12 20:12:11 -070038
39void Application::DoStart() {
40 if (status_ != aos::starter::State::WAITING) {
41 return;
42 }
43
44 start_timer_->Disable();
45 restart_timer_->Disable();
46
47 LOG(INFO) << "Starting " << name_;
48
49 std::tie(read_pipe_, write_pipe_) = ScopedPipe::MakePipe();
50
51 const pid_t pid = fork();
52
53 if (pid != 0) {
54 if (pid == -1) {
55 PLOG(WARNING) << "Failed to fork";
56 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
57 status_ = aos::starter::State::STOPPED;
58 } else {
59 pid_ = pid;
60 id_ = next_id_++;
61 start_time_ = event_loop_->monotonic_now();
62 status_ = aos::starter::State::STARTING;
63
64 // Setup timer which moves application to RUNNING state if it is still
65 // alive in 1 second.
66 start_timer_->Setup(event_loop_->monotonic_now() +
67 std::chrono::seconds(1));
68 }
69 return;
70 }
71
72 // Clear out signal mask of parent so forked process receives all signals
73 // normally.
74 sigset_t empty_mask;
75 sigemptyset(&empty_mask);
76 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
77
78 // Cleanup children if starter dies in a way that is not handled gracefully.
79 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
80 write_pipe_.Write(
81 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
82 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
83 }
84
85 if (user_) {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -080086 if (setuid(*user_) == -1) {
Tyler Chatowa79419d2020-08-12 20:12:11 -070087 write_pipe_.Write(
88 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
89 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
90 }
91 }
92
93 // argv[0] should be the program name
94 args_.insert(args_.begin(), path_.data());
95
96 execv(path_.c_str(), args_.data());
97
98 // If we got here, something went wrong
99 write_pipe_.Write(
100 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
101 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
102
103 _exit(EXIT_FAILURE);
104}
105
106void Application::DoStop(bool restart) {
107 // If stop or restart received, the old state of these is no longer applicable
108 // so cancel both.
109 restart_timer_->Disable();
110 start_timer_->Disable();
111
112 switch (status_) {
113 case aos::starter::State::STARTING:
114 case aos::starter::State::RUNNING: {
115 LOG(INFO) << "Killing " << name_ << " pid: " << pid_;
116 status_ = aos::starter::State::STOPPING;
117
118 kill(pid_, SIGINT);
119
120 // Watchdog timer to SIGKILL application if it is still running 1 second
121 // after SIGINT
122 stop_timer_->Setup(event_loop_->monotonic_now() +
123 std::chrono::seconds(1));
124 queue_restart_ = restart;
125 break;
126 }
127 case aos::starter::State::WAITING: {
128 // If waiting to restart, and receives restart, skip the waiting period
129 // and restart immediately. If stop received, all we have to do is move
130 // to the STOPPED state.
131 if (restart) {
132 DoStart();
133 } else {
134 status_ = aos::starter::State::STOPPED;
135 }
136 break;
137 }
138 case aos::starter::State::STOPPING: {
139 // If the application is already stopping, then we just need to update the
140 // restart flag to the most recent status.
141 queue_restart_ = restart;
142 break;
143 }
144 case aos::starter::State::STOPPED: {
145 // Restart immediately if the application is already stopped
146 if (restart) {
147 status_ = aos::starter::State::WAITING;
148 DoStart();
149 }
150 break;
151 }
152 }
153}
154
155void Application::QueueStart() {
156 status_ = aos::starter::State::WAITING;
157
158 LOG(INFO) << "Restarting " << name_ << " in 1 second";
159 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(1));
160 start_timer_->Disable();
161 stop_timer_->Disable();
162}
163
164void Application::set_args(
165 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
166 args_.clear();
167 std::transform(v.begin(), v.end(), std::back_inserter(args_),
168 [](const flatbuffers::String *str) {
169 return const_cast<char *>(str->c_str());
170 });
171 args_.push_back(nullptr);
172}
173
174std::optional<uid_t> Application::FindUid(const char *name) {
175 struct passwd *user_data = getpwnam(name);
176 if (user_data != nullptr) {
177 return user_data->pw_uid;
178 } else {
179 LOG(FATAL) << "Could not find user " << name;
180 return std::nullopt;
181 }
182}
183
184flatbuffers::Offset<aos::starter::ApplicationStatus>
185Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
186 CHECK_NOTNULL(builder);
187 auto name_fbs = builder->CreateString(name_);
188
189 aos::starter::ApplicationStatus::Builder status_builder(*builder);
190 status_builder.add_name(name_fbs);
191 status_builder.add_state(status_);
192 status_builder.add_last_exit_code(exit_code_);
193 status_builder.add_last_stop_reason(stop_reason_);
194 if (pid_ != -1) {
195 status_builder.add_pid(pid_);
196 status_builder.add_id(id_);
197 }
198 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
199 return status_builder.Finish();
200}
201
202void Application::Terminate() {
203 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
204 DoStop(false);
205 terminating_ = true;
206}
207
208void Application::HandleCommand(aos::starter::Command cmd) {
209 switch (cmd) {
210 case aos::starter::Command::START: {
211 switch (status_) {
212 case aos::starter::State::WAITING: {
213 restart_timer_->Disable();
214 DoStart();
215 break;
216 }
217 case aos::starter::State::STARTING: {
218 break;
219 }
220 case aos::starter::State::RUNNING: {
221 break;
222 }
223 case aos::starter::State::STOPPING: {
224 queue_restart_ = true;
225 break;
226 }
227 case aos::starter::State::STOPPED: {
228 status_ = aos::starter::State::WAITING;
229 DoStart();
230 break;
231 }
232 }
233 break;
234 }
235 case aos::starter::Command::STOP: {
236 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
237 DoStop(false);
238 break;
239 }
240 case aos::starter::Command::RESTART: {
241 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
242 DoStop(true);
243 break;
244 }
245 }
246}
247
248bool Application::MaybeHandleSignal() {
249 int status;
250
251 // Check if the status of this process has changed
252 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
253 return false;
254 }
255
256 // Check that the event was the process exiting
257 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
258 return false;
259 }
260
261 exit_time_ = event_loop_->monotonic_now();
262 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
263
264 if (auto read_result = read_pipe_.Read()) {
265 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
266 }
267
268 switch (status_) {
269 case aos::starter::State::STARTING: {
270 LOG(WARNING) << "Failed to start " << name_ << " on pid " << pid_
271 << " : Exited with status " << exit_code_;
272 QueueStart();
273 break;
274 }
275 case aos::starter::State::RUNNING: {
276 QueueStart();
277 break;
278 }
279 case aos::starter::State::STOPPING: {
280 LOG(INFO) << "Successfully stopped " << name_;
281 status_ = aos::starter::State::STOPPED;
282
283 // Disable force stop timer since the process already died
284 stop_timer_->Disable();
285
286 if (terminating_) {
287 return true;
288 }
289
290 if (queue_restart_) {
291 queue_restart_ = false;
292 status_ = aos::starter::State::WAITING;
293 DoStart();
294 }
295 break;
296 }
297 case aos::starter::State::WAITING:
298 case aos::starter::State::STOPPED: {
299 LOG(FATAL)
300 << "Received signal on process that was already stopped : name: "
301 << name_ << " pid: " << pid_;
302 break;
303 }
304 }
305
306 return false;
307}
308
309ScopedPipe::ScopedPipe(int fd) : fd_(fd) {}
310
311ScopedPipe::~ScopedPipe() {
312 if (fd_ != -1) {
313 PCHECK(close(fd_) != -1);
314 }
315}
316
317ScopedPipe::ScopedPipe(ScopedPipe &&scoped_pipe) : fd_(scoped_pipe.fd_) {
318 scoped_pipe.fd_ = -1;
319}
320
321ScopedPipe &ScopedPipe::operator=(ScopedPipe &&scoped_pipe) {
322 if (fd_ != -1) {
323 PCHECK(close(fd_) != -1);
324 }
325 fd_ = scoped_pipe.fd_;
326 scoped_pipe.fd_ = -1;
327 return *this;
328}
329
330std::tuple<ScopedPipe::ScopedReadPipe, ScopedPipe::ScopedWritePipe>
331ScopedPipe::MakePipe() {
332 int fds[2];
333 PCHECK(pipe(fds) != -1);
334 PCHECK(fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK) != -1);
335 PCHECK(fcntl(fds[1], F_SETFL, fcntl(fds[1], F_GETFL) | O_NONBLOCK) != -1);
336 return {ScopedReadPipe(fds[0]), ScopedWritePipe(fds[1])};
337}
338
339std::optional<uint32_t> ScopedPipe::ScopedReadPipe::Read() {
340 uint32_t buf;
341 ssize_t result = read(fd(), &buf, sizeof(buf));
342 if (result == sizeof(buf)) {
343 return buf;
344 } else {
345 return std::nullopt;
346 }
347}
348
349void ScopedPipe::ScopedWritePipe::Write(uint32_t data) {
350 ssize_t result = write(fd(), &data, sizeof(data));
351 PCHECK(result != -1);
352 CHECK(result == sizeof(data));
353}
354
355SignalListener::SignalListener(aos::ShmEventLoop *loop,
356 std::function<void(signalfd_siginfo)> callback)
357 : loop_(loop),
358 callback_(std::move(callback)),
359 signalfd_({SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE,
360 SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {
361 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
362 signalfd_siginfo info = signalfd_.Read();
363
364 if (info.ssi_signo == 0) {
365 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
366 return;
367 }
368
369 callback_(info);
370 });
371}
372
373SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
374
375Starter::Starter(const aos::Configuration *event_loop_config)
376 : config_msg_(event_loop_config),
377 event_loop_(event_loop_config),
378 status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
379 status_timer_(event_loop_.AddTimer([this] { SendStatus(); })),
380 cleanup_timer_(event_loop_.AddTimer([this] { event_loop_.Exit(); })),
381 listener_(&event_loop_,
382 [this](signalfd_siginfo signal) { OnSignal(signal); }) {
383 event_loop_.SkipTimingReport();
384 event_loop_.SkipAosLog();
385
386 event_loop_.OnRun([this] {
387 status_timer_->Setup(event_loop_.monotonic_now(),
388 std::chrono::milliseconds(500));
389 });
390
391 event_loop_.MakeWatcher("/aos", [this](const aos::starter::StarterRpc &cmd) {
392 if (!cmd.has_command() || !cmd.has_name() || exiting_) {
393 return;
394 }
395 LOG(INFO) << "Received command "
396 << aos::starter::EnumNameCommand(cmd.command()) << ' '
397 << cmd.name()->string_view();
398
399 auto search = applications_.find(cmd.name()->str());
400 if (search != applications_.end()) {
401 // If an applicatione exists by the given name, dispatch the command
402 search->second.HandleCommand(cmd.command());
403 }
404 });
405
406 if (config_msg_->has_applications()) {
407 const flatbuffers::Vector<flatbuffers::Offset<aos::Application>>
408 *applications = config_msg_->applications();
Ravago Jones7e2dd322020-11-21 15:58:58 -0800409
410 if (aos::configuration::MultiNode(config_msg_)) {
411 std::string_view current_node = event_loop_.node()->name()->string_view();
412 for (const aos::Application *application : *applications) {
413 CHECK(application->has_nodes());
414 for (const flatbuffers::String *node : *application->nodes()) {
415 if (node->string_view() == current_node) {
416 AddApplication(application);
417 break;
418 }
419 }
420 }
421 } else {
422 for (const aos::Application *application : *applications) {
423 AddApplication(application);
424 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700425 }
426 }
427}
428
429void Starter::Cleanup() {
430 if (exiting_) {
431 return;
432 }
433 exiting_ = true;
434 for (auto &application : applications_) {
435 application.second.Terminate();
436 }
437 cleanup_timer_->Setup(event_loop_.monotonic_now() +
438 std::chrono::milliseconds(1500));
439}
440
441void Starter::OnSignal(signalfd_siginfo info) {
442 LOG(INFO) << "Received signal " << strsignal(info.ssi_signo);
443
444 if (info.ssi_signo == SIGCHLD) {
445 // SIGCHLD messages can be collapsed if multiple are received, so all
446 // applications must check their status.
447 for (auto iter = applications_.begin(); iter != applications_.end();) {
448 if (iter->second.MaybeHandleSignal()) {
449 iter = applications_.erase(iter);
450 } else {
451 ++iter;
452 }
453 }
454
455 if (exiting_ && applications_.empty()) {
456 event_loop_.Exit();
457 }
458 } else if (std::find(kStarterDeath.begin(), kStarterDeath.end(),
459 info.ssi_signo) != kStarterDeath.end()) {
460 LOG(WARNING) << "Starter shutting down";
461 Cleanup();
462 }
463}
464
465Application *Starter::AddApplication(const aos::Application *application) {
466 auto [iter, success] = applications_.try_emplace(application->name()->str(),
467 application, &event_loop_);
468 if (success) {
469 if (application->has_args()) {
470 iter->second.set_args(*application->args());
471 }
472 return &(iter->second);
473 }
474 return nullptr;
475}
476
477void Starter::Run() {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -0800478#ifdef AOS_ARCHITECTURE_arm_frc
479 PCHECK(setuid(0) == 0) << "Failed to change user to root";
480#endif
481
Tyler Chatowa79419d2020-08-12 20:12:11 -0700482 for (auto &application : applications_) {
Austin Schuh5f79a5a2021-10-12 17:46:50 -0700483 if (application.second.autostart()) {
484 application.second.Start();
485 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700486 }
487
488 event_loop_.Run();
489}
490
491void Starter::SendStatus() {
492 aos::Sender<aos::starter::Status>::Builder builder =
493 status_sender_.MakeBuilder();
494
495 std::vector<flatbuffers::Offset<aos::starter::ApplicationStatus>> statuses;
496
497 for (auto &application : applications_) {
498 statuses.push_back(application.second.PopulateStatus(builder.fbb()));
499 }
500
501 auto statuses_fbs = builder.fbb()->CreateVector(statuses);
502
503 aos::starter::Status::Builder status_builder(*builder.fbb());
504 status_builder.add_statuses(statuses_fbs);
505 CHECK(builder.Send(status_builder.Finish()));
506}
507
508} // namespace starter
509} // namespace aos