blob: 1c32be34a90ed9f31dd13018e1bf2cfdd3687601 [file] [log] [blame]
Tyler Chatowa79419d2020-08-12 20:12:11 -07001#include "starterd_lib.h"
2
3#include <fcntl.h>
4#include <pwd.h>
5#include <sys/fsuid.h>
6#include <sys/prctl.h>
7
8#include <algorithm>
9#include <utility>
10
11#include "glog/logging.h"
12#include "glog/stl_logging.h"
13
14namespace aos {
15namespace starter {
16
17Application::Application(const aos::Application *application,
18 aos::ShmEventLoop *event_loop)
19 : name_(application->name()->string_view()),
20 path_(application->has_executable_name()
21 ? application->executable_name()->string_view()
22 : application->name()->string_view()),
Tyler Chatow2acff482020-12-19 22:29:04 -080023 args_(1),
Tyler Chatowa79419d2020-08-12 20:12:11 -070024 user_(application->has_user() ? FindUid(application->user()->c_str())
25 : std::nullopt),
26 event_loop_(event_loop),
27 start_timer_(event_loop_->AddTimer([this] {
28 status_ = aos::starter::State::RUNNING;
29 LOG(INFO) << "Started " << name_;
30 })),
31 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
32 stop_timer_(event_loop_->AddTimer([this] {
33 if (kill(pid_, SIGKILL) == 0) {
34 LOG(WARNING) << "Sent SIGKILL to " << name_ << " pid: " << pid_;
35 }
36 }))
37
38{}
39
40void Application::DoStart() {
41 if (status_ != aos::starter::State::WAITING) {
42 return;
43 }
44
45 start_timer_->Disable();
46 restart_timer_->Disable();
47
48 LOG(INFO) << "Starting " << name_;
49
50 std::tie(read_pipe_, write_pipe_) = ScopedPipe::MakePipe();
51
52 const pid_t pid = fork();
53
54 if (pid != 0) {
55 if (pid == -1) {
56 PLOG(WARNING) << "Failed to fork";
57 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
58 status_ = aos::starter::State::STOPPED;
59 } else {
60 pid_ = pid;
61 id_ = next_id_++;
62 start_time_ = event_loop_->monotonic_now();
63 status_ = aos::starter::State::STARTING;
64
65 // Setup timer which moves application to RUNNING state if it is still
66 // alive in 1 second.
67 start_timer_->Setup(event_loop_->monotonic_now() +
68 std::chrono::seconds(1));
69 }
70 return;
71 }
72
73 // Clear out signal mask of parent so forked process receives all signals
74 // normally.
75 sigset_t empty_mask;
76 sigemptyset(&empty_mask);
77 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
78
79 // Cleanup children if starter dies in a way that is not handled gracefully.
80 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
81 write_pipe_.Write(
82 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
83 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
84 }
85
86 if (user_) {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -080087 if (setuid(*user_) == -1) {
Tyler Chatowa79419d2020-08-12 20:12:11 -070088 write_pipe_.Write(
89 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
90 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
91 }
92 }
93
94 // argv[0] should be the program name
95 args_.insert(args_.begin(), path_.data());
96
97 execv(path_.c_str(), args_.data());
98
99 // If we got here, something went wrong
100 write_pipe_.Write(
101 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
102 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
103
104 _exit(EXIT_FAILURE);
105}
106
107void Application::DoStop(bool restart) {
108 // If stop or restart received, the old state of these is no longer applicable
109 // so cancel both.
110 restart_timer_->Disable();
111 start_timer_->Disable();
112
113 switch (status_) {
114 case aos::starter::State::STARTING:
115 case aos::starter::State::RUNNING: {
116 LOG(INFO) << "Killing " << name_ << " pid: " << pid_;
117 status_ = aos::starter::State::STOPPING;
118
119 kill(pid_, SIGINT);
120
121 // Watchdog timer to SIGKILL application if it is still running 1 second
122 // after SIGINT
123 stop_timer_->Setup(event_loop_->monotonic_now() +
124 std::chrono::seconds(1));
125 queue_restart_ = restart;
126 break;
127 }
128 case aos::starter::State::WAITING: {
129 // If waiting to restart, and receives restart, skip the waiting period
130 // and restart immediately. If stop received, all we have to do is move
131 // to the STOPPED state.
132 if (restart) {
133 DoStart();
134 } else {
135 status_ = aos::starter::State::STOPPED;
136 }
137 break;
138 }
139 case aos::starter::State::STOPPING: {
140 // If the application is already stopping, then we just need to update the
141 // restart flag to the most recent status.
142 queue_restart_ = restart;
143 break;
144 }
145 case aos::starter::State::STOPPED: {
146 // Restart immediately if the application is already stopped
147 if (restart) {
148 status_ = aos::starter::State::WAITING;
149 DoStart();
150 }
151 break;
152 }
153 }
154}
155
156void Application::QueueStart() {
157 status_ = aos::starter::State::WAITING;
158
159 LOG(INFO) << "Restarting " << name_ << " in 1 second";
160 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(1));
161 start_timer_->Disable();
162 stop_timer_->Disable();
163}
164
165void Application::set_args(
166 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
167 args_.clear();
168 std::transform(v.begin(), v.end(), std::back_inserter(args_),
169 [](const flatbuffers::String *str) {
170 return const_cast<char *>(str->c_str());
171 });
172 args_.push_back(nullptr);
173}
174
175std::optional<uid_t> Application::FindUid(const char *name) {
176 struct passwd *user_data = getpwnam(name);
177 if (user_data != nullptr) {
178 return user_data->pw_uid;
179 } else {
180 LOG(FATAL) << "Could not find user " << name;
181 return std::nullopt;
182 }
183}
184
185flatbuffers::Offset<aos::starter::ApplicationStatus>
186Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
187 CHECK_NOTNULL(builder);
188 auto name_fbs = builder->CreateString(name_);
189
190 aos::starter::ApplicationStatus::Builder status_builder(*builder);
191 status_builder.add_name(name_fbs);
192 status_builder.add_state(status_);
193 status_builder.add_last_exit_code(exit_code_);
194 status_builder.add_last_stop_reason(stop_reason_);
195 if (pid_ != -1) {
196 status_builder.add_pid(pid_);
197 status_builder.add_id(id_);
198 }
199 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
200 return status_builder.Finish();
201}
202
203void Application::Terminate() {
204 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
205 DoStop(false);
206 terminating_ = true;
207}
208
209void Application::HandleCommand(aos::starter::Command cmd) {
210 switch (cmd) {
211 case aos::starter::Command::START: {
212 switch (status_) {
213 case aos::starter::State::WAITING: {
214 restart_timer_->Disable();
215 DoStart();
216 break;
217 }
218 case aos::starter::State::STARTING: {
219 break;
220 }
221 case aos::starter::State::RUNNING: {
222 break;
223 }
224 case aos::starter::State::STOPPING: {
225 queue_restart_ = true;
226 break;
227 }
228 case aos::starter::State::STOPPED: {
229 status_ = aos::starter::State::WAITING;
230 DoStart();
231 break;
232 }
233 }
234 break;
235 }
236 case aos::starter::Command::STOP: {
237 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
238 DoStop(false);
239 break;
240 }
241 case aos::starter::Command::RESTART: {
242 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
243 DoStop(true);
244 break;
245 }
246 }
247}
248
249bool Application::MaybeHandleSignal() {
250 int status;
251
252 // Check if the status of this process has changed
253 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
254 return false;
255 }
256
257 // Check that the event was the process exiting
258 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
259 return false;
260 }
261
262 exit_time_ = event_loop_->monotonic_now();
263 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
264
265 if (auto read_result = read_pipe_.Read()) {
266 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
267 }
268
269 switch (status_) {
270 case aos::starter::State::STARTING: {
271 LOG(WARNING) << "Failed to start " << name_ << " on pid " << pid_
272 << " : Exited with status " << exit_code_;
273 QueueStart();
274 break;
275 }
276 case aos::starter::State::RUNNING: {
277 QueueStart();
278 break;
279 }
280 case aos::starter::State::STOPPING: {
281 LOG(INFO) << "Successfully stopped " << name_;
282 status_ = aos::starter::State::STOPPED;
283
284 // Disable force stop timer since the process already died
285 stop_timer_->Disable();
286
287 if (terminating_) {
288 return true;
289 }
290
291 if (queue_restart_) {
292 queue_restart_ = false;
293 status_ = aos::starter::State::WAITING;
294 DoStart();
295 }
296 break;
297 }
298 case aos::starter::State::WAITING:
299 case aos::starter::State::STOPPED: {
300 LOG(FATAL)
301 << "Received signal on process that was already stopped : name: "
302 << name_ << " pid: " << pid_;
303 break;
304 }
305 }
306
307 return false;
308}
309
310ScopedPipe::ScopedPipe(int fd) : fd_(fd) {}
311
312ScopedPipe::~ScopedPipe() {
313 if (fd_ != -1) {
314 PCHECK(close(fd_) != -1);
315 }
316}
317
318ScopedPipe::ScopedPipe(ScopedPipe &&scoped_pipe) : fd_(scoped_pipe.fd_) {
319 scoped_pipe.fd_ = -1;
320}
321
322ScopedPipe &ScopedPipe::operator=(ScopedPipe &&scoped_pipe) {
323 if (fd_ != -1) {
324 PCHECK(close(fd_) != -1);
325 }
326 fd_ = scoped_pipe.fd_;
327 scoped_pipe.fd_ = -1;
328 return *this;
329}
330
331std::tuple<ScopedPipe::ScopedReadPipe, ScopedPipe::ScopedWritePipe>
332ScopedPipe::MakePipe() {
333 int fds[2];
334 PCHECK(pipe(fds) != -1);
335 PCHECK(fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK) != -1);
336 PCHECK(fcntl(fds[1], F_SETFL, fcntl(fds[1], F_GETFL) | O_NONBLOCK) != -1);
337 return {ScopedReadPipe(fds[0]), ScopedWritePipe(fds[1])};
338}
339
340std::optional<uint32_t> ScopedPipe::ScopedReadPipe::Read() {
341 uint32_t buf;
342 ssize_t result = read(fd(), &buf, sizeof(buf));
343 if (result == sizeof(buf)) {
344 return buf;
345 } else {
346 return std::nullopt;
347 }
348}
349
350void ScopedPipe::ScopedWritePipe::Write(uint32_t data) {
351 ssize_t result = write(fd(), &data, sizeof(data));
352 PCHECK(result != -1);
353 CHECK(result == sizeof(data));
354}
355
356SignalListener::SignalListener(aos::ShmEventLoop *loop,
357 std::function<void(signalfd_siginfo)> callback)
358 : loop_(loop),
359 callback_(std::move(callback)),
360 signalfd_({SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE,
361 SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {
362 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
363 signalfd_siginfo info = signalfd_.Read();
364
365 if (info.ssi_signo == 0) {
366 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
367 return;
368 }
369
370 callback_(info);
371 });
372}
373
374SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
375
376Starter::Starter(const aos::Configuration *event_loop_config)
377 : config_msg_(event_loop_config),
378 event_loop_(event_loop_config),
379 status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
380 status_timer_(event_loop_.AddTimer([this] { SendStatus(); })),
381 cleanup_timer_(event_loop_.AddTimer([this] { event_loop_.Exit(); })),
382 listener_(&event_loop_,
383 [this](signalfd_siginfo signal) { OnSignal(signal); }) {
384 event_loop_.SkipTimingReport();
385 event_loop_.SkipAosLog();
386
387 event_loop_.OnRun([this] {
388 status_timer_->Setup(event_loop_.monotonic_now(),
389 std::chrono::milliseconds(500));
390 });
391
392 event_loop_.MakeWatcher("/aos", [this](const aos::starter::StarterRpc &cmd) {
393 if (!cmd.has_command() || !cmd.has_name() || exiting_) {
394 return;
395 }
396 LOG(INFO) << "Received command "
397 << aos::starter::EnumNameCommand(cmd.command()) << ' '
398 << cmd.name()->string_view();
399
400 auto search = applications_.find(cmd.name()->str());
401 if (search != applications_.end()) {
402 // If an applicatione exists by the given name, dispatch the command
403 search->second.HandleCommand(cmd.command());
404 }
405 });
406
407 if (config_msg_->has_applications()) {
408 const flatbuffers::Vector<flatbuffers::Offset<aos::Application>>
409 *applications = config_msg_->applications();
Ravago Jones7e2dd322020-11-21 15:58:58 -0800410
411 if (aos::configuration::MultiNode(config_msg_)) {
412 std::string_view current_node = event_loop_.node()->name()->string_view();
413 for (const aos::Application *application : *applications) {
414 CHECK(application->has_nodes());
415 for (const flatbuffers::String *node : *application->nodes()) {
416 if (node->string_view() == current_node) {
417 AddApplication(application);
418 break;
419 }
420 }
421 }
422 } else {
423 for (const aos::Application *application : *applications) {
424 AddApplication(application);
425 }
Tyler Chatowa79419d2020-08-12 20:12:11 -0700426 }
427 }
428}
429
430void Starter::Cleanup() {
431 if (exiting_) {
432 return;
433 }
434 exiting_ = true;
435 for (auto &application : applications_) {
436 application.second.Terminate();
437 }
438 cleanup_timer_->Setup(event_loop_.monotonic_now() +
439 std::chrono::milliseconds(1500));
440}
441
442void Starter::OnSignal(signalfd_siginfo info) {
443 LOG(INFO) << "Received signal " << strsignal(info.ssi_signo);
444
445 if (info.ssi_signo == SIGCHLD) {
446 // SIGCHLD messages can be collapsed if multiple are received, so all
447 // applications must check their status.
448 for (auto iter = applications_.begin(); iter != applications_.end();) {
449 if (iter->second.MaybeHandleSignal()) {
450 iter = applications_.erase(iter);
451 } else {
452 ++iter;
453 }
454 }
455
456 if (exiting_ && applications_.empty()) {
457 event_loop_.Exit();
458 }
459 } else if (std::find(kStarterDeath.begin(), kStarterDeath.end(),
460 info.ssi_signo) != kStarterDeath.end()) {
461 LOG(WARNING) << "Starter shutting down";
462 Cleanup();
463 }
464}
465
466Application *Starter::AddApplication(const aos::Application *application) {
467 auto [iter, success] = applications_.try_emplace(application->name()->str(),
468 application, &event_loop_);
469 if (success) {
470 if (application->has_args()) {
471 iter->second.set_args(*application->args());
472 }
473 return &(iter->second);
474 }
475 return nullptr;
476}
477
478void Starter::Run() {
Tyler Chatow03fdb2a2020-12-26 18:39:36 -0800479#ifdef AOS_ARCHITECTURE_arm_frc
480 PCHECK(setuid(0) == 0) << "Failed to change user to root";
481#endif
482
Tyler Chatowa79419d2020-08-12 20:12:11 -0700483 for (auto &application : applications_) {
484 application.second.Start();
485 }
486
487 event_loop_.Run();
488}
489
490void Starter::SendStatus() {
491 aos::Sender<aos::starter::Status>::Builder builder =
492 status_sender_.MakeBuilder();
493
494 std::vector<flatbuffers::Offset<aos::starter::ApplicationStatus>> statuses;
495
496 for (auto &application : applications_) {
497 statuses.push_back(application.second.PopulateStatus(builder.fbb()));
498 }
499
500 auto statuses_fbs = builder.fbb()->CreateVector(statuses);
501
502 aos::starter::Status::Builder status_builder(*builder.fbb());
503 status_builder.add_statuses(statuses_fbs);
504 CHECK(builder.Send(status_builder.Finish()));
505}
506
507} // namespace starter
508} // namespace aos