blob: ea7fdee1c05dbb4fc70e37b2ea75dc5e6a8d2508 [file] [log] [blame]
Tyler Chatowa79419d2020-08-12 20:12:11 -07001#include "starterd_lib.h"
2
3#include <fcntl.h>
4#include <pwd.h>
5#include <sys/fsuid.h>
6#include <sys/prctl.h>
7
8#include <algorithm>
9#include <utility>
10
11#include "glog/logging.h"
12#include "glog/stl_logging.h"
13
14namespace aos {
15namespace starter {
16
17Application::Application(const aos::Application *application,
18 aos::ShmEventLoop *event_loop)
19 : name_(application->name()->string_view()),
20 path_(application->has_executable_name()
21 ? application->executable_name()->string_view()
22 : application->name()->string_view()),
23 user_(application->has_user() ? FindUid(application->user()->c_str())
24 : std::nullopt),
25 event_loop_(event_loop),
26 start_timer_(event_loop_->AddTimer([this] {
27 status_ = aos::starter::State::RUNNING;
28 LOG(INFO) << "Started " << name_;
29 })),
30 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
31 stop_timer_(event_loop_->AddTimer([this] {
32 if (kill(pid_, SIGKILL) == 0) {
33 LOG(WARNING) << "Sent SIGKILL to " << name_ << " pid: " << pid_;
34 }
35 }))
36
37{}
38
39void Application::DoStart() {
40 if (status_ != aos::starter::State::WAITING) {
41 return;
42 }
43
44 start_timer_->Disable();
45 restart_timer_->Disable();
46
47 LOG(INFO) << "Starting " << name_;
48
49 std::tie(read_pipe_, write_pipe_) = ScopedPipe::MakePipe();
50
51 const pid_t pid = fork();
52
53 if (pid != 0) {
54 if (pid == -1) {
55 PLOG(WARNING) << "Failed to fork";
56 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
57 status_ = aos::starter::State::STOPPED;
58 } else {
59 pid_ = pid;
60 id_ = next_id_++;
61 start_time_ = event_loop_->monotonic_now();
62 status_ = aos::starter::State::STARTING;
63
64 // Setup timer which moves application to RUNNING state if it is still
65 // alive in 1 second.
66 start_timer_->Setup(event_loop_->monotonic_now() +
67 std::chrono::seconds(1));
68 }
69 return;
70 }
71
72 // Clear out signal mask of parent so forked process receives all signals
73 // normally.
74 sigset_t empty_mask;
75 sigemptyset(&empty_mask);
76 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
77
78 // Cleanup children if starter dies in a way that is not handled gracefully.
79 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
80 write_pipe_.Write(
81 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
82 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
83 }
84
85 if (user_) {
86 if (seteuid(*user_) == -1 || setfsuid(*user_) == -1) {
87 write_pipe_.Write(
88 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
89 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
90 }
91 }
92
93 // argv[0] should be the program name
94 args_.insert(args_.begin(), path_.data());
95
96 execv(path_.c_str(), args_.data());
97
98 // If we got here, something went wrong
99 write_pipe_.Write(
100 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
101 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
102
103 _exit(EXIT_FAILURE);
104}
105
106void Application::DoStop(bool restart) {
107 // If stop or restart received, the old state of these is no longer applicable
108 // so cancel both.
109 restart_timer_->Disable();
110 start_timer_->Disable();
111
112 switch (status_) {
113 case aos::starter::State::STARTING:
114 case aos::starter::State::RUNNING: {
115 LOG(INFO) << "Killing " << name_ << " pid: " << pid_;
116 status_ = aos::starter::State::STOPPING;
117
118 kill(pid_, SIGINT);
119
120 // Watchdog timer to SIGKILL application if it is still running 1 second
121 // after SIGINT
122 stop_timer_->Setup(event_loop_->monotonic_now() +
123 std::chrono::seconds(1));
124 queue_restart_ = restart;
125 break;
126 }
127 case aos::starter::State::WAITING: {
128 // If waiting to restart, and receives restart, skip the waiting period
129 // and restart immediately. If stop received, all we have to do is move
130 // to the STOPPED state.
131 if (restart) {
132 DoStart();
133 } else {
134 status_ = aos::starter::State::STOPPED;
135 }
136 break;
137 }
138 case aos::starter::State::STOPPING: {
139 // If the application is already stopping, then we just need to update the
140 // restart flag to the most recent status.
141 queue_restart_ = restart;
142 break;
143 }
144 case aos::starter::State::STOPPED: {
145 // Restart immediately if the application is already stopped
146 if (restart) {
147 status_ = aos::starter::State::WAITING;
148 DoStart();
149 }
150 break;
151 }
152 }
153}
154
155void Application::QueueStart() {
156 status_ = aos::starter::State::WAITING;
157
158 LOG(INFO) << "Restarting " << name_ << " in 1 second";
159 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(1));
160 start_timer_->Disable();
161 stop_timer_->Disable();
162}
163
164void Application::set_args(
165 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
166 args_.clear();
167 std::transform(v.begin(), v.end(), std::back_inserter(args_),
168 [](const flatbuffers::String *str) {
169 return const_cast<char *>(str->c_str());
170 });
171 args_.push_back(nullptr);
172}
173
174std::optional<uid_t> Application::FindUid(const char *name) {
175 struct passwd *user_data = getpwnam(name);
176 if (user_data != nullptr) {
177 return user_data->pw_uid;
178 } else {
179 LOG(FATAL) << "Could not find user " << name;
180 return std::nullopt;
181 }
182}
183
184flatbuffers::Offset<aos::starter::ApplicationStatus>
185Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
186 CHECK_NOTNULL(builder);
187 auto name_fbs = builder->CreateString(name_);
188
189 aos::starter::ApplicationStatus::Builder status_builder(*builder);
190 status_builder.add_name(name_fbs);
191 status_builder.add_state(status_);
192 status_builder.add_last_exit_code(exit_code_);
193 status_builder.add_last_stop_reason(stop_reason_);
194 if (pid_ != -1) {
195 status_builder.add_pid(pid_);
196 status_builder.add_id(id_);
197 }
198 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
199 return status_builder.Finish();
200}
201
202void Application::Terminate() {
203 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
204 DoStop(false);
205 terminating_ = true;
206}
207
208void Application::HandleCommand(aos::starter::Command cmd) {
209 switch (cmd) {
210 case aos::starter::Command::START: {
211 switch (status_) {
212 case aos::starter::State::WAITING: {
213 restart_timer_->Disable();
214 DoStart();
215 break;
216 }
217 case aos::starter::State::STARTING: {
218 break;
219 }
220 case aos::starter::State::RUNNING: {
221 break;
222 }
223 case aos::starter::State::STOPPING: {
224 queue_restart_ = true;
225 break;
226 }
227 case aos::starter::State::STOPPED: {
228 status_ = aos::starter::State::WAITING;
229 DoStart();
230 break;
231 }
232 }
233 break;
234 }
235 case aos::starter::Command::STOP: {
236 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
237 DoStop(false);
238 break;
239 }
240 case aos::starter::Command::RESTART: {
241 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
242 DoStop(true);
243 break;
244 }
245 }
246}
247
248bool Application::MaybeHandleSignal() {
249 int status;
250
251 // Check if the status of this process has changed
252 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
253 return false;
254 }
255
256 // Check that the event was the process exiting
257 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
258 return false;
259 }
260
261 exit_time_ = event_loop_->monotonic_now();
262 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
263
264 if (auto read_result = read_pipe_.Read()) {
265 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
266 }
267
268 switch (status_) {
269 case aos::starter::State::STARTING: {
270 LOG(WARNING) << "Failed to start " << name_ << " on pid " << pid_
271 << " : Exited with status " << exit_code_;
272 QueueStart();
273 break;
274 }
275 case aos::starter::State::RUNNING: {
276 QueueStart();
277 break;
278 }
279 case aos::starter::State::STOPPING: {
280 LOG(INFO) << "Successfully stopped " << name_;
281 status_ = aos::starter::State::STOPPED;
282
283 // Disable force stop timer since the process already died
284 stop_timer_->Disable();
285
286 if (terminating_) {
287 return true;
288 }
289
290 if (queue_restart_) {
291 queue_restart_ = false;
292 status_ = aos::starter::State::WAITING;
293 DoStart();
294 }
295 break;
296 }
297 case aos::starter::State::WAITING:
298 case aos::starter::State::STOPPED: {
299 LOG(FATAL)
300 << "Received signal on process that was already stopped : name: "
301 << name_ << " pid: " << pid_;
302 break;
303 }
304 }
305
306 return false;
307}
308
309ScopedPipe::ScopedPipe(int fd) : fd_(fd) {}
310
311ScopedPipe::~ScopedPipe() {
312 if (fd_ != -1) {
313 PCHECK(close(fd_) != -1);
314 }
315}
316
317ScopedPipe::ScopedPipe(ScopedPipe &&scoped_pipe) : fd_(scoped_pipe.fd_) {
318 scoped_pipe.fd_ = -1;
319}
320
321ScopedPipe &ScopedPipe::operator=(ScopedPipe &&scoped_pipe) {
322 if (fd_ != -1) {
323 PCHECK(close(fd_) != -1);
324 }
325 fd_ = scoped_pipe.fd_;
326 scoped_pipe.fd_ = -1;
327 return *this;
328}
329
330std::tuple<ScopedPipe::ScopedReadPipe, ScopedPipe::ScopedWritePipe>
331ScopedPipe::MakePipe() {
332 int fds[2];
333 PCHECK(pipe(fds) != -1);
334 PCHECK(fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK) != -1);
335 PCHECK(fcntl(fds[1], F_SETFL, fcntl(fds[1], F_GETFL) | O_NONBLOCK) != -1);
336 return {ScopedReadPipe(fds[0]), ScopedWritePipe(fds[1])};
337}
338
339std::optional<uint32_t> ScopedPipe::ScopedReadPipe::Read() {
340 uint32_t buf;
341 ssize_t result = read(fd(), &buf, sizeof(buf));
342 if (result == sizeof(buf)) {
343 return buf;
344 } else {
345 return std::nullopt;
346 }
347}
348
349void ScopedPipe::ScopedWritePipe::Write(uint32_t data) {
350 ssize_t result = write(fd(), &data, sizeof(data));
351 PCHECK(result != -1);
352 CHECK(result == sizeof(data));
353}
354
355SignalListener::SignalListener(aos::ShmEventLoop *loop,
356 std::function<void(signalfd_siginfo)> callback)
357 : loop_(loop),
358 callback_(std::move(callback)),
359 signalfd_({SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV, SIGPIPE,
360 SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {
361 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
362 signalfd_siginfo info = signalfd_.Read();
363
364 if (info.ssi_signo == 0) {
365 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
366 return;
367 }
368
369 callback_(info);
370 });
371}
372
373SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
374
375Starter::Starter(const aos::Configuration *event_loop_config)
376 : config_msg_(event_loop_config),
377 event_loop_(event_loop_config),
378 status_sender_(event_loop_.MakeSender<aos::starter::Status>("/aos")),
379 status_timer_(event_loop_.AddTimer([this] { SendStatus(); })),
380 cleanup_timer_(event_loop_.AddTimer([this] { event_loop_.Exit(); })),
381 listener_(&event_loop_,
382 [this](signalfd_siginfo signal) { OnSignal(signal); }) {
383 event_loop_.SkipTimingReport();
384 event_loop_.SkipAosLog();
385
386 event_loop_.OnRun([this] {
387 status_timer_->Setup(event_loop_.monotonic_now(),
388 std::chrono::milliseconds(500));
389 });
390
391 event_loop_.MakeWatcher("/aos", [this](const aos::starter::StarterRpc &cmd) {
392 if (!cmd.has_command() || !cmd.has_name() || exiting_) {
393 return;
394 }
395 LOG(INFO) << "Received command "
396 << aos::starter::EnumNameCommand(cmd.command()) << ' '
397 << cmd.name()->string_view();
398
399 auto search = applications_.find(cmd.name()->str());
400 if (search != applications_.end()) {
401 // If an applicatione exists by the given name, dispatch the command
402 search->second.HandleCommand(cmd.command());
403 }
404 });
405
406 if (config_msg_->has_applications()) {
407 const flatbuffers::Vector<flatbuffers::Offset<aos::Application>>
408 *applications = config_msg_->applications();
409 for (const aos::Application *application : *applications) {
410 AddApplication(application);
411 }
412 }
413}
414
415void Starter::Cleanup() {
416 if (exiting_) {
417 return;
418 }
419 exiting_ = true;
420 for (auto &application : applications_) {
421 application.second.Terminate();
422 }
423 cleanup_timer_->Setup(event_loop_.monotonic_now() +
424 std::chrono::milliseconds(1500));
425}
426
427void Starter::OnSignal(signalfd_siginfo info) {
428 LOG(INFO) << "Received signal " << strsignal(info.ssi_signo);
429
430 if (info.ssi_signo == SIGCHLD) {
431 // SIGCHLD messages can be collapsed if multiple are received, so all
432 // applications must check their status.
433 for (auto iter = applications_.begin(); iter != applications_.end();) {
434 if (iter->second.MaybeHandleSignal()) {
435 iter = applications_.erase(iter);
436 } else {
437 ++iter;
438 }
439 }
440
441 if (exiting_ && applications_.empty()) {
442 event_loop_.Exit();
443 }
444 } else if (std::find(kStarterDeath.begin(), kStarterDeath.end(),
445 info.ssi_signo) != kStarterDeath.end()) {
446 LOG(WARNING) << "Starter shutting down";
447 Cleanup();
448 }
449}
450
451Application *Starter::AddApplication(const aos::Application *application) {
452 auto [iter, success] = applications_.try_emplace(application->name()->str(),
453 application, &event_loop_);
454 if (success) {
455 if (application->has_args()) {
456 iter->second.set_args(*application->args());
457 }
458 return &(iter->second);
459 }
460 return nullptr;
461}
462
463void Starter::Run() {
464 for (auto &application : applications_) {
465 application.second.Start();
466 }
467
468 event_loop_.Run();
469}
470
471void Starter::SendStatus() {
472 aos::Sender<aos::starter::Status>::Builder builder =
473 status_sender_.MakeBuilder();
474
475 std::vector<flatbuffers::Offset<aos::starter::ApplicationStatus>> statuses;
476
477 for (auto &application : applications_) {
478 statuses.push_back(application.second.PopulateStatus(builder.fbb()));
479 }
480
481 auto statuses_fbs = builder.fbb()->CreateVector(statuses);
482
483 aos::starter::Status::Builder status_builder(*builder.fbb());
484 status_builder.add_statuses(statuses_fbs);
485 CHECK(builder.Send(status_builder.Finish()));
486}
487
488} // namespace starter
489} // namespace aos