blob: e68f604c20342bac4bf375a3a9409a691a8747c1 [file] [log] [blame]
James Kuszmaul3224b8e2022-01-07 19:00:39 -08001#include "aos/starter/subprocess.h"
2
3#include <grp.h>
4#include <pwd.h>
5#include <sys/prctl.h>
6#include <sys/types.h>
7#include <sys/wait.h>
8
9#include "glog/logging.h"
10
11namespace aos::starter {
12
13SignalListener::SignalListener(aos::ShmEventLoop *loop,
14 std::function<void(signalfd_siginfo)> callback)
15 : SignalListener(loop, callback,
16 {SIGHUP, SIGINT, SIGQUIT, SIGABRT, SIGFPE, SIGSEGV,
17 SIGPIPE, SIGTERM, SIGBUS, SIGXCPU, SIGCHLD}) {}
18
19SignalListener::SignalListener(aos::ShmEventLoop *loop,
20 std::function<void(signalfd_siginfo)> callback,
21 std::initializer_list<unsigned int> signals)
22 : loop_(loop), callback_(std::move(callback)), signalfd_(signals) {
23 loop->epoll()->OnReadable(signalfd_.fd(), [this] {
24 signalfd_siginfo info = signalfd_.Read();
25
26 if (info.ssi_signo == 0) {
27 LOG(WARNING) << "Could not read " << sizeof(signalfd_siginfo) << " bytes";
28 return;
29 }
30
31 callback_(info);
32 });
33}
34
35SignalListener::~SignalListener() { loop_->epoll()->DeleteFd(signalfd_.fd()); }
36
37Application::Application(const aos::Application *application,
38 aos::EventLoop *event_loop,
39 std::function<void()> on_change)
40 : name_(application->name()->string_view()),
41 path_(application->has_executable_name()
42 ? application->executable_name()->string_view()
43 : application->name()->string_view()),
44 args_(1),
45 user_name_(application->has_user() ? application->user()->str() : ""),
46 user_(application->has_user() ? FindUid(user_name_.c_str())
47 : std::nullopt),
48 group_(application->has_user() ? FindPrimaryGidForUser(user_name_.c_str())
49 : std::nullopt),
50 autostart_(application->autostart()),
51 autorestart_(application->autorestart()),
52 event_loop_(event_loop),
53 start_timer_(event_loop_->AddTimer([this] {
54 status_ = aos::starter::State::RUNNING;
55 LOG(INFO) << "Started '" << name_ << "' pid: " << pid_;
56 })),
57 restart_timer_(event_loop_->AddTimer([this] { DoStart(); })),
58 stop_timer_(event_loop_->AddTimer([this] {
59 if (kill(pid_, SIGKILL) == 0) {
60 LOG(WARNING) << "Failed to stop, sending SIGKILL to '" << name_
61 << "' pid: " << pid_;
62 }
63 })),
64 on_change_(on_change) {}
65
66void Application::DoStart() {
67 if (status_ != aos::starter::State::WAITING) {
68 return;
69 }
70
71 start_timer_->Disable();
72 restart_timer_->Disable();
73
74 std::tie(read_pipe_, write_pipe_) = util::ScopedPipe::MakePipe();
75
76 const pid_t pid = fork();
77
78 if (pid != 0) {
79 if (pid == -1) {
80 PLOG(WARNING) << "Failed to fork '" << name_ << "'";
81 stop_reason_ = aos::starter::LastStopReason::FORK_ERR;
82 status_ = aos::starter::State::STOPPED;
83 } else {
84 pid_ = pid;
85 id_ = next_id_++;
86 start_time_ = event_loop_->monotonic_now();
87 status_ = aos::starter::State::STARTING;
88 LOG(INFO) << "Starting '" << name_ << "' pid " << pid_;
89
90 // Setup timer which moves application to RUNNING state if it is still
91 // alive in 1 second.
92 start_timer_->Setup(event_loop_->monotonic_now() +
93 std::chrono::seconds(1));
94 }
95 on_change_();
96 return;
97 }
98
99 // Clear out signal mask of parent so forked process receives all signals
100 // normally.
101 sigset_t empty_mask;
102 sigemptyset(&empty_mask);
103 sigprocmask(SIG_SETMASK, &empty_mask, nullptr);
104
105 // Cleanup children if starter dies in a way that is not handled gracefully.
106 if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
107 write_pipe_.Write(
108 static_cast<uint32_t>(aos::starter::LastStopReason::SET_PRCTL_ERR));
109 PLOG(FATAL) << "Could not set PR_SET_PDEATHSIG to SIGKILL";
110 }
111
112 if (group_) {
113 CHECK(!user_name_.empty());
114 // The manpage for setgroups says we just need CAP_SETGID, but empirically
115 // we also need the effective UID to be 0 to make it work. user_ must also
116 // be set so we change this effective UID back later.
117 CHECK(user_);
118 if (seteuid(0) == -1) {
119 write_pipe_.Write(
120 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
121 PLOG(FATAL) << "Could not seteuid(0) for " << name_
122 << " in preparation for setting groups";
123 }
124 if (initgroups(user_name_.c_str(), *group_) == -1) {
125 write_pipe_.Write(
126 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
127 PLOG(FATAL) << "Could not initialize normal groups for " << name_
128 << " as " << user_name_ << " with " << *group_;
129 }
130 if (setgid(*group_) == -1) {
131 write_pipe_.Write(
132 static_cast<uint32_t>(aos::starter::LastStopReason::SET_GRP_ERR));
133 PLOG(FATAL) << "Could not set group for " << name_ << " to " << *group_;
134 }
135 }
136
137 if (user_) {
138 if (setuid(*user_) == -1) {
139 write_pipe_.Write(
140 static_cast<uint32_t>(aos::starter::LastStopReason::SET_USR_ERR));
141 PLOG(FATAL) << "Could not set user for " << name_ << " to " << *user_;
142 }
143 }
144
145 // argv[0] should be the program name
146 args_.insert(args_.begin(), path_.data());
147
148 execvp(path_.c_str(), args_.data());
149
150 // If we got here, something went wrong
151 write_pipe_.Write(
152 static_cast<uint32_t>(aos::starter::LastStopReason::EXECV_ERR));
153 PLOG(WARNING) << "Could not execute " << name_ << " (" << path_ << ')';
154
155 _exit(EXIT_FAILURE);
156}
157
158void Application::DoStop(bool restart) {
159 // If stop or restart received, the old state of these is no longer applicable
160 // so cancel both.
161 restart_timer_->Disable();
162 start_timer_->Disable();
163
164 switch (status_) {
165 case aos::starter::State::STARTING:
166 case aos::starter::State::RUNNING: {
167 LOG(INFO) << "Stopping '" << name_ << "' pid: " << pid_ << " with signal "
168 << SIGINT;
169 status_ = aos::starter::State::STOPPING;
170
171 kill(pid_, SIGINT);
172
173 // Watchdog timer to SIGKILL application if it is still running 1 second
174 // after SIGINT
175 stop_timer_->Setup(event_loop_->monotonic_now() +
176 std::chrono::seconds(1));
177 queue_restart_ = restart;
178 on_change_();
179 break;
180 }
181 case aos::starter::State::WAITING: {
182 // If waiting to restart, and receives restart, skip the waiting period
183 // and restart immediately. If stop received, all we have to do is move
184 // to the STOPPED state.
185 if (restart) {
186 DoStart();
187 } else {
188 status_ = aos::starter::State::STOPPED;
189 on_change_();
190 }
191 break;
192 }
193 case aos::starter::State::STOPPING: {
194 // If the application is already stopping, then we just need to update the
195 // restart flag to the most recent status.
196 queue_restart_ = restart;
197 break;
198 }
199 case aos::starter::State::STOPPED: {
200 // Restart immediately if the application is already stopped
201 if (restart) {
202 status_ = aos::starter::State::WAITING;
203 DoStart();
204 }
205 break;
206 }
207 }
208}
209
210void Application::QueueStart() {
211 status_ = aos::starter::State::WAITING;
212
213 LOG(INFO) << "Restarting " << name_ << " in 3 seconds";
214 restart_timer_->Setup(event_loop_->monotonic_now() + std::chrono::seconds(3));
215 start_timer_->Disable();
216 stop_timer_->Disable();
217 on_change_();
218}
219
220void Application::set_args(
221 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> &v) {
222 args_.clear();
223 std::transform(v.begin(), v.end(), std::back_inserter(args_),
224 [](const flatbuffers::String *str) {
225 return const_cast<char *>(str->c_str());
226 });
227 args_.push_back(nullptr);
228}
229
230std::optional<uid_t> Application::FindUid(const char *name) {
231 // TODO(austin): Use the reentrant version. This should be safe.
232 struct passwd *user_data = getpwnam(name);
233 if (user_data != nullptr) {
234 return user_data->pw_uid;
235 } else {
236 LOG(FATAL) << "Could not find user " << name;
237 return std::nullopt;
238 }
239}
240
241std::optional<gid_t> Application::FindPrimaryGidForUser(const char *name) {
242 // TODO(austin): Use the reentrant version. This should be safe.
243 struct passwd *user_data = getpwnam(name);
244 if (user_data != nullptr) {
245 return user_data->pw_gid;
246 } else {
247 LOG(FATAL) << "Could not find user " << name;
248 return std::nullopt;
249 }
250}
251
252flatbuffers::Offset<aos::starter::ApplicationStatus>
253Application::PopulateStatus(flatbuffers::FlatBufferBuilder *builder) {
254 CHECK_NOTNULL(builder);
255 auto name_fbs = builder->CreateString(name_);
256
257 aos::starter::ApplicationStatus::Builder status_builder(*builder);
258 status_builder.add_name(name_fbs);
259 status_builder.add_state(status_);
260 status_builder.add_last_exit_code(exit_code_);
261 status_builder.add_last_stop_reason(stop_reason_);
262 if (pid_ != -1) {
263 status_builder.add_pid(pid_);
264 status_builder.add_id(id_);
265 }
266 status_builder.add_last_start_time(start_time_.time_since_epoch().count());
267 return status_builder.Finish();
268}
269
270void Application::Terminate() {
271 stop_reason_ = aos::starter::LastStopReason::TERMINATE;
272 DoStop(false);
273 terminating_ = true;
274}
275
276void Application::HandleCommand(aos::starter::Command cmd) {
277 switch (cmd) {
278 case aos::starter::Command::START: {
279 switch (status_) {
280 case aos::starter::State::WAITING: {
281 restart_timer_->Disable();
282 DoStart();
283 break;
284 }
285 case aos::starter::State::STARTING: {
286 break;
287 }
288 case aos::starter::State::RUNNING: {
289 break;
290 }
291 case aos::starter::State::STOPPING: {
292 queue_restart_ = true;
293 break;
294 }
295 case aos::starter::State::STOPPED: {
296 status_ = aos::starter::State::WAITING;
297 DoStart();
298 break;
299 }
300 }
301 break;
302 }
303 case aos::starter::Command::STOP: {
304 stop_reason_ = aos::starter::LastStopReason::STOP_REQUESTED;
305 DoStop(false);
306 break;
307 }
308 case aos::starter::Command::RESTART: {
309 stop_reason_ = aos::starter::LastStopReason::RESTART_REQUESTED;
310 DoStop(true);
311 break;
312 }
313 }
314}
315
316bool Application::MaybeHandleSignal() {
317 int status;
318
319 // Check if the status of this process has changed
320 if (pid_ == -1 || waitpid(pid_, &status, WNOHANG) != pid_) {
321 return false;
322 }
323
324 // Check that the event was the process exiting
325 if (!WIFEXITED(status) && !WIFSIGNALED(status)) {
326 return false;
327 }
328
329 exit_time_ = event_loop_->monotonic_now();
330 exit_code_ = WIFEXITED(status) ? WEXITSTATUS(status) : WTERMSIG(status);
331
332 if (auto read_result = read_pipe_.Read()) {
333 stop_reason_ = static_cast<aos::starter::LastStopReason>(*read_result);
334 }
335
336 switch (status_) {
337 case aos::starter::State::STARTING: {
338 LOG(WARNING) << "Failed to start '" << name_ << "' on pid " << pid_
339 << " : Exited with status " << exit_code_;
340 if (autorestart()) {
341 QueueStart();
342 }
343 break;
344 }
345 case aos::starter::State::RUNNING: {
346 if (exit_code_ == 0) {
347 LOG(INFO) << "Application '" << name_ << "' pid " << pid_
348 << " exited with status " << exit_code_;
349 } else {
350 LOG(WARNING) << "Application '" << name_ << "' pid " << pid_
351 << " exited unexpectedly with status " << exit_code_;
352 }
353 if (autorestart()) {
354 QueueStart();
355 }
356 break;
357 }
358 case aos::starter::State::STOPPING: {
359 LOG(INFO) << "Successfully stopped '" << name_ << "' pid: " << pid_
360 << " with status " << exit_code_;
361 status_ = aos::starter::State::STOPPED;
362
363 // Disable force stop timer since the process already died
364 stop_timer_->Disable();
365
366 on_change_();
367 if (terminating_) {
368 return true;
369 }
370
371 if (queue_restart_) {
372 queue_restart_ = false;
373 status_ = aos::starter::State::WAITING;
374 DoStart();
375 }
376 break;
377 }
378 case aos::starter::State::WAITING:
379 case aos::starter::State::STOPPED: {
380 LOG(FATAL)
381 << "Received signal on process that was already stopped : name: '"
382 << name_ << "' pid: " << pid_;
383 break;
384 }
385 }
386
387 return false;
388}
389
390} // namespace aos::starter