blob: d4d7e10c827bebc6b02cdfe133fd7222170bbb11 [file] [log] [blame]
Brian Silvermand169fcd2013-02-27 13:18:47 -08001#include <stdio.h>
2#include <stdlib.h>
3#include <sys/types.h>
4#include <fcntl.h>
5#include <sys/inotify.h>
6#include <sys/stat.h>
7#include <sys/ioctl.h>
8#include <assert.h>
9#include <signal.h>
10#include <stdint.h>
11#include <errno.h>
12#include <string.h>
13#include <sys/wait.h>
14
15#include <map>
16#include <functional>
17#include <deque>
18#include <fstream>
19#include <queue>
20#include <list>
21#include <string>
22#include <vector>
23#include <memory>
24
25#include <event2/event.h>
26
27#include "aos/common/logging/logging.h"
28#include "aos/common/logging/logging_impl.h"
29#include "aos/atom_code/init.h"
30#include "aos/common/unique_malloc_ptr.h"
31#include "aos/common/time.h"
Brian Silverman5cc661b2013-02-27 15:23:36 -080032#include "aos/common/once.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080033
// This is the main piece of code that starts all of the rest of the code and
// restarts it when the binaries are modified.
//
// NOTE: This program should never exit nicely. It catches all nice attempts to
// exit, forwards them to all of the children that it has started, waits for
// them to exit nicely, and then SIGKILLs anybody left (which will always
// include itself).
41
42using ::std::unique_ptr;
43
44namespace aos {
45namespace starter {
46
Brian Silvermand169fcd2013-02-27 13:18:47 -080047class EventBaseDeleter {
48 public:
49 void operator()(event_base *base) {
Brian Silvermand169fcd2013-02-27 13:18:47 -080050 event_base_free(base);
51 }
52};
53typedef unique_ptr<event_base, EventBaseDeleter> EventBaseUniquePtr;
Brian Silverman5cc661b2013-02-27 15:23:36 -080054EventBaseUniquePtr libevent_base;
Brian Silvermand169fcd2013-02-27 13:18:47 -080055
56class EventDeleter {
57 public:
58 void operator()(event *evt) {
Brian Silvermand169fcd2013-02-27 13:18:47 -080059 if (event_del(evt) != 0) {
60 LOG(WARNING, "event_del(%p) failed\n", evt);
61 }
62 }
63};
64typedef unique_ptr<event, EventDeleter> EventUniquePtr;
65
Brian Silverman5cc661b2013-02-27 15:23:36 -080066// Watches a file path for modifications. Once created, keeps watching until
67// destroyed or RemoveWatch() is called.
Brian Silvermand169fcd2013-02-27 13:18:47 -080068class FileWatch {
69 public:
70 // Will call callback(value) when filename is modified.
71 // If value is NULL, then a pointer to this object will be passed instead.
Brian Silverman5cc661b2013-02-27 15:23:36 -080072 //
73 // Watching for file creations is slightly different. To do that, pass true
74 // for create, the directory where the file will be created for filename, and
75 // the name of the file (without directory name) for check_filename.
Brian Silvermand169fcd2013-02-27 13:18:47 -080076 FileWatch(std::string filename,
77 std::function<void(void *)> callback, void *value,
78 bool create = false, std::string check_filename = "")
79 : filename_(filename), callback_(callback), value_(value),
80 check_filename_(check_filename) {
Brian Silverman5cc661b2013-02-27 15:23:36 -080081 init_once.Get();
82
Brian Silvermand169fcd2013-02-27 13:18:47 -080083 watch_ = inotify_add_watch(notify_fd, filename.c_str(),
84 create ? IN_CREATE : (IN_ATTRIB | IN_MODIFY));
85 if (watch_ == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -080086 LOG(FATAL, "inotify_add_watch(%d, %s,"
87 " %s ? IN_CREATE : (IN_ATTRIB | IN_MODIFY)) failed with %d: %s\n",
88 notify_fd, filename.c_str(), create ? "true" : "false",
89 errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -080090 }
91 watchers[watch_] = this;
92 }
93 // Cleans up everything.
94 ~FileWatch() {
95 if (watch_ != -1) {
96 RemoveWatch();
97 }
98 }
99
100 // After calling this method, this object won't really be doing much of
Brian Silverman5cc661b2013-02-27 15:23:36 -0800101 // anything besides possibly running its callback or something.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800102 void RemoveWatch() {
103 assert(watch_ != -1);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800104
Brian Silvermand169fcd2013-02-27 13:18:47 -0800105 if (inotify_rm_watch(notify_fd, watch_) == -1) {
106 LOG(WARNING, "inotify_rm_watch(%d, %d) failed with %d: %s\n",
107 notify_fd, watch_, errno, strerror(errno));
108 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800109
Brian Silvermand169fcd2013-02-27 13:18:47 -0800110 if (watchers[watch_] != this) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800111 LOG(WARNING, "watcher for %s (%p) didn't find itself in the map\n",
112 filename_.c_str(), this);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800113 } else {
114 watchers.erase(watch_);
115 }
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800116 LOG(DEBUG, "removed watch ID %d\n", watch_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800117 watch_ = -1;
118 }
119
Brian Silverman5cc661b2013-02-27 15:23:36 -0800120 private:
121 // Performs the static initialization. Called by init_once from the
122 // constructor.
123 static void *Init() {
124 notify_fd = inotify_init1(IN_CLOEXEC);
125 EventUniquePtr notify_event(event_new(libevent_base.get(), notify_fd,
126 EV_READ | EV_PERSIST,
127 FileWatch::INotifyReadable, NULL));
128 event_add(notify_event.release(), NULL);
129 return NULL;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800130 }
131
132 // This gets set up as the callback for EV_READ on the inotify file
Brian Silverman5cc661b2013-02-27 15:23:36 -0800133 // descriptor. It calls FileNotified on the appropriate instance.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800134 static void INotifyReadable(int /*fd*/, short /*events*/, void *) {
135 unsigned int to_read;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800136 // Use FIONREAD to figure out how many bytes there are to read.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800137 if (ioctl(notify_fd, FIONREAD, &to_read) < 0) {
138 LOG(FATAL, "FIONREAD(%d, %p) failed with %d: %s\n",
139 notify_fd, &to_read, errno, strerror(errno));
140 }
141 inotify_event *notifyevt = static_cast<inotify_event *>(malloc(to_read));
142 const char *end = reinterpret_cast<char *>(notifyevt) + to_read;
143 aos::unique_c_ptr<inotify_event> freer(notifyevt);
144
145 ssize_t ret = read(notify_fd, notifyevt, to_read);
146 if (ret < 0) {
147 LOG(FATAL, "read(%d, %p, %u) failed with %d: %s\n",
148 notify_fd, notifyevt, to_read, errno, strerror(errno));
149 }
150 if (static_cast<size_t>(ret) != to_read) {
151 LOG(ERROR, "read(%d, %p, %u) returned %zd instead of %u\n",
152 notify_fd, notifyevt, to_read, ret, to_read);
153 return;
154 }
155
Brian Silverman5cc661b2013-02-27 15:23:36 -0800156 // Keep looping through until we get to the end because inotify does return
157 // multiple events at once.
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800158 while (reinterpret_cast<char *>(notifyevt) < end) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800159 if (watchers.count(notifyevt->wd) != 1) {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800160 LOG(WARNING, "couldn't find whose watch ID %d is\n", notifyevt->wd);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800161 } else {
162 watchers[notifyevt->wd]->FileNotified((notifyevt->len > 0) ?
163 notifyevt->name : NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800164 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800165
166 notifyevt = reinterpret_cast<inotify_event *>(
167 reinterpret_cast<char *>(notifyevt) +
168 sizeof(*notifyevt) + notifyevt->len);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800169 }
170 }
171
Brian Silverman5cc661b2013-02-27 15:23:36 -0800172 // INotifyReadable calls this method whenever the watch for our file triggers.
173 void FileNotified(const char *filename) {
174 assert(watch_ != -1);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800175 LOG(DEBUG, "got a notification for %s\n", filename_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800176
177 if (!check_filename_.empty()) {
178 if (filename == NULL) {
179 return;
180 }
181 if (std::string(filename) != check_filename_) {
182 return;
183 }
184 }
185
186 callback_((value_ == NULL) ? this : value_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800187 }
188
Brian Silverman5cc661b2013-02-27 15:23:36 -0800189 // To make sure that Init gets called exactly once.
190 static ::aos::Once<void> init_once;
191
Brian Silvermand169fcd2013-02-27 13:18:47 -0800192 const std::string filename_;
193 const std::function<void(void *)> callback_;
194 void *const value_;
195 std::string check_filename_;
196
197 // The watch descriptor or -1 if we don't have one any more.
198 int watch_;
199
Brian Silverman5cc661b2013-02-27 15:23:36 -0800200 // Map from watch IDs to instances.
201 // <https://patchwork.kernel.org/patch/73192/> says they won't get reused, but
202 // that shouldn't be counted on because we might have a
203 // modified/different version/whatever kernel.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800204 static std::map<int, FileWatch *> watchers;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800205 // The inotify(7) file descriptor.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800206 static int notify_fd;
207};
Brian Silverman5cc661b2013-02-27 15:23:36 -0800208::aos::Once<void> FileWatch::init_once(FileWatch::Init);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800209std::map<int, FileWatch *> FileWatch::watchers;
210int FileWatch::notify_fd;
211
Brian Silverman5cc661b2013-02-27 15:23:36 -0800212// Runs the given command and returns its first line of output (not including
213// the \n). LOG(FATAL)s if the command has an exit status other than 0 or does
214// not print out an entire line.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800215std::string RunCommand(std::string command) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800216 // popen(3) might fail and not set it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800217 errno = 0;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800218 FILE *pipe = popen(command.c_str(), "r");
219 if (pipe == NULL) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800220 LOG(FATAL, "popen(\"%s\", \"r\") failed with %d: %s\n",
221 command.c_str(), errno, strerror(errno));
222 }
223
Brian Silverman5cc661b2013-02-27 15:23:36 -0800224 // result_size is how many bytes result is currently allocated to.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800225 size_t result_size = 128, read = 0;
226 unique_c_ptr<char> result(static_cast<char *>(malloc(result_size)));
227 while (true) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800228 // If we filled up the buffer, then realloc(3) it bigger.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800229 if (read == result_size) {
230 result_size *= 2;
231 void *new_result = realloc(result.get(), result_size);
232 if (new_result == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800233 LOG(FATAL, "realloc(%p, %zd) failed because of %d: %s\n",
234 result.get(), result_size, errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800235 } else {
236 result.release();
237 result = unique_c_ptr<char>(static_cast<char *>(new_result));
238 }
239 }
240
Brian Silverman5cc661b2013-02-27 15:23:36 -0800241 size_t ret = fread(result.get() + read, 1, result_size - read, pipe);
242 // If the read didn't fill up the whole buffer, check to see if it was
243 // because of an error.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800244 if (ret < result_size - read) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800245 if (ferror(pipe)) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800246 LOG(FATAL, "couldn't finish reading output of \"%s\"\n",
247 command.c_str());
248 }
249 }
250 read += ret;
251 if (read > 0 && result.get()[read - 1] == '\n') {
252 break;
253 }
254
Brian Silverman5cc661b2013-02-27 15:23:36 -0800255 if (feof(pipe)) {
256 LOG(FATAL, "`%s` failed. didn't print a whole line\n", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800257 }
258 }
259
Brian Silverman5cc661b2013-02-27 15:23:36 -0800260 // Get rid of the first \n and anything after it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800261 *strchrnul(result.get(), '\n') = '\0';
262
Brian Silverman5cc661b2013-02-27 15:23:36 -0800263 int child_status = pclose(pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800264 if (child_status == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800265 LOG(FATAL, "pclose(%p) failed with %d: %s\n", pipe,
Brian Silvermand169fcd2013-02-27 13:18:47 -0800266 errno, strerror(errno));
267 }
268
269 if (child_status != 0) {
270 LOG(FATAL, "`%s` failed. return %d\n", command.c_str(), child_status);
271 }
272
273 return std::string(result.get());
274}
275
276// Will call callback(arg) after time.
277void Timeout(time::Time time, void (*callback)(int, short, void *), void *arg) {
278 EventUniquePtr timeout(evtimer_new(libevent_base.get(), callback, arg));
279 struct timeval time_timeval = time.ToTimeval();
280 evtimer_add(timeout.release(), &time_timeval);
281}
282
283// Represents a child process. It will take care of restarting itself etc.
284class Child {
285 public:
Brian Silverman5cc661b2013-02-27 15:23:36 -0800286 // command is the (space-separated) command to run and its arguments.
287 Child(const std::string &command) : pid_(-1),
Brian Silvermand169fcd2013-02-27 13:18:47 -0800288 restart_timeout_(
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800289 evtimer_new(libevent_base.get(), StaticDoRestart, this)),
290 stat_at_start_valid_(false) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800291 const char *start, *end;
292 start = command.c_str();
293 while (true) {
294 end = strchrnul(start, ' ');
295 args_.push_back(std::string(start, end - start));
296 start = end + 1;
297 if (*end == '\0') {
298 break;
299 }
300 }
301
Brian Silverman5cc661b2013-02-27 15:23:36 -0800302 original_binary_ = RunCommand("which " + args_[0]);
303 binary_ = original_binary_ + ".stm";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800304
305 watcher_ = unique_ptr<FileWatch>(
Brian Silverman5cc661b2013-02-27 15:23:36 -0800306 new FileWatch(original_binary_, StaticFileModified, this));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800307
308 Start();
309 }
310
311 pid_t pid() { return pid_; }
312
313 // This gets called whenever the actual process dies and should (probably) be
314 // restarted.
315 void ProcessDied() {
316 pid_ = -1;
317 restarts_.push(time::Time::Now());
318 if (restarts_.size() > kMaxRestartsNumber) {
319 time::Time oldest = restarts_.front();
320 restarts_.pop();
321 if ((time::Time::Now() - oldest) > kMaxRestartsTime) {
322 LOG(WARNING, "process %s getting restarted too often\n", name());
323 Timeout(kResumeWait, StaticStart, this);
324 return;
325 }
326 }
327 Start();
328 }
329
330 // Returns a name for logging purposes.
331 const char *name() {
332 return args_[0].c_str();
333 }
334
335 private:
336 struct CheckDiedStatus {
337 Child *self;
338 pid_t old_pid;
339 };
340
341 // How long to wait for a child to die nicely.
342 static const time::Time kProcessDieTime;
343
344 // How long to wait after the file is modified to restart it.
345 // This is important because some programs like modifying the binaries by
346 // writing them in little bits, which results in attempting to start partial
347 // binaries without this.
348 static const time::Time kRestartWaitTime;
349
Brian Silverman5cc661b2013-02-27 15:23:36 -0800350 // Only kMaxRestartsNumber restarts will be allowed in kMaxRestartsTime.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800351 static const time::Time kMaxRestartsTime;
352 static const size_t kMaxRestartsNumber = 5;
353 // How long to wait if it gets restarted too many times.
354 static const time::Time kResumeWait;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800355
Brian Silvermand169fcd2013-02-27 13:18:47 -0800356 // A history of the times that this process has been restarted.
357 std::queue<time::Time, std::list<time::Time>> restarts_;
358
Brian Silverman5cc661b2013-02-27 15:23:36 -0800359 // The currently running child's PID or NULL.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800360 pid_t pid_;
361
Brian Silverman5cc661b2013-02-27 15:23:36 -0800362 // All of the arguments (including the name of the binary).
Brian Silvermand169fcd2013-02-27 13:18:47 -0800363 std::deque<std::string> args_;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800364
365 // The name of the real binary that we were told to run.
366 std::string original_binary_;
367 // The name of the file that we're actually running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800368 std::string binary_;
369
Brian Silverman5cc661b2013-02-27 15:23:36 -0800370 // Watches original_binary_.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800371 unique_ptr<FileWatch> watcher_;
372
373 // An event that restarts after kRestartWaitTime.
374 EventUniquePtr restart_timeout_;
375
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800376 // Captured from the original file when we most recently started a new child
377 // process. Used to see if it actually changes or not.
378 struct stat stat_at_start_;
379 bool stat_at_start_valid_;
380
Brian Silvermand169fcd2013-02-27 13:18:47 -0800381 static void StaticFileModified(void *self) {
382 static_cast<Child *>(self)->FileModified();
383 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800384
Brian Silvermand169fcd2013-02-27 13:18:47 -0800385 void FileModified() {
386 struct timeval restart_time_timeval = kRestartWaitTime.ToTimeval();
387 // This will reset the timeout again if it hasn't run yet.
388 evtimer_add(restart_timeout_.get(), &restart_time_timeval);
389 }
390
391 static void StaticDoRestart(int, short, void *self) {
392 static_cast<Child *>(self)->DoRestart();
393 }
394
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800395 // Called after somebody else has finished modifying the file.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800396 void DoRestart() {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800397 if (stat_at_start_valid_) {
398 struct stat current_stat;
399 if (stat(original_binary_.c_str(), &current_stat) == -1) {
400 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
401 original_binary_.c_str(), &current_stat, errno, strerror(errno));
402 }
403 if (current_stat.st_mtime == stat_at_start_.st_mtime) {
404 LOG(DEBUG, "ignoring trigger for %s because mtime didn't change\n",
405 name());
406 return;
407 }
408 }
409
Brian Silvermand169fcd2013-02-27 13:18:47 -0800410 if (pid_ != -1) {
411 LOG(DEBUG, "sending SIGTERM to child %d to restart it\n", pid_);
412 if (kill(pid_, SIGTERM) == -1) {
413 LOG(WARNING, "kill(%d, SIGTERM) failed with %d: %s\n",
414 pid_, errno, strerror(errno));
415 }
416 CheckDiedStatus *status = new CheckDiedStatus();
417 status->self = this;
418 status->old_pid = pid_;
419 Timeout(kProcessDieTime, StaticCheckDied, status);
420 }
421 }
422
423 static void StaticCheckDied(int, short, void *status_in) {
424 CheckDiedStatus *status = static_cast<CheckDiedStatus *>(status_in);
425 status->self->CheckDied(status->old_pid);
426 delete status;
427 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800428
429 // Checks to see if the child using the PID old_pid is still running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800430 void CheckDied(pid_t old_pid) {
431 if (pid_ == old_pid) {
432 LOG(WARNING, "child %d refused to die\n", old_pid);
433 if (kill(old_pid, SIGKILL) == -1) {
434 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
435 old_pid, errno, strerror(errno));
436 }
437 }
438 }
439
440 static void StaticStart(int, short, void *self) {
441 static_cast<Child *>(self)->Start();
442 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800443
444 // Actually starts the child.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800445 void Start() {
446 if (pid_ != -1) {
447 LOG(WARNING, "calling Start() but already have child %d running\n",
448 pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800449 if (kill(pid_, SIGKILL) == -1) {
450 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
451 pid_, errno, strerror(errno));
452 return;
453 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800454 pid_ = -1;
455 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800456
457 // Remove the name that we run from (ie from a previous execution) and then
458 // hard link the real filename to it.
459 if (unlink(binary_.c_str()) != 0 && errno != ENOENT) {
460 LOG(FATAL, "removing %s failed because of %d: %s\n",
461 binary_.c_str(), errno, strerror(errno));
462 }
463 if (link(original_binary_.c_str(), binary_.c_str()) != 0) {
464 LOG(FATAL, "link('%s', '%s') failed because of %d: %s\n",
465 original_binary_.c_str(), binary_.c_str(), errno, strerror(errno));
466 }
467
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800468 if (stat(original_binary_.c_str(), &stat_at_start_) == -1) {
469 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
470 original_binary_.c_str(), &stat_at_start_, errno, strerror(errno));
471 }
472 stat_at_start_valid_ = true;
473
Brian Silvermand169fcd2013-02-27 13:18:47 -0800474 if ((pid_ = fork()) == 0) {
475 ssize_t args_size = args_.size();
476 const char **argv = new const char *[args_size + 1];
477 for (int i = 0; i < args_size; ++i) {
478 argv[i] = args_[i].c_str();
479 }
480 argv[args_size] = NULL;
481 // The const_cast is safe because no code that might care if it gets
482 // modified can run afterwards.
483 execv(binary_.c_str(), const_cast<char **>(argv));
484 LOG(FATAL, "execv(%s, %p) failed with %d: %s\n",
485 binary_.c_str(), argv, errno, strerror(errno));
486 _exit(EXIT_FAILURE);
487 }
488 if (pid_ == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800489 LOG(FATAL, "forking to run \"%s\" failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800490 binary_.c_str(), errno, strerror(errno));
491 }
492 }
493};
494const time::Time Child::kProcessDieTime = time::Time::InSeconds(0.5);
495const time::Time Child::kMaxRestartsTime = time::Time::InSeconds(2);
496const time::Time Child::kResumeWait = time::Time::InSeconds(1.5);
497const time::Time Child::kRestartWaitTime = time::Time::InSeconds(1.5);
498
499// This is where all of the Child instances except core live.
500std::vector<unique_ptr<Child>> children;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800501// A global place to hold on to which child is core.
502unique_ptr<Child> core;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800503
Brian Silverman5cc661b2013-02-27 15:23:36 -0800504// Kills off the entire process group (including ourself).
505void KillChildren(bool try_nice) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800506 if (try_nice) {
507 static const int kNiceStopSignal = SIGTERM;
508 static const time::Time kNiceWaitTime = time::Time::InSeconds(1);
509
510 // Make sure that we don't just nicely stop ourself...
511 sigset_t mask;
512 sigemptyset(&mask);
513 sigaddset(&mask, kNiceStopSignal);
514 sigprocmask(SIG_BLOCK, &mask, NULL);
515
Brian Silverman5cc661b2013-02-27 15:23:36 -0800516 kill(-getpid(), kNiceStopSignal);
517
518 fflush(NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800519 time::SleepFor(kNiceWaitTime);
520 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800521
Brian Silvermand169fcd2013-02-27 13:18:47 -0800522 // Send SIGKILL to our whole process group, which will forcibly terminate any
523 // of them that are still running (us for sure, maybe more too).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800524 kill(-getpid(), SIGKILL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800525}
526
Brian Silverman5cc661b2013-02-27 15:23:36 -0800527void ExitHandler() {
528 KillChildren(true);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800529}
Brian Silverman5cc661b2013-02-27 15:23:36 -0800530
531void KillChildrenSignalHandler(int signum) {
532 // If we get SIGSEGV or some other random signal who knows what's happening
533 // and we should just kill everybody immediately.
534 // This is a list of all of the signals that mean some form of "nicely stop".
535 KillChildren(signum == SIGHUP || signum == SIGINT || signum == SIGQUIT ||
Brian Silvermand169fcd2013-02-27 13:18:47 -0800536 signum == SIGABRT || signum == SIGPIPE || signum == SIGTERM ||
537 signum == SIGXCPU);
538}
539
Brian Silverman5cc661b2013-02-27 15:23:36 -0800540// Returns the currently running child with PID pid or an empty unique_ptr.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800541const unique_ptr<Child> &FindChild(pid_t pid) {
542 for (auto it = children.begin(); it != children.end(); ++it) {
543 if (pid == (*it)->pid()) {
544 return *it;
545 }
546 }
547
548 if (pid == core->pid()) {
549 return core;
550 }
551
Brian Silverman5cc661b2013-02-27 15:23:36 -0800552 static const unique_ptr<Child> kNothing;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800553 return kNothing;
554}
555
Brian Silverman5cc661b2013-02-27 15:23:36 -0800556// Gets set up as a libevent handler for SIGCHLD.
557// Handles calling Child::ProcessDied() on the appropriate one.
558void SigCHLDReceived(int /*fd*/, short /*events*/, void *) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800559 // In a while loop in case we miss any SIGCHLDs.
560 while (true) {
561 siginfo_t infop;
562 infop.si_pid = 0;
563 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) != 0) {
564 LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno));
Brian Silverman5cc661b2013-02-27 15:23:36 -0800565 continue;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800566 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800567 // If there are no more child process deaths to process.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800568 if (infop.si_pid == 0) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800569 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800570 }
571
572 pid_t pid = infop.si_pid;
573 int status = infop.si_status;
574 const unique_ptr<Child> &child = FindChild(pid);
575 if (child) {
576 switch (infop.si_code) {
577 case CLD_EXITED:
578 LOG(WARNING, "child %d (%s) exited with status %d\n",
579 pid, child->name(), status);
580 break;
581 case CLD_DUMPED:
582 LOG(INFO, "child %d actually dumped core. "
583 "falling through to killed by signal case\n", pid);
584 case CLD_KILLED:
585 // If somebody (possibly us) sent it SIGTERM that means that they just
586 // want it to stop, so it stopping isn't a WARNING.
587 LOG((status == SIGTERM) ? DEBUG : WARNING,
588 "child %d (%s) was killed by signal %d (%s)\n",
589 pid, child->name(), status,
590 strsignal(status));
591 break;
592 case CLD_STOPPED:
593 LOG(WARNING, "child %d (%s) was stopped by signal %d "
594 "(giving it a SIGCONT(%d))\n",
595 pid, child->name(), status, SIGCONT);
596 kill(pid, SIGCONT);
597 continue;
598 default:
599 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
600 pid, child->name());
601 kill(pid, SIGKILL);
602 continue;
603 }
604 } else {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800605 LOG(WARNING, "couldn't find a Child for pid %d\n", pid);
606 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800607 }
608
Brian Silverman5cc661b2013-02-27 15:23:36 -0800609 if (child == core) {
610 LOG(FATAL, "core died\n");
611 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800612 child->ProcessDied();
613 }
614}
615
Brian Silverman5cc661b2013-02-27 15:23:36 -0800616// This is used for communicating the name of the file to read processes to
617// start from main to Run.
618const char *child_list_file;
619
Brian Silvermand169fcd2013-02-27 13:18:47 -0800620// This is the callback for when core creates the file indicating that it has
621// started.
622void Run(void *watch) {
623 // Make it so it doesn't keep on seeing random changes in /tmp.
624 static_cast<FileWatch *>(watch)->RemoveWatch();
625
626 // It's safe now because core is up.
627 aos::InitNRT();
628
629 std::ifstream list_file(child_list_file);
630
Brian Silvermand169fcd2013-02-27 13:18:47 -0800631 while (true) {
632 std::string child_name;
633 getline(list_file, child_name);
634 if ((list_file.rdstate() & std::ios_base::eofbit) != 0) {
635 break;
636 }
637 if (list_file.rdstate() != 0) {
638 LOG(FATAL, "reading input file %s failed\n", child_list_file);
639 }
640 children.push_back(unique_ptr<Child>(new Child(child_name)));
641 }
642
643 EventUniquePtr sigchld(event_new(libevent_base.get(), SIGCHLD,
Brian Silverman5cc661b2013-02-27 15:23:36 -0800644 EV_SIGNAL | EV_PERSIST,
645 SigCHLDReceived, NULL));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800646 event_add(sigchld.release(), NULL);
647}
648
649void Main() {
650 logging::Init();
651 // TODO(brians) tell logging that using the root logger from here until we
652 // bring up shm is ok
653
Brian Silverman5cc661b2013-02-27 15:23:36 -0800654 if (setpgid(0 /*self*/, 0 /*make PGID the same as PID*/) != 0) {
655 LOG(FATAL, "setpgid(0, 0) failed with %d: %s\n", errno, strerror(errno));
656 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800657
658 // Make sure that we kill all children when we exit.
Brian Silverman5cc661b2013-02-27 15:23:36 -0800659 atexit(ExitHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800660 // Do it on some signals too (ones that we otherwise tend to receive and then
661 // leave all of our children going).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800662 signal(SIGHUP, KillChildrenSignalHandler);
663 signal(SIGINT, KillChildrenSignalHandler);
664 signal(SIGQUIT, KillChildrenSignalHandler);
665 signal(SIGILL, KillChildrenSignalHandler);
666 signal(SIGABRT, KillChildrenSignalHandler);
667 signal(SIGFPE, KillChildrenSignalHandler);
668 signal(SIGSEGV, KillChildrenSignalHandler);
669 signal(SIGPIPE, KillChildrenSignalHandler);
670 signal(SIGTERM, KillChildrenSignalHandler);
671 signal(SIGBUS, KillChildrenSignalHandler);
672 signal(SIGXCPU, KillChildrenSignalHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800673
674 libevent_base = EventBaseUniquePtr(event_base_new());
675
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800676 std::string core_touch_file = "/tmp/starter.";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800677 core_touch_file += std::to_string(static_cast<intmax_t>(getpid()));
678 core_touch_file += ".core_touch_file";
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800679 if (system(("touch '" + core_touch_file + "'").c_str()) != 0) {
680 LOG(FATAL, "running `touch '%s'` failed\n", core_touch_file.c_str());
681 }
682 FileWatch core_touch_file_watch(core_touch_file, Run, NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800683 core = unique_ptr<Child>(
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800684 new Child("core " + core_touch_file));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800685
686 FILE *pid_file = fopen("/tmp/starter.pid", "w");
687 if (pid_file == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800688 LOG(FATAL, "fopen(\"/tmp/starter.pid\", \"w\") failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800689 errno, strerror(errno));
690 } else {
691 if (fprintf(pid_file, "%d", core->pid()) == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800692 LOG(WARNING, "fprintf(%p, \"%%d\", %d) failed with %d: %s\n",
693 pid_file, core->pid(), errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800694 }
695 fclose(pid_file);
696 }
697
698 LOG(INFO, "waiting for %s to appear\n", core_touch_file.c_str());
699
700 event_base_dispatch(libevent_base.get());
701 LOG(FATAL, "event_base_dispatch(%p) returned\n", libevent_base.get());
702}
703
704} // namespace starter
705} // namespace aos
706
707int main(int argc, char *argv[]) {
708 if (argc < 2) {
709 fputs("starter: error: need an argument specifying what file to use\n",
710 stderr);
711 exit(EXIT_FAILURE);
712 } else if(argc > 2) {
713 fputs("starter: warning: too many arguments\n", stderr);
714 }
715 aos::starter::child_list_file = argv[1];
716
717 aos::starter::Main();
718}