blob: dcff517266dd41768b28c9aa79bf935bcea5fbaf [file] [log] [blame]
Brian Silvermand169fcd2013-02-27 13:18:47 -08001#include <stdio.h>
2#include <stdlib.h>
3#include <sys/types.h>
4#include <fcntl.h>
5#include <sys/inotify.h>
6#include <sys/stat.h>
7#include <sys/ioctl.h>
8#include <assert.h>
9#include <signal.h>
10#include <stdint.h>
11#include <errno.h>
12#include <string.h>
13#include <sys/wait.h>
14
15#include <map>
16#include <functional>
17#include <deque>
18#include <fstream>
19#include <queue>
20#include <list>
21#include <string>
22#include <vector>
23#include <memory>
24
25#include <event2/event.h>
26
27#include "aos/common/logging/logging.h"
28#include "aos/common/logging/logging_impl.h"
29#include "aos/atom_code/init.h"
30#include "aos/common/unique_malloc_ptr.h"
31#include "aos/common/time.h"
Brian Silverman5cc661b2013-02-27 15:23:36 -080032#include "aos/common/once.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080033
// This is the main piece of code that starts all of the rest of the code and
// restarts it when the binaries are modified.
//
// Throughout, the code is not terribly concerned with thread safety because
// there is only 1 thread. It does some setup and then lets inotify run things
// when appropriate.
//
// NOTE: This program should never exit nicely. It catches all nice attempts to
// exit, forwards them to all of the children that it has started, waits for
// them to exit nicely, and then SIGKILLs anybody left (which will always
// include itself).
45
46using ::std::unique_ptr;
47
48namespace aos {
49namespace starter {
50
Brian Silverman0eec9532013-02-27 20:24:16 -080051// TODO(brians): split out the c++ libevent wrapper stuff into its own file(s)
Brian Silvermand169fcd2013-02-27 13:18:47 -080052class EventBaseDeleter {
53 public:
54 void operator()(event_base *base) {
Brian Silverman8070a222013-02-28 15:01:36 -080055 if (base == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080056 event_base_free(base);
57 }
58};
59typedef unique_ptr<event_base, EventBaseDeleter> EventBaseUniquePtr;
Brian Silverman5cc661b2013-02-27 15:23:36 -080060EventBaseUniquePtr libevent_base;
Brian Silvermand169fcd2013-02-27 13:18:47 -080061
62class EventDeleter {
63 public:
64 void operator()(event *evt) {
Brian Silverman8070a222013-02-28 15:01:36 -080065 if (evt == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080066 if (event_del(evt) != 0) {
67 LOG(WARNING, "event_del(%p) failed\n", evt);
68 }
69 }
70};
71typedef unique_ptr<event, EventDeleter> EventUniquePtr;
72
Brian Silverman5cc661b2013-02-27 15:23:36 -080073// Watches a file path for modifications. Once created, keeps watching until
74// destroyed or RemoveWatch() is called.
Brian Silverman0eec9532013-02-27 20:24:16 -080075// TODO(brians): split this out into its own file + tests
Brian Silvermand169fcd2013-02-27 13:18:47 -080076class FileWatch {
77 public:
78 // Will call callback(value) when filename is modified.
79 // If value is NULL, then a pointer to this object will be passed instead.
Brian Silverman5cc661b2013-02-27 15:23:36 -080080 //
81 // Watching for file creations is slightly different. To do that, pass true
Brian Silverman8070a222013-02-28 15:01:36 -080082 // as create, the directory where the file will be created for filename, and
Brian Silverman5cc661b2013-02-27 15:23:36 -080083 // the name of the file (without directory name) for check_filename.
Brian Silvermand169fcd2013-02-27 13:18:47 -080084 FileWatch(std::string filename,
Brian Silverman8070a222013-02-28 15:01:36 -080085 std::function<void(void *)> callback,
86 void *value,
87 bool create = false,
88 std::string check_filename = "")
89 : filename_(filename),
90 callback_(callback),
91 value_(value),
Brian Silvermand169fcd2013-02-27 13:18:47 -080092 check_filename_(check_filename) {
Brian Silverman5cc661b2013-02-27 15:23:36 -080093 init_once.Get();
94
Brian Silvermand169fcd2013-02-27 13:18:47 -080095 watch_ = inotify_add_watch(notify_fd, filename.c_str(),
96 create ? IN_CREATE : (IN_ATTRIB | IN_MODIFY));
97 if (watch_ == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -080098 LOG(FATAL, "inotify_add_watch(%d, %s,"
99 " %s ? IN_CREATE : (IN_ATTRIB | IN_MODIFY)) failed with %d: %s\n",
100 notify_fd, filename.c_str(), create ? "true" : "false",
101 errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800102 }
103 watchers[watch_] = this;
104 }
105 // Cleans up everything.
106 ~FileWatch() {
107 if (watch_ != -1) {
108 RemoveWatch();
109 }
110 }
111
112 // After calling this method, this object won't really be doing much of
Brian Silverman5cc661b2013-02-27 15:23:36 -0800113 // anything besides possibly running its callback or something.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800114 void RemoveWatch() {
115 assert(watch_ != -1);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800116
Brian Silvermand169fcd2013-02-27 13:18:47 -0800117 if (inotify_rm_watch(notify_fd, watch_) == -1) {
118 LOG(WARNING, "inotify_rm_watch(%d, %d) failed with %d: %s\n",
119 notify_fd, watch_, errno, strerror(errno));
120 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800121
Brian Silvermand169fcd2013-02-27 13:18:47 -0800122 if (watchers[watch_] != this) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800123 LOG(WARNING, "watcher for %s (%p) didn't find itself in the map\n",
124 filename_.c_str(), this);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800125 } else {
126 watchers.erase(watch_);
127 }
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800128 LOG(DEBUG, "removed watch ID %d\n", watch_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800129 watch_ = -1;
130 }
131
Brian Silverman5cc661b2013-02-27 15:23:36 -0800132 private:
133 // Performs the static initialization. Called by init_once from the
134 // constructor.
135 static void *Init() {
136 notify_fd = inotify_init1(IN_CLOEXEC);
137 EventUniquePtr notify_event(event_new(libevent_base.get(), notify_fd,
138 EV_READ | EV_PERSIST,
139 FileWatch::INotifyReadable, NULL));
140 event_add(notify_event.release(), NULL);
141 return NULL;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800142 }
143
144 // This gets set up as the callback for EV_READ on the inotify file
Brian Silverman5cc661b2013-02-27 15:23:36 -0800145 // descriptor. It calls FileNotified on the appropriate instance.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800146 static void INotifyReadable(int /*fd*/, short /*events*/, void *) {
147 unsigned int to_read;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800148 // Use FIONREAD to figure out how many bytes there are to read.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800149 if (ioctl(notify_fd, FIONREAD, &to_read) < 0) {
150 LOG(FATAL, "FIONREAD(%d, %p) failed with %d: %s\n",
151 notify_fd, &to_read, errno, strerror(errno));
152 }
153 inotify_event *notifyevt = static_cast<inotify_event *>(malloc(to_read));
154 const char *end = reinterpret_cast<char *>(notifyevt) + to_read;
155 aos::unique_c_ptr<inotify_event> freer(notifyevt);
156
157 ssize_t ret = read(notify_fd, notifyevt, to_read);
158 if (ret < 0) {
159 LOG(FATAL, "read(%d, %p, %u) failed with %d: %s\n",
160 notify_fd, notifyevt, to_read, errno, strerror(errno));
161 }
162 if (static_cast<size_t>(ret) != to_read) {
163 LOG(ERROR, "read(%d, %p, %u) returned %zd instead of %u\n",
164 notify_fd, notifyevt, to_read, ret, to_read);
165 return;
166 }
167
Brian Silverman5cc661b2013-02-27 15:23:36 -0800168 // Keep looping through until we get to the end because inotify does return
169 // multiple events at once.
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800170 while (reinterpret_cast<char *>(notifyevt) < end) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800171 if (watchers.count(notifyevt->wd) != 1) {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800172 LOG(WARNING, "couldn't find whose watch ID %d is\n", notifyevt->wd);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800173 } else {
174 watchers[notifyevt->wd]->FileNotified((notifyevt->len > 0) ?
175 notifyevt->name : NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800176 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800177
178 notifyevt = reinterpret_cast<inotify_event *>(
179 reinterpret_cast<char *>(notifyevt) +
180 sizeof(*notifyevt) + notifyevt->len);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800181 }
182 }
183
Brian Silverman5cc661b2013-02-27 15:23:36 -0800184 // INotifyReadable calls this method whenever the watch for our file triggers.
185 void FileNotified(const char *filename) {
186 assert(watch_ != -1);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800187 LOG(DEBUG, "got a notification for %s\n", filename_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800188
189 if (!check_filename_.empty()) {
190 if (filename == NULL) {
191 return;
192 }
193 if (std::string(filename) != check_filename_) {
194 return;
195 }
196 }
197
198 callback_((value_ == NULL) ? this : value_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800199 }
200
Brian Silverman5cc661b2013-02-27 15:23:36 -0800201 // To make sure that Init gets called exactly once.
202 static ::aos::Once<void> init_once;
203
Brian Silvermand169fcd2013-02-27 13:18:47 -0800204 const std::string filename_;
205 const std::function<void(void *)> callback_;
206 void *const value_;
207 std::string check_filename_;
208
209 // The watch descriptor or -1 if we don't have one any more.
210 int watch_;
211
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800212 // Map from watch IDs to instances of this class.
213 // <https://patchwork.kernel.org/patch/73192/> ("inotify: do not reuse watch
214 // descriptors") says they won't get reused, but that shouldn't be counted on
215 // because we might have a modified/different version/whatever kernel.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800216 static std::map<int, FileWatch *> watchers;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800217 // The inotify(7) file descriptor.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800218 static int notify_fd;
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800219
220 DISALLOW_COPY_AND_ASSIGN(FileWatch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800221};
Brian Silverman5cc661b2013-02-27 15:23:36 -0800222::aos::Once<void> FileWatch::init_once(FileWatch::Init);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800223std::map<int, FileWatch *> FileWatch::watchers;
224int FileWatch::notify_fd;
225
Brian Silverman5cc661b2013-02-27 15:23:36 -0800226// Runs the given command and returns its first line of output (not including
227// the \n). LOG(FATAL)s if the command has an exit status other than 0 or does
228// not print out an entire line.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800229std::string RunCommand(std::string command) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800230 // popen(3) might fail and not set it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800231 errno = 0;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800232 FILE *pipe = popen(command.c_str(), "r");
233 if (pipe == NULL) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800234 LOG(FATAL, "popen(\"%s\", \"r\") failed with %d: %s\n",
235 command.c_str(), errno, strerror(errno));
236 }
237
Brian Silverman5cc661b2013-02-27 15:23:36 -0800238 // result_size is how many bytes result is currently allocated to.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800239 size_t result_size = 128, read = 0;
240 unique_c_ptr<char> result(static_cast<char *>(malloc(result_size)));
241 while (true) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800242 // If we filled up the buffer, then realloc(3) it bigger.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800243 if (read == result_size) {
244 result_size *= 2;
245 void *new_result = realloc(result.get(), result_size);
246 if (new_result == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800247 LOG(FATAL, "realloc(%p, %zd) failed because of %d: %s\n",
248 result.get(), result_size, errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800249 } else {
250 result.release();
251 result = unique_c_ptr<char>(static_cast<char *>(new_result));
252 }
253 }
254
Brian Silverman5cc661b2013-02-27 15:23:36 -0800255 size_t ret = fread(result.get() + read, 1, result_size - read, pipe);
256 // If the read didn't fill up the whole buffer, check to see if it was
257 // because of an error.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800258 if (ret < result_size - read) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800259 if (ferror(pipe)) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800260 LOG(FATAL, "couldn't finish reading output of \"%s\"\n",
261 command.c_str());
262 }
263 }
264 read += ret;
265 if (read > 0 && result.get()[read - 1] == '\n') {
266 break;
267 }
268
Brian Silverman5cc661b2013-02-27 15:23:36 -0800269 if (feof(pipe)) {
270 LOG(FATAL, "`%s` failed. didn't print a whole line\n", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800271 }
272 }
273
Brian Silverman5cc661b2013-02-27 15:23:36 -0800274 // Get rid of the first \n and anything after it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800275 *strchrnul(result.get(), '\n') = '\0';
276
Brian Silverman5cc661b2013-02-27 15:23:36 -0800277 int child_status = pclose(pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800278 if (child_status == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800279 LOG(FATAL, "pclose(%p) failed with %d: %s\n", pipe,
Brian Silvermand169fcd2013-02-27 13:18:47 -0800280 errno, strerror(errno));
281 }
282
283 if (child_status != 0) {
284 LOG(FATAL, "`%s` failed. return %d\n", command.c_str(), child_status);
285 }
286
287 return std::string(result.get());
288}
289
290// Will call callback(arg) after time.
291void Timeout(time::Time time, void (*callback)(int, short, void *), void *arg) {
292 EventUniquePtr timeout(evtimer_new(libevent_base.get(), callback, arg));
293 struct timeval time_timeval = time.ToTimeval();
294 evtimer_add(timeout.release(), &time_timeval);
295}
296
297// Represents a child process. It will take care of restarting itself etc.
298class Child {
299 public:
Brian Silverman5cc661b2013-02-27 15:23:36 -0800300 // command is the (space-separated) command to run and its arguments.
301 Child(const std::string &command) : pid_(-1),
Brian Silvermand169fcd2013-02-27 13:18:47 -0800302 restart_timeout_(
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800303 evtimer_new(libevent_base.get(), StaticDoRestart, this)),
304 stat_at_start_valid_(false) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800305 const char *start, *end;
306 start = command.c_str();
307 while (true) {
308 end = strchrnul(start, ' ');
309 args_.push_back(std::string(start, end - start));
310 start = end + 1;
311 if (*end == '\0') {
312 break;
313 }
314 }
315
Brian Silverman5cc661b2013-02-27 15:23:36 -0800316 original_binary_ = RunCommand("which " + args_[0]);
317 binary_ = original_binary_ + ".stm";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800318
319 watcher_ = unique_ptr<FileWatch>(
Brian Silverman5cc661b2013-02-27 15:23:36 -0800320 new FileWatch(original_binary_, StaticFileModified, this));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800321
322 Start();
323 }
324
325 pid_t pid() { return pid_; }
326
327 // This gets called whenever the actual process dies and should (probably) be
328 // restarted.
329 void ProcessDied() {
330 pid_ = -1;
331 restarts_.push(time::Time::Now());
332 if (restarts_.size() > kMaxRestartsNumber) {
333 time::Time oldest = restarts_.front();
334 restarts_.pop();
335 if ((time::Time::Now() - oldest) > kMaxRestartsTime) {
336 LOG(WARNING, "process %s getting restarted too often\n", name());
337 Timeout(kResumeWait, StaticStart, this);
338 return;
339 }
340 }
341 Start();
342 }
343
344 // Returns a name for logging purposes.
345 const char *name() {
346 return args_[0].c_str();
347 }
348
349 private:
350 struct CheckDiedStatus {
351 Child *self;
352 pid_t old_pid;
353 };
354
355 // How long to wait for a child to die nicely.
356 static const time::Time kProcessDieTime;
357
358 // How long to wait after the file is modified to restart it.
359 // This is important because some programs like modifying the binaries by
360 // writing them in little bits, which results in attempting to start partial
361 // binaries without this.
362 static const time::Time kRestartWaitTime;
363
Brian Silverman5cc661b2013-02-27 15:23:36 -0800364 // Only kMaxRestartsNumber restarts will be allowed in kMaxRestartsTime.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800365 static const time::Time kMaxRestartsTime;
Brian Silverman8070a222013-02-28 15:01:36 -0800366 static const size_t kMaxRestartsNumber = 4;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800367 // How long to wait if it gets restarted too many times.
368 static const time::Time kResumeWait;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800369
Brian Silvermand169fcd2013-02-27 13:18:47 -0800370 static void StaticFileModified(void *self) {
371 static_cast<Child *>(self)->FileModified();
372 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800373
Brian Silvermand169fcd2013-02-27 13:18:47 -0800374 void FileModified() {
375 struct timeval restart_time_timeval = kRestartWaitTime.ToTimeval();
376 // This will reset the timeout again if it hasn't run yet.
377 evtimer_add(restart_timeout_.get(), &restart_time_timeval);
378 }
379
380 static void StaticDoRestart(int, short, void *self) {
381 static_cast<Child *>(self)->DoRestart();
382 }
383
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800384 // Called after somebody else has finished modifying the file.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800385 void DoRestart() {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800386 if (stat_at_start_valid_) {
387 struct stat current_stat;
388 if (stat(original_binary_.c_str(), &current_stat) == -1) {
389 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
390 original_binary_.c_str(), &current_stat, errno, strerror(errno));
391 }
392 if (current_stat.st_mtime == stat_at_start_.st_mtime) {
393 LOG(DEBUG, "ignoring trigger for %s because mtime didn't change\n",
394 name());
395 return;
396 }
397 }
398
Brian Silvermand169fcd2013-02-27 13:18:47 -0800399 if (pid_ != -1) {
400 LOG(DEBUG, "sending SIGTERM to child %d to restart it\n", pid_);
401 if (kill(pid_, SIGTERM) == -1) {
402 LOG(WARNING, "kill(%d, SIGTERM) failed with %d: %s\n",
403 pid_, errno, strerror(errno));
404 }
405 CheckDiedStatus *status = new CheckDiedStatus();
406 status->self = this;
407 status->old_pid = pid_;
408 Timeout(kProcessDieTime, StaticCheckDied, status);
409 }
410 }
411
412 static void StaticCheckDied(int, short, void *status_in) {
413 CheckDiedStatus *status = static_cast<CheckDiedStatus *>(status_in);
414 status->self->CheckDied(status->old_pid);
415 delete status;
416 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800417
418 // Checks to see if the child using the PID old_pid is still running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800419 void CheckDied(pid_t old_pid) {
420 if (pid_ == old_pid) {
421 LOG(WARNING, "child %d refused to die\n", old_pid);
422 if (kill(old_pid, SIGKILL) == -1) {
423 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
424 old_pid, errno, strerror(errno));
425 }
426 }
427 }
428
429 static void StaticStart(int, short, void *self) {
430 static_cast<Child *>(self)->Start();
431 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800432
433 // Actually starts the child.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800434 void Start() {
435 if (pid_ != -1) {
436 LOG(WARNING, "calling Start() but already have child %d running\n",
437 pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800438 if (kill(pid_, SIGKILL) == -1) {
439 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
440 pid_, errno, strerror(errno));
441 return;
442 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800443 pid_ = -1;
444 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800445
446 // Remove the name that we run from (ie from a previous execution) and then
447 // hard link the real filename to it.
448 if (unlink(binary_.c_str()) != 0 && errno != ENOENT) {
449 LOG(FATAL, "removing %s failed because of %d: %s\n",
450 binary_.c_str(), errno, strerror(errno));
451 }
452 if (link(original_binary_.c_str(), binary_.c_str()) != 0) {
453 LOG(FATAL, "link('%s', '%s') failed because of %d: %s\n",
454 original_binary_.c_str(), binary_.c_str(), errno, strerror(errno));
455 }
456
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800457 if (stat(original_binary_.c_str(), &stat_at_start_) == -1) {
458 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
459 original_binary_.c_str(), &stat_at_start_, errno, strerror(errno));
460 }
461 stat_at_start_valid_ = true;
462
Brian Silvermand169fcd2013-02-27 13:18:47 -0800463 if ((pid_ = fork()) == 0) {
464 ssize_t args_size = args_.size();
465 const char **argv = new const char *[args_size + 1];
466 for (int i = 0; i < args_size; ++i) {
467 argv[i] = args_[i].c_str();
468 }
469 argv[args_size] = NULL;
470 // The const_cast is safe because no code that might care if it gets
471 // modified can run afterwards.
472 execv(binary_.c_str(), const_cast<char **>(argv));
473 LOG(FATAL, "execv(%s, %p) failed with %d: %s\n",
474 binary_.c_str(), argv, errno, strerror(errno));
475 _exit(EXIT_FAILURE);
476 }
477 if (pid_ == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800478 LOG(FATAL, "forking to run \"%s\" failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800479 binary_.c_str(), errno, strerror(errno));
480 }
481 }
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800482
483 // A history of the times that this process has been restarted.
484 std::queue<time::Time, std::list<time::Time>> restarts_;
485
486 // The currently running child's PID or NULL.
487 pid_t pid_;
488
489 // All of the arguments (including the name of the binary).
490 std::deque<std::string> args_;
491
492 // The name of the real binary that we were told to run.
493 std::string original_binary_;
494 // The name of the file that we're actually running.
495 std::string binary_;
496
497 // Watches original_binary_.
498 unique_ptr<FileWatch> watcher_;
499
500 // An event that restarts after kRestartWaitTime.
501 EventUniquePtr restart_timeout_;
502
503 // Captured from the original file when we most recently started a new child
504 // process. Used to see if it actually changes or not.
505 struct stat stat_at_start_;
506 bool stat_at_start_valid_;
507
508 DISALLOW_COPY_AND_ASSIGN(Child);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800509};
510const time::Time Child::kProcessDieTime = time::Time::InSeconds(0.5);
511const time::Time Child::kMaxRestartsTime = time::Time::InSeconds(2);
Brian Silverman8070a222013-02-28 15:01:36 -0800512const time::Time Child::kResumeWait = time::Time::InSeconds(2);
513const time::Time Child::kRestartWaitTime = time::Time::InSeconds(1);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800514
515// This is where all of the Child instances except core live.
516std::vector<unique_ptr<Child>> children;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800517// A global place to hold on to which child is core.
518unique_ptr<Child> core;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800519
Brian Silverman5cc661b2013-02-27 15:23:36 -0800520// Kills off the entire process group (including ourself).
521void KillChildren(bool try_nice) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800522 if (try_nice) {
523 static const int kNiceStopSignal = SIGTERM;
524 static const time::Time kNiceWaitTime = time::Time::InSeconds(1);
525
526 // Make sure that we don't just nicely stop ourself...
527 sigset_t mask;
528 sigemptyset(&mask);
529 sigaddset(&mask, kNiceStopSignal);
530 sigprocmask(SIG_BLOCK, &mask, NULL);
531
Brian Silverman5cc661b2013-02-27 15:23:36 -0800532 kill(-getpid(), kNiceStopSignal);
533
534 fflush(NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800535 time::SleepFor(kNiceWaitTime);
536 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800537
Brian Silvermand169fcd2013-02-27 13:18:47 -0800538 // Send SIGKILL to our whole process group, which will forcibly terminate any
539 // of them that are still running (us for sure, maybe more too).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800540 kill(-getpid(), SIGKILL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800541}
542
Brian Silverman5cc661b2013-02-27 15:23:36 -0800543void ExitHandler() {
544 KillChildren(true);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800545}
Brian Silverman5cc661b2013-02-27 15:23:36 -0800546
547void KillChildrenSignalHandler(int signum) {
548 // If we get SIGSEGV or some other random signal who knows what's happening
549 // and we should just kill everybody immediately.
550 // This is a list of all of the signals that mean some form of "nicely stop".
551 KillChildren(signum == SIGHUP || signum == SIGINT || signum == SIGQUIT ||
Brian Silverman0eec9532013-02-27 20:24:16 -0800552 signum == SIGABRT || signum == SIGPIPE || signum == SIGTERM ||
553 signum == SIGXCPU);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800554}
555
Brian Silverman5cc661b2013-02-27 15:23:36 -0800556// Returns the currently running child with PID pid or an empty unique_ptr.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800557const unique_ptr<Child> &FindChild(pid_t pid) {
558 for (auto it = children.begin(); it != children.end(); ++it) {
559 if (pid == (*it)->pid()) {
560 return *it;
561 }
562 }
563
564 if (pid == core->pid()) {
565 return core;
566 }
567
Brian Silverman5cc661b2013-02-27 15:23:36 -0800568 static const unique_ptr<Child> kNothing;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800569 return kNothing;
570}
571
Brian Silverman5cc661b2013-02-27 15:23:36 -0800572// Gets set up as a libevent handler for SIGCHLD.
573// Handles calling Child::ProcessDied() on the appropriate one.
574void SigCHLDReceived(int /*fd*/, short /*events*/, void *) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800575 // In a while loop in case we miss any SIGCHLDs.
576 while (true) {
577 siginfo_t infop;
578 infop.si_pid = 0;
579 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) != 0) {
580 LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno));
Brian Silverman5cc661b2013-02-27 15:23:36 -0800581 continue;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800582 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800583 // If there are no more child process deaths to process.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800584 if (infop.si_pid == 0) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800585 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800586 }
587
588 pid_t pid = infop.si_pid;
589 int status = infop.si_status;
590 const unique_ptr<Child> &child = FindChild(pid);
591 if (child) {
592 switch (infop.si_code) {
593 case CLD_EXITED:
594 LOG(WARNING, "child %d (%s) exited with status %d\n",
595 pid, child->name(), status);
596 break;
597 case CLD_DUMPED:
598 LOG(INFO, "child %d actually dumped core. "
599 "falling through to killed by signal case\n", pid);
600 case CLD_KILLED:
601 // If somebody (possibly us) sent it SIGTERM that means that they just
602 // want it to stop, so it stopping isn't a WARNING.
603 LOG((status == SIGTERM) ? DEBUG : WARNING,
604 "child %d (%s) was killed by signal %d (%s)\n",
605 pid, child->name(), status,
606 strsignal(status));
607 break;
608 case CLD_STOPPED:
609 LOG(WARNING, "child %d (%s) was stopped by signal %d "
610 "(giving it a SIGCONT(%d))\n",
611 pid, child->name(), status, SIGCONT);
612 kill(pid, SIGCONT);
613 continue;
614 default:
615 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
616 pid, child->name());
617 kill(pid, SIGKILL);
618 continue;
619 }
620 } else {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800621 LOG(WARNING, "couldn't find a Child for pid %d\n", pid);
622 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800623 }
624
Brian Silverman5cc661b2013-02-27 15:23:36 -0800625 if (child == core) {
626 LOG(FATAL, "core died\n");
627 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800628 child->ProcessDied();
629 }
630}
631
// The name of the file that lists the processes to start. main sets it and
// then Run reads it.
const char *child_list_file;
635
Brian Silverman8070a222013-02-28 15:01:36 -0800636void Run(void *watch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800637void Main() {
638 logging::Init();
Brian Silverman0eec9532013-02-27 20:24:16 -0800639 // TODO(brians): tell logging that using the root logger from here until we
Brian Silvermand169fcd2013-02-27 13:18:47 -0800640 // bring up shm is ok
641
Brian Silverman5cc661b2013-02-27 15:23:36 -0800642 if (setpgid(0 /*self*/, 0 /*make PGID the same as PID*/) != 0) {
643 LOG(FATAL, "setpgid(0, 0) failed with %d: %s\n", errno, strerror(errno));
644 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800645
646 // Make sure that we kill all children when we exit.
Brian Silverman5cc661b2013-02-27 15:23:36 -0800647 atexit(ExitHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800648 // Do it on some signals too (ones that we otherwise tend to receive and then
649 // leave all of our children going).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800650 signal(SIGHUP, KillChildrenSignalHandler);
651 signal(SIGINT, KillChildrenSignalHandler);
652 signal(SIGQUIT, KillChildrenSignalHandler);
653 signal(SIGILL, KillChildrenSignalHandler);
654 signal(SIGABRT, KillChildrenSignalHandler);
655 signal(SIGFPE, KillChildrenSignalHandler);
656 signal(SIGSEGV, KillChildrenSignalHandler);
657 signal(SIGPIPE, KillChildrenSignalHandler);
658 signal(SIGTERM, KillChildrenSignalHandler);
659 signal(SIGBUS, KillChildrenSignalHandler);
660 signal(SIGXCPU, KillChildrenSignalHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800661
662 libevent_base = EventBaseUniquePtr(event_base_new());
663
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800664 std::string core_touch_file = "/tmp/starter.";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800665 core_touch_file += std::to_string(static_cast<intmax_t>(getpid()));
666 core_touch_file += ".core_touch_file";
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800667 if (system(("touch '" + core_touch_file + "'").c_str()) != 0) {
668 LOG(FATAL, "running `touch '%s'` failed\n", core_touch_file.c_str());
669 }
670 FileWatch core_touch_file_watch(core_touch_file, Run, NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800671 core = unique_ptr<Child>(
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800672 new Child("core " + core_touch_file));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800673
674 FILE *pid_file = fopen("/tmp/starter.pid", "w");
675 if (pid_file == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800676 LOG(FATAL, "fopen(\"/tmp/starter.pid\", \"w\") failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800677 errno, strerror(errno));
678 } else {
679 if (fprintf(pid_file, "%d", core->pid()) == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800680 LOG(WARNING, "fprintf(%p, \"%%d\", %d) failed with %d: %s\n",
681 pid_file, core->pid(), errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800682 }
683 fclose(pid_file);
684 }
685
686 LOG(INFO, "waiting for %s to appear\n", core_touch_file.c_str());
687
688 event_base_dispatch(libevent_base.get());
689 LOG(FATAL, "event_base_dispatch(%p) returned\n", libevent_base.get());
690}
691
Brian Silverman0eec9532013-02-27 20:24:16 -0800692// This is the callback for when core creates the file indicating that it has
693// started.
694void Run(void *watch) {
695 // Make it so it doesn't keep on seeing random changes in /tmp.
696 static_cast<FileWatch *>(watch)->RemoveWatch();
697
698 // It's safe now because core is up.
699 aos::InitNRT();
700
701 std::ifstream list_file(child_list_file);
702
703 while (true) {
704 std::string child_name;
705 getline(list_file, child_name);
706 if ((list_file.rdstate() & std::ios_base::eofbit) != 0) {
707 break;
708 }
709 if (list_file.rdstate() != 0) {
710 LOG(FATAL, "reading input file %s failed\n", child_list_file);
711 }
712 children.push_back(unique_ptr<Child>(new Child(child_name)));
713 }
714
715 EventUniquePtr sigchld(event_new(libevent_base.get(), SIGCHLD,
716 EV_SIGNAL | EV_PERSIST,
717 SigCHLDReceived, NULL));
718 event_add(sigchld.release(), NULL);
719}
720
// Everything in the usage message after the name of the program.
const char *kArgsHelp =
    "[OPTION]... START_LIST\n"
    "Start all of the robot code binaries in START_LIST.\n"
    "\n"
    "START_LIST is the file to read binaries (looked up on PATH) to run.\n"
    " --help display this help and exit\n";

// Prints the usage message (including the name that we got run as) to stderr.
void PrintHelp() {
  const char *const invoked_as = program_invocation_name;
  fprintf(stderr, "Usage: %s %s", invoked_as, kArgsHelp);
}
729
Brian Silvermand169fcd2013-02-27 13:18:47 -0800730} // namespace starter
731} // namespace aos
732
733int main(int argc, char *argv[]) {
Brian Silverman8070a222013-02-28 15:01:36 -0800734 if (argc != 2) {
735 aos::starter::PrintHelp();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800736 exit(EXIT_FAILURE);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800737 }
Brian Silverman8070a222013-02-28 15:01:36 -0800738 if (strcmp(argv[1], "--help") == 0) {
739 aos::starter::PrintHelp();
740 exit(EXIT_SUCCESS);
741 }
742
Brian Silvermand169fcd2013-02-27 13:18:47 -0800743 aos::starter::child_list_file = argv[1];
744
745 aos::starter::Main();
746}