blob: 59aa88d3bba2ad2e8f7976f1e2c05745dd22a86d [file] [log] [blame]
// This has to come before anybody drags in <stdlib.h> or else we end up with
// the wrong version of WIFEXITED etc (for one thing, they don't const-qualify
// their casts) (sometimes at least).
4#include <sys/wait.h>
5
Brian Silvermand169fcd2013-02-27 13:18:47 -08006#include <stdio.h>
7#include <stdlib.h>
8#include <sys/types.h>
9#include <fcntl.h>
10#include <sys/inotify.h>
11#include <sys/stat.h>
12#include <sys/ioctl.h>
13#include <assert.h>
14#include <signal.h>
15#include <stdint.h>
16#include <errno.h>
17#include <string.h>
Brian Silvermand90b5fe2013-03-10 18:34:42 -070018#include <inttypes.h>
Brian Silvermand169fcd2013-02-27 13:18:47 -080019
20#include <map>
21#include <functional>
22#include <deque>
23#include <fstream>
24#include <queue>
25#include <list>
26#include <string>
27#include <vector>
28#include <memory>
Brian Silvermand94642c2014-03-27 18:21:41 -070029#include <set>
Brian Silvermand169fcd2013-02-27 13:18:47 -080030
31#include <event2/event.h>
32
33#include "aos/common/logging/logging.h"
34#include "aos/common/logging/logging_impl.h"
Brian Silverman14fd0fb2014-01-14 21:42:01 -080035#include "aos/linux_code/init.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080036#include "aos/common/unique_malloc_ptr.h"
37#include "aos/common/time.h"
Brian Silverman5cc661b2013-02-27 15:23:36 -080038#include "aos/common/once.h"
Brian Silvermanaf784862014-05-13 08:14:55 -070039#include "aos/common/libc/aos_strsignal.h"
40#include "aos/common/util/run_command.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080041
// This is the main piece of code that starts all of the rest of the code and
// restarts it when the binaries are modified.
//
// Throughout, the code is not terribly concerned with thread safety because
// there is only 1 thread. It does some setup and then lets inotify run things
// when appropriate.
//
// NOTE: This program should never exit nicely. It catches all nice attempts to
// exit, forwards them to all of the children that it has started, waits for
// them to exit nicely, and then SIGKILLs anybody left (which will always
// include itself).
53
54using ::std::unique_ptr;
55
56namespace aos {
57namespace starter {
58
Brian Silverman0eec9532013-02-27 20:24:16 -080059// TODO(brians): split out the c++ libevent wrapper stuff into its own file(s)
Brian Silvermand169fcd2013-02-27 13:18:47 -080060class EventBaseDeleter {
61 public:
62 void operator()(event_base *base) {
Brian Silverman8070a222013-02-28 15:01:36 -080063 if (base == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080064 event_base_free(base);
65 }
66};
67typedef unique_ptr<event_base, EventBaseDeleter> EventBaseUniquePtr;
Brian Silverman5cc661b2013-02-27 15:23:36 -080068EventBaseUniquePtr libevent_base;
Brian Silvermand169fcd2013-02-27 13:18:47 -080069
70class EventDeleter {
71 public:
72 void operator()(event *evt) {
Brian Silverman8070a222013-02-28 15:01:36 -080073 if (evt == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080074 if (event_del(evt) != 0) {
75 LOG(WARNING, "event_del(%p) failed\n", evt);
76 }
77 }
78};
79typedef unique_ptr<event, EventDeleter> EventUniquePtr;
80
Brian Silverman5cc661b2013-02-27 15:23:36 -080081// Watches a file path for modifications. Once created, keeps watching until
82// destroyed or RemoveWatch() is called.
Brian Silverman0eec9532013-02-27 20:24:16 -080083// TODO(brians): split this out into its own file + tests
Brian Silvermand169fcd2013-02-27 13:18:47 -080084class FileWatch {
85 public:
86 // Will call callback(value) when filename is modified.
87 // If value is NULL, then a pointer to this object will be passed instead.
Brian Silverman5cc661b2013-02-27 15:23:36 -080088 //
89 // Watching for file creations is slightly different. To do that, pass true
Brian Silverman8070a222013-02-28 15:01:36 -080090 // as create, the directory where the file will be created for filename, and
Brian Silverman5cc661b2013-02-27 15:23:36 -080091 // the name of the file (without directory name) for check_filename.
Brian Silvermand169fcd2013-02-27 13:18:47 -080092 FileWatch(std::string filename,
Brian Silverman8070a222013-02-28 15:01:36 -080093 std::function<void(void *)> callback,
94 void *value,
95 bool create = false,
96 std::string check_filename = "")
97 : filename_(filename),
98 callback_(callback),
99 value_(value),
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700100 create_(create),
101 check_filename_(check_filename),
102 watch_(-1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800103 init_once.Get();
104
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700105 CreateWatch();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800106 }
107 // Cleans up everything.
108 ~FileWatch() {
109 if (watch_ != -1) {
110 RemoveWatch();
111 }
112 }
113
114 // After calling this method, this object won't really be doing much of
Brian Silverman5cc661b2013-02-27 15:23:36 -0800115 // anything besides possibly running its callback or something.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800116 void RemoveWatch() {
117 assert(watch_ != -1);
Brian Silvermand94642c2014-03-27 18:21:41 -0700118 assert(watch_to_remove_ == -1);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800119
Brian Silvermand169fcd2013-02-27 13:18:47 -0800120 if (inotify_rm_watch(notify_fd, watch_) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700121 PLOG(WARNING, "inotify_rm_watch(%d, %d) failed", notify_fd, watch_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800122 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700123 watch_to_remove_ = watch_;
124 watch_ = -1;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800125 }
126
Brian Silverman5cc661b2013-02-27 15:23:36 -0800127 private:
128 // Performs the static initialization. Called by init_once from the
129 // constructor.
130 static void *Init() {
131 notify_fd = inotify_init1(IN_CLOEXEC);
132 EventUniquePtr notify_event(event_new(libevent_base.get(), notify_fd,
133 EV_READ | EV_PERSIST,
134 FileWatch::INotifyReadable, NULL));
135 event_add(notify_event.release(), NULL);
136 return NULL;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800137 }
138
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700139 void RemoveWatchFromMap() {
Brian Silvermand94642c2014-03-27 18:21:41 -0700140 int watch = watch_to_remove_;
141 if (watch == -1) {
142 assert(watch_ != -1);
143 watch = watch_;
144 }
145 if (watchers[watch] != this) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700146 LOG(WARNING, "watcher for %s (%p) didn't find itself in the map\n",
147 filename_.c_str(), this);
148 } else {
Brian Silvermand94642c2014-03-27 18:21:41 -0700149 watchers.erase(watch);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700150 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700151 LOG(DEBUG, "removed watch ID %d\n", watch);
152 if (watch_to_remove_ == -1) {
153 watch_ = -1;
154 } else {
155 watch_to_remove_ = -1;
156 }
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700157 }
158
159 void CreateWatch() {
160 assert(watch_ == -1);
161 watch_ = inotify_add_watch(notify_fd, filename_.c_str(),
162 create_ ? IN_CREATE : (IN_ATTRIB |
163 IN_MODIFY |
164 IN_DELETE_SELF |
165 IN_MOVE_SELF));
166 if (watch_ == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700167 PLOG(FATAL, "inotify_add_watch(%d, %s,"
168 " %s ? IN_CREATE : (IN_ATTRIB | IN_MODIFY)) failed",
169 notify_fd, filename_.c_str(), create_ ? "true" : "false");
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700170 }
171 watchers[watch_] = this;
172 LOG(DEBUG, "watch for %s is %d\n", filename_.c_str(), watch_);
173 }
174
Brian Silvermand169fcd2013-02-27 13:18:47 -0800175 // This gets set up as the callback for EV_READ on the inotify file
Brian Silverman5cc661b2013-02-27 15:23:36 -0800176 // descriptor. It calls FileNotified on the appropriate instance.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800177 static void INotifyReadable(int /*fd*/, short /*events*/, void *) {
178 unsigned int to_read;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800179 // Use FIONREAD to figure out how many bytes there are to read.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800180 if (ioctl(notify_fd, FIONREAD, &to_read) < 0) {
Brian Silverman01be0002014-05-10 15:44:38 -0700181 PLOG(FATAL, "FIONREAD(%d, %p) failed", notify_fd, &to_read);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800182 }
183 inotify_event *notifyevt = static_cast<inotify_event *>(malloc(to_read));
184 const char *end = reinterpret_cast<char *>(notifyevt) + to_read;
185 aos::unique_c_ptr<inotify_event> freer(notifyevt);
186
187 ssize_t ret = read(notify_fd, notifyevt, to_read);
188 if (ret < 0) {
Brian Silverman01be0002014-05-10 15:44:38 -0700189 PLOG(FATAL, "read(%d, %p, %u) failed", notify_fd, notifyevt, to_read);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800190 }
191 if (static_cast<size_t>(ret) != to_read) {
192 LOG(ERROR, "read(%d, %p, %u) returned %zd instead of %u\n",
193 notify_fd, notifyevt, to_read, ret, to_read);
194 return;
195 }
196
Brian Silverman5cc661b2013-02-27 15:23:36 -0800197 // Keep looping through until we get to the end because inotify does return
198 // multiple events at once.
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800199 while (reinterpret_cast<char *>(notifyevt) < end) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800200 if (watchers.count(notifyevt->wd) != 1) {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800201 LOG(WARNING, "couldn't find whose watch ID %d is\n", notifyevt->wd);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800202 } else {
Brian Silverman8efe23e2013-07-07 23:31:37 -0700203 LOG(DEBUG, "mask=%" PRIu32 "\n", notifyevt->mask);
Brian Silvermand94642c2014-03-27 18:21:41 -0700204 // If the watch was removed.
205 if (notifyevt->mask & IN_IGNORED) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700206 watchers[notifyevt->wd]->WatchDeleted();
207 } else {
Brian Silvermand94642c2014-03-27 18:21:41 -0700208 watchers[notifyevt->wd]
209 ->FileNotified((notifyevt->len > 0) ? notifyevt->name : NULL);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700210 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800211 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800212
213 notifyevt = reinterpret_cast<inotify_event *>(
Brian Silvermandbdf1d02013-11-17 13:19:41 -0800214 __builtin_assume_aligned(reinterpret_cast<char *>(notifyevt) +
215 sizeof(*notifyevt) + notifyevt->len,
Brian Silvermanafc00a62014-04-21 17:51:23 -0700216 alignof(inotify_event)));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800217 }
218 }
219
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700220 // INotifyReadable calls this method whenever the watch for our file gets
221 // removed somehow.
222 void WatchDeleted() {
223 LOG(DEBUG, "watch for %s deleted\n", filename_.c_str());
224 RemoveWatchFromMap();
225 CreateWatch();
226 }
227
Brian Silverman5cc661b2013-02-27 15:23:36 -0800228 // INotifyReadable calls this method whenever the watch for our file triggers.
229 void FileNotified(const char *filename) {
230 assert(watch_ != -1);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800231 LOG(DEBUG, "got a notification for %s\n", filename_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800232
233 if (!check_filename_.empty()) {
234 if (filename == NULL) {
235 return;
236 }
237 if (std::string(filename) != check_filename_) {
238 return;
239 }
240 }
241
242 callback_((value_ == NULL) ? this : value_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800243 }
244
Brian Silverman5cc661b2013-02-27 15:23:36 -0800245 // To make sure that Init gets called exactly once.
246 static ::aos::Once<void> init_once;
247
Brian Silvermand169fcd2013-02-27 13:18:47 -0800248 const std::string filename_;
249 const std::function<void(void *)> callback_;
250 void *const value_;
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700251 const bool create_;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800252 std::string check_filename_;
253
254 // The watch descriptor or -1 if we don't have one any more.
255 int watch_;
Brian Silvermand94642c2014-03-27 18:21:41 -0700256 // The watch that we still have to take out of the map once we get the
257 // IN_IGNORED or -1.
258 int watch_to_remove_ = -1;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800259
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800260 // Map from watch IDs to instances of this class.
261 // <https://patchwork.kernel.org/patch/73192/> ("inotify: do not reuse watch
262 // descriptors") says they won't get reused, but that shouldn't be counted on
263 // because we might have a modified/different version/whatever kernel.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800264 static std::map<int, FileWatch *> watchers;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800265 // The inotify(7) file descriptor.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800266 static int notify_fd;
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800267
268 DISALLOW_COPY_AND_ASSIGN(FileWatch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800269};
Brian Silverman5cc661b2013-02-27 15:23:36 -0800270::aos::Once<void> FileWatch::init_once(FileWatch::Init);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800271std::map<int, FileWatch *> FileWatch::watchers;
272int FileWatch::notify_fd;
273
Brian Silverman5cc661b2013-02-27 15:23:36 -0800274// Runs the given command and returns its first line of output (not including
275// the \n). LOG(FATAL)s if the command has an exit status other than 0 or does
276// not print out an entire line.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800277std::string RunCommand(std::string command) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800278 // popen(3) might fail and not set it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800279 errno = 0;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800280 FILE *pipe = popen(command.c_str(), "r");
281 if (pipe == NULL) {
Brian Silverman01be0002014-05-10 15:44:38 -0700282 PLOG(FATAL, "popen(\"%s\", \"r\") failed", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800283 }
284
Brian Silverman5cc661b2013-02-27 15:23:36 -0800285 // result_size is how many bytes result is currently allocated to.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800286 size_t result_size = 128, read = 0;
287 unique_c_ptr<char> result(static_cast<char *>(malloc(result_size)));
288 while (true) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800289 // If we filled up the buffer, then realloc(3) it bigger.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800290 if (read == result_size) {
291 result_size *= 2;
292 void *new_result = realloc(result.get(), result_size);
293 if (new_result == NULL) {
Brian Silverman01be0002014-05-10 15:44:38 -0700294 PLOG(FATAL, "realloc(%p, %zd) failed", result.get(), result_size);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800295 } else {
296 result.release();
297 result = unique_c_ptr<char>(static_cast<char *>(new_result));
298 }
299 }
300
Brian Silverman5cc661b2013-02-27 15:23:36 -0800301 size_t ret = fread(result.get() + read, 1, result_size - read, pipe);
302 // If the read didn't fill up the whole buffer, check to see if it was
303 // because of an error.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800304 if (ret < result_size - read) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800305 if (ferror(pipe)) {
Brian Silverman01be0002014-05-10 15:44:38 -0700306 PLOG(FATAL, "couldn't finish reading output of \"%s\"\n",
307 command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800308 }
309 }
310 read += ret;
311 if (read > 0 && result.get()[read - 1] == '\n') {
312 break;
313 }
314
Brian Silverman5cc661b2013-02-27 15:23:36 -0800315 if (feof(pipe)) {
316 LOG(FATAL, "`%s` failed. didn't print a whole line\n", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800317 }
318 }
319
Brian Silverman5cc661b2013-02-27 15:23:36 -0800320 // Get rid of the first \n and anything after it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800321 *strchrnul(result.get(), '\n') = '\0';
322
Brian Silverman5cc661b2013-02-27 15:23:36 -0800323 int child_status = pclose(pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800324 if (child_status == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700325 PLOG(FATAL, "pclose(%p) failed", pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800326 }
327
328 if (child_status != 0) {
329 LOG(FATAL, "`%s` failed. return %d\n", command.c_str(), child_status);
330 }
331
332 return std::string(result.get());
333}
334
335// Will call callback(arg) after time.
336void Timeout(time::Time time, void (*callback)(int, short, void *), void *arg) {
337 EventUniquePtr timeout(evtimer_new(libevent_base.get(), callback, arg));
338 struct timeval time_timeval = time.ToTimeval();
Brian Silvermand94642c2014-03-27 18:21:41 -0700339 if (evtimer_add(timeout.release(), &time_timeval) != 0) {
340 LOG(FATAL, "evtimer_add(%p, %p) failed\n",
341 timeout.release(), &time_timeval);
342 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800343}
344
Brian Silvermand94642c2014-03-27 18:21:41 -0700345class Child;
346// This is where all of the Child instances except core live.
347std::vector<unique_ptr<Child>> children;
348// A global place to hold on to which child is core.
349unique_ptr<Child> core;
350
Brian Silvermand169fcd2013-02-27 13:18:47 -0800351// Represents a child process. It will take care of restarting itself etc.
352class Child {
353 public:
Brian Silverman5cc661b2013-02-27 15:23:36 -0800354 // command is the (space-separated) command to run and its arguments.
355 Child(const std::string &command) : pid_(-1),
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800356 stat_at_start_valid_(false) {
Brian Silvermand94642c2014-03-27 18:21:41 -0700357 if (!restart_timeout) {
358 restart_timeout = EventUniquePtr(
359 evtimer_new(libevent_base.get(), StaticDoRestart, nullptr));
360 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800361 const char *start, *end;
362 start = command.c_str();
363 while (true) {
364 end = strchrnul(start, ' ');
365 args_.push_back(std::string(start, end - start));
366 start = end + 1;
367 if (*end == '\0') {
368 break;
369 }
370 }
371
Brian Silverman5cc661b2013-02-27 15:23:36 -0800372 original_binary_ = RunCommand("which " + args_[0]);
373 binary_ = original_binary_ + ".stm";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800374
375 watcher_ = unique_ptr<FileWatch>(
Brian Silverman5cc661b2013-02-27 15:23:36 -0800376 new FileWatch(original_binary_, StaticFileModified, this));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800377
378 Start();
379 }
380
381 pid_t pid() { return pid_; }
382
383 // This gets called whenever the actual process dies and should (probably) be
384 // restarted.
385 void ProcessDied() {
386 pid_ = -1;
387 restarts_.push(time::Time::Now());
388 if (restarts_.size() > kMaxRestartsNumber) {
389 time::Time oldest = restarts_.front();
390 restarts_.pop();
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700391 if ((time::Time::Now() - oldest) <= kMaxRestartsTime) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800392 LOG(WARNING, "process %s getting restarted too often\n", name());
393 Timeout(kResumeWait, StaticStart, this);
394 return;
395 }
396 }
397 Start();
398 }
399
400 // Returns a name for logging purposes.
401 const char *name() {
402 return args_[0].c_str();
403 }
404
405 private:
406 struct CheckDiedStatus {
407 Child *self;
408 pid_t old_pid;
409 };
410
411 // How long to wait for a child to die nicely.
Brian Silvermand94642c2014-03-27 18:21:41 -0700412 static constexpr time::Time kProcessDieTime = time::Time::InSeconds(2);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800413
414 // How long to wait after the file is modified to restart it.
415 // This is important because some programs like modifying the binaries by
416 // writing them in little bits, which results in attempting to start partial
417 // binaries without this.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700418 static constexpr time::Time kRestartWaitTime = time::Time::InSeconds(1.5);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800419
Brian Silverman5cc661b2013-02-27 15:23:36 -0800420 // Only kMaxRestartsNumber restarts will be allowed in kMaxRestartsTime.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700421 static constexpr time::Time kMaxRestartsTime = time::Time::InSeconds(4);
422 static const size_t kMaxRestartsNumber = 3;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800423 // How long to wait if it gets restarted too many times.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700424 static constexpr time::Time kResumeWait = time::Time::InSeconds(5);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800425
Brian Silvermand169fcd2013-02-27 13:18:47 -0800426 static void StaticFileModified(void *self) {
427 static_cast<Child *>(self)->FileModified();
428 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800429
Brian Silvermand169fcd2013-02-27 13:18:47 -0800430 void FileModified() {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700431 LOG(DEBUG, "file for %s modified\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800432 struct timeval restart_time_timeval = kRestartWaitTime.ToTimeval();
433 // This will reset the timeout again if it hasn't run yet.
Brian Silvermand94642c2014-03-27 18:21:41 -0700434 if (evtimer_add(restart_timeout.get(), &restart_time_timeval) != 0) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700435 LOG(FATAL, "evtimer_add(%p, %p) failed\n",
Brian Silvermand94642c2014-03-27 18:21:41 -0700436 restart_timeout.get(), &restart_time_timeval);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700437 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700438 waiting_to_restart.insert(this);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800439 }
440
Brian Silvermand94642c2014-03-27 18:21:41 -0700441 static void StaticDoRestart(int, short, void *) {
442 LOG(DEBUG, "restarting everything that needs it\n");
443 if (waiting_to_restart.find(core.get()) != waiting_to_restart.end()) {
444 core->DoRestart();
445 waiting_to_restart.erase(core.get());
446 }
447 for (auto c : waiting_to_restart) {
448 c->DoRestart();
449 }
450 waiting_to_restart.clear();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800451 }
452
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800453 // Called after somebody else has finished modifying the file.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800454 void DoRestart() {
Brian Silvermand94642c2014-03-27 18:21:41 -0700455 fprintf(stderr, "DoRestart(%s)\n", binary_.c_str());
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800456 if (stat_at_start_valid_) {
457 struct stat current_stat;
458 if (stat(original_binary_.c_str(), &current_stat) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700459 PLOG(FATAL, "stat(%s, %p) failed",
460 original_binary_.c_str(), &current_stat);
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800461 }
462 if (current_stat.st_mtime == stat_at_start_.st_mtime) {
463 LOG(DEBUG, "ignoring trigger for %s because mtime didn't change\n",
464 name());
465 return;
466 }
467 }
468
Brian Silvermand94642c2014-03-27 18:21:41 -0700469 if (this == core.get()) {
470 fprintf(stderr, "Restarting core -> exiting now.\n");
471 exit(0);
472 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800473 if (pid_ != -1) {
474 LOG(DEBUG, "sending SIGTERM to child %d to restart it\n", pid_);
475 if (kill(pid_, SIGTERM) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700476 PLOG(WARNING, "kill(%d, SIGTERM) failed", pid_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800477 }
478 CheckDiedStatus *status = new CheckDiedStatus();
479 status->self = this;
480 status->old_pid = pid_;
481 Timeout(kProcessDieTime, StaticCheckDied, status);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700482 } else {
483 LOG(WARNING, "%s restart attempted but not running\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800484 }
485 }
486
487 static void StaticCheckDied(int, short, void *status_in) {
488 CheckDiedStatus *status = static_cast<CheckDiedStatus *>(status_in);
489 status->self->CheckDied(status->old_pid);
490 delete status;
491 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800492
493 // Checks to see if the child using the PID old_pid is still running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800494 void CheckDied(pid_t old_pid) {
495 if (pid_ == old_pid) {
496 LOG(WARNING, "child %d refused to die\n", old_pid);
497 if (kill(old_pid, SIGKILL) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700498 PLOG(WARNING, "kill(%d, SIGKILL) failed", old_pid);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800499 }
500 }
501 }
502
503 static void StaticStart(int, short, void *self) {
504 static_cast<Child *>(self)->Start();
505 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800506
507 // Actually starts the child.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800508 void Start() {
509 if (pid_ != -1) {
510 LOG(WARNING, "calling Start() but already have child %d running\n",
511 pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800512 if (kill(pid_, SIGKILL) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700513 PLOG(WARNING, "kill(%d, SIGKILL) failed", pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800514 return;
515 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800516 pid_ = -1;
517 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800518
519 // Remove the name that we run from (ie from a previous execution) and then
520 // hard link the real filename to it.
521 if (unlink(binary_.c_str()) != 0 && errno != ENOENT) {
Brian Silverman01be0002014-05-10 15:44:38 -0700522 PLOG(FATAL, "removing %s failed", binary_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800523 }
524 if (link(original_binary_.c_str(), binary_.c_str()) != 0) {
Brian Silverman01be0002014-05-10 15:44:38 -0700525 PLOG(FATAL, "link('%s', '%s') failed",
526 original_binary_.c_str(), binary_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800527 }
528
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800529 if (stat(original_binary_.c_str(), &stat_at_start_) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700530 PLOG(FATAL, "stat(%s, %p) failed",
531 original_binary_.c_str(), &stat_at_start_);
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800532 }
533 stat_at_start_valid_ = true;
534
Brian Silvermand169fcd2013-02-27 13:18:47 -0800535 if ((pid_ = fork()) == 0) {
536 ssize_t args_size = args_.size();
537 const char **argv = new const char *[args_size + 1];
538 for (int i = 0; i < args_size; ++i) {
539 argv[i] = args_[i].c_str();
540 }
541 argv[args_size] = NULL;
542 // The const_cast is safe because no code that might care if it gets
543 // modified can run afterwards.
544 execv(binary_.c_str(), const_cast<char **>(argv));
Brian Silverman01be0002014-05-10 15:44:38 -0700545 PLOG(FATAL, "execv(%s, %p) failed", binary_.c_str(), argv);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800546 _exit(EXIT_FAILURE);
547 }
548 if (pid_ == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700549 PLOG(FATAL, "forking to run \"%s\" failed", binary_.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800550 }
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700551 LOG(DEBUG, "started \"%s\" successfully\n", binary_.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800552 }
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800553
554 // A history of the times that this process has been restarted.
555 std::queue<time::Time, std::list<time::Time>> restarts_;
556
557 // The currently running child's PID or NULL.
558 pid_t pid_;
559
560 // All of the arguments (including the name of the binary).
561 std::deque<std::string> args_;
562
563 // The name of the real binary that we were told to run.
564 std::string original_binary_;
565 // The name of the file that we're actually running.
566 std::string binary_;
567
568 // Watches original_binary_.
569 unique_ptr<FileWatch> watcher_;
570
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800571 // Captured from the original file when we most recently started a new child
572 // process. Used to see if it actually changes or not.
573 struct stat stat_at_start_;
574 bool stat_at_start_valid_;
575
Brian Silvermand94642c2014-03-27 18:21:41 -0700576 // An event that restarts after kRestartWaitTime.
577 static EventUniquePtr restart_timeout;
578
579 // The set of children waiting to be restarted once all modifications stop.
580 static ::std::set<Child *> waiting_to_restart;
581
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800582 DISALLOW_COPY_AND_ASSIGN(Child);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800583};
Brian Silverman52aeeac2013-08-28 16:20:53 -0700584
585constexpr time::Time Child::kProcessDieTime;
586constexpr time::Time Child::kRestartWaitTime;
587constexpr time::Time Child::kMaxRestartsTime;
588constexpr time::Time Child::kResumeWait;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800589
Brian Silvermand94642c2014-03-27 18:21:41 -0700590EventUniquePtr Child::restart_timeout;
591::std::set<Child *> Child::waiting_to_restart;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800592
Brian Silverman5cc661b2013-02-27 15:23:36 -0800593// Kills off the entire process group (including ourself).
594void KillChildren(bool try_nice) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800595 if (try_nice) {
596 static const int kNiceStopSignal = SIGTERM;
597 static const time::Time kNiceWaitTime = time::Time::InSeconds(1);
598
599 // Make sure that we don't just nicely stop ourself...
600 sigset_t mask;
601 sigemptyset(&mask);
602 sigaddset(&mask, kNiceStopSignal);
603 sigprocmask(SIG_BLOCK, &mask, NULL);
604
Brian Silverman5cc661b2013-02-27 15:23:36 -0800605 kill(-getpid(), kNiceStopSignal);
606
607 fflush(NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800608 time::SleepFor(kNiceWaitTime);
609 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800610
Brian Silvermand169fcd2013-02-27 13:18:47 -0800611 // Send SIGKILL to our whole process group, which will forcibly terminate any
612 // of them that are still running (us for sure, maybe more too).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800613 kill(-getpid(), SIGKILL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800614}
615
Brian Silverman5cc661b2013-02-27 15:23:36 -0800616void ExitHandler() {
617 KillChildren(true);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800618}
Brian Silverman5cc661b2013-02-27 15:23:36 -0800619
620void KillChildrenSignalHandler(int signum) {
621 // If we get SIGSEGV or some other random signal who knows what's happening
622 // and we should just kill everybody immediately.
623 // This is a list of all of the signals that mean some form of "nicely stop".
624 KillChildren(signum == SIGHUP || signum == SIGINT || signum == SIGQUIT ||
Brian Silverman0eec9532013-02-27 20:24:16 -0800625 signum == SIGABRT || signum == SIGPIPE || signum == SIGTERM ||
626 signum == SIGXCPU);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800627}
628
Brian Silverman5cc661b2013-02-27 15:23:36 -0800629// Returns the currently running child with PID pid or an empty unique_ptr.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800630const unique_ptr<Child> &FindChild(pid_t pid) {
631 for (auto it = children.begin(); it != children.end(); ++it) {
632 if (pid == (*it)->pid()) {
633 return *it;
634 }
635 }
636
637 if (pid == core->pid()) {
638 return core;
639 }
640
Brian Silverman5cc661b2013-02-27 15:23:36 -0800641 static const unique_ptr<Child> kNothing;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800642 return kNothing;
643}
644
Brian Silverman5cc661b2013-02-27 15:23:36 -0800645// Gets set up as a libevent handler for SIGCHLD.
646// Handles calling Child::ProcessDied() on the appropriate one.
647void SigCHLDReceived(int /*fd*/, short /*events*/, void *) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800648 // In a while loop in case we miss any SIGCHLDs.
649 while (true) {
650 siginfo_t infop;
651 infop.si_pid = 0;
652 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) != 0) {
Brian Silverman01be0002014-05-10 15:44:38 -0700653 PLOG(WARNING, "waitid failed");
Brian Silverman5cc661b2013-02-27 15:23:36 -0800654 continue;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800655 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800656 // If there are no more child process deaths to process.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800657 if (infop.si_pid == 0) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800658 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800659 }
660
661 pid_t pid = infop.si_pid;
662 int status = infop.si_status;
663 const unique_ptr<Child> &child = FindChild(pid);
664 if (child) {
665 switch (infop.si_code) {
666 case CLD_EXITED:
667 LOG(WARNING, "child %d (%s) exited with status %d\n",
668 pid, child->name(), status);
669 break;
670 case CLD_DUMPED:
671 LOG(INFO, "child %d actually dumped core. "
672 "falling through to killed by signal case\n", pid);
673 case CLD_KILLED:
674 // If somebody (possibly us) sent it SIGTERM that means that they just
675 // want it to stop, so it stopping isn't a WARNING.
676 LOG((status == SIGTERM) ? DEBUG : WARNING,
677 "child %d (%s) was killed by signal %d (%s)\n",
Brian Silvermanaf784862014-05-13 08:14:55 -0700678 pid, child->name(), status, aos_strsignal(status));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800679 break;
680 case CLD_STOPPED:
681 LOG(WARNING, "child %d (%s) was stopped by signal %d "
682 "(giving it a SIGCONT(%d))\n",
683 pid, child->name(), status, SIGCONT);
684 kill(pid, SIGCONT);
685 continue;
686 default:
687 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
688 pid, child->name());
689 kill(pid, SIGKILL);
690 continue;
691 }
692 } else {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800693 LOG(WARNING, "couldn't find a Child for pid %d\n", pid);
694 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800695 }
696
Brian Silverman5cc661b2013-02-27 15:23:36 -0800697 if (child == core) {
698 LOG(FATAL, "core died\n");
699 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800700 child->ProcessDied();
701 }
702}
703
// The name of the file that lists the processes to start.
// main sets it and Run reads it; it only exists to pass the name between them.
706const char *child_list_file;
707
Brian Silverman8070a222013-02-28 15:01:36 -0800708void Run(void *watch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800709void Main() {
710 logging::Init();
Brian Silverman0eec9532013-02-27 20:24:16 -0800711 // TODO(brians): tell logging that using the root logger from here until we
Brian Silvermand169fcd2013-02-27 13:18:47 -0800712 // bring up shm is ok
713
Brian Silverman5cc661b2013-02-27 15:23:36 -0800714 if (setpgid(0 /*self*/, 0 /*make PGID the same as PID*/) != 0) {
Brian Silverman01be0002014-05-10 15:44:38 -0700715 PLOG(FATAL, "setpgid(0, 0) failed");
Brian Silverman5cc661b2013-02-27 15:23:36 -0800716 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800717
718 // Make sure that we kill all children when we exit.
Brian Silverman5cc661b2013-02-27 15:23:36 -0800719 atexit(ExitHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800720 // Do it on some signals too (ones that we otherwise tend to receive and then
721 // leave all of our children going).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800722 signal(SIGHUP, KillChildrenSignalHandler);
723 signal(SIGINT, KillChildrenSignalHandler);
724 signal(SIGQUIT, KillChildrenSignalHandler);
725 signal(SIGILL, KillChildrenSignalHandler);
726 signal(SIGABRT, KillChildrenSignalHandler);
727 signal(SIGFPE, KillChildrenSignalHandler);
728 signal(SIGSEGV, KillChildrenSignalHandler);
729 signal(SIGPIPE, KillChildrenSignalHandler);
730 signal(SIGTERM, KillChildrenSignalHandler);
731 signal(SIGBUS, KillChildrenSignalHandler);
732 signal(SIGXCPU, KillChildrenSignalHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800733
734 libevent_base = EventBaseUniquePtr(event_base_new());
735
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800736 std::string core_touch_file = "/tmp/starter.";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800737 core_touch_file += std::to_string(static_cast<intmax_t>(getpid()));
738 core_touch_file += ".core_touch_file";
Brian Silvermanaf784862014-05-13 08:14:55 -0700739 const int result =
740 ::aos::util::RunCommand(("touch '" + core_touch_file + "'").c_str());
741 if (result == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700742 PLOG(FATAL, "running `touch '%s'` failed\n", core_touch_file.c_str());
Brian Silvermanaf784862014-05-13 08:14:55 -0700743 } else if (!WIFEXITED(result) || WEXITSTATUS(result) != 0) {
744 LOG(FATAL, "`touch '%s'` gave result %x\n", core_touch_file.c_str(),
745 result);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800746 }
747 FileWatch core_touch_file_watch(core_touch_file, Run, NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800748 core = unique_ptr<Child>(
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800749 new Child("core " + core_touch_file));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800750
751 FILE *pid_file = fopen("/tmp/starter.pid", "w");
752 if (pid_file == NULL) {
Brian Silverman01be0002014-05-10 15:44:38 -0700753 PLOG(FATAL, "fopen(\"/tmp/starter.pid\", \"w\") failed");
Brian Silvermand169fcd2013-02-27 13:18:47 -0800754 } else {
755 if (fprintf(pid_file, "%d", core->pid()) == -1) {
Brian Silverman01be0002014-05-10 15:44:38 -0700756 PLOG(WARNING, "fprintf(%p, \"%%d\", %d) failed",
757 pid_file, core->pid());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800758 }
759 fclose(pid_file);
760 }
761
762 LOG(INFO, "waiting for %s to appear\n", core_touch_file.c_str());
763
764 event_base_dispatch(libevent_base.get());
765 LOG(FATAL, "event_base_dispatch(%p) returned\n", libevent_base.get());
766}
767
Brian Silverman0eec9532013-02-27 20:24:16 -0800768// This is the callback for when core creates the file indicating that it has
769// started.
770void Run(void *watch) {
771 // Make it so it doesn't keep on seeing random changes in /tmp.
772 static_cast<FileWatch *>(watch)->RemoveWatch();
773
774 // It's safe now because core is up.
775 aos::InitNRT();
776
777 std::ifstream list_file(child_list_file);
778
779 while (true) {
780 std::string child_name;
781 getline(list_file, child_name);
782 if ((list_file.rdstate() & std::ios_base::eofbit) != 0) {
783 break;
784 }
785 if (list_file.rdstate() != 0) {
786 LOG(FATAL, "reading input file %s failed\n", child_list_file);
787 }
788 children.push_back(unique_ptr<Child>(new Child(child_name)));
789 }
790
791 EventUniquePtr sigchld(event_new(libevent_base.get(), SIGCHLD,
792 EV_SIGNAL | EV_PERSIST,
793 SigCHLDReceived, NULL));
794 event_add(sigchld.release(), NULL);
795}
796
// Everything in the usage message that comes after the program name.
const char *kArgsHelp =
    "[OPTION]... START_LIST\n"
    "Start all of the robot code binaries in START_LIST.\n\n"
    "START_LIST is the file to read binaries (looked up on PATH) to run.\n"
    " --help display this help and exit\n";

// Writes the usage message (the name we were invoked with followed by
// kArgsHelp) to stderr.
void PrintHelp() {
  fprintf(stderr, "Usage: %s ", program_invocation_name);
  fputs(kArgsHelp, stderr);
}
805
Brian Silvermand169fcd2013-02-27 13:18:47 -0800806} // namespace starter
807} // namespace aos
808
809int main(int argc, char *argv[]) {
Brian Silverman8070a222013-02-28 15:01:36 -0800810 if (argc != 2) {
811 aos::starter::PrintHelp();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800812 exit(EXIT_FAILURE);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800813 }
Brian Silverman8070a222013-02-28 15:01:36 -0800814 if (strcmp(argv[1], "--help") == 0) {
815 aos::starter::PrintHelp();
816 exit(EXIT_SUCCESS);
817 }
818
Brian Silvermand169fcd2013-02-27 13:18:47 -0800819 aos::starter::child_list_file = argv[1];
820
821 aos::starter::Main();
822}