blob: 114148fb1b5a1edf9f38b55d229b09d0f0da5806 [file] [log] [blame]
Brian Silvermand169fcd2013-02-27 13:18:47 -08001#include <stdio.h>
2#include <stdlib.h>
3#include <sys/types.h>
4#include <fcntl.h>
5#include <sys/inotify.h>
6#include <sys/stat.h>
7#include <sys/ioctl.h>
8#include <assert.h>
9#include <signal.h>
10#include <stdint.h>
11#include <errno.h>
12#include <string.h>
13#include <sys/wait.h>
Brian Silvermand90b5fe2013-03-10 18:34:42 -070014#include <inttypes.h>
Brian Silvermand169fcd2013-02-27 13:18:47 -080015
16#include <map>
17#include <functional>
18#include <deque>
19#include <fstream>
20#include <queue>
21#include <list>
22#include <string>
23#include <vector>
24#include <memory>
Brian Silvermand94642c2014-03-27 18:21:41 -070025#include <set>
Brian Silvermand169fcd2013-02-27 13:18:47 -080026
27#include <event2/event.h>
28
29#include "aos/common/logging/logging.h"
30#include "aos/common/logging/logging_impl.h"
Brian Silverman14fd0fb2014-01-14 21:42:01 -080031#include "aos/linux_code/init.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080032#include "aos/common/unique_malloc_ptr.h"
33#include "aos/common/time.h"
Brian Silverman5cc661b2013-02-27 15:23:36 -080034#include "aos/common/once.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080035
36// This is the main piece of code that starts all of the rest of the code and
37// restarts it when the binaries are modified.
38//
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -080039// Throughout, the code is not terribly concerned with thread safety because
40// there is only 1 thread. It does some setup and then lets inotify run things
41// when appropriate.
42//
Brian Silverman5cc661b2013-02-27 15:23:36 -080043// NOTE: This program should never exit nicely. It catches all nice attempts to
44// exit, forwards them to all of the children that it has started, waits for
Brian Silvermand169fcd2013-02-27 13:18:47 -080045// them to exit nicely, and then SIGKILLs anybody left (which will always
46// include itself).
47
48using ::std::unique_ptr;
49
50namespace aos {
51namespace starter {
52
Brian Silverman0eec9532013-02-27 20:24:16 -080053// TODO(brians): split out the c++ libevent wrapper stuff into its own file(s)
Brian Silvermand169fcd2013-02-27 13:18:47 -080054class EventBaseDeleter {
55 public:
56 void operator()(event_base *base) {
Brian Silverman8070a222013-02-28 15:01:36 -080057 if (base == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080058 event_base_free(base);
59 }
60};
61typedef unique_ptr<event_base, EventBaseDeleter> EventBaseUniquePtr;
Brian Silverman5cc661b2013-02-27 15:23:36 -080062EventBaseUniquePtr libevent_base;
Brian Silvermand169fcd2013-02-27 13:18:47 -080063
64class EventDeleter {
65 public:
66 void operator()(event *evt) {
Brian Silverman8070a222013-02-28 15:01:36 -080067 if (evt == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080068 if (event_del(evt) != 0) {
69 LOG(WARNING, "event_del(%p) failed\n", evt);
70 }
71 }
72};
73typedef unique_ptr<event, EventDeleter> EventUniquePtr;
74
Brian Silverman5cc661b2013-02-27 15:23:36 -080075// Watches a file path for modifications. Once created, keeps watching until
76// destroyed or RemoveWatch() is called.
Brian Silverman0eec9532013-02-27 20:24:16 -080077// TODO(brians): split this out into its own file + tests
Brian Silvermand169fcd2013-02-27 13:18:47 -080078class FileWatch {
79 public:
80 // Will call callback(value) when filename is modified.
81 // If value is NULL, then a pointer to this object will be passed instead.
Brian Silverman5cc661b2013-02-27 15:23:36 -080082 //
83 // Watching for file creations is slightly different. To do that, pass true
Brian Silverman8070a222013-02-28 15:01:36 -080084 // as create, the directory where the file will be created for filename, and
Brian Silverman5cc661b2013-02-27 15:23:36 -080085 // the name of the file (without directory name) for check_filename.
Brian Silvermand169fcd2013-02-27 13:18:47 -080086 FileWatch(std::string filename,
Brian Silverman8070a222013-02-28 15:01:36 -080087 std::function<void(void *)> callback,
88 void *value,
89 bool create = false,
90 std::string check_filename = "")
91 : filename_(filename),
92 callback_(callback),
93 value_(value),
Brian Silvermand90b5fe2013-03-10 18:34:42 -070094 create_(create),
95 check_filename_(check_filename),
96 watch_(-1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -080097 init_once.Get();
98
Brian Silvermand90b5fe2013-03-10 18:34:42 -070099 CreateWatch();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800100 }
101 // Cleans up everything.
102 ~FileWatch() {
103 if (watch_ != -1) {
104 RemoveWatch();
105 }
106 }
107
108 // After calling this method, this object won't really be doing much of
Brian Silverman5cc661b2013-02-27 15:23:36 -0800109 // anything besides possibly running its callback or something.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800110 void RemoveWatch() {
111 assert(watch_ != -1);
Brian Silvermand94642c2014-03-27 18:21:41 -0700112 assert(watch_to_remove_ == -1);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800113
Brian Silvermand169fcd2013-02-27 13:18:47 -0800114 if (inotify_rm_watch(notify_fd, watch_) == -1) {
115 LOG(WARNING, "inotify_rm_watch(%d, %d) failed with %d: %s\n",
116 notify_fd, watch_, errno, strerror(errno));
117 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700118 watch_to_remove_ = watch_;
119 watch_ = -1;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800120 }
121
Brian Silverman5cc661b2013-02-27 15:23:36 -0800122 private:
123 // Performs the static initialization. Called by init_once from the
124 // constructor.
125 static void *Init() {
126 notify_fd = inotify_init1(IN_CLOEXEC);
127 EventUniquePtr notify_event(event_new(libevent_base.get(), notify_fd,
128 EV_READ | EV_PERSIST,
129 FileWatch::INotifyReadable, NULL));
130 event_add(notify_event.release(), NULL);
131 return NULL;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800132 }
133
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700134 void RemoveWatchFromMap() {
Brian Silvermand94642c2014-03-27 18:21:41 -0700135 int watch = watch_to_remove_;
136 if (watch == -1) {
137 assert(watch_ != -1);
138 watch = watch_;
139 }
140 if (watchers[watch] != this) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700141 LOG(WARNING, "watcher for %s (%p) didn't find itself in the map\n",
142 filename_.c_str(), this);
143 } else {
Brian Silvermand94642c2014-03-27 18:21:41 -0700144 watchers.erase(watch);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700145 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700146 LOG(DEBUG, "removed watch ID %d\n", watch);
147 if (watch_to_remove_ == -1) {
148 watch_ = -1;
149 } else {
150 watch_to_remove_ = -1;
151 }
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700152 }
153
154 void CreateWatch() {
155 assert(watch_ == -1);
156 watch_ = inotify_add_watch(notify_fd, filename_.c_str(),
157 create_ ? IN_CREATE : (IN_ATTRIB |
158 IN_MODIFY |
159 IN_DELETE_SELF |
160 IN_MOVE_SELF));
161 if (watch_ == -1) {
162 LOG(FATAL, "inotify_add_watch(%d, %s,"
163 " %s ? IN_CREATE : (IN_ATTRIB | IN_MODIFY)) failed with %d: %s\n",
164 notify_fd, filename_.c_str(), create_ ? "true" : "false",
165 errno, strerror(errno));
166 }
167 watchers[watch_] = this;
168 LOG(DEBUG, "watch for %s is %d\n", filename_.c_str(), watch_);
169 }
170
Brian Silvermand169fcd2013-02-27 13:18:47 -0800171 // This gets set up as the callback for EV_READ on the inotify file
Brian Silverman5cc661b2013-02-27 15:23:36 -0800172 // descriptor. It calls FileNotified on the appropriate instance.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800173 static void INotifyReadable(int /*fd*/, short /*events*/, void *) {
174 unsigned int to_read;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800175 // Use FIONREAD to figure out how many bytes there are to read.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800176 if (ioctl(notify_fd, FIONREAD, &to_read) < 0) {
177 LOG(FATAL, "FIONREAD(%d, %p) failed with %d: %s\n",
178 notify_fd, &to_read, errno, strerror(errno));
179 }
180 inotify_event *notifyevt = static_cast<inotify_event *>(malloc(to_read));
181 const char *end = reinterpret_cast<char *>(notifyevt) + to_read;
182 aos::unique_c_ptr<inotify_event> freer(notifyevt);
183
184 ssize_t ret = read(notify_fd, notifyevt, to_read);
185 if (ret < 0) {
186 LOG(FATAL, "read(%d, %p, %u) failed with %d: %s\n",
187 notify_fd, notifyevt, to_read, errno, strerror(errno));
188 }
189 if (static_cast<size_t>(ret) != to_read) {
190 LOG(ERROR, "read(%d, %p, %u) returned %zd instead of %u\n",
191 notify_fd, notifyevt, to_read, ret, to_read);
192 return;
193 }
194
Brian Silverman5cc661b2013-02-27 15:23:36 -0800195 // Keep looping through until we get to the end because inotify does return
196 // multiple events at once.
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800197 while (reinterpret_cast<char *>(notifyevt) < end) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800198 if (watchers.count(notifyevt->wd) != 1) {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800199 LOG(WARNING, "couldn't find whose watch ID %d is\n", notifyevt->wd);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800200 } else {
Brian Silverman8efe23e2013-07-07 23:31:37 -0700201 LOG(DEBUG, "mask=%" PRIu32 "\n", notifyevt->mask);
Brian Silvermand94642c2014-03-27 18:21:41 -0700202 // If the watch was removed.
203 if (notifyevt->mask & IN_IGNORED) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700204 watchers[notifyevt->wd]->WatchDeleted();
205 } else {
Brian Silvermand94642c2014-03-27 18:21:41 -0700206 watchers[notifyevt->wd]
207 ->FileNotified((notifyevt->len > 0) ? notifyevt->name : NULL);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700208 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800209 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800210
211 notifyevt = reinterpret_cast<inotify_event *>(
Brian Silvermandbdf1d02013-11-17 13:19:41 -0800212 __builtin_assume_aligned(reinterpret_cast<char *>(notifyevt) +
213 sizeof(*notifyevt) + notifyevt->len,
Brian Silvermanafc00a62014-04-21 17:51:23 -0700214 alignof(inotify_event)));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800215 }
216 }
217
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700218 // INotifyReadable calls this method whenever the watch for our file gets
219 // removed somehow.
220 void WatchDeleted() {
221 LOG(DEBUG, "watch for %s deleted\n", filename_.c_str());
222 RemoveWatchFromMap();
223 CreateWatch();
224 }
225
Brian Silverman5cc661b2013-02-27 15:23:36 -0800226 // INotifyReadable calls this method whenever the watch for our file triggers.
227 void FileNotified(const char *filename) {
228 assert(watch_ != -1);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800229 LOG(DEBUG, "got a notification for %s\n", filename_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800230
231 if (!check_filename_.empty()) {
232 if (filename == NULL) {
233 return;
234 }
235 if (std::string(filename) != check_filename_) {
236 return;
237 }
238 }
239
240 callback_((value_ == NULL) ? this : value_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800241 }
242
Brian Silverman5cc661b2013-02-27 15:23:36 -0800243 // To make sure that Init gets called exactly once.
244 static ::aos::Once<void> init_once;
245
Brian Silvermand169fcd2013-02-27 13:18:47 -0800246 const std::string filename_;
247 const std::function<void(void *)> callback_;
248 void *const value_;
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700249 const bool create_;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800250 std::string check_filename_;
251
252 // The watch descriptor or -1 if we don't have one any more.
253 int watch_;
Brian Silvermand94642c2014-03-27 18:21:41 -0700254 // The watch that we still have to take out of the map once we get the
255 // IN_IGNORED or -1.
256 int watch_to_remove_ = -1;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800257
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800258 // Map from watch IDs to instances of this class.
259 // <https://patchwork.kernel.org/patch/73192/> ("inotify: do not reuse watch
260 // descriptors") says they won't get reused, but that shouldn't be counted on
261 // because we might have a modified/different version/whatever kernel.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800262 static std::map<int, FileWatch *> watchers;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800263 // The inotify(7) file descriptor.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800264 static int notify_fd;
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800265
266 DISALLOW_COPY_AND_ASSIGN(FileWatch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800267};
Brian Silverman5cc661b2013-02-27 15:23:36 -0800268::aos::Once<void> FileWatch::init_once(FileWatch::Init);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800269std::map<int, FileWatch *> FileWatch::watchers;
270int FileWatch::notify_fd;
271
Brian Silverman5cc661b2013-02-27 15:23:36 -0800272// Runs the given command and returns its first line of output (not including
273// the \n). LOG(FATAL)s if the command has an exit status other than 0 or does
274// not print out an entire line.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800275std::string RunCommand(std::string command) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800276 // popen(3) might fail and not set it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800277 errno = 0;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800278 FILE *pipe = popen(command.c_str(), "r");
279 if (pipe == NULL) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800280 LOG(FATAL, "popen(\"%s\", \"r\") failed with %d: %s\n",
281 command.c_str(), errno, strerror(errno));
282 }
283
Brian Silverman5cc661b2013-02-27 15:23:36 -0800284 // result_size is how many bytes result is currently allocated to.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800285 size_t result_size = 128, read = 0;
286 unique_c_ptr<char> result(static_cast<char *>(malloc(result_size)));
287 while (true) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800288 // If we filled up the buffer, then realloc(3) it bigger.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800289 if (read == result_size) {
290 result_size *= 2;
291 void *new_result = realloc(result.get(), result_size);
292 if (new_result == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800293 LOG(FATAL, "realloc(%p, %zd) failed because of %d: %s\n",
294 result.get(), result_size, errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800295 } else {
296 result.release();
297 result = unique_c_ptr<char>(static_cast<char *>(new_result));
298 }
299 }
300
Brian Silverman5cc661b2013-02-27 15:23:36 -0800301 size_t ret = fread(result.get() + read, 1, result_size - read, pipe);
302 // If the read didn't fill up the whole buffer, check to see if it was
303 // because of an error.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800304 if (ret < result_size - read) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800305 if (ferror(pipe)) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800306 LOG(FATAL, "couldn't finish reading output of \"%s\"\n",
307 command.c_str());
308 }
309 }
310 read += ret;
311 if (read > 0 && result.get()[read - 1] == '\n') {
312 break;
313 }
314
Brian Silverman5cc661b2013-02-27 15:23:36 -0800315 if (feof(pipe)) {
316 LOG(FATAL, "`%s` failed. didn't print a whole line\n", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800317 }
318 }
319
Brian Silverman5cc661b2013-02-27 15:23:36 -0800320 // Get rid of the first \n and anything after it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800321 *strchrnul(result.get(), '\n') = '\0';
322
Brian Silverman5cc661b2013-02-27 15:23:36 -0800323 int child_status = pclose(pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800324 if (child_status == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800325 LOG(FATAL, "pclose(%p) failed with %d: %s\n", pipe,
Brian Silvermand169fcd2013-02-27 13:18:47 -0800326 errno, strerror(errno));
327 }
328
329 if (child_status != 0) {
330 LOG(FATAL, "`%s` failed. return %d\n", command.c_str(), child_status);
331 }
332
333 return std::string(result.get());
334}
335
336// Will call callback(arg) after time.
337void Timeout(time::Time time, void (*callback)(int, short, void *), void *arg) {
338 EventUniquePtr timeout(evtimer_new(libevent_base.get(), callback, arg));
339 struct timeval time_timeval = time.ToTimeval();
Brian Silvermand94642c2014-03-27 18:21:41 -0700340 if (evtimer_add(timeout.release(), &time_timeval) != 0) {
341 LOG(FATAL, "evtimer_add(%p, %p) failed\n",
342 timeout.release(), &time_timeval);
343 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800344}
345
Brian Silvermand94642c2014-03-27 18:21:41 -0700346class Child;
347// This is where all of the Child instances except core live.
348std::vector<unique_ptr<Child>> children;
349// A global place to hold on to which child is core.
350unique_ptr<Child> core;
351
Brian Silvermand169fcd2013-02-27 13:18:47 -0800352// Represents a child process. It will take care of restarting itself etc.
353class Child {
354 public:
Brian Silverman5cc661b2013-02-27 15:23:36 -0800355 // command is the (space-separated) command to run and its arguments.
356 Child(const std::string &command) : pid_(-1),
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800357 stat_at_start_valid_(false) {
Brian Silvermand94642c2014-03-27 18:21:41 -0700358 if (!restart_timeout) {
359 restart_timeout = EventUniquePtr(
360 evtimer_new(libevent_base.get(), StaticDoRestart, nullptr));
361 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800362 const char *start, *end;
363 start = command.c_str();
364 while (true) {
365 end = strchrnul(start, ' ');
366 args_.push_back(std::string(start, end - start));
367 start = end + 1;
368 if (*end == '\0') {
369 break;
370 }
371 }
372
Brian Silverman5cc661b2013-02-27 15:23:36 -0800373 original_binary_ = RunCommand("which " + args_[0]);
374 binary_ = original_binary_ + ".stm";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800375
376 watcher_ = unique_ptr<FileWatch>(
Brian Silverman5cc661b2013-02-27 15:23:36 -0800377 new FileWatch(original_binary_, StaticFileModified, this));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800378
379 Start();
380 }
381
382 pid_t pid() { return pid_; }
383
384 // This gets called whenever the actual process dies and should (probably) be
385 // restarted.
386 void ProcessDied() {
387 pid_ = -1;
388 restarts_.push(time::Time::Now());
389 if (restarts_.size() > kMaxRestartsNumber) {
390 time::Time oldest = restarts_.front();
391 restarts_.pop();
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700392 if ((time::Time::Now() - oldest) <= kMaxRestartsTime) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800393 LOG(WARNING, "process %s getting restarted too often\n", name());
394 Timeout(kResumeWait, StaticStart, this);
395 return;
396 }
397 }
398 Start();
399 }
400
401 // Returns a name for logging purposes.
402 const char *name() {
403 return args_[0].c_str();
404 }
405
406 private:
407 struct CheckDiedStatus {
408 Child *self;
409 pid_t old_pid;
410 };
411
412 // How long to wait for a child to die nicely.
Brian Silvermand94642c2014-03-27 18:21:41 -0700413 static constexpr time::Time kProcessDieTime = time::Time::InSeconds(2);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800414
415 // How long to wait after the file is modified to restart it.
416 // This is important because some programs like modifying the binaries by
417 // writing them in little bits, which results in attempting to start partial
418 // binaries without this.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700419 static constexpr time::Time kRestartWaitTime = time::Time::InSeconds(1.5);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800420
Brian Silverman5cc661b2013-02-27 15:23:36 -0800421 // Only kMaxRestartsNumber restarts will be allowed in kMaxRestartsTime.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700422 static constexpr time::Time kMaxRestartsTime = time::Time::InSeconds(4);
423 static const size_t kMaxRestartsNumber = 3;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800424 // How long to wait if it gets restarted too many times.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700425 static constexpr time::Time kResumeWait = time::Time::InSeconds(5);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800426
Brian Silvermand169fcd2013-02-27 13:18:47 -0800427 static void StaticFileModified(void *self) {
428 static_cast<Child *>(self)->FileModified();
429 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800430
Brian Silvermand169fcd2013-02-27 13:18:47 -0800431 void FileModified() {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700432 LOG(DEBUG, "file for %s modified\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800433 struct timeval restart_time_timeval = kRestartWaitTime.ToTimeval();
434 // This will reset the timeout again if it hasn't run yet.
Brian Silvermand94642c2014-03-27 18:21:41 -0700435 if (evtimer_add(restart_timeout.get(), &restart_time_timeval) != 0) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700436 LOG(FATAL, "evtimer_add(%p, %p) failed\n",
Brian Silvermand94642c2014-03-27 18:21:41 -0700437 restart_timeout.get(), &restart_time_timeval);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700438 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700439 waiting_to_restart.insert(this);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800440 }
441
Brian Silvermand94642c2014-03-27 18:21:41 -0700442 static void StaticDoRestart(int, short, void *) {
443 LOG(DEBUG, "restarting everything that needs it\n");
444 if (waiting_to_restart.find(core.get()) != waiting_to_restart.end()) {
445 core->DoRestart();
446 waiting_to_restart.erase(core.get());
447 }
448 for (auto c : waiting_to_restart) {
449 c->DoRestart();
450 }
451 waiting_to_restart.clear();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800452 }
453
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800454 // Called after somebody else has finished modifying the file.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800455 void DoRestart() {
Brian Silvermand94642c2014-03-27 18:21:41 -0700456 fprintf(stderr, "DoRestart(%s)\n", binary_.c_str());
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800457 if (stat_at_start_valid_) {
458 struct stat current_stat;
459 if (stat(original_binary_.c_str(), &current_stat) == -1) {
460 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
461 original_binary_.c_str(), &current_stat, errno, strerror(errno));
462 }
463 if (current_stat.st_mtime == stat_at_start_.st_mtime) {
464 LOG(DEBUG, "ignoring trigger for %s because mtime didn't change\n",
465 name());
466 return;
467 }
468 }
469
Brian Silvermand94642c2014-03-27 18:21:41 -0700470 if (this == core.get()) {
471 fprintf(stderr, "Restarting core -> exiting now.\n");
472 exit(0);
473 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800474 if (pid_ != -1) {
475 LOG(DEBUG, "sending SIGTERM to child %d to restart it\n", pid_);
476 if (kill(pid_, SIGTERM) == -1) {
477 LOG(WARNING, "kill(%d, SIGTERM) failed with %d: %s\n",
478 pid_, errno, strerror(errno));
479 }
480 CheckDiedStatus *status = new CheckDiedStatus();
481 status->self = this;
482 status->old_pid = pid_;
483 Timeout(kProcessDieTime, StaticCheckDied, status);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700484 } else {
485 LOG(WARNING, "%s restart attempted but not running\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800486 }
487 }
488
489 static void StaticCheckDied(int, short, void *status_in) {
490 CheckDiedStatus *status = static_cast<CheckDiedStatus *>(status_in);
491 status->self->CheckDied(status->old_pid);
492 delete status;
493 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800494
495 // Checks to see if the child using the PID old_pid is still running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800496 void CheckDied(pid_t old_pid) {
497 if (pid_ == old_pid) {
498 LOG(WARNING, "child %d refused to die\n", old_pid);
499 if (kill(old_pid, SIGKILL) == -1) {
500 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
501 old_pid, errno, strerror(errno));
502 }
503 }
504 }
505
506 static void StaticStart(int, short, void *self) {
507 static_cast<Child *>(self)->Start();
508 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800509
510 // Actually starts the child.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800511 void Start() {
512 if (pid_ != -1) {
513 LOG(WARNING, "calling Start() but already have child %d running\n",
514 pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800515 if (kill(pid_, SIGKILL) == -1) {
516 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
517 pid_, errno, strerror(errno));
518 return;
519 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800520 pid_ = -1;
521 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800522
523 // Remove the name that we run from (ie from a previous execution) and then
524 // hard link the real filename to it.
525 if (unlink(binary_.c_str()) != 0 && errno != ENOENT) {
526 LOG(FATAL, "removing %s failed because of %d: %s\n",
527 binary_.c_str(), errno, strerror(errno));
528 }
529 if (link(original_binary_.c_str(), binary_.c_str()) != 0) {
530 LOG(FATAL, "link('%s', '%s') failed because of %d: %s\n",
531 original_binary_.c_str(), binary_.c_str(), errno, strerror(errno));
532 }
533
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800534 if (stat(original_binary_.c_str(), &stat_at_start_) == -1) {
535 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
536 original_binary_.c_str(), &stat_at_start_, errno, strerror(errno));
537 }
538 stat_at_start_valid_ = true;
539
Brian Silvermand169fcd2013-02-27 13:18:47 -0800540 if ((pid_ = fork()) == 0) {
541 ssize_t args_size = args_.size();
542 const char **argv = new const char *[args_size + 1];
543 for (int i = 0; i < args_size; ++i) {
544 argv[i] = args_[i].c_str();
545 }
546 argv[args_size] = NULL;
547 // The const_cast is safe because no code that might care if it gets
548 // modified can run afterwards.
549 execv(binary_.c_str(), const_cast<char **>(argv));
550 LOG(FATAL, "execv(%s, %p) failed with %d: %s\n",
551 binary_.c_str(), argv, errno, strerror(errno));
552 _exit(EXIT_FAILURE);
553 }
554 if (pid_ == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800555 LOG(FATAL, "forking to run \"%s\" failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800556 binary_.c_str(), errno, strerror(errno));
557 }
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700558 LOG(DEBUG, "started \"%s\" successfully\n", binary_.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800559 }
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800560
561 // A history of the times that this process has been restarted.
562 std::queue<time::Time, std::list<time::Time>> restarts_;
563
564 // The currently running child's PID or NULL.
565 pid_t pid_;
566
567 // All of the arguments (including the name of the binary).
568 std::deque<std::string> args_;
569
570 // The name of the real binary that we were told to run.
571 std::string original_binary_;
572 // The name of the file that we're actually running.
573 std::string binary_;
574
575 // Watches original_binary_.
576 unique_ptr<FileWatch> watcher_;
577
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800578 // Captured from the original file when we most recently started a new child
579 // process. Used to see if it actually changes or not.
580 struct stat stat_at_start_;
581 bool stat_at_start_valid_;
582
Brian Silvermand94642c2014-03-27 18:21:41 -0700583 // An event that restarts after kRestartWaitTime.
584 static EventUniquePtr restart_timeout;
585
586 // The set of children waiting to be restarted once all modifications stop.
587 static ::std::set<Child *> waiting_to_restart;
588
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800589 DISALLOW_COPY_AND_ASSIGN(Child);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800590};
Brian Silverman52aeeac2013-08-28 16:20:53 -0700591
592constexpr time::Time Child::kProcessDieTime;
593constexpr time::Time Child::kRestartWaitTime;
594constexpr time::Time Child::kMaxRestartsTime;
595constexpr time::Time Child::kResumeWait;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800596
Brian Silvermand94642c2014-03-27 18:21:41 -0700597EventUniquePtr Child::restart_timeout;
598::std::set<Child *> Child::waiting_to_restart;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800599
Brian Silverman5cc661b2013-02-27 15:23:36 -0800600// Kills off the entire process group (including ourself).
601void KillChildren(bool try_nice) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800602 if (try_nice) {
603 static const int kNiceStopSignal = SIGTERM;
604 static const time::Time kNiceWaitTime = time::Time::InSeconds(1);
605
606 // Make sure that we don't just nicely stop ourself...
607 sigset_t mask;
608 sigemptyset(&mask);
609 sigaddset(&mask, kNiceStopSignal);
610 sigprocmask(SIG_BLOCK, &mask, NULL);
611
Brian Silverman5cc661b2013-02-27 15:23:36 -0800612 kill(-getpid(), kNiceStopSignal);
613
614 fflush(NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800615 time::SleepFor(kNiceWaitTime);
616 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800617
Brian Silvermand169fcd2013-02-27 13:18:47 -0800618 // Send SIGKILL to our whole process group, which will forcibly terminate any
619 // of them that are still running (us for sure, maybe more too).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800620 kill(-getpid(), SIGKILL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800621}
622
Brian Silverman5cc661b2013-02-27 15:23:36 -0800623void ExitHandler() {
624 KillChildren(true);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800625}
Brian Silverman5cc661b2013-02-27 15:23:36 -0800626
627void KillChildrenSignalHandler(int signum) {
628 // If we get SIGSEGV or some other random signal who knows what's happening
629 // and we should just kill everybody immediately.
630 // This is a list of all of the signals that mean some form of "nicely stop".
631 KillChildren(signum == SIGHUP || signum == SIGINT || signum == SIGQUIT ||
Brian Silverman0eec9532013-02-27 20:24:16 -0800632 signum == SIGABRT || signum == SIGPIPE || signum == SIGTERM ||
633 signum == SIGXCPU);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800634}
635
Brian Silverman5cc661b2013-02-27 15:23:36 -0800636// Returns the currently running child with PID pid or an empty unique_ptr.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800637const unique_ptr<Child> &FindChild(pid_t pid) {
638 for (auto it = children.begin(); it != children.end(); ++it) {
639 if (pid == (*it)->pid()) {
640 return *it;
641 }
642 }
643
644 if (pid == core->pid()) {
645 return core;
646 }
647
Brian Silverman5cc661b2013-02-27 15:23:36 -0800648 static const unique_ptr<Child> kNothing;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800649 return kNothing;
650}
651
Brian Silverman5cc661b2013-02-27 15:23:36 -0800652// Gets set up as a libevent handler for SIGCHLD.
653// Handles calling Child::ProcessDied() on the appropriate one.
654void SigCHLDReceived(int /*fd*/, short /*events*/, void *) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800655 // In a while loop in case we miss any SIGCHLDs.
656 while (true) {
657 siginfo_t infop;
658 infop.si_pid = 0;
659 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) != 0) {
660 LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno));
Brian Silverman5cc661b2013-02-27 15:23:36 -0800661 continue;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800662 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800663 // If there are no more child process deaths to process.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800664 if (infop.si_pid == 0) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800665 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800666 }
667
668 pid_t pid = infop.si_pid;
669 int status = infop.si_status;
670 const unique_ptr<Child> &child = FindChild(pid);
671 if (child) {
672 switch (infop.si_code) {
673 case CLD_EXITED:
674 LOG(WARNING, "child %d (%s) exited with status %d\n",
675 pid, child->name(), status);
676 break;
677 case CLD_DUMPED:
678 LOG(INFO, "child %d actually dumped core. "
679 "falling through to killed by signal case\n", pid);
680 case CLD_KILLED:
681 // If somebody (possibly us) sent it SIGTERM that means that they just
682 // want it to stop, so it stopping isn't a WARNING.
683 LOG((status == SIGTERM) ? DEBUG : WARNING,
684 "child %d (%s) was killed by signal %d (%s)\n",
685 pid, child->name(), status,
686 strsignal(status));
687 break;
688 case CLD_STOPPED:
689 LOG(WARNING, "child %d (%s) was stopped by signal %d "
690 "(giving it a SIGCONT(%d))\n",
691 pid, child->name(), status, SIGCONT);
692 kill(pid, SIGCONT);
693 continue;
694 default:
695 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
696 pid, child->name());
697 kill(pid, SIGKILL);
698 continue;
699 }
700 } else {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800701 LOG(WARNING, "couldn't find a Child for pid %d\n", pid);
702 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800703 }
704
Brian Silverman5cc661b2013-02-27 15:23:36 -0800705 if (child == core) {
706 LOG(FATAL, "core died\n");
707 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800708 child->ProcessDied();
709 }
710}
711
// The name of the file listing the processes to start.
// main sets this and Run reads it later; it is only a global because there is
// no other path to pass it between the two.
const char *child_list_file = NULL;
715
Brian Silverman8070a222013-02-28 15:01:36 -0800716void Run(void *watch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800717void Main() {
718 logging::Init();
Brian Silverman0eec9532013-02-27 20:24:16 -0800719 // TODO(brians): tell logging that using the root logger from here until we
Brian Silvermand169fcd2013-02-27 13:18:47 -0800720 // bring up shm is ok
721
Brian Silverman5cc661b2013-02-27 15:23:36 -0800722 if (setpgid(0 /*self*/, 0 /*make PGID the same as PID*/) != 0) {
723 LOG(FATAL, "setpgid(0, 0) failed with %d: %s\n", errno, strerror(errno));
724 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800725
726 // Make sure that we kill all children when we exit.
Brian Silverman5cc661b2013-02-27 15:23:36 -0800727 atexit(ExitHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800728 // Do it on some signals too (ones that we otherwise tend to receive and then
729 // leave all of our children going).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800730 signal(SIGHUP, KillChildrenSignalHandler);
731 signal(SIGINT, KillChildrenSignalHandler);
732 signal(SIGQUIT, KillChildrenSignalHandler);
733 signal(SIGILL, KillChildrenSignalHandler);
734 signal(SIGABRT, KillChildrenSignalHandler);
735 signal(SIGFPE, KillChildrenSignalHandler);
736 signal(SIGSEGV, KillChildrenSignalHandler);
737 signal(SIGPIPE, KillChildrenSignalHandler);
738 signal(SIGTERM, KillChildrenSignalHandler);
739 signal(SIGBUS, KillChildrenSignalHandler);
740 signal(SIGXCPU, KillChildrenSignalHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800741
742 libevent_base = EventBaseUniquePtr(event_base_new());
743
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800744 std::string core_touch_file = "/tmp/starter.";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800745 core_touch_file += std::to_string(static_cast<intmax_t>(getpid()));
746 core_touch_file += ".core_touch_file";
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800747 if (system(("touch '" + core_touch_file + "'").c_str()) != 0) {
748 LOG(FATAL, "running `touch '%s'` failed\n", core_touch_file.c_str());
749 }
750 FileWatch core_touch_file_watch(core_touch_file, Run, NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800751 core = unique_ptr<Child>(
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800752 new Child("core " + core_touch_file));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800753
754 FILE *pid_file = fopen("/tmp/starter.pid", "w");
755 if (pid_file == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800756 LOG(FATAL, "fopen(\"/tmp/starter.pid\", \"w\") failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800757 errno, strerror(errno));
758 } else {
759 if (fprintf(pid_file, "%d", core->pid()) == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800760 LOG(WARNING, "fprintf(%p, \"%%d\", %d) failed with %d: %s\n",
761 pid_file, core->pid(), errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800762 }
763 fclose(pid_file);
764 }
765
766 LOG(INFO, "waiting for %s to appear\n", core_touch_file.c_str());
767
768 event_base_dispatch(libevent_base.get());
769 LOG(FATAL, "event_base_dispatch(%p) returned\n", libevent_base.get());
770}
771
Brian Silverman0eec9532013-02-27 20:24:16 -0800772// This is the callback for when core creates the file indicating that it has
773// started.
774void Run(void *watch) {
775 // Make it so it doesn't keep on seeing random changes in /tmp.
776 static_cast<FileWatch *>(watch)->RemoveWatch();
777
778 // It's safe now because core is up.
779 aos::InitNRT();
780
781 std::ifstream list_file(child_list_file);
782
783 while (true) {
784 std::string child_name;
785 getline(list_file, child_name);
786 if ((list_file.rdstate() & std::ios_base::eofbit) != 0) {
787 break;
788 }
789 if (list_file.rdstate() != 0) {
790 LOG(FATAL, "reading input file %s failed\n", child_list_file);
791 }
792 children.push_back(unique_ptr<Child>(new Child(child_name)));
793 }
794
795 EventUniquePtr sigchld(event_new(libevent_base.get(), SIGCHLD,
796 EV_SIGNAL | EV_PERSIST,
797 SigCHLDReceived, NULL));
798 event_add(sigchld.release(), NULL);
799}
800
// The part of the usage message that comes after the program name: a synopsis
// of the arguments followed by a description of them.
const char *kArgsHelp =
    "[OPTION]... START_LIST\n"
    "Start all of the robot code binaries in START_LIST.\n"
    "\n"
    "START_LIST is the file to read binaries (looked up on PATH) to run.\n"
    " --help display this help and exit\n";
806void PrintHelp() {
807 fprintf(stderr, "Usage: %s %s", program_invocation_name, kArgsHelp);
808}
809
Brian Silvermand169fcd2013-02-27 13:18:47 -0800810} // namespace starter
811} // namespace aos
812
813int main(int argc, char *argv[]) {
Brian Silverman8070a222013-02-28 15:01:36 -0800814 if (argc != 2) {
815 aos::starter::PrintHelp();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800816 exit(EXIT_FAILURE);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800817 }
Brian Silverman8070a222013-02-28 15:01:36 -0800818 if (strcmp(argv[1], "--help") == 0) {
819 aos::starter::PrintHelp();
820 exit(EXIT_SUCCESS);
821 }
822
Brian Silvermand169fcd2013-02-27 13:18:47 -0800823 aos::starter::child_list_file = argv[1];
824
825 aos::starter::Main();
826}