blob: 6860cc446e08e426b0ee2feefc3beaf56dfb51f9 [file] [log] [blame]
// This has to come before anybody drags in <stdlib.h>, or else we end up with
// the wrong version of WIFEXITED etc. (for one thing, they don't const-qualify
// their casts) (sometimes, at least).
4#include <sys/wait.h>
5
Brian Silvermand169fcd2013-02-27 13:18:47 -08006#include <stdio.h>
7#include <stdlib.h>
8#include <sys/types.h>
9#include <fcntl.h>
10#include <sys/inotify.h>
11#include <sys/stat.h>
12#include <sys/ioctl.h>
Brian Silvermand169fcd2013-02-27 13:18:47 -080013#include <signal.h>
14#include <stdint.h>
15#include <errno.h>
16#include <string.h>
Brian Silvermand90b5fe2013-03-10 18:34:42 -070017#include <inttypes.h>
Brian Silvermand169fcd2013-02-27 13:18:47 -080018
19#include <map>
20#include <functional>
21#include <deque>
22#include <fstream>
23#include <queue>
24#include <list>
25#include <string>
26#include <vector>
27#include <memory>
Brian Silvermand94642c2014-03-27 18:21:41 -070028#include <set>
Brian Silvermand169fcd2013-02-27 13:18:47 -080029
Brian Silverman258b9172015-09-19 14:32:57 -040030#include "third_party/libevent/event.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080031
John Park33858a32018-09-28 23:05:48 -070032#include "aos/libc/aos_strsignal.h"
33#include "aos/logging/implementations.h"
34#include "aos/logging/logging.h"
35#include "aos/time/time.h"
36#include "aos/unique_malloc_ptr.h"
37#include "aos/util/run_command.h"
John Park398c74a2018-10-20 21:17:39 -070038#include "aos/init.h"
Sabina Davis2ed5ea22017-09-26 22:27:42 -070039#include "aos/once.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080040
// This is the main piece of code that starts all of the rest of the code and
// restarts it when the binaries are modified.
//
// Throughout, the code is not terribly concerned with thread safety because
// there is only 1 thread. It does some setup and then lets inotify run things
// when appropriate.
//
// NOTE: This program should never exit nicely. It catches all nice attempts to
// exit, forwards them to all of the children that it has started, waits for
// them to exit nicely, and then SIGKILLs anybody left (which will always
// include itself).
52
53using ::std::unique_ptr;
54
55namespace aos {
56namespace starter {
57
Austin Schuhf2a50ba2016-12-24 16:16:26 -080058namespace chrono = ::std::chrono;
59
Brian Silverman0eec9532013-02-27 20:24:16 -080060// TODO(brians): split out the c++ libevent wrapper stuff into its own file(s)
Brian Silvermand169fcd2013-02-27 13:18:47 -080061class EventBaseDeleter {
62 public:
63 void operator()(event_base *base) {
Brian Silverman8070a222013-02-28 15:01:36 -080064 if (base == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080065 event_base_free(base);
66 }
67};
68typedef unique_ptr<event_base, EventBaseDeleter> EventBaseUniquePtr;
Brian Silverman5cc661b2013-02-27 15:23:36 -080069EventBaseUniquePtr libevent_base;
Brian Silvermand169fcd2013-02-27 13:18:47 -080070
71class EventDeleter {
72 public:
73 void operator()(event *evt) {
Brian Silverman8070a222013-02-28 15:01:36 -080074 if (evt == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080075 if (event_del(evt) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -070076 AOS_LOG(WARNING, "event_del(%p) failed\n", evt);
Brian Silvermand169fcd2013-02-27 13:18:47 -080077 }
78 }
79};
80typedef unique_ptr<event, EventDeleter> EventUniquePtr;
81
Brian Silverman5cc661b2013-02-27 15:23:36 -080082// Watches a file path for modifications. Once created, keeps watching until
83// destroyed or RemoveWatch() is called.
Brian Silverman0eec9532013-02-27 20:24:16 -080084// TODO(brians): split this out into its own file + tests
Brian Silvermand169fcd2013-02-27 13:18:47 -080085class FileWatch {
86 public:
87 // Will call callback(value) when filename is modified.
88 // If value is NULL, then a pointer to this object will be passed instead.
Brian Silverman5cc661b2013-02-27 15:23:36 -080089 //
90 // Watching for file creations is slightly different. To do that, pass true
Brian Silverman8070a222013-02-28 15:01:36 -080091 // as create, the directory where the file will be created for filename, and
Brian Silverman5cc661b2013-02-27 15:23:36 -080092 // the name of the file (without directory name) for check_filename.
Brian Silvermand169fcd2013-02-27 13:18:47 -080093 FileWatch(std::string filename,
Brian Silverman8070a222013-02-28 15:01:36 -080094 std::function<void(void *)> callback,
95 void *value,
96 bool create = false,
97 std::string check_filename = "")
98 : filename_(filename),
99 callback_(callback),
100 value_(value),
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700101 create_(create),
102 check_filename_(check_filename),
103 watch_(-1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800104 init_once.Get();
105
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700106 CreateWatch();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800107 }
108 // Cleans up everything.
109 ~FileWatch() {
110 if (watch_ != -1) {
111 RemoveWatch();
112 }
113 }
114
115 // After calling this method, this object won't really be doing much of
Brian Silverman5cc661b2013-02-27 15:23:36 -0800116 // anything besides possibly running its callback or something.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800117 void RemoveWatch() {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700118 AOS_CHECK_NE(watch_, -1);
119 AOS_CHECK_EQ(watch_to_remove_, -1);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800120
Brian Silvermand169fcd2013-02-27 13:18:47 -0800121 if (inotify_rm_watch(notify_fd, watch_) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700122 AOS_PLOG(WARNING, "inotify_rm_watch(%d, %d) failed", notify_fd, watch_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800123 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700124 watch_to_remove_ = watch_;
125 watch_ = -1;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800126 }
127
Brian Silverman5cc661b2013-02-27 15:23:36 -0800128 private:
129 // Performs the static initialization. Called by init_once from the
130 // constructor.
131 static void *Init() {
132 notify_fd = inotify_init1(IN_CLOEXEC);
133 EventUniquePtr notify_event(event_new(libevent_base.get(), notify_fd,
134 EV_READ | EV_PERSIST,
135 FileWatch::INotifyReadable, NULL));
136 event_add(notify_event.release(), NULL);
137 return NULL;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800138 }
139
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700140 void RemoveWatchFromMap() {
Brian Silvermand94642c2014-03-27 18:21:41 -0700141 int watch = watch_to_remove_;
142 if (watch == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700143 AOS_CHECK_NE(watch_, -1);
Brian Silvermand94642c2014-03-27 18:21:41 -0700144 watch = watch_;
145 }
146 if (watchers[watch] != this) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700147 AOS_LOG(WARNING, "watcher for %s (%p) didn't find itself in the map\n",
148 filename_.c_str(), this);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700149 } else {
Brian Silvermand94642c2014-03-27 18:21:41 -0700150 watchers.erase(watch);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700151 }
Austin Schuhf257f3c2019-10-27 21:00:43 -0700152 AOS_LOG(DEBUG, "removed watch ID %d\n", watch);
Brian Silvermand94642c2014-03-27 18:21:41 -0700153 if (watch_to_remove_ == -1) {
154 watch_ = -1;
155 } else {
156 watch_to_remove_ = -1;
157 }
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700158 }
159
160 void CreateWatch() {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700161 AOS_CHECK_EQ(watch_, -1);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700162 watch_ = inotify_add_watch(notify_fd, filename_.c_str(),
163 create_ ? IN_CREATE : (IN_ATTRIB |
164 IN_MODIFY |
165 IN_DELETE_SELF |
166 IN_MOVE_SELF));
167 if (watch_ == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700168 AOS_PLOG(FATAL,
169 "inotify_add_watch(%d, %s,"
170 " %s ? IN_CREATE : (IN_ATTRIB | IN_MODIFY)) failed",
171 notify_fd, filename_.c_str(), create_ ? "true" : "false");
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700172 }
173 watchers[watch_] = this;
Austin Schuhf257f3c2019-10-27 21:00:43 -0700174 AOS_LOG(DEBUG, "watch for %s is %d\n", filename_.c_str(), watch_);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700175 }
176
Brian Silvermand169fcd2013-02-27 13:18:47 -0800177 // This gets set up as the callback for EV_READ on the inotify file
Brian Silverman5cc661b2013-02-27 15:23:36 -0800178 // descriptor. It calls FileNotified on the appropriate instance.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800179 static void INotifyReadable(int /*fd*/, short /*events*/, void *) {
180 unsigned int to_read;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800181 // Use FIONREAD to figure out how many bytes there are to read.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800182 if (ioctl(notify_fd, FIONREAD, &to_read) < 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700183 AOS_PLOG(FATAL, "FIONREAD(%d, %p) failed", notify_fd, &to_read);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800184 }
185 inotify_event *notifyevt = static_cast<inotify_event *>(malloc(to_read));
186 const char *end = reinterpret_cast<char *>(notifyevt) + to_read;
187 aos::unique_c_ptr<inotify_event> freer(notifyevt);
188
189 ssize_t ret = read(notify_fd, notifyevt, to_read);
190 if (ret < 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700191 AOS_PLOG(FATAL, "read(%d, %p, %u) failed", notify_fd, notifyevt, to_read);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800192 }
193 if (static_cast<size_t>(ret) != to_read) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700194 AOS_LOG(ERROR, "read(%d, %p, %u) returned %zd instead of %u\n", notify_fd,
195 notifyevt, to_read, ret, to_read);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800196 return;
197 }
198
Brian Silverman5cc661b2013-02-27 15:23:36 -0800199 // Keep looping through until we get to the end because inotify does return
200 // multiple events at once.
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800201 while (reinterpret_cast<char *>(notifyevt) < end) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800202 if (watchers.count(notifyevt->wd) != 1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700203 AOS_LOG(WARNING, "couldn't find whose watch ID %d is\n", notifyevt->wd);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800204 } else {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700205 AOS_LOG(DEBUG, "mask=%" PRIu32 "\n", notifyevt->mask);
Brian Silvermand94642c2014-03-27 18:21:41 -0700206 // If the watch was removed.
207 if (notifyevt->mask & IN_IGNORED) {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700208 watchers[notifyevt->wd]->WatchDeleted();
209 } else {
Brian Silvermand94642c2014-03-27 18:21:41 -0700210 watchers[notifyevt->wd]
211 ->FileNotified((notifyevt->len > 0) ? notifyevt->name : NULL);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700212 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800213 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800214
215 notifyevt = reinterpret_cast<inotify_event *>(
Brian Silvermandbdf1d02013-11-17 13:19:41 -0800216 __builtin_assume_aligned(reinterpret_cast<char *>(notifyevt) +
217 sizeof(*notifyevt) + notifyevt->len,
Brian Silvermanafc00a62014-04-21 17:51:23 -0700218 alignof(inotify_event)));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800219 }
220 }
221
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700222 // INotifyReadable calls this method whenever the watch for our file gets
223 // removed somehow.
224 void WatchDeleted() {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700225 AOS_LOG(DEBUG, "watch for %s deleted\n", filename_.c_str());
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700226 RemoveWatchFromMap();
227 CreateWatch();
228 }
229
Brian Silverman5cc661b2013-02-27 15:23:36 -0800230 // INotifyReadable calls this method whenever the watch for our file triggers.
231 void FileNotified(const char *filename) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700232 AOS_CHECK_NE(watch_, -1);
233 AOS_LOG(DEBUG, "got a notification for %s\n", filename_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800234
235 if (!check_filename_.empty()) {
236 if (filename == NULL) {
237 return;
238 }
239 if (std::string(filename) != check_filename_) {
240 return;
241 }
242 }
243
244 callback_((value_ == NULL) ? this : value_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800245 }
246
Brian Silverman5cc661b2013-02-27 15:23:36 -0800247 // To make sure that Init gets called exactly once.
248 static ::aos::Once<void> init_once;
249
Brian Silvermand169fcd2013-02-27 13:18:47 -0800250 const std::string filename_;
251 const std::function<void(void *)> callback_;
252 void *const value_;
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700253 const bool create_;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800254 std::string check_filename_;
255
256 // The watch descriptor or -1 if we don't have one any more.
257 int watch_;
Brian Silvermand94642c2014-03-27 18:21:41 -0700258 // The watch that we still have to take out of the map once we get the
259 // IN_IGNORED or -1.
260 int watch_to_remove_ = -1;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800261
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800262 // Map from watch IDs to instances of this class.
263 // <https://patchwork.kernel.org/patch/73192/> ("inotify: do not reuse watch
264 // descriptors") says they won't get reused, but that shouldn't be counted on
265 // because we might have a modified/different version/whatever kernel.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800266 static std::map<int, FileWatch *> watchers;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800267 // The inotify(7) file descriptor.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800268 static int notify_fd;
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800269
270 DISALLOW_COPY_AND_ASSIGN(FileWatch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800271};
Brian Silverman5cc661b2013-02-27 15:23:36 -0800272::aos::Once<void> FileWatch::init_once(FileWatch::Init);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800273std::map<int, FileWatch *> FileWatch::watchers;
274int FileWatch::notify_fd;
275
Brian Silverman5cc661b2013-02-27 15:23:36 -0800276// Runs the given command and returns its first line of output (not including
277// the \n). LOG(FATAL)s if the command has an exit status other than 0 or does
278// not print out an entire line.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800279std::string RunCommand(std::string command) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800280 // popen(3) might fail and not set it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800281 errno = 0;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800282 FILE *pipe = popen(command.c_str(), "r");
283 if (pipe == NULL) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700284 AOS_PLOG(FATAL, "popen(\"%s\", \"r\") failed", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800285 }
286
Brian Silverman5cc661b2013-02-27 15:23:36 -0800287 // result_size is how many bytes result is currently allocated to.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800288 size_t result_size = 128, read = 0;
289 unique_c_ptr<char> result(static_cast<char *>(malloc(result_size)));
290 while (true) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800291 // If we filled up the buffer, then realloc(3) it bigger.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800292 if (read == result_size) {
293 result_size *= 2;
294 void *new_result = realloc(result.get(), result_size);
295 if (new_result == NULL) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700296 AOS_PLOG(FATAL, "realloc(%p, %zd) failed", result.get(), result_size);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800297 } else {
298 result.release();
299 result = unique_c_ptr<char>(static_cast<char *>(new_result));
300 }
301 }
302
Brian Silverman5cc661b2013-02-27 15:23:36 -0800303 size_t ret = fread(result.get() + read, 1, result_size - read, pipe);
304 // If the read didn't fill up the whole buffer, check to see if it was
305 // because of an error.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800306 if (ret < result_size - read) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800307 if (ferror(pipe)) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700308 AOS_PLOG(FATAL, "couldn't finish reading output of \"%s\"\n",
309 command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800310 }
311 }
312 read += ret;
313 if (read > 0 && result.get()[read - 1] == '\n') {
314 break;
315 }
316
Brian Silverman5cc661b2013-02-27 15:23:36 -0800317 if (feof(pipe)) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700318 AOS_LOG(FATAL, "`%s` failed. didn't print a whole line\n",
319 command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800320 }
321 }
322
Brian Silverman5cc661b2013-02-27 15:23:36 -0800323 // Get rid of the first \n and anything after it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800324 *strchrnul(result.get(), '\n') = '\0';
325
Brian Silverman5cc661b2013-02-27 15:23:36 -0800326 int child_status = pclose(pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800327 if (child_status == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700328 AOS_PLOG(FATAL, "pclose(%p) failed", pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800329 }
330
331 if (child_status != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700332 AOS_LOG(FATAL, "`%s` failed. return %d\n", command.c_str(), child_status);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800333 }
334
335 return std::string(result.get());
336}
337
338// Will call callback(arg) after time.
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800339void Timeout(monotonic_clock::duration time,
340 void (*callback)(int, short, void *), void *arg) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800341 EventUniquePtr timeout(evtimer_new(libevent_base.get(), callback, arg));
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800342 struct timeval time_timeval;
343 {
344 ::std::chrono::seconds sec =
345 ::std::chrono::duration_cast<::std::chrono::seconds>(time);
346 ::std::chrono::microseconds usec =
347 ::std::chrono::duration_cast<::std::chrono::microseconds>(time - sec);
348 time_timeval.tv_sec = sec.count();
349 time_timeval.tv_usec = usec.count();
350 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700351 if (evtimer_add(timeout.release(), &time_timeval) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700352 AOS_LOG(FATAL, "evtimer_add(%p, %p) failed\n", timeout.release(),
353 &time_timeval);
Brian Silvermand94642c2014-03-27 18:21:41 -0700354 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800355}
356
Brian Silvermand94642c2014-03-27 18:21:41 -0700357class Child;
358// This is where all of the Child instances except core live.
359std::vector<unique_ptr<Child>> children;
360// A global place to hold on to which child is core.
361unique_ptr<Child> core;
362
Brian Silvermand169fcd2013-02-27 13:18:47 -0800363// Represents a child process. It will take care of restarting itself etc.
364class Child {
365 public:
Brian Silverman5cc661b2013-02-27 15:23:36 -0800366 // command is the (space-separated) command to run and its arguments.
367 Child(const std::string &command) : pid_(-1),
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800368 stat_at_start_valid_(false) {
Brian Silvermand94642c2014-03-27 18:21:41 -0700369 if (!restart_timeout) {
370 restart_timeout = EventUniquePtr(
371 evtimer_new(libevent_base.get(), StaticDoRestart, nullptr));
372 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800373 const char *start, *end;
374 start = command.c_str();
375 while (true) {
376 end = strchrnul(start, ' ');
377 args_.push_back(std::string(start, end - start));
378 start = end + 1;
379 if (*end == '\0') {
380 break;
381 }
382 }
383
Brian Silverman5cc661b2013-02-27 15:23:36 -0800384 original_binary_ = RunCommand("which " + args_[0]);
385 binary_ = original_binary_ + ".stm";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800386
387 watcher_ = unique_ptr<FileWatch>(
Brian Silverman5cc661b2013-02-27 15:23:36 -0800388 new FileWatch(original_binary_, StaticFileModified, this));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800389
390 Start();
391 }
392
393 pid_t pid() { return pid_; }
394
395 // This gets called whenever the actual process dies and should (probably) be
396 // restarted.
397 void ProcessDied() {
398 pid_ = -1;
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800399 restarts_.push(monotonic_clock::now());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800400 if (restarts_.size() > kMaxRestartsNumber) {
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800401 monotonic_clock::time_point oldest = restarts_.front();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800402 restarts_.pop();
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800403 if (monotonic_clock::now() <= kMaxRestartsTime + oldest) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700404 AOS_LOG(WARNING, "process %s getting restarted too often\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800405 Timeout(kResumeWait, StaticStart, this);
406 return;
407 }
408 }
409 Start();
410 }
411
412 // Returns a name for logging purposes.
413 const char *name() {
414 return args_[0].c_str();
415 }
416
417 private:
418 struct CheckDiedStatus {
419 Child *self;
420 pid_t old_pid;
421 };
422
423 // How long to wait for a child to die nicely.
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800424 static constexpr chrono::nanoseconds kProcessDieTime = chrono::seconds(2);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800425
426 // How long to wait after the file is modified to restart it.
427 // This is important because some programs like modifying the binaries by
428 // writing them in little bits, which results in attempting to start partial
429 // binaries without this.
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800430 static constexpr chrono::nanoseconds kRestartWaitTime =
431 chrono::milliseconds(1500);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800432
Brian Silverman5cc661b2013-02-27 15:23:36 -0800433 // Only kMaxRestartsNumber restarts will be allowed in kMaxRestartsTime.
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800434 static constexpr chrono::nanoseconds kMaxRestartsTime = chrono::seconds(4);
Brian Silverman52aeeac2013-08-28 16:20:53 -0700435 static const size_t kMaxRestartsNumber = 3;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800436 // How long to wait if it gets restarted too many times.
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800437 static constexpr chrono::nanoseconds kResumeWait = chrono::seconds(5);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800438
Brian Silvermand169fcd2013-02-27 13:18:47 -0800439 static void StaticFileModified(void *self) {
440 static_cast<Child *>(self)->FileModified();
441 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800442
Brian Silvermand169fcd2013-02-27 13:18:47 -0800443 void FileModified() {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700444 AOS_LOG(DEBUG, "file for %s modified\n", name());
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800445 struct timeval restart_time_timeval;
446 {
447 ::std::chrono::seconds sec =
448 ::std::chrono::duration_cast<::std::chrono::seconds>(
449 kRestartWaitTime);
450 ::std::chrono::microseconds usec =
451 ::std::chrono::duration_cast<::std::chrono::microseconds>(
452 kRestartWaitTime - sec);
453 restart_time_timeval.tv_sec = sec.count();
454 restart_time_timeval.tv_usec = usec.count();
455 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800456 // This will reset the timeout again if it hasn't run yet.
Brian Silvermand94642c2014-03-27 18:21:41 -0700457 if (evtimer_add(restart_timeout.get(), &restart_time_timeval) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700458 AOS_LOG(FATAL, "evtimer_add(%p, %p) failed\n", restart_timeout.get(),
459 &restart_time_timeval);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700460 }
Brian Silvermand94642c2014-03-27 18:21:41 -0700461 waiting_to_restart.insert(this);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800462 }
463
Brian Silvermand94642c2014-03-27 18:21:41 -0700464 static void StaticDoRestart(int, short, void *) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700465 AOS_LOG(DEBUG, "restarting everything that needs it\n");
Brian Silvermand94642c2014-03-27 18:21:41 -0700466 if (waiting_to_restart.find(core.get()) != waiting_to_restart.end()) {
467 core->DoRestart();
468 waiting_to_restart.erase(core.get());
469 }
470 for (auto c : waiting_to_restart) {
471 c->DoRestart();
472 }
473 waiting_to_restart.clear();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800474 }
475
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800476 // Called after somebody else has finished modifying the file.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800477 void DoRestart() {
Brian Silvermand94642c2014-03-27 18:21:41 -0700478 fprintf(stderr, "DoRestart(%s)\n", binary_.c_str());
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800479 if (stat_at_start_valid_) {
480 struct stat current_stat;
481 if (stat(original_binary_.c_str(), &current_stat) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700482 AOS_PLOG(FATAL, "stat(%s, %p) failed", original_binary_.c_str(),
483 &current_stat);
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800484 }
485 if (current_stat.st_mtime == stat_at_start_.st_mtime) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700486 AOS_LOG(DEBUG, "ignoring trigger for %s because mtime didn't change\n",
487 name());
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800488 return;
489 }
490 }
491
Brian Silvermand94642c2014-03-27 18:21:41 -0700492 if (this == core.get()) {
493 fprintf(stderr, "Restarting core -> exiting now.\n");
494 exit(0);
495 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800496 if (pid_ != -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700497 AOS_LOG(DEBUG, "sending SIGTERM to child %d to restart it\n", pid_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800498 if (kill(pid_, SIGTERM) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700499 AOS_PLOG(WARNING, "kill(%d, SIGTERM) failed", pid_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800500 }
501 CheckDiedStatus *status = new CheckDiedStatus();
502 status->self = this;
503 status->old_pid = pid_;
504 Timeout(kProcessDieTime, StaticCheckDied, status);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700505 } else {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700506 AOS_LOG(WARNING, "%s restart attempted but not running\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800507 }
508 }
509
510 static void StaticCheckDied(int, short, void *status_in) {
511 CheckDiedStatus *status = static_cast<CheckDiedStatus *>(status_in);
512 status->self->CheckDied(status->old_pid);
513 delete status;
514 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800515
516 // Checks to see if the child using the PID old_pid is still running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800517 void CheckDied(pid_t old_pid) {
518 if (pid_ == old_pid) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700519 AOS_LOG(WARNING, "child %d refused to die\n", old_pid);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800520 if (kill(old_pid, SIGKILL) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700521 AOS_PLOG(WARNING, "kill(%d, SIGKILL) failed", old_pid);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800522 }
523 }
524 }
525
526 static void StaticStart(int, short, void *self) {
527 static_cast<Child *>(self)->Start();
528 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800529
530 // Actually starts the child.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800531 void Start() {
532 if (pid_ != -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700533 AOS_LOG(WARNING, "calling Start() but already have child %d running\n",
534 pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800535 if (kill(pid_, SIGKILL) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700536 AOS_PLOG(WARNING, "kill(%d, SIGKILL) failed", pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800537 return;
538 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800539 pid_ = -1;
540 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800541
542 // Remove the name that we run from (ie from a previous execution) and then
543 // hard link the real filename to it.
544 if (unlink(binary_.c_str()) != 0 && errno != ENOENT) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700545 AOS_PLOG(FATAL, "removing %s failed", binary_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800546 }
547 if (link(original_binary_.c_str(), binary_.c_str()) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700548 AOS_PLOG(FATAL, "link('%s', '%s') failed", original_binary_.c_str(),
549 binary_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800550 }
551
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800552 if (stat(original_binary_.c_str(), &stat_at_start_) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700553 AOS_PLOG(FATAL, "stat(%s, %p) failed", original_binary_.c_str(),
554 &stat_at_start_);
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800555 }
556 stat_at_start_valid_ = true;
557
Brian Silvermand169fcd2013-02-27 13:18:47 -0800558 if ((pid_ = fork()) == 0) {
559 ssize_t args_size = args_.size();
560 const char **argv = new const char *[args_size + 1];
561 for (int i = 0; i < args_size; ++i) {
562 argv[i] = args_[i].c_str();
563 }
564 argv[args_size] = NULL;
565 // The const_cast is safe because no code that might care if it gets
566 // modified can run afterwards.
567 execv(binary_.c_str(), const_cast<char **>(argv));
Austin Schuhf257f3c2019-10-27 21:00:43 -0700568 AOS_PLOG(FATAL, "execv(%s, %p) failed", binary_.c_str(), argv);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800569 _exit(EXIT_FAILURE);
570 }
571 if (pid_ == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700572 AOS_PLOG(FATAL, "forking to run \"%s\" failed", binary_.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800573 }
Austin Schuhf257f3c2019-10-27 21:00:43 -0700574 AOS_LOG(DEBUG, "started \"%s\" successfully\n", binary_.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800575 }
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800576
577 // A history of the times that this process has been restarted.
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800578 std::queue<monotonic_clock::time_point,
579 std::list<monotonic_clock::time_point>> restarts_;
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800580
581 // The currently running child's PID or NULL.
582 pid_t pid_;
583
584 // All of the arguments (including the name of the binary).
585 std::deque<std::string> args_;
586
587 // The name of the real binary that we were told to run.
588 std::string original_binary_;
589 // The name of the file that we're actually running.
590 std::string binary_;
591
592 // Watches original_binary_.
593 unique_ptr<FileWatch> watcher_;
594
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800595 // Captured from the original file when we most recently started a new child
596 // process. Used to see if it actually changes or not.
597 struct stat stat_at_start_;
598 bool stat_at_start_valid_;
599
Brian Silvermand94642c2014-03-27 18:21:41 -0700600 // An event that restarts after kRestartWaitTime.
601 static EventUniquePtr restart_timeout;
602
603 // The set of children waiting to be restarted once all modifications stop.
604 static ::std::set<Child *> waiting_to_restart;
605
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800606 DISALLOW_COPY_AND_ASSIGN(Child);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800607};
Brian Silverman52aeeac2013-08-28 16:20:53 -0700608
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800609constexpr chrono::nanoseconds Child::kProcessDieTime;
610constexpr chrono::nanoseconds Child::kRestartWaitTime;
611constexpr chrono::nanoseconds Child::kMaxRestartsTime;
612constexpr chrono::nanoseconds Child::kResumeWait;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800613
Brian Silvermand94642c2014-03-27 18:21:41 -0700614EventUniquePtr Child::restart_timeout;
615::std::set<Child *> Child::waiting_to_restart;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800616
Brian Silverman5cc661b2013-02-27 15:23:36 -0800617// Kills off the entire process group (including ourself).
618void KillChildren(bool try_nice) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800619 if (try_nice) {
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800620 static constexpr int kNiceStopSignal = SIGTERM;
621 static constexpr auto kNiceWaitTime = chrono::seconds(1);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800622
623 // Make sure that we don't just nicely stop ourself...
624 sigset_t mask;
625 sigemptyset(&mask);
626 sigaddset(&mask, kNiceStopSignal);
627 sigprocmask(SIG_BLOCK, &mask, NULL);
628
Brian Silverman5cc661b2013-02-27 15:23:36 -0800629 kill(-getpid(), kNiceStopSignal);
630
631 fflush(NULL);
Austin Schuhf2a50ba2016-12-24 16:16:26 -0800632 ::std::this_thread::sleep_for(kNiceWaitTime);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800633 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800634
Brian Silvermand169fcd2013-02-27 13:18:47 -0800635 // Send SIGKILL to our whole process group, which will forcibly terminate any
636 // of them that are still running (us for sure, maybe more too).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800637 kill(-getpid(), SIGKILL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800638}
639
Brian Silverman5cc661b2013-02-27 15:23:36 -0800640void ExitHandler() {
641 KillChildren(true);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800642}
Brian Silverman5cc661b2013-02-27 15:23:36 -0800643
644void KillChildrenSignalHandler(int signum) {
645 // If we get SIGSEGV or some other random signal who knows what's happening
646 // and we should just kill everybody immediately.
647 // This is a list of all of the signals that mean some form of "nicely stop".
648 KillChildren(signum == SIGHUP || signum == SIGINT || signum == SIGQUIT ||
Brian Silverman0eec9532013-02-27 20:24:16 -0800649 signum == SIGABRT || signum == SIGPIPE || signum == SIGTERM ||
650 signum == SIGXCPU);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800651}
652
Brian Silverman5cc661b2013-02-27 15:23:36 -0800653// Returns the currently running child with PID pid or an empty unique_ptr.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800654const unique_ptr<Child> &FindChild(pid_t pid) {
655 for (auto it = children.begin(); it != children.end(); ++it) {
656 if (pid == (*it)->pid()) {
657 return *it;
658 }
659 }
660
661 if (pid == core->pid()) {
662 return core;
663 }
664
Brian Silverman5cc661b2013-02-27 15:23:36 -0800665 static const unique_ptr<Child> kNothing;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800666 return kNothing;
667}
668
Brian Silverman5cc661b2013-02-27 15:23:36 -0800669// Gets set up as a libevent handler for SIGCHLD.
670// Handles calling Child::ProcessDied() on the appropriate one.
671void SigCHLDReceived(int /*fd*/, short /*events*/, void *) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800672 // In a while loop in case we miss any SIGCHLDs.
673 while (true) {
674 siginfo_t infop;
675 infop.si_pid = 0;
676 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700677 AOS_PLOG(WARNING, "waitid failed");
Brian Silverman5cc661b2013-02-27 15:23:36 -0800678 continue;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800679 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800680 // If there are no more child process deaths to process.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800681 if (infop.si_pid == 0) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800682 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800683 }
684
685 pid_t pid = infop.si_pid;
686 int status = infop.si_status;
687 const unique_ptr<Child> &child = FindChild(pid);
688 if (child) {
689 switch (infop.si_code) {
690 case CLD_EXITED:
Austin Schuhf257f3c2019-10-27 21:00:43 -0700691 AOS_LOG(WARNING, "child %d (%s) exited with status %d\n", pid,
692 child->name(), status);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800693 break;
694 case CLD_DUMPED:
Austin Schuhf257f3c2019-10-27 21:00:43 -0700695 AOS_LOG(INFO,
696 "child %d actually dumped core. "
697 "falling through to killed by signal case\n",
698 pid);
James Kuszmaul3ae42262019-11-08 12:33:41 -0800699 [[fallthrough]];
700 /* FALLTHRU */
Brian Silvermand169fcd2013-02-27 13:18:47 -0800701 case CLD_KILLED:
702 // If somebody (possibly us) sent it SIGTERM that means that they just
703 // want it to stop, so it stopping isn't a WARNING.
Austin Schuhf257f3c2019-10-27 21:00:43 -0700704 AOS_LOG((status == SIGTERM) ? DEBUG : WARNING,
705 "child %d (%s) was killed by signal %d (%s)\n", pid,
706 child->name(), status, aos_strsignal(status));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800707 break;
708 case CLD_STOPPED:
Austin Schuhf257f3c2019-10-27 21:00:43 -0700709 AOS_LOG(WARNING,
710 "child %d (%s) was stopped by signal %d "
711 "(giving it a SIGCONT(%d))\n",
712 pid, child->name(), status, SIGCONT);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800713 kill(pid, SIGCONT);
714 continue;
715 default:
Austin Schuhf257f3c2019-10-27 21:00:43 -0700716 AOS_LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
717 pid, child->name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800718 kill(pid, SIGKILL);
719 continue;
720 }
721 } else {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700722 AOS_LOG(WARNING, "couldn't find a Child for pid %d\n", pid);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800723 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800724 }
725
Brian Silverman5cc661b2013-02-27 15:23:36 -0800726 if (child == core) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700727 AOS_LOG(FATAL, "core died\n");
Brian Silverman5cc661b2013-02-27 15:23:36 -0800728 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800729 child->ProcessDied();
730 }
731}
732
Brian Silverman5cc661b2013-02-27 15:23:36 -0800733// Path of the file that lists the processes to start. main() stores argv[1]
734// here and Run() later opens it.
// It is a global because Run() only receives the single void * argument that
// it is given as a FileWatch callback.
735const char *child_list_file;
736
Brian Silverman8070a222013-02-28 15:01:36 -0800737void Run(void *watch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800738void Main() {
739 logging::Init();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800740
Comran Morshed7f6ba792016-02-21 16:54:05 +0000741 // Set UID to 0 so we can run things as root down below. Since the starter
742 // program on the roborio runs starter.sh under "lvuser", it will continuously
743 // fail due to lack of permissions if we do not manually set the UID to admin.
744#ifdef AOS_ARCHITECTURE_arm_frc
745 if (setuid(0) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700746 AOS_PLOG(FATAL, "setuid(0) failed");
Comran Morshed7f6ba792016-02-21 16:54:05 +0000747 }
748#endif
749
Brian Silverman5cc661b2013-02-27 15:23:36 -0800750 if (setpgid(0 /*self*/, 0 /*make PGID the same as PID*/) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700751 AOS_PLOG(FATAL, "setpgid(0, 0) failed");
Brian Silverman5cc661b2013-02-27 15:23:36 -0800752 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800753
754 // Make sure that we kill all children when we exit.
Brian Silverman5cc661b2013-02-27 15:23:36 -0800755 atexit(ExitHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800756 // Do it on some signals too (ones that we otherwise tend to receive and then
757 // leave all of our children going).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800758 signal(SIGHUP, KillChildrenSignalHandler);
759 signal(SIGINT, KillChildrenSignalHandler);
760 signal(SIGQUIT, KillChildrenSignalHandler);
761 signal(SIGILL, KillChildrenSignalHandler);
762 signal(SIGABRT, KillChildrenSignalHandler);
763 signal(SIGFPE, KillChildrenSignalHandler);
764 signal(SIGSEGV, KillChildrenSignalHandler);
765 signal(SIGPIPE, KillChildrenSignalHandler);
766 signal(SIGTERM, KillChildrenSignalHandler);
767 signal(SIGBUS, KillChildrenSignalHandler);
768 signal(SIGXCPU, KillChildrenSignalHandler);
Brian Silverman35df22f2015-12-27 17:57:10 -0800769
770#ifdef AOS_ARCHITECTURE_arm_frc
771 // Just allow overcommit memory like usual. Various processes map memory they
772 // will never use, and the roboRIO doesn't have enough RAM to handle it.
773 // This is in here instead of starter.sh because starter.sh doesn't run with
774 // permissions on a roboRIO.
Austin Schuhf257f3c2019-10-27 21:00:43 -0700775 AOS_CHECK(system("echo 0 > /proc/sys/vm/overcommit_memory") == 0);
Brian Silverman35df22f2015-12-27 17:57:10 -0800776#endif
James Kuszmaul3ae42262019-11-08 12:33:41 -0800777
Brian Silvermand169fcd2013-02-27 13:18:47 -0800778 libevent_base = EventBaseUniquePtr(event_base_new());
779
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800780 std::string core_touch_file = "/tmp/starter.";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800781 core_touch_file += std::to_string(static_cast<intmax_t>(getpid()));
782 core_touch_file += ".core_touch_file";
Brian Silvermanaf784862014-05-13 08:14:55 -0700783 const int result =
784 ::aos::util::RunCommand(("touch '" + core_touch_file + "'").c_str());
785 if (result == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700786 AOS_PLOG(FATAL, "running `touch '%s'` failed\n", core_touch_file.c_str());
Brian Silvermanaf784862014-05-13 08:14:55 -0700787 } else if (!WIFEXITED(result) || WEXITSTATUS(result) != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700788 AOS_LOG(FATAL, "`touch '%s'` gave result %x\n", core_touch_file.c_str(),
789 result);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800790 }
791 FileWatch core_touch_file_watch(core_touch_file, Run, NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800792 core = unique_ptr<Child>(
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800793 new Child("core " + core_touch_file));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800794
795 FILE *pid_file = fopen("/tmp/starter.pid", "w");
796 if (pid_file == NULL) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700797 AOS_PLOG(FATAL, "fopen(\"/tmp/starter.pid\", \"w\") failed");
Brian Silvermand169fcd2013-02-27 13:18:47 -0800798 } else {
799 if (fprintf(pid_file, "%d", core->pid()) == -1) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700800 AOS_PLOG(WARNING, "fprintf(%p, \"%%d\", %d) failed", pid_file,
801 core->pid());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800802 }
803 fclose(pid_file);
804 }
805
Austin Schuhf257f3c2019-10-27 21:00:43 -0700806 AOS_LOG(INFO, "waiting for %s to appear\n", core_touch_file.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800807
808 event_base_dispatch(libevent_base.get());
Austin Schuhf257f3c2019-10-27 21:00:43 -0700809 AOS_LOG(FATAL, "event_base_dispatch(%p) returned\n", libevent_base.get());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800810}
811
Brian Silverman0eec9532013-02-27 20:24:16 -0800812// This is the callback for when core creates the file indicating that it has
813// started.
814void Run(void *watch) {
815 // Make it so it doesn't keep on seeing random changes in /tmp.
816 static_cast<FileWatch *>(watch)->RemoveWatch();
817
818 // It's safe now because core is up.
819 aos::InitNRT();
820
821 std::ifstream list_file(child_list_file);
James Kuszmaul3ae42262019-11-08 12:33:41 -0800822
Brian Silverman0eec9532013-02-27 20:24:16 -0800823 while (true) {
824 std::string child_name;
825 getline(list_file, child_name);
826 if ((list_file.rdstate() & std::ios_base::eofbit) != 0) {
827 break;
828 }
829 if (list_file.rdstate() != 0) {
Austin Schuhf257f3c2019-10-27 21:00:43 -0700830 AOS_LOG(FATAL, "reading input file %s failed\n", child_list_file);
Brian Silverman0eec9532013-02-27 20:24:16 -0800831 }
832 children.push_back(unique_ptr<Child>(new Child(child_name)));
833 }
834
835 EventUniquePtr sigchld(event_new(libevent_base.get(), SIGCHLD,
836 EV_SIGNAL | EV_PERSIST,
837 SigCHLDReceived, NULL));
838 event_add(sigchld.release(), NULL);
839}
840
// Everything in the usage message that follows "Usage: <program name> ".
const char *kArgsHelp =
    "[OPTION]... START_LIST\n"
    "Start all of the robot code binaries in START_LIST.\n\n"
    "START_LIST is the file to read binaries (looked up on PATH) to run.\n"
    " --help display this help and exit\n";

// Writes the usage message to stderr, using the name that this program was
// invoked with.
void PrintHelp() {
  const char *const invoked_as = program_invocation_name;
  fprintf(stderr, "Usage: %s %s", invoked_as, kArgsHelp);
}
849
Brian Silvermand169fcd2013-02-27 13:18:47 -0800850} // namespace starter
851} // namespace aos
852
853int main(int argc, char *argv[]) {
Brian Silverman8070a222013-02-28 15:01:36 -0800854 if (argc != 2) {
855 aos::starter::PrintHelp();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800856 exit(EXIT_FAILURE);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800857 }
Brian Silverman8070a222013-02-28 15:01:36 -0800858 if (strcmp(argv[1], "--help") == 0) {
859 aos::starter::PrintHelp();
860 exit(EXIT_SUCCESS);
861 }
862
Brian Silvermand169fcd2013-02-27 13:18:47 -0800863 aos::starter::child_list_file = argv[1];
864
865 aos::starter::Main();
866}