blob: 187acc39eb41ae04836bdb11e5088ec342a3b048 [file] [log] [blame]
Brian Silvermand169fcd2013-02-27 13:18:47 -08001#include <stdio.h>
2#include <stdlib.h>
3#include <sys/types.h>
4#include <fcntl.h>
5#include <sys/inotify.h>
6#include <sys/stat.h>
7#include <sys/ioctl.h>
8#include <assert.h>
9#include <signal.h>
10#include <stdint.h>
11#include <errno.h>
12#include <string.h>
13#include <sys/wait.h>
Brian Silvermand90b5fe2013-03-10 18:34:42 -070014#include <inttypes.h>
Brian Silvermand169fcd2013-02-27 13:18:47 -080015
16#include <map>
17#include <functional>
18#include <deque>
19#include <fstream>
20#include <queue>
21#include <list>
22#include <string>
23#include <vector>
24#include <memory>
25
26#include <event2/event.h>
27
28#include "aos/common/logging/logging.h"
29#include "aos/common/logging/logging_impl.h"
Brian Silverman14fd0fb2014-01-14 21:42:01 -080030#include "aos/linux_code/init.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080031#include "aos/common/unique_malloc_ptr.h"
32#include "aos/common/time.h"
Brian Silverman5cc661b2013-02-27 15:23:36 -080033#include "aos/common/once.h"
Brian Silvermand169fcd2013-02-27 13:18:47 -080034
35// This is the main piece of code that starts all of the rest of the code and
36// restarts it when the binaries are modified.
37//
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -080038// Throughout, the code is not terribly concerned with thread safety because
39// there is only 1 thread. It does some setup and then lets inotify run things
40// when appropriate.
41//
Brian Silverman5cc661b2013-02-27 15:23:36 -080042// NOTE: This program should never exit nicely. It catches all nice attempts to
43// exit, forwards them to all of the children that it has started, waits for
Brian Silvermand169fcd2013-02-27 13:18:47 -080044// them to exit nicely, and then SIGKILLs anybody left (which will always
45// include itself).
46
47using ::std::unique_ptr;
48
49namespace aos {
50namespace starter {
51
Brian Silverman0eec9532013-02-27 20:24:16 -080052// TODO(brians): split out the c++ libevent wrapper stuff into its own file(s)
Brian Silvermand169fcd2013-02-27 13:18:47 -080053class EventBaseDeleter {
54 public:
55 void operator()(event_base *base) {
Brian Silverman8070a222013-02-28 15:01:36 -080056 if (base == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080057 event_base_free(base);
58 }
59};
60typedef unique_ptr<event_base, EventBaseDeleter> EventBaseUniquePtr;
Brian Silverman5cc661b2013-02-27 15:23:36 -080061EventBaseUniquePtr libevent_base;
Brian Silvermand169fcd2013-02-27 13:18:47 -080062
63class EventDeleter {
64 public:
65 void operator()(event *evt) {
Brian Silverman8070a222013-02-28 15:01:36 -080066 if (evt == NULL) return;
Brian Silvermand169fcd2013-02-27 13:18:47 -080067 if (event_del(evt) != 0) {
68 LOG(WARNING, "event_del(%p) failed\n", evt);
69 }
70 }
71};
72typedef unique_ptr<event, EventDeleter> EventUniquePtr;
73
Brian Silverman5cc661b2013-02-27 15:23:36 -080074// Watches a file path for modifications. Once created, keeps watching until
75// destroyed or RemoveWatch() is called.
Brian Silverman0eec9532013-02-27 20:24:16 -080076// TODO(brians): split this out into its own file + tests
Brian Silvermand169fcd2013-02-27 13:18:47 -080077class FileWatch {
78 public:
79 // Will call callback(value) when filename is modified.
80 // If value is NULL, then a pointer to this object will be passed instead.
Brian Silverman5cc661b2013-02-27 15:23:36 -080081 //
82 // Watching for file creations is slightly different. To do that, pass true
Brian Silverman8070a222013-02-28 15:01:36 -080083 // as create, the directory where the file will be created for filename, and
Brian Silverman5cc661b2013-02-27 15:23:36 -080084 // the name of the file (without directory name) for check_filename.
Brian Silvermand169fcd2013-02-27 13:18:47 -080085 FileWatch(std::string filename,
Brian Silverman8070a222013-02-28 15:01:36 -080086 std::function<void(void *)> callback,
87 void *value,
88 bool create = false,
89 std::string check_filename = "")
90 : filename_(filename),
91 callback_(callback),
92 value_(value),
Brian Silvermand90b5fe2013-03-10 18:34:42 -070093 create_(create),
94 check_filename_(check_filename),
95 watch_(-1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -080096 init_once.Get();
97
Brian Silvermand90b5fe2013-03-10 18:34:42 -070098 CreateWatch();
Brian Silvermand169fcd2013-02-27 13:18:47 -080099 }
100 // Cleans up everything.
101 ~FileWatch() {
102 if (watch_ != -1) {
103 RemoveWatch();
104 }
105 }
106
107 // After calling this method, this object won't really be doing much of
Brian Silverman5cc661b2013-02-27 15:23:36 -0800108 // anything besides possibly running its callback or something.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800109 void RemoveWatch() {
110 assert(watch_ != -1);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800111
Brian Silvermand169fcd2013-02-27 13:18:47 -0800112 if (inotify_rm_watch(notify_fd, watch_) == -1) {
113 LOG(WARNING, "inotify_rm_watch(%d, %d) failed with %d: %s\n",
114 notify_fd, watch_, errno, strerror(errno));
115 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800116
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700117 RemoveWatchFromMap();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800118 }
119
Brian Silverman5cc661b2013-02-27 15:23:36 -0800120 private:
121 // Performs the static initialization. Called by init_once from the
122 // constructor.
123 static void *Init() {
124 notify_fd = inotify_init1(IN_CLOEXEC);
125 EventUniquePtr notify_event(event_new(libevent_base.get(), notify_fd,
126 EV_READ | EV_PERSIST,
127 FileWatch::INotifyReadable, NULL));
128 event_add(notify_event.release(), NULL);
129 return NULL;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800130 }
131
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700132 void RemoveWatchFromMap() {
133 if (watchers[watch_] != this) {
134 LOG(WARNING, "watcher for %s (%p) didn't find itself in the map\n",
135 filename_.c_str(), this);
136 } else {
137 watchers.erase(watch_);
138 }
139 LOG(DEBUG, "removed watch ID %d\n", watch_);
140 watch_ = -1;
141 }
142
143 void CreateWatch() {
144 assert(watch_ == -1);
145 watch_ = inotify_add_watch(notify_fd, filename_.c_str(),
146 create_ ? IN_CREATE : (IN_ATTRIB |
147 IN_MODIFY |
148 IN_DELETE_SELF |
149 IN_MOVE_SELF));
150 if (watch_ == -1) {
151 LOG(FATAL, "inotify_add_watch(%d, %s,"
152 " %s ? IN_CREATE : (IN_ATTRIB | IN_MODIFY)) failed with %d: %s\n",
153 notify_fd, filename_.c_str(), create_ ? "true" : "false",
154 errno, strerror(errno));
155 }
156 watchers[watch_] = this;
157 LOG(DEBUG, "watch for %s is %d\n", filename_.c_str(), watch_);
158 }
159
Brian Silvermand169fcd2013-02-27 13:18:47 -0800160 // This gets set up as the callback for EV_READ on the inotify file
Brian Silverman5cc661b2013-02-27 15:23:36 -0800161 // descriptor. It calls FileNotified on the appropriate instance.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800162 static void INotifyReadable(int /*fd*/, short /*events*/, void *) {
163 unsigned int to_read;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800164 // Use FIONREAD to figure out how many bytes there are to read.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800165 if (ioctl(notify_fd, FIONREAD, &to_read) < 0) {
166 LOG(FATAL, "FIONREAD(%d, %p) failed with %d: %s\n",
167 notify_fd, &to_read, errno, strerror(errno));
168 }
169 inotify_event *notifyevt = static_cast<inotify_event *>(malloc(to_read));
170 const char *end = reinterpret_cast<char *>(notifyevt) + to_read;
171 aos::unique_c_ptr<inotify_event> freer(notifyevt);
172
173 ssize_t ret = read(notify_fd, notifyevt, to_read);
174 if (ret < 0) {
175 LOG(FATAL, "read(%d, %p, %u) failed with %d: %s\n",
176 notify_fd, notifyevt, to_read, errno, strerror(errno));
177 }
178 if (static_cast<size_t>(ret) != to_read) {
179 LOG(ERROR, "read(%d, %p, %u) returned %zd instead of %u\n",
180 notify_fd, notifyevt, to_read, ret, to_read);
181 return;
182 }
183
Brian Silverman5cc661b2013-02-27 15:23:36 -0800184 // Keep looping through until we get to the end because inotify does return
185 // multiple events at once.
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800186 while (reinterpret_cast<char *>(notifyevt) < end) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800187 if (watchers.count(notifyevt->wd) != 1) {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800188 LOG(WARNING, "couldn't find whose watch ID %d is\n", notifyevt->wd);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800189 } else {
Brian Silverman8efe23e2013-07-07 23:31:37 -0700190 LOG(DEBUG, "mask=%" PRIu32 "\n", notifyevt->mask);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700191 // If it was something that means the file got deleted.
192 if (notifyevt->mask & (IN_MOVE_SELF | IN_DELETE_SELF | IN_IGNORED)) {
193 watchers[notifyevt->wd]->WatchDeleted();
194 } else {
195 watchers[notifyevt->wd]->FileNotified((notifyevt->len > 0) ?
196 notifyevt->name : NULL);
197 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800198 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800199
200 notifyevt = reinterpret_cast<inotify_event *>(
Brian Silvermandbdf1d02013-11-17 13:19:41 -0800201 __builtin_assume_aligned(reinterpret_cast<char *>(notifyevt) +
202 sizeof(*notifyevt) + notifyevt->len,
203 alignof(notifyevt)));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800204 }
205 }
206
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700207 // INotifyReadable calls this method whenever the watch for our file gets
208 // removed somehow.
209 void WatchDeleted() {
210 LOG(DEBUG, "watch for %s deleted\n", filename_.c_str());
211 RemoveWatchFromMap();
212 CreateWatch();
213 }
214
Brian Silverman5cc661b2013-02-27 15:23:36 -0800215 // INotifyReadable calls this method whenever the watch for our file triggers.
216 void FileNotified(const char *filename) {
217 assert(watch_ != -1);
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800218 LOG(DEBUG, "got a notification for %s\n", filename_.c_str());
Brian Silverman5cc661b2013-02-27 15:23:36 -0800219
220 if (!check_filename_.empty()) {
221 if (filename == NULL) {
222 return;
223 }
224 if (std::string(filename) != check_filename_) {
225 return;
226 }
227 }
228
229 callback_((value_ == NULL) ? this : value_);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800230 }
231
Brian Silverman5cc661b2013-02-27 15:23:36 -0800232 // To make sure that Init gets called exactly once.
233 static ::aos::Once<void> init_once;
234
Brian Silvermand169fcd2013-02-27 13:18:47 -0800235 const std::string filename_;
236 const std::function<void(void *)> callback_;
237 void *const value_;
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700238 const bool create_;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800239 std::string check_filename_;
240
241 // The watch descriptor or -1 if we don't have one any more.
242 int watch_;
243
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800244 // Map from watch IDs to instances of this class.
245 // <https://patchwork.kernel.org/patch/73192/> ("inotify: do not reuse watch
246 // descriptors") says they won't get reused, but that shouldn't be counted on
247 // because we might have a modified/different version/whatever kernel.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800248 static std::map<int, FileWatch *> watchers;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800249 // The inotify(7) file descriptor.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800250 static int notify_fd;
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800251
252 DISALLOW_COPY_AND_ASSIGN(FileWatch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800253};
Brian Silverman5cc661b2013-02-27 15:23:36 -0800254::aos::Once<void> FileWatch::init_once(FileWatch::Init);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800255std::map<int, FileWatch *> FileWatch::watchers;
256int FileWatch::notify_fd;
257
Brian Silverman5cc661b2013-02-27 15:23:36 -0800258// Runs the given command and returns its first line of output (not including
259// the \n). LOG(FATAL)s if the command has an exit status other than 0 or does
260// not print out an entire line.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800261std::string RunCommand(std::string command) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800262 // popen(3) might fail and not set it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800263 errno = 0;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800264 FILE *pipe = popen(command.c_str(), "r");
265 if (pipe == NULL) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800266 LOG(FATAL, "popen(\"%s\", \"r\") failed with %d: %s\n",
267 command.c_str(), errno, strerror(errno));
268 }
269
Brian Silverman5cc661b2013-02-27 15:23:36 -0800270 // result_size is how many bytes result is currently allocated to.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800271 size_t result_size = 128, read = 0;
272 unique_c_ptr<char> result(static_cast<char *>(malloc(result_size)));
273 while (true) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800274 // If we filled up the buffer, then realloc(3) it bigger.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800275 if (read == result_size) {
276 result_size *= 2;
277 void *new_result = realloc(result.get(), result_size);
278 if (new_result == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800279 LOG(FATAL, "realloc(%p, %zd) failed because of %d: %s\n",
280 result.get(), result_size, errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800281 } else {
282 result.release();
283 result = unique_c_ptr<char>(static_cast<char *>(new_result));
284 }
285 }
286
Brian Silverman5cc661b2013-02-27 15:23:36 -0800287 size_t ret = fread(result.get() + read, 1, result_size - read, pipe);
288 // If the read didn't fill up the whole buffer, check to see if it was
289 // because of an error.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800290 if (ret < result_size - read) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800291 if (ferror(pipe)) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800292 LOG(FATAL, "couldn't finish reading output of \"%s\"\n",
293 command.c_str());
294 }
295 }
296 read += ret;
297 if (read > 0 && result.get()[read - 1] == '\n') {
298 break;
299 }
300
Brian Silverman5cc661b2013-02-27 15:23:36 -0800301 if (feof(pipe)) {
302 LOG(FATAL, "`%s` failed. didn't print a whole line\n", command.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800303 }
304 }
305
Brian Silverman5cc661b2013-02-27 15:23:36 -0800306 // Get rid of the first \n and anything after it.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800307 *strchrnul(result.get(), '\n') = '\0';
308
Brian Silverman5cc661b2013-02-27 15:23:36 -0800309 int child_status = pclose(pipe);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800310 if (child_status == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800311 LOG(FATAL, "pclose(%p) failed with %d: %s\n", pipe,
Brian Silvermand169fcd2013-02-27 13:18:47 -0800312 errno, strerror(errno));
313 }
314
315 if (child_status != 0) {
316 LOG(FATAL, "`%s` failed. return %d\n", command.c_str(), child_status);
317 }
318
319 return std::string(result.get());
320}
321
322// Will call callback(arg) after time.
323void Timeout(time::Time time, void (*callback)(int, short, void *), void *arg) {
324 EventUniquePtr timeout(evtimer_new(libevent_base.get(), callback, arg));
325 struct timeval time_timeval = time.ToTimeval();
326 evtimer_add(timeout.release(), &time_timeval);
327}
328
329// Represents a child process. It will take care of restarting itself etc.
330class Child {
331 public:
Brian Silverman5cc661b2013-02-27 15:23:36 -0800332 // command is the (space-separated) command to run and its arguments.
333 Child(const std::string &command) : pid_(-1),
Brian Silvermand169fcd2013-02-27 13:18:47 -0800334 restart_timeout_(
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800335 evtimer_new(libevent_base.get(), StaticDoRestart, this)),
336 stat_at_start_valid_(false) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800337 const char *start, *end;
338 start = command.c_str();
339 while (true) {
340 end = strchrnul(start, ' ');
341 args_.push_back(std::string(start, end - start));
342 start = end + 1;
343 if (*end == '\0') {
344 break;
345 }
346 }
347
Brian Silverman5cc661b2013-02-27 15:23:36 -0800348 original_binary_ = RunCommand("which " + args_[0]);
349 binary_ = original_binary_ + ".stm";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800350
351 watcher_ = unique_ptr<FileWatch>(
Brian Silverman5cc661b2013-02-27 15:23:36 -0800352 new FileWatch(original_binary_, StaticFileModified, this));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800353
354 Start();
355 }
356
357 pid_t pid() { return pid_; }
358
359 // This gets called whenever the actual process dies and should (probably) be
360 // restarted.
361 void ProcessDied() {
362 pid_ = -1;
363 restarts_.push(time::Time::Now());
364 if (restarts_.size() > kMaxRestartsNumber) {
365 time::Time oldest = restarts_.front();
366 restarts_.pop();
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700367 if ((time::Time::Now() - oldest) <= kMaxRestartsTime) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800368 LOG(WARNING, "process %s getting restarted too often\n", name());
369 Timeout(kResumeWait, StaticStart, this);
370 return;
371 }
372 }
373 Start();
374 }
375
376 // Returns a name for logging purposes.
377 const char *name() {
378 return args_[0].c_str();
379 }
380
381 private:
382 struct CheckDiedStatus {
383 Child *self;
384 pid_t old_pid;
385 };
386
387 // How long to wait for a child to die nicely.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700388 static constexpr time::Time kProcessDieTime = time::Time::InSeconds(0.75);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800389
390 // How long to wait after the file is modified to restart it.
391 // This is important because some programs like modifying the binaries by
392 // writing them in little bits, which results in attempting to start partial
393 // binaries without this.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700394 static constexpr time::Time kRestartWaitTime = time::Time::InSeconds(1.5);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800395
Brian Silverman5cc661b2013-02-27 15:23:36 -0800396 // Only kMaxRestartsNumber restarts will be allowed in kMaxRestartsTime.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700397 static constexpr time::Time kMaxRestartsTime = time::Time::InSeconds(4);
398 static const size_t kMaxRestartsNumber = 3;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800399 // How long to wait if it gets restarted too many times.
Brian Silverman52aeeac2013-08-28 16:20:53 -0700400 static constexpr time::Time kResumeWait = time::Time::InSeconds(5);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800401
Brian Silvermand169fcd2013-02-27 13:18:47 -0800402 static void StaticFileModified(void *self) {
403 static_cast<Child *>(self)->FileModified();
404 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800405
Brian Silvermand169fcd2013-02-27 13:18:47 -0800406 void FileModified() {
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700407 LOG(DEBUG, "file for %s modified\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800408 struct timeval restart_time_timeval = kRestartWaitTime.ToTimeval();
409 // This will reset the timeout again if it hasn't run yet.
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700410 if (evtimer_add(restart_timeout_.get(), &restart_time_timeval) != 0) {
411 LOG(FATAL, "evtimer_add(%p, %p) failed\n",
412 restart_timeout_.get(), &restart_time_timeval);
413 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800414 }
415
416 static void StaticDoRestart(int, short, void *self) {
417 static_cast<Child *>(self)->DoRestart();
418 }
419
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800420 // Called after somebody else has finished modifying the file.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800421 void DoRestart() {
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800422 if (stat_at_start_valid_) {
423 struct stat current_stat;
424 if (stat(original_binary_.c_str(), &current_stat) == -1) {
425 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
426 original_binary_.c_str(), &current_stat, errno, strerror(errno));
427 }
428 if (current_stat.st_mtime == stat_at_start_.st_mtime) {
429 LOG(DEBUG, "ignoring trigger for %s because mtime didn't change\n",
430 name());
431 return;
432 }
433 }
434
Brian Silvermand169fcd2013-02-27 13:18:47 -0800435 if (pid_ != -1) {
436 LOG(DEBUG, "sending SIGTERM to child %d to restart it\n", pid_);
437 if (kill(pid_, SIGTERM) == -1) {
438 LOG(WARNING, "kill(%d, SIGTERM) failed with %d: %s\n",
439 pid_, errno, strerror(errno));
440 }
441 CheckDiedStatus *status = new CheckDiedStatus();
442 status->self = this;
443 status->old_pid = pid_;
444 Timeout(kProcessDieTime, StaticCheckDied, status);
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700445 } else {
446 LOG(WARNING, "%s restart attempted but not running\n", name());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800447 }
448 }
449
450 static void StaticCheckDied(int, short, void *status_in) {
451 CheckDiedStatus *status = static_cast<CheckDiedStatus *>(status_in);
452 status->self->CheckDied(status->old_pid);
453 delete status;
454 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800455
456 // Checks to see if the child using the PID old_pid is still running.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800457 void CheckDied(pid_t old_pid) {
458 if (pid_ == old_pid) {
459 LOG(WARNING, "child %d refused to die\n", old_pid);
460 if (kill(old_pid, SIGKILL) == -1) {
461 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
462 old_pid, errno, strerror(errno));
463 }
464 }
465 }
466
467 static void StaticStart(int, short, void *self) {
468 static_cast<Child *>(self)->Start();
469 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800470
471 // Actually starts the child.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800472 void Start() {
473 if (pid_ != -1) {
474 LOG(WARNING, "calling Start() but already have child %d running\n",
475 pid_);
Brian Silverman5cc661b2013-02-27 15:23:36 -0800476 if (kill(pid_, SIGKILL) == -1) {
477 LOG(WARNING, "kill(%d, SIGKILL) failed with %d: %s\n",
478 pid_, errno, strerror(errno));
479 return;
480 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800481 pid_ = -1;
482 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800483
484 // Remove the name that we run from (ie from a previous execution) and then
485 // hard link the real filename to it.
486 if (unlink(binary_.c_str()) != 0 && errno != ENOENT) {
487 LOG(FATAL, "removing %s failed because of %d: %s\n",
488 binary_.c_str(), errno, strerror(errno));
489 }
490 if (link(original_binary_.c_str(), binary_.c_str()) != 0) {
491 LOG(FATAL, "link('%s', '%s') failed because of %d: %s\n",
492 original_binary_.c_str(), binary_.c_str(), errno, strerror(errno));
493 }
494
Brian Silvermanfe06fe12013-02-27 18:54:58 -0800495 if (stat(original_binary_.c_str(), &stat_at_start_) == -1) {
496 LOG(FATAL, "stat(%s, %p) failed with %d: %s\n",
497 original_binary_.c_str(), &stat_at_start_, errno, strerror(errno));
498 }
499 stat_at_start_valid_ = true;
500
Brian Silvermand169fcd2013-02-27 13:18:47 -0800501 if ((pid_ = fork()) == 0) {
502 ssize_t args_size = args_.size();
503 const char **argv = new const char *[args_size + 1];
504 for (int i = 0; i < args_size; ++i) {
505 argv[i] = args_[i].c_str();
506 }
507 argv[args_size] = NULL;
508 // The const_cast is safe because no code that might care if it gets
509 // modified can run afterwards.
510 execv(binary_.c_str(), const_cast<char **>(argv));
511 LOG(FATAL, "execv(%s, %p) failed with %d: %s\n",
512 binary_.c_str(), argv, errno, strerror(errno));
513 _exit(EXIT_FAILURE);
514 }
515 if (pid_ == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800516 LOG(FATAL, "forking to run \"%s\" failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800517 binary_.c_str(), errno, strerror(errno));
518 }
Brian Silvermand90b5fe2013-03-10 18:34:42 -0700519 LOG(DEBUG, "started \"%s\" successfully\n", binary_.c_str());
Brian Silvermand169fcd2013-02-27 13:18:47 -0800520 }
Brian Silvermanbc4fc2f2013-02-27 19:33:42 -0800521
522 // A history of the times that this process has been restarted.
523 std::queue<time::Time, std::list<time::Time>> restarts_;
524
525 // The currently running child's PID or NULL.
526 pid_t pid_;
527
528 // All of the arguments (including the name of the binary).
529 std::deque<std::string> args_;
530
531 // The name of the real binary that we were told to run.
532 std::string original_binary_;
533 // The name of the file that we're actually running.
534 std::string binary_;
535
536 // Watches original_binary_.
537 unique_ptr<FileWatch> watcher_;
538
539 // An event that restarts after kRestartWaitTime.
540 EventUniquePtr restart_timeout_;
541
542 // Captured from the original file when we most recently started a new child
543 // process. Used to see if it actually changes or not.
544 struct stat stat_at_start_;
545 bool stat_at_start_valid_;
546
547 DISALLOW_COPY_AND_ASSIGN(Child);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800548};
Brian Silverman52aeeac2013-08-28 16:20:53 -0700549
550constexpr time::Time Child::kProcessDieTime;
551constexpr time::Time Child::kRestartWaitTime;
552constexpr time::Time Child::kMaxRestartsTime;
553constexpr time::Time Child::kResumeWait;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800554
555// This is where all of the Child instances except core live.
556std::vector<unique_ptr<Child>> children;
Brian Silverman5cc661b2013-02-27 15:23:36 -0800557// A global place to hold on to which child is core.
558unique_ptr<Child> core;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800559
Brian Silverman5cc661b2013-02-27 15:23:36 -0800560// Kills off the entire process group (including ourself).
561void KillChildren(bool try_nice) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800562 if (try_nice) {
563 static const int kNiceStopSignal = SIGTERM;
564 static const time::Time kNiceWaitTime = time::Time::InSeconds(1);
565
566 // Make sure that we don't just nicely stop ourself...
567 sigset_t mask;
568 sigemptyset(&mask);
569 sigaddset(&mask, kNiceStopSignal);
570 sigprocmask(SIG_BLOCK, &mask, NULL);
571
Brian Silverman5cc661b2013-02-27 15:23:36 -0800572 kill(-getpid(), kNiceStopSignal);
573
574 fflush(NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800575 time::SleepFor(kNiceWaitTime);
576 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800577
Brian Silvermand169fcd2013-02-27 13:18:47 -0800578 // Send SIGKILL to our whole process group, which will forcibly terminate any
579 // of them that are still running (us for sure, maybe more too).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800580 kill(-getpid(), SIGKILL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800581}
582
Brian Silverman5cc661b2013-02-27 15:23:36 -0800583void ExitHandler() {
584 KillChildren(true);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800585}
Brian Silverman5cc661b2013-02-27 15:23:36 -0800586
587void KillChildrenSignalHandler(int signum) {
588 // If we get SIGSEGV or some other random signal who knows what's happening
589 // and we should just kill everybody immediately.
590 // This is a list of all of the signals that mean some form of "nicely stop".
591 KillChildren(signum == SIGHUP || signum == SIGINT || signum == SIGQUIT ||
Brian Silverman0eec9532013-02-27 20:24:16 -0800592 signum == SIGABRT || signum == SIGPIPE || signum == SIGTERM ||
593 signum == SIGXCPU);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800594}
595
Brian Silverman5cc661b2013-02-27 15:23:36 -0800596// Returns the currently running child with PID pid or an empty unique_ptr.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800597const unique_ptr<Child> &FindChild(pid_t pid) {
598 for (auto it = children.begin(); it != children.end(); ++it) {
599 if (pid == (*it)->pid()) {
600 return *it;
601 }
602 }
603
604 if (pid == core->pid()) {
605 return core;
606 }
607
Brian Silverman5cc661b2013-02-27 15:23:36 -0800608 static const unique_ptr<Child> kNothing;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800609 return kNothing;
610}
611
Brian Silverman5cc661b2013-02-27 15:23:36 -0800612// Gets set up as a libevent handler for SIGCHLD.
613// Handles calling Child::ProcessDied() on the appropriate one.
614void SigCHLDReceived(int /*fd*/, short /*events*/, void *) {
Brian Silvermand169fcd2013-02-27 13:18:47 -0800615 // In a while loop in case we miss any SIGCHLDs.
616 while (true) {
617 siginfo_t infop;
618 infop.si_pid = 0;
619 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) != 0) {
620 LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno));
Brian Silverman5cc661b2013-02-27 15:23:36 -0800621 continue;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800622 }
Brian Silverman5cc661b2013-02-27 15:23:36 -0800623 // If there are no more child process deaths to process.
Brian Silvermand169fcd2013-02-27 13:18:47 -0800624 if (infop.si_pid == 0) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800625 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800626 }
627
628 pid_t pid = infop.si_pid;
629 int status = infop.si_status;
630 const unique_ptr<Child> &child = FindChild(pid);
631 if (child) {
632 switch (infop.si_code) {
633 case CLD_EXITED:
634 LOG(WARNING, "child %d (%s) exited with status %d\n",
635 pid, child->name(), status);
636 break;
637 case CLD_DUMPED:
638 LOG(INFO, "child %d actually dumped core. "
639 "falling through to killed by signal case\n", pid);
640 case CLD_KILLED:
641 // If somebody (possibly us) sent it SIGTERM that means that they just
642 // want it to stop, so it stopping isn't a WARNING.
643 LOG((status == SIGTERM) ? DEBUG : WARNING,
644 "child %d (%s) was killed by signal %d (%s)\n",
645 pid, child->name(), status,
646 strsignal(status));
647 break;
648 case CLD_STOPPED:
649 LOG(WARNING, "child %d (%s) was stopped by signal %d "
650 "(giving it a SIGCONT(%d))\n",
651 pid, child->name(), status, SIGCONT);
652 kill(pid, SIGCONT);
653 continue;
654 default:
655 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
656 pid, child->name());
657 kill(pid, SIGKILL);
658 continue;
659 }
660 } else {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800661 LOG(WARNING, "couldn't find a Child for pid %d\n", pid);
662 return;
Brian Silvermand169fcd2013-02-27 13:18:47 -0800663 }
664
Brian Silverman5cc661b2013-02-27 15:23:36 -0800665 if (child == core) {
666 LOG(FATAL, "core died\n");
667 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800668 child->ProcessDied();
669 }
670}
671
Brian Silverman5cc661b2013-02-27 15:23:36 -0800672// This is used for communicating the name of the file to read processes to
673// start from main to Run.
674const char *child_list_file;
675
Brian Silverman8070a222013-02-28 15:01:36 -0800676void Run(void *watch);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800677void Main() {
678 logging::Init();
Brian Silverman0eec9532013-02-27 20:24:16 -0800679 // TODO(brians): tell logging that using the root logger from here until we
Brian Silvermand169fcd2013-02-27 13:18:47 -0800680 // bring up shm is ok
681
Brian Silverman5cc661b2013-02-27 15:23:36 -0800682 if (setpgid(0 /*self*/, 0 /*make PGID the same as PID*/) != 0) {
683 LOG(FATAL, "setpgid(0, 0) failed with %d: %s\n", errno, strerror(errno));
684 }
Brian Silvermand169fcd2013-02-27 13:18:47 -0800685
686 // Make sure that we kill all children when we exit.
Brian Silverman5cc661b2013-02-27 15:23:36 -0800687 atexit(ExitHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800688 // Do it on some signals too (ones that we otherwise tend to receive and then
689 // leave all of our children going).
Brian Silverman5cc661b2013-02-27 15:23:36 -0800690 signal(SIGHUP, KillChildrenSignalHandler);
691 signal(SIGINT, KillChildrenSignalHandler);
692 signal(SIGQUIT, KillChildrenSignalHandler);
693 signal(SIGILL, KillChildrenSignalHandler);
694 signal(SIGABRT, KillChildrenSignalHandler);
695 signal(SIGFPE, KillChildrenSignalHandler);
696 signal(SIGSEGV, KillChildrenSignalHandler);
697 signal(SIGPIPE, KillChildrenSignalHandler);
698 signal(SIGTERM, KillChildrenSignalHandler);
699 signal(SIGBUS, KillChildrenSignalHandler);
700 signal(SIGXCPU, KillChildrenSignalHandler);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800701
702 libevent_base = EventBaseUniquePtr(event_base_new());
703
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800704 std::string core_touch_file = "/tmp/starter.";
Brian Silvermand169fcd2013-02-27 13:18:47 -0800705 core_touch_file += std::to_string(static_cast<intmax_t>(getpid()));
706 core_touch_file += ".core_touch_file";
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800707 if (system(("touch '" + core_touch_file + "'").c_str()) != 0) {
708 LOG(FATAL, "running `touch '%s'` failed\n", core_touch_file.c_str());
709 }
710 FileWatch core_touch_file_watch(core_touch_file, Run, NULL);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800711 core = unique_ptr<Child>(
Brian Silvermanb1e4f6c2013-02-27 15:42:02 -0800712 new Child("core " + core_touch_file));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800713
714 FILE *pid_file = fopen("/tmp/starter.pid", "w");
715 if (pid_file == NULL) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800716 LOG(FATAL, "fopen(\"/tmp/starter.pid\", \"w\") failed with %d: %s\n",
Brian Silvermand169fcd2013-02-27 13:18:47 -0800717 errno, strerror(errno));
718 } else {
719 if (fprintf(pid_file, "%d", core->pid()) == -1) {
Brian Silverman5cc661b2013-02-27 15:23:36 -0800720 LOG(WARNING, "fprintf(%p, \"%%d\", %d) failed with %d: %s\n",
721 pid_file, core->pid(), errno, strerror(errno));
Brian Silvermand169fcd2013-02-27 13:18:47 -0800722 }
723 fclose(pid_file);
724 }
725
726 LOG(INFO, "waiting for %s to appear\n", core_touch_file.c_str());
727
728 event_base_dispatch(libevent_base.get());
729 LOG(FATAL, "event_base_dispatch(%p) returned\n", libevent_base.get());
730}
731
Brian Silverman0eec9532013-02-27 20:24:16 -0800732// This is the callback for when core creates the file indicating that it has
733// started.
734void Run(void *watch) {
735 // Make it so it doesn't keep on seeing random changes in /tmp.
736 static_cast<FileWatch *>(watch)->RemoveWatch();
737
738 // It's safe now because core is up.
739 aos::InitNRT();
740
741 std::ifstream list_file(child_list_file);
742
743 while (true) {
744 std::string child_name;
745 getline(list_file, child_name);
746 if ((list_file.rdstate() & std::ios_base::eofbit) != 0) {
747 break;
748 }
749 if (list_file.rdstate() != 0) {
750 LOG(FATAL, "reading input file %s failed\n", child_list_file);
751 }
752 children.push_back(unique_ptr<Child>(new Child(child_name)));
753 }
754
755 EventUniquePtr sigchld(event_new(libevent_base.get(), SIGCHLD,
756 EV_SIGNAL | EV_PERSIST,
757 SigCHLDReceived, NULL));
758 event_add(sigchld.release(), NULL);
759}
760
// Everything in the usage message that comes after the name of the program.
const char *kArgsHelp =
    "[OPTION]... START_LIST\n"
    "Start all of the robot code binaries in START_LIST.\n"
    "\n"
    "START_LIST is the file to read binaries (looked up on PATH) to run.\n"
    "  --help        display this help and exit\n";

// Writes the usage message to stderr.
void PrintHelp() {
  fprintf(stderr, "Usage: %s ", program_invocation_name);
  fputs(kArgsHelp, stderr);
}
769
Brian Silvermand169fcd2013-02-27 13:18:47 -0800770} // namespace starter
771} // namespace aos
772
773int main(int argc, char *argv[]) {
Brian Silverman8070a222013-02-28 15:01:36 -0800774 if (argc != 2) {
775 aos::starter::PrintHelp();
Brian Silvermand169fcd2013-02-27 13:18:47 -0800776 exit(EXIT_FAILURE);
Brian Silvermand169fcd2013-02-27 13:18:47 -0800777 }
Brian Silverman8070a222013-02-28 15:01:36 -0800778 if (strcmp(argv[1], "--help") == 0) {
779 aos::starter::PrintHelp();
780 exit(EXIT_SUCCESS);
781 }
782
Brian Silvermand169fcd2013-02-27 13:18:47 -0800783 aos::starter::child_list_file = argv[1];
784
785 aos::starter::Main();
786}