blob: e47e21315b6c0f24ab205711a7aba63386a0e1f8 [file] [log] [blame]
brians343bc112013-02-10 01:53:46 +00001#include "aos/atom_code/starter/starter.h"
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <sys/signalfd.h>
6#include <sys/types.h>
7#include <fcntl.h>
8#include <sys/inotify.h>
9#include <sys/stat.h>
10
Brian Silvermanf665d692013-02-17 22:11:39 -080011#include "aos/common/logging/logging.h"
12#include "aos/common/logging/logging_impl.h"
13#include "aos/atom_code/init.h"
brians343bc112013-02-10 01:53:46 +000014
15void niceexit(int status);
16
17pid_t start(const char *cmd, uint8_t times) {
18 char *which_cmd, *which_cmd_stm;
19 if (asprintf(&which_cmd, "which %s", cmd) == -1) {
Brian Silvermanf665d692013-02-17 22:11:39 -080020 LOG(ERROR, "creating \"which %s\" failed with %d: %s\n",
21 cmd, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000022 niceexit(EXIT_FAILURE);
23 }
24 if (asprintf(&which_cmd_stm, "which %s.stm", cmd) == -1) {
Brian Silvermanf665d692013-02-17 22:11:39 -080025 LOG(ERROR, "creating \"which %s.stm\" failed with %d: %s\n",
26 cmd, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000027 niceexit(EXIT_FAILURE);
28 }
29 FILE *which = popen(which_cmd, "r");
30 char exe[CMDLEN + 5], orig_exe[CMDLEN];
31 size_t ret;
32 if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which)) == CMDLEN) {
Brian Silvermanf665d692013-02-17 22:11:39 -080033 LOG(ERROR, "`which %s` was too long. not starting '%s'\n", cmd, cmd);
brians343bc112013-02-10 01:53:46 +000034 return 0;
35 }
36 orig_exe[ret] = '\0';
37 if (pclose(which) == -1) {
Brian Silvermanf665d692013-02-17 22:11:39 -080038 LOG(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000039 }
40 free(which_cmd);
41 if (strlen(orig_exe) == 0) { // which returned nothing; check if stm exists
Brian Silvermanf665d692013-02-17 22:11:39 -080042 LOG(INFO, "%s didn't exist. trying %s.stm\n", cmd, cmd);
brians343bc112013-02-10 01:53:46 +000043 FILE *which_stm = popen(which_cmd_stm, "r");
44 if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which_stm)) == CMDLEN) {
Brian Silvermanf665d692013-02-17 22:11:39 -080045 LOG(ERROR, "`which %s.stm` was too long. not starting %s\n", cmd, cmd);
brians343bc112013-02-10 01:53:46 +000046 return 0;
47 }
48 orig_exe[ret] = '\0';
49 if (pclose(which) == -1) {
Brian Silvermanf665d692013-02-17 22:11:39 -080050 LOG(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000051 }
52 }
53 if (strlen(orig_exe) == 0) {
Brian Silvermanf665d692013-02-17 22:11:39 -080054 LOG(WARNING, "couldn't find file '%s[.stm]'. not going to start it\n",
55 cmd);
brians343bc112013-02-10 01:53:46 +000056 return 0;
57 }
58 if (orig_exe[strlen(orig_exe) - 1] != '\n') {
Brian Silvermanf665d692013-02-17 22:11:39 -080059 LOG(WARNING, "no \\n on the end of `which %s[.stm]` output ('%s')\n",
60 cmd, orig_exe);
brians343bc112013-02-10 01:53:46 +000061 } else {
62 orig_exe[strlen(orig_exe) - 1] = '\0'; // get rid of the \n
63 }
64 strncpy(exe, orig_exe, CMDLEN);
65
66 strcat(exe, ".stm");
67 struct stat st;
68 errno = 0;
69 if (stat(orig_exe, &st) != 0 && errno != ENOENT) {
Brian Silvermanf665d692013-02-17 22:11:39 -080070 LOG(ERROR, "killing everything because stat('%s') failed with %d: %s\n",
71 orig_exe, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000072 niceexit(EXIT_FAILURE);
73 } else if (errno == ENOENT) {
Brian Silvermanf665d692013-02-17 22:11:39 -080074 LOG(WARNING, "binary '%s' doesn't exist. not starting it\n", orig_exe);
brians343bc112013-02-10 01:53:46 +000075 return 0;
76 }
77 struct stat st2;
78 // if we can confirm it's already 0 size
79 bool orig_zero = stat(orig_exe, &st2) == 0 && st2.st_size == 0;
80 if (!orig_zero) {
81 // if it failed and it wasn't because it was missing
82 if (unlink(exe) != 0 && (errno != EROFS && errno != ENOENT)) {
Brian Silvermanf665d692013-02-17 22:11:39 -080083 LOG(ERROR,
84 "killing everything because unlink('%s') failed with %d: %s\n",
85 exe, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000086 niceexit(EXIT_FAILURE);
87 }
88 if (link(orig_exe, exe) != 0) {
Brian Silvermanf665d692013-02-17 22:11:39 -080089 LOG(ERROR,
90 "killing everything because link('%s', '%s') failed with %d: %s\n",
91 orig_exe, exe, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +000092 niceexit(EXIT_FAILURE);
93 }
94 }
95 if (errno == EEXIST) {
Brian Silvermanf665d692013-02-17 22:11:39 -080096 LOG(INFO, "exe ('%s') already existed\n", exe);
brians343bc112013-02-10 01:53:46 +000097 }
98
99 pid_t child;
100 if ((child = fork()) == 0) {
101 execlp(exe, orig_exe, static_cast<char *>(NULL));
Brian Silvermanf665d692013-02-17 22:11:39 -0800102 LOG(ERROR,
103 "killing everything because execlp('%s', '%s', NULL) "
104 "failed with %d: %s\n",
105 exe, cmd, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +0000106 _exit(EXIT_FAILURE); // don't niceexit or anything because this is the child!!
107 }
108 if (child == -1) {
Brian Silvermanf665d692013-02-17 22:11:39 -0800109 LOG(WARNING, "fork on '%s' failed with %d: %s",
110 cmd, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +0000111 if (times < 100) {
112 return start(cmd, times + 1);
113 } else {
Brian Silvermanf665d692013-02-17 22:11:39 -0800114 LOG(ERROR, "tried to start '%s' too many times. giving up\n", cmd);
brians343bc112013-02-10 01:53:46 +0000115 return 0;
116 }
117 } else {
118 children[child] = cmd;
119 files[child] = orig_exe;
120 int ret = inotify_add_watch(notifyfd, orig_exe, IN_ATTRIB | IN_MODIFY);
121 if (ret < 0) {
Brian Silvermanf665d692013-02-17 22:11:39 -0800122 LOG(WARNING, "inotify_add_watch('%s') failed: "
123 "not going to watch for changes to it because of %d: %s\n",
124 orig_exe, errno, strerror(errno));
brians343bc112013-02-10 01:53:46 +0000125 } else {
126 watches[ret] = child;
127 mtimes[ret] = st2.st_mtime;
128 }
129 return child;
130 }
131}
132
133static bool exited = false;
134void exit_handler() {
135 if(exited) {
136 return;
137 } else {
138 exited = true;
139 }
140 fputs("starter: killing all children for exit\n", stdout);
141 for (auto it = children.begin(); it != children.end(); ++it) {
142 printf("starter: killing child %d ('%s') for exit\n", it->first, it->second);
143 kill(it->first, SIGKILL);
144 }
145 if (sigfd != 0) {
146 close(sigfd);
147 }
148 if (notifyfd != 0) {
149 close(notifyfd);
150 }
151}
152void niceexit(int status) {
153 printf("starter: niceexit(%d) EXIT_SUCCESS=%d EXIT_FAILURE=%d\n",
154 status, EXIT_SUCCESS, EXIT_FAILURE);
155 exit_handler();
156 exit(status);
157}
158
159int main(int argc, char *argv[]) {
160 if (argc < 2) {
161 fputs("starter: error: need an argument specifying what file to use\n", stderr);
162 niceexit(EXIT_FAILURE);
163 } else if(argc > 2) {
164 fputs("starter: warning: too many arguments\n", stderr);
165 }
166
167 atexit(exit_handler);
168
Brian Silvermanf665d692013-02-17 22:11:39 -0800169 aos::logging::Init();
170
brians343bc112013-02-10 01:53:46 +0000171 notifyfd = inotify_init1(IN_NONBLOCK);
172
173 pid_t core = start("core", 0);
174 if (core == 0) {
175 fputs("starter: error: core didn't exist\n", stderr);
176 niceexit(EXIT_FAILURE);
177 }
178 fprintf(stderr, "starter: info: core's pid is %jd\n", static_cast<intmax_t>(core));
179 FILE *pid_file = fopen("/tmp/starter.pid", "w");
180 if (pid_file == NULL) {
181 perror("fopen(/tmp/starter.pid)");
182 } else {
183 if (fprintf(pid_file, "%d", core) == -1) {
184 fprintf(stderr, "starter: error: fprintf(pid_file, core(=%d)) failed "
185 "with %d: %s",
186 core, errno, strerror(errno));
187 }
188 fclose(pid_file);
189 }
190 sleep(1);
191 if (kill(core, 0) != 0) {
192 fprintf(stderr, "starter: couldn't kill(%jd(=core), 0) because of %d: %s\n",
193 static_cast<intmax_t>(core), errno, strerror(errno));
194 niceexit(EXIT_FAILURE);
195 }
196 fputs("starter: before init\n", stdout);
197 aos::InitNRT();
198 fputs("starter: after init\n", stdout);
199
200 FILE *list = fopen(argv[1], "re");
201 char line[CMDLEN + 1];
202 char *line_copy;
203 uint8_t too_long = 0;
204 while (fgets(line, sizeof(line), list) != NULL) {
205 if (line[strlen(line) - 1] != '\n') {
206 LOG(WARNING, "command segment '%s' is too long. "
207 "increase the size of the line char[] above " __FILE__ ": %d\n",
208 line, __LINE__);
209 too_long = 1;
210 continue;
211 }
212 if (too_long) {
213 too_long = 0;
214 LOG(WARNING, "\tgot last chunk of too long line: '%s'\n", line);
215 continue; // don't try running the last little chunk
216 }
217 line[strlen(line) - 1] = '\0'; // get rid of the \n
218 line_copy = new char[strlen(line) + 1];
219 memcpy(line_copy, line, strlen(line) + 1);
220 fprintf(stderr, "starter: info: going to start \"%s\"\n", line_copy);
221 start(line_copy, 0);
222 }
223 fclose(list);
224 LOG(INFO, "started everything\n");
225
226 sigset_t mask;
227 sigemptyset (&mask);
228 sigaddset (&mask, SIGCHLD);
229 sigprocmask (SIG_BLOCK, &mask, NULL);
230 sigfd = signalfd (-1, &mask, O_NONBLOCK);
231
232 fd_set readfds;
233 FD_ZERO(&readfds);
234 siginfo_t infop;
235 signalfd_siginfo fdsi;
236 inotify_event notifyevt;
237 int ret;
238 while (1) {
239 FD_SET(sigfd, &readfds);
240 FD_SET(notifyfd, &readfds);
241 timeval timeout;
242 timeout.tv_sec = restarts.empty() ? 2 : 0;
243 timeout.tv_usec = 100000;
244 ret = select (FD_SETSIZE, &readfds, NULL, NULL, &timeout);
245
246 if (ret == 0) { // timeout
247 auto it = restarts.begin();
248 // WARNING because the message about it dying will be
249 for (; it != restarts.end(); it++) {
250 LOG(WARNING, "restarting process %d ('%s') by giving it a SIGKILL(%d)\n",
251 *it, children[*it], SIGKILL);
252 kill(*it, SIGKILL);
253 }
254 restarts.clear();
255 }
256
257 if (FD_ISSET(notifyfd, &readfds)) {
258 if ((ret = read(notifyfd, &notifyevt, sizeof(notifyevt))) ==
259 sizeof(notifyevt)) {
260 if (watches.count(notifyevt.wd)) {
261 struct stat st;
262 if (!children.count(watches[notifyevt.wd]) ||
263 stat(files[watches[notifyevt.wd]], &st) == 0) {
264 if (mtimes[notifyevt.wd] == st.st_mtime) {
265 LOG(DEBUG, "ignoring trigger of watch id %d (file '%s')"
266 " because mtime didn't change\n",
267 notifyevt.wd, files[watches[notifyevt.wd]]);
268 } else if (children.count(watches[notifyevt.wd])) {
269 LOG(DEBUG, "adding process %d to the restart list\n",
270 watches[notifyevt.wd]);
271 restarts.insert(watches[notifyevt.wd]);
272 } else {
273 LOG(DEBUG, "children doesn't have entry for PID %d\n",
274 watches[notifyevt.wd]);
275 }
276 } else {
277 LOG(ERROR, "stat('%s') failed with %d: %s\n",
278 files[watches[notifyevt.wd]], errno, strerror(errno));
279 }
280 } else {
281 LOG(WARNING, "no PID for watch id %d\n", notifyevt.wd);
282 }
283 } else {
284 if (ret == -1) {
285 LOG(WARNING, "read(notifyfd) failed with %d: %s", errno, strerror(errno));
286 } else {
287 LOG(WARNING, "couldn't get a whole inotify_event(%d) (only got %d)\n",
288 sizeof(notifyevt), ret);
289 }
290 }
291 }
292
293 if (FD_ISSET(sigfd, &readfds)) {
294 while(read (sigfd, &fdsi, sizeof fdsi) > 0);
295 }
296 while (1) {
297 infop.si_pid = 0;
298 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) == 0) {
299 if (infop.si_pid == 0) {
300 goto after_loop; // no more child process changes pending
301 }
302 switch (infop.si_code) {
303 case CLD_EXITED:
304 LOG(WARNING, "child %d (%s) exited with status %d\n",
305 infop.si_pid, children[infop.si_pid], infop.si_status);
306 break;
307 case CLD_DUMPED:
308 LOG(INFO, "child %d actually dumped core. "
309 "falling through to killed by signal case\n", infop.si_pid);
310 case CLD_KILLED:
311 LOG(WARNING, "child %d (%s) was killed by signal %d (%s)\n",
312 infop.si_pid, children[infop.si_pid], infop.si_status,
313 strsignal(infop.si_status));
314 break;
315 case CLD_STOPPED:
316 LOG(WARNING, "child %d (%s) was stopped by signal %d "
317 "(giving it a SIGCONT(%d))\n",
318 infop.si_pid, children[infop.si_pid], infop.si_status, SIGCONT);
319 kill(infop.si_pid, SIGCONT);
320 continue;
321 default:
322 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
323 infop.si_pid, children[infop.si_pid]);
324 kill(infop.si_pid, SIGKILL);
325 continue;
326 }
327 if (infop.si_pid == core) {
328 fprintf(stderr, "starter: si_code=%d CLD_EXITED=%d CLD_DUMPED=%d "
329 "CLD_KILLED=%d CLD_STOPPED=%d si_status=%d (sig '%s')\n",
330 infop.si_code, CLD_EXITED, CLD_DUMPED, CLD_KILLED,
331 CLD_STOPPED, infop.si_status, strsignal(infop.si_status));
332 // core has died. logging is down too
333 fputs("starter: error: core died. exiting\n", stderr);
334 niceexit(EXIT_FAILURE);
335 }
336
brians343bc112013-02-10 01:53:46 +0000337 start(children[infop.si_pid], 0);
338 LOG(DEBUG, "erasing %d from children\n", infop.si_pid);
339 children.erase(infop.si_pid);
340 } else {
341 LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno));
342 }
343 }
344after_loop: ;
345 }
346}