blob: 0cfddd5e38ebe85ae17c53fec4167d338202ef9f [file] [log] [blame]
brians343bc112013-02-10 01:53:46 +00001#include "aos/atom_code/starter/starter.h"
2
3#include <stdio.h>
4#include <stdlib.h>
5#include <sys/signalfd.h>
6#include <sys/types.h>
7#include <fcntl.h>
8#include <sys/inotify.h>
9#include <sys/stat.h>
10
11#include "aos/aos_core.h"
12
13void niceexit(int status);
14
15pid_t start(const char *cmd, uint8_t times) {
16 char *which_cmd, *which_cmd_stm;
17 if (asprintf(&which_cmd, "which %s", cmd) == -1) {
18 LOG_IFINIT(ERROR, "creating \"which %s\" failed with %d: %s\n",
19 cmd, errno, strerror(errno));
20 niceexit(EXIT_FAILURE);
21 }
22 if (asprintf(&which_cmd_stm, "which %s.stm", cmd) == -1) {
23 LOG_IFINIT(ERROR, "creating \"which %s.stm\" failed with %d: %s\n",
24 cmd, errno, strerror(errno));
25 niceexit(EXIT_FAILURE);
26 }
27 FILE *which = popen(which_cmd, "r");
28 char exe[CMDLEN + 5], orig_exe[CMDLEN];
29 size_t ret;
30 if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which)) == CMDLEN) {
31 LOG_IFINIT(ERROR, "`which %s` was too long. not starting '%s'\n", cmd, cmd);
32 return 0;
33 }
34 orig_exe[ret] = '\0';
35 if (pclose(which) == -1) {
36 LOG_IFINIT(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno));
37 }
38 free(which_cmd);
39 if (strlen(orig_exe) == 0) { // which returned nothing; check if stm exists
40 LOG_IFINIT(INFO, "%s didn't exist. trying %s.stm\n", cmd, cmd);
41 FILE *which_stm = popen(which_cmd_stm, "r");
42 if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which_stm)) == CMDLEN) {
43 LOG_IFINIT(ERROR, "`which %s.stm` was too long. not starting %s\n", cmd, cmd);
44 return 0;
45 }
46 orig_exe[ret] = '\0';
47 if (pclose(which) == -1) {
48 LOG_IFINIT(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno));
49 }
50 }
51 if (strlen(orig_exe) == 0) {
52 LOG_IFINIT(WARNING, "couldn't find file '%s[.stm]'. not going to start it\n",
53 cmd);
54 return 0;
55 }
56 if (orig_exe[strlen(orig_exe) - 1] != '\n') {
57 LOG_IFINIT(WARNING, "no \\n on the end of `which %s[.stm]` output ('%s')\n",
58 cmd, orig_exe);
59 } else {
60 orig_exe[strlen(orig_exe) - 1] = '\0'; // get rid of the \n
61 }
62 strncpy(exe, orig_exe, CMDLEN);
63
64 strcat(exe, ".stm");
65 struct stat st;
66 errno = 0;
67 if (stat(orig_exe, &st) != 0 && errno != ENOENT) {
68 LOG_IFINIT(ERROR, "killing everything because stat('%s') failed with %d: %s\n",
69 orig_exe, errno, strerror(errno));
70 niceexit(EXIT_FAILURE);
71 } else if (errno == ENOENT) {
72 LOG_IFINIT(WARNING, "binary '%s' doesn't exist. not starting it\n", orig_exe);
73 return 0;
74 }
75 struct stat st2;
76 // if we can confirm it's already 0 size
77 bool orig_zero = stat(orig_exe, &st2) == 0 && st2.st_size == 0;
78 if (!orig_zero) {
79 // if it failed and it wasn't because it was missing
80 if (unlink(exe) != 0 && (errno != EROFS && errno != ENOENT)) {
81 LOG_IFINIT(ERROR,
82 "killing everything because unlink('%s') failed with %d: %s\n",
83 exe, errno, strerror(errno));
84 niceexit(EXIT_FAILURE);
85 }
86 if (link(orig_exe, exe) != 0) {
87 LOG_IFINIT(ERROR,
88 "killing everything because link('%s', '%s') failed with %d: %s\n",
89 orig_exe, exe, errno, strerror(errno));
90 niceexit(EXIT_FAILURE);
91 }
92 }
93 if (errno == EEXIST) {
94 LOG_IFINIT(INFO, "exe ('%s') already existed\n", exe);
95 }
96
97 pid_t child;
98 if ((child = fork()) == 0) {
99 execlp(exe, orig_exe, static_cast<char *>(NULL));
100 LOG_IFINIT(ERROR,
101 "killing everything because execlp('%s', '%s', NULL) "
102 "failed with %d: %s\n",
103 exe, cmd, errno, strerror(errno));
104 _exit(EXIT_FAILURE); // don't niceexit or anything because this is the child!!
105 }
106 if (child == -1) {
107 LOG_IFINIT(WARNING, "fork on '%s' failed with %d: %s",
108 cmd, errno, strerror(errno));
109 if (times < 100) {
110 return start(cmd, times + 1);
111 } else {
112 LOG_IFINIT(ERROR, "tried to start '%s' too many times. giving up\n", cmd);
113 return 0;
114 }
115 } else {
116 children[child] = cmd;
117 files[child] = orig_exe;
118 int ret = inotify_add_watch(notifyfd, orig_exe, IN_ATTRIB | IN_MODIFY);
119 if (ret < 0) {
120 LOG_IFINIT(WARNING, "inotify_add_watch('%s') failed: "
121 "not going to watch for changes to it because of %d: %s\n",
122 orig_exe, errno, strerror(errno));
123 } else {
124 watches[ret] = child;
125 mtimes[ret] = st2.st_mtime;
126 }
127 return child;
128 }
129}
130
131static bool exited = false;
132void exit_handler() {
133 if(exited) {
134 return;
135 } else {
136 exited = true;
137 }
138 fputs("starter: killing all children for exit\n", stdout);
139 for (auto it = children.begin(); it != children.end(); ++it) {
140 printf("starter: killing child %d ('%s') for exit\n", it->first, it->second);
141 kill(it->first, SIGKILL);
142 }
143 if (sigfd != 0) {
144 close(sigfd);
145 }
146 if (notifyfd != 0) {
147 close(notifyfd);
148 }
149}
150void niceexit(int status) {
151 printf("starter: niceexit(%d) EXIT_SUCCESS=%d EXIT_FAILURE=%d\n",
152 status, EXIT_SUCCESS, EXIT_FAILURE);
153 exit_handler();
154 exit(status);
155}
156
157int main(int argc, char *argv[]) {
158 if (argc < 2) {
159 fputs("starter: error: need an argument specifying what file to use\n", stderr);
160 niceexit(EXIT_FAILURE);
161 } else if(argc > 2) {
162 fputs("starter: warning: too many arguments\n", stderr);
163 }
164
165 atexit(exit_handler);
166
167 notifyfd = inotify_init1(IN_NONBLOCK);
168
169 pid_t core = start("core", 0);
170 if (core == 0) {
171 fputs("starter: error: core didn't exist\n", stderr);
172 niceexit(EXIT_FAILURE);
173 }
174 fprintf(stderr, "starter: info: core's pid is %jd\n", static_cast<intmax_t>(core));
175 FILE *pid_file = fopen("/tmp/starter.pid", "w");
176 if (pid_file == NULL) {
177 perror("fopen(/tmp/starter.pid)");
178 } else {
179 if (fprintf(pid_file, "%d", core) == -1) {
180 fprintf(stderr, "starter: error: fprintf(pid_file, core(=%d)) failed "
181 "with %d: %s",
182 core, errno, strerror(errno));
183 }
184 fclose(pid_file);
185 }
186 sleep(1);
187 if (kill(core, 0) != 0) {
188 fprintf(stderr, "starter: couldn't kill(%jd(=core), 0) because of %d: %s\n",
189 static_cast<intmax_t>(core), errno, strerror(errno));
190 niceexit(EXIT_FAILURE);
191 }
192 fputs("starter: before init\n", stdout);
193 aos::InitNRT();
194 fputs("starter: after init\n", stdout);
195
196 FILE *list = fopen(argv[1], "re");
197 char line[CMDLEN + 1];
198 char *line_copy;
199 uint8_t too_long = 0;
200 while (fgets(line, sizeof(line), list) != NULL) {
201 if (line[strlen(line) - 1] != '\n') {
202 LOG(WARNING, "command segment '%s' is too long. "
203 "increase the size of the line char[] above " __FILE__ ": %d\n",
204 line, __LINE__);
205 too_long = 1;
206 continue;
207 }
208 if (too_long) {
209 too_long = 0;
210 LOG(WARNING, "\tgot last chunk of too long line: '%s'\n", line);
211 continue; // don't try running the last little chunk
212 }
213 line[strlen(line) - 1] = '\0'; // get rid of the \n
214 line_copy = new char[strlen(line) + 1];
215 memcpy(line_copy, line, strlen(line) + 1);
216 fprintf(stderr, "starter: info: going to start \"%s\"\n", line_copy);
217 start(line_copy, 0);
218 }
219 fclose(list);
220 LOG(INFO, "started everything\n");
221
222 sigset_t mask;
223 sigemptyset (&mask);
224 sigaddset (&mask, SIGCHLD);
225 sigprocmask (SIG_BLOCK, &mask, NULL);
226 sigfd = signalfd (-1, &mask, O_NONBLOCK);
227
228 fd_set readfds;
229 FD_ZERO(&readfds);
230 siginfo_t infop;
231 signalfd_siginfo fdsi;
232 inotify_event notifyevt;
233 int ret;
234 while (1) {
235 FD_SET(sigfd, &readfds);
236 FD_SET(notifyfd, &readfds);
237 timeval timeout;
238 timeout.tv_sec = restarts.empty() ? 2 : 0;
239 timeout.tv_usec = 100000;
240 ret = select (FD_SETSIZE, &readfds, NULL, NULL, &timeout);
241
242 if (ret == 0) { // timeout
243 auto it = restarts.begin();
244 // WARNING because the message about it dying will be
245 for (; it != restarts.end(); it++) {
246 LOG(WARNING, "restarting process %d ('%s') by giving it a SIGKILL(%d)\n",
247 *it, children[*it], SIGKILL);
248 kill(*it, SIGKILL);
249 }
250 restarts.clear();
251 }
252
253 if (FD_ISSET(notifyfd, &readfds)) {
254 if ((ret = read(notifyfd, &notifyevt, sizeof(notifyevt))) ==
255 sizeof(notifyevt)) {
256 if (watches.count(notifyevt.wd)) {
257 struct stat st;
258 if (!children.count(watches[notifyevt.wd]) ||
259 stat(files[watches[notifyevt.wd]], &st) == 0) {
260 if (mtimes[notifyevt.wd] == st.st_mtime) {
261 LOG(DEBUG, "ignoring trigger of watch id %d (file '%s')"
262 " because mtime didn't change\n",
263 notifyevt.wd, files[watches[notifyevt.wd]]);
264 } else if (children.count(watches[notifyevt.wd])) {
265 LOG(DEBUG, "adding process %d to the restart list\n",
266 watches[notifyevt.wd]);
267 restarts.insert(watches[notifyevt.wd]);
268 } else {
269 LOG(DEBUG, "children doesn't have entry for PID %d\n",
270 watches[notifyevt.wd]);
271 }
272 } else {
273 LOG(ERROR, "stat('%s') failed with %d: %s\n",
274 files[watches[notifyevt.wd]], errno, strerror(errno));
275 }
276 } else {
277 LOG(WARNING, "no PID for watch id %d\n", notifyevt.wd);
278 }
279 } else {
280 if (ret == -1) {
281 LOG(WARNING, "read(notifyfd) failed with %d: %s", errno, strerror(errno));
282 } else {
283 LOG(WARNING, "couldn't get a whole inotify_event(%d) (only got %d)\n",
284 sizeof(notifyevt), ret);
285 }
286 }
287 }
288
289 if (FD_ISSET(sigfd, &readfds)) {
290 while(read (sigfd, &fdsi, sizeof fdsi) > 0);
291 }
292 while (1) {
293 infop.si_pid = 0;
294 if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) == 0) {
295 if (infop.si_pid == 0) {
296 goto after_loop; // no more child process changes pending
297 }
298 switch (infop.si_code) {
299 case CLD_EXITED:
300 LOG(WARNING, "child %d (%s) exited with status %d\n",
301 infop.si_pid, children[infop.si_pid], infop.si_status);
302 break;
303 case CLD_DUMPED:
304 LOG(INFO, "child %d actually dumped core. "
305 "falling through to killed by signal case\n", infop.si_pid);
306 case CLD_KILLED:
307 LOG(WARNING, "child %d (%s) was killed by signal %d (%s)\n",
308 infop.si_pid, children[infop.si_pid], infop.si_status,
309 strsignal(infop.si_status));
310 break;
311 case CLD_STOPPED:
312 LOG(WARNING, "child %d (%s) was stopped by signal %d "
313 "(giving it a SIGCONT(%d))\n",
314 infop.si_pid, children[infop.si_pid], infop.si_status, SIGCONT);
315 kill(infop.si_pid, SIGCONT);
316 continue;
317 default:
318 LOG(WARNING, "something happened to child %d (%s) (killing it)\n",
319 infop.si_pid, children[infop.si_pid]);
320 kill(infop.si_pid, SIGKILL);
321 continue;
322 }
323 if (infop.si_pid == core) {
324 fprintf(stderr, "starter: si_code=%d CLD_EXITED=%d CLD_DUMPED=%d "
325 "CLD_KILLED=%d CLD_STOPPED=%d si_status=%d (sig '%s')\n",
326 infop.si_code, CLD_EXITED, CLD_DUMPED, CLD_KILLED,
327 CLD_STOPPED, infop.si_status, strsignal(infop.si_status));
328 // core has died. logging is down too
329 fputs("starter: error: core died. exiting\n", stderr);
330 niceexit(EXIT_FAILURE);
331 }
332
333 /*// remove all of the watches assigned to the pid that just died
334 for (auto it = watches.begin(); it != watches.end(); ++it) {
335 if (it->second == infop.si_pid) {
336 watches_to_ignore.insert(it->first);
337 }
338 }
339 for (auto it = watches_to_ignore.begin();
340 it != watches_to_ignore.end(); ++it) {
341 LOG(DEBUG, "watch id %d was on PID %d\n", *it, infop.si_pid);
342 watches.erase(*it);
343 }*/
344
345 start(children[infop.si_pid], 0);
346 LOG(DEBUG, "erasing %d from children\n", infop.si_pid);
347 children.erase(infop.si_pid);
348 } else {
349 LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno));
350 }
351 }
352after_loop: ;
353 }
354}