brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 1 | #include "aos/atom_code/starter/starter.h" |
| 2 | |
| 3 | #include <stdio.h> |
| 4 | #include <stdlib.h> |
| 5 | #include <sys/signalfd.h> |
| 6 | #include <sys/types.h> |
| 7 | #include <fcntl.h> |
| 8 | #include <sys/inotify.h> |
| 9 | #include <sys/stat.h> |
| 10 | |
| 11 | #include "aos/aos_core.h" |
| 12 | |
| 13 | void niceexit(int status); |
| 14 | |
| 15 | pid_t start(const char *cmd, uint8_t times) { |
| 16 | char *which_cmd, *which_cmd_stm; |
| 17 | if (asprintf(&which_cmd, "which %s", cmd) == -1) { |
| 18 | LOG_IFINIT(ERROR, "creating \"which %s\" failed with %d: %s\n", |
| 19 | cmd, errno, strerror(errno)); |
| 20 | niceexit(EXIT_FAILURE); |
| 21 | } |
| 22 | if (asprintf(&which_cmd_stm, "which %s.stm", cmd) == -1) { |
| 23 | LOG_IFINIT(ERROR, "creating \"which %s.stm\" failed with %d: %s\n", |
| 24 | cmd, errno, strerror(errno)); |
| 25 | niceexit(EXIT_FAILURE); |
| 26 | } |
| 27 | FILE *which = popen(which_cmd, "r"); |
| 28 | char exe[CMDLEN + 5], orig_exe[CMDLEN]; |
| 29 | size_t ret; |
| 30 | if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which)) == CMDLEN) { |
| 31 | LOG_IFINIT(ERROR, "`which %s` was too long. not starting '%s'\n", cmd, cmd); |
| 32 | return 0; |
| 33 | } |
| 34 | orig_exe[ret] = '\0'; |
| 35 | if (pclose(which) == -1) { |
| 36 | LOG_IFINIT(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno)); |
| 37 | } |
| 38 | free(which_cmd); |
| 39 | if (strlen(orig_exe) == 0) { // which returned nothing; check if stm exists |
| 40 | LOG_IFINIT(INFO, "%s didn't exist. trying %s.stm\n", cmd, cmd); |
| 41 | FILE *which_stm = popen(which_cmd_stm, "r"); |
| 42 | if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which_stm)) == CMDLEN) { |
| 43 | LOG_IFINIT(ERROR, "`which %s.stm` was too long. not starting %s\n", cmd, cmd); |
| 44 | return 0; |
| 45 | } |
| 46 | orig_exe[ret] = '\0'; |
| 47 | if (pclose(which) == -1) { |
| 48 | LOG_IFINIT(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno)); |
| 49 | } |
| 50 | } |
| 51 | if (strlen(orig_exe) == 0) { |
| 52 | LOG_IFINIT(WARNING, "couldn't find file '%s[.stm]'. not going to start it\n", |
| 53 | cmd); |
| 54 | return 0; |
| 55 | } |
| 56 | if (orig_exe[strlen(orig_exe) - 1] != '\n') { |
| 57 | LOG_IFINIT(WARNING, "no \\n on the end of `which %s[.stm]` output ('%s')\n", |
| 58 | cmd, orig_exe); |
| 59 | } else { |
| 60 | orig_exe[strlen(orig_exe) - 1] = '\0'; // get rid of the \n |
| 61 | } |
| 62 | strncpy(exe, orig_exe, CMDLEN); |
| 63 | |
| 64 | strcat(exe, ".stm"); |
| 65 | struct stat st; |
| 66 | errno = 0; |
| 67 | if (stat(orig_exe, &st) != 0 && errno != ENOENT) { |
| 68 | LOG_IFINIT(ERROR, "killing everything because stat('%s') failed with %d: %s\n", |
| 69 | orig_exe, errno, strerror(errno)); |
| 70 | niceexit(EXIT_FAILURE); |
| 71 | } else if (errno == ENOENT) { |
| 72 | LOG_IFINIT(WARNING, "binary '%s' doesn't exist. not starting it\n", orig_exe); |
| 73 | return 0; |
| 74 | } |
| 75 | struct stat st2; |
| 76 | // if we can confirm it's already 0 size |
| 77 | bool orig_zero = stat(orig_exe, &st2) == 0 && st2.st_size == 0; |
| 78 | if (!orig_zero) { |
| 79 | // if it failed and it wasn't because it was missing |
| 80 | if (unlink(exe) != 0 && (errno != EROFS && errno != ENOENT)) { |
| 81 | LOG_IFINIT(ERROR, |
| 82 | "killing everything because unlink('%s') failed with %d: %s\n", |
| 83 | exe, errno, strerror(errno)); |
| 84 | niceexit(EXIT_FAILURE); |
| 85 | } |
| 86 | if (link(orig_exe, exe) != 0) { |
| 87 | LOG_IFINIT(ERROR, |
| 88 | "killing everything because link('%s', '%s') failed with %d: %s\n", |
| 89 | orig_exe, exe, errno, strerror(errno)); |
| 90 | niceexit(EXIT_FAILURE); |
| 91 | } |
| 92 | } |
| 93 | if (errno == EEXIST) { |
| 94 | LOG_IFINIT(INFO, "exe ('%s') already existed\n", exe); |
| 95 | } |
| 96 | |
| 97 | pid_t child; |
| 98 | if ((child = fork()) == 0) { |
| 99 | execlp(exe, orig_exe, static_cast<char *>(NULL)); |
| 100 | LOG_IFINIT(ERROR, |
| 101 | "killing everything because execlp('%s', '%s', NULL) " |
| 102 | "failed with %d: %s\n", |
| 103 | exe, cmd, errno, strerror(errno)); |
| 104 | _exit(EXIT_FAILURE); // don't niceexit or anything because this is the child!! |
| 105 | } |
| 106 | if (child == -1) { |
| 107 | LOG_IFINIT(WARNING, "fork on '%s' failed with %d: %s", |
| 108 | cmd, errno, strerror(errno)); |
| 109 | if (times < 100) { |
| 110 | return start(cmd, times + 1); |
| 111 | } else { |
| 112 | LOG_IFINIT(ERROR, "tried to start '%s' too many times. giving up\n", cmd); |
| 113 | return 0; |
| 114 | } |
| 115 | } else { |
| 116 | children[child] = cmd; |
| 117 | files[child] = orig_exe; |
| 118 | int ret = inotify_add_watch(notifyfd, orig_exe, IN_ATTRIB | IN_MODIFY); |
| 119 | if (ret < 0) { |
| 120 | LOG_IFINIT(WARNING, "inotify_add_watch('%s') failed: " |
| 121 | "not going to watch for changes to it because of %d: %s\n", |
| 122 | orig_exe, errno, strerror(errno)); |
| 123 | } else { |
| 124 | watches[ret] = child; |
| 125 | mtimes[ret] = st2.st_mtime; |
| 126 | } |
| 127 | return child; |
| 128 | } |
| 129 | } |
| 130 | |
| 131 | static bool exited = false; |
| 132 | void exit_handler() { |
| 133 | if(exited) { |
| 134 | return; |
| 135 | } else { |
| 136 | exited = true; |
| 137 | } |
| 138 | fputs("starter: killing all children for exit\n", stdout); |
| 139 | for (auto it = children.begin(); it != children.end(); ++it) { |
| 140 | printf("starter: killing child %d ('%s') for exit\n", it->first, it->second); |
| 141 | kill(it->first, SIGKILL); |
| 142 | } |
| 143 | if (sigfd != 0) { |
| 144 | close(sigfd); |
| 145 | } |
| 146 | if (notifyfd != 0) { |
| 147 | close(notifyfd); |
| 148 | } |
| 149 | } |
| 150 | void niceexit(int status) { |
| 151 | printf("starter: niceexit(%d) EXIT_SUCCESS=%d EXIT_FAILURE=%d\n", |
| 152 | status, EXIT_SUCCESS, EXIT_FAILURE); |
| 153 | exit_handler(); |
| 154 | exit(status); |
| 155 | } |
| 156 | |
| 157 | int main(int argc, char *argv[]) { |
| 158 | if (argc < 2) { |
| 159 | fputs("starter: error: need an argument specifying what file to use\n", stderr); |
| 160 | niceexit(EXIT_FAILURE); |
| 161 | } else if(argc > 2) { |
| 162 | fputs("starter: warning: too many arguments\n", stderr); |
| 163 | } |
| 164 | |
| 165 | atexit(exit_handler); |
| 166 | |
| 167 | notifyfd = inotify_init1(IN_NONBLOCK); |
| 168 | |
| 169 | pid_t core = start("core", 0); |
| 170 | if (core == 0) { |
| 171 | fputs("starter: error: core didn't exist\n", stderr); |
| 172 | niceexit(EXIT_FAILURE); |
| 173 | } |
| 174 | fprintf(stderr, "starter: info: core's pid is %jd\n", static_cast<intmax_t>(core)); |
| 175 | FILE *pid_file = fopen("/tmp/starter.pid", "w"); |
| 176 | if (pid_file == NULL) { |
| 177 | perror("fopen(/tmp/starter.pid)"); |
| 178 | } else { |
| 179 | if (fprintf(pid_file, "%d", core) == -1) { |
| 180 | fprintf(stderr, "starter: error: fprintf(pid_file, core(=%d)) failed " |
| 181 | "with %d: %s", |
| 182 | core, errno, strerror(errno)); |
| 183 | } |
| 184 | fclose(pid_file); |
| 185 | } |
| 186 | sleep(1); |
| 187 | if (kill(core, 0) != 0) { |
| 188 | fprintf(stderr, "starter: couldn't kill(%jd(=core), 0) because of %d: %s\n", |
| 189 | static_cast<intmax_t>(core), errno, strerror(errno)); |
| 190 | niceexit(EXIT_FAILURE); |
| 191 | } |
| 192 | fputs("starter: before init\n", stdout); |
| 193 | aos::InitNRT(); |
| 194 | fputs("starter: after init\n", stdout); |
| 195 | |
| 196 | FILE *list = fopen(argv[1], "re"); |
| 197 | char line[CMDLEN + 1]; |
| 198 | char *line_copy; |
| 199 | uint8_t too_long = 0; |
| 200 | while (fgets(line, sizeof(line), list) != NULL) { |
| 201 | if (line[strlen(line) - 1] != '\n') { |
| 202 | LOG(WARNING, "command segment '%s' is too long. " |
| 203 | "increase the size of the line char[] above " __FILE__ ": %d\n", |
| 204 | line, __LINE__); |
| 205 | too_long = 1; |
| 206 | continue; |
| 207 | } |
| 208 | if (too_long) { |
| 209 | too_long = 0; |
| 210 | LOG(WARNING, "\tgot last chunk of too long line: '%s'\n", line); |
| 211 | continue; // don't try running the last little chunk |
| 212 | } |
| 213 | line[strlen(line) - 1] = '\0'; // get rid of the \n |
| 214 | line_copy = new char[strlen(line) + 1]; |
| 215 | memcpy(line_copy, line, strlen(line) + 1); |
| 216 | fprintf(stderr, "starter: info: going to start \"%s\"\n", line_copy); |
| 217 | start(line_copy, 0); |
| 218 | } |
| 219 | fclose(list); |
| 220 | LOG(INFO, "started everything\n"); |
| 221 | |
| 222 | sigset_t mask; |
| 223 | sigemptyset (&mask); |
| 224 | sigaddset (&mask, SIGCHLD); |
| 225 | sigprocmask (SIG_BLOCK, &mask, NULL); |
| 226 | sigfd = signalfd (-1, &mask, O_NONBLOCK); |
| 227 | |
| 228 | fd_set readfds; |
| 229 | FD_ZERO(&readfds); |
| 230 | siginfo_t infop; |
| 231 | signalfd_siginfo fdsi; |
| 232 | inotify_event notifyevt; |
| 233 | int ret; |
| 234 | while (1) { |
| 235 | FD_SET(sigfd, &readfds); |
| 236 | FD_SET(notifyfd, &readfds); |
| 237 | timeval timeout; |
| 238 | timeout.tv_sec = restarts.empty() ? 2 : 0; |
| 239 | timeout.tv_usec = 100000; |
| 240 | ret = select (FD_SETSIZE, &readfds, NULL, NULL, &timeout); |
| 241 | |
| 242 | if (ret == 0) { // timeout |
| 243 | auto it = restarts.begin(); |
| 244 | // WARNING because the message about it dying will be |
| 245 | for (; it != restarts.end(); it++) { |
| 246 | LOG(WARNING, "restarting process %d ('%s') by giving it a SIGKILL(%d)\n", |
| 247 | *it, children[*it], SIGKILL); |
| 248 | kill(*it, SIGKILL); |
| 249 | } |
| 250 | restarts.clear(); |
| 251 | } |
| 252 | |
| 253 | if (FD_ISSET(notifyfd, &readfds)) { |
| 254 | if ((ret = read(notifyfd, ¬ifyevt, sizeof(notifyevt))) == |
| 255 | sizeof(notifyevt)) { |
| 256 | if (watches.count(notifyevt.wd)) { |
| 257 | struct stat st; |
| 258 | if (!children.count(watches[notifyevt.wd]) || |
| 259 | stat(files[watches[notifyevt.wd]], &st) == 0) { |
| 260 | if (mtimes[notifyevt.wd] == st.st_mtime) { |
| 261 | LOG(DEBUG, "ignoring trigger of watch id %d (file '%s')" |
| 262 | " because mtime didn't change\n", |
| 263 | notifyevt.wd, files[watches[notifyevt.wd]]); |
| 264 | } else if (children.count(watches[notifyevt.wd])) { |
| 265 | LOG(DEBUG, "adding process %d to the restart list\n", |
| 266 | watches[notifyevt.wd]); |
| 267 | restarts.insert(watches[notifyevt.wd]); |
| 268 | } else { |
| 269 | LOG(DEBUG, "children doesn't have entry for PID %d\n", |
| 270 | watches[notifyevt.wd]); |
| 271 | } |
| 272 | } else { |
| 273 | LOG(ERROR, "stat('%s') failed with %d: %s\n", |
| 274 | files[watches[notifyevt.wd]], errno, strerror(errno)); |
| 275 | } |
| 276 | } else { |
| 277 | LOG(WARNING, "no PID for watch id %d\n", notifyevt.wd); |
| 278 | } |
| 279 | } else { |
| 280 | if (ret == -1) { |
| 281 | LOG(WARNING, "read(notifyfd) failed with %d: %s", errno, strerror(errno)); |
| 282 | } else { |
| 283 | LOG(WARNING, "couldn't get a whole inotify_event(%d) (only got %d)\n", |
| 284 | sizeof(notifyevt), ret); |
| 285 | } |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | if (FD_ISSET(sigfd, &readfds)) { |
| 290 | while(read (sigfd, &fdsi, sizeof fdsi) > 0); |
| 291 | } |
| 292 | while (1) { |
| 293 | infop.si_pid = 0; |
| 294 | if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) == 0) { |
| 295 | if (infop.si_pid == 0) { |
| 296 | goto after_loop; // no more child process changes pending |
| 297 | } |
| 298 | switch (infop.si_code) { |
| 299 | case CLD_EXITED: |
| 300 | LOG(WARNING, "child %d (%s) exited with status %d\n", |
| 301 | infop.si_pid, children[infop.si_pid], infop.si_status); |
| 302 | break; |
| 303 | case CLD_DUMPED: |
| 304 | LOG(INFO, "child %d actually dumped core. " |
| 305 | "falling through to killed by signal case\n", infop.si_pid); |
| 306 | case CLD_KILLED: |
| 307 | LOG(WARNING, "child %d (%s) was killed by signal %d (%s)\n", |
| 308 | infop.si_pid, children[infop.si_pid], infop.si_status, |
| 309 | strsignal(infop.si_status)); |
| 310 | break; |
| 311 | case CLD_STOPPED: |
| 312 | LOG(WARNING, "child %d (%s) was stopped by signal %d " |
| 313 | "(giving it a SIGCONT(%d))\n", |
| 314 | infop.si_pid, children[infop.si_pid], infop.si_status, SIGCONT); |
| 315 | kill(infop.si_pid, SIGCONT); |
| 316 | continue; |
| 317 | default: |
| 318 | LOG(WARNING, "something happened to child %d (%s) (killing it)\n", |
| 319 | infop.si_pid, children[infop.si_pid]); |
| 320 | kill(infop.si_pid, SIGKILL); |
| 321 | continue; |
| 322 | } |
| 323 | if (infop.si_pid == core) { |
| 324 | fprintf(stderr, "starter: si_code=%d CLD_EXITED=%d CLD_DUMPED=%d " |
| 325 | "CLD_KILLED=%d CLD_STOPPED=%d si_status=%d (sig '%s')\n", |
| 326 | infop.si_code, CLD_EXITED, CLD_DUMPED, CLD_KILLED, |
| 327 | CLD_STOPPED, infop.si_status, strsignal(infop.si_status)); |
| 328 | // core has died. logging is down too |
| 329 | fputs("starter: error: core died. exiting\n", stderr); |
| 330 | niceexit(EXIT_FAILURE); |
| 331 | } |
| 332 | |
| 333 | /*// remove all of the watches assigned to the pid that just died |
| 334 | for (auto it = watches.begin(); it != watches.end(); ++it) { |
| 335 | if (it->second == infop.si_pid) { |
| 336 | watches_to_ignore.insert(it->first); |
| 337 | } |
| 338 | } |
| 339 | for (auto it = watches_to_ignore.begin(); |
| 340 | it != watches_to_ignore.end(); ++it) { |
| 341 | LOG(DEBUG, "watch id %d was on PID %d\n", *it, infop.si_pid); |
| 342 | watches.erase(*it); |
| 343 | }*/ |
| 344 | |
| 345 | start(children[infop.si_pid], 0); |
| 346 | LOG(DEBUG, "erasing %d from children\n", infop.si_pid); |
| 347 | children.erase(infop.si_pid); |
| 348 | } else { |
| 349 | LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno)); |
| 350 | } |
| 351 | } |
| 352 | after_loop: ; |
| 353 | } |
| 354 | } |