#include "aos/atom_code/starter/starter.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/signalfd.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/inotify.h>
#include <sys/stat.h>

#include "aos/common/logging/logging.h"
#include "aos/common/logging/logging_impl.h"
#include "aos/atom_code/init.h"
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 14 | |
| 15 | void niceexit(int status); |
| 16 | |
| 17 | pid_t start(const char *cmd, uint8_t times) { |
| 18 | char *which_cmd, *which_cmd_stm; |
| 19 | if (asprintf(&which_cmd, "which %s", cmd) == -1) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 20 | LOG(ERROR, "creating \"which %s\" failed with %d: %s\n", |
| 21 | cmd, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 22 | niceexit(EXIT_FAILURE); |
| 23 | } |
| 24 | if (asprintf(&which_cmd_stm, "which %s.stm", cmd) == -1) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 25 | LOG(ERROR, "creating \"which %s.stm\" failed with %d: %s\n", |
| 26 | cmd, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 27 | niceexit(EXIT_FAILURE); |
| 28 | } |
| 29 | FILE *which = popen(which_cmd, "r"); |
| 30 | char exe[CMDLEN + 5], orig_exe[CMDLEN]; |
| 31 | size_t ret; |
| 32 | if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which)) == CMDLEN) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 33 | LOG(ERROR, "`which %s` was too long. not starting '%s'\n", cmd, cmd); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 34 | return 0; |
| 35 | } |
| 36 | orig_exe[ret] = '\0'; |
| 37 | if (pclose(which) == -1) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 38 | LOG(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 39 | } |
| 40 | free(which_cmd); |
| 41 | if (strlen(orig_exe) == 0) { // which returned nothing; check if stm exists |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 42 | LOG(INFO, "%s didn't exist. trying %s.stm\n", cmd, cmd); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 43 | FILE *which_stm = popen(which_cmd_stm, "r"); |
| 44 | if ((ret = fread(orig_exe, 1, sizeof(orig_exe), which_stm)) == CMDLEN) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 45 | LOG(ERROR, "`which %s.stm` was too long. not starting %s\n", cmd, cmd); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 46 | return 0; |
| 47 | } |
| 48 | orig_exe[ret] = '\0'; |
| 49 | if (pclose(which) == -1) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 50 | LOG(WARNING, "pclose failed with %d: %s\n", errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 51 | } |
| 52 | } |
| 53 | if (strlen(orig_exe) == 0) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 54 | LOG(WARNING, "couldn't find file '%s[.stm]'. not going to start it\n", |
| 55 | cmd); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 56 | return 0; |
| 57 | } |
| 58 | if (orig_exe[strlen(orig_exe) - 1] != '\n') { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 59 | LOG(WARNING, "no \\n on the end of `which %s[.stm]` output ('%s')\n", |
| 60 | cmd, orig_exe); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 61 | } else { |
| 62 | orig_exe[strlen(orig_exe) - 1] = '\0'; // get rid of the \n |
| 63 | } |
| 64 | strncpy(exe, orig_exe, CMDLEN); |
| 65 | |
| 66 | strcat(exe, ".stm"); |
| 67 | struct stat st; |
| 68 | errno = 0; |
| 69 | if (stat(orig_exe, &st) != 0 && errno != ENOENT) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 70 | LOG(ERROR, "killing everything because stat('%s') failed with %d: %s\n", |
| 71 | orig_exe, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 72 | niceexit(EXIT_FAILURE); |
| 73 | } else if (errno == ENOENT) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 74 | LOG(WARNING, "binary '%s' doesn't exist. not starting it\n", orig_exe); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 75 | return 0; |
| 76 | } |
| 77 | struct stat st2; |
| 78 | // if we can confirm it's already 0 size |
| 79 | bool orig_zero = stat(orig_exe, &st2) == 0 && st2.st_size == 0; |
| 80 | if (!orig_zero) { |
| 81 | // if it failed and it wasn't because it was missing |
| 82 | if (unlink(exe) != 0 && (errno != EROFS && errno != ENOENT)) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 83 | LOG(ERROR, |
| 84 | "killing everything because unlink('%s') failed with %d: %s\n", |
| 85 | exe, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 86 | niceexit(EXIT_FAILURE); |
| 87 | } |
| 88 | if (link(orig_exe, exe) != 0) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 89 | LOG(ERROR, |
| 90 | "killing everything because link('%s', '%s') failed with %d: %s\n", |
| 91 | orig_exe, exe, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 92 | niceexit(EXIT_FAILURE); |
| 93 | } |
| 94 | } |
| 95 | if (errno == EEXIST) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 96 | LOG(INFO, "exe ('%s') already existed\n", exe); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 97 | } |
| 98 | |
| 99 | pid_t child; |
| 100 | if ((child = fork()) == 0) { |
| 101 | execlp(exe, orig_exe, static_cast<char *>(NULL)); |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 102 | LOG(ERROR, |
| 103 | "killing everything because execlp('%s', '%s', NULL) " |
| 104 | "failed with %d: %s\n", |
| 105 | exe, cmd, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 106 | _exit(EXIT_FAILURE); // don't niceexit or anything because this is the child!! |
| 107 | } |
| 108 | if (child == -1) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 109 | LOG(WARNING, "fork on '%s' failed with %d: %s", |
| 110 | cmd, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 111 | if (times < 100) { |
| 112 | return start(cmd, times + 1); |
| 113 | } else { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 114 | LOG(ERROR, "tried to start '%s' too many times. giving up\n", cmd); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 115 | return 0; |
| 116 | } |
| 117 | } else { |
| 118 | children[child] = cmd; |
| 119 | files[child] = orig_exe; |
| 120 | int ret = inotify_add_watch(notifyfd, orig_exe, IN_ATTRIB | IN_MODIFY); |
| 121 | if (ret < 0) { |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 122 | LOG(WARNING, "inotify_add_watch('%s') failed: " |
| 123 | "not going to watch for changes to it because of %d: %s\n", |
| 124 | orig_exe, errno, strerror(errno)); |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 125 | } else { |
| 126 | watches[ret] = child; |
| 127 | mtimes[ret] = st2.st_mtime; |
| 128 | } |
| 129 | return child; |
| 130 | } |
| 131 | } |
| 132 | |
| 133 | static bool exited = false; |
| 134 | void exit_handler() { |
| 135 | if(exited) { |
| 136 | return; |
| 137 | } else { |
| 138 | exited = true; |
| 139 | } |
| 140 | fputs("starter: killing all children for exit\n", stdout); |
| 141 | for (auto it = children.begin(); it != children.end(); ++it) { |
| 142 | printf("starter: killing child %d ('%s') for exit\n", it->first, it->second); |
| 143 | kill(it->first, SIGKILL); |
| 144 | } |
| 145 | if (sigfd != 0) { |
| 146 | close(sigfd); |
| 147 | } |
| 148 | if (notifyfd != 0) { |
| 149 | close(notifyfd); |
| 150 | } |
| 151 | } |
| 152 | void niceexit(int status) { |
| 153 | printf("starter: niceexit(%d) EXIT_SUCCESS=%d EXIT_FAILURE=%d\n", |
| 154 | status, EXIT_SUCCESS, EXIT_FAILURE); |
| 155 | exit_handler(); |
| 156 | exit(status); |
| 157 | } |
| 158 | |
| 159 | int main(int argc, char *argv[]) { |
| 160 | if (argc < 2) { |
| 161 | fputs("starter: error: need an argument specifying what file to use\n", stderr); |
| 162 | niceexit(EXIT_FAILURE); |
| 163 | } else if(argc > 2) { |
| 164 | fputs("starter: warning: too many arguments\n", stderr); |
| 165 | } |
| 166 | |
| 167 | atexit(exit_handler); |
| 168 | |
Brian Silverman | f665d69 | 2013-02-17 22:11:39 -0800 | [diff] [blame] | 169 | aos::logging::Init(); |
| 170 | |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 171 | notifyfd = inotify_init1(IN_NONBLOCK); |
| 172 | |
| 173 | pid_t core = start("core", 0); |
| 174 | if (core == 0) { |
| 175 | fputs("starter: error: core didn't exist\n", stderr); |
| 176 | niceexit(EXIT_FAILURE); |
| 177 | } |
| 178 | fprintf(stderr, "starter: info: core's pid is %jd\n", static_cast<intmax_t>(core)); |
| 179 | FILE *pid_file = fopen("/tmp/starter.pid", "w"); |
| 180 | if (pid_file == NULL) { |
| 181 | perror("fopen(/tmp/starter.pid)"); |
| 182 | } else { |
| 183 | if (fprintf(pid_file, "%d", core) == -1) { |
| 184 | fprintf(stderr, "starter: error: fprintf(pid_file, core(=%d)) failed " |
| 185 | "with %d: %s", |
| 186 | core, errno, strerror(errno)); |
| 187 | } |
| 188 | fclose(pid_file); |
| 189 | } |
| 190 | sleep(1); |
| 191 | if (kill(core, 0) != 0) { |
| 192 | fprintf(stderr, "starter: couldn't kill(%jd(=core), 0) because of %d: %s\n", |
| 193 | static_cast<intmax_t>(core), errno, strerror(errno)); |
| 194 | niceexit(EXIT_FAILURE); |
| 195 | } |
| 196 | fputs("starter: before init\n", stdout); |
| 197 | aos::InitNRT(); |
| 198 | fputs("starter: after init\n", stdout); |
| 199 | |
| 200 | FILE *list = fopen(argv[1], "re"); |
| 201 | char line[CMDLEN + 1]; |
| 202 | char *line_copy; |
| 203 | uint8_t too_long = 0; |
| 204 | while (fgets(line, sizeof(line), list) != NULL) { |
| 205 | if (line[strlen(line) - 1] != '\n') { |
| 206 | LOG(WARNING, "command segment '%s' is too long. " |
| 207 | "increase the size of the line char[] above " __FILE__ ": %d\n", |
| 208 | line, __LINE__); |
| 209 | too_long = 1; |
| 210 | continue; |
| 211 | } |
| 212 | if (too_long) { |
| 213 | too_long = 0; |
| 214 | LOG(WARNING, "\tgot last chunk of too long line: '%s'\n", line); |
| 215 | continue; // don't try running the last little chunk |
| 216 | } |
| 217 | line[strlen(line) - 1] = '\0'; // get rid of the \n |
| 218 | line_copy = new char[strlen(line) + 1]; |
| 219 | memcpy(line_copy, line, strlen(line) + 1); |
| 220 | fprintf(stderr, "starter: info: going to start \"%s\"\n", line_copy); |
| 221 | start(line_copy, 0); |
| 222 | } |
| 223 | fclose(list); |
| 224 | LOG(INFO, "started everything\n"); |
| 225 | |
| 226 | sigset_t mask; |
| 227 | sigemptyset (&mask); |
| 228 | sigaddset (&mask, SIGCHLD); |
| 229 | sigprocmask (SIG_BLOCK, &mask, NULL); |
| 230 | sigfd = signalfd (-1, &mask, O_NONBLOCK); |
| 231 | |
| 232 | fd_set readfds; |
| 233 | FD_ZERO(&readfds); |
| 234 | siginfo_t infop; |
| 235 | signalfd_siginfo fdsi; |
| 236 | inotify_event notifyevt; |
| 237 | int ret; |
| 238 | while (1) { |
| 239 | FD_SET(sigfd, &readfds); |
| 240 | FD_SET(notifyfd, &readfds); |
| 241 | timeval timeout; |
| 242 | timeout.tv_sec = restarts.empty() ? 2 : 0; |
| 243 | timeout.tv_usec = 100000; |
| 244 | ret = select (FD_SETSIZE, &readfds, NULL, NULL, &timeout); |
| 245 | |
| 246 | if (ret == 0) { // timeout |
| 247 | auto it = restarts.begin(); |
| 248 | // WARNING because the message about it dying will be |
| 249 | for (; it != restarts.end(); it++) { |
| 250 | LOG(WARNING, "restarting process %d ('%s') by giving it a SIGKILL(%d)\n", |
| 251 | *it, children[*it], SIGKILL); |
| 252 | kill(*it, SIGKILL); |
| 253 | } |
| 254 | restarts.clear(); |
| 255 | } |
| 256 | |
| 257 | if (FD_ISSET(notifyfd, &readfds)) { |
| 258 | if ((ret = read(notifyfd, ¬ifyevt, sizeof(notifyevt))) == |
| 259 | sizeof(notifyevt)) { |
| 260 | if (watches.count(notifyevt.wd)) { |
| 261 | struct stat st; |
| 262 | if (!children.count(watches[notifyevt.wd]) || |
| 263 | stat(files[watches[notifyevt.wd]], &st) == 0) { |
| 264 | if (mtimes[notifyevt.wd] == st.st_mtime) { |
| 265 | LOG(DEBUG, "ignoring trigger of watch id %d (file '%s')" |
| 266 | " because mtime didn't change\n", |
| 267 | notifyevt.wd, files[watches[notifyevt.wd]]); |
| 268 | } else if (children.count(watches[notifyevt.wd])) { |
| 269 | LOG(DEBUG, "adding process %d to the restart list\n", |
| 270 | watches[notifyevt.wd]); |
| 271 | restarts.insert(watches[notifyevt.wd]); |
| 272 | } else { |
| 273 | LOG(DEBUG, "children doesn't have entry for PID %d\n", |
| 274 | watches[notifyevt.wd]); |
| 275 | } |
| 276 | } else { |
| 277 | LOG(ERROR, "stat('%s') failed with %d: %s\n", |
| 278 | files[watches[notifyevt.wd]], errno, strerror(errno)); |
| 279 | } |
| 280 | } else { |
| 281 | LOG(WARNING, "no PID for watch id %d\n", notifyevt.wd); |
| 282 | } |
| 283 | } else { |
| 284 | if (ret == -1) { |
| 285 | LOG(WARNING, "read(notifyfd) failed with %d: %s", errno, strerror(errno)); |
| 286 | } else { |
| 287 | LOG(WARNING, "couldn't get a whole inotify_event(%d) (only got %d)\n", |
| 288 | sizeof(notifyevt), ret); |
| 289 | } |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | if (FD_ISSET(sigfd, &readfds)) { |
| 294 | while(read (sigfd, &fdsi, sizeof fdsi) > 0); |
| 295 | } |
| 296 | while (1) { |
| 297 | infop.si_pid = 0; |
| 298 | if (waitid(P_ALL, 0, &infop, WEXITED | WSTOPPED | WNOHANG) == 0) { |
| 299 | if (infop.si_pid == 0) { |
| 300 | goto after_loop; // no more child process changes pending |
| 301 | } |
| 302 | switch (infop.si_code) { |
| 303 | case CLD_EXITED: |
| 304 | LOG(WARNING, "child %d (%s) exited with status %d\n", |
| 305 | infop.si_pid, children[infop.si_pid], infop.si_status); |
| 306 | break; |
| 307 | case CLD_DUMPED: |
| 308 | LOG(INFO, "child %d actually dumped core. " |
| 309 | "falling through to killed by signal case\n", infop.si_pid); |
| 310 | case CLD_KILLED: |
| 311 | LOG(WARNING, "child %d (%s) was killed by signal %d (%s)\n", |
| 312 | infop.si_pid, children[infop.si_pid], infop.si_status, |
| 313 | strsignal(infop.si_status)); |
| 314 | break; |
| 315 | case CLD_STOPPED: |
| 316 | LOG(WARNING, "child %d (%s) was stopped by signal %d " |
| 317 | "(giving it a SIGCONT(%d))\n", |
| 318 | infop.si_pid, children[infop.si_pid], infop.si_status, SIGCONT); |
| 319 | kill(infop.si_pid, SIGCONT); |
| 320 | continue; |
| 321 | default: |
| 322 | LOG(WARNING, "something happened to child %d (%s) (killing it)\n", |
| 323 | infop.si_pid, children[infop.si_pid]); |
| 324 | kill(infop.si_pid, SIGKILL); |
| 325 | continue; |
| 326 | } |
| 327 | if (infop.si_pid == core) { |
| 328 | fprintf(stderr, "starter: si_code=%d CLD_EXITED=%d CLD_DUMPED=%d " |
| 329 | "CLD_KILLED=%d CLD_STOPPED=%d si_status=%d (sig '%s')\n", |
| 330 | infop.si_code, CLD_EXITED, CLD_DUMPED, CLD_KILLED, |
| 331 | CLD_STOPPED, infop.si_status, strsignal(infop.si_status)); |
| 332 | // core has died. logging is down too |
| 333 | fputs("starter: error: core died. exiting\n", stderr); |
| 334 | niceexit(EXIT_FAILURE); |
| 335 | } |
| 336 | |
brians | 343bc11 | 2013-02-10 01:53:46 +0000 | [diff] [blame] | 337 | start(children[infop.si_pid], 0); |
| 338 | LOG(DEBUG, "erasing %d from children\n", infop.si_pid); |
| 339 | children.erase(infop.si_pid); |
| 340 | } else { |
| 341 | LOG(WARNING, "waitid failed with %d: %s", errno, strerror(errno)); |
| 342 | } |
| 343 | } |
| 344 | after_loop: ; |
| 345 | } |
| 346 | } |