diff --git a/doc/ChangeLog.md b/doc/ChangeLog.md index ae0635bb..35c61552 100644 --- a/doc/ChangeLog.md +++ b/doc/ChangeLog.md @@ -3,6 +3,31 @@ Change Log All relevant changes are documented in this file. +[Unreleased] +------------ + +### Changes + +- Restart log now spells out the signal name and flags core dumps, + e.g. `killed by SIGKILL` or `killed by SIGSEGV, core dumped`, in + place of the bare numeric `by signal: N`. Gives operators a much + stronger breadcrumb when a daemon dies unexpectedly + +### Fixes + +- Remove stale daemon-owned (`pid:!`) pidfiles after unclean exits. + When a daemon dies via SIGKILL/OOM/segfault, or exits early during + startup, its pidfile lingers and can prevent the next instance from + starting -- `dbus-daemon` for example refuses to start when its + pidfile already exists, so Finit's restart loop gives up. Finit now + drops the file when it still names the just-reaped PID and that PID + is no longer alive (the liveness check guards against PID reuse). + This is a controlled exception to the long-standing rule that Finit + does not touch service-owned pidfiles +- Fix misspelled `SIGUNKOWN` returned by `sig_name()` for unknown + signal numbers, now spelled correctly as `SIGUNKNOWN`. Surfaced by + the new restart log above + [4.17][] - 2026-04-28 --------------------- diff --git a/doc/config/services.md b/doc/config/services.md index a315b372..a65b037c 100644 --- a/doc/config/services.md +++ b/doc/config/services.md @@ -31,6 +31,14 @@ basename of the binary to guess the PID file to watch for the PID: `/var/run/ntpd.pid`. If Finit guesses wrong, you have to submit the full `pid:!/path/to/file.pid`. +With `pid:!/path`, the file belongs to the service: Finit reads it +but does not create or remove it. The one exception is *stale* +cleanup — if the service dies without removing its own pidfile +(SIGKILL, OOM, segfault), and the file still names the just-reaped +PID, Finit removes it before the next retry. This prevents daemons +that refuse to start on an existing pidfile (e.g. `dbus-daemon`) +from getting stuck in a crash-restart loop. + **Example:** In the case of `ospfd` (below), we omit the `-d` flag (daemonize) to diff --git a/src/pid.c b/src/pid.c index 16f97b59..60994061 100644 --- a/src/pid.c +++ b/src/pid.c @@ -201,7 +201,10 @@ int pid_file_set(svc_t *svc, char *file, int not) * pid:!foo --> !/run/foo.pid * pid:!/run/foo.pid --> !/run/foo.pid * - * Note, nothing is created or removed by Finit in this latter form. + * Nothing is created or removed by Finit in this latter form, with one + * exception: a verifiably stale pidfile (still names the just-reaped + * PID, and that PID is no longer alive) is removed so the next instance + * can start. See service_clean_pidfile() in src/service.c. */ int pid_file_parse(svc_t *svc, char *arg) { diff --git a/src/service.c b/src/service.c index 7ed4fceb..127e0099 100644 --- a/src/service.c +++ b/src/service.c @@ -1120,6 +1120,35 @@ static void service_notify_stop(svc_t *svc) } } +/* + * Drop a daemon-owned (pid:!) pidfile if it still names the just-reaped + * PID and that PID is gone. The liveness check guards against reuse. + */ +static void service_clean_pidfile(svc_t *svc, pid_t reaped) +{ + pid_t pid; + char *fn; + + if (reaped <= 1) + return; + + fn = pid_file(svc); + if (!fn) + return; + + pid = pid_file_read(fn); + if (pid != reaped || pid_alive(pid)) + return; + + if (remove(fn) && errno != ENOENT) { + logit(LOG_CRIT, "Failed removing stale service %s pidfile %s", + svc_ident(svc, NULL, 0), fn); + return; + } + + dbg("Removed stale service %s pidfile %s", svc_ident(svc, NULL, 0), fn); +} + /* * Clean up any lingering state from dead/killed services */ @@ -1137,6 +1166,8 @@ static void service_cleanup(svc_t *svc) if (remove(fn) && errno != ENOENT) logit(LOG_CRIT, "Failed removing service %s pidfile %s", svc_ident(svc, NULL, 0), fn); + } else if (svc->pidfile[0] == '!') { + service_clean_pidfile(svc, svc->pid); } /* @@ -2405,7 +2436,10 @@ void service_monitor(pid_t lost, int status) if (svc_is_forking(svc)) { /* Likely start script exiting */ if (svc_is_starting(svc)) { - svc->pid = 0; /* Expect no more activity from this one */ + /* Daemon died before clearing 'starting'; drop any stale pidfile. */ + service_clean_pidfile(svc, lost); + svc->oldpid = lost; /* So service_retry() logs the real PID */ + svc->pid = 0; /* Expect no more activity from this one */ goto cont; } @@ -2794,13 +2828,18 @@ static void service_retry(svc_t *svc) timeout = ((*restart_cnt) <= (svc->restart_max / 2)) ? 2000 : 5000; /* If a longer timeout was specified in the conf, use that instead. */ svc->restart_tmo = max(svc->restart_tmo, timeout); - logit(LOG_CONSOLE|LOG_WARNING, "Service %s[%d] died (%s%d), restarting (retry in %d msec) (attempt: %d/%d)", - svc_ident(svc, NULL, 0), svc->oldpid, - WIFEXITED(svc->status) ? "with exit status: " : "by signal: ", - WIFEXITED(svc->status) ? WEXITSTATUS(svc->status) : WTERMSIG(svc->status), - svc->restart_tmo, - *restart_cnt, - svc->restart_max); + if (WIFEXITED(svc->status)) + logit(LOG_CONSOLE|LOG_WARNING, + "Service %s[%d] died (exit status: %d), restarting (retry in %d msec) (attempt: %d/%d)", + svc_ident(svc, NULL, 0), svc->oldpid, WEXITSTATUS(svc->status), + svc->restart_tmo, *restart_cnt, svc->restart_max); + else + logit(LOG_CONSOLE|LOG_WARNING, + "Service %s[%d] died (killed by %s%s), restarting (retry in %d msec) (attempt: %d/%d)", + svc_ident(svc, NULL, 0), svc->oldpid, + sig_name(WTERMSIG(svc->status)), + WCOREDUMP(svc->status) ? ", core dumped" : "", + svc->restart_tmo, *restart_cnt, svc->restart_max); svc_unblock(svc); service_step(svc); diff --git a/src/sig.c b/src/sig.c index 204cefbc..6db745d3 100644 --- a/src/sig.c +++ b/src/sig.c @@ -630,7 +630,7 @@ const char *sig_name(int signo) return signames[i].name; } - return "SIGUNKOWN"; + return "SIGUNKNOWN"; } /* diff --git a/test/Makefile.am b/test/Makefile.am index 6e5ddfce..da6eb905 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -41,6 +41,7 @@ EXTRA_DIST += global-envs.sh EXTRA_DIST += initctl-status-subset.sh EXTRA_DIST += notify.sh EXTRA_DIST += pidfile.sh +EXTRA_DIST += stale-pidfile.sh EXTRA_DIST += pre-post-serv.sh EXTRA_DIST += pre-fail.sh EXTRA_DIST += process-depends.sh @@ -84,6 +85,7 @@ TESTS += global-envs.sh TESTS += initctl-status-subset.sh TESTS += notify.sh TESTS += pidfile.sh +TESTS += stale-pidfile.sh TESTS += pre-post-serv.sh TESTS += pre-fail.sh TESTS += process-depends.sh diff --git a/test/src/serv.c b/test/src/serv.c index 8129efb1..16d35bb1 100644 --- a/test/src/serv.c +++ b/test/src/serv.c @@ -34,6 +34,7 @@ volatile sig_atomic_t reloading = 1; volatile sig_atomic_t running = 1; static char *ident = PROGNM; static char fn[80]; +static int exclusive = 0; /* -x: refuse to start if pidfile exists (dbus-style) */ static void verify_env(char *arg) { @@ -122,7 +123,7 @@ static void writefn(char *fn, int val) static int checkfn(char *fn) { - return !access(fn, R_OK); + return !access(fn, F_OK); } static void mine(char *fn) @@ -133,12 +134,18 @@ static void mine(char *fn) static void pidfile(char *pidfn) { + static int once = 0; + if (!pidfn) { if (fn[0] == 0) snprintf(fn, sizeof(fn), "%s%s.pid", _PATH_VARRUN, ident); pidfn = fn; } + if (exclusive && !once && checkfn(pidfn)) + errx(EX_SOFTWARE, "PID file %s exists, refusing to start", pidfn); + once = 1; + if (!checkfn(pidfn)) { pid_t pid; @@ -148,7 +155,7 @@ static void pidfile(char *pidfn) atexit(cleanup); } else { inf("Touching PID file %s", pidfn); - utimensat(0, fn, NULL, 0); + utimensat(AT_FDCWD, pidfn, NULL, 0); } } @@ -182,6 +189,7 @@ static int usage(int rc) " -p Create PID file despite running in foreground\n" " -P FILE Create PID file using FILE\n" " -r SVC Call initctl to restart service SVC (self)\n" + " -x Refuse to start if PID file already exists (dbus-style)\n" "\n" "By default this program daemonizes itself to the background, and,\n" "when it's done setting up its signal handler(s), creates a PID file\n" @@ -212,7 +220,7 @@ int main(int argc, char *argv[]) char cmd[80]; int c; - while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:")) != EOF) { + while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:x")) != EOF) { switch (c) { case 'c': do_crash = 1; @@ -249,12 +257,17 @@ int main(int argc, char *argv[]) break; case 'P': pidfn = optarg; + if ((size_t)snprintf(fn, sizeof(fn), "%s", optarg) >= sizeof(fn)) + errx(EX_USAGE, "-P path too long (max %zu)", sizeof(fn) - 1); do_pidfile++; break; case 'r': snprintf(cmd, sizeof(cmd), "initctl restart %s", optarg); do_restart = 1; break; + case 'x': + exclusive = 1; + break; default: return usage(1); } diff --git a/test/stale-pidfile.sh b/test/stale-pidfile.sh new file mode 100755 index 00000000..07b4be47 --- /dev/null +++ b/test/stale-pidfile.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# Regression: Finit must remove a daemon-owned (pid:!) stale pidfile +# left behind by an unclean exit (SIGKILL), so the next instance can +# start. Simulates the dbus-daemon pattern: 'serv -x' refuses to +# start if its pidfile already exists. + +set -eu + +TEST_DIR=$(dirname "$0") +PIDFN="/run/serv.pid" + +test_teardown() +{ + say "Running test teardown." + run "rm -f $FINIT_CONF $PIDFN" +} + +# shellcheck source=/dev/null +. "$TEST_DIR/lib/setup.sh" + +say "Add service stanza '$FINIT_CONF' with pid:!$PIDFN" +run "echo 'service pid:!$PIDFN serv -np -P $PIDFN -x' > $FINIT_CONF" +run "initctl reload" + +retry "assert_num_children 1 serv" +assert_is_pidfile "serv" "$PIDFN" + +PID=$(texec cat "$PIDFN") +say "SIGKILL serv ($PID) -- leaves stale pidfile" +run "kill -9 $PID" + +# Without the fix, 'serv -x' refuses to start on each retry until +# Finit hits restart_max and marks the service crashed. With the fix +# Finit removes the stale pidfile and the next instance comes up. +retry "assert_pidiff serv $PID" +retry "assert_num_children 1 serv" + +sep