From 4a53f610cd05c2aba3da770384460f7e66488ff5 Mon Sep 17 00:00:00 2001 From: Joachim Wiberg Date: Mon, 11 May 2026 13:55:11 +0200 Subject: [PATCH 1/5] service: clean stale pidfile after unclean daemon exit With `pid:!/path` Finit does not manage the file -- the daemon creates it on start and removes it on graceful exit. If the daemon dies before cleanup (SIGKILL, OOM, segfault, exit during startup) the file lingers and can block the next instance from starting, e.g. dbus-daemon refuses with EEXIST and the restart loop fails. Remove the file when it still names the just-reaped PID and that PID is no longer alive (the liveness check guards against reuse). Called from service_cleanup(), and from service_monitor()'s forking+starting branch where cleanup was previously skipped. Signed-off-by: Joachim Wiberg --- src/service.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/service.c b/src/service.c index 7ed4fceb..e930c4fd 100644 --- a/src/service.c +++ b/src/service.c @@ -1120,6 +1120,35 @@ static void service_notify_stop(svc_t *svc) } } +/* + * Drop a daemon-owned (pid:!) pidfile if it still names the just-reaped + * PID and that PID is gone. The liveness check guards against reuse. + */ +static void service_clean_pidfile(svc_t *svc, pid_t reaped) +{ + pid_t pid; + char *fn; + + if (reaped <= 1) + return; + + fn = pid_file(svc); + if (!fn) + return; + + pid = pid_file_read(fn); + if (pid != reaped || pid_alive(pid)) + return; + + if (remove(fn) && errno != ENOENT) { + logit(LOG_CRIT, "Failed removing stale service %s pidfile %s", + svc_ident(svc, NULL, 0), fn); + return; + } + + dbg("Removed stale service %s pidfile %s", svc_ident(svc, NULL, 0), fn); +} + /* * Clean up any lingering state from dead/killed services */ @@ -1137,6 +1166,8 @@ static void service_cleanup(svc_t *svc) if (remove(fn) && errno != ENOENT) logit(LOG_CRIT, "Failed removing service %s pidfile %s", svc_ident(svc, NULL, 0), fn); + } else if (svc->pidfile[0] == '!') { + service_clean_pidfile(svc, svc->pid); } /* @@ -2405,7 +2436,10 @@ void service_monitor(pid_t lost, int status) if (svc_is_forking(svc)) { /* Likely start script exiting */ if (svc_is_starting(svc)) { - svc->pid = 0; /* Expect no more activity from this one */ + /* Daemon died before clearing 'starting'; drop any stale pidfile. */ + service_clean_pidfile(svc, lost); + svc->oldpid = lost; /* So service_retry() logs the real PID */ + svc->pid = 0; /* Expect no more activity from this one */ goto cont; } From 30f2ca3b2e64bce7db1e2d9dcb37a06d53e0b6bf Mon Sep 17 00:00:00 2001 From: Joachim Wiberg Date: Mon, 11 May 2026 17:08:25 +0200 Subject: [PATCH 2/5] service: log signal name and core dumps in death message Replace the bare signal number ("by signal: 9") with the symbolic name ("killed by SIGKILL") and annotate when the kernel wrote a core:("killed by SIGSEGV, core dumped"). Makes the restart line self-explanatory and gives operators a strong breadcrumb when a daemon dies unexpectedly. Signed-off-by: Joachim Wiberg --- src/service.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/service.c b/src/service.c index e930c4fd..127e0099 100644 --- a/src/service.c +++ b/src/service.c @@ -2828,13 +2828,18 @@ static void service_retry(svc_t *svc) timeout = ((*restart_cnt) <= (svc->restart_max / 2)) ? 2000 : 5000; /* If a longer timeout was specified in the conf, use that instead. */ svc->restart_tmo = max(svc->restart_tmo, timeout); - logit(LOG_CONSOLE|LOG_WARNING, "Service %s[%d] died (%s%d), restarting (retry in %d msec) (attempt: %d/%d)", - svc_ident(svc, NULL, 0), svc->oldpid, - WIFEXITED(svc->status) ? "with exit status: " : "by signal: ", - WIFEXITED(svc->status) ? WEXITSTATUS(svc->status) : WTERMSIG(svc->status), - svc->restart_tmo, - *restart_cnt, - svc->restart_max); + if (WIFEXITED(svc->status)) + logit(LOG_CONSOLE|LOG_WARNING, + "Service %s[%d] died (exit status: %d), restarting (retry in %d msec) (attempt: %d/%d)", + svc_ident(svc, NULL, 0), svc->oldpid, WEXITSTATUS(svc->status), + svc->restart_tmo, *restart_cnt, svc->restart_max); + else + logit(LOG_CONSOLE|LOG_WARNING, + "Service %s[%d] died (killed by %s%s), restarting (retry in %d msec) (attempt: %d/%d)", + svc_ident(svc, NULL, 0), svc->oldpid, + sig_name(WTERMSIG(svc->status)), + WCOREDUMP(svc->status) ? ", core dumped" : "", + svc->restart_tmo, *restart_cnt, svc->restart_max); svc_unblock(svc); service_step(svc); From 309ad1bea713d2d61364b936ee41797f46f9f3f6 Mon Sep 17 00:00:00 2001 From: Joachim Wiberg Date: Mon, 11 May 2026 18:53:05 +0200 Subject: [PATCH 3/5] sig: spell SIGUNKNOWN correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fallback for unknown signal numbers in sig_name() returned the misspelled "SIGUNKOWN". Now that this string surfaces in user- facing logs ("killed by …"), fix the typo. Signed-off-by: Joachim Wiberg --- src/sig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sig.c b/src/sig.c index 204cefbc..6db745d3 100644 --- a/src/sig.c +++ b/src/sig.c @@ -630,7 +630,7 @@ const char *sig_name(int signo) return signames[i].name; } - return "SIGUNKOWN"; + return "SIGUNKNOWN"; } /* From 3febc7d513ab420d7add54beb93b32f1a72255f2 Mon Sep 17 00:00:00 2001 From: Joachim Wiberg Date: Mon, 11 May 2026 19:09:28 +0200 Subject: [PATCH 4/5] test: regression for stale pidfile cleanup after unclean exit Cover the scenario fixed in "service: clean stale pidfile after unclean daemon exit": a daemon with a pid:!/path config dies via SIGKILL, leaving its pidfile behind, and the next instance must still come up. Add a 'serv -x' flag (refuse to start when the pidfile already exists, dbus-style) so the test actually exercises the cleanup -- without it, plain 'serv' would happily overwrite the file and the test would pass with or without the fix. Signed-off-by: Joachim Wiberg --- test/Makefile.am | 2 ++ test/src/serv.c | 19 ++++++++++++++++--- test/stale-pidfile.sh | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 3 deletions(-) create mode 100755 test/stale-pidfile.sh diff --git a/test/Makefile.am b/test/Makefile.am index 6e5ddfce..da6eb905 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -41,6 +41,7 @@ EXTRA_DIST += global-envs.sh EXTRA_DIST += initctl-status-subset.sh EXTRA_DIST += notify.sh EXTRA_DIST += pidfile.sh +EXTRA_DIST += stale-pidfile.sh EXTRA_DIST += pre-post-serv.sh EXTRA_DIST += pre-fail.sh EXTRA_DIST += process-depends.sh @@ -84,6 +85,7 @@ TESTS += global-envs.sh TESTS += initctl-status-subset.sh TESTS += notify.sh TESTS += pidfile.sh +TESTS += stale-pidfile.sh TESTS += pre-post-serv.sh TESTS += pre-fail.sh TESTS += process-depends.sh diff --git a/test/src/serv.c b/test/src/serv.c index 8129efb1..16d35bb1 100644 --- a/test/src/serv.c +++ b/test/src/serv.c @@ -34,6 +34,7 @@ volatile sig_atomic_t reloading = 1; volatile sig_atomic_t running = 1; static char *ident = PROGNM; static char fn[80]; +static int exclusive = 0; /* -x: refuse to start if pidfile exists (dbus-style) */ static void verify_env(char *arg) { @@ -122,7 +123,7 @@ static void writefn(char *fn, int val) static int checkfn(char *fn) { - return !access(fn, R_OK); + return !access(fn, F_OK); } static void mine(char *fn) @@ -133,12 +134,18 @@ static void mine(char *fn) static void pidfile(char *pidfn) { + static int once = 0; + if (!pidfn) { if (fn[0] == 0) snprintf(fn, sizeof(fn), "%s%s.pid", _PATH_VARRUN, ident); pidfn = fn; } + if (exclusive && !once && checkfn(pidfn)) + errx(EX_SOFTWARE, "PID file %s exists, refusing to start", pidfn); + once = 1; + if (!checkfn(pidfn)) { pid_t pid; @@ -148,7 +155,7 @@ static void pidfile(char *pidfn) atexit(cleanup); } else { inf("Touching PID file %s", pidfn); - utimensat(0, fn, NULL, 0); + utimensat(AT_FDCWD, pidfn, NULL, 0); } } @@ -182,6 +189,7 @@ static int usage(int rc) " -p Create PID file despite running in foreground\n" " -P FILE Create PID file using FILE\n" " -r SVC Call initctl to restart service SVC (self)\n" + " -x Refuse to start if PID file already exists (dbus-style)\n" "\n" "By default this program daemonizes itself to the background, and,\n" "when it's done setting up its signal handler(s), creates a PID file\n" @@ -212,7 +220,7 @@ int main(int argc, char *argv[]) char cmd[80]; int c; - while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:")) != EOF) { + while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:x")) != EOF) { switch (c) { case 'c': do_crash = 1; @@ -249,12 +257,17 @@ int main(int argc, char *argv[]) break; case 'P': pidfn = optarg; + if ((size_t)snprintf(fn, sizeof(fn), "%s", optarg) >= sizeof(fn)) + errx(EX_USAGE, "-P path too long (max %zu)", sizeof(fn) - 1); do_pidfile++; break; case 'r': snprintf(cmd, sizeof(cmd), "initctl restart %s", optarg); do_restart = 1; break; + case 'x': + exclusive = 1; + break; default: return usage(1); } diff --git a/test/stale-pidfile.sh b/test/stale-pidfile.sh new file mode 100755 index 00000000..07b4be47 --- /dev/null +++ b/test/stale-pidfile.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# Regression: Finit must remove a daemon-owned (pid:!) stale pidfile +# left behind by an unclean exit (SIGKILL), so the next instance can +# start. Simulates the dbus-daemon pattern: 'serv -x' refuses to +# start if its pidfile already exists. + +set -eu + +TEST_DIR=$(dirname "$0") +PIDFN="/run/serv.pid" + +test_teardown() +{ + say "Running test teardown." + run "rm -f $FINIT_CONF $PIDFN" +} + +# shellcheck source=/dev/null +. "$TEST_DIR/lib/setup.sh" + +say "Add service stanza '$FINIT_CONF' with pid:!$PIDFN" +run "echo 'service pid:!$PIDFN serv -np -P $PIDFN -x' > $FINIT_CONF" +run "initctl reload" + +retry "assert_num_children 1 serv" +assert_is_pidfile "serv" "$PIDFN" + +PID=$(texec cat "$PIDFN") +say "SIGKILL serv ($PID) -- leaves stale pidfile" +run "kill -9 $PID" + +# Without the fix, 'serv -x' refuses to start on each retry until +# Finit hits restart_max and marks the service crashed. With the fix +# Finit removes the stale pidfile and the next instance comes up. +retry "assert_pidiff serv $PID" +retry "assert_num_children 1 serv" + +sep From 7d09e34b807fa4105908738a0ec9ab1afb8c433f Mon Sep 17 00:00:00 2001 From: Joachim Wiberg Date: Mon, 11 May 2026 19:45:15 +0200 Subject: [PATCH 5/5] doc: document stale pidfile cleanup and new restart log * src/pid.c: note the stale-pidfile-cleanup exception to the documented "Finit does not touch pid:! pidfiles" rule. * doc/config/services.md: add a user-facing paragraph on the same. * doc/ChangeLog.md: add Unreleased section covering this PR -- stale pidfile cleanup, restart log with signal name and core dump flag, and the SIGUNKOWN typo fix. Signed-off-by: Joachim Wiberg --- doc/ChangeLog.md | 25 +++++++++++++++++++++++++ doc/config/services.md | 8 ++++++++ src/pid.c | 5 ++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/ChangeLog.md b/doc/ChangeLog.md index ae0635bb..35c61552 100644 --- a/doc/ChangeLog.md +++ b/doc/ChangeLog.md @@ -3,6 +3,31 @@ Change Log All relevant changes are documented in this file. +[Unreleased] +------------ + +### Changes + +- Restart log now spells out the signal name and flags core dumps, + e.g. `killed by SIGKILL` or `killed by SIGSEGV, core dumped`, in + place of the bare numeric `by signal: N`. Gives operators a much + stronger breadcrumb when a daemon dies unexpectedly + +### Fixes + +- Remove stale daemon-owned (`pid:!`) pidfiles after unclean exits. + When a daemon dies via SIGKILL/OOM/segfault, or exits early during + startup, its pidfile lingers and can prevent the next instance from + starting -- `dbus-daemon` for example refuses to start when its + pidfile already exists, so Finit's restart loop gives up. Finit now + drops the file when it still names the just-reaped PID and that PID + is no longer alive (the liveness check guards against PID reuse). + This is a controlled exception to the long-standing rule that Finit + does not touch service-owned pidfiles +- Fix misspelled `SIGUNKOWN` returned by `sig_name()` for unknown + signal numbers, now spelled correctly as `SIGUNKNOWN`. Surfaced by + the new restart log above + [4.17][] - 2026-04-28 --------------------- diff --git a/doc/config/services.md b/doc/config/services.md index a315b372..a65b037c 100644 --- a/doc/config/services.md +++ b/doc/config/services.md @@ -31,6 +31,14 @@ basename of the binary to guess the PID file to watch for the PID: `/var/run/ntpd.pid`. If Finit guesses wrong, you have to submit the full `pid:!/path/to/file.pid`. +With `pid:!/path`, the file belongs to the service: Finit reads it +but does not create or remove it. The one exception is *stale* +cleanup — if the service dies without removing its own pidfile +(SIGKILL, OOM, segfault), and the file still names the just-reaped +PID, Finit removes it before the next retry. This prevents daemons +that refuse to start on an existing pidfile (e.g. `dbus-daemon`) +from getting stuck in a crash-restart loop. + **Example:** In the case of `ospfd` (below), we omit the `-d` flag (daemonize) to diff --git a/src/pid.c b/src/pid.c index 16f97b59..60994061 100644 --- a/src/pid.c +++ b/src/pid.c @@ -201,7 +201,10 @@ int pid_file_set(svc_t *svc, char *file, int not) * pid:!foo --> !/run/foo.pid * pid:!/run/foo.pid --> !/run/foo.pid * - * Note, nothing is created or removed by Finit in this latter form. + * Nothing is created or removed by Finit in this latter form, with one + * exception: a verifiably stale pidfile (still names the just-reaped + * PID, and that PID is no longer alive) is removed so the next instance + * can start. See service_clean_pidfile() in src/service.c. */ int pid_file_parse(svc_t *svc, char *arg) {