diff --git a/pi/skills/control-agent/startup-pi.sh b/pi/skills/control-agent/startup-pi.sh
index e65bb9b..35eaa01 100755
--- a/pi/skills/control-agent/startup-pi.sh
+++ b/pi/skills/control-agent/startup-pi.sh
@@ -12,14 +12,14 @@
 # Stale .alias symlinks pointing to removed sockets also get cleaned.
 # Then starts the slack-bridge process with the current control-agent UUID.
 #
-# This script is the SOLE owner of the bridge lifecycle. start.sh only does
-# pre-cleanup (kill stale processes, release port) — it never launches the bridge.
+# Process lifecycle is managed via process groups (see runtime/start.sh):
+# start.sh signals the old control-agent PGID before launching a new agent.
+# tmux sessions live outside that group, so this script still removes stale
+# agent tmux sessions itself before it launches new services.
 
 set -euo pipefail
 
-# Prevent varlock SEA binary from misinterpreting argv when called from a
-# session that was itself launched via varlock (PKG_EXECPATH leaks into child
-# processes and causes `varlock run` to treat subcommands as Node module paths).
+# Prevent varlock SEA binary from misinterpreting argv
 unset PKG_EXECPATH 2>/dev/null || true
 
 RUNTIME_NODE_HELPER="$HOME/runtime/bin/lib/runtime-node.sh"
@@ -71,7 +71,7 @@ echo "Cleaned $cleaned stale socket(s)."
 
 # Restart Slack bridge with current control-agent UUID
 echo ""
-echo "=== Slack Bridge Restart ==="
+echo "=== Slack Bridge Startup ==="
 
 # Find control-agent UUID from alias
 CONTROL_ALIAS="$SOCKET_DIR/control-agent.alias"
@@ -86,56 +86,10 @@ fi
 BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
 BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
 BRIDGE_DIR="/opt/baudbot/current/slack-bridge"
-BRIDGE_TMUX_SESSION="slack-bridge"
+BRIDGE_TMUX_SESSION="baudbot-slack-bridge"
 
 mkdir -p "$BRIDGE_LOG_DIR"
 
-# --- Kill anything holding port 7890, any existing bridge tmux session,
-# and any leftover old-style PID-file supervisor.
-echo "Cleaning up old bridge..."
-
-# Kill the tmux session first — this stops the restart loop from respawning
-# the bridge while we're trying to clean up the port.
-tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true
-
-# Kill ALL bridge processes (broker-bridge.mjs and bridge.mjs) to prevent
-# orphaned processes from holding port 7890 after control-agent restarts.
-# This is more aggressive than just killing port holders, but prevents the
-# common failure mode where a bridge process survives tmux session cleanup
-# (e.g., detached, zombied, or in a different session tree).
-BRIDGE_PIDS=$(pgrep -f 'node (broker-)?bridge\.mjs' 2>/dev/null || true)
-if [ -n "$BRIDGE_PIDS" ]; then
-  echo "Killing all bridge processes (SIGTERM): $BRIDGE_PIDS"
-  echo "$BRIDGE_PIDS" | xargs kill 2>/dev/null || true
-  # Wait up to 3s for graceful shutdown
-  for i in 1 2 3; do
-    sleep 1
-    BRIDGE_PIDS=$(pgrep -f 'node (broker-)?bridge\.mjs' 2>/dev/null || true)
-    [ -z "$BRIDGE_PIDS" ] && break
-  done
-  # Force-kill anything that didn't exit
-  if [ -n "$BRIDGE_PIDS" ]; then
-    echo "Force-killing stubborn bridge processes: $BRIDGE_PIDS"
-    echo "$BRIDGE_PIDS" | xargs kill -9 2>/dev/null || true
-    sleep 1
-  fi
-fi
-
-# Final safety check: kill anything still on port 7890
-PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
-if [ -n "$PORT_PIDS" ]; then
-  echo "Force-killing remaining processes on port 7890: $PORT_PIDS"
-  echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
-  sleep 1
-fi
-
-OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
-if [ -f "$OLD_PID_FILE" ]; then
-  OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)"
-  [ -n "$OLD_PID" ] && kill "$OLD_PID" 2>/dev/null || true
-  rm -f "$OLD_PID_FILE"
-fi
-
 # --- Detect bridge mode ---
 BRIDGE_SCRIPT=""
 if [ -f "$BRIDGE_DIR/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c '
@@ -156,7 +110,7 @@ fi
 if [ -z "$BRIDGE_SCRIPT" ]; then
   echo "No Slack transport configured (missing broker keys and socket tokens); skipping bridge startup."
   echo ""
-  echo "=== Cleanup Complete ==="
+  echo "=== Startup Complete ==="
   exit 0
 fi
 
@@ -167,8 +121,23 @@ fi
 # - Tracks consecutive fast failures (<60s runtime) and gives up after 10
 # - Backs off: 5s base + 2s per failure, capped at 60s
 # - Kills port holders before retrying (avoids EADDRINUSE spin)
+#
+# Note: tmux creates its own session (PGID), so it's not killed by process group
+# termination. We need to explicitly kill old sessions before creating new ones.
 MAX_CONSECUTIVE_FAILURES=10
 
+# Kill all agent tmux sessions (prefix: baudbot-*), plus the "slack-bridge"
+# session name this script used before the baudbot- prefix was introduced.
+# Tmux sessions create their own PGID, so they survive process group cleanup.
+# Using a naming convention allows us to kill all agent sessions without
+# tracking individual session names.
+AGENT_SESSIONS=$(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E '^(baudbot-|slack-bridge$)' || true)
+if [ -n "$AGENT_SESSIONS" ]; then
+  echo "Killing agent tmux sessions: $AGENT_SESSIONS"
+  echo "$AGENT_SESSIONS" | xargs -r -I{} tmux kill-session -t {} 2>/dev/null || true
+  sleep 1
+fi
+
 echo "Starting slack-bridge ($BRIDGE_SCRIPT) via tmux..."
 NODE_BIN_DIR="${NODE_BIN_DIR:-$HOME/opt/node/bin}"
 if command -v bb_resolve_runtime_node_bin_dir >/dev/null 2>&1; then
@@ -228,4 +197,4 @@ else
 fi
 
 echo ""
-echo "=== Cleanup Complete ==="
+echo "=== Startup Complete ==="
diff --git a/start.sh b/start.sh
index ea18b04..893a091 100755
--- a/start.sh
+++ b/start.sh
@@ -14,8 +14,6 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 # shellcheck source=bin/lib/runtime-node.sh
 source "$SCRIPT_DIR/bin/lib/runtime-node.sh"
-# bridge-restart-policy.sh no longer needed — bridge is started by
-# startup-pi.sh, not start.sh (see PR #164)
 cd ~
 
 NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
@@ -24,7 +22,6 @@ NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
 export PATH="$HOME/.varlock/bin:$NODE_BIN_DIR:$PATH"
 
 # Work around varlock telemetry config crash by opting out at runtime.
-# This avoids loading anonymousId from user config and keeps startup deterministic.
 export VARLOCK_TELEMETRY_DISABLED=1
 
 # Validate and load secrets via varlock
@@ -33,7 +30,7 @@ varlock load --path ~/.config/ || {
   exit 1
 }
 set -a
-# shellcheck disable=SC1090 # path is dynamic (agent home)
+# shellcheck disable=SC1090
 source ~/.config/.env
 set +a
 
@@ -48,7 +45,6 @@ umask 077
 ~/runtime/bin/redact-logs.sh 2>/dev/null || true
 
 # Verify deployed runtime integrity against deploy manifest.
-# Modes: off | warn | strict (default: warn)
 INTEGRITY_MODE="${BAUDBOT_STARTUP_INTEGRITY_MODE:-warn}"
 if [ -x "$HOME/runtime/bin/verify-manifest.sh" ]; then
   if ! BAUDBOT_STARTUP_INTEGRITY_MODE="$INTEGRITY_MODE" "$HOME/runtime/bin/verify-manifest.sh"; then
@@ -66,7 +62,6 @@ if [ -d "$SOCKET_DIR" ]; then
   if command -v fuser &>/dev/null; then
     for sock in "$SOCKET_DIR"/*.sock; do
       [ -e "$sock" ] || continue
-      # If no process has the socket open, it's stale
       if ! fuser "$sock" &>/dev/null 2>&1; then
         rm -f "$sock"
       fi
@@ -74,7 +69,6 @@ if [ -d "$SOCKET_DIR" ]; then
   else
     echo " fuser not found, skipping socket cleanup (install psmisc)"
   fi
-  # Clean broken alias symlinks
   for alias in "$SOCKET_DIR"/*.alias; do
     [ -L "$alias" ] || continue
     target=$(readlink "$alias")
@@ -84,35 +78,47 @@ if [ -d "$SOCKET_DIR" ]; then
   done
 fi
 
-# ── Slack bridge cleanup (bridge is started by startup-pi.sh) ──
-# The bridge needs the control-agent's session UUID (PI_SESSION_ID) to deliver
-# messages to the correct socket. That UUID isn't known until pi starts and
-# registers its socket. So we DON'T start the bridge here — the control-agent's
-# startup-pi.sh handles it after the session is live.
-#
-# We DO kill any stale bridge processes from previous runs to avoid port
-# conflicts when startup-pi.sh launches a fresh one.
-BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
-if [ -f "$BRIDGE_PID_FILE" ]; then
-  old_pid="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
-  if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
-    echo "Stopping stale bridge supervisor (PID $old_pid)..."
-    kill "$old_pid" 2>/dev/null || true
-    sleep 1
-    kill -9 "$old_pid" 2>/dev/null || true
+# ── Process Group Management ──
+# Kill old control-agent process group to ensure clean slate.
+# This terminates every process still in that group without needing to
+# track individual PIDs or process names. tmux sessions run in their own
+# group, so startup-pi.sh removes those separately.
+CONTROL_PGID_FILE="$HOME/.pi/agent/control-agent.pgid"
+
+if [ -f "$CONTROL_PGID_FILE" ]; then
+  OLD_PGID=$(cat "$CONTROL_PGID_FILE" 2>/dev/null || echo "")
+  OWN_PGID=$(ps -o pgid= -p "$$" 2>/dev/null | tr -d '[:space:]' || echo "")
+  case "$OLD_PGID" in
+    '' | *[!0-9]* | 0* | 1)
+      # Empty/garbage content is unusable; 0 and 1 (also with leading zeros)
+      # would make the negated kill target mean "our own group" / "every process".
+      OLD_PGID=""
+      ;;
+    "$OWN_PGID")
+      # Signalling the group we run in would kill this script mid-restart.
+      echo "Recorded PGID $OLD_PGID is our own process group; skipping kill"
+      OLD_PGID=""
+      ;;
+  esac
+  if [ -n "$OLD_PGID" ] && kill -0 -"$OLD_PGID" 2>/dev/null; then
+    echo "Terminating old control-agent process group (PGID $OLD_PGID)..."
+    kill -TERM -"$OLD_PGID" 2>/dev/null || true
+    # Wait up to 5s for graceful shutdown
+    for _i in 1 2 3 4 5; do
+      if ! kill -0 -"$OLD_PGID" 2>/dev/null; then
+        echo " Process group terminated cleanly"
+        break
+      fi
+      sleep 1
+    done
+    # Force-kill any survivors
+    if kill -0 -"$OLD_PGID" 2>/dev/null; then
+      echo " Force-killing stubborn processes in group $OLD_PGID..."
+      kill -KILL -"$OLD_PGID" 2>/dev/null || true
+      sleep 1
+    fi
   fi
-  rm -f "$BRIDGE_PID_FILE"
-fi
-# Kill the tmux session too (startup-pi.sh uses this)
-tmux kill-session -t slack-bridge 2>/dev/null || true
-# Force-release port 7890 in case anything survived
-PORT_PIDS="$(lsof -ti :7890 2>/dev/null || true)"
-if [ -n "$PORT_PIDS" ]; then
-  echo "Releasing port 7890 (PIDs: $PORT_PIDS)..."
-  echo "$PORT_PIDS" | xargs kill 2>/dev/null || true
-  sleep 1
-  PORT_PIDS="$(lsof -ti :7890 2>/dev/null || true)"
-  [ -n "$PORT_PIDS" ] && echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
+  rm -f "$CONTROL_PGID_FILE"
 fi
 
 # Set session name (read by auto-name.ts extension)
@@ -134,6 +140,21 @@ else
   exit 1
 fi
 
-# Start control-agent
+# Start control-agent.
+# Record our process group ID for cleanup on next restart.
+# When systemd launches start.sh (Type=simple), our PID is already the
+# process group leader. `exec pi` replaces this process in-place (same PID,
+# same PGID), so child processes that stay in the group can be signalled
+# together via kill -$PGID on the next restart.
+# If we are NOT the group leader (e.g. launched from another script), $$ is
+# not a valid PGID, so nothing is recorded rather than storing a wrong target.
+#
 # --session-control: enables inter-session communication (handled by control.ts extension)
-pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"
+echo "Starting control-agent..."
+CURRENT_PGID=$(ps -o pgid= -p "$$" 2>/dev/null | tr -d '[:space:]' || echo "")
+if [ "$CURRENT_PGID" = "$$" ]; then
+  echo "$$" > "$CONTROL_PGID_FILE"
+else
+  echo "Warning: not a process group leader (PID $$, PGID ${CURRENT_PGID:-unknown}); PGID not recorded" >&2
+fi
+exec pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"