Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 23 additions & 55 deletions pi/skills/control-agent/startup-pi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
# Stale .alias symlinks pointing to removed sockets also get cleaned.
# Then starts the slack-bridge process with the current control-agent UUID.
#
# This script is the SOLE owner of the bridge lifecycle. start.sh only does
# pre-cleanup (kill stale processes, release port) — it never launches the bridge.
# Process lifecycle is managed via process groups (see runtime/start.sh).
# When start.sh kills the old control-agent PGID, spawned services that are
# still in that group (bridge, workers, etc.) are terminated with it. This
# script therefore only needs to launch new services, plus kill old tmux
# sessions: tmux runs them in their own process group, so they survive the
# process group cleanup.

set -euo pipefail

# Prevent varlock SEA binary from misinterpreting argv when called from a
# session that was itself launched via varlock (PKG_EXECPATH leaks into child
# processes and causes `varlock run` to treat subcommands as Node module paths).
# Prevent varlock SEA binary from misinterpreting argv
unset PKG_EXECPATH 2>/dev/null || true

RUNTIME_NODE_HELPER="$HOME/runtime/bin/lib/runtime-node.sh"
Expand Down Expand Up @@ -71,7 +71,7 @@ echo "Cleaned $cleaned stale socket(s)."

# Restart Slack bridge with current control-agent UUID
echo ""
echo "=== Slack Bridge Restart ==="
echo "=== Slack Bridge Startup ==="

# Find control-agent UUID from alias
CONTROL_ALIAS="$SOCKET_DIR/control-agent.alias"
Expand All @@ -86,56 +86,10 @@ fi
BRIDGE_LOG_DIR="$HOME/.pi/agent/logs"
BRIDGE_LOG_FILE="$BRIDGE_LOG_DIR/slack-bridge.log"
BRIDGE_DIR="/opt/baudbot/current/slack-bridge"
BRIDGE_TMUX_SESSION="slack-bridge"
BRIDGE_TMUX_SESSION="baudbot-slack-bridge"

mkdir -p "$BRIDGE_LOG_DIR"

# --- Kill anything holding port 7890, any existing bridge tmux session,
# and any leftover old-style PID-file supervisor.
echo "Cleaning up old bridge..."

# Kill the tmux session first — this stops the restart loop from respawning
# the bridge while we're trying to clean up the port.
tmux kill-session -t "$BRIDGE_TMUX_SESSION" 2>/dev/null || true

# Kill ALL bridge processes (broker-bridge.mjs and bridge.mjs) to prevent
# orphaned processes from holding port 7890 after control-agent restarts.
# This is more aggressive than just killing port holders, but prevents the
# common failure mode where a bridge process survives tmux session cleanup
# (e.g., detached, zombied, or in a different session tree).
BRIDGE_PIDS=$(pgrep -f 'node (broker-)?bridge\.mjs' 2>/dev/null || true)
if [ -n "$BRIDGE_PIDS" ]; then
echo "Killing all bridge processes (SIGTERM): $BRIDGE_PIDS"
echo "$BRIDGE_PIDS" | xargs kill 2>/dev/null || true
# Wait up to 3s for graceful shutdown
for i in 1 2 3; do
sleep 1
BRIDGE_PIDS=$(pgrep -f 'node (broker-)?bridge\.mjs' 2>/dev/null || true)
[ -z "$BRIDGE_PIDS" ] && break
done
# Force-kill anything that didn't exit
if [ -n "$BRIDGE_PIDS" ]; then
echo "Force-killing stubborn bridge processes: $BRIDGE_PIDS"
echo "$BRIDGE_PIDS" | xargs kill -9 2>/dev/null || true
sleep 1
fi
fi

# Final safety check: kill anything still on port 7890
PORT_PIDS=$(lsof -ti :7890 2>/dev/null || true)
if [ -n "$PORT_PIDS" ]; then
echo "Force-killing remaining processes on port 7890: $PORT_PIDS"
echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
sleep 1
fi

OLD_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
if [ -f "$OLD_PID_FILE" ]; then
OLD_PID="$(cat "$OLD_PID_FILE" 2>/dev/null || true)"
[ -n "$OLD_PID" ] && kill "$OLD_PID" 2>/dev/null || true
rm -f "$OLD_PID_FILE"
fi

# --- Detect bridge mode ---
BRIDGE_SCRIPT=""
if [ -f "$BRIDGE_DIR/broker-bridge.mjs" ] && varlock run --path "$HOME/.config/" -- sh -c '
Expand All @@ -156,7 +110,7 @@ fi
if [ -z "$BRIDGE_SCRIPT" ]; then
echo "No Slack transport configured (missing broker keys and socket tokens); skipping bridge startup."
echo ""
echo "=== Cleanup Complete ==="
echo "=== Startup Complete ==="
exit 0
fi

Expand All @@ -167,8 +121,22 @@ fi
# - Tracks consecutive fast failures (<60s runtime) and gives up after 10
# - Backs off: 5s base + 2s per failure, capped at 60s
# - Kills port holders before retrying (avoids EADDRINUSE spin)
#
# Note: tmux creates its own session (PGID), so it's not killed by process group
# termination. We need to explicitly kill old sessions before creating new ones.
MAX_CONSECUTIVE_FAILURES=10

# Kill all agent tmux sessions (prefix: baudbot-*)
# Tmux sessions create their own PGID, so they survive process group cleanup.
# Using a naming convention allows us to kill all agent sessions without
# tracking individual session names.
#
# `grep` exits non-zero when nothing matches (and tmux fails when no server is
# running); `|| true` keeps that from aborting the script under `set -e` /
# `pipefail` and simply leaves AGENT_SESSIONS empty.
AGENT_SESSIONS=$(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep '^baudbot-' || true)
if [ -n "$AGENT_SESSIONS" ]; then
  echo "Killing agent tmux sessions: $AGENT_SESSIONS"
  # Read one session name per line instead of piping through xargs: xargs
  # interprets quotes and backslashes in its input, so an unusual session name
  # would abort the whole batch, and the suppressed error would hide it.
  while IFS= read -r agent_session; do
    [ -n "$agent_session" ] || continue
    # Best effort: the session may already be gone. stdin is detached so the
    # command cannot consume the remaining session names.
    tmux kill-session -t "$agent_session" </dev/null 2>/dev/null || true
  done <<<"$AGENT_SESSIONS"
  # Give the killed sessions a moment to release their resources.
  sleep 1
fi

echo "Starting slack-bridge ($BRIDGE_SCRIPT) via tmux..."
NODE_BIN_DIR="${NODE_BIN_DIR:-$HOME/opt/node/bin}"
if command -v bb_resolve_runtime_node_bin_dir >/dev/null 2>&1; then
Expand Down Expand Up @@ -228,4 +196,4 @@ else
fi

echo ""
echo "=== Cleanup Complete ==="
echo "=== Startup Complete ==="
74 changes: 37 additions & 37 deletions start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# shellcheck source=bin/lib/runtime-node.sh
source "$SCRIPT_DIR/bin/lib/runtime-node.sh"
# bridge-restart-policy.sh no longer needed — bridge is started by
# startup-pi.sh, not start.sh (see PR #164)
cd ~

NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
Expand All @@ -24,7 +22,6 @@ NODE_BIN_DIR="$(bb_resolve_runtime_node_bin_dir "$HOME")"
export PATH="$HOME/.varlock/bin:$NODE_BIN_DIR:$PATH"

# Work around varlock telemetry config crash by opting out at runtime.
# This avoids loading anonymousId from user config and keeps startup deterministic.
export VARLOCK_TELEMETRY_DISABLED=1

# Validate and load secrets via varlock
Expand All @@ -33,7 +30,7 @@ varlock load --path ~/.config/ || {
exit 1
}
set -a
# shellcheck disable=SC1090 # path is dynamic (agent home)
# shellcheck disable=SC1090
source ~/.config/.env
set +a

Expand All @@ -48,7 +45,6 @@ umask 077
~/runtime/bin/redact-logs.sh 2>/dev/null || true

# Verify deployed runtime integrity against deploy manifest.
# Modes: off | warn | strict (default: warn)
INTEGRITY_MODE="${BAUDBOT_STARTUP_INTEGRITY_MODE:-warn}"
if [ -x "$HOME/runtime/bin/verify-manifest.sh" ]; then
if ! BAUDBOT_STARTUP_INTEGRITY_MODE="$INTEGRITY_MODE" "$HOME/runtime/bin/verify-manifest.sh"; then
Expand All @@ -66,15 +62,13 @@ if [ -d "$SOCKET_DIR" ]; then
if command -v fuser &>/dev/null; then
for sock in "$SOCKET_DIR"/*.sock; do
[ -e "$sock" ] || continue
# If no process has the socket open, it's stale
if ! fuser "$sock" &>/dev/null 2>&1; then
rm -f "$sock"
fi
done
else
echo " fuser not found, skipping socket cleanup (install psmisc)"
fi
# Clean broken alias symlinks
for alias in "$SOCKET_DIR"/*.alias; do
[ -L "$alias" ] || continue
target=$(readlink "$alias")
Expand All @@ -84,35 +78,33 @@ if [ -d "$SOCKET_DIR" ]; then
done
fi

# ── Slack bridge cleanup (bridge is started by startup-pi.sh) ──
# The bridge needs the control-agent's session UUID (PI_SESSION_ID) to deliver
# messages to the correct socket. That UUID isn't known until pi starts and
# registers its socket. So we DON'T start the bridge here — the control-agent's
# startup-pi.sh handles it after the session is live.
#
# We DO kill any stale bridge processes from previous runs to avoid port
# conflicts when startup-pi.sh launches a fresh one.
BRIDGE_PID_FILE="$HOME/.pi/agent/slack-bridge.pid"
if [ -f "$BRIDGE_PID_FILE" ]; then
old_pid="$(cat "$BRIDGE_PID_FILE" 2>/dev/null || true)"
if [ -n "$old_pid" ] && kill -0 "$old_pid" 2>/dev/null; then
echo "Stopping stale bridge supervisor (PID $old_pid)..."
kill "$old_pid" 2>/dev/null || true
sleep 1
kill -9 "$old_pid" 2>/dev/null || true
# ── Process Group Management ──
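# Kill old control-agent process group to ensure clean slate.
# This terminates every spawned service still in that group (bridge, workers,
# etc.) without needing to track individual PIDs or process names. Services
# running inside tmux sessions have their own process group and are cleaned up
# separately by startup-pi.sh.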
CONTROL_PGID_FILE="$HOME/.pi/agent/control-agent.pgid"

if [ -f "$CONTROL_PGID_FILE" ]; then
# Read the PGID recorded by the previous run; an unreadable file yields "".
OLD_PGID=$(cat "$CONTROL_PGID_FILE" 2>/dev/null || echo "")

# Succeeds only for a plain decimal number greater than 1.
# The value is used as `kill -<sig> -<pgid>`; a corrupted file containing 0 or
# 1 would turn that into "signal our own group" or "signal every process we
# own", and non-numeric text is never a valid PGID.
bb_is_safe_pgid() {
  case "${1:-}" in
    '' | *[!0-9]*) return 1 ;;
  esac
  # Out-of-range numbers make `[` error out; treat that as invalid too.
  [ "$1" -gt 1 ] 2>/dev/null
}

if [ -n "$OLD_PGID" ]; then
  if ! bb_is_safe_pgid "$OLD_PGID"; then
    echo "Ignoring invalid PGID '$OLD_PGID' in $CONTROL_PGID_FILE" >&2
  elif kill -0 -"$OLD_PGID" 2>/dev/null; then
    # `kill -0` only probes: the group still has at least one live process.
    echo "Terminating old control-agent process group (PGID $OLD_PGID)..."
    kill -TERM -"$OLD_PGID" 2>/dev/null || true
    # Wait up to 5s for graceful shutdown
    for _i in 1 2 3 4 5; do
      if ! kill -0 -"$OLD_PGID" 2>/dev/null; then
        echo " Process group terminated cleanly"
        break
      fi
      sleep 1
    done
    # Force-kill any survivors
    if kill -0 -"$OLD_PGID" 2>/dev/null; then
      echo " Force-killing stubborn processes in group $OLD_PGID..."
      kill -KILL -"$OLD_PGID" 2>/dev/null || true
      sleep 1
    fi
  fi
fi
rm -f "$BRIDGE_PID_FILE"
fi
# Kill the tmux session too (startup-pi.sh uses this)
tmux kill-session -t slack-bridge 2>/dev/null || true
# Force-release port 7890 in case anything survived
PORT_PIDS="$(lsof -ti :7890 2>/dev/null || true)"
if [ -n "$PORT_PIDS" ]; then
echo "Releasing port 7890 (PIDs: $PORT_PIDS)..."
echo "$PORT_PIDS" | xargs kill 2>/dev/null || true
sleep 1
PORT_PIDS="$(lsof -ti :7890 2>/dev/null || true)"
[ -n "$PORT_PIDS" ] && echo "$PORT_PIDS" | xargs kill -9 2>/dev/null || true
rm -f "$CONTROL_PGID_FILE"
fi

# Set session name (read by auto-name.ts extension)
Expand All @@ -134,6 +126,14 @@ else
exit 1
fi

# Start control-agent
# Start control-agent.
# Save our PID as the process group ID for cleanup on next restart.
# When systemd launches start.sh (Type=simple), our PID is already the
# process group leader. `exec pi` replaces this process in-place (same PID,
# same PGID), so child processes (bridge, workers) inherit the group unless
# they start their own, as tmux sessions do.
# On restart, killing -$PGID terminates everything still in that group.
#
# --session-control: enables inter-session communication (handled by control.ts extension)
pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"
echo "Starting control-agent..."
echo $$ > "$CONTROL_PGID_FILE"
exec pi --session-control --model "$MODEL" --skill ~/.pi/agent/skills/control-agent "/skill:control-agent"