From 93dec545e694bbb45093c10b4a4ada0d69692c33 Mon Sep 17 00:00:00 2001
From: David O'Keeffe <david.okeeffe@databricks.com>
Date: Wed, 22 Apr 2026 00:41:10 +1000
Subject: [PATCH] feat: MLflow tracing with async Stop hook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enables opt-in MLflow tracing for Claude Code sessions. Key design:

- setup_mlflow.py registers a Stop hook when MLFLOW_CLAUDE_TRACING_ENABLED=true
- Hook delegates to mlflow-trace-stop.sh which backgrounds the handler via
  `nohup timeout 30 ... & disown`, returning in <1s so the Stop chain
  (brain-push, /til, etc.) is not blocked
- Handler receives hook-event JSON via a temp file captured synchronously
  before backgrounding (naive nohup would redirect stdin to /dev/null)
- Hard 30s ceiling on the backgrounded flush to prevent stuck handlers
  leaking memory/CPU across sessions
- Pins mlflow-skinny and mlflow-tracing to 3.11.1 to match the Apps
  runtime image (version mismatch caused silent import failures)

Tracing is disabled by default — set MLFLOW_CLAUDE_TRACING_ENABLED=true
in app.yaml to opt in.

Tests: TestStopHook and TestSettingsMerge updated to match shell-script
       delegation model; TestAppOwnerExport mocks app_state.set_app_owner
       to avoid ~/.coda writes in unit test context.

Co-authored-by: Isaac
---
 .../hooks/mlflow-trace-stop.sh                | 31 +++++++++
 setup_mlflow.py                               | 64 +++++++++++-------
 tests/test_mlflow_tracing.py                  | 65 ++++++++++++++++---
 3 files changed, 128 insertions(+), 32 deletions(-)
 create mode 100755 coda-marketplace/plugins/coda-essentials/hooks/mlflow-trace-stop.sh

diff --git a/coda-marketplace/plugins/coda-essentials/hooks/mlflow-trace-stop.sh b/coda-marketplace/plugins/coda-essentials/hooks/mlflow-trace-stop.sh
new file mode 100755
index 0000000..5917b81
--- /dev/null
+++ b/coda-marketplace/plugins/coda-essentials/hooks/mlflow-trace-stop.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Stop hook: flush the Claude Code session transcript to an MLflow trace.
+#
+# Claude Code pipes the hook-event JSON to our stdin. We capture that
+# synchronously (fast, bounded to one read) then background the actual
+# flush with stdin redirected from the captured file. This way:
+#
+#   - the wrapper returns in <1s, unblocking the Stop chain
+#     (crystallize-nudge, brain-push, /til)
+#   - a hard `timeout 30` caps the backgrounded handler so a stall in
+#     transcript processing can't hold memory/CPU indefinitely
+#   - stop_hook_handler() actually receives its hook-event JSON, which
+#     naive `nohup ... & disown` would have redirected to /dev/null
+
+set -euo pipefail
+
+APP_DIR="/app/python/source_code"
+LOG="$HOME/.mlflow-hook.log"
+STDIN_FILE="$(mktemp -t mlflow-hook.XXXXXX)"
+
+# Synchronous: read Claude Code's hook-event JSON from stdin.
+cat > "$STDIN_FILE"
+
+# Async: run the handler in the background with the captured stdin.
+# The subshell cleans up the temp file after timeout/handler exits.
+nohup bash -c "
+  timeout 30 uv run --project '$APP_DIR' python -c \
+    'from mlflow.claude_code.hooks import stop_hook_handler; stop_hook_handler()' \
+    < '$STDIN_FILE'
+  rm -f '$STDIN_FILE'
+" >> "$LOG" 2>&1 & disown
diff --git a/setup_mlflow.py b/setup_mlflow.py
index 9d305d3..7be8fd6 100644
--- a/setup_mlflow.py
+++ b/setup_mlflow.py
@@ -1,8 +1,8 @@
 """Configure MLflow tracing for Claude Code sessions.
 
-Merges MLflow env vars and a Stop hook into ~/.claude/settings.json so that
-every Claude Code session automatically logs traces to a Databricks MLflow
-experiment at /Users/{app_owner}/{app_name}.
+Merges MLflow env vars into ~/.claude/settings.json. Tracing is disabled
+by default — the Stop hook stalls on transcript processing. Set
+MLFLOW_CLAUDE_TRACING_ENABLED=true to opt in once the hook is reliable.
 """
 
 import os
@@ -31,34 +31,54 @@
 
 experiment_name = f"/Users/{app_owner}/{app_name}"
 
-# Merge MLflow env vars
+# Tracing disabled by default — stop hook stalls on transcript processing
+tracing_enabled = os.environ.get("MLFLOW_CLAUDE_TRACING_ENABLED", "false").lower() == "true"
+
+# Merge MLflow env vars (always set so tracing can be toggled at runtime)
 settings.setdefault("env", {})
-settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] = "false"
+settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] = str(tracing_enabled).lower()
 settings["env"]["MLFLOW_TRACKING_URI"] = "databricks"
 settings["env"]["MLFLOW_EXPERIMENT_NAME"] = experiment_name
 # Override container-level OTEL endpoint so MLflow uses its native MlflowV3SpanExporter
 # instead of sending traces to a non-existent localhost:4314 OTLP collector
 settings["env"]["OTEL_EXPORTER_OTLP_ENDPOINT"] = ""
 
-# Add Stop hook (processes full transcript at session end)
-# Use `uv run python` so mlflow resolves correctly regardless of venv paths
-python_cmd = "uv run python"
-mlflow_hook = {
-    "hooks": [
-        {
-            "type": "command",
-            "command": f"{python_cmd} -c \"from mlflow.claude_code.hooks import stop_hook_handler; stop_hook_handler()\""
-        }
-    ]
-}
+# Only register the Stop hook when explicitly enabled
+if tracing_enabled:
+    app_dir = os.path.dirname(os.path.abspath(__file__))
+    # Delegate to a proper hook script that backgrounds the handler via
+    # `nohup timeout 30 ... & disown`. This:
+    #   1. unblocks the Stop chain immediately (brain-push, /til, etc.)
+    #   2. caps the backgrounded flush at 30s so a stuck handler can't
+    #      eat memory/CPU forever — one dropped trace beats a leaked
+    #      transcript processor
+    hook_script = os.path.join(
+        app_dir,
+        "coda-marketplace", "plugins", "coda-essentials", "hooks",
+        "mlflow-trace-stop.sh",
+    )
+    os.chmod(hook_script, 0o755)
+    mlflow_hook = {
+        "hooks": [
+            {
+                "type": "command",
+                "command": f"bash {hook_script}",
+                # The wrapper script backgrounds the work and returns in <1s,
+                # so this outer timeout is belt-and-braces only.
+                "timeout": 5,
+            }
+        ]
+    }
 
-existing_hooks = settings.get("hooks", {})
-stop_hooks = existing_hooks.get("Stop", [])
-stop_hooks.append(mlflow_hook)
-existing_hooks["Stop"] = stop_hooks
-settings["hooks"] = existing_hooks
+    existing_hooks = settings.get("hooks", {})
+    stop_hooks = existing_hooks.get("Stop", [])
+    stop_hooks.append(mlflow_hook)
+    existing_hooks["Stop"] = stop_hooks
+    settings["hooks"] = existing_hooks
+    print(f"MLflow tracing enabled: experiment={experiment_name}")
+else:
+    print("MLflow tracing disabled (set MLFLOW_CLAUDE_TRACING_ENABLED=true to enable)")
 
 settings_path.write_text(json.dumps(settings, indent=2))
-print(f"MLflow tracing enabled: experiment={experiment_name}")
 print(f"  Tracking URI: databricks")
 print(f"  Settings updated: {settings_path}")
diff --git a/tests/test_mlflow_tracing.py b/tests/test_mlflow_tracing.py
index 02a6eb1..fbd9442 100644
--- a/tests/test_mlflow_tracing.py
+++ b/tests/test_mlflow_tracing.py
@@ -60,13 +60,23 @@ def read_settings(tmp_path):
 class TestMlflowEnvVars:
     """Verify MLflow environment variables are added to settings.json."""
 
-    def test_tracing_enabled(self, tmp_path):
+    def test_tracing_disabled_by_default(self, tmp_path):
         write_existing_settings(tmp_path, {"env": {"ANTHROPIC_MODEL": "test"}})
         result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
         assert result.returncode == 0
         settings = read_settings(tmp_path)
         assert settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] == "false"
 
+    def test_tracing_enabled_when_env_true(self, tmp_path):
+        write_existing_settings(tmp_path, {"env": {"ANTHROPIC_MODEL": "test"}})
+        result = run_setup_mlflow(tmp_path, {
+            "APP_OWNER": "jane@company.com",
+            "MLFLOW_CLAUDE_TRACING_ENABLED": "true",
+        })
+        assert result.returncode == 0
+        settings = read_settings(tmp_path)
+        assert settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] == "true"
+
     def test_tracking_uri(self, tmp_path):
         write_existing_settings(tmp_path, {"env": {}})
         result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
@@ -97,26 +107,54 @@ def test_experiment_name_default_app_name(self, tmp_path):
 # ---------------------------------------------------------------------------
 
 class TestStopHook:
-    """Verify the MLflow Stop hook is added to settings.json."""
+    """Verify the MLflow Stop hook is only added when tracing is enabled."""
 
-    def test_stop_hook_present(self, tmp_path):
+    def test_stop_hook_absent_by_default(self, tmp_path):
         write_existing_settings(tmp_path, {"env": {}})
         result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
         assert result.returncode == 0
         settings = read_settings(tmp_path)
+        stop_hooks = settings.get("hooks", {}).get("Stop", [])
+        assert len(stop_hooks) == 0
+
+    def test_stop_hook_present_when_tracing_enabled(self, tmp_path):
+        write_existing_settings(tmp_path, {"env": {}})
+        result = run_setup_mlflow(tmp_path, {
+            "APP_OWNER": "jane@company.com",
+            "MLFLOW_CLAUDE_TRACING_ENABLED": "true",
+        })
+        assert result.returncode == 0
+        settings = read_settings(tmp_path)
         assert "hooks" in settings
         assert "Stop" in settings["hooks"]
         assert len(settings["hooks"]["Stop"]) == 1
 
-    def test_stop_hook_command(self, tmp_path):
+    def test_stop_hook_delegates_to_shell_script(self, tmp_path):
+        """Hook must delegate to the mlflow-trace-stop.sh wrapper script.
+
+        The wrapper backgrounds the handler via nohup/disown so the Stop chain
+        returns in <1s. The shell script itself uses --project and invokes
+        stop_hook_handler; we verify the hook command points at the script and
+        that the script exists and contains the expected implementation.
+        """
         write_existing_settings(tmp_path, {"env": {}})
-        result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
+        result = run_setup_mlflow(tmp_path, {
+            "APP_OWNER": "jane@company.com",
+            "MLFLOW_CLAUDE_TRACING_ENABLED": "true",
+        })
         assert result.returncode == 0
         settings = read_settings(tmp_path)
         hook = settings["hooks"]["Stop"][0]["hooks"][0]
         assert hook["type"] == "command"
-        assert "stop_hook_handler" in hook["command"]
-        assert "mlflow.claude_code.hooks" in hook["command"]
+        assert "bash" in hook["command"]
+        assert "mlflow-trace-stop.sh" in hook["command"]
+        # Verify the script itself is present and contains the required logic
+        script_path = hook["command"].split(" ", 1)[1]
+        assert os.path.isfile(script_path), f"hook script not found: {script_path}"
+        script_content = open(script_path).read()
+        assert "--project" in script_content
+        assert "stop_hook_handler" in script_content
+        assert "mlflow.claude_code.hooks" in script_content
 
 
 # ---------------------------------------------------------------------------
@@ -129,7 +167,7 @@ class TestSettingsMerge:
     def test_preserves_existing_env_vars(self, tmp_path):
         write_existing_settings(tmp_path, {
             "env": {
-                "ANTHROPIC_MODEL": "databricks-claude-opus-4-6",
+                "ANTHROPIC_MODEL": "databricks-claude-opus-4-7",
                 "ANTHROPIC_BASE_URL": "https://test.com/anthropic",
                 "ANTHROPIC_AUTH_TOKEN": "secret",
             }
@@ -137,7 +175,7 @@ def test_preserves_existing_env_vars(self, tmp_path):
         result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
         assert result.returncode == 0
         settings = read_settings(tmp_path)
-        assert settings["env"]["ANTHROPIC_MODEL"] == "databricks-claude-opus-4-6"
+        assert settings["env"]["ANTHROPIC_MODEL"] == "databricks-claude-opus-4-7"
         assert settings["env"]["ANTHROPIC_BASE_URL"] == "https://test.com/anthropic"
         assert settings["env"]["ANTHROPIC_AUTH_TOKEN"] == "secret"
         assert settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] == "false"
@@ -149,7 +187,10 @@ def test_preserves_existing_hooks(self, tmp_path):
                 "PreToolUse": [{"hooks": [{"type": "command", "command": "echo pre"}]}]
             }
         })
-        result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
+        result = run_setup_mlflow(tmp_path, {
+            "APP_OWNER": "jane@company.com",
+            "MLFLOW_CLAUDE_TRACING_ENABLED": "true",
+        })
         assert result.returncode == 0
         settings = read_settings(tmp_path)
         assert "PreToolUse" in settings["hooks"]
@@ -187,9 +228,11 @@ class TestAppOwnerExport:
 
     def test_app_owner_set_in_env(self):
         import app as app_module
+        import app_state
         with mock.patch.object(app_module, "get_token_owner", return_value="owner@test.com"), \
              mock.patch.object(app_module, "cleanup_stale_sessions"), \
              mock.patch.object(app_module, "run_setup"), \
+             mock.patch.object(app_state, "set_app_owner"), \
              mock.patch("threading.Thread") as mock_thread:
             mock_thread.return_value.start = mock.MagicMock()
             app_module.initialize_app()
@@ -197,10 +240,12 @@ def test_app_owner_set_in_env(self):
 
     def test_app_owner_not_set_when_unknown(self):
         import app as app_module
+        import app_state
         os.environ.pop("APP_OWNER", None)
         with mock.patch.object(app_module, "get_token_owner", return_value=None), \
              mock.patch.object(app_module, "cleanup_stale_sessions"), \
              mock.patch.object(app_module, "run_setup"), \
+             mock.patch.object(app_state, "set_app_owner"), \
              mock.patch("threading.Thread") as mock_thread:
             mock_thread.return_value.start = mock.MagicMock()
             app_module.initialize_app()