From c980d43b9ffa0941078d41fe28610584b56a7964 Mon Sep 17 00:00:00 2001
From: mohammed ahmed <mohammedahmed18@users.noreply.github.com>
Date: Sun, 5 Apr 2026 03:20:12 +0000
Subject: [PATCH] Fix excessive logging in create_pr.py that creates 43MB+ log
 files

**Problem:**
Line 38 of create_pr.py logged all keys from function_to_tests dictionary
using `list(function_to_tests.keys())`. For large codebases like budibase
(1012 functions), this creates massive log files (43MB+) with a single
DEBUG statement printing thousands of function names.

**Root Cause:**
Debug logging statement was designed for small projects but became
problematic when used with monorepos containing hundreds of packages.

**Evidence:**
- Trace ID: 3d2ad2f0-254a-4401-9c93-84f691acabf0 (43MB log, 534K lines)
- Line 533922 shows list of 1000+ function keys in single log entry
- Affects 4/22 logs (18%) in recent optimization run
- Each occurrence adds ~100KB to log file

**Fix:**
Changed line 38 from:
  logger.debug(f"[PR-DEBUG] function_to_tests keys: {list(function_to_tests.keys())}")
to:
  logger.debug(f"[PR-DEBUG] function_to_tests has {len(function_to_tests)} keys")

This logs only the count instead of the full list, reducing log output
from ~100KB to ~50 bytes per call.

**Impact:**
- Severity: MEDIUM (doesn't break functionality, but bloats logs)
- Systematic: Reproducible on every optimization run with large codebases
- Benefits: Significantly reduces log file sizes for monorepo projects

**Testing:**
- Added 2 regression tests in test_create_pr_logging_bug.py
- Tests verify count is logged, not full key list
- Tests verify log output stays under 10KB (vs 100KB+ before)
- All existing tests pass
- Linting passes (uv run prek)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 codeflash/result/create_pr.py                 |   2 +-
 .../test_result/test_create_pr_logging_bug.py | 147 ++++++++++++++++++
 2 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_result/test_create_pr_logging_bug.py

diff --git a/codeflash/result/create_pr.py b/codeflash/result/create_pr.py
index 9325110fa..3c49ada9d 100644
--- a/codeflash/result/create_pr.py
+++ b/codeflash/result/create_pr.py
@@ -35,7 +35,7 @@ def existing_tests_source_for(
     logger.debug(
         f"[PR-DEBUG] existing_tests_source_for called with func={function_qualified_name_with_modules_from_root}"
     )
-    logger.debug(f"[PR-DEBUG] function_to_tests keys: {list(function_to_tests.keys())}")
+    logger.debug(f"[PR-DEBUG] function_to_tests has {len(function_to_tests)} keys")
     logger.debug(f"[PR-DEBUG] original_runtimes_all has {len(original_runtimes_all)} entries")
     logger.debug(f"[PR-DEBUG] optimized_runtimes_all has {len(optimized_runtimes_all)} entries")
     test_files = function_to_tests.get(function_qualified_name_with_modules_from_root)
diff --git a/tests/test_result/test_create_pr_logging_bug.py b/tests/test_result/test_create_pr_logging_bug.py
new file mode 100644
index 000000000..eb700e5a1
--- /dev/null
+++ b/tests/test_result/test_create_pr_logging_bug.py
@@ -0,0 +1,147 @@
+"""Test for Issue #9: Excessive logging in create_pr.py
+
+Verifies that function_to_tests logging uses count instead of full key list.
+"""
+
+import logging
+from io import StringIO
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+import pytest
+
+from codeflash.result.create_pr import existing_tests_source_for
+from codeflash.verification.verification_utils import TestConfig
+
+
+def test_function_to_tests_logging_uses_count_not_full_list():
+    """
+    Test that function_to_tests debug logging outputs count, not all keys.
+
+    Bug: Line 38 of create_pr.py logs `list(function_to_tests.keys())` which
+    creates massive log files (43MB+) when function_to_tests has thousands
+    of entries (e.g., budibase monorepo with 1012 functions).
+
+    Fix: Should log only `len(function_to_tests)` instead.
+    """
+    # Create a large function_to_tests dict (simulate budibase scale)
+    function_to_tests = {
+        f"package{i}.module{j}.function{k}": set()
+        for i in range(10)
+        for j in range(10)
+        for k in range(10)
+    }
+    # Total: 1000 keys
+
+    # Capture debug logs
+    log_stream = StringIO()
+    handler = logging.StreamHandler(log_stream)
+    handler.setLevel(logging.DEBUG)
+    formatter = logging.Formatter('%(message)s')
+    handler.setFormatter(formatter)
+
+    # Get the 'rich' logger used by console.py
+    logger = logging.getLogger('rich')
+    original_level = logger.level
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG)
+
+    try:
+        # Mock test_cfg
+        test_cfg = Mock(spec=TestConfig)
+        test_cfg.test_framework = "jest"
+
+        # Call the function
+        existing_tests_source_for(
+            function_qualified_name_with_modules_from_root="test.function",
+            function_to_tests=function_to_tests,
+            test_cfg=test_cfg,
+            original_runtimes_all={},
+            optimized_runtimes_all={},
+            test_files_registry=None,
+        )
+
+        # Get log output
+        log_output = log_stream.getvalue()
+
+        # ASSERTION 1: Should log the count
+        assert "function_to_tests" in log_output, "Should mention function_to_tests in logs"
+        assert "1000" in log_output or "len" in log_output, \
+            "Should log count of function_to_tests, not full list"
+
+        # ASSERTION 2: Should NOT log all keys (would create massive logs)
+        # Check that we don't have dozens of "package0.module" strings
+        package_mentions = log_output.count("package0.module")
+        assert package_mentions < 10, \
+            f"Should not log all {len(function_to_tests)} keys. " \
+            f"Found {package_mentions} package mentions, which suggests full list logging. " \
+            f"Log output size: {len(log_output)} bytes"
+
+        # ASSERTION 3: Log output should be reasonable size (< 10KB for this debug line)
+        # The buggy version would produce ~100KB+ for 1000 keys
+        assert len(log_output) < 10000, \
+            f"Log output too large ({len(log_output)} bytes). " \
+            f"This suggests logging full key list instead of count."
+
+    finally:
+        logger.removeHandler(handler)
+        logger.setLevel(original_level)
+
+
+def test_function_to_tests_logging_with_small_dict():
+    """
+    Test that logging still works correctly with small function_to_tests dict.
+
+    This ensures the fix doesn't break the normal case.
+    """
+    # Small dict (< 10 entries)
+    function_to_tests = {
+        "module.function1": set(),
+        "module.function2": set(),
+    }
+
+    # Capture debug logs
+    log_stream = StringIO()
+    handler = logging.StreamHandler(log_stream)
+    handler.setLevel(logging.DEBUG)
+    formatter = logging.Formatter('%(message)s')
+    handler.setFormatter(formatter)
+
+    # Get the 'rich' logger used by console.py
+    logger = logging.getLogger('rich')
+    original_level = logger.level
+    logger.addHandler(handler)
+    logger.setLevel(logging.DEBUG)
+
+    try:
+        # Mock test_cfg
+        test_cfg = Mock(spec=TestConfig)
+        test_cfg.test_framework = "jest"
+
+        # Call the function
+        existing_tests_source_for(
+            function_qualified_name_with_modules_from_root="test.function",
+            function_to_tests=function_to_tests,
+            test_cfg=test_cfg,
+            original_runtimes_all={},
+            optimized_runtimes_all={},
+            test_files_registry=None,
+        )
+
+        # Get log output
+        log_output = log_stream.getvalue()
+
+        # Should mention function_to_tests
+        assert "function_to_tests" in log_output
+
+        # Log should be reasonable size
+        assert len(log_output) < 5000, \
+            f"Even with small dict, log output is too large ({len(log_output)} bytes)"
+
+    finally:
+        logger.removeHandler(handler)
+        logger.setLevel(original_level)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])