feat(test-generation): force tool calls, 400 turns, gpt-5.4 model

Atlas · Atlas · commit fec5df98ac14 · 2026-03-30T08:17:09.000Z
- MAX_AGENT_TURNS (test_generator.py) and MAX_TURNS_DEFAULT (llm/tools.py): 200 -> 400
- Default model: moonshotai/kimi-k2.5 -> openai/gpt-5.4
- Force tool usage: every response must have tool calls
- Updated SYSTEM_PROMPT: no text, only tools
- Updated user prompt: explicit tool examples
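- Reply with an error message whenever a response has no tool call (empty responses no longer end the loop; it runs until submit or turn exhaustion)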
- Tests: 1197 pass
diff --git a/src/swe_forge/cli/mine.py b/src/swe_forge/cli/mine.py
@@ -423,7 +423,7 @@ async def _run_pipeline(
     config: SwePipelineConfig,
     repo_filter: Optional[str],
     verbose: bool,
-    model: str = "moonshotai/kimi-k2.5",
+    model: str = "openai/gpt-5.4",
 ):
     """Run the SWE pipeline with progress tracking."""
     from dataclasses import dataclass, field
diff --git a/src/swe_forge/llm/tools.py b/src/swe_forge/llm/tools.py
@@ -26,7 +26,7 @@
 # ─────────────────────────────────────────────────────────────────────────────
 
 DEFAULT_SHELL_TIMEOUT_MS = 30_000  # 30 seconds default timeout
-MAX_TURNS_DEFAULT = 200
+MAX_TURNS_DEFAULT = 400
 
 
 def shell_tool_schema() -> ToolDefinition:
diff --git a/src/swe_forge/swe/test_generator.py b/src/swe_forge/swe/test_generator.py
@@ -40,7 +40,7 @@
 # Constants
 # ─────────────────────────────────────────────────────────────────────────────
 
-MAX_AGENT_TURNS = 200
+MAX_AGENT_TURNS = 400
 MAX_VALIDATION_RETRIES = 3
 DEFAULT_TIMEOUT_MS = 60_000
 
@@ -565,32 +565,40 @@ def _build_user_message(self, task: SweTask) -> str:
 {self._truncate(task.patch, 4000)}
 ```
 
-== WORKFLOW (FOLLOW EXACTLY) ==
+== CRITICAL: YOU MUST USE TOOLS, NOT TEXT ==
 
-Step 1: Install dependencies
-- Run: apt-get update && apt-get install -y python3 python3-pip git
-- Check pyproject.toml/setup.py for install commands
-- Run install commands via `shell`
+DO NOT write text responses. DO NOT ask questions. DO NOT explain. 
+USE TOOLS: shell, read_file, write_file, submit_tests.
 
-Step 2: Explore the changed code
-- Use `read_file` to read the files mentioned in the diff
-- Use `list_dir` to understand project structure
+== WORKFLOW (EXECUTE NOW) ==
 
-Step 3: WRITE TEST FILES (DO THIS NOW!)
-- Use `write_file` to create test_swe_<feature>.py
-- Write behavioral tests that EXERCISE the changed functionality
+Step 1: INSTALL (use shell tool)
+shell: apt-get update && apt-get install -y python3 python3-pip git
+shell: pip install -e . (or check pyproject.toml)
 
-Step 4: Run tests to validate
-- Use `shell` to run: pytest -c /dev/null test_swe_<feature>.py -v
+Step 2: READ the changed files (use read_file tool)
+read_file: path to files from the diff
 
-Step 5: SUBMIT (MUST CALL THIS!)
-- Call `submit_tests` with your test files and install commands
+Step 3: WRITE TESTS (use write_file tool) 
+write_file: path=test_swe_feature.py, content=your test code
+Tests MUST import modules and call functions - behavioral tests only.
 
-== CRITICAL REMINDERS ==
-- Do NOT just explore forever - WRITE TEST FILES using `write_file`
-- Do NOT end without calling `submit_tests`
-- Tests MUST be behavioral (import, call functions, check values)
-- Do NOT read source and assert on file content."""
+Step 4: RUN TESTS (use shell tool)
+shell: pytest -c /dev/null test_swe_feature.py -v
+
+Step 5: SUBMIT (use submit_tests tool) - THIS IS MANDATORY!
+submit_tests: fail_to_pass=["pytest test_swe_feature.py"], 
+              pass_to_pass=[], 
+              test_files=[{{path, content}}], 
+              install_commands=["apt-get...", "pip install..."]
+
+== ABSOLUTE RULES ==
+- EVERY response MUST include a tool call
+- NO text explanations - only tool calls
+- NO questions - just execute
+- MUST call submit_tests to finish
+- If unsure, try something with shell tool
+- YOU HAVE 400 TURNS - use them to complete the task"""
 
     def _test_commands_for_language(self, language: str) -> tuple[list[str], list[str]]:
         """Get suggested build and test commands for a language.
@@ -905,17 +913,16 @@ async def generate_tests(
 
                 continue
 
-            # No tool calls, check for text response
-            if message.content and message.content.strip():
-                loop.add_assistant(message.content)
-                loop.add_user(
-                    "Use the `shell` tool to explore the repo and run tests, "
-                    "then call `submit_tests`."
-                )
-                continue
-
-            # Empty response, we're done
-            break
+            # No tool calls - FORCE TOOL USAGE
+            # Agent must use tools, not write text
+            loop.add_assistant(message.content or "")
+            loop.add_user(
+                "ERROR: You must use tools (shell, read_file, write_file, submit_tests), not text. "
+                "DO NOT explain. DO NOT ask questions. "
+                "Execute: shell('apt-get update'), read_file('file.py'), write_file('test.py', '...'), submit_tests(...). "
+                "Call a tool NOW."
+            )
+            continue
 
         # Exhausted turns without success
         return GeneratedTests(
diff --git a/test-output/sgl-project-sglang-21653/patch.diff b/test-output/sgl-project-sglang-21653/patch.diff
@@ -0,0 +1,41 @@
+diff --git a/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py b/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py
+index 183611ef044d..bb223fbe6dde 100644
+--- a/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py
++++ b/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py
+@@ -26,7 +26,7 @@
+ CI_DATA_REPO_NAME = "sglang-ci-data"
+ CI_DATA_BRANCH = "main"
+ HISTORY_PREFIX = "diffusion-comparisons"
+-MAX_HISTORY_RUNS = 7
++MAX_HISTORY_RUNS = 14
+ 
+ # Base URL for chart images pushed to sglang-ci-data
+ CHARTS_RAW_BASE_URL = (
+@@ -344,7 +344,7 @@ def generate_dashboard(
+ 
+     # ---- Section 2: SGLang Performance Trend ----
+     if history:
+-        lines.append("\n## SGLang Performance Trend (Last 7 Runs)\n")
++        lines.append(f"\n## SGLang Performance Trend (Last {len(history) + 1} Runs)\n")
+ 
+         # Build header
+         header = "| Date | Commit |"
+@@ -491,9 +491,16 @@ def _chart_label(run: dict) -> str:
+                 ax.set_xticklabels(labels, fontsize=7)
+                 ax.set_ylabel("Latency (s)")
+                 ax.set_title(f"Latency Trend -- {cid}", fontsize=11, fontweight="bold")
+-                ax.legend(loc="upper right", fontsize=8)
++                ax.legend(loc="lower right", fontsize=8, framealpha=0.8)
+                 ax.grid(True, alpha=0.3)
+-                ax.set_ylim(bottom=0)
++                all_vals = sg_vals + [v for v in vl_vals if v is not None]
++                y_min = min(all_vals)
++                y_max = max(all_vals)
++                y_range = y_max - y_min if y_max > y_min else max(y_max * 0.1, 0.1)
++                ax.set_ylim(
++                    bottom=max(0, y_min - y_range * 0.3),
++                    top=y_max + y_range * 0.3,
++                )
+ 
+                 filename = f"latency_{_sanitize_filename(cid)}.png"
+                 chart_path = os.path.join(charts_dir, filename)
diff --git a/test-output/sgl-project-sglang-21653/workspace.yaml b/test-output/sgl-project-sglang-21653/workspace.yaml
@@ -0,0 +1,16 @@
+task_id: sgl-project-sglang-21653
+repo:
+  url: https://github.com/sgl-project/sglang.git
+  base_commit: c06ca1526cb6008a8dacb4fdb06567e648134664
+  merge_commit: 0c5620e6b142898b11d01ec4546015c5687fd9eb
+language: python
+difficulty_score: 3
+prompt: '[diffusion] Fix dashboard chart display issues'
+environment:
+  image: ubuntu:24.04
+  language_version: '3.12'
+install:
+  commands: []
+tests:
+  fail_to_pass: []
+  pass_to_pass: []
diff --git a/tests/test_swe/test_test_generator.py b/tests/test_swe/test_test_generator.py
@@ -651,7 +651,7 @@ def test_test_commands_for_language(self):
 
 class TestConstants:
     def test_max_agent_turns(self):
-        assert MAX_AGENT_TURNS == 200
+        assert MAX_AGENT_TURNS == 400
 
     def test_max_validation_retries(self):
         assert MAX_VALIDATION_RETRIES == 3