Skip to content

Commit fec5df9

Browse files
author
Atlas
committed
feat(test-generation): force tool calls, 400 turns, gpt-5.4 model
- MAX_AGENT_TURNS: 200 -> 400
- Default model: moonshotai/kimi-k2.5 -> openai/gpt-5.4
- Force tool usage: every response must have tool calls
- Updated SYSTEM_PROMPT: no text, only tools
- Updated user prompt: explicit tool examples
- Force error message when no tool used
- Tests: 1197 pass
1 parent ad545c2 commit fec5df9

6 files changed

Lines changed: 99 additions & 35 deletions

File tree

src/swe_forge/cli/mine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ async def _run_pipeline(
423423
config: SwePipelineConfig,
424424
repo_filter: Optional[str],
425425
verbose: bool,
426-
model: str = "moonshotai/kimi-k2.5",
426+
model: str = "openai/gpt-5.4",
427427
):
428428
"""Run the SWE pipeline with progress tracking."""
429429
from dataclasses import dataclass, field

src/swe_forge/llm/tools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
# ─────────────────────────────────────────────────────────────────────────────
2727

2828
DEFAULT_SHELL_TIMEOUT_MS = 30_000 # 30 seconds default timeout
29-
MAX_TURNS_DEFAULT = 200
29+
MAX_TURNS_DEFAULT = 400
3030

3131

3232
def shell_tool_schema() -> ToolDefinition:

src/swe_forge/swe/test_generator.py

Lines changed: 39 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
# Constants
4141
# ─────────────────────────────────────────────────────────────────────────────
4242

43-
MAX_AGENT_TURNS = 200
43+
MAX_AGENT_TURNS = 400
4444
MAX_VALIDATION_RETRIES = 3
4545
DEFAULT_TIMEOUT_MS = 60_000
4646

@@ -565,32 +565,40 @@ def _build_user_message(self, task: SweTask) -> str:
565565
{self._truncate(task.patch, 4000)}
566566
```
567567
568-
== WORKFLOW (FOLLOW EXACTLY) ==
568+
== CRITICAL: YOU MUST USE TOOLS, NOT TEXT ==
569569
570-
Step 1: Install dependencies
571-
- Run: apt-get update && apt-get install -y python3 python3-pip git
572-
- Check pyproject.toml/setup.py for install commands
573-
- Run install commands via `shell`
570+
DO NOT write text responses. DO NOT ask questions. DO NOT explain.
571+
USE TOOLS: shell, read_file, write_file, submit_tests.
574572
575-
Step 2: Explore the changed code
576-
- Use `read_file` to read the files mentioned in the diff
577-
- Use `list_dir` to understand project structure
573+
== WORKFLOW (EXECUTE NOW) ==
578574
579-
Step 3: WRITE TEST FILES (DO THIS NOW!)
580-
- Use `write_file` to create test_swe_<feature>.py
581-
- Write behavioral tests that EXERCISE the changed functionality
575+
Step 1: INSTALL (use shell tool)
576+
shell: apt-get update && apt-get install -y python3 python3-pip git
577+
shell: pip install -e . (or check pyproject.toml)
582578
583-
Step 4: Run tests to validate
584-
- Use `shell` to run: pytest -c /dev/null test_swe_<feature>.py -v
579+
Step 2: READ the changed files (use read_file tool)
580+
read_file: path to files from the diff
585581
586-
Step 5: SUBMIT (MUST CALL THIS!)
587-
- Call `submit_tests` with your test files and install commands
582+
Step 3: WRITE TESTS (use write_file tool)
583+
write_file: path=test_swe_feature.py, content=your test code
584+
Tests MUST import modules and call functions - behavioral tests only.
588585
589-
== CRITICAL REMINDERS ==
590-
- Do NOT just explore forever - WRITE TEST FILES using `write_file`
591-
- Do NOT end without calling `submit_tests`
592-
- Tests MUST be behavioral (import, call functions, check values)
593-
- Do NOT read source and assert on file content."""
586+
Step 4: RUN TESTS (use shell tool)
587+
shell: pytest -c /dev/null test_swe_feature.py -v
588+
589+
Step 5: SUBMIT (use submit_tests tool) - THIS IS MANDATORY!
590+
submit_tests: fail_to_pass=["pytest test_swe_feature.py"],
591+
pass_to_pass=[],
592+
test_files=[{{path, content}}],
593+
install_commands=["apt-get...", "pip install..."]
594+
595+
== ABSOLUTE RULES ==
596+
- EVERY response MUST include a tool call
597+
- NO text explanations - only tool calls
598+
- NO questions - just execute
599+
- MUST call submit_tests to finish
600+
- If unsure, try something with shell tool
601+
- YOU HAVE 400 TURNS - use them to complete the task"""
594602

595603
def _test_commands_for_language(self, language: str) -> tuple[list[str], list[str]]:
596604
"""Get suggested build and test commands for a language.
@@ -905,17 +913,16 @@ async def generate_tests(
905913

906914
continue
907915

908-
# No tool calls, check for text response
909-
if message.content and message.content.strip():
910-
loop.add_assistant(message.content)
911-
loop.add_user(
912-
"Use the `shell` tool to explore the repo and run tests, "
913-
"then call `submit_tests`."
914-
)
915-
continue
916-
917-
# Empty response, we're done
918-
break
916+
# No tool calls - FORCE TOOL USAGE
917+
# Agent must use tools, not write text
918+
loop.add_assistant(message.content or "")
919+
loop.add_user(
920+
"ERROR: You must use tools (shell, read_file, write_file, submit_tests), not text. "
921+
"DO NOT explain. DO NOT ask questions. "
922+
"Execute: shell('apt-get update'), read_file('file.py'), write_file('test.py', '...'), submit_tests(...). "
923+
"Call a tool NOW."
924+
)
925+
continue
919926

920927
# Exhausted turns without success
921928
return GeneratedTests(
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
diff --git a/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py b/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py
2+
index 183611ef044d..bb223fbe6dde 100644
3+
--- a/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py
4+
+++ b/scripts/ci/utils/diffusion/generate_diffusion_dashboard.py
5+
@@ -26,7 +26,7 @@
6+
CI_DATA_REPO_NAME = "sglang-ci-data"
7+
CI_DATA_BRANCH = "main"
8+
HISTORY_PREFIX = "diffusion-comparisons"
9+
-MAX_HISTORY_RUNS = 7
10+
+MAX_HISTORY_RUNS = 14
11+
12+
# Base URL for chart images pushed to sglang-ci-data
13+
CHARTS_RAW_BASE_URL = (
14+
@@ -344,7 +344,7 @@ def generate_dashboard(
15+
16+
# ---- Section 2: SGLang Performance Trend ----
17+
if history:
18+
- lines.append("\n## SGLang Performance Trend (Last 7 Runs)\n")
19+
+ lines.append(f"\n## SGLang Performance Trend (Last {len(history) + 1} Runs)\n")
20+
21+
# Build header
22+
header = "| Date | Commit |"
23+
@@ -491,9 +491,16 @@ def _chart_label(run: dict) -> str:
24+
ax.set_xticklabels(labels, fontsize=7)
25+
ax.set_ylabel("Latency (s)")
26+
ax.set_title(f"Latency Trend -- {cid}", fontsize=11, fontweight="bold")
27+
- ax.legend(loc="upper right", fontsize=8)
28+
+ ax.legend(loc="lower right", fontsize=8, framealpha=0.8)
29+
ax.grid(True, alpha=0.3)
30+
- ax.set_ylim(bottom=0)
31+
+ all_vals = sg_vals + [v for v in vl_vals if v is not None]
32+
+ y_min = min(all_vals)
33+
+ y_max = max(all_vals)
34+
+ y_range = y_max - y_min if y_max > y_min else max(y_max * 0.1, 0.1)
35+
+ ax.set_ylim(
36+
+ bottom=max(0, y_min - y_range * 0.3),
37+
+ top=y_max + y_range * 0.3,
38+
+ )
39+
40+
filename = f"latency_{_sanitize_filename(cid)}.png"
41+
chart_path = os.path.join(charts_dir, filename)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
task_id: sgl-project-sglang-21653
2+
repo:
3+
url: https://github.com/sgl-project/sglang.git
4+
base_commit: c06ca1526cb6008a8dacb4fdb06567e648134664
5+
merge_commit: 0c5620e6b142898b11d01ec4546015c5687fd9eb
6+
language: python
7+
difficulty_score: 3
8+
prompt: '[diffusion] Fix dashboard chart display issues'
9+
environment:
10+
image: ubuntu:24.04
11+
language_version: '3.12'
12+
install:
13+
commands: []
14+
tests:
15+
fail_to_pass: []
16+
pass_to_pass: []

tests/test_swe/test_test_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,7 @@ def test_test_commands_for_language(self):
651651

652652
class TestConstants:
653653
def test_max_agent_turns(self):
654-
assert MAX_AGENT_TURNS == 200
654+
assert MAX_AGENT_TURNS == 400
655655

656656
def test_max_validation_retries(self):
657657
assert MAX_VALIDATION_RETRIES == 3

0 commit comments

Comments
 (0)