From 5472589b55179bdb13db57331d5aaf534487f783 Mon Sep 17 00:00:00 2001
From: zhuohua
Date: Tue, 13 Jan 2026 20:59:45 +0800
Subject: [PATCH 1/2] feat(grader): add metric_type parameter to ToolCallSequenceMatchGrader

- Add metric_type parameter to control score calculation when strict_mode=False and use_jaccard_similarity=False
- Support 'recall' (default): matched_count / reference_count
- Support 'precision': matched_count / predicted_count
- Add validation for metric_type parameter
- Add corresponding test cases
---
 .gitignore                                    |   1 +
 .../agent/tool/tool_call_sequence_match.py    |  35 ++++-
 .../tool/test_tool_call_sequence_match.py     | 137 ++++++++++++++++++
 3 files changed, 167 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index afeef1f0..c6484807 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ __pycache__/
 poetry.lock
 /site
 uv*/
+uv.lock
 
 # Build and Distribution Files
 /dist
diff --git a/openjudge/graders/agent/tool/tool_call_sequence_match.py b/openjudge/graders/agent/tool/tool_call_sequence_match.py
index f4bcd668..f58db593 100644
--- a/openjudge/graders/agent/tool/tool_call_sequence_match.py
+++ b/openjudge/graders/agent/tool/tool_call_sequence_match.py
@@ -44,16 +44,31 @@ def __init__(
         self,
         strict_mode: bool = True,
         use_jaccard_similarity: bool = True,
+        metric_type: str = "recall",
         **kwargs,
     ):
+        """
+        Initialize the ToolCallSequenceMatchGrader.
+
+        Args:
+            strict_mode: If True, matches both tool_call name and arguments; if False, only matches tool_call name
+            use_jaccard_similarity: If True, use Jaccard similarity for loose mode (ignores step order)
+            metric_type: Metric type for step matching when use_jaccard_similarity=False.
+                - "recall": matched_count / reference_count (default)
+                - "precision": matched_count / predicted_count
+            **kwargs: Additional arguments passed to BaseGrader
+        """
         super().__init__(
             name="tool_call_sequence",
             mode=GraderMode.POINTWISE,
             description="Evaluate tool call sequence matching against reference",
             **kwargs,
         )
+        if metric_type not in ("recall", "precision"):
+            raise ValueError(f"metric_type must be 'recall' or 'precision', got '{metric_type}'")
         self.strict_mode = strict_mode
         self.use_jaccard_similarity = use_jaccard_similarity
+        self.metric_type = metric_type
 
     def extract_predicted_tool_sequence(
         self,
@@ -267,11 +282,19 @@ def calculate_step_matching_score(
                     step_score = sum(tool_scores) / len(tool_scores) if tool_scores else 0.0
                 else:
                     # In loose mode, calculate step score based on the ratio of matched tools
-                    if len(gt_tool_names) > 0:
-                        matched_count = len(gt_tool_names) - len(missing)
-                        step_score = matched_count / len(gt_tool_names)
-                    else:
-                        step_score = 1.0
+                    matched_count = len(gt_tool_names) - len(missing)
+                    if self.metric_type == "recall":
+                        # Recall: matched / reference
+                        if len(gt_tool_names) > 0:
+                            step_score = matched_count / len(gt_tool_names)
+                        else:
+                            step_score = 1.0
+                    else:  # precision
+                        # Precision: matched / predicted
+                        if len(pred_tool_names) > 0:
+                            step_score = matched_count / len(pred_tool_names)
+                        else:
+                            step_score = 0.0 if len(gt_tool_names) > 0 else 1.0
             else:
                 step_score = 0.0  # No matching step in model
             total_score += step_score
@@ -420,7 +443,7 @@ async def aevaluate(
             score_type = "step_matching"
         # Generate detailed reason
         mode_str = "strict" if self.strict_mode else "loose"
-        method_str = "jaccard" if self.use_jaccard_similarity else "step-by-step"
+        method_str = "jaccard" if self.use_jaccard_similarity else f"step-by-step/{self.metric_type}"
         reason = f"Tool call sequence evaluation ({mode_str} mode, {method_str}): {score_type}={final_score:.3f}"
         # Count tools for metadata
         predicted_tool_count = sum(len(tools) for tools in predicted_tool_steps.values())
diff --git a/tests/graders/agent/tool/test_tool_call_sequence_match.py b/tests/graders/agent/tool/test_tool_call_sequence_match.py
index 03cc1726..36e65a33 100644
--- a/tests/graders/agent/tool/test_tool_call_sequence_match.py
+++ b/tests/graders/agent/tool/test_tool_call_sequence_match.py
@@ -178,3 +178,140 @@ def test_tool_call_sequence_match_grader_extract_predicted_tool_sequence():
     assert 1 in sequence
     assert sequence[0][0]["name"] == "search"
     assert sequence[1][0]["name"] == "analyze"
+
+
+def test_tool_call_sequence_match_grader_metric_type_default():
+    """Test that default metric_type is recall"""
+    grader = ToolCallSequenceMatchGrader(strict_mode=False, use_jaccard_similarity=False)
+    assert grader.metric_type == "recall"
+
+
+def test_tool_call_sequence_match_grader_metric_type_precision():
+    """Test creating grader with precision metric_type"""
+    grader = ToolCallSequenceMatchGrader(
+        strict_mode=False,
+        use_jaccard_similarity=False,
+        metric_type="precision",
+    )
+    assert grader.metric_type == "precision"
+
+
+def test_tool_call_sequence_match_grader_invalid_metric_type():
+    """Test that invalid metric_type raises ValueError"""
+    with pytest.raises(ValueError, match="metric_type must be 'recall' or 'precision'"):
+        ToolCallSequenceMatchGrader(metric_type="invalid")
+
+
+@pytest.mark.asyncio
+async def test_tool_call_sequence_match_grader_recall_metric():
+    """Test loose mode with recall metric (matched / reference)"""
+    grader = ToolCallSequenceMatchGrader(
+        strict_mode=False,
+        use_jaccard_similarity=False,
+        metric_type="recall",
+    )
+
+    # Predicted has 1 tool, reference has 2 tools, 1 match
+    messages = [
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {"function": {"name": "search", "arguments": "{}"}},
+            ],
+        },
+    ]
+
+    reference_tool_calls = [
+        [
+            {"name": "search", "arguments": {}},
+            {"name": "calculate", "arguments": {}},
+        ],
+    ]
+
+    result = await grader.aevaluate(
+        messages=messages,
+        reference_tool_calls=reference_tool_calls,
+    )
+
+    # Recall = 1 matched / 2 reference = 0.5
+    assert result.score == 0.5
+
+
+@pytest.mark.asyncio
+async def test_tool_call_sequence_match_grader_precision_metric():
+    """Test loose mode with precision metric (matched / predicted)"""
+    grader = ToolCallSequenceMatchGrader(
+        strict_mode=False,
+        use_jaccard_similarity=False,
+        metric_type="precision",
+    )
+
+    # Predicted has 2 tools, reference has 1 tool, 1 match
+    messages = [
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {"function": {"name": "search", "arguments": "{}"}},
+                {"function": {"name": "calculate", "arguments": "{}"}},
+            ],
+        },
+    ]
+
+    reference_tool_calls = [
+        [
+            {"name": "search", "arguments": {}},
+        ],
+    ]
+
+    result = await grader.aevaluate(
+        messages=messages,
+        reference_tool_calls=reference_tool_calls,
+    )
+
+    # Precision = 1 matched / 2 predicted = 0.5
+    assert result.score == 0.5
+
+
+@pytest.mark.asyncio
+async def test_tool_call_sequence_match_grader_recall_vs_precision():
+    """Test that recall and precision give different scores for same input"""
+    messages = [
+        {
+            "role": "assistant",
+            "tool_calls": [
+                {"function": {"name": "search", "arguments": "{}"}},
+                {"function": {"name": "extra_tool", "arguments": "{}"}},
+            ],
+        },
+    ]
+
+    reference_tool_calls = [
+        [
+            {"name": "search", "arguments": {}},
+        ],
+    ]
+
+    # Recall grader: 1 matched / 1 reference = 1.0
+    recall_grader = ToolCallSequenceMatchGrader(
+        strict_mode=False,
+        use_jaccard_similarity=False,
+        metric_type="recall",
+    )
+    recall_result = await recall_grader.aevaluate(
+        messages=messages,
+        reference_tool_calls=reference_tool_calls,
+    )
+
+    # Precision grader: 1 matched / 2 predicted = 0.5
+    precision_grader = ToolCallSequenceMatchGrader(
+        strict_mode=False,
+        use_jaccard_similarity=False,
+        metric_type="precision",
+    )
+    precision_result = await precision_grader.aevaluate(
+        messages=messages,
+        reference_tool_calls=reference_tool_calls,
+    )
+
+    assert recall_result.score == 1.0
+    assert precision_result.score == 0.5

From ac95a448bf59e89dbf3470cbe86ad6c32b26ad1e Mon Sep 17 00:00:00 2001
From: zhuohua
Date: Tue, 13 Jan 2026 21:09:18 +0800
Subject: [PATCH 2/2] update param desc for metric_type

---
 openjudge/graders/agent/tool/tool_call_sequence_match.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openjudge/graders/agent/tool/tool_call_sequence_match.py b/openjudge/graders/agent/tool/tool_call_sequence_match.py
index f58db593..8a2cccd3 100644
--- a/openjudge/graders/agent/tool/tool_call_sequence_match.py
+++ b/openjudge/graders/agent/tool/tool_call_sequence_match.py
@@ -53,7 +53,7 @@ def __init__(
         Args:
             strict_mode: If True, matches both tool_call name and arguments; if False, only matches tool_call name
             use_jaccard_similarity: If True, use Jaccard similarity for loose mode (ignores step order)
-            metric_type: Metric type for step matching when use_jaccard_similarity=False.
+            metric_type: Metric type for step matching when use_jaccard_similarity=False and strict_mode=False.
                 - "recall": matched_count / reference_count (default)
                 - "precision": matched_count / predicted_count
             **kwargs: Additional arguments passed to BaseGrader
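
To illustrate the new parameter in use, here is a minimal sketch built from the tests in this series. It assumes the package import path mirrors the file path above and that aevaluate returns a result object exposing a score attribute, as the new tests do:

    import asyncio

    from openjudge.graders.agent.tool.tool_call_sequence_match import ToolCallSequenceMatchGrader


    async def main():
        # Loose, step-by-step matching scored as precision (matched_count / predicted_count)
        # instead of the default recall (matched_count / reference_count).
        grader = ToolCallSequenceMatchGrader(
            strict_mode=False,
            use_jaccard_similarity=False,
            metric_type="precision",
        )

        # One assistant turn that calls two tools; the reference step expects only one.
        messages = [
            {
                "role": "assistant",
                "tool_calls": [
                    {"function": {"name": "search", "arguments": "{}"}},
                    {"function": {"name": "calculate", "arguments": "{}"}},
                ],
            },
        ]
        reference_tool_calls = [
            [{"name": "search", "arguments": {}}],
        ]

        result = await grader.aevaluate(
            messages=messages,
            reference_tool_calls=reference_tool_calls,
        )
        print(result.score)  # precision: 1 matched / 2 predicted = 0.5


    asyncio.run(main())

With metric_type="recall" (the default), the same input would score 1.0, since the single reference tool is matched.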