60 changes: 53 additions & 7 deletions src/google/adk/evaluation/final_response_match_v1.py
@@ -27,6 +27,7 @@
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult
from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
Contributor (medium):

As a matter of style, it's better to avoid inline comments on import statements, as per PEP 8. The purpose of the import is clear from the code that uses it. Please remove the comment.

Suggested change
from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
from .text_utils import normalize_text



class RougeEvaluator(Evaluator):
@@ -110,10 +111,55 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
Returns:
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
Contributor (medium):

The docstring is incorrect. This function returns a Score namedtuple, not a dictionary. Please update the docstring to reflect the actual return type. This will improve clarity for future developers.

Suggested change
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
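
To make the return type concrete, here is a minimal interactive sketch against the rouge-score package used by this module (assuming it is installed; the example strings are illustrative, not from the PR):

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
result = scorer.score("the cat sat on the mat", "the cat sat")["rouge1"]
print(type(result).__name__)  # Score, a namedtuple, not a dict
print(result.precision, result.recall, result.fmeasure)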

"""
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)

# The score method returns a dictionary where keys are the ROUGE types
# and values are Score objects (tuples) with precision, recall, and fmeasure.
scores = scorer.score(reference, candidate)

return scores["rouge1"]
# Normalize both texts before scoring to handle Unicode variations
normalized_candidate = normalize_text(candidate)
normalized_reference = normalize_text(reference)

# Check if the text contains spaces (word-separated languages)
has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate

if has_spaces:
# Use standard word-level ROUGE for space-separated languages
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
scores = scorer.score(normalized_reference, normalized_candidate)
return scores["rouge1"]
else:
# For non-space-separated languages, use character-level comparison
return _calculate_character_level_rouge(normalized_candidate, normalized_reference)


def _calculate_character_level_rouge(candidate: str, reference: str):
  """Calculates character-level ROUGE-1 score for non-space-separated text.

  Args:
    candidate: The candidate text (already normalized).
    reference: The reference text (already normalized).

  Returns:
    A Score namedtuple with precision, recall, and fmeasure.
  """
  from collections import Counter, namedtuple

  if not reference or not candidate:
    Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
    return Score(precision=0.0, recall=0.0, fmeasure=0.0)

  # Count character occurrences
  ref_chars = Counter(reference)
  cand_chars = Counter(candidate)

  # Calculate overlapping characters
  overlap = sum((ref_chars & cand_chars).values())

  # Calculate precision and recall
  precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
  recall = overlap / len(reference) if len(reference) > 0 else 0.0
Comment on lines +160 to +161
Contributor (medium):

The checks if len(candidate) > 0 and if len(reference) > 0 are redundant. The guard clause on line 149 (if not reference or not candidate:) ensures that if this part of the code is reached, both candidate and reference are non-empty strings, so their lengths will be greater than 0. You can simplify the code by removing these checks.

Suggested change
precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
recall = overlap / len(reference) if len(reference) > 0 else 0.0
precision = overlap / len(candidate)
recall = overlap / len(reference)


  # Calculate F-measure
  if precision + recall > 0:
    fmeasure = 2 * (precision * recall) / (precision + recall)
  else:
    fmeasure = 0.0

  Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
  return Score(precision=precision, recall=recall, fmeasure=fmeasure)
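
As a quick worked example of the character-level path above (the Thai strings are illustrative, not taken from the PR), the Counter intersection counts shared characters, and precision, recall, and F-measure follow directly:

from collections import Counter

candidate, reference = "สวัสดี", "สวัสดีครับ"
overlap = sum((Counter(reference) & Counter(candidate)).values())  # 6 shared characters
precision = overlap / len(candidate)  # 6 / 6 = 1.0
recall = overlap / len(reference)  # 6 / 10 = 0.6
fmeasure = 2 * (precision * recall) / (precision + recall)  # 0.75
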
Contributor (high):

This function has a few areas for improvement regarding style and efficiency:

  • Imports: The import from collections import Counter, namedtuple should be at the top of the file, per PEP 8.
  • namedtuple definition: The Score namedtuple is defined twice inside this function. This is inefficient as it's redefined on every call. It should be defined once at the module level.
  • Redundant checks: The checks for non-zero length before division are redundant, as the if not reference or not candidate: guard at the beginning already handles this.

Here is a suggested refactoring that addresses these points. Please remember to move the import to the top of the file and define Score at the module level.

# At top of file:
from collections import Counter, namedtuple
# ...

# At module level, after imports:
Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
def _calculate_character_level_rouge(candidate: str, reference: str):
  """Calculates character-level ROUGE-1 score for non-space-separated text.
  
  Args:
    candidate: The candidate text (already normalized).
    reference: The reference text (already normalized).
  
  Returns:
    A Score namedtuple with precision, recall, and fmeasure.
  """
  if not reference or not candidate:
    return Score(precision=0.0, recall=0.0, fmeasure=0.0)
  
  # Count character occurrences
  ref_chars = Counter(reference)
  cand_chars = Counter(candidate)
  
  # Calculate overlapping characters
  overlap = sum((ref_chars & cand_chars).values())
  
  # Calculate precision and recall
  precision = overlap / len(candidate)
  recall = overlap / len(reference)
  
  # Calculate F-measure
  if precision + recall > 0:
    fmeasure = 2 * (precision * recall) / (precision + recall)
  else:
    fmeasure = 0.0
  
  return Score(precision=precision, recall=recall, fmeasure=fmeasure)

34 changes: 34 additions & 0 deletions src/google/adk/evaluation/text_utils.py
@@ -0,0 +1,34 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Text utilities for evaluation."""

from __future__ import annotations

import unicodedata


def normalize_text(text: str) -> str:
  """Normalize text using NFC normalization and strip whitespace.

  This ensures consistent text comparison across different Unicode
  representations, which is particularly important for non-English text.

  Args:
    text: The text to normalize.

  Returns:
    The normalized text.
  """
  return unicodedata.normalize("NFC", text).strip()
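
A short sketch of why NFC matters for this comparison (the example strings are illustrative): a decomposed and a precomposed form of the same accented character compare unequal until both are normalized.

import unicodedata

decomposed = "e\u0301"  # 'e' followed by COMBINING ACUTE ACCENT
precomposed = "\u00e9"  # 'é' as a single code point
print(decomposed == precomposed)  # False
print(unicodedata.normalize("NFC", decomposed) == precomposed)  # True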
40 changes: 40 additions & 0 deletions tests/unittests/evaluation/test_non_english_eval.py
@@ -0,0 +1,40 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for final_response_match_v1."""

from __future__ import annotations


def test_debug_normalization():
  """Debug test to see if normalization is being applied."""
  from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
  from google.adk.evaluation.text_utils import normalize_text

  reference = "สวัสดี"
  candidate = "สวัสดี"

  # Check normalization directly
  norm_ref = normalize_text(reference)
  norm_cand = normalize_text(candidate)

  print(f"Reference: {repr(reference)}")
  print(f"Candidate: {repr(candidate)}")
  print(f"Normalized reference: {repr(norm_ref)}")
  print(f"Normalized candidate: {repr(norm_cand)}")
  print(f"Are they equal after normalization? {norm_ref == norm_cand}")

  # Now test the actual function
  score = _calculate_rouge_1_scores(candidate, reference)
  print(f"ROUGE score: {score}")
Contributor (high):

This test file contains a debug-style test with print statements instead of assertions. To make the tests more robust and automated, it's better to use pytest features like parametrize to cover multiple scenarios and assert to verify the results. This also makes the test suite cleaner by not printing to standard output during runs.

Here is a suggested replacement for the current test function that uses pytest.mark.parametrize to cover several cases, including perfect matches, partial matches, and no matches for non-space-separated text.

from __future__ import annotations

from collections import namedtuple

import pytest

from google.adk.evaluation.final_response_match_v1 import (
    _calculate_rouge_1_scores,
)

Score = namedtuple("Score", ["precision", "recall", "fmeasure"])


@pytest.mark.parametrize(
    "candidate, reference, expected_score",
    [
        # Perfect match
        ("สวัสดี", "สวัสดี", Score(1.0, 1.0, 1.0)),
        # Partial match
        ("ab", "ac", Score(0.5, 0.5, 0.5)),
        # No match
        ("abc", "def", Score(0.0, 0.0, 0.0)),
        # Candidate is subset of reference
        ("a", "ab", Score(1.0, 0.5, 2 / 3)),
        # Empty candidate
        ("", "abc", Score(0.0, 0.0, 0.0)),
        # Empty reference
        ("abc", "", Score(0.0, 0.0, 0.0)),
        # Both empty
        ("", "", Score(0.0, 0.0, 0.0)),
    ],
)
def test_character_level_rouge(candidate, reference, expected_score):
  """Tests character-level ROUGE for various non-space-separated strings."""
  actual_score = _calculate_rouge_1_scores(candidate, reference)
  assert actual_score.precision == pytest.approx(expected_score.precision)
  assert actual_score.recall == pytest.approx(expected_score.recall)
  assert actual_score.fmeasure == pytest.approx(expected_score.fmeasure)