60 changes: 53 additions & 7 deletions src/google/adk/evaluation/final_response_match_v1.py
@@ -27,6 +27,7 @@
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult
from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
Contributor (medium):

As a matter of style, it's better to avoid inline comments on import statements, as per PEP 8. The purpose of the import is clear from the code that uses it. Please remove the comment.

Suggested change
from .text_utils import normalize_text #importing normalize_text function for non-English text comparison
from .text_utils import normalize_text



class RougeEvaluator(Evaluator):
@@ -110,10 +111,55 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
Returns:
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
Contributor (medium):

The docstring is incorrect. This function returns a Score namedtuple, not a dictionary. Please update the docstring to reflect the actual return type. This will improve clarity for future developers.

Suggested change
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
A Score namedtuple containing the ROUGE-1 precision, recall, and f-measure.
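
To make the return type concrete, here is a minimal interactive sketch against the rouge-score package used by this module (assuming it is installed; the example strings are illustrative, not from the PR):

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
result = scorer.score("the cat sat on the mat", "the cat sat")["rouge1"]
print(type(result).__name__)  # Score, a namedtuple, not a dict
print(result.precision, result.recall, result.fmeasure)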

"""
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)

# The score method returns a dictionary where keys are the ROUGE types
# and values are Score objects (tuples) with precision, recall, and fmeasure.
scores = scorer.score(reference, candidate)

return scores["rouge1"]
# Normalize both texts before scoring to handle Unicode variations
normalized_candidate = normalize_text(candidate)
normalized_reference = normalize_text(reference)

# Check if the text contains spaces (word-separated languages)
has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate

if has_spaces:
# Use standard word-level ROUGE for space-separated languages
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
scores = scorer.score(normalized_reference, normalized_candidate)
return scores["rouge1"]
else:
# For non-space-separated languages, use character-level comparison
return _calculate_character_level_rouge(normalized_candidate, normalized_reference)


def _calculate_character_level_rouge(candidate: str, reference: str):
  """Calculates character-level ROUGE-1 score for non-space-separated text.

  Args:
    candidate: The candidate text (already normalized).
    reference: The reference text (already normalized).

  Returns:
    A Score namedtuple with precision, recall, and fmeasure.
  """
  from collections import Counter, namedtuple

  if not reference or not candidate:
    Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
    return Score(precision=0.0, recall=0.0, fmeasure=0.0)

  # Count character occurrences
  ref_chars = Counter(reference)
  cand_chars = Counter(candidate)

  # Calculate overlapping characters
  overlap = sum((ref_chars & cand_chars).values())

  # Calculate precision and recall
  precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
  recall = overlap / len(reference) if len(reference) > 0 else 0.0
Comment on lines +160 to +161
Contributor (medium):

The checks if len(candidate) > 0 and if len(reference) > 0 are redundant. The guard clause on line 149 (if not reference or not candidate:) ensures that if this part of the code is reached, both candidate and reference are non-empty strings, so their lengths will be greater than 0. You can simplify the code by removing these checks.

Suggested change
precision = overlap / len(candidate) if len(candidate) > 0 else 0.0
recall = overlap / len(reference) if len(reference) > 0 else 0.0
precision = overlap / len(candidate)
recall = overlap / len(reference)


  # Calculate F-measure
  if precision + recall > 0:
    fmeasure = 2 * (precision * recall) / (precision + recall)
  else:
    fmeasure = 0.0

  Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
  return Score(precision=precision, recall=recall, fmeasure=fmeasure)
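
As a quick worked example of the character-level path above (the Thai strings are illustrative, not taken from the PR), the Counter intersection counts shared characters, and precision, recall, and F-measure follow directly:

from collections import Counter

candidate, reference = "สวัสดี", "สวัสดีครับ"
overlap = sum((Counter(reference) & Counter(candidate)).values())  # 6 shared characters
precision = overlap / len(candidate)  # 6 / 6 = 1.0
recall = overlap / len(reference)  # 6 / 10 = 0.6
fmeasure = 2 * (precision * recall) / (precision + recall)  # 0.75
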
Contributor (high):

This function has a few areas for improvement regarding style and efficiency:

  • Imports: The import from collections import Counter, namedtuple should be at the top of the file, per PEP 8.
  • namedtuple definition: The Score namedtuple is defined twice inside this function. This is inefficient as it's redefined on every call. It should be defined once at the module level.
  • Redundant checks: The checks for non-zero length before division are redundant, as the if not reference or not candidate: guard at the beginning already handles this.

Here is a suggested refactoring that addresses these points. Please remember to move the import to the top of the file and define Score at the module level.

# At top of file:
from collections import Counter, namedtuple
# ...

# At module level, after imports:
Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])
def _calculate_character_level_rouge(candidate: str, reference: str):
  """Calculates character-level ROUGE-1 score for non-space-separated text.
  
  Args:
    candidate: The candidate text (already normalized).
    reference: The reference text (already normalized).
  
  Returns:
    A Score namedtuple with precision, recall, and fmeasure.
  """
  if not reference or not candidate:
    return Score(precision=0.0, recall=0.0, fmeasure=0.0)
  
  # Count character occurrences
  ref_chars = Counter(reference)
  cand_chars = Counter(candidate)
  
  # Calculate overlapping characters
  overlap = sum((ref_chars & cand_chars).values())
  
  # Calculate precision and recall
  precision = overlap / len(candidate)
  recall = overlap / len(reference)
  
  # Calculate F-measure
  if precision + recall > 0:
    fmeasure = 2 * (precision * recall) / (precision + recall)
  else:
    fmeasure = 0.0
  
  return Score(precision=precision, recall=recall, fmeasure=fmeasure)

34 changes: 34 additions & 0 deletions src/google/adk/evaluation/text_utils.py
@@ -0,0 +1,34 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Text utilities for evaluation."""

from __future__ import annotations

import unicodedata


def normalize_text(text: str) -> str:
  """Normalize text using NFC normalization and strip whitespace.

  This ensures consistent text comparison across different Unicode
  representations, which is particularly important for non-English text.

  Args:
    text: The text to normalize.

  Returns:
    The normalized text.
  """
  return unicodedata.normalize("NFC", text).strip()
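
A short sketch of why NFC matters for this comparison (the example strings are illustrative): a decomposed and a precomposed form of the same accented character compare unequal until both are normalized.

import unicodedata

decomposed = "e\u0301"  # 'e' followed by COMBINING ACUTE ACCENT
precomposed = "\u00e9"  # 'é' as a single code point
print(decomposed == precomposed)  # False
print(unicodedata.normalize("NFC", decomposed) == precomposed)  # True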
40 changes: 40 additions & 0 deletions tests/unittests/evaluation/test_non_english_eval.py
@@ -0,0 +1,40 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for final_response_match_v1."""

from __future__ import annotations


def test_debug_normalization():
  """Debug test to see if normalization is being applied."""
  from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores
  from google.adk.evaluation.text_utils import normalize_text

  reference = "สวัสดี"
  candidate = "สวัสดี"

  # Check normalization directly
  norm_ref = normalize_text(reference)
  norm_cand = normalize_text(candidate)

  print(f"Reference: {repr(reference)}")
  print(f"Candidate: {repr(candidate)}")
  print(f"Normalized reference: {repr(norm_ref)}")
  print(f"Normalized candidate: {repr(norm_cand)}")
  print(f"Are they equal after normalization? {norm_ref == norm_cand}")

  # Now test the actual function
  score = _calculate_rouge_1_scores(candidate, reference)
  print(f"ROUGE score: {score}")
Contributor (high):

This test file contains a debug-style test with print statements instead of assertions. To make the tests more robust and automated, it's better to use pytest features like parametrize to cover multiple scenarios and assert to verify the results. This also makes the test suite cleaner by not printing to standard output during runs.

Here is a suggested replacement for the current test function that uses pytest.mark.parametrize to cover several cases, including perfect matches, partial matches, and no matches for non-space-separated text.

from __future__ import annotations

from collections import namedtuple

import pytest

from google.adk.evaluation.final_response_match_v1 import (
    _calculate_rouge_1_scores,
)

Score = namedtuple("Score", ["precision", "recall", "fmeasure"])


@pytest.mark.parametrize(
    "candidate, reference, expected_score",
    [
        # Perfect match
        ("สวัสดี", "สวัสดี", Score(1.0, 1.0, 1.0)),
        # Partial match
        ("ab", "ac", Score(0.5, 0.5, 0.5)),
        # No match
        ("abc", "def", Score(0.0, 0.0, 0.0)),
        # Candidate is subset of reference
        ("a", "ab", Score(1.0, 0.5, 2 / 3)),
        # Empty candidate
        ("", "abc", Score(0.0, 0.0, 0.0)),
        # Empty reference
        ("abc", "", Score(0.0, 0.0, 0.0)),
        # Both empty
        ("", "", Score(0.0, 0.0, 0.0)),
    ],
)
def test_character_level_rouge(candidate, reference, expected_score):
  """Tests character-level ROUGE for various non-space-separated strings."""
  actual_score = _calculate_rouge_1_scores(candidate, reference)
  assert actual_score.precision == pytest.approx(expected_score.precision)
  assert actual_score.recall == pytest.approx(expected_score.recall)
  assert actual_score.fmeasure == pytest.approx(expected_score.fmeasure)