-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Modified _calculate_rouge_1_scores() to normalize texts before comparison and implemented _calculate_character_level_rouge() using Counter to calculate precision/recall/F-measure from character overlap #4131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |||||||||
| from .evaluator import EvaluationResult | ||||||||||
| from .evaluator import Evaluator | ||||||||||
| from .evaluator import PerInvocationResult | ||||||||||
| from .text_utils import normalize_text #importing normalize_text function for non-English text comparison | ||||||||||
|
|
||||||||||
|
|
||||||||||
| class RougeEvaluator(Evaluator): | ||||||||||
|
|
@@ -110,10 +111,55 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): | |||||||||
| Returns: | ||||||||||
| A dictionary containing the ROUGE-1 precision, recall, and f-measure. | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The docstring is incorrect. This function returns a `Score` object (a namedtuple with precision, recall, and fmeasure), not a dictionary — please update the docstring to match.
Suggested change
|
||||||||||
| """ | ||||||||||
| scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) | ||||||||||
|
|
||||||||||
| # The score method returns a dictionary where keys are the ROUGE types | ||||||||||
| # and values are Score objects (tuples) with precision, recall, and fmeasure. | ||||||||||
| scores = scorer.score(reference, candidate) | ||||||||||
|
|
||||||||||
| return scores["rouge1"] | ||||||||||
| # Normalize both texts before scoring to handle Unicode variations | ||||||||||
| normalized_candidate = normalize_text(candidate) | ||||||||||
| normalized_reference = normalize_text(reference) | ||||||||||
|
|
||||||||||
| # Check if the text contains spaces (word-separated languages) | ||||||||||
| has_spaces = ' ' in normalized_reference or ' ' in normalized_candidate | ||||||||||
|
|
||||||||||
| if has_spaces: | ||||||||||
| # Use standard word-level ROUGE for space-separated languages | ||||||||||
| scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) | ||||||||||
| scores = scorer.score(normalized_reference, normalized_candidate) | ||||||||||
| return scores["rouge1"] | ||||||||||
| else: | ||||||||||
| # For non-space-separated languages, use character-level comparison | ||||||||||
| return _calculate_character_level_rouge(normalized_candidate, normalized_reference) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def _calculate_character_level_rouge(candidate: str, reference: str): | ||||||||||
| """Calculates character-level ROUGE-1 score for non-space-separated text. | ||||||||||
|
|
||||||||||
| Args: | ||||||||||
| candidate: The candidate text (already normalized). | ||||||||||
| reference: The reference text (already normalized). | ||||||||||
|
|
||||||||||
| Returns: | ||||||||||
| A Score namedtuple with precision, recall, and fmeasure. | ||||||||||
| """ | ||||||||||
| from collections import Counter, namedtuple | ||||||||||
|
|
||||||||||
| if not reference or not candidate: | ||||||||||
| Score = namedtuple('Score', ['precision', 'recall', 'fmeasure']) | ||||||||||
| return Score(precision=0.0, recall=0.0, fmeasure=0.0) | ||||||||||
|
|
||||||||||
| # Count character occurrences | ||||||||||
| ref_chars = Counter(reference) | ||||||||||
| cand_chars = Counter(candidate) | ||||||||||
|
|
||||||||||
| # Calculate overlapping characters | ||||||||||
| overlap = sum((ref_chars & cand_chars).values()) | ||||||||||
|
|
||||||||||
| # Calculate precision and recall | ||||||||||
| precision = overlap / len(candidate) if len(candidate) > 0 else 0.0 | ||||||||||
| recall = overlap / len(reference) if len(reference) > 0 else 0.0 | ||||||||||
|
Comment on lines
+160
to
+161
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The checks `if len(candidate) > 0` and `if len(reference) > 0` are redundant: the early return above already guarantees that both strings are non-empty at this point, so the plain divisions are safe.
Suggested change
|
||||||||||
|
|
||||||||||
| # Calculate F-measure | ||||||||||
| if precision + recall > 0: | ||||||||||
| fmeasure = 2 * (precision * recall) / (precision + recall) | ||||||||||
| else: | ||||||||||
| fmeasure = 0.0 | ||||||||||
|
|
||||||||||
| Score = namedtuple('Score', ['precision', 'recall', 'fmeasure']) | ||||||||||
| return Score(precision=precision, recall=recall, fmeasure=fmeasure) | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function has a few areas for improvement regarding style and efficiency:
Here is a suggested refactoring that addresses these points. Please remember to move the import to the top of the file and define `Score` at the module level:

# At top of file:
from collections import Counter, namedtuple
# ...
# At module level, after imports:
Score = namedtuple('Score', ['precision', 'recall', 'fmeasure'])


def _calculate_character_level_rouge(candidate: str, reference: str):
"""Calculates character-level ROUGE-1 score for non-space-separated text.
Args:
candidate: The candidate text (already normalized).
reference: The reference text (already normalized).
Returns:
A Score namedtuple with precision, recall, and fmeasure.
"""
if not reference or not candidate:
return Score(precision=0.0, recall=0.0, fmeasure=0.0)
# Count character occurrences
ref_chars = Counter(reference)
cand_chars = Counter(candidate)
# Calculate overlapping characters
overlap = sum((ref_chars & cand_chars).values())
# Calculate precision and recall
precision = overlap / len(candidate)
recall = overlap / len(reference)
# Calculate F-measure
if precision + recall > 0:
fmeasure = 2 * (precision * recall) / (precision + recall)
else:
fmeasure = 0.0
return Score(precision=precision, recall=recall, fmeasure=fmeasure) |
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| # Copyright 2025 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Text utilities for evaluation.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import unicodedata | ||
|
|
||
|
|
||
| def normalize_text(text: str) -> str: | ||
| """Normalize text using NFC normalization and strip whitespace. | ||
|
|
||
| This ensures consistent text comparison across different Unicode | ||
| representations, which is particularly important for non-English text. | ||
|
|
||
| Args: | ||
| text: The text to normalize. | ||
|
|
||
| Returns: | ||
| The normalized text. | ||
| """ | ||
| return unicodedata.normalize("NFC", text).strip() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| # Copyright 2025 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Tests for final_response_match_v1.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
|
|
||
| def test_debug_normalization(): | ||
| """Debug test to see if normalization is being applied.""" | ||
| from google.adk.evaluation.final_response_match_v1 import _calculate_rouge_1_scores | ||
| from google.adk.evaluation.text_utils import normalize_text | ||
|
|
||
| reference = "สวัสดี" | ||
| candidate = "สวัสดี" | ||
|
|
||
| # Check normalization directly | ||
| norm_ref = normalize_text(reference) | ||
| norm_cand = normalize_text(candidate) | ||
|
|
||
| print(f"Reference: {repr(reference)}") | ||
| print(f"Candidate: {repr(candidate)}") | ||
| print(f"Normalized reference: {repr(norm_ref)}") | ||
| print(f"Normalized candidate: {repr(norm_cand)}") | ||
| print(f"Are they equal after normalization? {norm_ref == norm_cand}") | ||
|
|
||
| # Now test the actual function | ||
| score = _calculate_rouge_1_scores(candidate, reference) | ||
| print(f"ROUGE score: {score}") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test file contains a debug-style test with `print` statements and no assertions, so it will pass regardless of the function's behavior. Here is a suggested replacement for the current test function that uses `pytest.mark.parametrize` to cover multiple cases with real assertions:

from __future__ import annotations
from collections import namedtuple
import pytest
from google.adk.evaluation.final_response_match_v1 import (
_calculate_rouge_1_scores,
)
Score = namedtuple("Score", ["precision", "recall", "fmeasure"])
@pytest.mark.parametrize(
"candidate, reference, expected_score",
[
# Perfect match
("สวัสดี", "สวัสดี", Score(1.0, 1.0, 1.0)),
# Partial match
("ab", "ac", Score(0.5, 0.5, 0.5)),
# No match
("abc", "def", Score(0.0, 0.0, 0.0)),
# Candidate is subset of reference
("a", "ab", Score(1.0, 0.5, 2 / 3)),
# Empty candidate
("", "abc", Score(0.0, 0.0, 0.0)),
# Empty reference
("abc", "", Score(0.0, 0.0, 0.0)),
# Both empty
("", "", Score(0.0, 0.0, 0.0)),
],
)
def test_character_level_rouge(candidate, reference, expected_score):
"""Tests character-level ROUGE for various non-space-separated strings."""
actual_score = _calculate_rouge_1_scores(candidate, reference)
assert actual_score.precision == pytest.approx(expected_score.precision)
assert actual_score.recall == pytest.approx(expected_score.recall)
assert actual_score.fmeasure == pytest.approx(expected_score.fmeasure) |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As a matter of style, it's better to avoid inline comments on import statements, as per PEP 8. The purpose of the import is clear from the code that uses it. Please remove the comment.