Skip to content

Commit d3c411d

Browse files
committed
test(eval): Sharpen rouge score assertions per reviewer feedback
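Replace loose inequality checks (> 0, < 1.0, > 0.5) with exact pytest.approx values in the non-English ROUGE tests, and drop the rel=0.1 tolerance from the Thai polite-particle test. The Chinese overlap candidate is re-spaced ("很好" → "很 好") so that "很" is tokenized separately and matches the reference. The Thai overlap test corrects the gemini-code-assist suggestion: the actual F-measure is 8/17, not 0.4 as the bot calculated, because the \w+ tokenizer splits Thai words at combining marks.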
1 parent e31cdca commit d3c411d

1 file changed

Lines changed: 8 additions & 10 deletions

File tree

tests/unittests/evaluation/test_final_response_match_v1.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,9 @@ def test_thai_sentence_with_overlap(self):
246246
candidate = "สวัสดี คุณ สบายดี ไหม วันนี้"
247247
reference = "สวัสดี คุณ อากาศ เป็น อย่างไร"
248248
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
249-
# Should match "สวัสดี" and "คุณ" (2 out of 5 words each)
250-
assert rouge_1_score.fmeasure > 0
251-
assert rouge_1_score.fmeasure < 1.0
249+
# Tokenizer splits words at combining marks (e.g. "สวัสดี" → ["สว", "สด"]), yielding
250+
# 4 common tokens out of 8 candidate and 9 reference tokens → F = 8/17
251+
assert rouge_1_score.fmeasure == pytest.approx(8 / 17)
252252

253253
def test_thai_polite_particle_variation(self):
254254
"""Thai: Same meaning with polite particle should show high match."""
@@ -257,7 +257,7 @@ def test_thai_polite_particle_variation(self):
257257
reference = "สวัสดี ค่ะ"
258258
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
259259
# Should match "สวัสดี" (1 out of 2 words)
260-
assert rouge_1_score.fmeasure == pytest.approx(0.5, rel=0.1)
260+
assert rouge_1_score.fmeasure == pytest.approx(0.5)
261261

262262
# === Chinese Language Tests ===
263263

@@ -271,12 +271,11 @@ def test_chinese_greeting_identical(self):
271271
def test_chinese_sentence_with_overlap(self):
272272
"""Chinese: Sentences with common words should show partial match."""
273273
# Space-separated for tokenization
274-
candidate = "今天 天气 很好" # "Today's weather is good"
274+
candidate = "今天 天气 很 好" # "Today's weather is very good"
275275
reference = "今天 我 很 开心" # "Today I am happy"
276276
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
277277
# Should match "今天" and "很"
278-
assert rouge_1_score.fmeasure > 0
279-
assert rouge_1_score.fmeasure < 1.0
278+
assert rouge_1_score.fmeasure == pytest.approx(0.5)
280279

281280
def test_chinese_different_sentences(self):
282281
"""Chinese: Completely different sentences should have zero score."""
@@ -318,7 +317,7 @@ def test_japanese_sentence_with_overlap(self):
318317
reference = "今日 は 仕事 が 忙しい です" # "Today work is busy"
319318
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
320319
# Should match "今日", "は", "が", "です"
321-
assert rouge_1_score.fmeasure > 0.5
320+
assert rouge_1_score.fmeasure == pytest.approx(2 / 3)
322321

323322
# === Korean Language Tests ===
324323

@@ -335,8 +334,7 @@ def test_korean_sentence_with_overlap(self):
335334
reference = "오늘 기분이 좋습니다" # "Today my mood is good"
336335
rouge_1_score = _calculate_rouge_1_scores(candidate, reference)
337336
# Should match "오늘" and "좋습니다"
338-
assert rouge_1_score.fmeasure > 0
339-
assert rouge_1_score.fmeasure < 1.0
337+
assert rouge_1_score.fmeasure == pytest.approx(2 / 3)
340338

341339
# === European Languages (Latin script with accents) ===
342340

0 commit comments

Comments
 (0)