"""Tests for the ROUGE metric."""

import pytest

from veritext.metrics import Rouge, RougeResult, RougeScore


class TestRouge:
    """Single-pair scoring behaviour of ``Rouge.score``."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Provide a default-constructed metric for each test."""
        return Rouge()

    def test_name(self, rouge: Rouge) -> None:
        """The metric identifies itself as ``"rouge"``."""
        assert rouge.name == "rouge"

    def test_requires_reference(self, rouge: Rouge) -> None:
        """ROUGE is a reference-based metric."""
        assert rouge.requires_reference is True

    def test_identical_texts(self, rouge: Rouge) -> None:
        """Identical candidate and reference score 1.0 on every variant."""
        text = "The cat sat on the mat"
        result = rouge.score(text, text)

        assert result.rouge1.precision == 1.0
        assert result.rouge1.recall == 1.0
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge2.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_no_overlap(self, rouge: Rouge) -> None:
        """Texts sharing no tokens score 0.0 on every variant."""
        candidate = "apple banana cherry"
        reference = "dog elephant fox"
        result = rouge.score(candidate, reference)

        assert result.rouge1.precision == 0.0
        assert result.rouge1.recall == 0.0
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
        """ROUGE-1 counts shared unigrams."""
        candidate = "the cat sat"
        reference = "the dog sat"
        result = rouge.score(candidate, reference)

        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
        # Overlap: {the, sat} = 2
        # Precision = 2/3, Recall = 2/3
        assert result.rouge1.precision == pytest.approx(2 / 3, abs=1e-10)
        assert result.rouge1.recall == pytest.approx(2 / 3, abs=1e-10)

    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
        """ROUGE-2 counts shared bigrams."""
        candidate = "the cat sat on the mat"
        reference = "the cat lay on the mat"
        result = rouge.score(candidate, reference)

        # Bigrams in candidate:
        #   (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
        # Bigrams in reference:
        #   (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
        # Overlap: (the, cat), (on, the), (the, mat) = 3
        # Precision = 3/5, Recall = 3/5
        assert result.rouge2.precision == pytest.approx(3 / 5, abs=1e-10)
        assert result.rouge2.recall == pytest.approx(3 / 5, abs=1e-10)

    def test_rouge_l_basic(self, rouge: Rouge) -> None:
        """ROUGE-L is driven by the longest common subsequence length."""
        candidate = "the cat sat on the mat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)

        # LCS = "the cat sat" = 3 tokens
        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.5
        assert result.rouge_l.recall == 1.0

    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
        """The common subsequence may skip tokens in the candidate."""
        candidate = "the big cat sat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)

        # LCS = "the cat sat" = 3 (skipping "big")
        # Precision = 3/4, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.75
        assert result.rouge_l.recall == 1.0

    def test_precision_vs_recall(self, rouge: Rouge) -> None:
        """A short candidate inside a long reference: high P, lower R."""
        # Short candidate, long reference
        candidate = "the cat"
        reference = "the cat sat on the mat"
        result = rouge.score(candidate, reference)

        # Precision should be high (all candidate tokens in reference)
        assert result.rouge1.precision == 1.0
        # Recall should be lower (not all reference tokens in candidate)
        assert result.rouge1.recall < 1.0

    def test_empty_candidate(self, rouge: Rouge) -> None:
        """An empty candidate scores 0.0 rather than raising."""
        result = rouge.score("", "The cat sat")

        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
        """A candidate made only of whitespace scores 0.0."""
        result = rouge.score(" \t\n ", "The cat sat")

        assert result.rouge1.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_empty_reference_raises(self, rouge: Rouge) -> None:
        """An empty reference string is rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="cannot be empty"):
            rouge.score("The cat sat", "")

    def test_none_reference_raises(self, rouge: Rouge) -> None:
        """A missing reference is rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.score("The cat sat", None)

    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
        """With several references, an exact match yields perfect scores."""
        candidate = "the cat sat on the mat"
        references = [
            "a dog ran across the room",  # Low overlap
            "the cat sat on the mat",  # Exact match
        ]
        result = rouge.score(candidate, references)

        # Should get perfect scores due to exact match
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_multiple_references_partial(self, rouge: Rouge) -> None:
        """With only partial matches, the best reference determines the score."""
        candidate = "the quick brown fox"
        references = [
            "the fast brown fox",  # shares the, brown, fox -> P = R = 3/4
            "a quick brown dog",  # shares quick, brown -> P = R = 2/4
        ]
        result = rouge.score(candidate, references)

        # The first reference is the better match on every variant, so the
        # reported ROUGE-1 F-measure is its value: 2 * 0.75 * 0.75 / 1.5.
        assert result.rouge1.fmeasure == pytest.approx(0.75)

    def test_result_score_property(self, rouge: Rouge) -> None:
        """``result.score`` mirrors the ROUGE-L F-measure."""
        result = rouge.score("The cat sat", "The cat sat")

        assert result.score == result.rouge_l.fmeasure

    def test_case_insensitivity(self, rouge: Rouge) -> None:
        """Letter case does not affect matching."""
        result = rouge.score("THE CAT SAT", "the cat sat")

        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_punctuation_ignored(self, rouge: Rouge) -> None:
        """Trailing punctuation does not affect unigram matching."""
        result = rouge.score("The cat sat.", "The cat sat!")

        assert result.rouge1.fmeasure == 1.0

    def test_single_word(self, rouge: Rouge) -> None:
        """A one-token text has no bigrams, so ROUGE-2 is 0.0."""
        result = rouge.score("cat", "cat")

        assert result.rouge1.fmeasure == 1.0
        # ROUGE-2 should be 0 for single words (no bigrams)
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 1.0

    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
        """F-measure is the harmonic mean of precision and recall."""
        # Create a case where P != R
        candidate = "the cat sat on"
        reference = "the cat"
        result = rouge.score(candidate, reference)

        # P = 2/4 = 0.5, R = 2/2 = 1.0
        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
        assert result.rouge1.fmeasure == pytest.approx(expected_f, abs=1e-10)


class TestRougeBatch:
    """Batch scoring behaviour of ``Rouge.batch_score``."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Provide a default-constructed metric for each test."""
        return Rouge()

    def test_batch_score_basic(self, rouge: Rouge) -> None:
        """One result is produced per candidate/reference pair."""
        candidates = ["The cat sat", "A dog runs"]
        references = ["The cat sat", "A dog runs"]
        result = rouge.batch_score(candidates, references)

        assert result.count == 2
        assert len(result.results) == 2
        assert all(r.rouge_l.fmeasure == 1.0 for r in result.results)

    def test_batch_score_statistics(self, rouge: Rouge) -> None:
        """Aggregate statistics are exposed under per-variant keys."""
        candidates = ["The cat sat", "Completely different words"]
        references = ["The cat sat", "The cat sat"]
        result = rouge.batch_score(candidates, references)

        # Check statistics are computed
        assert "rouge1_fmeasure" in result.stats
        assert "rouge2_fmeasure" in result.stats
        assert "rouge_l_fmeasure" in result.stats
        assert "rouge1_precision" in result.stats
        assert "rouge1_recall" in result.stats

        # First result should be 1.0, second should be 0.0
        assert result.results[0].rouge1.fmeasure == 1.0
        assert result.results[1].rouge1.fmeasure == 0.0

    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
        """Statistics carry the 25th, 50th, 75th and 95th percentiles."""
        candidates = ["a", "b", "c", "d", "e"]
        references = ["a", "b", "c", "d", "e"]
        result = rouge.batch_score(candidates, references)

        stats = result.stats["rouge1_fmeasure"]
        assert 25 in stats.percentiles
        assert 50 in stats.percentiles
        assert 75 in stats.percentiles
        assert 95 in stats.percentiles

    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
        """Missing references are rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
        """Candidate and reference lists must have the same length."""
        with pytest.raises(ValueError, match="must match"):
            rouge.batch_score(["a", "b"], ["a"])

    def test_batch_score_multi_refs(self, rouge: Rouge) -> None:
        """Each candidate may be paired with a list of references."""
        candidates = [
            "The cat sat on the mat",
            "A quick brown fox",
        ]
        references = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown fox", "The fast brown fox"],
        ]
        result = rouge.batch_score(candidates, references)

        assert result.count == 2
        # Both should get perfect scores due to exact matches
        assert result.results[0].rouge_l.fmeasure == 1.0
        assert result.results[1].rouge_l.fmeasure == 1.0


class TestRougeResult:
    """Behaviour of the ``RougeScore`` and ``RougeResult`` models."""

    def test_rouge_score_frozen(self) -> None:
        """Assigning to a ``RougeScore`` field raises ``ValidationError``."""
        from pydantic import ValidationError

        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        with pytest.raises(ValidationError):
            score.precision = 0.7  # type: ignore[misc]

    def test_rouge_result_frozen(self) -> None:
        """Assigning to a ``RougeResult`` field raises ``ValidationError``."""
        from pydantic import ValidationError

        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        result = RougeResult(rouge1=score, rouge2=score, rouge_l=score)
        with pytest.raises(ValidationError):
            result.rouge1 = score  # type: ignore[misc]

    def test_score_property(self) -> None:
        """``score`` returns the ROUGE-L F-measure, not ROUGE-1 or ROUGE-2."""
        r1 = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
        r2 = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
        rl = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
        result = RougeResult(rouge1=r1, rouge2=r2, rouge_l=rl)

        assert result.score == 0.7