test(metrics): add ROUGE and readability tests

2026-02-03 17:03:34 +00:00
parent 14ac7dbbb9
commit 62fac688e4
2 changed files with 569 additions and 0 deletions
@@ -0,0 +1,274 @@
+"""Tests for the readability metric."""
+
+import pytest
+
+from veritext.metrics import Readability, ReadabilityResult
+
+
+class TestReadability:
+    """Tests for the Readability metric class."""
+
+    @pytest.fixture
+    def readability(self) -> Readability:
+        """Provide a readability metric instance."""
+        return Readability()
+
+    def test_name(self, readability: Readability) -> None:
+        """Test that name returns 'readability'."""
+        assert readability.name == "readability"
+
+    def test_requires_reference(self, readability: Readability) -> None:
+        """Test that readability does NOT require reference text."""
+        assert readability.requires_reference is False
+
+    def test_simple_text(self, readability: Readability) -> None:
+        """Test readability of simple, easy text."""
+        # Simple children's text - short sentences, simple words
+        text = "The cat sat. The dog ran. I see a bird."
+        result = readability.score(text)
+
+        # Should have low grade level and high reading ease
+        assert result.flesch_kincaid_grade < 5.0
+        assert result.flesch_reading_ease > 80.0
+
+    def test_complex_text(self, readability: Readability) -> None:
+        """Test readability of complex, academic text."""
+        # Complex academic text - long sentences, polysyllabic words
+        text = (
+            "The implementation of sophisticated computational methodologies "
+            "necessitates comprehensive understanding of algorithmic complexity "
+            "and architectural considerations."
+        )
+        result = readability.score(text)
+
+        # Should have high grade level and low reading ease
+        assert result.flesch_kincaid_grade > 12.0
+        assert result.flesch_reading_ease < 30.0
+
+    def test_medium_text(self, readability: Readability) -> None:
+        """Test readability of medium-difficulty text."""
+        text = (
+            "The weather today is quite pleasant. "
+            "Many people are enjoying the sunshine in the park. "
+            "Children play while parents watch nearby."
+        )
+        result = readability.score(text)
+
+        # Should be middle of the road
+        assert 3.0 < result.flesch_kincaid_grade < 10.0
+        assert 50.0 < result.flesch_reading_ease < 90.0
+
+    def test_single_sentence(self, readability: Readability) -> None:
+        """Test readability with a single sentence."""
+        text = "The cat sat on the mat."
+        result = readability.score(text)
+
+        # Should compute without error
+        assert result.flesch_kincaid_grade is not None
+        assert result.flesch_reading_ease is not None
+
+    def test_single_word(self, readability: Readability) -> None:
+        """Test readability with a single word."""
+        text = "Cat"
+        result = readability.score(text)
+
+        # Should handle single word (1 word, 1 sentence, 1 syllable)
+        assert result.flesch_kincaid_grade is not None
+        assert result.flesch_reading_ease is not None
+
+    def test_empty_text(self, readability: Readability) -> None:
+        """Test that empty text returns zero scores."""
+        result = readability.score("")
+
+        assert result.flesch_kincaid_grade == 0.0
+        assert result.flesch_reading_ease == 0.0
+
+    def test_whitespace_only(self, readability: Readability) -> None:
+        """Test that whitespace-only text returns zero scores."""
+        result = readability.score("   \t\n  ")
+
+        assert result.flesch_kincaid_grade == 0.0
+        assert result.flesch_reading_ease == 0.0
+
+    def test_reference_ignored(self, readability: Readability) -> None:
+        """Test that reference parameter is ignored."""
+        text = "The cat sat on the mat."
+
+        # Score with no reference
+        result1 = readability.score(text)
+        # Score with reference (should be ignored)
+        result2 = readability.score(text, "Completely different text")
+        # Score with list of references
+        result3 = readability.score(text, ["ref1", "ref2"])
+
+        # All should produce identical results
+        assert result1.flesch_kincaid_grade == result2.flesch_kincaid_grade
+        assert result1.flesch_reading_ease == result2.flesch_reading_ease
+        assert result1.flesch_kincaid_grade == result3.flesch_kincaid_grade
+
+    def test_punctuation_handling(self, readability: Readability) -> None:
+        """Test that punctuation affects sentence counting."""
+        # Same words, different sentence structure
+        text1 = "The cat sat on the mat"  # 1 sentence
+        text2 = "The cat sat. On the mat."  # 2 sentences
+
+        result1 = readability.score(text1)
+        result2 = readability.score(text2)
+
+        # Different sentence counts should affect scores
+        assert result1.flesch_kincaid_grade != result2.flesch_kincaid_grade
+
+    def test_question_marks_count_sentences(self, readability: Readability) -> None:
+        """Test that question marks end sentences."""
+        text = "What is this? It is a test."
+        result = readability.score(text)
+
+        # Should count as 2 sentences
+        # With 7 words total, words_per_sentence = 3.5
+        assert result.flesch_kincaid_grade is not None
+
+    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
+        """Test that exclamation marks end sentences."""
+        text = "Wow! That is amazing!"
+        result = readability.score(text)
+
+        # Should count as 2 sentences
+        assert result.flesch_kincaid_grade is not None
+
+    def test_multiple_punctuation(self, readability: Readability) -> None:
+        """Test handling of multiple punctuation marks."""
+        text = "What?! That's crazy... Well then."
+        result = readability.score(text)
+
+        # Should handle gracefully
+        assert result.flesch_kincaid_grade is not None
+
+    def test_result_score_property(self, readability: Readability) -> None:
+        """Test that result.score returns flesch_reading_ease."""
+        result = readability.score("The cat sat on the mat.")
+        assert result.score == result.flesch_reading_ease
+
+    def test_contractions(self, readability: Readability) -> None:
+        """Test handling of contractions."""
+        text = "I'm going to the store. It's not far away."
+        result = readability.score(text)
+
+        # Should handle contractions as words
+        assert result.flesch_kincaid_grade is not None
+        assert result.flesch_reading_ease is not None
+
+
+class TestReadabilityBatch:
+    """Tests for readability batch scoring."""
+
+    @pytest.fixture
+    def readability(self) -> Readability:
+        """Provide a readability metric instance."""
+        return Readability()
+
+    def test_batch_score_basic(self, readability: Readability) -> None:
+        """Test basic batch scoring."""
+        candidates = [
+            "The cat sat on the mat.",
+            "A dog ran through the park.",
+        ]
+        result = readability.batch_score(candidates)
+
+        assert result.count == 2
+        assert len(result.results) == 2
+
+    def test_batch_score_statistics(self, readability: Readability) -> None:
+        """Test that batch scoring computes statistics."""
+        candidates = [
+            "Cat sat.",  # Very simple
+            "The implementation of sophisticated methodologies requires expertise.",
+        ]
+        result = readability.batch_score(candidates)
+
+        # Check statistics are computed
+        assert "flesch_kincaid_grade" in result.stats
+        assert "flesch_reading_ease" in result.stats
+
+        # First should be easier than second
+        assert (
+            result.results[0].flesch_reading_ease
+            > result.results[1].flesch_reading_ease
+        )
+
+    def test_batch_score_percentiles(self, readability: Readability) -> None:
+        """Test that batch scoring computes percentiles."""
+        candidates = ["a", "b", "c", "d", "e"]
+        result = readability.batch_score(candidates)
+
+        stats = result.stats["flesch_reading_ease"]
+        assert 25 in stats.percentiles
+        assert 50 in stats.percentiles
+        assert 75 in stats.percentiles
+        assert 95 in stats.percentiles
+
+    def test_batch_score_references_ignored(self, readability: Readability) -> None:
+        """Test that batch scoring ignores references."""
+        candidates = ["The cat sat.", "A dog ran."]
+
+        result1 = readability.batch_score(candidates)
+        result2 = readability.batch_score(candidates, ["ref1", "ref2"])
+
+        # Results should be identical
+        assert result1.results[0].flesch_kincaid_grade == (
+            result2.results[0].flesch_kincaid_grade
+        )
+
+    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
+        """Test that empty candidate list raises ValueError."""
+        with pytest.raises(ValueError, match="empty"):
+            readability.batch_score([])
+
+
+class TestReadabilityResult:
+    """Tests for ReadabilityResult type."""
+
+    def test_frozen(self) -> None:
+        """Test that ReadabilityResult is frozen."""
+        from pydantic import ValidationError
+
+        result = ReadabilityResult(flesch_kincaid_grade=5.0, flesch_reading_ease=70.0)
+        with pytest.raises(ValidationError):
+            result.flesch_kincaid_grade = 6.0  # type: ignore[misc]
+
+    def test_values(self) -> None:
+        """Test that values are stored correctly."""
+        result = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
+        assert result.flesch_kincaid_grade == 8.5
+        assert result.flesch_reading_ease == 65.0
+
+    def test_score_property(self) -> None:
+        """Test that score property returns flesch_reading_ease."""
+        result = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
+        assert result.score == 65.0
+
+
+class TestSyllableCounting:
+    """Tests for syllable counting heuristics."""
+
+    @pytest.fixture
+    def readability(self) -> Readability:
+        """Provide a readability metric instance."""
+        return Readability()
+
+    def test_monosyllabic_words(self, readability: Readability) -> None:
+        """Test that monosyllabic words don't inflate scores."""
+        # All one-syllable words
+        text = "The cat sat on the mat."
+        result = readability.score(text)
+
+        # Should be very easy to read
+        assert result.flesch_reading_ease > 90.0
+
+    def test_polysyllabic_words(self, readability: Readability) -> None:
+        """Test that polysyllabic words affect scores."""
+        # Words with multiple syllables
+        text = "International communication facilitates understanding."
+        result = readability.score(text)
+
+        # Should be harder to read
+        assert result.flesch_reading_ease < 50.0
@@ -0,0 +1,295 @@
+"""Tests for the ROUGE metric."""
+
+import pytest
+
+from veritext.metrics import Rouge, RougeResult, RougeScore
+
+
+class TestRouge:
+    """Tests for the Rouge metric class."""
+
+    @pytest.fixture
+    def rouge(self) -> Rouge:
+        """Provide a ROUGE metric instance."""
+        return Rouge()
+
+    def test_name(self, rouge: Rouge) -> None:
+        """Test that name returns 'rouge'."""
+        assert rouge.name == "rouge"
+
+    def test_requires_reference(self, rouge: Rouge) -> None:
+        """Test that ROUGE requires reference text."""
+        assert rouge.requires_reference is True
+
+    def test_identical_texts(self, rouge: Rouge) -> None:
+        """Test that identical texts produce perfect scores."""
+        text = "The cat sat on the mat"
+        result = rouge.score(text, text)
+
+        assert result.rouge1.precision == 1.0
+        assert result.rouge1.recall == 1.0
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge2.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_no_overlap(self, rouge: Rouge) -> None:
+        """Test that texts with no overlap produce zero scores."""
+        candidate = "apple banana cherry"
+        reference = "dog elephant fox"
+        result = rouge.score(candidate, reference)
+
+        assert result.rouge1.precision == 0.0
+        assert result.rouge1.recall == 0.0
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
+        """Test ROUGE-1 with partial overlap."""
+        candidate = "the cat sat"
+        reference = "the dog sat"
+        result = rouge.score(candidate, reference)
+
+        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
+        # Overlap: {the, sat} = 2
+        # Precision = 2/3, Recall = 2/3
+        assert abs(result.rouge1.precision - 2 / 3) < 1e-10
+        assert abs(result.rouge1.recall - 2 / 3) < 1e-10
+
+    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
+        """Test ROUGE-2 (bigram) with partial overlap."""
+        candidate = "the cat sat on the mat"
+        reference = "the cat lay on the mat"
+        result = rouge.score(candidate, reference)
+
+        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
+        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
+        # Overlap: (the, cat), (on, the), (the, mat) = 3
+        # Precision = 3/5, Recall = 3/5
+        assert abs(result.rouge2.precision - 3 / 5) < 1e-10
+        assert abs(result.rouge2.recall - 3 / 5) < 1e-10
+
+    def test_rouge_l_basic(self, rouge: Rouge) -> None:
+        """Test ROUGE-L (LCS) computation."""
+        candidate = "the cat sat on the mat"
+        reference = "the cat sat"
+        result = rouge.score(candidate, reference)
+
+        # LCS = "the cat sat" = 3 tokens
+        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
+        assert result.rouge_l.precision == 0.5
+        assert result.rouge_l.recall == 1.0
+
+    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
+        """Test ROUGE-L with non-contiguous LCS."""
+        candidate = "the big cat sat"
+        reference = "the cat sat"
+        result = rouge.score(candidate, reference)
+
+        # LCS = "the cat sat" = 3 (skipping "big")
+        # Precision = 3/4, Recall = 3/3 = 1.0
+        assert result.rouge_l.precision == 0.75
+        assert result.rouge_l.recall == 1.0
+
+    def test_precision_vs_recall(self, rouge: Rouge) -> None:
+        """Test that precision and recall differ appropriately."""
+        # Short candidate, long reference
+        candidate = "the cat"
+        reference = "the cat sat on the mat"
+        result = rouge.score(candidate, reference)
+
+        # Precision should be high (all candidate tokens in reference)
+        assert result.rouge1.precision == 1.0
+        # Recall should be lower (not all reference tokens in candidate)
+        assert result.rouge1.recall < 1.0
+
+    def test_empty_candidate(self, rouge: Rouge) -> None:
+        """Test that empty candidate returns zero scores."""
+        result = rouge.score("", "The cat sat")
+
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
+        """Test that whitespace-only candidate returns zero scores."""
+        result = rouge.score("   \t\n  ", "The cat sat")
+
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_empty_reference_raises(self, rouge: Rouge) -> None:
+        """Test that empty reference raises ValueError."""
+        with pytest.raises(ValueError, match="cannot be empty"):
+            rouge.score("The cat sat", "")
+
+    def test_none_reference_raises(self, rouge: Rouge) -> None:
+        """Test that None reference raises ValueError."""
+        with pytest.raises(ValueError, match="requires reference"):
+            rouge.score("The cat sat", None)
+
+    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
+        """Test that multiple references use max scores."""
+        candidate = "the cat sat on the mat"
+        references = [
+            "a dog ran across the room",  # Low overlap
+            "the cat sat on the mat",  # Exact match
+        ]
+        result = rouge.score(candidate, references)
+
+        # Should get perfect scores due to exact match
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_multiple_references_partial(self, rouge: Rouge) -> None:
+        """Test multiple references with partial matches."""
+        candidate = "the quick brown fox"
+        references = [
+            "the fast brown fox",  # 3/4 match
+            "a quick brown dog",  # 3/4 match different tokens
+        ]
+        result = rouge.score(candidate, references)
+
+        # Should pick best from either reference
+        assert result.rouge1.fmeasure > 0.0
+
+    def test_result_score_property(self, rouge: Rouge) -> None:
+        """Test that result.score returns rouge_l.fmeasure."""
+        result = rouge.score("The cat sat", "The cat sat")
+        assert result.score == result.rouge_l.fmeasure
+
+    def test_case_insensitivity(self, rouge: Rouge) -> None:
+        """Test that ROUGE is case insensitive by default."""
+        result = rouge.score("THE CAT SAT", "the cat sat")
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_punctuation_ignored(self, rouge: Rouge) -> None:
+        """Test that punctuation is ignored by default."""
+        result = rouge.score("The cat sat.", "The cat sat!")
+        assert result.rouge1.fmeasure == 1.0
+
+    def test_single_word(self, rouge: Rouge) -> None:
+        """Test ROUGE with single word texts."""
+        result = rouge.score("cat", "cat")
+
+        assert result.rouge1.fmeasure == 1.0
+        # ROUGE-2 should be 0 for single words (no bigrams)
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
+        """Test that F-measure is calculated correctly."""
+        # Create a case where P != R
+        candidate = "the cat sat on"
+        reference = "the cat"
+        result = rouge.score(candidate, reference)
+
+        # P = 2/4 = 0.5, R = 2/2 = 1.0
+        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
+        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
+        assert abs(result.rouge1.fmeasure - expected_f) < 1e-10
+
+
+class TestRougeBatch:
+    """Tests for ROUGE batch scoring."""
+
+    @pytest.fixture
+    def rouge(self) -> Rouge:
+        """Provide a ROUGE metric instance."""
+        return Rouge()
+
+    def test_batch_score_basic(self, rouge: Rouge) -> None:
+        """Test basic batch scoring."""
+        candidates = ["The cat sat", "A dog runs"]
+        references = ["The cat sat", "A dog runs"]
+        result = rouge.batch_score(candidates, references)
+
+        assert result.count == 2
+        assert len(result.results) == 2
+        assert all(r.rouge_l.fmeasure == 1.0 for r in result.results)
+
+    def test_batch_score_statistics(self, rouge: Rouge) -> None:
+        """Test that batch scoring computes statistics."""
+        candidates = ["The cat sat", "Completely different words"]
+        references = ["The cat sat", "The cat sat"]
+        result = rouge.batch_score(candidates, references)
+
+        # Check statistics are computed
+        assert "rouge1_fmeasure" in result.stats
+        assert "rouge2_fmeasure" in result.stats
+        assert "rouge_l_fmeasure" in result.stats
+        assert "rouge1_precision" in result.stats
+        assert "rouge1_recall" in result.stats
+
+        # First result should be 1.0, second should be 0.0
+        assert result.results[0].rouge1.fmeasure == 1.0
+        assert result.results[1].rouge1.fmeasure == 0.0
+
+    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
+        """Test that batch scoring computes percentiles."""
+        candidates = ["a", "b", "c", "d", "e"]
+        references = ["a", "b", "c", "d", "e"]
+        result = rouge.batch_score(candidates, references)
+
+        stats = result.stats["rouge1_fmeasure"]
+        assert 25 in stats.percentiles
+        assert 50 in stats.percentiles
+        assert 75 in stats.percentiles
+        assert 95 in stats.percentiles
+
+    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
+        """Test that batch scoring raises for None references."""
+        with pytest.raises(ValueError, match="requires reference"):
+            rouge.batch_score(["text"], None)
+
+    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
+        """Test that batch scoring raises for mismatched lengths."""
+        with pytest.raises(ValueError, match="must match"):
+            rouge.batch_score(["a", "b"], ["a"])
+
+    def test_batch_score_with_multiple_references(self, rouge: Rouge) -> None:
+        """Test batch scoring with multiple references per candidate."""
+        candidates = [
+            "The cat sat on the mat",
+            "A quick brown fox",
+        ]
+        references = [
+            ["The cat sat on the mat", "A cat rests on floor"],
+            ["A quick brown fox", "The fast brown fox"],
+        ]
+        result = rouge.batch_score(candidates, references)
+
+        assert result.count == 2
+        # Both should get perfect scores due to exact matches
+        assert result.results[0].rouge_l.fmeasure == 1.0
+        assert result.results[1].rouge_l.fmeasure == 1.0
+
+
+class TestRougeResult:
+    """Tests for RougeResult and RougeScore types."""
+
+    def test_rouge_score_frozen(self) -> None:
+        """Test that RougeScore is frozen."""
+        from pydantic import ValidationError
+
+        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
+        with pytest.raises(ValidationError):
+            score.precision = 0.7  # type: ignore[misc]
+
+    def test_rouge_result_frozen(self) -> None:
+        """Test that RougeResult is frozen."""
+        from pydantic import ValidationError
+
+        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
+        result = RougeResult(rouge1=score, rouge2=score, rouge_l=score)
+        with pytest.raises(ValidationError):
+            result.rouge1 = score  # type: ignore[misc]
+
+    def test_score_property(self) -> None:
+        """Test that score property returns rouge_l.fmeasure."""
+        r1 = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
+        r2 = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
+        rl = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
+        result = RougeResult(rouge1=r1, rouge2=r2, rouge_l=rl)
+        assert result.score == 0.7