From ec48eb5bf59ae7d470015728d6dc2680489aadf9 Mon Sep 17 00:00:00 2001
From: Kai Chappell
Date: Thu, 20 Mar 2025 20:32:41 +0000
Subject: [PATCH] tests for ROUGE and readability

---
 tests/test_metrics/test_readability.py | 237 +++++++++++++++++++++++
 tests/test_metrics/test_rouge.py       | 258 +++++++++++++++++++++++++
 2 files changed, 495 insertions(+)
 create mode 100644 tests/test_metrics/test_readability.py
 create mode 100644 tests/test_metrics/test_rouge.py

diff --git a/tests/test_metrics/test_readability.py b/tests/test_metrics/test_readability.py
new file mode 100644
index 0000000..7da6c1e
--- /dev/null
+++ b/tests/test_metrics/test_readability.py
@@ -0,0 +1,237 @@
+"""Tests for the readability metric."""
+
+import pytest
+
+from veritext.metrics import Readability, ReadabilityResult
+
+
+class TestReadability:
+    @pytest.fixture
+    def readability(self) -> Readability:
+        return Readability()
+
+    def test_name(self, readability: Readability) -> None:
+        assert readability.name == "readability"
+
+    def test_requires_reference(self, readability: Readability) -> None:
+        assert readability.requires_reference is False
+
+    def test_simple_text(self, readability: Readability) -> None:
+        # Simple children's text - short sentences, simple words
+        text = "The cat sat. The dog ran. I see a bird."
+        result = readability.score(text)
+
+        # Should have low grade level and high reading ease
+        assert result.flesch_kincaid_grade < 5.0
+        assert result.flesch_reading_ease > 80.0
+
+    def test_complex_text(self, readability: Readability) -> None:
+        # Complex academic text - long sentences, polysyllabic words
+        text = (
+            "The implementation of sophisticated computational methodologies "
+            "necessitates thorough understanding of algorithmic complexity "
+            "and architectural considerations."
+        )
+        result = readability.score(text)
+
+        # Should have high grade level and low reading ease
+        assert result.flesch_kincaid_grade > 12.0
+        assert result.flesch_reading_ease < 30.0
+
+    def test_medium_text(self, readability: Readability) -> None:
+        text = (
+            "The weather today is quite pleasant. "
+            "Many people are enjoying the sunshine in the park. "
+            "Children play while parents watch nearby."
+        )
+        result = readability.score(text)
+
+        # Should be middle of the road
+        assert 3.0 < result.flesch_kincaid_grade < 10.0
+        assert 50.0 < result.flesch_reading_ease < 90.0
+
+    def test_single_sentence(self, readability: Readability) -> None:
+        text = "The cat sat on the mat."
+        result = readability.score(text)
+
+        # Should compute without error
+        assert result.flesch_kincaid_grade is not None
+        assert result.flesch_reading_ease is not None
+
+    def test_single_word(self, readability: Readability) -> None:
+        text = "Cat"
+        result = readability.score(text)
+
+        # Should handle single word (1 word, 1 sentence, 1 syllable)
+        assert result.flesch_kincaid_grade is not None
+        assert result.flesch_reading_ease is not None
+
+    def test_empty_text(self, readability: Readability) -> None:
+        result = readability.score("")
+
+        assert result.flesch_kincaid_grade == 0.0
+        assert result.flesch_reading_ease == 0.0
+
+    def test_whitespace_only(self, readability: Readability) -> None:
+        result = readability.score(" \t\n ")
+
+        assert result.flesch_kincaid_grade == 0.0
+        assert result.flesch_reading_ease == 0.0
+
+    def test_reference_ignored(self, readability: Readability) -> None:
+        text = "The cat sat on the mat."
+
+        # Score with no reference
+        result1 = readability.score(text)
+        # Score with reference (should be ignored)
+        result2 = readability.score(text, "Completely different text")
+        # Score with list of references
+        result3 = readability.score(text, ["ref1", "ref2"])
+
+        # All should produce identical results
+        assert result1.flesch_kincaid_grade == result2.flesch_kincaid_grade
+        assert result1.flesch_reading_ease == result2.flesch_reading_ease
+        assert result1.flesch_kincaid_grade == result3.flesch_kincaid_grade
+
+    def test_punctuation_handling(self, readability: Readability) -> None:
+        # Same words, different sentence structure
+        text1 = "The cat sat on the mat"  # 1 sentence
+        text2 = "The cat sat. On the mat."  # 2 sentences
+
+        result1 = readability.score(text1)
+        result2 = readability.score(text2)
+
+        # Different sentence counts should affect scores
+        assert result1.flesch_kincaid_grade != result2.flesch_kincaid_grade
+
+    def test_question_marks_count_sentences(self, readability: Readability) -> None:
+        text = "What is this? It is a test."
+        result = readability.score(text)
+
+        # Should count as 2 sentences
+        # With 7 words total, words_per_sentence = 3.5
+        assert result.flesch_kincaid_grade is not None
+
+    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
+        text = "Wow! That is amazing!"
+        result = readability.score(text)
+
+        # Should count as 2 sentences
+        assert result.flesch_kincaid_grade is not None
+
+    def test_multiple_punctuation(self, readability: Readability) -> None:
+        text = "What?! That's crazy... Well then."
+        result = readability.score(text)
+
+        # Should handle gracefully
+        assert result.flesch_kincaid_grade is not None
+
+    def test_result_score_property(self, readability: Readability) -> None:
+        result = readability.score("The cat sat on the mat.")
+        assert result.score == result.flesch_reading_ease
+
+    def test_contractions(self, readability: Readability) -> None:
+        text = "I'm going to the store. It's not far away."
+        result = readability.score(text)
+
+        # Should handle contractions as words
+        assert result.flesch_kincaid_grade is not None
+        assert result.flesch_reading_ease is not None
+
+
+class TestReadabilityBatch:
+    @pytest.fixture
+    def readability(self) -> Readability:
+        return Readability()
+
+    def test_batch_score_basic(self, readability: Readability) -> None:
+        candidates = [
+            "The cat sat on the mat.",
+            "A dog ran through the park.",
+        ]
+        result = readability.batch_score(candidates)
+
+        assert result.count == 2
+        assert len(result.results) == 2
+
+    def test_batch_score_statistics(self, readability: Readability) -> None:
+        candidates = [
+            "Cat sat.",  # Very simple
+            "The implementation of sophisticated methodologies requires expertise.",
+        ]
+        result = readability.batch_score(candidates)
+
+        # Check statistics are computed
+        assert "flesch_kincaid_grade" in result.stats
+        assert "flesch_reading_ease" in result.stats
+
+        # First should be easier than second
+        assert (
+            result.results[0].flesch_reading_ease
+            > result.results[1].flesch_reading_ease
+        )
+
+    def test_batch_score_percentiles(self, readability: Readability) -> None:
+        candidates = ["a", "b", "c", "d", "e"]
+        result = readability.batch_score(candidates)
+
+        stats = result.stats["flesch_reading_ease"]
+        assert 25 in stats.percentiles
+        assert 50 in stats.percentiles
+        assert 75 in stats.percentiles
+        assert 95 in stats.percentiles
+
+    def test_batch_score_references_ignored(self, readability: Readability) -> None:
+        candidates = ["The cat sat.", "A dog ran."]
+
+        result1 = readability.batch_score(candidates)
+        result2 = readability.batch_score(candidates, ["ref1", "ref2"])
+
+        # Results should be identical
+        assert result1.results[0].flesch_kincaid_grade == (
+            result2.results[0].flesch_kincaid_grade
+        )
+
+    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
+        with pytest.raises(ValueError, match="empty"):
+            readability.batch_score([])
+
+
+class TestReadabilityResult:
+    def test_frozen(self) -> None:
+        from pydantic import ValidationError
+
+        result = ReadabilityResult(flesch_kincaid_grade=5.0, flesch_reading_ease=70.0)
+        with pytest.raises(ValidationError):
+            result.flesch_kincaid_grade = 6.0  # type: ignore[misc]
+
+    def test_values(self) -> None:
+        result = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
+        assert result.flesch_kincaid_grade == 8.5
+        assert result.flesch_reading_ease == 65.0
+
+    def test_score_property(self) -> None:
+        result = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
+        assert result.score == 65.0
+
+
+class TestSyllableCounting:
+    @pytest.fixture
+    def readability(self) -> Readability:
+        return Readability()
+
+    def test_monosyllabic_words(self, readability: Readability) -> None:
+        # All one-syllable words
+        text = "The cat sat on the mat."
+        result = readability.score(text)
+
+        # Should be very easy to read
+        assert result.flesch_reading_ease > 90.0
+
+    def test_polysyllabic_words(self, readability: Readability) -> None:
+        # Words with multiple syllables
+        text = "International communication facilitates understanding."
+        result = readability.score(text)
+
+        # Should be harder to read
+        assert result.flesch_reading_ease < 50.0
diff --git a/tests/test_metrics/test_rouge.py b/tests/test_metrics/test_rouge.py
new file mode 100644
index 0000000..0d2c718
--- /dev/null
+++ b/tests/test_metrics/test_rouge.py
@@ -0,0 +1,258 @@
+"""Tests for the ROUGE metric."""
+
+import pytest
+
+from veritext.metrics import Rouge, RougeResult, RougeScore
+
+
+class TestRouge:
+    @pytest.fixture
+    def rouge(self) -> Rouge:
+        return Rouge()
+
+    def test_name(self, rouge: Rouge) -> None:
+        assert rouge.name == "rouge"
+
+    def test_requires_reference(self, rouge: Rouge) -> None:
+        assert rouge.requires_reference is True
+
+    def test_identical_texts(self, rouge: Rouge) -> None:
+        text = "The cat sat on the mat"
+        result = rouge.score(text, text)
+
+        assert result.rouge1.precision == 1.0
+        assert result.rouge1.recall == 1.0
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge2.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_no_overlap(self, rouge: Rouge) -> None:
+        candidate = "apple banana cherry"
+        reference = "dog elephant fox"
+        result = rouge.score(candidate, reference)
+
+        assert result.rouge1.precision == 0.0
+        assert result.rouge1.recall == 0.0
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
+        candidate = "the cat sat"
+        reference = "the dog sat"
+        result = rouge.score(candidate, reference)
+
+        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
+        # Overlap: {the, sat} = 2
+        # Precision = 2/3, Recall = 2/3
+        assert abs(result.rouge1.precision - 2 / 3) < 1e-10
+        assert abs(result.rouge1.recall - 2 / 3) < 1e-10
+
+    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
+        candidate = "the cat sat on the mat"
+        reference = "the cat lay on the mat"
+        result = rouge.score(candidate, reference)
+
+        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
+        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
+        # Overlap: (the, cat), (on, the), (the, mat) = 3
+        # Precision = 3/5, Recall = 3/5
+        assert abs(result.rouge2.precision - 3 / 5) < 1e-10
+        assert abs(result.rouge2.recall - 3 / 5) < 1e-10
+
+    def test_rouge_l_basic(self, rouge: Rouge) -> None:
+        candidate = "the cat sat on the mat"
+        reference = "the cat sat"
+        result = rouge.score(candidate, reference)
+
+        # LCS = "the cat sat" = 3 tokens
+        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
+        assert result.rouge_l.precision == 0.5
+        assert result.rouge_l.recall == 1.0
+
+    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
+        candidate = "the big cat sat"
+        reference = "the cat sat"
+        result = rouge.score(candidate, reference)
+
+        # LCS = "the cat sat" = 3 (skipping "big")
+        # Precision = 3/4, Recall = 3/3 = 1.0
+        assert result.rouge_l.precision == 0.75
+        assert result.rouge_l.recall == 1.0
+
+    def test_precision_vs_recall(self, rouge: Rouge) -> None:
+        # Short candidate, long reference
+        candidate = "the cat"
+        reference = "the cat sat on the mat"
+        result = rouge.score(candidate, reference)
+
+        # Precision should be high (all candidate tokens in reference)
+        assert result.rouge1.precision == 1.0
+        # Recall should be lower (not all reference tokens in candidate)
+        assert result.rouge1.recall < 1.0
+
+    def test_empty_candidate(self, rouge: Rouge) -> None:
+        result = rouge.score("", "The cat sat")
+
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
+        result = rouge.score(" \t\n ", "The cat sat")
+
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_empty_reference_raises(self, rouge: Rouge) -> None:
+        with pytest.raises(ValueError, match="cannot be empty"):
+            rouge.score("The cat sat", "")
+
+    def test_none_reference_raises(self, rouge: Rouge) -> None:
+        with pytest.raises(ValueError, match="requires reference"):
+            rouge.score("The cat sat", None)
+
+    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
+        candidate = "the cat sat on the mat"
+        references = [
+            "a dog ran across the room",  # Low overlap
+            "the cat sat on the mat",  # Exact match
+        ]
+        result = rouge.score(candidate, references)
+
+        # Should get perfect scores due to exact match
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_multiple_references_partial(self, rouge: Rouge) -> None:
+        candidate = "the quick brown fox"
+        references = [
+            "the fast brown fox",  # 3/4 match (the, brown, fox)
+            "a quick brown dog",  # 2/4 match (quick, brown)
+        ]
+        result = rouge.score(candidate, references)
+
+        # Best reference is the first one: P = R = 3/4, so F = 3/4
+        assert abs(result.rouge1.fmeasure - 0.75) < 1e-10
+
+    def test_result_score_property(self, rouge: Rouge) -> None:
+        result = rouge.score("The cat sat", "The cat sat")
+        assert result.score == result.rouge_l.fmeasure
+
+    def test_case_insensitivity(self, rouge: Rouge) -> None:
+        result = rouge.score("THE CAT SAT", "the cat sat")
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_punctuation_ignored(self, rouge: Rouge) -> None:
+        result = rouge.score("The cat sat.", "The cat sat!")
+        assert result.rouge1.fmeasure == 1.0
+
+    def test_single_word(self, rouge: Rouge) -> None:
+        result = rouge.score("cat", "cat")
+
+        assert result.rouge1.fmeasure == 1.0
+        # ROUGE-2 should be 0 for single words (no bigrams)
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
+        # Create a case where P != R
+        candidate = "the cat sat on"
+        reference = "the cat"
+        result = rouge.score(candidate, reference)
+
+        # P = 2/4 = 0.5, R = 2/2 = 1.0
+        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
+        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
+        assert abs(result.rouge1.fmeasure - expected_f) < 1e-10
+
+
+class TestRougeBatch:
+    @pytest.fixture
+    def rouge(self) -> Rouge:
+        return Rouge()
+
+    def test_batch_score_basic(self, rouge: Rouge) -> None:
+        candidates = ["The cat sat", "A dog runs"]
+        references = ["The cat sat", "A dog runs"]
+        result = rouge.batch_score(candidates, references)
+
+        assert result.count == 2
+        assert len(result.results) == 2
+        assert all(r.rouge_l.fmeasure == 1.0 for r in result.results)
+
+    def test_batch_score_statistics(self, rouge: Rouge) -> None:
+        candidates = ["The cat sat", "Completely different words"]
+        references = ["The cat sat", "The cat sat"]
+        result = rouge.batch_score(candidates, references)
+
+        # Check statistics are computed
+        assert "rouge1_fmeasure" in result.stats
+        assert "rouge2_fmeasure" in result.stats
+        assert "rouge_l_fmeasure" in result.stats
+        assert "rouge1_precision" in result.stats
+        assert "rouge1_recall" in result.stats
+
+        # First result should be 1.0, second should be 0.0
+        assert result.results[0].rouge1.fmeasure == 1.0
+        assert result.results[1].rouge1.fmeasure == 0.0
+
+    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
+        candidates = ["a", "b", "c", "d", "e"]
+        references = ["a", "b", "c", "d", "e"]
+        result = rouge.batch_score(candidates, references)
+
+        stats = result.stats["rouge1_fmeasure"]
+        assert 25 in stats.percentiles
+        assert 50 in stats.percentiles
+        assert 75 in stats.percentiles
+        assert 95 in stats.percentiles
+
+    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
+        with pytest.raises(ValueError, match="requires reference"):
+            rouge.batch_score(["text"], None)
+
+    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
+        with pytest.raises(ValueError, match="must match"):
+            rouge.batch_score(["a", "b"], ["a"])
+
+    def test_batch_score_multi_refs(self, rouge: Rouge) -> None:
+        candidates = [
+            "The cat sat on the mat",
+            "A quick brown fox",
+        ]
+        references = [
+            ["The cat sat on the mat", "A cat rests on floor"],
+            ["A quick brown fox", "The fast brown fox"],
+        ]
+        result = rouge.batch_score(candidates, references)
+
+        assert result.count == 2
+        # Both should get perfect scores due to exact matches
+        assert result.results[0].rouge_l.fmeasure == 1.0
+        assert result.results[1].rouge_l.fmeasure == 1.0
+
+
+class TestRougeResult:
+    def test_rouge_score_frozen(self) -> None:
+        from pydantic import ValidationError
+
+        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
+        with pytest.raises(ValidationError):
+            score.precision = 0.7  # type: ignore[misc]
+
+    def test_rouge_result_frozen(self) -> None:
+        from pydantic import ValidationError
+
+        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
+        result = RougeResult(rouge1=score, rouge2=score, rouge_l=score)
+        with pytest.raises(ValidationError):
+            result.rouge1 = score  # type: ignore[misc]
+
+    def test_score_property(self) -> None:
+        r1 = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
+        r2 = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
+        rl = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
+        result = RougeResult(rouge1=r1, rouge2=r2, rouge_l=rl)
+        assert result.score == 0.7