tests for ROUGE and readability

This commit is contained in:
2025-03-20 20:32:41 +00:00
parent 5c2d626208
commit ec48eb5bf5
2 changed files with 495 additions and 0 deletions

View File

@@ -0,0 +1,237 @@
"""Tests for the readability metric."""
import pytest
from veritext.metrics import Readability, ReadabilityResult
class TestReadability:
    """Tests for ``Readability.score`` on individual texts.

    Covers the metric's metadata, relative ordering of simple versus complex
    text, degenerate inputs (empty, whitespace, single word) and the fact
    that any supplied reference is expected to be ignored.
    """

    @pytest.fixture
    def readability(self) -> Readability:
        """Return a default-constructed ``Readability`` metric."""
        return Readability()

    def test_name(self, readability: Readability) -> None:
        """The metric identifies itself as ``"readability"``."""
        assert readability.name == "readability"

    def test_requires_reference(self, readability: Readability) -> None:
        """Readability is reference-free."""
        assert readability.requires_reference is False

    def test_simple_text(self, readability: Readability) -> None:
        """Short sentences of short words score as easy to read."""
        # Simple children's text - short sentences, simple words
        text = "The cat sat. The dog ran. I see a bird."
        result = readability.score(text)
        # Should have low grade level and high reading ease
        assert result.flesch_kincaid_grade < 5.0
        assert result.flesch_reading_ease > 80.0

    def test_complex_text(self, readability: Readability) -> None:
        """One long sentence of polysyllabic words scores as hard to read."""
        # Complex academic text - long sentences, polysyllabic words
        text = (
            "The implementation of sophisticated computational methodologies "
            "necessitates thorough understanding of algorithmic complexity "
            "and architectural considerations."
        )
        result = readability.score(text)
        # Should have high grade level and low reading ease
        assert result.flesch_kincaid_grade > 12.0
        assert result.flesch_reading_ease < 30.0

    def test_medium_text(self, readability: Readability) -> None:
        """Everyday prose lands between the two extremes."""
        text = (
            "The weather today is quite pleasant. "
            "Many people are enjoying the sunshine in the park. "
            "Children play while parents watch nearby."
        )
        result = readability.score(text)
        # Should be middle of the road
        assert 3.0 < result.flesch_kincaid_grade < 10.0
        assert 50.0 < result.flesch_reading_ease < 90.0

    def test_single_sentence(self, readability: Readability) -> None:
        """A single sentence is scored without raising."""
        text = "The cat sat on the mat."
        result = readability.score(text)
        # Should compute without error
        assert result.flesch_kincaid_grade is not None
        assert result.flesch_reading_ease is not None

    def test_single_word(self, readability: Readability) -> None:
        """A lone word with no terminal punctuation is scored without raising."""
        text = "Cat"
        result = readability.score(text)
        # Should handle single word (1 word, 1 sentence, 1 syllable)
        assert result.flesch_kincaid_grade is not None
        assert result.flesch_reading_ease is not None

    def test_empty_text(self, readability: Readability) -> None:
        """The empty string yields zero for both scores."""
        result = readability.score("")
        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0

    def test_whitespace_only(self, readability: Readability) -> None:
        """Whitespace-only input is treated like empty input."""
        result = readability.score(" \t\n ")
        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0

    def test_reference_ignored(self, readability: Readability) -> None:
        """Passing a reference (string or list) must not change either score."""
        text = "The cat sat on the mat."
        # Score with no reference
        result1 = readability.score(text)
        # Score with reference (should be ignored)
        result2 = readability.score(text, "Completely different text")
        # Score with list of references
        result3 = readability.score(text, ["ref1", "ref2"])
        # All should produce identical results, on both metrics
        assert result1.flesch_kincaid_grade == result2.flesch_kincaid_grade
        assert result1.flesch_reading_ease == result2.flesch_reading_ease
        assert result1.flesch_kincaid_grade == result3.flesch_kincaid_grade
        assert result1.flesch_reading_ease == result3.flesch_reading_ease

    def test_punctuation_handling(self, readability: Readability) -> None:
        """Splitting the same words into more sentences changes the grade."""
        # Same words, different sentence structure
        text1 = "The cat sat on the mat"  # 1 sentence
        text2 = "The cat sat. On the mat."  # 2 sentences
        result1 = readability.score(text1)
        result2 = readability.score(text2)
        # Different sentence counts should affect scores
        assert result1.flesch_kincaid_grade != result2.flesch_kincaid_grade

    def test_question_marks_count_sentences(
        self, readability: Readability
    ) -> None:
        """Text whose first sentence ends in ``?`` is scored without raising."""
        text = "What is this? It is a test."
        result = readability.score(text)
        # Should count as 2 sentences
        # With 7 words total, words_per_sentence = 3.5
        assert result.flesch_kincaid_grade is not None

    def test_exclamation_marks_count_sentences(
        self, readability: Readability
    ) -> None:
        """Text whose sentences end in ``!`` is scored without raising."""
        text = "Wow! That is amazing!"
        result = readability.score(text)
        # Should count as 2 sentences
        assert result.flesch_kincaid_grade is not None

    def test_multiple_punctuation(self, readability: Readability) -> None:
        """Runs of terminators (``?!``, ``...``) do not break scoring."""
        text = "What?! That's crazy... Well then."
        result = readability.score(text)
        # Should handle gracefully
        assert result.flesch_kincaid_grade is not None

    def test_result_score_property(self, readability: Readability) -> None:
        """``result.score`` mirrors the Flesch reading-ease value."""
        result = readability.score("The cat sat on the mat.")
        assert result.score == result.flesch_reading_ease

    def test_contractions(self, readability: Readability) -> None:
        """Text containing apostrophes is scored without raising."""
        text = "I'm going to the store. It's not far away."
        result = readability.score(text)
        # Should handle contractions as words
        assert result.flesch_kincaid_grade is not None
        assert result.flesch_reading_ease is not None
class TestReadabilityBatch:
    """Tests for ``Readability.batch_score`` over lists of candidates."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Return a default-constructed ``Readability`` metric."""
        return Readability()

    def test_batch_score_basic(self, readability: Readability) -> None:
        """One result is produced per candidate and ``count`` agrees."""
        candidates = [
            "The cat sat on the mat.",
            "A dog ran through the park.",
        ]
        result = readability.batch_score(candidates)
        assert result.count == 2
        assert len(result.results) == 2

    def test_batch_score_statistics(self, readability: Readability) -> None:
        """Aggregate stats exist for both metrics and results keep input order."""
        candidates = [
            "Cat sat.",  # Very simple
            "The implementation of sophisticated methodologies requires expertise.",
        ]
        result = readability.batch_score(candidates)
        # Check statistics are computed
        assert "flesch_kincaid_grade" in result.stats
        assert "flesch_reading_ease" in result.stats
        # First should be easier than second
        assert (
            result.results[0].flesch_reading_ease
            > result.results[1].flesch_reading_ease
        )

    def test_batch_score_percentiles(self, readability: Readability) -> None:
        """The 25th, 50th, 75th and 95th percentiles are reported."""
        candidates = ["a", "b", "c", "d", "e"]
        result = readability.batch_score(candidates)
        stats = result.stats["flesch_reading_ease"]
        assert 25 in stats.percentiles
        assert 50 in stats.percentiles
        assert 75 in stats.percentiles
        assert 95 in stats.percentiles

    def test_batch_score_references_ignored(
        self, readability: Readability
    ) -> None:
        """Supplying references must not change any per-candidate score."""
        candidates = ["The cat sat.", "A dog ran."]
        result1 = readability.batch_score(candidates)
        result2 = readability.batch_score(candidates, ["ref1", "ref2"])
        # Results should be identical: compare every candidate on both
        # metrics, not just the first candidate's grade level.
        assert len(result1.results) == len(result2.results)
        for without_ref, with_ref in zip(result1.results, result2.results):
            assert (
                without_ref.flesch_kincaid_grade == with_ref.flesch_kincaid_grade
            )
            assert without_ref.flesch_reading_ease == with_ref.flesch_reading_ease

    def test_batch_score_empty_list_raises(
        self, readability: Readability
    ) -> None:
        """An empty candidate list is rejected with a ``ValueError``."""
        with pytest.raises(ValueError, match="empty"):
            readability.batch_score([])
class TestReadabilityResult:
    """Checks on the ``ReadabilityResult`` model in isolation."""

    def test_frozen(self) -> None:
        """Assigning to a field is expected to raise ``ValidationError``."""
        from pydantic import ValidationError

        immutable = ReadabilityResult(
            flesch_kincaid_grade=5.0,
            flesch_reading_ease=70.0,
        )
        with pytest.raises(ValidationError):
            immutable.flesch_kincaid_grade = 6.0  # type: ignore[misc]

    def test_values(self) -> None:
        """Constructor arguments are readable back unchanged."""
        grade, ease = 8.5, 65.0
        built = ReadabilityResult(
            flesch_kincaid_grade=grade,
            flesch_reading_ease=ease,
        )
        assert built.flesch_kincaid_grade == 8.5
        assert built.flesch_reading_ease == 65.0

    def test_score_property(self) -> None:
        """``score`` returns the reading-ease value given at construction."""
        built = ReadabilityResult(
            flesch_kincaid_grade=8.5,
            flesch_reading_ease=65.0,
        )
        assert built.score == 65.0
class TestSyllableCounting:
    """Indirect checks of syllable counting through the reading-ease score."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Return a default-constructed ``Readability`` metric."""
        metric = Readability()
        return metric

    def test_monosyllabic_words(self, readability: Readability) -> None:
        """A sentence of one-syllable words should be very easy to read."""
        one_syllable_sentence = "The cat sat on the mat."
        ease = readability.score(one_syllable_sentence).flesch_reading_ease
        assert ease > 90.0

    def test_polysyllabic_words(self, readability: Readability) -> None:
        """A sentence of multi-syllable words should be harder to read."""
        many_syllable_sentence = (
            "International communication facilitates understanding."
        )
        ease = readability.score(many_syllable_sentence).flesch_reading_ease
        assert ease < 50.0

View File

@@ -0,0 +1,258 @@
"""Tests for the ROUGE metric."""
import pytest
from veritext.metrics import Rouge, RougeResult, RougeScore
class TestRouge:
    """Tests for ``Rouge.score`` on a single candidate.

    Covers metadata, exact and zero overlap, hand-computed ROUGE-1/2/L
    values, empty inputs, multi-reference handling and tokenisation
    details (case and punctuation).
    """

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Return a default-constructed ``Rouge`` metric."""
        return Rouge()

    def test_name(self, rouge: Rouge) -> None:
        """The metric identifies itself as ``"rouge"``."""
        assert rouge.name == "rouge"

    def test_requires_reference(self, rouge: Rouge) -> None:
        """ROUGE is a reference-based metric."""
        assert rouge.requires_reference is True

    def test_identical_texts(self, rouge: Rouge) -> None:
        """Scoring a text against itself gives perfect scores."""
        text = "The cat sat on the mat"
        result = rouge.score(text, text)
        assert result.rouge1.precision == 1.0
        assert result.rouge1.recall == 1.0
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge2.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_no_overlap(self, rouge: Rouge) -> None:
        """Texts sharing no tokens give zero on every variant."""
        candidate = "apple banana cherry"
        reference = "dog elephant fox"
        result = rouge.score(candidate, reference)
        assert result.rouge1.precision == 0.0
        assert result.rouge1.recall == 0.0
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
        """ROUGE-1 precision and recall match the hand-computed 2/3."""
        candidate = "the cat sat"
        reference = "the dog sat"
        result = rouge.score(candidate, reference)
        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
        # Overlap: {the, sat} = 2
        # Precision = 2/3, Recall = 2/3
        assert result.rouge1.precision == pytest.approx(2 / 3, abs=1e-10)
        assert result.rouge1.recall == pytest.approx(2 / 3, abs=1e-10)

    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
        """ROUGE-2 precision and recall match the hand-computed 3/5."""
        candidate = "the cat sat on the mat"
        reference = "the cat lay on the mat"
        result = rouge.score(candidate, reference)
        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
        # Overlap: (the, cat), (on, the), (the, mat) = 3
        # Precision = 3/5, Recall = 3/5
        assert result.rouge2.precision == pytest.approx(3 / 5, abs=1e-10)
        assert result.rouge2.recall == pytest.approx(3 / 5, abs=1e-10)

    def test_rouge_l_basic(self, rouge: Rouge) -> None:
        """ROUGE-L uses the LCS length over each side's token count."""
        candidate = "the cat sat on the mat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)
        # LCS = "the cat sat" = 3 tokens
        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.5
        assert result.rouge_l.recall == 1.0

    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
        """The LCS may skip tokens; it need not be contiguous."""
        candidate = "the big cat sat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)
        # LCS = "the cat sat" = 3 (skipping "big")
        # Precision = 3/4, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.75
        assert result.rouge_l.recall == 1.0

    def test_precision_vs_recall(self, rouge: Rouge) -> None:
        """A short candidate fully contained in the reference: P = 1, R < 1."""
        # Short candidate, long reference
        candidate = "the cat"
        reference = "the cat sat on the mat"
        result = rouge.score(candidate, reference)
        # Precision should be high (all candidate tokens in reference)
        assert result.rouge1.precision == 1.0
        # Recall should be lower (not all reference tokens in candidate)
        assert result.rouge1.recall < 1.0

    def test_empty_candidate(self, rouge: Rouge) -> None:
        """An empty candidate scores zero rather than raising."""
        result = rouge.score("", "The cat sat")
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
        """A whitespace-only candidate is treated like an empty one."""
        result = rouge.score(" \t\n ", "The cat sat")
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_empty_reference_raises(self, rouge: Rouge) -> None:
        """An empty reference string is rejected with a ``ValueError``."""
        with pytest.raises(ValueError, match="cannot be empty"):
            rouge.score("The cat sat", "")

    def test_none_reference_raises(self, rouge: Rouge) -> None:
        """Omitting the reference is rejected with a ``ValueError``."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.score("The cat sat", None)

    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
        """With several references, the best-matching one determines the score."""
        candidate = "the cat sat on the mat"
        references = [
            "a dog ran across the room",  # Low overlap
            "the cat sat on the mat",  # Exact match
        ]
        result = rouge.score(candidate, references)
        # Should get perfect scores due to exact match
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_multiple_references_partial(self, rouge: Rouge) -> None:
        """With only partial matches, the better reference sets ROUGE-1."""
        candidate = "the quick brown fox"
        references = [
            "the fast brown fox",  # shares the, brown, fox -> 3/4 match
            "a quick brown dog",  # shares quick, brown -> 2/4 match
        ]
        result = rouge.score(candidate, references)
        # Should pick the best reference: P = R = 3/4, hence F = 3/4.
        # Pinning the value (rather than "> 0") catches an implementation
        # that picks the weaker reference or averages across references.
        assert result.rouge1.fmeasure == pytest.approx(0.75, abs=1e-10)

    def test_result_score_property(self, rouge: Rouge) -> None:
        """``result.score`` mirrors the ROUGE-L F-measure."""
        result = rouge.score("The cat sat", "The cat sat")
        assert result.score == result.rouge_l.fmeasure

    def test_case_insensitivity(self, rouge: Rouge) -> None:
        """Matching ignores letter case."""
        result = rouge.score("THE CAT SAT", "the cat sat")
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_punctuation_ignored(self, rouge: Rouge) -> None:
        """Differing trailing punctuation does not reduce the score."""
        result = rouge.score("The cat sat.", "The cat sat!")
        assert result.rouge1.fmeasure == 1.0

    def test_single_word(self, rouge: Rouge) -> None:
        """A one-token text has no bigrams, so ROUGE-2 is zero."""
        result = rouge.score("cat", "cat")
        assert result.rouge1.fmeasure == 1.0
        # ROUGE-2 should be 0 for single words (no bigrams)
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 1.0

    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
        """The F-measure is the harmonic mean of precision and recall."""
        # Create a case where P != R
        candidate = "the cat sat on"
        reference = "the cat"
        result = rouge.score(candidate, reference)
        # P = 2/4 = 0.5, R = 2/2 = 1.0
        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
        assert result.rouge1.fmeasure == pytest.approx(expected_f, abs=1e-10)
class TestRougeBatch:
    """Tests for ``Rouge.batch_score`` over candidate/reference lists."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Return a default-constructed ``Rouge`` metric."""
        metric = Rouge()
        return metric

    def test_batch_score_basic(self, rouge: Rouge) -> None:
        """Identical pairs yield one perfect ROUGE-L result per candidate."""
        hypotheses = ["The cat sat", "A dog runs"]
        targets = ["The cat sat", "A dog runs"]
        batch = rouge.batch_score(hypotheses, targets)
        assert batch.count == 2
        assert len(batch.results) == 2
        assert all(item.rouge_l.fmeasure == 1.0 for item in batch.results)

    def test_batch_score_statistics(self, rouge: Rouge) -> None:
        """Aggregate stats are keyed per variant and results keep input order."""
        hypotheses = ["The cat sat", "Completely different words"]
        targets = ["The cat sat", "The cat sat"]
        batch = rouge.batch_score(hypotheses, targets)
        # Every expected statistic key must be present.
        expected_keys = (
            "rouge1_fmeasure",
            "rouge2_fmeasure",
            "rouge_l_fmeasure",
            "rouge1_precision",
            "rouge1_recall",
        )
        for key in expected_keys:
            assert key in batch.stats
        # Exact match first, zero overlap second.
        exact, disjoint = batch.results[0], batch.results[1]
        assert exact.rouge1.fmeasure == 1.0
        assert disjoint.rouge1.fmeasure == 0.0

    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
        """The 25th, 50th, 75th and 95th percentiles are reported."""
        hypotheses = ["a", "b", "c", "d", "e"]
        targets = ["a", "b", "c", "d", "e"]
        batch = rouge.batch_score(hypotheses, targets)
        f1_stats = batch.stats["rouge1_fmeasure"]
        for percentile in (25, 50, 75, 95):
            assert percentile in f1_stats.percentiles

    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
        """Passing ``None`` as references is rejected with a ``ValueError``."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
        """Candidate and reference lists of different lengths are rejected."""
        with pytest.raises(ValueError, match="must match"):
            rouge.batch_score(["a", "b"], ["a"])

    def test_batch_score_multi_refs(self, rouge: Rouge) -> None:
        """Each candidate may be paired with a list of references."""
        hypotheses = [
            "The cat sat on the mat",
            "A quick brown fox",
        ]
        target_groups = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown fox", "The fast brown fox"],
        ]
        batch = rouge.batch_score(hypotheses, target_groups)
        assert batch.count == 2
        # Each group contains an exact match, so both should be perfect.
        first, second = batch.results[0], batch.results[1]
        assert first.rouge_l.fmeasure == 1.0
        assert second.rouge_l.fmeasure == 1.0
class TestRougeResult:
    """Checks on the ``RougeScore`` and ``RougeResult`` models in isolation."""

    def test_rouge_score_frozen(self) -> None:
        """Assigning to a ``RougeScore`` field is expected to raise."""
        from pydantic import ValidationError

        triple = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        with pytest.raises(ValidationError):
            triple.precision = 0.7  # type: ignore[misc]

    def test_rouge_result_frozen(self) -> None:
        """Assigning to a ``RougeResult`` field is expected to raise."""
        from pydantic import ValidationError

        triple = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        bundle = RougeResult(rouge1=triple, rouge2=triple, rouge_l=triple)
        with pytest.raises(ValidationError):
            bundle.rouge1 = triple  # type: ignore[misc]

    def test_score_property(self) -> None:
        """``score`` returns the ROUGE-L F-measure, not ROUGE-1 or ROUGE-2."""
        unigram = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
        bigram = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
        lcs = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
        bundle = RougeResult(rouge1=unigram, rouge2=bigram, rouge_l=lcs)
        assert bundle.score == 0.7