6 Commits

7 changed files with 1114 additions and 2 deletions

View File

@@ -18,4 +18,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Metrics module with `Metric` protocol, `AggregateStats`, and `BatchResult` types
- BLEU metric implementation (BLEU-1 through BLEU-4 with brevity penalty)
- Lexical similarity metric (Jaccard similarity and token overlap)
- ROUGE metric (ROUGE-1, ROUGE-2, ROUGE-L with precision/recall/F-measure)
- Flesch-Kincaid readability metrics (grade level and reading ease)
- Batch scoring with aggregate statistics for all metrics

View File

@@ -1,9 +1,17 @@
"""Metrics module: BLEU, lexical similarity, and batch processing.""" """Metrics module: BLEU, ROUGE, lexical similarity, readability, and batch processing."""
from veritext.metrics.base import AggregateStats, BatchResult, Metric from veritext.metrics.base import AggregateStats, BatchResult, Metric
from veritext.metrics.bleu import Bleu from veritext.metrics.bleu import Bleu
from veritext.metrics.lexical import Lexical from veritext.metrics.lexical import Lexical
from veritext.metrics.results import BleuResult, LexicalResult from veritext.metrics.readability import Readability
from veritext.metrics.results import (
BleuResult,
LexicalResult,
ReadabilityResult,
RougeResult,
RougeScore,
)
from veritext.metrics.rouge import Rouge
__all__ = [ __all__ = [
"AggregateStats", "AggregateStats",
@@ -13,4 +21,9 @@ __all__ = [
"Lexical", "Lexical",
"LexicalResult", "LexicalResult",
"Metric", "Metric",
"Readability",
"ReadabilityResult",
"Rouge",
"RougeResult",
"RougeScore",
] ]

View File

@@ -0,0 +1,195 @@
"""Readability metrics implementation (Flesch-Kincaid)."""
import re
from veritext.metrics.base import AggregateStats, BatchResult
from veritext.metrics.results import ReadabilityResult
# Sentence-ending punctuation pattern.
# Matches a whole run of ".", "!" or "?" at once, so "?!" or "..." acts as a
# single split point when _count_sentences splits on it.
_SENTENCE_ENDINGS = re.compile(r"[.!?]+")
# Vowel pattern for syllable counting.
# Matches a run of consecutive vowel letters ("y" included), case-insensitively;
# _count_syllables counts each run as one candidate syllable.
_VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)
def _count_syllables(word: str) -> int:
    """
    Estimate how many syllables a single word contains.

    The estimate starts from the number of vowel runs in the word and then
    applies three suffix corrections in a fixed order: a trailing "e", a
    consonant followed by "le", and an "ed" ending not preceded by "d"/"t".

    Args:
        word: The word to analyse. Case and surrounding whitespace are ignored.

    Returns:
        0 for an empty or whitespace-only word, otherwise an estimate that is
        always at least 1.
    """
    normalised = word.lower().strip() if word else ""
    if not normalised:
        return 0

    # Every run of adjacent vowels is treated as one syllable nucleus.
    syllables = len(_VOWELS.findall(normalised))

    # A final "e" is assumed silent ("make") unless it is the only vowel run.
    if syllables > 1 and normalised.endswith("e"):
        syllables -= 1

    # Consonant + "le" ("table") is voiced, so a syllable is added back.
    if (
        len(normalised) > 2
        and normalised.endswith("le")
        and normalised[-3] not in "aeiouy"
    ):
        syllables += 1

    # "-ed" is only counted as its own syllable after "d" or "t" ("wanted").
    if (
        len(normalised) > 2
        and normalised.endswith("ed")
        and normalised[-3] not in "dt"
    ):
        syllables = max(syllables - 1, 1)

    # Words without any vowel run still count as one syllable.
    return max(syllables, 1)
def _count_sentences(text: str) -> int:
    """
    Estimate the number of sentences in a piece of text.

    The text is cut at every run of ".", "!" or "?"; each resulting piece
    that still contains non-whitespace characters counts as one sentence.

    Args:
        text: The text to analyse.

    Returns:
        0 for empty or whitespace-only text, otherwise at least 1 (text that
        yields no usable piece, e.g. punctuation only, still counts as one).
    """
    if not (text and text.strip()):
        return 0

    populated = sum(1 for piece in _SENTENCE_ENDINGS.split(text) if piece.strip())
    return max(populated, 1)
def _count_words(text: str) -> tuple[list[str], int]:
"""
Extract words from text and count them.
Args:
text: The text to process.
Returns:
Tuple of (word list, word count).
"""
# Extract words (sequences of letters and apostrophes)
words = re.findall(r"[a-zA-Z']+", text)
# Filter out standalone apostrophes
words = [w for w in words if w.replace("'", "")]
return words, len(words)
class Readability:
    """
    Reference-free readability metric based on the Flesch-Kincaid formulas.

    Two values are produced per text:
    - Flesch-Kincaid Grade Level: approximate US school grade of the text.
    - Flesch Reading Ease: higher means easier; the value is not clamped, so
      very simple or very dense text can fall outside the usual 0-100 band.
    """

    @property
    def name(self) -> str:
        """Identifier of this metric."""
        return "readability"

    @property
    def requires_reference(self) -> bool:
        """Always False: readability is computed from the candidate alone."""
        return False

    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,  # noqa: ARG002
    ) -> ReadabilityResult:
        """
        Score the readability of one text.

        Args:
            candidate: The text to score.
            reference: Accepted for interface compatibility and ignored.

        Returns:
            ReadabilityResult holding both Flesch-Kincaid values. Text that
            contains no words yields 0.0 for both.
        """
        tokens, total_words = _count_words(candidate)
        if not total_words:
            # Nothing to measure; avoids dividing by zero below.
            return ReadabilityResult(
                flesch_kincaid_grade=0.0,
                flesch_reading_ease=0.0,
            )

        total_syllables = sum(map(_count_syllables, tokens))
        words_per_sentence = total_words / _count_sentences(candidate)
        syllables_per_word = total_syllables / total_words

        # Grade level: 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
        grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
        # Reading ease: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
        ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

        return ReadabilityResult(
            flesch_kincaid_grade=grade,
            flesch_reading_ease=ease,
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
    ) -> BatchResult[ReadabilityResult]:
        """
        Score every text in a batch and summarise the results.

        Args:
            candidates: Texts to score.
            references: Accepted for interface compatibility and ignored.

        Returns:
            BatchResult with one ReadabilityResult per candidate plus
            aggregate statistics keyed by "flesch_kincaid_grade" and
            "flesch_reading_ease".

        Raises:
            ValueError: If ``candidates`` is empty.
        """
        if not candidates:
            raise ValueError("Cannot compute batch statistics from empty list")

        scored = [self.score(text) for text in candidates]
        grades = [item.flesch_kincaid_grade for item in scored]
        eases = [item.flesch_reading_ease for item in scored]

        summary = {
            "flesch_kincaid_grade": AggregateStats.from_values(grades),
            "flesch_reading_ease": AggregateStats.from_values(eases),
        }
        return BatchResult(results=scored, count=len(scored), stats=summary)

View File

@@ -39,3 +39,55 @@ class LexicalResult(BaseModel):
token_overlap: float token_overlap: float
"""Proportion of candidate tokens found in reference.""" """Proportion of candidate tokens found in reference."""
class RougeScore(BaseModel):
    """Individual ROUGE variant score with precision, recall, F-measure."""

    # frozen=True: fields cannot be reassigned once the model is built.
    model_config = ConfigDict(frozen=True)

    # Share of the candidate's units (n-grams or LCS tokens) that also occur
    # in the reference.
    precision: float
    """Precision: overlap / candidate length."""
    # Share of the reference's units that also occur in the candidate.
    recall: float
    """Recall: overlap / reference length."""
    # 2 * P * R / (P + R); the ROUGE helpers in this package use 0.0 when
    # both precision and recall are zero.
    fmeasure: float
    """F1-measure: harmonic mean of precision and recall."""
class RougeResult(BaseModel):
    """Result of ROUGE score computation."""

    # frozen=True: fields cannot be reassigned once the model is built.
    model_config = ConfigDict(frozen=True)

    # One RougeScore (precision / recall / F-measure) per ROUGE variant.
    rouge1: RougeScore
    """ROUGE-1 (unigram) score."""
    rouge2: RougeScore
    """ROUGE-2 (bigram) score."""
    rouge_l: RougeScore
    """ROUGE-L (longest common subsequence) score."""

    @property
    def score(self) -> float:
        """Return ROUGE-L F-measure as the primary score."""
        # Single headline number for callers that want one value per result.
        return self.rouge_l.fmeasure
class ReadabilityResult(BaseModel):
    """Result of readability computation."""

    # frozen=True: fields cannot be reassigned once the model is built.
    model_config = ConfigDict(frozen=True)

    flesch_kincaid_grade: float
    """US grade level (e.g., 8.0 = 8th grade reading level)."""
    # NOTE(review): Readability.score applies the formula without clamping,
    # so values below 0 or above 100 are possible for extreme inputs.
    flesch_reading_ease: float
    """Score 0-100, higher = easier to read."""

    @property
    def score(self) -> float:
        """Return Flesch reading ease as the primary score."""
        # Single headline number for callers that want one value per result.
        return self.flesch_reading_ease

View File

@@ -0,0 +1,281 @@
"""ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric implementation."""
from collections import Counter
from veritext.core.tokenisation import WordTokeniser
from veritext.metrics.base import AggregateStats, BatchResult
from veritext.metrics.results import RougeResult, RougeScore
def _get_ngrams(tokens: list[str], n: int) -> Counter[tuple[str, ...]]:
"""Extract n-grams from a list of tokens."""
if n > len(tokens):
return Counter()
return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
def _ngram_overlap(
candidate_ngrams: Counter[tuple[str, ...]],
reference_ngrams: Counter[tuple[str, ...]],
) -> int:
"""Compute the overlap count between candidate and reference n-grams."""
overlap = 0
for ngram, count in candidate_ngrams.items():
overlap += min(count, reference_ngrams.get(ngram, 0))
return overlap
def _compute_rouge_score(
    candidate_tokens: list[str],
    reference_tokens: list[str],
    n: int,
) -> RougeScore:
    """
    Compute the ROUGE-n score for one candidate/reference pair.

    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.
        n: N-gram size.

    Returns:
        RougeScore with precision (overlap / candidate n-grams), recall
        (overlap / reference n-grams) and their F-measure. All three are 0.0
        when neither side has any n-gram of the requested size.
    """
    cand_grams = _get_ngrams(candidate_tokens, n)
    ref_grams = _get_ngrams(reference_tokens, n)
    cand_total = sum(cand_grams.values())
    ref_total = sum(ref_grams.values())

    if not (cand_total or ref_total):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

    shared = _ngram_overlap(cand_grams, ref_grams)
    # A side without n-grams cannot be divided by; its ratio is defined as 0.
    precision = shared / cand_total if cand_total > 0 else 0.0
    recall = shared / ref_total if ref_total > 0 else 0.0

    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)
    else:
        fmeasure = 0.0
    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
def _lcs_length(seq1: list[str], seq2: list[str]) -> int:
"""
Compute the length of the longest common subsequence.
Uses dynamic programming with O(m*n) time and O(min(m,n)) space.
"""
if not seq1 or not seq2:
return 0
# Optimise by using shorter sequence for columns
if len(seq1) < len(seq2):
seq1, seq2 = seq2, seq1
m, n = len(seq1), len(seq2)
# Only need two rows at a time
prev = [0] * (n + 1)
curr = [0] * (n + 1)
for i in range(1, m + 1):
for j in range(1, n + 1):
if seq1[i - 1] == seq2[j - 1]:
curr[j] = prev[j - 1] + 1
else:
curr[j] = max(prev[j], curr[j - 1])
prev, curr = curr, prev
return prev[n]
def _compute_rouge_l(
    candidate_tokens: list[str],
    reference_tokens: list[str],
) -> RougeScore:
    """
    Compute the ROUGE-L score for one candidate/reference pair.

    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.

    Returns:
        RougeScore where precision is LCS length / candidate length, recall
        is LCS length / reference length, and fmeasure is their harmonic
        mean. All three are 0.0 if either token list is empty.
    """
    # One guard covers "both empty" and "exactly one empty" alike.
    if not (candidate_tokens and reference_tokens):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

    common = _lcs_length(candidate_tokens, reference_tokens)
    precision = common / len(candidate_tokens)
    recall = common / len(reference_tokens)

    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)
    else:
        fmeasure = 0.0
    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
def _max_rouge_scores(scores: list[RougeScore]) -> RougeScore:
    """
    Pick the entry with the largest F-measure.

    Args:
        scores: Candidate scores to choose from; must not be empty.

    Returns:
        The score whose ``fmeasure`` is highest. When several entries tie,
        the earliest one in the list is returned.

    Raises:
        ValueError: If ``scores`` is empty (raised by ``max``).
    """

    def by_fmeasure(entry: RougeScore) -> float:
        return entry.fmeasure

    return max(scores, key=by_fmeasure)
class Rouge:
    """
    ROUGE metric for measuring summary/generation quality.

    Computes ROUGE-1 (unigram), ROUGE-2 (bigram), and ROUGE-L (longest common
    subsequence) scores, each as precision, recall and F-measure.
    """

    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
        """
        Initialise the ROUGE metric.

        Args:
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().
        """
        # Compare against None explicitly: "tokeniser or WordTokeniser()"
        # would silently discard a caller-supplied tokeniser that happens to
        # evaluate as falsy.
        self._tokeniser = WordTokeniser() if tokeniser is None else tokeniser

    @property
    def name(self) -> str:
        """Return the name of this metric."""
        return "rouge"

    @property
    def requires_reference(self) -> bool:
        """Return whether this metric requires reference text."""
        return True

    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> RougeResult:
        """
        Compute ROUGE scores for a candidate text.

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison. With several
                references, each ROUGE variant independently reports the
                score of the reference that gives it the highest F-measure.
                References that tokenise to nothing are skipped.

        Returns:
            RougeResult with ROUGE-1, ROUGE-2, and ROUGE-L scores. A candidate
            that tokenises to nothing scores 0.0 everywhere.

        Raises:
            ValueError: If reference is None, or if no reference contains
                any token.
        """
        if reference is None:
            raise ValueError("ROUGE requires reference text")

        # Normalise a single reference string to a one-element list.
        references = [reference] if isinstance(reference, str) else reference

        candidate_tokens = self._tokeniser.tokenise(candidate)
        usable_references = [
            tokens
            for tokens in (self._tokeniser.tokenise(ref) for ref in references)
            if tokens
        ]

        # Checked before the empty-candidate shortcut so that a missing
        # reference is always reported as an error.
        if not usable_references:
            raise ValueError("Reference text cannot be empty")

        if not candidate_tokens:
            zero = RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)
            return RougeResult(rouge1=zero, rouge2=zero, rouge_l=zero)

        return RougeResult(
            rouge1=_max_rouge_scores(
                [
                    _compute_rouge_score(candidate_tokens, ref_tokens, 1)
                    for ref_tokens in usable_references
                ]
            ),
            rouge2=_max_rouge_scores(
                [
                    _compute_rouge_score(candidate_tokens, ref_tokens, 2)
                    for ref_tokens in usable_references
                ]
            ),
            rouge_l=_max_rouge_scores(
                [
                    _compute_rouge_l(candidate_tokens, ref_tokens)
                    for ref_tokens in usable_references
                ]
            ),
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[RougeResult]:
        """
        Compute ROUGE scores for a batch of candidates.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, aligned by
                position with ``candidates``.

        Returns:
            BatchResult containing individual results and aggregate
            statistics keyed ``"<variant>_<field>"`` for the variants
            rouge1 / rouge2 / rouge_l and the fields precision / recall /
            fmeasure.

        Raises:
            ValueError: If references is None or length mismatch.
        """
        if references is None:
            raise ValueError("ROUGE requires reference texts")
        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )

        # Lengths were verified above, so zip pairs every candidate.
        results: list[RougeResult] = [
            self.score(cand, ref) for cand, ref in zip(candidates, references)
        ]

        # Key order matches the explicit listing this loop replaces:
        # variant-major, then precision, recall, fmeasure.
        stats = {}
        for variant in ("rouge1", "rouge2", "rouge_l"):
            for field in ("precision", "recall", "fmeasure"):
                stats[f"{variant}_{field}"] = AggregateStats.from_values(
                    [getattr(getattr(r, variant), field) for r in results]
                )

        return BatchResult(results=results, count=len(results), stats=stats)

View File

@@ -0,0 +1,274 @@
"""Tests for the readability metric."""
import pytest
from veritext.metrics import Readability, ReadabilityResult
class TestReadability:
    """Tests for the Readability metric class."""

    # NOTE(review): several tests below only assert "is not None". The result
    # fields are floats, so those assertions only prove the call did not raise.

    @pytest.fixture
    def readability(self) -> Readability:
        """Provide a readability metric instance."""
        return Readability()

    def test_name(self, readability: Readability) -> None:
        """Test that name returns 'readability'."""
        assert readability.name == "readability"

    def test_requires_reference(self, readability: Readability) -> None:
        """Test that readability does NOT require reference text."""
        assert readability.requires_reference is False

    def test_simple_text(self, readability: Readability) -> None:
        """Test readability of simple, easy text."""
        # Simple children's text - short sentences, simple words
        text = "The cat sat. The dog ran. I see a bird."
        result = readability.score(text)
        # Should have low grade level and high reading ease
        assert result.flesch_kincaid_grade < 5.0
        assert result.flesch_reading_ease > 80.0

    def test_complex_text(self, readability: Readability) -> None:
        """Test readability of complex, academic text."""
        # Complex academic text - long sentences, polysyllabic words
        text = (
            "The implementation of sophisticated computational methodologies "
            "necessitates comprehensive understanding of algorithmic complexity "
            "and architectural considerations."
        )
        result = readability.score(text)
        # Should have high grade level and low reading ease
        assert result.flesch_kincaid_grade > 12.0
        assert result.flesch_reading_ease < 30.0

    def test_medium_text(self, readability: Readability) -> None:
        """Test readability of medium-difficulty text."""
        text = (
            "The weather today is quite pleasant. "
            "Many people are enjoying the sunshine in the park. "
            "Children play while parents watch nearby."
        )
        result = readability.score(text)
        # Should be middle of the road
        assert 3.0 < result.flesch_kincaid_grade < 10.0
        assert 50.0 < result.flesch_reading_ease < 90.0

    def test_single_sentence(self, readability: Readability) -> None:
        """Test readability with a single sentence."""
        text = "The cat sat on the mat."
        result = readability.score(text)
        # Should compute without error
        assert result.flesch_kincaid_grade is not None
        assert result.flesch_reading_ease is not None

    def test_single_word(self, readability: Readability) -> None:
        """Test readability with a single word."""
        text = "Cat"
        result = readability.score(text)
        # Should handle single word (1 word, 1 sentence, 1 syllable)
        assert result.flesch_kincaid_grade is not None
        assert result.flesch_reading_ease is not None

    def test_empty_text(self, readability: Readability) -> None:
        """Test that empty text returns zero scores."""
        result = readability.score("")
        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0

    def test_whitespace_only(self, readability: Readability) -> None:
        """Test that whitespace-only text returns zero scores."""
        result = readability.score(" \t\n ")
        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0

    def test_reference_ignored(self, readability: Readability) -> None:
        """Test that reference parameter is ignored."""
        text = "The cat sat on the mat."
        # Score with no reference
        result1 = readability.score(text)
        # Score with reference (should be ignored)
        result2 = readability.score(text, "Completely different text")
        # Score with list of references
        result3 = readability.score(text, ["ref1", "ref2"])
        # All should produce identical results
        assert result1.flesch_kincaid_grade == result2.flesch_kincaid_grade
        assert result1.flesch_reading_ease == result2.flesch_reading_ease
        assert result1.flesch_kincaid_grade == result3.flesch_kincaid_grade

    def test_punctuation_handling(self, readability: Readability) -> None:
        """Test that punctuation affects sentence counting."""
        # Same words, different sentence structure
        text1 = "The cat sat on the mat"  # 1 sentence
        text2 = "The cat sat. On the mat."  # 2 sentences
        result1 = readability.score(text1)
        result2 = readability.score(text2)
        # Different sentence counts should affect scores
        assert result1.flesch_kincaid_grade != result2.flesch_kincaid_grade

    def test_question_marks_count_sentences(self, readability: Readability) -> None:
        """Test that question marks end sentences."""
        text = "What is this? It is a test."
        result = readability.score(text)
        # Should count as 2 sentences
        # With 7 words total, words_per_sentence = 3.5
        assert result.flesch_kincaid_grade is not None

    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
        """Test that exclamation marks end sentences."""
        text = "Wow! That is amazing!"
        result = readability.score(text)
        # Should count as 2 sentences
        assert result.flesch_kincaid_grade is not None

    def test_multiple_punctuation(self, readability: Readability) -> None:
        """Test handling of multiple punctuation marks."""
        # "?!" and "..." are each one run of sentence-ending punctuation.
        text = "What?! That's crazy... Well then."
        result = readability.score(text)
        # Should handle gracefully
        assert result.flesch_kincaid_grade is not None

    def test_result_score_property(self, readability: Readability) -> None:
        """Test that result.score returns flesch_reading_ease."""
        result = readability.score("The cat sat on the mat.")
        assert result.score == result.flesch_reading_ease

    def test_contractions(self, readability: Readability) -> None:
        """Test handling of contractions."""
        text = "I'm going to the store. It's not far away."
        result = readability.score(text)
        # Should handle contractions as words
        assert result.flesch_kincaid_grade is not None
        assert result.flesch_reading_ease is not None
class TestReadabilityBatch:
    """Tests for readability batch scoring."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Provide a readability metric instance."""
        return Readability()

    def test_batch_score_basic(self, readability: Readability) -> None:
        """Test basic batch scoring."""
        candidates = [
            "The cat sat on the mat.",
            "A dog ran through the park.",
        ]
        result = readability.batch_score(candidates)
        # One result per candidate, and count mirrors the number of results.
        assert result.count == 2
        assert len(result.results) == 2

    def test_batch_score_statistics(self, readability: Readability) -> None:
        """Test that batch scoring computes statistics."""
        candidates = [
            "Cat sat.",  # Very simple
            "The implementation of sophisticated methodologies requires expertise.",
        ]
        result = readability.batch_score(candidates)
        # Check statistics are computed
        assert "flesch_kincaid_grade" in result.stats
        assert "flesch_reading_ease" in result.stats
        # First should be easier than second
        assert (
            result.results[0].flesch_reading_ease
            > result.results[1].flesch_reading_ease
        )

    def test_batch_score_percentiles(self, readability: Readability) -> None:
        """Test that batch scoring computes percentiles."""
        candidates = ["a", "b", "c", "d", "e"]
        result = readability.batch_score(candidates)
        stats = result.stats["flesch_reading_ease"]
        # Only the presence of the percentile keys is checked, not their values.
        assert 25 in stats.percentiles
        assert 50 in stats.percentiles
        assert 75 in stats.percentiles
        assert 95 in stats.percentiles

    def test_batch_score_references_ignored(self, readability: Readability) -> None:
        """Test that batch scoring ignores references."""
        candidates = ["The cat sat.", "A dog ran."]
        result1 = readability.batch_score(candidates)
        result2 = readability.batch_score(candidates, ["ref1", "ref2"])
        # Results should be identical
        assert result1.results[0].flesch_kincaid_grade == (
            result2.results[0].flesch_kincaid_grade
        )

    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
        """Test that empty candidate list raises ValueError."""
        # match="empty" relies on the wording of the ValueError message.
        with pytest.raises(ValueError, match="empty"):
            readability.batch_score([])
class TestReadabilityResult:
    """Tests for ReadabilityResult type."""

    def test_frozen(self) -> None:
        """Test that ReadabilityResult is frozen."""
        from pydantic import ValidationError

        result = ReadabilityResult(flesch_kincaid_grade=5.0, flesch_reading_ease=70.0)
        # Assigning to a field of a frozen model is expected to be rejected.
        with pytest.raises(ValidationError):
            result.flesch_kincaid_grade = 6.0  # type: ignore[misc]

    def test_values(self) -> None:
        """Test that values are stored correctly."""
        result = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
        assert result.flesch_kincaid_grade == 8.5
        assert result.flesch_reading_ease == 65.0

    def test_score_property(self) -> None:
        """Test that score property returns flesch_reading_ease."""
        result = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
        assert result.score == 65.0
class TestSyllableCounting:
    """Tests for syllable counting heuristics."""

    # NOTE(review): syllable counting is exercised indirectly, through its
    # effect on the reading-ease score, not by calling the helper itself.

    @pytest.fixture
    def readability(self) -> Readability:
        """Provide a readability metric instance."""
        return Readability()

    def test_monosyllabic_words(self, readability: Readability) -> None:
        """Test that monosyllabic words don't inflate scores."""
        # All one-syllable words
        text = "The cat sat on the mat."
        result = readability.score(text)
        # Should be very easy to read
        assert result.flesch_reading_ease > 90.0

    def test_polysyllabic_words(self, readability: Readability) -> None:
        """Test that polysyllabic words affect scores."""
        # Words with multiple syllables
        text = "International communication facilitates understanding."
        result = readability.score(text)
        # Should be harder to read
        assert result.flesch_reading_ease < 50.0

View File

@@ -0,0 +1,295 @@
"""Tests for the ROUGE metric."""
import pytest
from veritext.metrics import Rouge, RougeResult, RougeScore
class TestRouge:
    """Tests for the Rouge metric class."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Provide a ROUGE metric instance."""
        return Rouge()

    def test_name(self, rouge: Rouge) -> None:
        """Test that name returns 'rouge'."""
        assert rouge.name == "rouge"

    def test_requires_reference(self, rouge: Rouge) -> None:
        """Test that ROUGE requires reference text."""
        assert rouge.requires_reference is True

    def test_identical_texts(self, rouge: Rouge) -> None:
        """Test that identical texts produce perfect scores."""
        text = "The cat sat on the mat"
        result = rouge.score(text, text)
        assert result.rouge1.precision == 1.0
        assert result.rouge1.recall == 1.0
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge2.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_no_overlap(self, rouge: Rouge) -> None:
        """Test that texts with no overlap produce zero scores."""
        candidate = "apple banana cherry"
        reference = "dog elephant fox"
        result = rouge.score(candidate, reference)
        assert result.rouge1.precision == 0.0
        assert result.rouge1.recall == 0.0
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
        """Test ROUGE-1 with partial overlap."""
        candidate = "the cat sat"
        reference = "the dog sat"
        result = rouge.score(candidate, reference)
        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
        # Overlap: {the, sat} = 2
        # Precision = 2/3, Recall = 2/3
        assert abs(result.rouge1.precision - 2 / 3) < 1e-10
        assert abs(result.rouge1.recall - 2 / 3) < 1e-10

    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
        """Test ROUGE-2 (bigram) with partial overlap."""
        candidate = "the cat sat on the mat"
        reference = "the cat lay on the mat"
        result = rouge.score(candidate, reference)
        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
        # Overlap: (the, cat), (on, the), (the, mat) = 3
        # Precision = 3/5, Recall = 3/5
        assert abs(result.rouge2.precision - 3 / 5) < 1e-10
        assert abs(result.rouge2.recall - 3 / 5) < 1e-10

    def test_rouge_l_basic(self, rouge: Rouge) -> None:
        """Test ROUGE-L (LCS) computation."""
        candidate = "the cat sat on the mat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)
        # LCS = "the cat sat" = 3 tokens
        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.5
        assert result.rouge_l.recall == 1.0

    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
        """Test ROUGE-L with non-contiguous LCS."""
        candidate = "the big cat sat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)
        # LCS = "the cat sat" = 3 (skipping "big")
        # Precision = 3/4, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.75
        assert result.rouge_l.recall == 1.0

    def test_precision_vs_recall(self, rouge: Rouge) -> None:
        """Test that precision and recall differ appropriately."""
        # Short candidate, long reference
        candidate = "the cat"
        reference = "the cat sat on the mat"
        result = rouge.score(candidate, reference)
        # Precision should be high (all candidate tokens in reference)
        assert result.rouge1.precision == 1.0
        # Recall should be lower (not all reference tokens in candidate)
        assert result.rouge1.recall < 1.0

    def test_empty_candidate(self, rouge: Rouge) -> None:
        """Test that empty candidate returns zero scores."""
        result = rouge.score("", "The cat sat")
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
        """Test that whitespace-only candidate returns zero scores."""
        result = rouge.score(" \t\n ", "The cat sat")
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_empty_reference_raises(self, rouge: Rouge) -> None:
        """Test that empty reference raises ValueError."""
        # match= relies on the wording of the ValueError message.
        with pytest.raises(ValueError, match="cannot be empty"):
            rouge.score("The cat sat", "")

    def test_none_reference_raises(self, rouge: Rouge) -> None:
        """Test that None reference raises ValueError."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.score("The cat sat", None)

    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
        """Test that multiple references use max scores."""
        candidate = "the cat sat on the mat"
        references = [
            "a dog ran across the room",  # Low overlap
            "the cat sat on the mat",  # Exact match
        ]
        result = rouge.score(candidate, references)
        # Should get perfect scores due to exact match
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_multiple_references_partial(self, rouge: Rouge) -> None:
        """Test multiple references with partial matches."""
        candidate = "the quick brown fox"
        references = [
            "the fast brown fox",  # 3/4 match (the, brown, fox)
            "a quick brown dog",  # 2/4 match (quick, brown)
        ]
        result = rouge.score(candidate, references)
        # Should pick best from either reference
        assert result.rouge1.fmeasure > 0.0

    def test_result_score_property(self, rouge: Rouge) -> None:
        """Test that result.score returns rouge_l.fmeasure."""
        result = rouge.score("The cat sat", "The cat sat")
        assert result.score == result.rouge_l.fmeasure

    def test_case_insensitivity(self, rouge: Rouge) -> None:
        """Test that ROUGE is case insensitive by default."""
        # NOTE(review): depends on the default WordTokeniser folding case.
        result = rouge.score("THE CAT SAT", "the cat sat")
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_punctuation_ignored(self, rouge: Rouge) -> None:
        """Test that punctuation is ignored by default."""
        # NOTE(review): depends on the default WordTokeniser dropping punctuation.
        result = rouge.score("The cat sat.", "The cat sat!")
        assert result.rouge1.fmeasure == 1.0

    def test_single_word(self, rouge: Rouge) -> None:
        """Test ROUGE with single word texts."""
        result = rouge.score("cat", "cat")
        assert result.rouge1.fmeasure == 1.0
        # ROUGE-2 should be 0 for single words (no bigrams)
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 1.0

    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
        """Test that F-measure is calculated correctly."""
        # Create a case where P != R
        candidate = "the cat sat on"
        reference = "the cat"
        result = rouge.score(candidate, reference)
        # P = 2/4 = 0.5, R = 2/2 = 1.0
        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
        assert abs(result.rouge1.fmeasure - expected_f) < 1e-10
class TestRougeBatch:
    """Tests for ROUGE batch scoring."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Provide a ROUGE metric instance."""
        return Rouge()

    def test_batch_score_basic(self, rouge: Rouge) -> None:
        """Test basic batch scoring."""
        # Every candidate equals its reference, so each pair should score 1.0.
        candidates = ["The cat sat", "A dog runs"]
        references = ["The cat sat", "A dog runs"]
        result = rouge.batch_score(candidates, references)
        assert result.count == 2
        assert len(result.results) == 2
        assert all(r.rouge_l.fmeasure == 1.0 for r in result.results)

    def test_batch_score_statistics(self, rouge: Rouge) -> None:
        """Test that batch scoring computes statistics."""
        candidates = ["The cat sat", "Completely different words"]
        references = ["The cat sat", "The cat sat"]
        result = rouge.batch_score(candidates, references)
        # Check statistics are computed
        assert "rouge1_fmeasure" in result.stats
        assert "rouge2_fmeasure" in result.stats
        assert "rouge_l_fmeasure" in result.stats
        assert "rouge1_precision" in result.stats
        assert "rouge1_recall" in result.stats
        # First result should be 1.0, second should be 0.0
        assert result.results[0].rouge1.fmeasure == 1.0
        assert result.results[1].rouge1.fmeasure == 0.0

    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
        """Test that batch scoring computes percentiles."""
        candidates = ["a", "b", "c", "d", "e"]
        references = ["a", "b", "c", "d", "e"]
        result = rouge.batch_score(candidates, references)
        stats = result.stats["rouge1_fmeasure"]
        # Only the presence of the percentile keys is checked, not their values.
        assert 25 in stats.percentiles
        assert 50 in stats.percentiles
        assert 75 in stats.percentiles
        assert 95 in stats.percentiles

    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
        """Test that batch scoring raises for None references."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
        """Test that batch scoring raises for mismatched lengths."""
        with pytest.raises(ValueError, match="must match"):
            rouge.batch_score(["a", "b"], ["a"])

    def test_batch_score_with_multiple_references(self, rouge: Rouge) -> None:
        """Test batch scoring with multiple references per candidate."""
        candidates = [
            "The cat sat on the mat",
            "A quick brown fox",
        ]
        # Each inner list holds the alternative references for one candidate.
        references = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown fox", "The fast brown fox"],
        ]
        result = rouge.batch_score(candidates, references)
        assert result.count == 2
        # Both should get perfect scores due to exact matches
        assert result.results[0].rouge_l.fmeasure == 1.0
        assert result.results[1].rouge_l.fmeasure == 1.0
class TestRougeResult:
    """Tests for RougeResult and RougeScore types."""

    def test_rouge_score_frozen(self) -> None:
        """Test that RougeScore is frozen."""
        from pydantic import ValidationError

        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        # Assigning to a field of a frozen model is expected to be rejected.
        with pytest.raises(ValidationError):
            score.precision = 0.7  # type: ignore[misc]

    def test_rouge_result_frozen(self) -> None:
        """Test that RougeResult is frozen."""
        from pydantic import ValidationError

        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        result = RougeResult(rouge1=score, rouge2=score, rouge_l=score)
        # Re-assigning even the same value must be rejected.
        with pytest.raises(ValidationError):
            result.rouge1 = score  # type: ignore[misc]

    def test_score_property(self) -> None:
        """Test that score property returns rouge_l.fmeasure."""
        # Distinct F-measures per variant show which one `score` exposes.
        r1 = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
        r2 = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
        rl = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
        result = RougeResult(rouge1=r1, rouge2=r2, rouge_l=rl)
        assert result.score == 0.7