docs(changelog): add ROUGE and readability entries

test(metrics): add ROUGE and readability tests
feat(metrics): export ROUGE and readability from module
2026-02-03 17:03:39 +00:00 · 2026-02-03 17:03:34 +00:00 · 2026-02-03 17:03:28 +00:00 · 2026-02-03 17:03:24 +00:00 · 2026-02-03 17:03:19 +00:00 · 2026-02-03 17:03:14 +00:00
7 changed files with 1114 additions and 2 deletions
@@ -18,4 +18,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Metrics module with `Metric` protocol, `AggregateStats`, and `BatchResult` types
 - BLEU metric implementation (BLEU-1 through BLEU-4 with brevity penalty)
 - Lexical similarity metric (Jaccard similarity and token overlap)
+- ROUGE metric (ROUGE-1, ROUGE-2, ROUGE-L with precision/recall/F-measure)
+- Flesch-Kincaid readability metrics (grade level and reading ease)
 - Batch scoring with aggregate statistics for all metrics
@@ -1,9 +1,17 @@
-"""Metrics module: BLEU, lexical similarity, and batch processing."""
+"""Metrics module: BLEU, ROUGE, lexical similarity, readability, and batch processing."""

 from veritext.metrics.base import AggregateStats, BatchResult, Metric
 from veritext.metrics.bleu import Bleu
 from veritext.metrics.lexical import Lexical
-from veritext.metrics.results import BleuResult, LexicalResult
+from veritext.metrics.readability import Readability
+from veritext.metrics.results import (
+    BleuResult,
+    LexicalResult,
+    ReadabilityResult,
+    RougeResult,
+    RougeScore,
+)
+from veritext.metrics.rouge import Rouge

 __all__ = [
    "AggregateStats",
@@ -13,4 +21,9 @@ __all__ = [
    "Lexical",
    "LexicalResult",
    "Metric",
+    "Readability",
+    "ReadabilityResult",
+    "Rouge",
+    "RougeResult",
+    "RougeScore",
 ]
@@ -0,0 +1,195 @@
+"""Readability metrics implementation (Flesch-Kincaid)."""
+
+import re
+
+from veritext.metrics.base import AggregateStats, BatchResult
+from veritext.metrics.results import ReadabilityResult
+
+# Sentence-ending punctuation pattern: a run of ".", "!" or "?" characters
+# (e.g. "?!" or "...") is treated as a single sentence boundary.
+_SENTENCE_ENDINGS = re.compile(r"[.!?]+")
+
+# Vowel pattern for syllable counting: a run of consecutive vowels
+# (a/e/i/o/u plus "y", case-insensitive) is matched as one group.
+_VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)
+
+
+def _count_syllables(word: str) -> int:
+    """
+    Estimate the number of syllables in a word.
+
+    Counts runs of consecutive vowels ("y" included) and then applies a few
+    corrections for common English spelling patterns. This is a heuristic,
+    not a dictionary lookup, so irregular words may still be miscounted.
+
+    Args:
+        word: The word to count syllables for.
+
+    Returns:
+        Estimated syllable count: 0 for an empty or whitespace-only string,
+        otherwise at least 1.
+    """
+    if not word:
+        return 0
+
+    word = word.lower().strip()
+    if not word:
+        return 0
+
+    # Each run of consecutive vowels is treated as one syllable nucleus.
+    count = len(_VOWELS.findall(word))
+
+    # A trailing 'e' is usually silent ("make", "store").
+    if word.endswith("e") and count > 1:
+        count -= 1
+
+    # ...but consonant + 'le' is pronounced ("table", "able"), so restore it.
+    if word.endswith("le") and len(word) > 2 and word[-3] not in "aeiouy":
+        count += 1
+
+    # A trailing 'ed' is usually silent ("jumped") unless it follows 'd'/'t'
+    # ("wanted"). After a vowel ("agreed", "continued") the 'e' has already
+    # been merged into the preceding vowel run, so no extra group was counted
+    # for it and subtracting here would undercount the word.
+    if word.endswith("ed") and len(word) > 2 and word[-3] not in "dtaeiouy":
+        count = max(count - 1, 1)
+
+    # Ensure at least 1 syllable for any word
+    return max(count, 1)
+
+
+def _count_sentences(text: str) -> int:
+    """
+    Estimate how many sentences a text contains.
+
+    The text is cut at every run of sentence-ending punctuation (.!?) and
+    each piece that still holds non-whitespace characters counts as one
+    sentence.
+
+    Args:
+        text: The text to count sentences in.
+
+    Returns:
+        0 for empty or whitespace-only text, otherwise the number of
+        non-blank pieces (never less than 1).
+    """
+    if not (text and text.strip()):
+        return 0
+
+    non_blank = sum(
+        1 for piece in _SENTENCE_ENDINGS.split(text) if piece.strip()
+    )
+    # Text consisting only of punctuation leaves no pieces; still report 1.
+    return non_blank if non_blank > 1 else 1
+
+
+def _count_words(text: str) -> tuple[list[str], int]:
+    """
+    Pull the words out of a text and report how many there are.
+
+    A word is a run of ASCII letters and apostrophes that contains at least
+    one character other than an apostrophe.
+
+    Args:
+        text: The text to process.
+
+    Returns:
+        Tuple of (word list, word count).
+    """
+    words = [
+        token
+        for token in re.findall(r"[a-zA-Z']+", text)
+        # Drop tokens made up solely of apostrophes
+        if token.strip("'")
+    ]
+    return words, len(words)
+
+
+class Readability:
+    """
+    Flesch-Kincaid readability metric.
+
+    Produces two scores for a text:
+    - Flesch-Kincaid Grade Level: US grade level required to understand text
+    - Flesch Reading Ease: higher values mean easier text
+
+    No reference text is needed for this metric.
+    """
+
+    @property
+    def name(self) -> str:
+        """Name under which this metric is reported."""
+        return "readability"
+
+    @property
+    def requires_reference(self) -> bool:
+        """Whether a reference text is needed (never, for readability)."""
+        return False
+
+    def score(
+        self,
+        candidate: str,
+        reference: str | list[str] | None = None,  # noqa: ARG002
+    ) -> ReadabilityResult:
+        """
+        Score a single text for readability.
+
+        Args:
+            candidate: The text to score.
+            reference: Accepted for interface compatibility and not used.
+
+        Returns:
+            ReadabilityResult holding the grade level and reading ease. Both
+            are 0.0 when the text contains no words.
+        """
+        words, n_words = _count_words(candidate)
+
+        # Nothing to measure: report zeros rather than dividing by zero.
+        if n_words == 0:
+            return ReadabilityResult(
+                flesch_kincaid_grade=0.0,
+                flesch_reading_ease=0.0,
+            )
+
+        n_sentences = _count_sentences(candidate)
+        n_syllables = sum(map(_count_syllables, words))
+
+        avg_sentence_length = n_words / n_sentences
+        avg_syllables = n_syllables / n_words
+
+        # Flesch-Kincaid Grade Level:
+        #   0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
+        # Flesch Reading Ease:
+        #   206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
+        return ReadabilityResult(
+            flesch_kincaid_grade=(
+                0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
+            ),
+            flesch_reading_ease=(
+                206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables
+            ),
+        )
+
+    def batch_score(
+        self,
+        candidates: list[str],
+        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
+    ) -> BatchResult[ReadabilityResult]:
+        """
+        Score several texts and summarise the scores.
+
+        Args:
+            candidates: List of texts to score.
+            references: Accepted for interface compatibility and not used.
+
+        Returns:
+            BatchResult with one result per text plus aggregate statistics
+            keyed by "flesch_kincaid_grade" and "flesch_reading_ease".
+
+        Raises:
+            ValueError: If ``candidates`` is empty.
+        """
+        if not candidates:
+            raise ValueError("Cannot compute batch statistics from empty list")
+
+        results: list[ReadabilityResult] = [
+            self.score(text) for text in candidates
+        ]
+
+        # One aggregate per result field, in the order listed here.
+        stats = {
+            field: AggregateStats.from_values(
+                [getattr(result, field) for result in results]
+            )
+            for field in ("flesch_kincaid_grade", "flesch_reading_ease")
+        }
+
+        return BatchResult(results=results, count=len(results), stats=stats)
@@ -39,3 +39,55 @@ class LexicalResult(BaseModel):

    token_overlap: float
    """Proportion of candidate tokens found in reference."""
+
+
+class RougeScore(BaseModel):
+    """Individual ROUGE variant score with precision, recall, F-measure."""
+
+    # frozen=True: field assignment after construction is rejected.
+    model_config = ConfigDict(frozen=True)
+
+    # Fraction of the candidate that also appears in the reference.
+    precision: float
+    """Precision: overlap / candidate length."""
+
+    # Fraction of the reference that also appears in the candidate.
+    recall: float
+    """Recall: overlap / reference length."""
+
+    # Single figure combining the two values above.
+    fmeasure: float
+    """F1-measure: harmonic mean of precision and recall."""
+
+
+class RougeResult(BaseModel):
+    """Result of ROUGE score computation."""
+
+    # frozen=True: field assignment after construction is rejected.
+    model_config = ConfigDict(frozen=True)
+
+    # Each variant carries its own precision / recall / F-measure triple.
+    rouge1: RougeScore
+    """ROUGE-1 (unigram) score."""
+
+    rouge2: RougeScore
+    """ROUGE-2 (bigram) score."""
+
+    rouge_l: RougeScore
+    """ROUGE-L (longest common subsequence) score."""
+
+    @property
+    def score(self) -> float:
+        """Return ROUGE-L F-measure as the primary score."""
+        # Single headline number for callers that want one value per result.
+        return self.rouge_l.fmeasure
+
+
+class ReadabilityResult(BaseModel):
+    """Result of readability computation."""
+
+    # frozen=True: field assignment after construction is rejected.
+    model_config = ConfigDict(frozen=True)
+
+    flesch_kincaid_grade: float
+    """US grade level (e.g., 8.0 = 8th grade reading level)."""
+
+    # NOTE: the 0-100 range below is the nominal scale only. This field is a
+    # plain float with no validator, so values outside that range are accepted.
+    flesch_reading_ease: float
+    """Score 0-100, higher = easier to read."""
+
+    @property
+    def score(self) -> float:
+        """Return Flesch reading ease as the primary score."""
+        # Single headline number for callers that want one value per result.
+        return self.flesch_reading_ease
@@ -0,0 +1,281 @@
+"""ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric implementation."""
+
+from collections import Counter
+
+from veritext.core.tokenisation import WordTokeniser
+from veritext.metrics.base import AggregateStats, BatchResult
+from veritext.metrics.results import RougeResult, RougeScore
+
+
+def _get_ngrams(tokens: list[str], n: int) -> Counter[tuple[str, ...]]:
+    """
+    Count every contiguous run of ``n`` tokens.
+
+    Returns an empty counter when the token list is shorter than ``n``.
+    """
+    total = len(tokens)
+    counts: Counter[tuple[str, ...]] = Counter()
+    if n <= total:
+        # Slide a window of width n across the tokens.
+        for start in range(total - n + 1):
+            counts[tuple(tokens[start : start + n])] += 1
+    return counts
+
+
+def _ngram_overlap(
+    candidate_ngrams: Counter[tuple[str, ...]],
+    reference_ngrams: Counter[tuple[str, ...]],
+) -> int:
+    """
+    Count the n-grams shared by candidate and reference.
+
+    Each candidate n-gram contributes at most as many matches as the number
+    of times it occurs in the reference.
+    """
+    return sum(
+        min(occurrences, reference_ngrams.get(ngram, 0))
+        for ngram, occurrences in candidate_ngrams.items()
+    )
+
+
+def _compute_rouge_score(
+    candidate_tokens: list[str],
+    reference_tokens: list[str],
+    n: int,
+) -> RougeScore:
+    """
+    Score n-gram overlap between a candidate and one reference (ROUGE-n).
+
+    Args:
+        candidate_tokens: Tokenised candidate text.
+        reference_tokens: Tokenised reference text.
+        n: N-gram size.
+
+    Returns:
+        RougeScore with precision, recall, and F-measure. All three are 0.0
+        when neither side contains an n-gram of the requested size.
+    """
+    cand_counts = _get_ngrams(candidate_tokens, n)
+    ref_counts = _get_ngrams(reference_tokens, n)
+    cand_total = sum(cand_counts.values())
+    ref_total = sum(ref_counts.values())
+
+    if cand_total == 0 and ref_total == 0:
+        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)
+
+    matched = _ngram_overlap(cand_counts, ref_counts)
+
+    # A side without any n-grams contributes 0.0 instead of dividing by zero.
+    precision = matched / cand_total if cand_total > 0 else 0.0
+    recall = matched / ref_total if ref_total > 0 else 0.0
+
+    fmeasure = 0.0
+    if precision + recall > 0:
+        fmeasure = 2 * precision * recall / (precision + recall)
+
+    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
+
+
+def _lcs_length(seq1: list[str], seq2: list[str]) -> int:
+    """
+    Return the length of the longest common subsequence of two sequences.
+
+    Dynamic programming over a single rolling row sized by the shorter
+    sequence: O(m*n) time, O(min(m,n)) space.
+    """
+    if not (seq1 and seq2):
+        return 0
+
+    # Rows follow the longer sequence, columns the shorter one.
+    if len(seq1) >= len(seq2):
+        longer, shorter = seq1, seq2
+    else:
+        longer, shorter = seq2, seq1
+
+    row = [0] * (len(shorter) + 1)
+    for item in longer:
+        # Value of the cell up-and-left of the one being filled.
+        diagonal = 0
+        for col, other in enumerate(shorter, start=1):
+            above = row[col]
+            if item == other:
+                row[col] = diagonal + 1
+            else:
+                row[col] = max(above, row[col - 1])
+            diagonal = above
+
+    return row[-1]
+
+
+def _compute_rouge_l(
+    candidate_tokens: list[str],
+    reference_tokens: list[str],
+) -> RougeScore:
+    """
+    Score a candidate against one reference using the longest common
+    subsequence (ROUGE-L).
+
+    Args:
+        candidate_tokens: Tokenised candidate text.
+        reference_tokens: Tokenised reference text.
+
+    Returns:
+        RougeScore with precision, recall, and F-measure. All three are 0.0
+        when either token list is empty.
+    """
+    # An empty side (or both) leaves nothing to align.
+    if not (candidate_tokens and reference_tokens):
+        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)
+
+    common = _lcs_length(candidate_tokens, reference_tokens)
+    precision = common / len(candidate_tokens)
+    recall = common / len(reference_tokens)
+
+    fmeasure = 0.0
+    if precision + recall > 0:
+        fmeasure = 2 * precision * recall / (precision + recall)
+
+    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
+
+
+def _max_rouge_scores(scores: list[RougeScore]) -> RougeScore:
+    """Return the entry of ``scores`` whose F-measure is largest."""
+
+    def _by_fmeasure(entry: RougeScore) -> float:
+        return entry.fmeasure
+
+    return max(scores, key=_by_fmeasure)
+
+
+class Rouge:
+    """
+    ROUGE metric for measuring summary/generation quality.
+
+    Computes ROUGE-1 (unigram), ROUGE-2 (bigram), and ROUGE-L (LCS) scores.
+    ROUGE is recall-oriented, measuring how much of the reference is captured.
+    """
+
+    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
+        """
+        Initialise the ROUGE metric.
+
+        Args:
+            tokeniser: Tokeniser to use. Defaults to WordTokeniser().
+        """
+        # Compare against None explicitly so a supplied tokeniser is never
+        # discarded just because it happens to evaluate as falsy.
+        self._tokeniser = tokeniser if tokeniser is not None else WordTokeniser()
+
+    @property
+    def name(self) -> str:
+        """Return the name of this metric."""
+        return "rouge"
+
+    @property
+    def requires_reference(self) -> bool:
+        """Return whether this metric requires reference text."""
+        return True
+
+    def score(
+        self, candidate: str, reference: str | list[str] | None = None
+    ) -> RougeResult:
+        """
+        Compute ROUGE scores for a candidate text.
+
+        Args:
+            candidate: The text to score.
+            reference: Reference text(s) for comparison. If multiple references
+                are provided, each variant (ROUGE-1, ROUGE-2, ROUGE-L) reports
+                the score of the reference with the highest F-measure for that
+                variant. References that tokenise to nothing are skipped.
+
+        Returns:
+            RougeResult with ROUGE-1, ROUGE-2, and ROUGE-L scores. All scores
+            are 0.0 when the candidate tokenises to nothing.
+
+        Raises:
+            ValueError: If reference is None, or if every reference is empty
+                after tokenisation (including an empty reference list).
+        """
+        if reference is None:
+            raise ValueError("ROUGE requires reference text")
+
+        # Normalise reference to list
+        references = [reference] if isinstance(reference, str) else reference
+
+        candidate_tokens = self._tokeniser.tokenise(candidate)
+        tokenised_refs = [self._tokeniser.tokenise(r) for r in references]
+
+        # Only references with at least one token can be scored against.
+        refs = [tokens for tokens in tokenised_refs if tokens]
+        if not refs:
+            raise ValueError("Reference text cannot be empty")
+
+        # An empty candidate overlaps with nothing.
+        if not candidate_tokens:
+            zero = RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)
+            return RougeResult(rouge1=zero, rouge2=zero, rouge_l=zero)
+
+        # The best reference is chosen independently for each variant.
+        return RougeResult(
+            rouge1=_max_rouge_scores(
+                [_compute_rouge_score(candidate_tokens, ref, 1) for ref in refs]
+            ),
+            rouge2=_max_rouge_scores(
+                [_compute_rouge_score(candidate_tokens, ref, 2) for ref in refs]
+            ),
+            rouge_l=_max_rouge_scores(
+                [_compute_rouge_l(candidate_tokens, ref) for ref in refs]
+            ),
+        )
+
+    def batch_score(
+        self,
+        candidates: list[str],
+        references: list[str] | list[list[str]] | None = None,
+    ) -> BatchResult[RougeResult]:
+        """
+        Compute ROUGE scores for a batch of candidates.
+
+        Args:
+            candidates: List of texts to score.
+            references: Reference text(s) for each candidate, in the same
+                order as ``candidates``.
+
+        Returns:
+            BatchResult containing individual results and aggregate statistics
+            keyed "<variant>_<part>", e.g. "rouge1_precision" or
+            "rouge_l_fmeasure".
+
+        Raises:
+            ValueError: If references is None, the lengths differ, or the
+                batch is empty.
+        """
+        if references is None:
+            raise ValueError("ROUGE requires reference texts")
+
+        if len(candidates) != len(references):
+            raise ValueError(
+                f"Number of candidates ({len(candidates)}) must match "
+                f"number of references ({len(references)})"
+            )
+
+        # Same guard and message as Readability.batch_score, so an empty
+        # batch fails the same way for both metrics.
+        if not candidates:
+            raise ValueError("Cannot compute batch statistics from empty list")
+
+        results: list[RougeResult] = [
+            self.score(cand, ref) for cand, ref in zip(candidates, references)
+        ]
+
+        # One aggregate per (variant, part) pair, variant-major order.
+        stats = {
+            f"{variant}_{part}": AggregateStats.from_values(
+                [getattr(getattr(result, variant), part) for result in results]
+            )
+            for variant in ("rouge1", "rouge2", "rouge_l")
+            for part in ("precision", "recall", "fmeasure")
+        }
+
+        return BatchResult(results=results, count=len(results), stats=stats)
@@ -0,0 +1,274 @@
+"""Tests for the readability metric."""
+
+import pytest
+
+from veritext.metrics import Readability, ReadabilityResult
+
+
+class TestReadability:
+    """Tests for the Readability metric class."""
+
+    @pytest.fixture
+    def readability(self) -> Readability:
+        """Provide a readability metric instance."""
+        return Readability()
+
+    def test_name(self, readability: Readability) -> None:
+        """Test that name returns 'readability'."""
+        assert readability.name == "readability"
+
+    def test_requires_reference(self, readability: Readability) -> None:
+        """Test that readability does NOT require reference text."""
+        assert readability.requires_reference is False
+
+    def test_simple_text(self, readability: Readability) -> None:
+        """Test readability of simple, easy text."""
+        # Simple children's text - short sentences, simple words
+        text = "The cat sat. The dog ran. I see a bird."
+        result = readability.score(text)
+
+        # Should have low grade level and high reading ease
+        assert result.flesch_kincaid_grade < 5.0
+        assert result.flesch_reading_ease > 80.0
+
+    def test_complex_text(self, readability: Readability) -> None:
+        """Test readability of complex, academic text."""
+        # Complex academic text - long sentences, polysyllabic words
+        text = (
+            "The implementation of sophisticated computational methodologies "
+            "necessitates comprehensive understanding of algorithmic complexity "
+            "and architectural considerations."
+        )
+        result = readability.score(text)
+
+        # Should have high grade level and low reading ease
+        assert result.flesch_kincaid_grade > 12.0
+        assert result.flesch_reading_ease < 30.0
+
+    def test_medium_text(self, readability: Readability) -> None:
+        """Test readability of medium-difficulty text."""
+        text = (
+            "The weather today is quite pleasant. "
+            "Many people are enjoying the sunshine in the park. "
+            "Children play while parents watch nearby."
+        )
+        result = readability.score(text)
+
+        # Should be middle of the road
+        assert 3.0 < result.flesch_kincaid_grade < 10.0
+        assert 50.0 < result.flesch_reading_ease < 90.0
+
+    def test_single_sentence(self, readability: Readability) -> None:
+        """Test readability with a single sentence."""
+        text = "The cat sat on the mat."
+        result = readability.score(text)
+
+        # 6 words, 1 sentence, 6 syllables
+        assert result.flesch_kincaid_grade == pytest.approx(
+            0.39 * 6 + 11.8 * 1 - 15.59
+        )
+        assert result.flesch_reading_ease == pytest.approx(
+            206.835 - 1.015 * 6 - 84.6 * 1
+        )
+
+    def test_single_word(self, readability: Readability) -> None:
+        """Test readability with a single word."""
+        text = "Cat"
+        result = readability.score(text)
+
+        # 1 word, 1 sentence, 1 syllable
+        assert result.flesch_kincaid_grade == pytest.approx(0.39 + 11.8 - 15.59)
+        assert result.flesch_reading_ease == pytest.approx(206.835 - 1.015 - 84.6)
+
+    def test_empty_text(self, readability: Readability) -> None:
+        """Test that empty text returns zero scores."""
+        result = readability.score("")
+
+        assert result.flesch_kincaid_grade == 0.0
+        assert result.flesch_reading_ease == 0.0
+
+    def test_whitespace_only(self, readability: Readability) -> None:
+        """Test that whitespace-only text returns zero scores."""
+        result = readability.score("   \t\n  ")
+
+        assert result.flesch_kincaid_grade == 0.0
+        assert result.flesch_reading_ease == 0.0
+
+    def test_reference_ignored(self, readability: Readability) -> None:
+        """Test that reference parameter is ignored."""
+        text = "The cat sat on the mat."
+
+        # Score with no reference
+        result1 = readability.score(text)
+        # Score with reference (should be ignored)
+        result2 = readability.score(text, "Completely different text")
+        # Score with list of references
+        result3 = readability.score(text, ["ref1", "ref2"])
+
+        # All should produce identical results, on both fields
+        for other in (result2, result3):
+            assert other.flesch_kincaid_grade == result1.flesch_kincaid_grade
+            assert other.flesch_reading_ease == result1.flesch_reading_ease
+
+    def test_punctuation_handling(self, readability: Readability) -> None:
+        """Test that punctuation affects sentence counting."""
+        # Same words, different sentence structure
+        text1 = "The cat sat on the mat"  # 1 sentence
+        text2 = "The cat sat. On the mat."  # 2 sentences
+
+        result1 = readability.score(text1)
+        result2 = readability.score(text2)
+
+        # Different sentence counts should affect scores
+        assert result1.flesch_kincaid_grade != result2.flesch_kincaid_grade
+
+    def test_question_marks_count_sentences(self, readability: Readability) -> None:
+        """Test that question marks end sentences."""
+        text = "What is this? It is a test."
+        result = readability.score(text)
+
+        # 7 one-syllable words in 2 sentences: words_per_sentence = 3.5
+        assert result.flesch_kincaid_grade == pytest.approx(
+            0.39 * 3.5 + 11.8 * 1 - 15.59
+        )
+        assert result.flesch_reading_ease == pytest.approx(
+            206.835 - 1.015 * 3.5 - 84.6 * 1
+        )
+
+    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
+        """Test that exclamation marks end sentences."""
+        text = "Wow! That is amazing!"
+        result = readability.score(text)
+
+        # 4 words in 2 sentences, 6 syllables ("amazing" counts as 3)
+        assert result.flesch_kincaid_grade == pytest.approx(
+            0.39 * 2 + 11.8 * 1.5 - 15.59
+        )
+
+    def test_multiple_punctuation(self, readability: Readability) -> None:
+        """Test handling of multiple punctuation marks."""
+        text = "What?! That's crazy... Well then."
+        result = readability.score(text)
+
+        # "?!" and "..." each act as one boundary: 5 words in 3 sentences,
+        # 6 syllables ("crazy" counts as 2)
+        assert result.flesch_kincaid_grade == pytest.approx(
+            0.39 * (5 / 3) + 11.8 * (6 / 5) - 15.59
+        )
+
+    def test_result_score_property(self, readability: Readability) -> None:
+        """Test that result.score returns flesch_reading_ease."""
+        result = readability.score("The cat sat on the mat.")
+        assert result.score == result.flesch_reading_ease
+
+    def test_contractions(self, readability: Readability) -> None:
+        """Test handling of contractions."""
+        text = "I'm going to the store. It's not far away."
+        with_apostrophes = readability.score(text)
+        without_apostrophes = readability.score(text.replace("'", ""))
+
+        # A contraction is one word: dropping its apostrophe must not change
+        # the word count, and therefore must not change either score.
+        assert with_apostrophes.flesch_kincaid_grade == pytest.approx(
+            without_apostrophes.flesch_kincaid_grade
+        )
+        assert with_apostrophes.flesch_reading_ease == pytest.approx(
+            without_apostrophes.flesch_reading_ease
+        )
+
+
+class TestReadabilityBatch:
+    """Batch-scoring behaviour of the readability metric."""
+
+    @pytest.fixture
+    def readability(self) -> Readability:
+        """Fresh metric instance for each test."""
+        return Readability()
+
+    def test_batch_score_basic(self, readability: Readability) -> None:
+        """Two inputs yield two results and a count of two."""
+        batch = readability.batch_score(
+            [
+                "The cat sat on the mat.",
+                "A dog ran through the park.",
+            ]
+        )
+
+        assert batch.count == 2
+        assert len(batch.results) == 2
+
+    def test_batch_score_statistics(self, readability: Readability) -> None:
+        """Aggregate statistics exist for both readability fields."""
+        very_simple = "Cat sat."
+        harder = "The implementation of sophisticated methodologies requires expertise."
+        batch = readability.batch_score([very_simple, harder])
+
+        for key in ("flesch_kincaid_grade", "flesch_reading_ease"):
+            assert key in batch.stats
+
+        # The simple text must come out as easier than the harder one
+        easy_result, hard_result = batch.results[0], batch.results[1]
+        assert easy_result.flesch_reading_ease > hard_result.flesch_reading_ease
+
+    def test_batch_score_percentiles(self, readability: Readability) -> None:
+        """The expected percentiles are present in the reading-ease stats."""
+        batch = readability.batch_score(["a", "b", "c", "d", "e"])
+
+        percentiles = batch.stats["flesch_reading_ease"].percentiles
+        assert 25 in percentiles
+        assert 50 in percentiles
+        assert 75 in percentiles
+        assert 95 in percentiles
+
+    def test_batch_score_references_ignored(self, readability: Readability) -> None:
+        """Passing references does not change the batch results."""
+        texts = ["The cat sat.", "A dog ran."]
+
+        without_refs = readability.batch_score(texts)
+        with_refs = readability.batch_score(texts, ["ref1", "ref2"])
+
+        assert without_refs.results[0].flesch_kincaid_grade == (
+            with_refs.results[0].flesch_kincaid_grade
+        )
+
+    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
+        """An empty candidate list is rejected with ValueError."""
+        with pytest.raises(ValueError, match="empty"):
+            readability.batch_score([])
+
+
+class TestReadabilityResult:
+    """Behaviour of the ReadabilityResult model itself."""
+
+    def test_frozen(self) -> None:
+        """Assigning to a field of an existing result is rejected."""
+        from pydantic import ValidationError
+
+        frozen_result = ReadabilityResult(
+            flesch_kincaid_grade=5.0, flesch_reading_ease=70.0
+        )
+        with pytest.raises(ValidationError):
+            frozen_result.flesch_kincaid_grade = 6.0  # type: ignore[misc]
+
+    def test_values(self) -> None:
+        """Constructor arguments are readable back unchanged."""
+        stored = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
+        assert stored.flesch_kincaid_grade == 8.5
+        assert stored.flesch_reading_ease == 65.0
+
+    def test_score_property(self) -> None:
+        """The score property mirrors flesch_reading_ease."""
+        stored = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
+        assert stored.score == 65.0
+
+
+class TestSyllableCounting:
+    """Syllable heuristics, observed through the reading-ease score."""
+
+    @pytest.fixture
+    def readability(self) -> Readability:
+        """Fresh metric instance for each test."""
+        return Readability()
+
+    def test_monosyllabic_words(self, readability: Readability) -> None:
+        """A sentence of one-syllable words scores as very easy."""
+        all_short_words = "The cat sat on the mat."
+        ease = readability.score(all_short_words).flesch_reading_ease
+
+        assert ease > 90.0
+
+    def test_polysyllabic_words(self, readability: Readability) -> None:
+        """A sentence of long words scores as harder to read."""
+        all_long_words = "International communication facilitates understanding."
+        ease = readability.score(all_long_words).flesch_reading_ease
+
+        assert ease < 50.0
@@ -0,0 +1,295 @@
+"""Tests for the ROUGE metric."""
+
+import pytest
+
+from veritext.metrics import Rouge, RougeResult, RougeScore
+
+
+class TestRouge:
+    """Tests for the Rouge metric class."""
+
+    @pytest.fixture
+    def rouge(self) -> Rouge:
+        """Provide a ROUGE metric instance."""
+        return Rouge()
+
+    def test_name(self, rouge: Rouge) -> None:
+        """Test that name returns 'rouge'."""
+        assert rouge.name == "rouge"
+
+    def test_requires_reference(self, rouge: Rouge) -> None:
+        """Test that ROUGE requires reference text."""
+        assert rouge.requires_reference is True
+
+    def test_identical_texts(self, rouge: Rouge) -> None:
+        """Test that identical texts produce perfect scores."""
+        text = "The cat sat on the mat"
+        result = rouge.score(text, text)
+
+        assert result.rouge1.precision == 1.0
+        assert result.rouge1.recall == 1.0
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge2.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_no_overlap(self, rouge: Rouge) -> None:
+        """Test that texts with no overlap produce zero scores."""
+        candidate = "apple banana cherry"
+        reference = "dog elephant fox"
+        result = rouge.score(candidate, reference)
+
+        assert result.rouge1.precision == 0.0
+        assert result.rouge1.recall == 0.0
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
+        """Test ROUGE-1 with partial overlap."""
+        candidate = "the cat sat"
+        reference = "the dog sat"
+        result = rouge.score(candidate, reference)
+
+        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
+        # Overlap: {the, sat} = 2
+        # Precision = 2/3, Recall = 2/3
+        assert abs(result.rouge1.precision - 2 / 3) < 1e-10
+        assert abs(result.rouge1.recall - 2 / 3) < 1e-10
+
+    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
+        """Test ROUGE-2 (bigram) with partial overlap."""
+        candidate = "the cat sat on the mat"
+        reference = "the cat lay on the mat"
+        result = rouge.score(candidate, reference)
+
+        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
+        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
+        # Overlap: (the, cat), (on, the), (the, mat) = 3
+        # Precision = 3/5, Recall = 3/5
+        assert abs(result.rouge2.precision - 3 / 5) < 1e-10
+        assert abs(result.rouge2.recall - 3 / 5) < 1e-10
+
+    def test_rouge_l_basic(self, rouge: Rouge) -> None:
+        """Test ROUGE-L (LCS) computation."""
+        candidate = "the cat sat on the mat"
+        reference = "the cat sat"
+        result = rouge.score(candidate, reference)
+
+        # LCS = "the cat sat" = 3 tokens
+        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
+        assert result.rouge_l.precision == 0.5
+        assert result.rouge_l.recall == 1.0
+
+    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
+        """Test ROUGE-L with non-contiguous LCS."""
+        candidate = "the big cat sat"
+        reference = "the cat sat"
+        result = rouge.score(candidate, reference)
+
+        # LCS = "the cat sat" = 3 (skipping "big")
+        # Precision = 3/4, Recall = 3/3 = 1.0
+        assert result.rouge_l.precision == 0.75
+        assert result.rouge_l.recall == 1.0
+
+    def test_precision_vs_recall(self, rouge: Rouge) -> None:
+        """Test that precision and recall differ appropriately."""
+        # Short candidate, long reference
+        candidate = "the cat"
+        reference = "the cat sat on the mat"
+        result = rouge.score(candidate, reference)
+
+        # Precision should be high (all candidate tokens in reference)
+        assert result.rouge1.precision == 1.0
+        # Recall should be lower (not all reference tokens in candidate)
+        assert result.rouge1.recall < 1.0
+
+    def test_empty_candidate(self, rouge: Rouge) -> None:
+        """Test that empty candidate returns zero scores."""
+        result = rouge.score("", "The cat sat")
+
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
+        """Test that whitespace-only candidate returns zero scores."""
+        result = rouge.score("   \t\n  ", "The cat sat")
+
+        assert result.rouge1.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 0.0
+
+    def test_empty_reference_raises(self, rouge: Rouge) -> None:
+        """Test that empty reference raises ValueError."""
+        with pytest.raises(ValueError, match="cannot be empty"):
+            rouge.score("The cat sat", "")
+
+    def test_none_reference_raises(self, rouge: Rouge) -> None:
+        """Test that None reference raises ValueError."""
+        with pytest.raises(ValueError, match="requires reference"):
+            rouge.score("The cat sat", None)
+
+    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
+        """Test that multiple references use max scores."""
+        candidate = "the cat sat on the mat"
+        references = [
+            "a dog ran across the room",  # Low overlap
+            "the cat sat on the mat",  # Exact match
+        ]
+        result = rouge.score(candidate, references)
+
+        # Should get perfect scores due to exact match
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_multiple_references_partial(self, rouge: Rouge) -> None:
+        """Test multiple references with partial matches."""
+        candidate = "the quick brown fox"
+        references = [
+            "the fast brown fox",  # 3/4 match
+            "a quick brown dog",  # 2/4 match (only "quick" and "brown" overlap)
+        ]
+        result = rouge.score(candidate, references)
+
+        # Should pick best from either reference
+        assert result.rouge1.fmeasure > 0.0
+
+    def test_result_score_property(self, rouge: Rouge) -> None:
+        """Test that result.score returns rouge_l.fmeasure."""
+        result = rouge.score("The cat sat", "The cat sat")
+        assert result.score == result.rouge_l.fmeasure
+
+    def test_case_insensitivity(self, rouge: Rouge) -> None:
+        """Test that ROUGE is case insensitive by default."""
+        result = rouge.score("THE CAT SAT", "the cat sat")
+        assert result.rouge1.fmeasure == 1.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_punctuation_ignored(self, rouge: Rouge) -> None:
+        """Test that punctuation is ignored by default."""
+        result = rouge.score("The cat sat.", "The cat sat!")
+        assert result.rouge1.fmeasure == 1.0
+
+    def test_single_word(self, rouge: Rouge) -> None:
+        """Test ROUGE with single word texts."""
+        result = rouge.score("cat", "cat")
+
+        assert result.rouge1.fmeasure == 1.0
+        # ROUGE-2 should be 0 for single words (no bigrams)
+        assert result.rouge2.fmeasure == 0.0
+        assert result.rouge_l.fmeasure == 1.0
+
+    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
+        """Test that F-measure is calculated correctly."""
+        # Create a case where P != R
+        candidate = "the cat sat on"
+        reference = "the cat"
+        result = rouge.score(candidate, reference)
+
+        # P = 2/4 = 0.5, R = 2/2 = 1.0
+        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
+        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
+        assert abs(result.rouge1.fmeasure - expected_f) < 1e-10
+
+
+class TestRougeBatch:
+    """Tests for ROUGE batch scoring."""
+
+    @pytest.fixture
+    def rouge(self) -> Rouge:
+        """Provide a ROUGE metric instance."""
+        return Rouge()
+
+    def test_batch_score_basic(self, rouge: Rouge) -> None:
+        """Test basic batch scoring."""
+        candidates = ["The cat sat", "A dog runs"]
+        references = ["The cat sat", "A dog runs"]
+        result = rouge.batch_score(candidates, references)
+
+        assert result.count == 2
+        assert len(result.results) == 2
+        assert all(r.rouge_l.fmeasure == 1.0 for r in result.results)
+
+    def test_batch_score_statistics(self, rouge: Rouge) -> None:
+        """Test that batch scoring computes statistics."""
+        candidates = ["The cat sat", "Completely different words"]
+        references = ["The cat sat", "The cat sat"]
+        result = rouge.batch_score(candidates, references)
+
+        # Check statistics are computed
+        assert "rouge1_fmeasure" in result.stats
+        assert "rouge2_fmeasure" in result.stats
+        assert "rouge_l_fmeasure" in result.stats
+        assert "rouge1_precision" in result.stats
+        assert "rouge1_recall" in result.stats
+
+        # First result should be 1.0, second should be 0.0
+        assert result.results[0].rouge1.fmeasure == 1.0
+        assert result.results[1].rouge1.fmeasure == 0.0
+
+    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
+        """Test that batch scoring computes percentiles."""
+        candidates = ["a", "b", "c", "d", "e"]
+        references = ["a", "b", "c", "d", "e"]
+        result = rouge.batch_score(candidates, references)
+
+        stats = result.stats["rouge1_fmeasure"]
+        assert 25 in stats.percentiles
+        assert 50 in stats.percentiles
+        assert 75 in stats.percentiles
+        assert 95 in stats.percentiles
+
+    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
+        """Test that batch scoring raises for None references."""
+        with pytest.raises(ValueError, match="requires reference"):
+            rouge.batch_score(["text"], None)
+
+    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
+        """Test that batch scoring raises for mismatched lengths."""
+        with pytest.raises(ValueError, match="must match"):
+            rouge.batch_score(["a", "b"], ["a"])
+
+    def test_batch_score_with_multiple_references(self, rouge: Rouge) -> None:
+        """Test batch scoring with multiple references per candidate."""
+        candidates = [
+            "The cat sat on the mat",
+            "A quick brown fox",
+        ]
+        references = [
+            ["The cat sat on the mat", "A cat rests on floor"],
+            ["A quick brown fox", "The fast brown fox"],
+        ]
+        result = rouge.batch_score(candidates, references)
+
+        assert result.count == 2
+        # Both should get perfect scores due to exact matches
+        assert result.results[0].rouge_l.fmeasure == 1.0
+        assert result.results[1].rouge_l.fmeasure == 1.0
+
+
+class TestRougeResult:
+    """Tests for RougeResult and RougeScore types."""
+
+    def test_rouge_score_frozen(self) -> None:
+        """Test that RougeScore is frozen."""
+        from pydantic import ValidationError
+
+        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
+        with pytest.raises(ValidationError):
+            score.precision = 0.7  # type: ignore[misc]
+
+    def test_rouge_result_frozen(self) -> None:
+        """Test that RougeResult is frozen."""
+        from pydantic import ValidationError
+
+        score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
+        result = RougeResult(rouge1=score, rouge2=score, rouge_l=score)
+        with pytest.raises(ValidationError):
+            result.rouge1 = score  # type: ignore[misc]
+
+    def test_score_property(self) -> None:
+        """Test that score property returns rouge_l.fmeasure."""
+        r1 = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
+        r2 = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
+        rl = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
+        result = RougeResult(rouge1=r1, rouge2=r2, rouge_l=rl)
+        assert result.score == 0.7
Author	SHA1	Message	Date
kschappell	b8ab5811dd	docs(changelog): add ROUGE and readability entries	2026-02-03 17:03:39 +00:00
kschappell	62fac688e4	test(metrics): add ROUGE and readability tests	2026-02-03 17:03:34 +00:00
kschappell	14ac7dbbb9	feat(metrics): export ROUGE and readability from module	2026-02-03 17:03:28 +00:00
kschappell	aad933f9c4	feat(metrics): add readability implementation	2026-02-03 17:03:24 +00:00
kschappell	2a7476046d	feat(metrics): add ROUGE implementation	2026-02-03 17:03:19 +00:00
kschappell	914c738013	feat(metrics): add ROUGE and readability result types	2026-02-03 17:03:14 +00:00