docs(changelog): add ROUGE and readability entries

test(metrics): add ROUGE and readability tests
feat(metrics): export ROUGE and readability from module
2026-02-03 17:03:39 +00:00 · 2026-02-03 17:03:34 +00:00 · 2026-02-03 17:03:28 +00:00 · 2026-02-03 17:03:24 +00:00 · 2026-02-03 17:03:19 +00:00 · 2026-02-03 17:03:14 +00:00
7 changed files with 1114 additions and 2 deletions
@@ -18,4 +18,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Metrics module with `Metric` protocol, `AggregateStats`, and `BatchResult` types
 - BLEU metric implementation (BLEU-1 through BLEU-4 with brevity penalty)
 - Lexical similarity metric (Jaccard similarity and token overlap)
 - ROUGE metric (ROUGE-1, ROUGE-2, ROUGE-L with precision/recall/F-measure)
 - Flesch-Kincaid readability metrics (grade level and reading ease)
 - Batch scoring with aggregate statistics for all metrics
@@ -1,9 +1,17 @@
-"""Metrics module: BLEU, lexical similarity, and batch processing."""
+"""Metrics module: BLEU, ROUGE, lexical similarity, readability, and batch processing."""
 from veritext.metrics.base import AggregateStats, BatchResult, Metric
 from veritext.metrics.bleu import Bleu
 from veritext.metrics.lexical import Lexical
-from veritext.metrics.results import BleuResult, LexicalResult
+from veritext.metrics.readability import Readability
 from veritext.metrics.results import (
    BleuResult,
    LexicalResult,
    ReadabilityResult,
    RougeResult,
    RougeScore,
 )
 from veritext.metrics.rouge import Rouge
 __all__ = [
    "AggregateStats",
@@ -13,4 +21,9 @@ __all__ = [
    "Lexical",
    "LexicalResult",
    "Metric",
    "Readability",
    "ReadabilityResult",
    "Rouge",
    "RougeResult",
    "RougeScore",
 ]
@@ -0,0 +1,195 @@
 """Readability metrics implementation (Flesch-Kincaid)."""
 import re
 from veritext.metrics.base import AggregateStats, BatchResult
 from veritext.metrics.results import ReadabilityResult
 # Sentence boundary pattern: one or more consecutive terminators (".", "!", "?").
 # Because of the "+", a run such as "?!" or "..." matches as a single boundary.
 _SENTENCE_ENDINGS = re.compile(r"[.!?]+")
 # Vowel pattern for syllable counting
 _VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)
 def _count_syllables(word: str) -> int:
    """
    Count syllables in a word using a heuristic approach.
    Uses vowel group counting with adjustments for common patterns.
    Args:
        word: The word to count syllables for.
    Returns:
        Estimated syllable count (minimum 1 for non-empty words).
    """
    if not word:
        return 0
    word = word.lower().strip()
    if not word:
        return 0
    # Count vowel groups
    vowel_groups = _VOWELS.findall(word)
    count = len(vowel_groups)
    # Adjust for silent 'e' at end
    if word.endswith("e") and count > 1:
        count -= 1
    # Adjust for 'le' ending (e.g., "table", "able")
    if word.endswith("le") and len(word) > 2 and word[-3] not in "aeiouy":
        count += 1
    # Adjust for 'ed' ending when not adding syllable
    if word.endswith("ed") and len(word) > 2 and word[-3] not in "dt":
        count = max(count - 1, 1)
    # Ensure at least 1 syllable for any word
    return max(count, 1)
 def _count_sentences(text: str) -> int:
    """
    Estimate how many sentences a text contains.
    The text is cut at every run of sentence-ending punctuation (.!?) and
    each piece that still holds non-whitespace content counts as a sentence.
    Args:
        text: The text to count sentences in.
    Returns:
        0 for empty or whitespace-only text, otherwise at least 1.
    """
    if not (text and text.strip()):
        return 0
    # Pieces that are blank (e.g. the tail after a final ".") are not sentences.
    segment_total = sum(
        1 for segment in _SENTENCE_ENDINGS.split(text) if segment.strip()
    )
    # Text made only of punctuation yields no pieces but is still non-empty.
    return segment_total if segment_total > 0 else 1
 def _count_words(text: str) -> tuple[list[str], int]:
    """
    Extract words from text and count them.
    Args:
        text: The text to process.
    Returns:
        Tuple of (word list, word count).
    """
    # Extract words (sequences of letters and apostrophes)
    words = re.findall(r"[a-zA-Z']+", text)
    # Filter out standalone apostrophes
    words = [w for w in words if w.replace("'", "")]
    return words, len(words)
 class Readability:
    """
    Reference-free readability metric built on the Flesch-Kincaid formulas.
    Produces two numbers per text:
    - Flesch-Kincaid Grade Level: US school grade needed to follow the text
    - Flesch Reading Ease: nominally 0-100, higher means easier to read
    No reference text is needed; any reference passed in is ignored.
    """
    @property
    def name(self) -> str:
        """Identifier of this metric."""
        return "readability"
    @property
    def requires_reference(self) -> bool:
        """Whether a reference text is needed (it is not)."""
        return False
    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,  # noqa: ARG002
    ) -> ReadabilityResult:
        """
        Score a single text for readability.
        Args:
            candidate: The text to score.
            reference: Accepted for interface compatibility and ignored.
        Returns:
            ReadabilityResult with both Flesch-Kincaid scores; both are 0.0
            when the text contains no words.
        """
        tokens, total_words = _count_words(candidate)
        # Without words the ratios below are undefined, so report zeros.
        if total_words == 0:
            return ReadabilityResult(
                flesch_kincaid_grade=0.0,
                flesch_reading_ease=0.0,
            )
        total_sentences = _count_sentences(candidate)
        total_syllables = sum(map(_count_syllables, tokens))
        avg_sentence_length = total_words / total_sentences
        avg_syllables = total_syllables / total_words
        return ReadabilityResult(
            # Grade level: 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
            flesch_kincaid_grade=(
                0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
            ),
            # Reading ease: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
            flesch_reading_ease=(
                206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables
            ),
        )
    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
    ) -> BatchResult[ReadabilityResult]:
        """
        Score several texts and summarise the results.
        Args:
            candidates: List of texts to score.
            references: Accepted for interface compatibility and ignored.
        Returns:
            BatchResult holding the per-text results plus aggregate
            statistics for each of the two scores.
        Raises:
            ValueError: If candidates is empty.
        """
        if not candidates:
            raise ValueError("Cannot compute batch statistics from empty list")
        per_text = [self.score(text) for text in candidates]
        # One AggregateStats entry per score field, keyed by the field name.
        stats = {
            field: AggregateStats.from_values(
                [getattr(item, field) for item in per_text]
            )
            for field in ("flesch_kincaid_grade", "flesch_reading_ease")
        }
        return BatchResult(results=per_text, count=len(per_text), stats=stats)
@@ -39,3 +39,55 @@ class LexicalResult(BaseModel):
    token_overlap: float
    """Proportion of candidate tokens found in reference."""
 class RougeScore(BaseModel):
    """
    Precision, recall and F-measure for one ROUGE variant.
    The model is configured as frozen, so instances are immutable.
    """
    # frozen=True: fields cannot be reassigned after construction.
    model_config = ConfigDict(frozen=True)
    # Share of the candidate that also appears in the reference.
    precision: float
    """Precision: overlap / candidate length."""
    # Share of the reference that is recovered by the candidate.
    recall: float
    """Recall: overlap / reference length."""
    # Single-number summary combining the two values above.
    fmeasure: float
    """F1-measure: harmonic mean of precision and recall."""
 class RougeResult(BaseModel):
    """
    Result of ROUGE score computation.
    Bundles one RougeScore per variant (ROUGE-1, ROUGE-2, ROUGE-L). The model
    is configured as frozen, so instances are immutable.
    """
    # frozen=True: fields cannot be reassigned after construction.
    model_config = ConfigDict(frozen=True)
    rouge1: RougeScore
    """ROUGE-1 (unigram) score."""
    rouge2: RougeScore
    """ROUGE-2 (bigram) score."""
    rouge_l: RougeScore
    """ROUGE-L (longest common subsequence) score."""
    @property
    def score(self) -> float:
        """
        Return the headline score for this result.
        Returns:
            The ROUGE-L F-measure (``rouge_l.fmeasure``).
        """
        return self.rouge_l.fmeasure
 class ReadabilityResult(BaseModel):
    """
    Result of readability computation.
    Holds the two Flesch-Kincaid numbers for one text. The model is configured
    as frozen, so instances are immutable.
    """
    # frozen=True: fields cannot be reassigned after construction.
    model_config = ConfigDict(frozen=True)
    flesch_kincaid_grade: float
    """US grade level (e.g., 8.0 = 8th grade reading level)."""
    # NOTE(review): the 0-100 range below is nominal; this model declares a
    # plain float with no bounds, so values outside that range are accepted.
    flesch_reading_ease: float
    """Score 0-100, higher = easier to read."""
    @property
    def score(self) -> float:
        """
        Return the headline score for this result.
        Returns:
            The Flesch reading ease value (``flesch_reading_ease``).
        """
        return self.flesch_reading_ease
@@ -0,0 +1,281 @@
 """ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric implementation."""
 from collections import Counter
 from veritext.core.tokenisation import WordTokeniser
 from veritext.metrics.base import AggregateStats, BatchResult
 from veritext.metrics.results import RougeResult, RougeScore
 def _get_ngrams(tokens: list[str], n: int) -> Counter[tuple[str, ...]]:
    """Extract n-grams from a list of tokens."""
    if n > len(tokens):
        return Counter()
    return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
 def _ngram_overlap(
    candidate_ngrams: Counter[tuple[str, ...]],
    reference_ngrams: Counter[tuple[str, ...]],
 ) -> int:
    """Compute the overlap count between candidate and reference n-grams."""
    overlap = 0
    for ngram, count in candidate_ngrams.items():
        overlap += min(count, reference_ngrams.get(ngram, 0))
    return overlap
 def _compute_rouge_score(
    candidate_tokens: list[str],
    reference_tokens: list[str],
    n: int,
 ) -> RougeScore:
    """
    Score a candidate against one reference using n-gram overlap (ROUGE-n).
    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.
        n: N-gram size.
    Returns:
        RougeScore with precision, recall, and F-measure; all zeros when
        neither side has any n-gram of the requested size.
    """
    cand_counts = _get_ngrams(candidate_tokens, n)
    ref_counts = _get_ngrams(reference_tokens, n)
    cand_total = sum(cand_counts.values())
    ref_total = sum(ref_counts.values())
    # Nothing to compare on either side.
    if not (cand_total or ref_total):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)
    matched = _ngram_overlap(cand_counts, ref_counts)
    # A side with no n-grams gets 0.0 rather than a division by zero.
    precision = matched / cand_total if cand_total > 0 else 0.0
    recall = matched / ref_total if ref_total > 0 else 0.0
    fmeasure = (
        2 * precision * recall / (precision + recall)
        if precision + recall > 0
        else 0.0
    )
    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
 def _lcs_length(seq1: list[str], seq2: list[str]) -> int:
    """
    Compute the length of the longest common subsequence.
    Uses dynamic programming with O(m*n) time and O(min(m,n)) space.
    """
    if not seq1 or not seq2:
        return 0
    # Optimise by using shorter sequence for columns
    if len(seq1) < len(seq2):
        seq1, seq2 = seq2, seq1
    m, n = len(seq1), len(seq2)
    # Only need two rows at a time
    prev = [0] * (n + 1)
    curr = [0] * (n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if seq1[i - 1] == seq2[j - 1]:
                curr[j] = prev[j - 1] + 1
            else:
                curr[j] = max(prev[j], curr[j - 1])
        prev, curr = curr, prev
    return prev[n]
 def _compute_rouge_l(
    candidate_tokens: list[str],
    reference_tokens: list[str],
 ) -> RougeScore:
    """
    Score a candidate against one reference using the longest common
    subsequence (ROUGE-L).
    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.
    Returns:
        RougeScore with precision, recall, and F-measure; all zeros when
        either token list is empty.
    """
    # One guard covers "both empty" and "one side empty" alike.
    if not (candidate_tokens and reference_tokens):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)
    shared = _lcs_length(candidate_tokens, reference_tokens)
    precision = shared / len(candidate_tokens)
    recall = shared / len(reference_tokens)
    fmeasure = (
        2 * precision * recall / (precision + recall)
        if precision + recall > 0
        else 0.0
    )
    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
 def _max_rouge_scores(scores: list[RougeScore]) -> RougeScore:
    """Select the RougeScore with the highest F-measure from a list."""
    return max(scores, key=lambda s: s.fmeasure)
 class Rouge:
    """
    ROUGE metric for judging summaries and generated text against references.
    Reports ROUGE-1 (unigram), ROUGE-2 (bigram), and ROUGE-L (LCS), each as
    precision, recall and F-measure.
    """
    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
        """
        Set up the metric.
        Args:
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().
        """
        self._tokeniser = tokeniser or WordTokeniser()
    @property
    def name(self) -> str:
        """Identifier of this metric."""
        return "rouge"
    @property
    def requires_reference(self) -> bool:
        """Whether a reference text is needed (it is)."""
        return True
    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> RougeResult:
        """
        Score one candidate text against one or more references.
        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison. With several
                references, each variant reports the score of the reference
                that gives it the highest F-measure.
        Returns:
            RougeResult with ROUGE-1, ROUGE-2, and ROUGE-L scores; all zeros
            when the candidate tokenises to nothing.
        Raises:
            ValueError: If reference is None or empty.
        """
        if reference is None:
            raise ValueError("ROUGE requires reference text")
        reference_texts = [reference] if isinstance(reference, str) else reference
        candidate_tokens = self._tokeniser.tokenise(candidate)
        tokenised_refs = [self._tokeniser.tokenise(text) for text in reference_texts]
        # References that tokenise to nothing cannot be scored against.
        usable_refs = [tokens for tokens in tokenised_refs if tokens]
        if not usable_refs:
            raise ValueError("Reference text cannot be empty")
        if not candidate_tokens:
            empty = {"precision": 0.0, "recall": 0.0, "fmeasure": 0.0}
            return RougeResult(
                rouge1=RougeScore(**empty),
                rouge2=RougeScore(**empty),
                rouge_l=RougeScore(**empty),
            )
        # Each variant independently keeps its best-scoring reference.
        return RougeResult(
            rouge1=_max_rouge_scores(
                [_compute_rouge_score(candidate_tokens, ref, 1) for ref in usable_refs]
            ),
            rouge2=_max_rouge_scores(
                [_compute_rouge_score(candidate_tokens, ref, 2) for ref in usable_refs]
            ),
            rouge_l=_max_rouge_scores(
                [_compute_rouge_l(candidate_tokens, ref) for ref in usable_refs]
            ),
        )
    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[RougeResult]:
        """
        Score several candidates, each against its own reference(s).
        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, in the same order.
        Returns:
            BatchResult holding the per-candidate results plus aggregate
            statistics for every variant/measure combination.
        Raises:
            ValueError: If references is None or length mismatch.
        """
        if references is None:
            raise ValueError("ROUGE requires reference texts")
        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )
        # Lengths were checked above, so pairing by position loses nothing.
        results: list[RougeResult] = [
            self.score(cand, ref) for cand, ref in zip(candidates, references)
        ]
        def summarise(pick):
            """Aggregate one numeric field, selected by pick, over all results."""
            return AggregateStats.from_values([pick(item) for item in results])
        stats = {
            "rouge1_precision": summarise(lambda r: r.rouge1.precision),
            "rouge1_recall": summarise(lambda r: r.rouge1.recall),
            "rouge1_fmeasure": summarise(lambda r: r.rouge1.fmeasure),
            "rouge2_precision": summarise(lambda r: r.rouge2.precision),
            "rouge2_recall": summarise(lambda r: r.rouge2.recall),
            "rouge2_fmeasure": summarise(lambda r: r.rouge2.fmeasure),
            "rouge_l_precision": summarise(lambda r: r.rouge_l.precision),
            "rouge_l_recall": summarise(lambda r: r.rouge_l.recall),
            "rouge_l_fmeasure": summarise(lambda r: r.rouge_l.fmeasure),
        }
        return BatchResult(results=results, count=len(results), stats=stats)
@@ -0,0 +1,274 @@
 """Tests for the readability metric."""
 import pytest
 from veritext.metrics import Readability, ReadabilityResult
 class TestReadability:
    """
    Tests for the Readability metric class.
    Where a test previously asserted only that a float field "is not None"
    (which can never fail), it now checks an exact expected value or compares
    against an equivalent text so a regression is actually detected.
    """
    @pytest.fixture
    def readability(self) -> Readability:
        """Provide a readability metric instance."""
        return Readability()
    def test_name(self, readability: Readability) -> None:
        """Test that name returns 'readability'."""
        assert readability.name == "readability"
    def test_requires_reference(self, readability: Readability) -> None:
        """Test that readability does NOT require reference text."""
        assert readability.requires_reference is False
    def test_simple_text(self, readability: Readability) -> None:
        """Test readability of simple, easy text."""
        # Simple children's text - short sentences, simple words
        text = "The cat sat. The dog ran. I see a bird."
        result = readability.score(text)
        # Should have low grade level and high reading ease
        assert result.flesch_kincaid_grade < 5.0
        assert result.flesch_reading_ease > 80.0
    def test_complex_text(self, readability: Readability) -> None:
        """Test readability of complex, academic text."""
        # Complex academic text - long sentences, polysyllabic words
        text = (
            "The implementation of sophisticated computational methodologies "
            "necessitates comprehensive understanding of algorithmic complexity "
            "and architectural considerations."
        )
        result = readability.score(text)
        # Should have high grade level and low reading ease
        assert result.flesch_kincaid_grade > 12.0
        assert result.flesch_reading_ease < 30.0
    def test_medium_text(self, readability: Readability) -> None:
        """Test readability of medium-difficulty text."""
        text = (
            "The weather today is quite pleasant. "
            "Many people are enjoying the sunshine in the park. "
            "Children play while parents watch nearby."
        )
        result = readability.score(text)
        # Should be middle of the road
        assert 3.0 < result.flesch_kincaid_grade < 10.0
        assert 50.0 < result.flesch_reading_ease < 90.0
    def test_single_sentence(self, readability: Readability) -> None:
        """Test readability with a single sentence."""
        text = "The cat sat on the mat."
        result = readability.score(text)
        # 6 words, 1 sentence, 6 syllables:
        # grade = 0.39 * 6 + 11.8 * 1 - 15.59 = -1.45
        # ease = 206.835 - 1.015 * 6 - 84.6 * 1 = 116.145
        assert result.flesch_kincaid_grade == pytest.approx(-1.45)
        assert result.flesch_reading_ease == pytest.approx(116.145)
    def test_single_word(self, readability: Readability) -> None:
        """Test readability with a single word."""
        text = "Cat"
        result = readability.score(text)
        # 1 word, 1 sentence, 1 syllable:
        # grade = 0.39 + 11.8 - 15.59 = -3.4
        # ease = 206.835 - 1.015 - 84.6 = 121.22
        assert result.flesch_kincaid_grade == pytest.approx(-3.4)
        assert result.flesch_reading_ease == pytest.approx(121.22)
    def test_empty_text(self, readability: Readability) -> None:
        """Test that empty text returns zero scores."""
        result = readability.score("")
        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0
    def test_whitespace_only(self, readability: Readability) -> None:
        """Test that whitespace-only text returns zero scores."""
        result = readability.score("   \t\n  ")
        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0
    def test_reference_ignored(self, readability: Readability) -> None:
        """Test that reference parameter is ignored."""
        text = "The cat sat on the mat."
        # Score with no reference
        result1 = readability.score(text)
        # Score with reference (should be ignored)
        result2 = readability.score(text, "Completely different text")
        # Score with list of references
        result3 = readability.score(text, ["ref1", "ref2"])
        # All should produce identical results
        assert result1.flesch_kincaid_grade == result2.flesch_kincaid_grade
        assert result1.flesch_reading_ease == result2.flesch_reading_ease
        assert result1.flesch_kincaid_grade == result3.flesch_kincaid_grade
        assert result1.flesch_reading_ease == result3.flesch_reading_ease
    def test_punctuation_handling(self, readability: Readability) -> None:
        """Test that punctuation affects sentence counting."""
        # Same words, different sentence structure
        text1 = "The cat sat on the mat"  # 1 sentence
        text2 = "The cat sat. On the mat."  # 2 sentences
        result1 = readability.score(text1)
        result2 = readability.score(text2)
        # Different sentence counts should affect scores
        assert result1.flesch_kincaid_grade != result2.flesch_kincaid_grade
    def test_question_marks_count_sentences(self, readability: Readability) -> None:
        """Test that question marks end sentences."""
        result = readability.score("What is this? It is a test.")
        # A "?" must split exactly like a "." does (2 sentences, 7 words)...
        with_period = readability.score("What is this. It is a test.")
        assert result.flesch_kincaid_grade == with_period.flesch_kincaid_grade
        assert result.flesch_reading_ease == with_period.flesch_reading_ease
        # ...and differently from the same words with no boundary at all.
        unsplit = readability.score("What is this It is a test")
        assert result.flesch_kincaid_grade != unsplit.flesch_kincaid_grade
    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
        """Test that exclamation marks end sentences."""
        result = readability.score("Wow! That is amazing!")
        # A "!" must split exactly like a "." does (2 sentences).
        with_period = readability.score("Wow. That is amazing.")
        assert result.flesch_kincaid_grade == with_period.flesch_kincaid_grade
        assert result.flesch_reading_ease == with_period.flesch_reading_ease
        unsplit = readability.score("Wow That is amazing")
        assert result.flesch_kincaid_grade != unsplit.flesch_kincaid_grade
    def test_multiple_punctuation(self, readability: Readability) -> None:
        """Test handling of multiple punctuation marks."""
        result = readability.score("What?! That's crazy... Well then.")
        # Runs like "?!" and "..." must count as one boundary each, giving
        # the same 3 sentences as the single-period version.
        single_marks = readability.score("What. That's crazy. Well then.")
        assert result.flesch_kincaid_grade == single_marks.flesch_kincaid_grade
        assert result.flesch_reading_ease == single_marks.flesch_reading_ease
    def test_result_score_property(self, readability: Readability) -> None:
        """Test that result.score returns flesch_reading_ease."""
        result = readability.score("The cat sat on the mat.")
        assert result.score == result.flesch_reading_ease
    def test_contractions(self, readability: Readability) -> None:
        """Test handling of contractions."""
        result = readability.score("I'm going to the store. It's not far away.")
        # A contraction is one word: dropping the apostrophes changes nothing...
        joined = readability.score("Im going to the store. Its not far away.")
        assert result.flesch_kincaid_grade == joined.flesch_kincaid_grade
        assert result.flesch_reading_ease == joined.flesch_reading_ease
        # ...whereas splitting at the apostrophes adds words and shifts the score.
        split = readability.score("I m going to the store. It s not far away.")
        assert result.flesch_kincaid_grade != split.flesch_kincaid_grade
 class TestReadabilityBatch:
    """Batch-scoring behaviour of the Readability metric."""
    @pytest.fixture
    def readability(self) -> Readability:
        """Fresh Readability instance for each test."""
        return Readability()
    def test_batch_score_basic(self, readability: Readability) -> None:
        """Two candidates in, two results out."""
        batch = readability.batch_score(
            [
                "The cat sat on the mat.",
                "A dog ran through the park.",
            ]
        )
        assert batch.count == 2
        assert len(batch.results) == 2
    def test_batch_score_statistics(self, readability: Readability) -> None:
        """Aggregate statistics exist for both score fields."""
        easy_text = "Cat sat."
        hard_text = (
            "The implementation of sophisticated methodologies requires expertise."
        )
        batch = readability.batch_score([easy_text, hard_text])
        assert "flesch_kincaid_grade" in batch.stats
        assert "flesch_reading_ease" in batch.stats
        # Results keep input order, so the easy text comes first.
        easy_result, hard_result = batch.results
        assert easy_result.flesch_reading_ease > hard_result.flesch_reading_ease
    def test_batch_score_percentiles(self, readability: Readability) -> None:
        """The standard percentiles are reported for reading ease."""
        batch = readability.batch_score(["a", "b", "c", "d", "e"])
        ease_stats = batch.stats["flesch_reading_ease"]
        for percentile in (25, 50, 75, 95):
            assert percentile in ease_stats.percentiles
    def test_batch_score_references_ignored(self, readability: Readability) -> None:
        """Passing references does not change the scores."""
        texts = ["The cat sat.", "A dog ran."]
        without_refs = readability.batch_score(texts)
        with_refs = readability.batch_score(texts, ["ref1", "ref2"])
        first_plain = without_refs.results[0]
        first_with_refs = with_refs.results[0]
        assert first_plain.flesch_kincaid_grade == first_with_refs.flesch_kincaid_grade
    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
        """An empty candidate list is rejected with ValueError."""
        no_candidates: list[str] = []
        with pytest.raises(ValueError, match="empty"):
            readability.batch_score(no_candidates)
 class TestReadabilityResult:
    """Behaviour of the ReadabilityResult model itself."""
    def test_frozen(self) -> None:
        """Assigning to a field of a frozen result is rejected."""
        from pydantic import ValidationError
        frozen_result = ReadabilityResult(
            flesch_kincaid_grade=5.0,
            flesch_reading_ease=70.0,
        )
        with pytest.raises(ValidationError):
            frozen_result.flesch_kincaid_grade = 6.0  # type: ignore[misc]
    def test_values(self) -> None:
        """Constructor arguments are readable back unchanged."""
        stored = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
        assert stored.flesch_kincaid_grade == 8.5
        assert stored.flesch_reading_ease == 65.0
    def test_score_property(self) -> None:
        """The score property mirrors flesch_reading_ease."""
        stored = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
        headline = stored.score
        assert headline == 65.0
 class TestSyllableCounting:
    """Syllable heuristics, observed through their effect on reading ease."""
    @pytest.fixture
    def readability(self) -> Readability:
        """Fresh Readability instance for each test."""
        return Readability()
    def test_monosyllabic_words(self, readability: Readability) -> None:
        """One-syllable words must leave the text very easy to read."""
        one_syllable_text = "The cat sat on the mat."
        ease = readability.score(one_syllable_text).flesch_reading_ease
        assert ease > 90.0
    def test_polysyllabic_words(self, readability: Readability) -> None:
        """Many-syllable words must push reading ease down."""
        many_syllable_text = "International communication facilitates understanding."
        ease = readability.score(many_syllable_text).flesch_reading_ease
        assert ease < 50.0
@@ -0,0 +1,295 @@
 """Tests for the ROUGE metric."""
 import pytest
 from veritext.metrics import Rouge, RougeResult, RougeScore
 class TestRouge:
    """Tests for the Rouge metric class (single candidate scoring)."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Provide a ROUGE metric instance."""
        return Rouge()

    def test_name(self, rouge: Rouge) -> None:
        """Test that name returns 'rouge'."""
        assert rouge.name == "rouge"

    def test_requires_reference(self, rouge: Rouge) -> None:
        """Test that ROUGE requires reference text."""
        assert rouge.requires_reference is True

    def test_identical_texts(self, rouge: Rouge) -> None:
        """Test that identical texts produce perfect scores."""
        text = "The cat sat on the mat"
        result = rouge.score(text, text)
        assert result.rouge1.precision == 1.0
        assert result.rouge1.recall == 1.0
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge2.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_no_overlap(self, rouge: Rouge) -> None:
        """Test that texts with no overlap produce zero scores."""
        candidate = "apple banana cherry"
        reference = "dog elephant fox"
        result = rouge.score(candidate, reference)
        assert result.rouge1.precision == 0.0
        assert result.rouge1.recall == 0.0
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
        """Test ROUGE-1 with partial overlap."""
        candidate = "the cat sat"
        reference = "the dog sat"
        result = rouge.score(candidate, reference)
        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
        # Overlap: {the, sat} = 2
        # Precision = 2/3, Recall = 2/3
        assert abs(result.rouge1.precision - 2 / 3) < 1e-10
        assert abs(result.rouge1.recall - 2 / 3) < 1e-10

    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
        """Test ROUGE-2 (bigram) with partial overlap."""
        candidate = "the cat sat on the mat"
        reference = "the cat lay on the mat"
        result = rouge.score(candidate, reference)
        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
        # Overlap: (the, cat), (on, the), (the, mat) = 3
        # Precision = 3/5, Recall = 3/5
        assert abs(result.rouge2.precision - 3 / 5) < 1e-10
        assert abs(result.rouge2.recall - 3 / 5) < 1e-10

    def test_rouge_l_basic(self, rouge: Rouge) -> None:
        """Test ROUGE-L (LCS) computation."""
        candidate = "the cat sat on the mat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)
        # LCS = "the cat sat" = 3 tokens
        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.5
        assert result.rouge_l.recall == 1.0

    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
        """Test ROUGE-L with non-contiguous LCS."""
        candidate = "the big cat sat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)
        # LCS = "the cat sat" = 3 (skipping "big")
        # Precision = 3/4, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.75
        assert result.rouge_l.recall == 1.0

    def test_precision_vs_recall(self, rouge: Rouge) -> None:
        """Test that precision and recall differ appropriately."""
        # Short candidate, long reference
        candidate = "the cat"
        reference = "the cat sat on the mat"
        result = rouge.score(candidate, reference)
        # Precision should be high (all candidate tokens in reference)
        assert result.rouge1.precision == 1.0
        # Recall should be lower (not all reference tokens in candidate)
        assert result.rouge1.recall < 1.0

    def test_empty_candidate(self, rouge: Rouge) -> None:
        """Test that empty candidate returns zero scores."""
        result = rouge.score("", "The cat sat")
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
        """Test that whitespace-only candidate returns zero scores."""
        result = rouge.score("   \t\n  ", "The cat sat")
        # Check all three variants, mirroring test_empty_candidate.
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_empty_reference_raises(self, rouge: Rouge) -> None:
        """Test that empty reference raises ValueError."""
        with pytest.raises(ValueError, match="cannot be empty"):
            rouge.score("The cat sat", "")

    def test_none_reference_raises(self, rouge: Rouge) -> None:
        """Test that None reference raises ValueError."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.score("The cat sat", None)

    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
        """Test that multiple references use max scores."""
        candidate = "the cat sat on the mat"
        references = [
            "a dog ran across the room",  # Low overlap
            "the cat sat on the mat",  # Exact match
        ]
        result = rouge.score(candidate, references)
        # Should get perfect scores due to exact match
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_multiple_references_partial(self, rouge: Rouge) -> None:
        """Test that the best partial match among several references is used."""
        candidate = "the quick brown fox"
        references = [
            "the fast brown fox",  # shares the/brown/fox -> P = R = 3/4
            "a quick brown dog",  # shares quick/brown -> P = R = 2/4
        ]
        result = rouge.score(candidate, references)
        # The first reference is the better match, so it sets ROUGE-1:
        # F = 2 * 0.75 * 0.75 / (0.75 + 0.75) = 0.75
        # Pinning the value (rather than "> 0") catches a wrong reference choice.
        assert abs(result.rouge1.fmeasure - 0.75) < 1e-10

    def test_result_score_property(self, rouge: Rouge) -> None:
        """Test that result.score returns rouge_l.fmeasure."""
        result = rouge.score("The cat sat", "The cat sat")
        assert result.score == result.rouge_l.fmeasure

    def test_case_insensitivity(self, rouge: Rouge) -> None:
        """Test that ROUGE is case insensitive by default."""
        result = rouge.score("THE CAT SAT", "the cat sat")
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_punctuation_ignored(self, rouge: Rouge) -> None:
        """Test that punctuation is ignored by default."""
        result = rouge.score("The cat sat.", "The cat sat!")
        assert result.rouge1.fmeasure == 1.0

    def test_single_word(self, rouge: Rouge) -> None:
        """Test ROUGE with single word texts."""
        result = rouge.score("cat", "cat")
        assert result.rouge1.fmeasure == 1.0
        # ROUGE-2 should be 0 for single words (no bigrams)
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 1.0

    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
        """Test that F-measure is calculated correctly."""
        # Create a case where P != R
        candidate = "the cat sat on"
        reference = "the cat"
        result = rouge.score(candidate, reference)
        # P = 2/4 = 0.5, R = 2/2 = 1.0
        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
        assert abs(result.rouge1.fmeasure - expected_f) < 1e-10
 class TestRougeBatch:
    """Checks for scoring many candidate/reference pairs in one call."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Supply the Rouge metric under test."""
        metric = Rouge()
        return metric

    def test_batch_score_basic(self, rouge: Rouge) -> None:
        """Two identical pairs should give two perfect ROUGE-L results."""
        cands = ["The cat sat", "A dog runs"]
        refs = ["The cat sat", "A dog runs"]
        batch = rouge.batch_score(cands, refs)
        assert batch.count == 2
        assert len(batch.results) == 2
        assert all(item.rouge_l.fmeasure == 1.0 for item in batch.results)

    def test_batch_score_statistics(self, rouge: Rouge) -> None:
        """The stats mapping should expose the expected per-metric keys."""
        cands = ["The cat sat", "Completely different words"]
        refs = ["The cat sat", "The cat sat"]
        batch = rouge.batch_score(cands, refs)
        # Each of these aggregate entries must be present.
        assert "rouge1_fmeasure" in batch.stats
        assert "rouge2_fmeasure" in batch.stats
        assert "rouge_l_fmeasure" in batch.stats
        assert "rouge1_precision" in batch.stats
        assert "rouge1_recall" in batch.stats
        # Exact match first, fully disjoint text second.
        first, second = batch.results[0], batch.results[1]
        assert first.rouge1.fmeasure == 1.0
        assert second.rouge1.fmeasure == 0.0

    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
        """The ROUGE-1 F-measure stats should carry the four percentiles."""
        cands = ["a", "b", "c", "d", "e"]
        refs = ["a", "b", "c", "d", "e"]
        batch = rouge.batch_score(cands, refs)
        f1_stats = batch.stats["rouge1_fmeasure"]
        assert 25 in f1_stats.percentiles
        assert 50 in f1_stats.percentiles
        assert 75 in f1_stats.percentiles
        assert 95 in f1_stats.percentiles

    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
        """Passing None as the references must raise ValueError."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
        """Differing candidate and reference counts must raise ValueError."""
        with pytest.raises(ValueError, match="must match"):
            rouge.batch_score(["a", "b"], ["a"])

    def test_batch_score_with_multiple_references(self, rouge: Rouge) -> None:
        """Each candidate may be scored against its own list of references."""
        cands = [
            "The cat sat on the mat",
            "A quick brown fox",
        ]
        ref_lists = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown fox", "The fast brown fox"],
        ]
        batch = rouge.batch_score(cands, ref_lists)
        assert batch.count == 2
        # Every reference list contains an exact copy of its candidate.
        assert batch.results[0].rouge_l.fmeasure == 1.0
        assert batch.results[1].rouge_l.fmeasure == 1.0
 class TestRougeResult:
    """Checks for the RougeResult and RougeScore value types."""

    def test_rouge_score_frozen(self) -> None:
        """Assigning to a RougeScore field must be rejected."""
        from pydantic import ValidationError

        triple = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        with pytest.raises(ValidationError):
            triple.precision = 0.7  # type: ignore[misc]

    def test_rouge_result_frozen(self) -> None:
        """Assigning to a RougeResult field must be rejected."""
        from pydantic import ValidationError

        triple = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        bundle = RougeResult(rouge1=triple, rouge2=triple, rouge_l=triple)
        with pytest.raises(ValidationError):
            bundle.rouge1 = triple  # type: ignore[misc]

    def test_score_property(self) -> None:
        """The ``score`` property should equal the ROUGE-L F-measure."""
        # Distinct values per variant show which one ``score`` reads.
        unigram = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
        bigram = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
        lcs = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
        bundle = RougeResult(rouge1=unigram, rouge2=bigram, rouge_l=lcs)
        assert bundle.score == 0.7
Author	SHA1	Message	Date
kschappell	b8ab5811dd	docs(changelog): add ROUGE and readability entries	2026-02-03 17:03:39 +00:00
kschappell	62fac688e4	test(metrics): add ROUGE and readability tests	2026-02-03 17:03:34 +00:00
kschappell	14ac7dbbb9	feat(metrics): export ROUGE and readability from module	2026-02-03 17:03:28 +00:00
kschappell	aad933f9c4	feat(metrics): add readability implementation	2026-02-03 17:03:24 +00:00
kschappell	2a7476046d	feat(metrics): add ROUGE implementation	2026-02-03 17:03:19 +00:00
kschappell	914c738013	feat(metrics): add ROUGE and readability result types	2026-02-03 17:03:14 +00:00