Compare commits
6 Commits
feat/metri
...
feat/metri
| Author | SHA1 | Date | |
|---|---|---|---|
|
b8ab5811dd
|
|||
|
62fac688e4
|
|||
|
14ac7dbbb9
|
|||
|
aad933f9c4
|
|||
|
2a7476046d
|
|||
|
914c738013
|
@@ -18,4 +18,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Metrics module with `Metric` protocol, `AggregateStats`, and `BatchResult` types
|
||||
- BLEU metric implementation (BLEU-1 through BLEU-4 with brevity penalty)
|
||||
- Lexical similarity metric (Jaccard similarity and token overlap)
|
||||
- ROUGE metric (ROUGE-1, ROUGE-2, ROUGE-L with precision/recall/F-measure)
|
||||
- Flesch-Kincaid readability metrics (grade level and reading ease)
|
||||
- Batch scoring with aggregate statistics for all metrics
|
||||
|
||||
@@ -1,9 +1,17 @@
|
||||
"""Metrics module: BLEU, lexical similarity, and batch processing."""
|
||||
"""Metrics module: BLEU, ROUGE, lexical similarity, readability, and batch processing."""
|
||||
|
||||
from veritext.metrics.base import AggregateStats, BatchResult, Metric
|
||||
from veritext.metrics.bleu import Bleu
|
||||
from veritext.metrics.lexical import Lexical
|
||||
from veritext.metrics.results import BleuResult, LexicalResult
|
||||
from veritext.metrics.readability import Readability
|
||||
from veritext.metrics.results import (
|
||||
BleuResult,
|
||||
LexicalResult,
|
||||
ReadabilityResult,
|
||||
RougeResult,
|
||||
RougeScore,
|
||||
)
|
||||
from veritext.metrics.rouge import Rouge
|
||||
|
||||
__all__ = [
|
||||
"AggregateStats",
|
||||
@@ -13,4 +21,9 @@ __all__ = [
|
||||
"Lexical",
|
||||
"LexicalResult",
|
||||
"Metric",
|
||||
"Readability",
|
||||
"ReadabilityResult",
|
||||
"Rouge",
|
||||
"RougeResult",
|
||||
"RougeScore",
|
||||
]
|
||||
|
||||
195
src/veritext/metrics/readability.py
Normal file
195
src/veritext/metrics/readability.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""Readability metrics implementation (Flesch-Kincaid)."""
|
||||
|
||||
import re
|
||||
|
||||
from veritext.metrics.base import AggregateStats, BatchResult
|
||||
from veritext.metrics.results import ReadabilityResult
|
||||
|
||||
# Sentence terminators: one or more consecutive '.', '!' or '?' characters.
# The '+' makes a run such as "?!" or "..." match as a single terminator.
_SENTENCE_ENDINGS = re.compile(r"[.!?]+")

# Vowel groups for syllable counting: one or more consecutive vowels, with
# 'y' treated as a vowel. Matching is case-insensitive.
_VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)
|
||||
|
||||
|
||||
def _count_syllables(word: str) -> int:
    """
    Estimate the number of syllables in a single word.

    The estimate starts from the number of vowel groups and is then
    corrected for a trailing silent 'e', a consonant + 'le' ending, and an
    'ed' ending that does not follow 'd' or 't'.

    Args:
        word: The word to analyse. Case and surrounding whitespace are ignored.

    Returns:
        Estimated syllable count: 0 for an empty or blank word, otherwise
        at least 1.
    """
    normalised = word.lower().strip() if word else ""
    if not normalised:
        return 0

    # Each run of consecutive vowels counts as one syllable to begin with.
    syllables = sum(1 for _ in _VOWELS.finditer(normalised))

    # Third-from-last character, when the word is long enough to have one.
    third_last = normalised[-3] if len(normalised) > 2 else None

    # A trailing 'e' is usually silent, but never remove the only syllable.
    if syllables > 1 and normalised[-1] == "e":
        syllables -= 1

    # Consonant + 'le' (e.g. "table") is pronounced, so restore a syllable.
    if (
        third_last is not None
        and normalised.endswith("le")
        and third_last not in "aeiouy"
    ):
        syllables += 1

    # 'ed' only adds a syllable after 'd' or 't' (e.g. "wanted").
    if (
        third_last is not None
        and normalised.endswith("ed")
        and third_last not in "dt"
    ):
        syllables = max(syllables - 1, 1)

    return max(syllables, 1)
|
||||
|
||||
|
||||
def _count_sentences(text: str) -> int:
    """
    Count sentences in text.

    The text is split on runs of sentence-ending punctuation (.!?). A
    segment only counts as a sentence when it contains at least one
    letter, so residue such as a closing quote after the final full stop
    or the fractional digits of a decimal number is not counted.

    Args:
        text: The text to count sentences in.

    Returns:
        Number of sentences: 0 for empty or whitespace-only text,
        otherwise at least 1.
    """
    if not text or not text.strip():
        return 0

    segments = _SENTENCE_ENDINGS.split(text)

    # A segment with no letters contributes no words, so it is not a sentence.
    sentence_total = sum(
        1 for segment in segments if any(char.isalpha() for char in segment)
    )

    # Non-blank text with no countable segment is treated as one sentence.
    return max(sentence_total, 1)
|
||||
|
||||
|
||||
def _count_words(text: str) -> tuple[list[str], int]:
    """
    Pull the words out of a text and report how many there are.

    A word is a run of ASCII letters and apostrophes that contains at
    least one letter.

    Args:
        text: The text to scan.

    Returns:
        Tuple of (words in order of appearance, number of words).
    """
    tokens = [
        match.group()
        for match in re.finditer(r"[a-zA-Z']+", text)
        # Drop runs made up solely of apostrophes.
        if match.group().strip("'")
    ]
    return tokens, len(tokens)
|
||||
|
||||
|
||||
class Readability:
    """
    Flesch-Kincaid readability metric.

    Produces two scores for a text:
    - Flesch-Kincaid Grade Level: US school grade needed to follow the text
    - Flesch Reading Ease: higher values mean easier text

    No reference text is needed; any reference passed in is ignored.
    """

    @property
    def name(self) -> str:
        """Return the name of this metric."""
        return "readability"

    @property
    def requires_reference(self) -> bool:
        """Return whether this metric requires reference text."""
        return False

    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,  # noqa: ARG002
    ) -> ReadabilityResult:
        """
        Score the readability of one text.

        Args:
            candidate: The text to score.
            reference: Accepted for interface compatibility and ignored.

        Returns:
            ReadabilityResult holding both Flesch-Kincaid scores. A text
            with no words yields 0.0 for both.
        """
        tokens, n_words = _count_words(candidate)

        # Without any words the ratios below are undefined.
        if n_words == 0:
            return ReadabilityResult(
                flesch_kincaid_grade=0.0,
                flesch_reading_ease=0.0,
            )

        n_sentences = _count_sentences(candidate)
        n_syllables = sum(map(_count_syllables, tokens))

        words_per_sentence = n_words / n_sentences
        syllables_per_word = n_syllables / n_words

        # Grade = 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
        grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

        # Ease = 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
        ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

        return ReadabilityResult(
            flesch_kincaid_grade=grade,
            flesch_reading_ease=ease,
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
    ) -> BatchResult[ReadabilityResult]:
        """
        Score the readability of several texts at once.

        Args:
            candidates: Texts to score.
            references: Accepted for interface compatibility and ignored.

        Returns:
            BatchResult with one result per candidate plus aggregate
            statistics for each score.

        Raises:
            ValueError: If candidates is empty.
        """
        if not candidates:
            raise ValueError("Cannot compute batch statistics from empty list")

        scored = [self.score(text) for text in candidates]

        grades = [item.flesch_kincaid_grade for item in scored]
        eases = [item.flesch_reading_ease for item in scored]
        summary = {
            "flesch_kincaid_grade": AggregateStats.from_values(grades),
            "flesch_reading_ease": AggregateStats.from_values(eases),
        }

        return BatchResult(results=scored, count=len(scored), stats=summary)
|
||||
@@ -39,3 +39,55 @@ class LexicalResult(BaseModel):
|
||||
|
||||
token_overlap: float
|
||||
"""Proportion of candidate tokens found in reference."""
|
||||
|
||||
|
||||
# Value object for one ROUGE variant (ROUGE-1, ROUGE-2 or ROUGE-L): the
# three numbers below are always reported together.
class RougeScore(BaseModel):
    """Individual ROUGE variant score with precision, recall, F-measure."""

    # frozen=True: instances are presumably immutable after construction
    # (pydantic rejects attribute assignment) -- confirm against pydantic docs.
    model_config = ConfigDict(frozen=True)

    precision: float
    """Precision: overlap / candidate length."""

    recall: float
    """Recall: overlap / reference length."""

    fmeasure: float
    """F1-measure: harmonic mean of precision and recall."""
|
||||
|
||||
|
||||
# Bundles the three ROUGE variants computed for one candidate text.
class RougeResult(BaseModel):
    """Result of ROUGE score computation."""

    # frozen=True: instances are presumably immutable after construction
    # (pydantic rejects attribute assignment) -- confirm against pydantic docs.
    model_config = ConfigDict(frozen=True)

    rouge1: RougeScore
    """ROUGE-1 (unigram) score."""

    rouge2: RougeScore
    """ROUGE-2 (bigram) score."""

    rouge_l: RougeScore
    """ROUGE-L (longest common subsequence) score."""

    # Single headline number for callers that need one value per result.
    @property
    def score(self) -> float:
        """Return ROUGE-L F-measure as the primary score."""
        return self.rouge_l.fmeasure
|
||||
|
||||
|
||||
# Holds the two Flesch-Kincaid scores for one text.
class ReadabilityResult(BaseModel):
    """Result of readability computation."""

    # frozen=True: assigning to a field after construction raises a pydantic
    # ValidationError (exercised by the test suite's test_frozen).
    model_config = ConfigDict(frozen=True)

    # NOTE: no range is enforced on either field. Negative grade levels are
    # possible for very simple text.
    flesch_kincaid_grade: float
    """US grade level (e.g., 8.0 = 8th grade reading level)."""

    # NOTE: "0-100" describes the typical range only. Readability.score stores
    # the raw formula value without clamping, so very easy text can exceed 100
    # and very dense text can fall below 0.
    flesch_reading_ease: float
    """Score 0-100, higher = easier to read."""

    # Single headline number for callers that need one value per result.
    @property
    def score(self) -> float:
        """Return Flesch reading ease as the primary score."""
        return self.flesch_reading_ease
|
||||
|
||||
281
src/veritext/metrics/rouge.py
Normal file
281
src/veritext/metrics/rouge.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric implementation."""
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from veritext.core.tokenisation import WordTokeniser
|
||||
from veritext.metrics.base import AggregateStats, BatchResult
|
||||
from veritext.metrics.results import RougeResult, RougeScore
|
||||
|
||||
|
||||
def _get_ngrams(tokens: list[str], n: int) -> Counter[tuple[str, ...]]:
    """
    Count every contiguous n-gram in a token list.

    Args:
        tokens: Tokens to slide the window over.
        n: Window size.

    Returns:
        Counter mapping each n-gram (as a tuple) to its number of
        occurrences; empty when n exceeds the number of tokens.
    """
    window_count = len(tokens) - n + 1
    if window_count <= 0:
        return Counter()

    grams: Counter[tuple[str, ...]] = Counter()
    for start in range(window_count):
        grams[tuple(tokens[start : start + n])] += 1
    return grams
|
||||
|
||||
|
||||
def _ngram_overlap(
    candidate_ngrams: Counter[tuple[str, ...]],
    reference_ngrams: Counter[tuple[str, ...]],
) -> int:
    """
    Count n-grams shared by candidate and reference.

    Each candidate n-gram contributes the smaller of its candidate count
    and its reference count, so repeats are only credited as often as the
    reference contains them.
    """
    return sum(
        min(occurrences, reference_ngrams.get(gram, 0))
        for gram, occurrences in candidate_ngrams.items()
    )
|
||||
|
||||
|
||||
def _compute_rouge_score(
    candidate_tokens: list[str],
    reference_tokens: list[str],
    n: int,
) -> RougeScore:
    """
    Score candidate against reference with ROUGE-n.

    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.
        n: N-gram size.

    Returns:
        RougeScore with precision, recall, and F-measure. Any ratio whose
        denominator would be zero is reported as 0.0.
    """
    cand_grams = _get_ngrams(candidate_tokens, n)
    ref_grams = _get_ngrams(reference_tokens, n)

    cand_total = sum(cand_grams.values())
    ref_total = sum(ref_grams.values())

    # Neither side has any n-grams of this size: nothing to compare.
    if not (cand_total or ref_total):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

    shared = _ngram_overlap(cand_grams, ref_grams)

    precision = 0.0
    if cand_total > 0:
        precision = shared / cand_total

    recall = 0.0
    if ref_total > 0:
        recall = shared / ref_total

    fmeasure = (
        2 * precision * recall / (precision + recall)
        if precision + recall > 0
        else 0.0
    )

    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
|
||||
|
||||
|
||||
def _lcs_length(seq1: list[str], seq2: list[str]) -> int:
    """
    Return the length of the longest common subsequence of two sequences.

    Dynamic programming over a single row sized by the shorter sequence:
    O(m*n) time and O(min(m,n)) space.
    """
    if not (seq1 and seq2):
        return 0

    # Iterate rows over the longer sequence so the row buffer stays short.
    if len(seq1) >= len(seq2):
        longer, shorter = seq1, seq2
    else:
        longer, shorter = seq2, seq1

    row = [0] * (len(shorter) + 1)

    for item in longer:
        # Value of the previous row at column - 1 (the DP "diagonal").
        diagonal = 0
        for col, other in enumerate(shorter, start=1):
            above = row[col]
            if item == other:
                row[col] = diagonal + 1
            else:
                row[col] = max(above, row[col - 1])
            diagonal = above

    return row[-1]
|
||||
|
||||
|
||||
def _compute_rouge_l(
    candidate_tokens: list[str],
    reference_tokens: list[str],
) -> RougeScore:
    """
    Score candidate against reference with ROUGE-L.

    Precision and recall are the longest-common-subsequence length divided
    by the candidate and reference lengths respectively.

    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.

    Returns:
        RougeScore with precision, recall, and F-measure; all zero when
        either token list is empty.
    """
    # An empty side means there is no subsequence to share.
    if not (candidate_tokens and reference_tokens):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

    shared_length = _lcs_length(candidate_tokens, reference_tokens)

    precision = shared_length / len(candidate_tokens)
    recall = shared_length / len(reference_tokens)

    fmeasure = (
        2 * precision * recall / (precision + recall)
        if precision + recall > 0
        else 0.0
    )

    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
|
||||
|
||||
|
||||
def _max_rouge_scores(scores: list[RougeScore]) -> RougeScore:
    """Return the entry of scores whose F-measure is largest."""

    def by_fmeasure(entry: RougeScore) -> float:
        return entry.fmeasure

    return max(scores, key=by_fmeasure)
|
||||
|
||||
|
||||
class Rouge:
    """
    ROUGE metric for judging summaries and generated text.

    Reports ROUGE-1 (unigram), ROUGE-2 (bigram), and ROUGE-L (LCS) scores.
    ROUGE is recall-oriented: it measures how much of the reference the
    candidate captures.
    """

    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
        """
        Initialise the ROUGE metric.

        Args:
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().
        """
        self._tokeniser = tokeniser or WordTokeniser()

    @property
    def name(self) -> str:
        """Return the name of this metric."""
        return "rouge"

    @property
    def requires_reference(self) -> bool:
        """Return whether this metric requires reference text."""
        return True

    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> RougeResult:
        """
        Score one candidate text against its reference(s).

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison. With several
                references, each variant reports the score with the
                highest F-measure.

        Returns:
            RougeResult with ROUGE-1, ROUGE-2, and ROUGE-L scores.

        Raises:
            ValueError: If reference is None or empty.
        """
        if reference is None:
            raise ValueError("ROUGE requires reference text")

        # Treat a single reference string as a one-element list.
        references = [reference] if isinstance(reference, str) else reference

        candidate_tokens = self._tokeniser.tokenise(candidate)
        reference_token_lists = [self._tokeniser.tokenise(r) for r in references]

        # References that tokenise to nothing cannot be scored against.
        usable_references = [tokens for tokens in reference_token_lists if tokens]
        if not usable_references:
            raise ValueError("Reference text cannot be empty")

        def zero_score() -> RougeScore:
            return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

        # An empty candidate shares nothing with any reference.
        if not candidate_tokens:
            return RougeResult(
                rouge1=zero_score(),
                rouge2=zero_score(),
                rouge_l=zero_score(),
            )

        unigram_scores = [
            _compute_rouge_score(candidate_tokens, tokens, 1)
            for tokens in usable_references
        ]
        bigram_scores = [
            _compute_rouge_score(candidate_tokens, tokens, 2)
            for tokens in usable_references
        ]
        lcs_scores = [
            _compute_rouge_l(candidate_tokens, tokens)
            for tokens in usable_references
        ]

        return RougeResult(
            rouge1=_max_rouge_scores(unigram_scores),
            rouge2=_max_rouge_scores(bigram_scores),
            rouge_l=_max_rouge_scores(lcs_scores),
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[RougeResult]:
        """
        Score several candidates, each against its own reference(s).

        Args:
            candidates: Texts to score.
            references: Reference text(s) for each candidate, in the same
                order as candidates.

        Returns:
            BatchResult with one result per candidate plus aggregate
            statistics for every variant/measure pair.

        Raises:
            ValueError: If references is None or length mismatch.
        """
        if references is None:
            raise ValueError("ROUGE requires reference texts")

        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )

        scored = [
            self.score(text, refs) for text, refs in zip(candidates, references)
        ]

        def summarise(pick):
            # Aggregate one numeric field across all per-candidate results.
            return AggregateStats.from_values([pick(item) for item in scored])

        summary = {
            "rouge1_precision": summarise(lambda r: r.rouge1.precision),
            "rouge1_recall": summarise(lambda r: r.rouge1.recall),
            "rouge1_fmeasure": summarise(lambda r: r.rouge1.fmeasure),
            "rouge2_precision": summarise(lambda r: r.rouge2.precision),
            "rouge2_recall": summarise(lambda r: r.rouge2.recall),
            "rouge2_fmeasure": summarise(lambda r: r.rouge2.fmeasure),
            "rouge_l_precision": summarise(lambda r: r.rouge_l.precision),
            "rouge_l_recall": summarise(lambda r: r.rouge_l.recall),
            "rouge_l_fmeasure": summarise(lambda r: r.rouge_l.fmeasure),
        }

        return BatchResult(results=scored, count=len(scored), stats=summary)
|
||||
274
tests/test_metrics/test_readability.py
Normal file
274
tests/test_metrics/test_readability.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""Tests for the readability metric."""
|
||||
|
||||
import pytest
|
||||
|
||||
from veritext.metrics import Readability, ReadabilityResult
|
||||
|
||||
|
||||
class TestReadability:
    """Tests for the Readability metric class."""

    @staticmethod
    def _expected(words: int, sentences: int, syllables: int) -> tuple[float, float]:
        """
        Return (grade level, reading ease) for the given raw counts.

        Applies the Flesch-Kincaid formulas directly so tests can pin the
        exact scores implied by a text's word, sentence and syllable counts.
        """
        words_per_sentence = words / sentences
        syllables_per_word = syllables / words
        grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
        ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
        return grade, ease

    @pytest.fixture
    def readability(self) -> Readability:
        """Provide a readability metric instance."""
        return Readability()

    def test_name(self, readability: Readability) -> None:
        """Test that name returns 'readability'."""
        assert readability.name == "readability"

    def test_requires_reference(self, readability: Readability) -> None:
        """Test that readability does NOT require reference text."""
        assert readability.requires_reference is False

    def test_simple_text(self, readability: Readability) -> None:
        """Test readability of simple, easy text."""
        # Simple children's text - short sentences, simple words
        text = "The cat sat. The dog ran. I see a bird."
        result = readability.score(text)

        # Should have low grade level and high reading ease
        assert result.flesch_kincaid_grade < 5.0
        assert result.flesch_reading_ease > 80.0

    def test_complex_text(self, readability: Readability) -> None:
        """Test readability of complex, academic text."""
        # Complex academic text - long sentences, polysyllabic words
        text = (
            "The implementation of sophisticated computational methodologies "
            "necessitates comprehensive understanding of algorithmic complexity "
            "and architectural considerations."
        )
        result = readability.score(text)

        # Should have high grade level and low reading ease
        assert result.flesch_kincaid_grade > 12.0
        assert result.flesch_reading_ease < 30.0

    def test_medium_text(self, readability: Readability) -> None:
        """Test readability of medium-difficulty text."""
        text = (
            "The weather today is quite pleasant. "
            "Many people are enjoying the sunshine in the park. "
            "Children play while parents watch nearby."
        )
        result = readability.score(text)

        # Should be middle of the road
        assert 3.0 < result.flesch_kincaid_grade < 10.0
        assert 50.0 < result.flesch_reading_ease < 90.0

    def test_single_sentence(self, readability: Readability) -> None:
        """Test readability with a single sentence."""
        result = readability.score("The cat sat on the mat.")

        # 6 words, 1 sentence, 6 syllables
        grade, ease = self._expected(words=6, sentences=1, syllables=6)
        assert result.flesch_kincaid_grade == pytest.approx(grade)
        assert result.flesch_reading_ease == pytest.approx(ease)

    def test_single_word(self, readability: Readability) -> None:
        """Test readability with a single word."""
        result = readability.score("Cat")

        # 1 word, 1 sentence (no terminator needed), 1 syllable
        grade, ease = self._expected(words=1, sentences=1, syllables=1)
        assert result.flesch_kincaid_grade == pytest.approx(grade)
        assert result.flesch_reading_ease == pytest.approx(ease)

    def test_empty_text(self, readability: Readability) -> None:
        """Test that empty text returns zero scores."""
        result = readability.score("")

        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0

    def test_whitespace_only(self, readability: Readability) -> None:
        """Test that whitespace-only text returns zero scores."""
        result = readability.score("   \t\n  ")

        assert result.flesch_kincaid_grade == 0.0
        assert result.flesch_reading_ease == 0.0

    def test_reference_ignored(self, readability: Readability) -> None:
        """Test that reference parameter is ignored."""
        text = "The cat sat on the mat."

        without_reference = readability.score(text)
        with_string_reference = readability.score(text, "Completely different text")
        with_list_reference = readability.score(text, ["ref1", "ref2"])

        # Every field must match, whichever form the reference takes.
        for other in (with_string_reference, with_list_reference):
            assert other.flesch_kincaid_grade == without_reference.flesch_kincaid_grade
            assert other.flesch_reading_ease == without_reference.flesch_reading_ease

    def test_punctuation_handling(self, readability: Readability) -> None:
        """Test that punctuation affects sentence counting."""
        # Same words, different sentence structure
        one_sentence = readability.score("The cat sat on the mat")
        two_sentences = readability.score("The cat sat. On the mat.")

        # Fewer words per sentence must lower the grade and raise the ease.
        assert two_sentences.flesch_kincaid_grade < one_sentence.flesch_kincaid_grade
        assert two_sentences.flesch_reading_ease > one_sentence.flesch_reading_ease

    def test_question_marks_count_sentences(self, readability: Readability) -> None:
        """Test that question marks end sentences."""
        result = readability.score("What is this? It is a test.")

        # 7 words, 2 sentences, 7 syllables
        grade, ease = self._expected(words=7, sentences=2, syllables=7)
        assert result.flesch_kincaid_grade == pytest.approx(grade)
        assert result.flesch_reading_ease == pytest.approx(ease)

    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
        """Test that exclamation marks end sentences."""
        result = readability.score("Wow! That is amazing!")

        # 4 words, 2 sentences, 6 syllables ("amazing" has 3)
        grade, ease = self._expected(words=4, sentences=2, syllables=6)
        assert result.flesch_kincaid_grade == pytest.approx(grade)
        assert result.flesch_reading_ease == pytest.approx(ease)

    def test_multiple_punctuation(self, readability: Readability) -> None:
        """Test handling of multiple punctuation marks."""
        result = readability.score("What?! That's crazy... Well then.")

        # "?!" and "..." each end one sentence: 5 words, 3 sentences,
        # 6 syllables ("crazy" has 2)
        grade, ease = self._expected(words=5, sentences=3, syllables=6)
        assert result.flesch_kincaid_grade == pytest.approx(grade)
        assert result.flesch_reading_ease == pytest.approx(ease)

    def test_result_score_property(self, readability: Readability) -> None:
        """Test that result.score returns flesch_reading_ease."""
        result = readability.score("The cat sat on the mat.")
        assert result.score == result.flesch_reading_ease

    def test_contractions(self, readability: Readability) -> None:
        """Test handling of contractions."""
        with_apostrophes = readability.score(
            "I'm going to the store. It's not far away."
        )
        without_apostrophes = readability.score(
            "Im going to the store. Its not far away."
        )

        # A contraction must count as one word, so removing the apostrophes
        # cannot change the word count or the scores.
        assert with_apostrophes.flesch_kincaid_grade == pytest.approx(
            without_apostrophes.flesch_kincaid_grade
        )
        assert with_apostrophes.flesch_reading_ease == pytest.approx(
            without_apostrophes.flesch_reading_ease
        )
|
||||
|
||||
|
||||
class TestReadabilityBatch:
    """Batch-scoring tests for the readability metric."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Supply a fresh Readability instance to each test."""
        return Readability()

    def test_batch_score_basic(self, readability: Readability) -> None:
        """A batch of two texts yields two results."""
        batch = readability.batch_score(
            [
                "The cat sat on the mat.",
                "A dog ran through the park.",
            ]
        )

        assert batch.count == 2
        assert len(batch.results) == 2

    def test_batch_score_statistics(self, readability: Readability) -> None:
        """Aggregate statistics exist for both scores."""
        texts = [
            "Cat sat.",  # Very simple
            "The implementation of sophisticated methodologies requires expertise.",
        ]
        batch = readability.batch_score(texts)

        assert "flesch_kincaid_grade" in batch.stats
        assert "flesch_reading_ease" in batch.stats

        # The simple text must read more easily than the dense one.
        simple, dense = batch.results[0], batch.results[1]
        assert simple.flesch_reading_ease > dense.flesch_reading_ease

    def test_batch_score_percentiles(self, readability: Readability) -> None:
        """Aggregate statistics include the expected percentiles."""
        batch = readability.batch_score(["a", "b", "c", "d", "e"])

        ease_stats = batch.stats["flesch_reading_ease"]
        assert 25 in ease_stats.percentiles
        assert 50 in ease_stats.percentiles
        assert 75 in ease_stats.percentiles
        assert 95 in ease_stats.percentiles

    def test_batch_score_references_ignored(self, readability: Readability) -> None:
        """Passing references does not change batch results."""
        texts = ["The cat sat.", "A dog ran."]

        plain = readability.batch_score(texts)
        with_refs = readability.batch_score(texts, ["ref1", "ref2"])

        assert (
            plain.results[0].flesch_kincaid_grade
            == with_refs.results[0].flesch_kincaid_grade
        )

    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
        """An empty candidate list is rejected with ValueError."""
        with pytest.raises(ValueError, match="empty"):
            readability.batch_score([])
|
||||
|
||||
|
||||
class TestReadabilityResult:
    """Tests for the ReadabilityResult model itself."""

    def test_frozen(self) -> None:
        """Assigning to a field of a built result is rejected."""
        from pydantic import ValidationError

        frozen_result = ReadabilityResult(
            flesch_kincaid_grade=5.0,
            flesch_reading_ease=70.0,
        )
        with pytest.raises(ValidationError):
            frozen_result.flesch_kincaid_grade = 6.0  # type: ignore[misc]

    def test_values(self) -> None:
        """Constructor arguments are readable back unchanged."""
        stored = ReadabilityResult(
            flesch_kincaid_grade=8.5,
            flesch_reading_ease=65.0,
        )
        assert stored.flesch_kincaid_grade == 8.5
        assert stored.flesch_reading_ease == 65.0

    def test_score_property(self) -> None:
        """The score property mirrors flesch_reading_ease."""
        stored = ReadabilityResult(
            flesch_kincaid_grade=8.5,
            flesch_reading_ease=65.0,
        )
        assert stored.score == 65.0
|
||||
|
||||
|
||||
class TestSyllableCounting:
    """Checks the syllable heuristics through their effect on scores."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Supply a fresh Readability instance to each test."""
        return Readability()

    def test_monosyllabic_words(self, readability: Readability) -> None:
        """One-syllable words keep the reading ease very high."""
        # Every word here has a single syllable.
        outcome = readability.score("The cat sat on the mat.")

        assert outcome.flesch_reading_ease > 90.0

    def test_polysyllabic_words(self, readability: Readability) -> None:
        """Long, many-syllable words pull the reading ease down."""
        outcome = readability.score(
            "International communication facilitates understanding."
        )

        assert outcome.flesch_reading_ease < 50.0
|
||||
295
tests/test_metrics/test_rouge.py
Normal file
295
tests/test_metrics/test_rouge.py
Normal file
@@ -0,0 +1,295 @@
|
||||
"""Tests for the ROUGE metric."""
|
||||
|
||||
import pytest
|
||||
|
||||
from veritext.metrics import Rouge, RougeResult, RougeScore
|
||||
|
||||
|
||||
class TestRouge:
    """Tests for the Rouge metric class.

    Covers the metric metadata (``name``, ``requires_reference``), the
    ROUGE-1 / ROUGE-2 / ROUGE-L precision, recall and F-measure values for
    single and multiple references, and the handling of empty or missing
    inputs.
    """

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Provide a ROUGE metric instance."""
        return Rouge()

    def test_name(self, rouge: Rouge) -> None:
        """Test that name returns 'rouge'."""
        assert rouge.name == "rouge"

    def test_requires_reference(self, rouge: Rouge) -> None:
        """Test that ROUGE requires reference text."""
        assert rouge.requires_reference is True

    def test_identical_texts(self, rouge: Rouge) -> None:
        """Test that identical texts produce perfect scores."""
        text = "The cat sat on the mat"
        result = rouge.score(text, text)

        assert result.rouge1.precision == 1.0
        assert result.rouge1.recall == 1.0
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge2.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_no_overlap(self, rouge: Rouge) -> None:
        """Test that texts with no overlap produce zero scores."""
        candidate = "apple banana cherry"
        reference = "dog elephant fox"
        result = rouge.score(candidate, reference)

        assert result.rouge1.precision == 0.0
        assert result.rouge1.recall == 0.0
        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
        """Test ROUGE-1 with partial overlap."""
        candidate = "the cat sat"
        reference = "the dog sat"
        result = rouge.score(candidate, reference)

        # Candidate: {the, cat, sat}, Reference: {the, dog, sat}
        # Overlap: {the, sat} = 2
        # Precision = 2/3, Recall = 2/3
        assert abs(result.rouge1.precision - 2 / 3) < 1e-10
        assert abs(result.rouge1.recall - 2 / 3) < 1e-10

    def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
        """Test ROUGE-2 (bigram) with partial overlap."""
        candidate = "the cat sat on the mat"
        reference = "the cat lay on the mat"
        result = rouge.score(candidate, reference)

        # Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
        # Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
        # Overlap: (the, cat), (on, the), (the, mat) = 3
        # Precision = 3/5, Recall = 3/5
        assert abs(result.rouge2.precision - 3 / 5) < 1e-10
        assert abs(result.rouge2.recall - 3 / 5) < 1e-10

    def test_rouge_l_basic(self, rouge: Rouge) -> None:
        """Test ROUGE-L (LCS) computation."""
        candidate = "the cat sat on the mat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)

        # LCS = "the cat sat" = 3 tokens
        # Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.5
        assert result.rouge_l.recall == 1.0

    def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
        """Test ROUGE-L with non-contiguous LCS."""
        candidate = "the big cat sat"
        reference = "the cat sat"
        result = rouge.score(candidate, reference)

        # LCS = "the cat sat" = 3 (skipping "big")
        # Precision = 3/4, Recall = 3/3 = 1.0
        assert result.rouge_l.precision == 0.75
        assert result.rouge_l.recall == 1.0

    def test_precision_vs_recall(self, rouge: Rouge) -> None:
        """Test that precision and recall differ appropriately."""
        # Short candidate, long reference
        candidate = "the cat"
        reference = "the cat sat on the mat"
        result = rouge.score(candidate, reference)

        # Precision should be high (all candidate tokens in reference)
        assert result.rouge1.precision == 1.0
        # Recall should be lower (not all reference tokens in candidate)
        assert result.rouge1.recall < 1.0

    def test_empty_candidate(self, rouge: Rouge) -> None:
        """Test that empty candidate returns zero scores."""
        result = rouge.score("", "The cat sat")

        assert result.rouge1.fmeasure == 0.0
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
        """Test that whitespace-only candidate returns zero scores."""
        result = rouge.score(" \t\n ", "The cat sat")

        assert result.rouge1.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 0.0

    def test_empty_reference_raises(self, rouge: Rouge) -> None:
        """Test that empty reference raises ValueError."""
        with pytest.raises(ValueError, match="cannot be empty"):
            rouge.score("The cat sat", "")

    def test_none_reference_raises(self, rouge: Rouge) -> None:
        """Test that None reference raises ValueError."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.score("The cat sat", None)

    def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
        """Test that multiple references use max scores."""
        candidate = "the cat sat on the mat"
        references = [
            "a dog ran across the room",  # Low overlap
            "the cat sat on the mat",  # Exact match
        ]
        result = rouge.score(candidate, references)

        # Should get perfect scores due to exact match
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_multiple_references_partial(self, rouge: Rouge) -> None:
        """Test that the best partial match among several references wins."""
        candidate = "the quick brown fox"
        references = [
            "the fast brown fox",  # shares the, brown, fox -> P = R = 3/4
            "a quick brown dog",  # shares quick, brown -> P = R = 2/4
        ]
        result = rouge.score(candidate, references)

        # The first reference is the better match, so the reported ROUGE-1
        # F-measure must equal its score (0.75). A bare "> 0.0" check would
        # also pass if the weaker reference (0.5) had been used instead.
        assert abs(result.rouge1.fmeasure - 0.75) < 1e-10

    def test_result_score_property(self, rouge: Rouge) -> None:
        """Test that result.score returns rouge_l.fmeasure."""
        result = rouge.score("The cat sat", "The cat sat")
        assert result.score == result.rouge_l.fmeasure

    def test_case_insensitivity(self, rouge: Rouge) -> None:
        """Test that ROUGE is case insensitive by default."""
        result = rouge.score("THE CAT SAT", "the cat sat")
        assert result.rouge1.fmeasure == 1.0
        assert result.rouge_l.fmeasure == 1.0

    def test_punctuation_ignored(self, rouge: Rouge) -> None:
        """Test that punctuation is ignored by default."""
        result = rouge.score("The cat sat.", "The cat sat!")
        assert result.rouge1.fmeasure == 1.0

    def test_single_word(self, rouge: Rouge) -> None:
        """Test ROUGE with single word texts."""
        result = rouge.score("cat", "cat")

        assert result.rouge1.fmeasure == 1.0
        # ROUGE-2 should be 0 for single words (no bigrams)
        assert result.rouge2.fmeasure == 0.0
        assert result.rouge_l.fmeasure == 1.0

    def test_fmeasure_calculation(self, rouge: Rouge) -> None:
        """Test that F-measure is calculated correctly."""
        # Create a case where P != R
        candidate = "the cat sat on"
        reference = "the cat"
        result = rouge.score(candidate, reference)

        # P = 2/4 = 0.5, R = 2/2 = 1.0
        # F = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 = 2/3
        expected_f = 2 * 0.5 * 1.0 / (0.5 + 1.0)
        assert abs(result.rouge1.fmeasure - expected_f) < 1e-10
|
||||
|
||||
|
||||
class TestRougeBatch:
    """Batch-scoring tests for the ROUGE metric."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Return a fresh ``Rouge`` instance for each test."""
        return Rouge()

    def test_batch_score_basic(self, rouge: Rouge) -> None:
        """Identical candidate/reference pairs all get a perfect ROUGE-L F-measure."""
        candidates = ["The cat sat", "A dog runs"]
        references = list(candidates)

        batch = rouge.batch_score(candidates, references)

        assert batch.count == 2
        assert len(batch.results) == 2
        for item in batch.results:
            assert item.rouge_l.fmeasure == 1.0

    def test_batch_score_statistics(self, rouge: Rouge) -> None:
        """Aggregate statistics are reported for each expected ROUGE key."""
        batch = rouge.batch_score(
            ["The cat sat", "Completely different words"],
            ["The cat sat", "The cat sat"],
        )

        # Every one of these keys must be present in the aggregate stats.
        expected_keys = (
            "rouge1_fmeasure",
            "rouge2_fmeasure",
            "rouge_l_fmeasure",
            "rouge1_precision",
            "rouge1_recall",
        )
        for key in expected_keys:
            assert key in batch.stats

        # The first pair is an exact match; the second shares no tokens.
        matching, disjoint = batch.results[0], batch.results[1]
        assert matching.rouge1.fmeasure == 1.0
        assert disjoint.rouge1.fmeasure == 0.0

    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
        """The 25th, 50th, 75th and 95th percentiles are included in the stats."""
        candidates = ["a", "b", "c", "d", "e"]
        references = list(candidates)

        batch = rouge.batch_score(candidates, references)

        f1_stats = batch.stats["rouge1_fmeasure"]
        for percentile in (25, 50, 75, 95):
            assert percentile in f1_stats.percentiles

    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
        """Passing ``None`` as the references raises ``ValueError``."""
        with pytest.raises(ValueError, match="requires reference"):
            rouge.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
        """Candidate and reference lists of different lengths raise ``ValueError``."""
        with pytest.raises(ValueError, match="must match"):
            rouge.batch_score(["a", "b"], ["a"])

    def test_batch_score_with_multiple_references(self, rouge: Rouge) -> None:
        """Each candidate may be scored against its own list of references."""
        first_candidate = "The cat sat on the mat"
        second_candidate = "A quick brown fox"
        reference_sets = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown fox", "The fast brown fox"],
        ]

        batch = rouge.batch_score([first_candidate, second_candidate], reference_sets)

        assert batch.count == 2
        # Each reference set contains an exact copy of its candidate.
        first_result, second_result = batch.results[0], batch.results[1]
        assert first_result.rouge_l.fmeasure == 1.0
        assert second_result.rouge_l.fmeasure == 1.0
|
||||
|
||||
|
||||
class TestRougeResult:
    """Tests for the ``RougeResult`` and ``RougeScore`` result types."""

    def test_rouge_score_frozen(self) -> None:
        """Assigning to a ``RougeScore`` field raises ``ValidationError``."""
        from pydantic import ValidationError

        frozen_score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)

        with pytest.raises(ValidationError):
            frozen_score.precision = 0.7  # type: ignore[misc]

    def test_rouge_result_frozen(self) -> None:
        """Assigning to a ``RougeResult`` field raises ``ValidationError``."""
        from pydantic import ValidationError

        shared = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        frozen_result = RougeResult(rouge1=shared, rouge2=shared, rouge_l=shared)

        with pytest.raises(ValidationError):
            frozen_result.rouge1 = shared  # type: ignore[misc]

    def test_score_property(self) -> None:
        """``RougeResult.score`` reports the ROUGE-L F-measure."""
        # Give each variant a distinct value so the property's source is unambiguous.
        unigram, bigram, lcs = (
            RougeScore(precision=value, recall=value, fmeasure=value)
            for value in (0.9, 0.8, 0.7)
        )

        combined = RougeResult(rouge1=unigram, rouge2=bigram, rouge_l=lcs)

        assert combined.score == 0.7
|
||||
Reference in New Issue
Block a user