feat(metrics): add readability implementation
This commit is contained in:
195
src/veritext/metrics/readability.py
Normal file
195
src/veritext/metrics/readability.py
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
"""Readability metrics implementation (Flesch-Kincaid)."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from veritext.metrics.base import AggregateStats, BatchResult
|
||||||
|
from veritext.metrics.results import ReadabilityResult
|
||||||
|
|
||||||
|
# Sentence-ending punctuation pattern
|
||||||
|
_SENTENCE_ENDINGS = re.compile(r"[.!?]+")
|
||||||
|
|
||||||
|
# Vowel pattern for syllable counting
|
||||||
|
_VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _count_syllables(word: str) -> int:
    """
    Count syllables in a word using a heuristic approach.

    Counts groups of consecutive vowels (``_VOWELS``) and then applies
    adjustments for three common English spelling patterns: a silent
    trailing "e", a syllabic consonant + "le" ending, and a silent "-ed"
    suffix.

    Args:
        word: The word to count syllables for.

    Returns:
        Estimated syllable count (minimum 1 for non-empty words, 0 for an
        empty or whitespace-only string).
    """
    if not word:
        return 0

    word = word.lower().strip()
    if not word:
        return 0

    # Each run of adjacent vowels approximates one syllable nucleus.
    count = len(_VOWELS.findall(word))

    # A trailing "e" is usually silent ("make", "note"); never drop the only
    # vowel group of the word ("the", "be").
    if word.endswith("e") and count > 1:
        count -= 1

    # Consonant + "le" is pronounced as its own syllable ("table", "able"),
    # so restore the syllable removed by the silent-e rule above.
    if word.endswith("le") and len(word) > 2 and word[-3] not in "aeiouy":
        count += 1

    # "-ed" is silent after most consonants ("jumped", "called") but voiced
    # after "d"/"t" ("wanted", "needed"). When a vowel precedes it, the "e"
    # is already merged into a larger vowel group ("agreed", "enjoyed",
    # "carried"), so there is no separate group to remove and subtracting
    # would undercount the word.
    if word.endswith("ed") and len(word) > 2 and word[-3] not in "aeiouydt":
        count = max(count - 1, 1)

    # Ensure at least 1 syllable for any word (e.g. vowel-less tokens).
    return max(count, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def _count_sentences(text: str) -> int:
    """
    Count sentences in text.

    Splits on runs of sentence-ending punctuation (``_SENTENCE_ENDINGS``)
    and counts the resulting fragments that contain at least one letter or
    digit.

    Args:
        text: The text to count sentences in.

    Returns:
        Number of sentences (minimum 1 for non-empty text, 0 for empty or
        whitespace-only text).
    """
    if not text or not text.strip():
        return 0

    fragments = _SENTENCE_ENDINGS.split(text)

    # A fragment made only of quotes, brackets or whitespace is the tail of
    # the previous sentence (e.g. the closing quote in '"Really?"'), not a
    # sentence of its own, so require at least one alphanumeric character.
    sentence_total = sum(
        1 for fragment in fragments if any(ch.isalnum() for ch in fragment)
    )

    # Non-empty text always counts as at least one sentence, even when it
    # holds no terminator or no alphanumeric content at all.
    return max(sentence_total, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def _count_words(text: str) -> tuple[list[str], int]:
|
||||||
|
"""
|
||||||
|
Extract words from text and count them.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: The text to process.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (word list, word count).
|
||||||
|
"""
|
||||||
|
# Extract words (sequences of letters and apostrophes)
|
||||||
|
words = re.findall(r"[a-zA-Z']+", text)
|
||||||
|
# Filter out standalone apostrophes
|
||||||
|
words = [w for w in words if w.replace("'", "")]
|
||||||
|
return words, len(words)
|
||||||
|
|
||||||
|
|
||||||
|
class Readability:
    """
    Reference-free readability metric based on the Flesch-Kincaid formulas.

    Two scores are produced for every text:
    - Flesch-Kincaid Grade Level: US grade level required to understand text
    - Flesch Reading Ease: Score from 0-100 (higher = easier to read)

    No reference text is needed; any reference passed in is ignored.
    """

    @property
    def name(self) -> str:
        """Identifier of this metric."""
        return "readability"

    @property
    def requires_reference(self) -> bool:
        """Whether a reference text is needed (never, for readability)."""
        return False

    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,  # noqa: ARG002
    ) -> ReadabilityResult:
        """
        Score a single text for readability.

        Args:
            candidate: The text to score.
            reference: Ignored (readability doesn't use reference text).

        Returns:
            ReadabilityResult with the Flesch-Kincaid grade level and the
            Flesch reading ease. Both are 0.0 when the text has no words.
        """
        tokens, n_words = _count_words(candidate)

        # Nothing to measure: bail out before any division takes place.
        if n_words == 0:
            return ReadabilityResult(
                flesch_kincaid_grade=0.0,
                flesch_reading_ease=0.0,
            )

        n_sentences = _count_sentences(candidate)
        n_syllables = sum(_count_syllables(token) for token in tokens)

        avg_sentence_length = n_words / n_sentences
        avg_syllables = n_syllables / n_words

        # Grade level: 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
        # Reading ease: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
        return ReadabilityResult(
            flesch_kincaid_grade=(
                0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
            ),
            flesch_reading_ease=(
                206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables
            ),
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
    ) -> BatchResult[ReadabilityResult]:
        """
        Score several texts and summarise the scores.

        Args:
            candidates: List of texts to score.
            references: Ignored (readability doesn't use reference text).

        Returns:
            BatchResult containing individual results and aggregate statistics.

        Raises:
            ValueError: If ``candidates`` is empty.
        """
        if not candidates:
            raise ValueError("Cannot compute batch statistics from empty list")

        per_text = [self.score(text) for text in candidates]

        # One aggregate per score field, keyed by the field's name.
        score_fields = ("flesch_kincaid_grade", "flesch_reading_ease")
        summary = {
            field: AggregateStats.from_values(
                [getattr(result, field) for result in per_text]
            )
            for field in score_fields
        }

        return BatchResult(results=per_text, count=len(per_text), stats=summary)
|
||||||
Reference in New Issue
Block a user