# Source: git patch 2ef8265754e8b99eb30d425b5def0fd2af4d34e3
# Author: Kai Chappell, Sun, 16 Mar 2025 16:03:32 +0000
# Subject: [PATCH] readability metrics (flesch, gunning fog, etc)
# Target: src/veritext/metrics/readability.py (new file)
"""Readability metrics implementation (Flesch-Kincaid)."""

import re

from veritext.metrics.base import AggregateStats, BatchResult
from veritext.metrics.results import ReadabilityResult

# A run of terminators ("...", "?!") counts as a single sentence break.
_SENTENCE_ENDINGS = re.compile(r"[.!?]+")
# Each run of adjacent vowels (with "y") approximates one syllable nucleus.
_VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)
# Words are runs of ASCII letters and apostrophes; digits never form words.
_WORDS = re.compile(r"[a-zA-Z']+")

_VOWEL_LETTERS = "aeiouy"
# In "-gue"/"-que" endings ("league", "unique") the trailing "ue" is silent.
_SILENT_UE_PREFIXES = ("gu", "qu")

# Flesch-Kincaid Grade Level coefficients.
FK_GRADE_WORDS_PER_SENTENCE = 0.39
FK_GRADE_SYLLABLES_PER_WORD = 11.8
FK_GRADE_CONSTANT = 15.59

# Flesch Reading Ease coefficients.
FRE_CONSTANT = 206.835
FRE_WORDS_PER_SENTENCE = 1.015
FRE_SYLLABLES_PER_WORD = 84.6


def _is_silent_e(word: str, index: int) -> bool:
    """Guess whether the "e" at ``word[index]`` is silent.

    An "e" is treated as silent when it directly follows a non-vowel
    ("make", "jumped") or closes a "gue"/"que" cluster ("league",
    "critiqued"). After any other vowel it belongs to a voiced vowel group
    ("agree", "value", "studied") and must not be discounted.

    Args:
        word: Lower-cased word being analysed.
        index: Position of the candidate "e" inside ``word``.

    Returns:
        True if the heuristic considers that "e" silent.
    """
    if index < 1:
        return False
    if word[index - 1] not in _VOWEL_LETTERS:
        return True
    return word[max(index - 2, 0):index] in _SILENT_UE_PREFIXES


def _count_syllables(word: str) -> int:
    """Estimate the number of syllables in a single word.

    The estimate counts vowel groups and then adjusts for common English
    spelling patterns (silent final "e", syllabic "-le", silent "-ed").
    It is a heuristic, not a dictionary lookup.

    Args:
        word: The word to analyse; case and surrounding whitespace are ignored.

    Returns:
        0 for an empty/blank word, otherwise an estimate of at least 1.
    """
    word = word.lower().strip() if word else ""
    if not word:
        return 0

    count = len(_VOWELS.findall(word))
    last = len(word) - 1

    # Silent final "e" ("make"). With two or more vowel groups the word has
    # at least three characters, so looking back from `last` is safe.
    if word.endswith("e") and count > 1 and _is_silent_e(word, last):
        count -= 1

    # Consonant + "le" is its own syllable ("ap-ple"); this restores the
    # syllable removed by the silent-"e" rule above.
    if word.endswith("le") and len(word) > 2 and word[-3] not in _VOWEL_LETTERS:
        count += 1

    # "-ed" is voiced after "d"/"t" ("wanted") and otherwise usually silent
    # ("jumped"), unless its "e" is part of a voiced vowel group ("agreed").
    if (
        word.endswith("ed")
        and len(word) > 2
        and word[-3] not in "dt"
        and _is_silent_e(word, last - 1)
    ):
        count = max(count - 1, 1)

    return max(count, 1)


def _count_sentences(text: str) -> int:
    """Count sentences by splitting on runs of ".", "!" and "?".

    Args:
        text: The text to analyse.

    Returns:
        0 for empty or whitespace-only text; otherwise the number of
        non-blank fragments between terminators, with a minimum of 1.
    """
    if not text or not text.strip():
        return 0

    fragments = [part for part in _SENTENCE_ENDINGS.split(text) if part.strip()]
    return max(len(fragments), 1)


def _count_words(text: str) -> tuple[list[str], int]:
    """Extract the words of a text.

    Args:
        text: The text to analyse.

    Returns:
        A ``(words, count)`` tuple. Tokens made only of apostrophes are
        discarded.
    """
    words = [token for token in _WORDS.findall(text) if token.replace("'", "")]
    return words, len(words)


class Readability:
    """
    Readability metric using Flesch-Kincaid formulas.

    Computes:
    - Flesch-Kincaid Grade Level: US grade level required to understand text
    - Flesch Reading Ease: higher = easier to read. Typical prose scores
      between 0 and 100, but the value is not clamped, so very simple text
      can exceed 100 and very dense text can go below 0.

    This metric does NOT require reference text.
    """

    @property
    def name(self) -> str:
        """Identifier of this metric."""
        return "readability"

    @property
    def requires_reference(self) -> bool:
        """Whether scoring needs reference text (it does not)."""
        return False

    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,  # noqa: ARG002
    ) -> ReadabilityResult:
        """
        Compute readability scores for a text.

        Args:
            candidate: The text to score.
            reference: Ignored (readability doesn't use reference text).

        Returns:
            ReadabilityResult with Flesch-Kincaid scores. Text containing no
            words yields 0.0 for both scores.
        """
        words, word_count = _count_words(candidate)

        if word_count == 0:
            return ReadabilityResult(
                flesch_kincaid_grade=0.0,
                flesch_reading_ease=0.0,
            )

        # Text with at least one word is non-blank, so this is always >= 1
        # and the divisions below cannot fail.
        sentence_count = _count_sentences(candidate)
        syllable_count = sum(_count_syllables(word) for word in words)

        words_per_sentence = word_count / sentence_count
        syllables_per_word = syllable_count / word_count

        grade_level = (
            FK_GRADE_WORDS_PER_SENTENCE * words_per_sentence
            + FK_GRADE_SYLLABLES_PER_WORD * syllables_per_word
            - FK_GRADE_CONSTANT
        )

        reading_ease = (
            FRE_CONSTANT
            - FRE_WORDS_PER_SENTENCE * words_per_sentence
            - FRE_SYLLABLES_PER_WORD * syllables_per_word
        )

        return ReadabilityResult(
            flesch_kincaid_grade=grade_level,
            flesch_reading_ease=reading_ease,
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
    ) -> BatchResult[ReadabilityResult]:
        """
        Compute readability scores for a batch of texts.

        Args:
            candidates: List of texts to score.
            references: Ignored (readability doesn't use reference text).

        Returns:
            BatchResult containing individual results and aggregate statistics.

        Raises:
            ValueError: If ``candidates`` is empty.
        """
        if not candidates:
            raise ValueError("Cannot compute batch statistics from empty list")

        results: list[ReadabilityResult] = [self.score(text) for text in candidates]

        stats = {
            "flesch_kincaid_grade": AggregateStats.from_values(
                [r.flesch_kincaid_grade for r in results]
            ),
            "flesch_reading_ease": AggregateStats.from_values(
                [r.flesch_reading_ease for r in results]
            ),
        }

        return BatchResult(results=results, count=len(results), stats=stats)