constraint validators (length, regex, contains)

Implement LengthValidator, ReadabilityValidator, ContainsValidator, and
ExcludesValidator for text constraints without reference text.
This commit is contained in:
2025-03-26 18:06:03 +00:00
parent 3ef262d357
commit 067cd74566

View File

@@ -0,0 +1,349 @@
"""Constraint validators that do not require reference text."""
import re
from veritext.core.exceptions import InvalidThresholdError
from veritext.core.tokenisation import WordTokeniser
from veritext.core.types import CheckResult, ValidationContext
from veritext.metrics.readability import Readability
class LengthValidator:
    """Validate that text length falls within character and/or word bounds."""

    def __init__(
        self,
        min_chars: int | None = None,
        max_chars: int | None = None,
        min_words: int | None = None,
        max_words: int | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the length validator.

        Args:
            min_chars: Minimum character count (inclusive).
            max_chars: Maximum character count (inclusive).
            min_words: Minimum word count (inclusive).
            max_words: Maximum word count (inclusive).
            tokeniser: Tokeniser to use for word counting. Defaults to WordTokeniser().

        Raises:
            InvalidThresholdError: If no constraint is provided, any bound is
                negative, or a minimum exceeds its matching maximum.
        """
        bounds = {
            "min_chars": min_chars,
            "max_chars": max_chars,
            "min_words": min_words,
            "max_words": max_words,
        }
        if all(value is None for value in bounds.values()):
            raise InvalidThresholdError("At least one length constraint must be set")

        # Dict order matches the parameter order, so the first offending bound
        # reported is the same one the individual checks would have reported.
        for label, value in bounds.items():
            if value is not None and value < 0:
                raise InvalidThresholdError(f"{label} must be >= 0, got {value}")

        if min_chars is not None and max_chars is not None and min_chars > max_chars:
            raise InvalidThresholdError(
                f"min_chars ({min_chars}) cannot exceed max_chars ({max_chars})"
            )
        if min_words is not None and max_words is not None and min_words > max_words:
            raise InvalidThresholdError(
                f"min_words ({min_words}) cannot exceed max_words ({max_words})"
            )

        self._min_chars = min_chars
        self._max_chars = max_chars
        self._min_words = min_words
        self._max_words = max_words
        # Compare against None explicitly: a caller-supplied tokeniser that
        # happens to evaluate falsy must not be silently replaced by the default.
        self._tokeniser = tokeniser if tokeniser is not None else WordTokeniser()

    @property
    def name(self) -> str:
        """Identifier reported in the ``name`` field of results."""
        return "length"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the length check.

        Args:
            text: The text to validate.
            context: Validation context (not used for length checks).

        Returns:
            CheckResult with pass/fail status, the measured character and word
            counts, and the configured bounds.
        """
        char_count = len(text)
        word_count = len(self._tokeniser.tokenise(text))

        failures = []
        if self._min_chars is not None and char_count < self._min_chars:
            failures.append(f"{char_count} chars < min {self._min_chars}")
        if self._max_chars is not None and char_count > self._max_chars:
            failures.append(f"{char_count} chars > max {self._max_chars}")
        if self._min_words is not None and word_count < self._min_words:
            failures.append(f"{word_count} words < min {self._min_words}")
        if self._max_words is not None and word_count > self._max_words:
            failures.append(f"{word_count} words > max {self._max_words}")

        if failures:
            message = "Length check failed: " + "; ".join(failures)
        else:
            message = f"Length check passed: {char_count} chars, {word_count} words"

        # Only bounds that were actually configured are reported back.
        configured = (
            ("min_chars", self._min_chars),
            ("max_chars", self._max_chars),
            ("min_words", self._min_words),
            ("max_words", self._max_words),
        )
        threshold = {key: bound for key, bound in configured if bound is not None}

        return CheckResult(
            name=self.name,
            passed=not failures,
            actual={"chars": char_count, "words": word_count},
            threshold=threshold,
            message=message,
        )
class ReadabilityValidator:
    """Check text against Flesch-Kincaid grade and Flesch Reading Ease limits."""

    def __init__(
        self,
        max_grade: float | None = None,
        min_ease: float | None = None,
    ) -> None:
        """
        Initialise the readability validator.

        Args:
            max_grade: Highest Flesch-Kincaid grade level that still passes.
            min_ease: Lowest Flesch Reading Ease score that still passes.

        Raises:
            InvalidThresholdError: If neither limit is supplied.
        """
        limits = (max_grade, min_ease)
        if all(limit is None for limit in limits):
            raise InvalidThresholdError(
                "At least one of max_grade or min_ease must be provided"
            )
        self._max_grade = max_grade
        self._min_ease = min_ease
        self._metric = Readability()

    @property
    def name(self) -> str:
        """Identifier reported in the ``name`` field of results."""
        return "readability"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the readability check.

        Args:
            text: The text to validate.
            context: Validation context (not used for readability checks).

        Returns:
            CheckResult with pass/fail status, the measured grade and ease
            scores, and the configured limits.
        """
        scores = self._metric.score(text)
        grade = scores.flesch_kincaid_grade
        ease = scores.flesch_reading_ease
        grade_limit = self._max_grade
        ease_limit = self._min_ease

        over_grade = grade_limit is not None and grade > grade_limit
        under_ease = ease_limit is not None and ease < ease_limit
        passed = not (over_grade or under_ease)

        if passed:
            # Summarise every configured limit that was satisfied.
            summary = []
            if grade_limit is not None:
                summary.append(f"grade {grade:.1f} <= {grade_limit:.1f}")
            if ease_limit is not None:
                summary.append(f"ease {ease:.1f} >= {ease_limit:.1f}")
            message = "Readability: " + ", ".join(summary)
        else:
            # List only the limits that were violated.
            problems = []
            if over_grade:
                problems.append(f"grade level {grade:.1f} > max {grade_limit:.1f}")
            if under_ease:
                problems.append(f"reading ease {ease:.1f} < min {ease_limit:.1f}")
            message = "Readability: " + "; ".join(problems)

        configured = (("max_grade", grade_limit), ("min_ease", ease_limit))
        threshold = {key: limit for key, limit in configured if limit is not None}

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"grade": grade, "ease": ease},
            threshold=threshold,
            message=message,
        )
def _compile_patterns(patterns: list[str], flags: int) -> list[re.Pattern[str]]:
    """
    Compile each pattern string with the given regex flags.

    Args:
        patterns: Regular-expression source strings, compiled in order.
        flags: ``re`` flag bitmask applied to every pattern.

    Returns:
        Compiled patterns in the same order as ``patterns``.

    Raises:
        InvalidThresholdError: If ``re.compile`` rejects any pattern.
    """
    result: list[re.Pattern[str]] = []
    for source in patterns:
        try:
            regex = re.compile(source, flags)
        except re.error as exc:
            raise InvalidThresholdError(
                f"Invalid regex pattern '{source}': {exc}"
            ) from exc
        result.append(regex)
    return result
class ContainsValidator:
    """Validates that text matches every required pattern."""

    def __init__(
        self,
        patterns: list[str],
        case_sensitive: bool = False,
    ) -> None:
        """
        Initialise the contains validator.

        Args:
            patterns: Patterns that must all be present. Every entry is compiled
                as a regular expression, so a literal substring containing regex
                metacharacters must be escaped first (e.g. with ``re.escape``).
            case_sensitive: Whether matching is case-sensitive. Defaults to False.

        Raises:
            InvalidThresholdError: If patterns list is empty or contains invalid regex.
        """
        if not patterns:
            raise InvalidThresholdError("patterns list cannot be empty")
        # Snapshot the caller's list: if it were stored by reference, a later
        # mutation would leave it out of step with the compiled patterns and
        # make the strict zip in check() raise ValueError.
        self._patterns = list(patterns)
        self._case_sensitive = case_sensitive
        self._flags = 0 if case_sensitive else re.IGNORECASE
        self._compiled_patterns = _compile_patterns(self._patterns, self._flags)

    @property
    def name(self) -> str:
        """Identifier reported in the ``name`` field of results."""
        return "contains"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the contains check.

        Args:
            text: The text to validate.
            context: Validation context (not used for contains checks).

        Returns:
            CheckResult with pass/fail status, the number of patterns found,
            and the source strings of any patterns that did not match.
        """
        missing = [
            pattern
            for pattern, compiled in zip(
                self._patterns, self._compiled_patterns, strict=True
            )
            if not compiled.search(text)
        ]
        passed = not missing
        if passed:
            message = f"Text contains all {len(self._patterns)} required pattern(s)"
        else:
            message = f"Text missing {len(missing)} pattern(s): {missing}"
        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"found": len(self._patterns) - len(missing), "missing": missing},
            # Hand out a copy so consumers of the result cannot mutate the
            # validator's own pattern list.
            threshold={"patterns": list(self._patterns)},
            message=message,
        )
class ExcludesValidator:
    """Validates that text matches none of the forbidden patterns."""

    def __init__(
        self,
        patterns: list[str],
        case_sensitive: bool = False,
    ) -> None:
        """
        Initialise the excludes validator.

        Args:
            patterns: Patterns that must not be present. Every entry is compiled
                as a regular expression, so a literal substring containing regex
                metacharacters must be escaped first (e.g. with ``re.escape``).
            case_sensitive: Whether matching is case-sensitive. Defaults to False.

        Raises:
            InvalidThresholdError: If patterns list is empty or contains invalid regex.
        """
        if not patterns:
            raise InvalidThresholdError("patterns list cannot be empty")
        # Snapshot the caller's list: if it were stored by reference, a later
        # mutation would leave it out of step with the compiled patterns and
        # make the strict zip in check() raise ValueError.
        self._patterns = list(patterns)
        self._case_sensitive = case_sensitive
        self._flags = 0 if case_sensitive else re.IGNORECASE
        self._compiled_patterns = _compile_patterns(self._patterns, self._flags)

    @property
    def name(self) -> str:
        """Identifier reported in the ``name`` field of results."""
        return "excludes"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the excludes check.

        Args:
            text: The text to validate.
            context: Validation context (not used for excludes checks).

        Returns:
            CheckResult with pass/fail status, the number of patterns that were
            absent, and the source strings of any forbidden patterns that matched.
        """
        found = [
            pattern
            for pattern, compiled in zip(
                self._patterns, self._compiled_patterns, strict=True
            )
            if compiled.search(text)
        ]
        passed = not found
        if passed:
            message = f"Text excludes all {len(self._patterns)} forbidden pattern(s)"
        else:
            message = f"Text contains {len(found)} forbidden pattern(s): {found}"
        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"excluded": len(self._patterns) - len(found), "found": found},
            # Hand out a copy so consumers of the result cannot mutate the
            # validator's own pattern list.
            threshold={"patterns": list(self._patterns)},
            message=message,
        )