constraint validators (length, regex, contains)
Implement LengthValidator, ReadabilityValidator, ContainsValidator, and ExcludesValidator for text constraints without reference text.
This commit is contained in:
349
src/veritext/validators/constraint.py
Normal file
349
src/veritext/validators/constraint.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""Constraint validators that do not require reference text."""
|
||||
|
||||
import re
|
||||
|
||||
from veritext.core.exceptions import InvalidThresholdError
|
||||
from veritext.core.tokenisation import WordTokeniser
|
||||
from veritext.core.types import CheckResult, ValidationContext
|
||||
from veritext.metrics.readability import Readability
|
||||
|
||||
|
||||
class LengthValidator:
    """Checks a text's character and word counts against configured bounds."""

    def __init__(
        self,
        min_chars: int | None = None,
        max_chars: int | None = None,
        min_words: int | None = None,
        max_words: int | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Record the bounds to enforce.

        Args:
            min_chars: Smallest permitted character count (inclusive).
            max_chars: Largest permitted character count (inclusive).
            min_words: Smallest permitted word count (inclusive).
            max_words: Largest permitted word count (inclusive).
            tokeniser: Tokeniser used to count words. Defaults to WordTokeniser().

        Raises:
            InvalidThresholdError: If every bound is None, any bound is negative,
                or a minimum is greater than its matching maximum.
        """
        bounds = (
            ("min_chars", min_chars),
            ("max_chars", max_chars),
            ("min_words", min_words),
            ("max_words", max_words),
        )

        if not any(value is not None for _, value in bounds):
            raise InvalidThresholdError("At least one length constraint must be set")

        # Reject negative bounds, reporting the first offender in declaration order.
        for label, value in bounds:
            if value is not None and value < 0:
                raise InvalidThresholdError(f"{label} must be >= 0, got {value}")

        if min_chars is not None and max_chars is not None:
            if min_chars > max_chars:
                raise InvalidThresholdError(
                    f"min_chars ({min_chars}) cannot exceed max_chars ({max_chars})"
                )
        if min_words is not None and max_words is not None:
            if min_words > max_words:
                raise InvalidThresholdError(
                    f"min_words ({min_words}) cannot exceed max_words ({max_words})"
                )

        self._min_chars = min_chars
        self._max_chars = max_chars
        self._min_words = min_words
        self._max_words = max_words
        self._tokeniser = tokeniser or WordTokeniser()

    @property
    def name(self) -> str:
        """Identifier reported in the CheckResult."""
        return "length"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Compare the text's character and word counts with the configured bounds.

        Args:
            text: The text to validate.
            context: Validation context (ignored by this validator).

        Returns:
            CheckResult whose ``actual`` holds both counts and whose
            ``threshold`` holds only the bounds that were configured.
        """
        n_chars = len(text)
        n_words = len(self._tokeniser.tokenise(text))

        # (threshold key, unit label, measured count, configured limit)
        limits = (
            ("min_chars", "chars", n_chars, self._min_chars),
            ("max_chars", "chars", n_chars, self._max_chars),
            ("min_words", "words", n_words, self._min_words),
            ("max_words", "words", n_words, self._max_words),
        )

        problems = []
        threshold = {}
        for key, unit, count, limit in limits:
            if limit is None:
                continue
            threshold[key] = limit
            if key.startswith("min_"):
                if count < limit:
                    problems.append(f"{count} {unit} < min {limit}")
            elif count > limit:
                problems.append(f"{count} {unit} > max {limit}")

        if problems:
            message = "Length check failed: " + "; ".join(problems)
        else:
            message = f"Length check passed: {n_chars} chars, {n_words} words"

        return CheckResult(
            name=self.name,
            passed=not problems,
            actual={"chars": n_chars, "words": n_words},
            threshold=threshold,
            message=message,
        )
|
||||
|
||||
|
||||
class ReadabilityValidator:
    """Checks Flesch-Kincaid grade level and Flesch Reading Ease against limits."""

    def __init__(
        self,
        max_grade: float | None = None,
        min_ease: float | None = None,
    ) -> None:
        """
        Record the readability limits to enforce.

        Args:
            max_grade: Highest Flesch-Kincaid grade level that still passes.
            min_ease: Lowest Flesch Reading Ease score that still passes.

        Raises:
            InvalidThresholdError: If both limits are None.
        """
        nothing_configured = max_grade is None and min_ease is None
        if nothing_configured:
            raise InvalidThresholdError(
                "At least one of max_grade or min_ease must be provided"
            )

        self._max_grade = max_grade
        self._min_ease = min_ease
        self._metric = Readability()

    @property
    def name(self) -> str:
        """Identifier reported in the CheckResult."""
        return "readability"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Score the text and compare the scores with the configured limits.

        Args:
            text: The text to validate.
            context: Validation context (ignored by this validator).

        Returns:
            CheckResult whose ``actual`` holds both scores and whose
            ``threshold`` holds only the limits that were configured.
        """
        scores = self._metric.score(text)
        grade_cap = self._max_grade
        ease_floor = self._min_ease

        # A limit that was not configured can never be violated.
        grade_too_high = (
            grade_cap is not None and scores.flesch_kincaid_grade > grade_cap
        )
        ease_too_low = (
            ease_floor is not None and scores.flesch_reading_ease < ease_floor
        )
        passed = not (grade_too_high or ease_too_low)

        if passed:
            summary = []
            if grade_cap is not None:
                summary.append(
                    f"grade {scores.flesch_kincaid_grade:.1f} <= {grade_cap:.1f}"
                )
            if ease_floor is not None:
                summary.append(
                    f"ease {scores.flesch_reading_ease:.1f} >= {ease_floor:.1f}"
                )
            message = "Readability: " + ", ".join(summary)
        else:
            violations = []
            if grade_too_high:
                violations.append(
                    f"grade level {scores.flesch_kincaid_grade:.1f} "
                    f"> max {grade_cap:.1f}"
                )
            if ease_too_low:
                violations.append(
                    f"reading ease {scores.flesch_reading_ease:.1f} "
                    f"< min {ease_floor:.1f}"
                )
            message = "Readability: " + "; ".join(violations)

        threshold = {}
        if grade_cap is not None:
            threshold["max_grade"] = grade_cap
        if ease_floor is not None:
            threshold["min_ease"] = ease_floor

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={
                "grade": scores.flesch_kincaid_grade,
                "ease": scores.flesch_reading_ease,
            },
            threshold=threshold,
            message=message,
        )
|
||||
|
||||
|
||||
def _compile_patterns(patterns: list[str], flags: int) -> list[re.Pattern[str]]:
|
||||
compiled = []
|
||||
for pattern in patterns:
|
||||
try:
|
||||
compiled.append(re.compile(pattern, flags))
|
||||
except re.error as e:
|
||||
raise InvalidThresholdError(f"Invalid regex pattern '{pattern}': {e}") from e
|
||||
return compiled
|
||||
|
||||
|
||||
class ContainsValidator:
    """Validates that text matches every required pattern."""

    def __init__(
        self,
        patterns: list[str],
        case_sensitive: bool = False,
    ) -> None:
        """
        Initialise the contains validator.

        Args:
            patterns: Regular expressions that must each match somewhere in the
                text. Every entry is compiled as a regex, so a literal substring
                containing metacharacters (e.g. ``"C++"``) must be escaped first,
                for instance with ``re.escape``.
            case_sensitive: Whether matching is case-sensitive. Defaults to False.

        Raises:
            InvalidThresholdError: If patterns list is empty or contains invalid regex.
        """
        if not patterns:
            raise InvalidThresholdError("patterns list cannot be empty")

        # Snapshot the caller's list: the compiled patterns are fixed here, so a
        # later mutation of the original list must not desynchronise the two
        # sequences that check() walks in lockstep.
        self._patterns = list(patterns)
        self._case_sensitive = case_sensitive
        self._flags = 0 if case_sensitive else re.IGNORECASE
        self._compiled_patterns = _compile_patterns(self._patterns, self._flags)

    @property
    def name(self) -> str:
        """Identifier reported in the CheckResult."""
        return "contains"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the contains check.

        Args:
            text: The text to validate.
            context: Validation context (not used for contains checks).

        Returns:
            CheckResult that passes only when every pattern matches; ``actual``
            reports how many matched and lists the pattern strings that did not.
        """
        missing = [
            pattern
            for pattern, compiled in zip(
                self._patterns, self._compiled_patterns, strict=True
            )
            if not compiled.search(text)
        ]
        passed = not missing

        if passed:
            message = f"Text contains all {len(self._patterns)} required pattern(s)"
        else:
            message = f"Text missing {len(missing)} pattern(s): {missing}"

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"found": len(self._patterns) - len(missing), "missing": missing},
            # Hand out a copy so consumers of the result cannot alter validator state.
            threshold={"patterns": list(self._patterns)},
            message=message,
        )
|
||||
|
||||
|
||||
class ExcludesValidator:
    """Validates that text matches none of the forbidden patterns."""

    def __init__(
        self,
        patterns: list[str],
        case_sensitive: bool = False,
    ) -> None:
        """
        Initialise the excludes validator.

        Args:
            patterns: Regular expressions that must not match anywhere in the
                text. Every entry is compiled as a regex, so a literal substring
                containing metacharacters (e.g. ``"C++"``) must be escaped first,
                for instance with ``re.escape``.
            case_sensitive: Whether matching is case-sensitive. Defaults to False.

        Raises:
            InvalidThresholdError: If patterns list is empty or contains invalid regex.
        """
        if not patterns:
            raise InvalidThresholdError("patterns list cannot be empty")

        # Snapshot the caller's list: the compiled patterns are fixed here, so a
        # later mutation of the original list must not desynchronise the two
        # sequences that check() walks in lockstep.
        self._patterns = list(patterns)
        self._case_sensitive = case_sensitive
        self._flags = 0 if case_sensitive else re.IGNORECASE
        self._compiled_patterns = _compile_patterns(self._patterns, self._flags)

    @property
    def name(self) -> str:
        """Identifier reported in the CheckResult."""
        return "excludes"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the excludes check.

        Args:
            text: The text to validate.
            context: Validation context (not used for excludes checks).

        Returns:
            CheckResult that passes only when no pattern matches; ``actual``
            reports how many were absent and lists the pattern strings that matched.
        """
        found = [
            pattern
            for pattern, compiled in zip(
                self._patterns, self._compiled_patterns, strict=True
            )
            if compiled.search(text)
        ]
        passed = not found

        if passed:
            message = f"Text excludes all {len(self._patterns)} forbidden pattern(s)"
        else:
            message = f"Text contains {len(found)} forbidden pattern(s): {found}"

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"excluded": len(self._patterns) - len(found), "found": found},
            # Hand out a copy so consumers of the result cannot alter validator state.
            threshold={"patterns": list(self._patterns)},
            message=message,
        )
|
||||
Reference in New Issue
Block a user