validator factory functions
Export all validators and provide factory functions for a clean API: bleu(), rouge(), lexical(), length(), readability(), contains(), excludes(), all_of(), any_of(), semantic().
This commit is contained in:
238
src/veritext/validators/__init__.py
Normal file
238
src/veritext/validators/__init__.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
"""Validators module: composable validation checks for text quality.
|
||||||
|
|
||||||
|
This module provides validators that apply thresholds to metrics and return
|
||||||
|
pass/fail decisions with diagnostics.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> from veritext.validators import bleu, length, all_of
|
||||||
|
>>> from veritext.core.types import ValidationContext
|
||||||
|
>>>
|
||||||
|
>>> validator = all_of([
|
||||||
|
... bleu(min_score=0.5),
|
||||||
|
... length(min_words=10),
|
||||||
|
... ])
|
||||||
|
>>> context = ValidationContext(reference="The quick brown fox.")
|
||||||
|
>>> result = validator.check("The quick brown fox jumps.", context)
|
||||||
|
>>> print(result.passed)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from veritext.core.tokenisation import WordTokeniser
|
||||||
|
from veritext.validators.base import Check
|
||||||
|
from veritext.validators.composite import AllOf, AnyOf
|
||||||
|
from veritext.validators.constraint import (
|
||||||
|
ContainsValidator,
|
||||||
|
ExcludesValidator,
|
||||||
|
LengthValidator,
|
||||||
|
ReadabilityValidator,
|
||||||
|
)
|
||||||
|
from veritext.validators.metric import (
|
||||||
|
BleuValidator,
|
||||||
|
LexicalValidator,
|
||||||
|
RougeValidator,
|
||||||
|
SemanticValidator,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def bleu(
    min_score: float,
    variant: Literal[1, 2, 3, 4] = 4,
    tokeniser: WordTokeniser | None = None,
) -> BleuValidator:
    """Build a BLEU validator from a score threshold.

    Thin factory: every argument is forwarded by keyword to
    ``BleuValidator`` without modification.

    Args:
        min_score: Lowest acceptable BLEU score (0.0 to 1.0).
        variant: Which BLEU variant to use: 1, 2, 3, or 4. Defaults to 4.
        tokeniser: Tokeniser to use. ``None`` is passed through as-is; the
            validator is documented to default to ``WordTokeniser()``.

    Returns:
        The configured BleuValidator instance.
    """
    validator = BleuValidator(
        min_score=min_score,
        variant=variant,
        tokeniser=tokeniser,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
def rouge(
    min_score: float,
    variant: Literal["1", "2", "l"] = "l",
    tokeniser: WordTokeniser | None = None,
) -> RougeValidator:
    """Build a ROUGE validator from an F-measure threshold.

    Thin factory: every argument is forwarded by keyword to
    ``RougeValidator`` without modification.

    Args:
        min_score: Lowest acceptable ROUGE F-measure (0.0 to 1.0).
        variant: Which ROUGE variant to use: "1", "2", or "l".
            Defaults to "l".
        tokeniser: Tokeniser to use. ``None`` is passed through as-is; the
            validator is documented to default to ``WordTokeniser()``.

    Returns:
        The configured RougeValidator instance.
    """
    validator = RougeValidator(
        min_score=min_score,
        variant=variant,
        tokeniser=tokeniser,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
def lexical(
    min_jaccard: float | None = None,
    min_overlap: float | None = None,
    tokeniser: WordTokeniser | None = None,
) -> LexicalValidator:
    """Build a lexical-similarity validator.

    Thin factory: every argument is forwarded by keyword to
    ``LexicalValidator`` without modification.

    Args:
        min_jaccard: Lowest acceptable Jaccard similarity (0.0 to 1.0).
        min_overlap: Lowest acceptable token overlap (0.0 to 1.0).
        tokeniser: Tokeniser to use. ``None`` is passed through as-is; the
            validator is documented to default to ``WordTokeniser()``.

    Returns:
        The configured LexicalValidator instance.
    """
    validator = LexicalValidator(
        min_jaccard=min_jaccard,
        min_overlap=min_overlap,
        tokeniser=tokeniser,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
def length(
    min_chars: int | None = None,
    max_chars: int | None = None,
    min_words: int | None = None,
    max_words: int | None = None,
    tokeniser: WordTokeniser | None = None,
) -> LengthValidator:
    """Build a validator that bounds text length in characters and/or words.

    Thin factory: every argument is forwarded by keyword to
    ``LengthValidator`` without modification.

    Args:
        min_chars: Inclusive lower bound on the character count.
        max_chars: Inclusive upper bound on the character count.
        min_words: Inclusive lower bound on the word count.
        max_words: Inclusive upper bound on the word count.
        tokeniser: Tokeniser used for word counting. ``None`` is passed
            through as-is; the validator is documented to default to
            ``WordTokeniser()``.

    Returns:
        The configured LengthValidator instance.
    """
    bounds = {
        "min_chars": min_chars,
        "max_chars": max_chars,
        "min_words": min_words,
        "max_words": max_words,
    }
    return LengthValidator(**bounds, tokeniser=tokeniser)
|
||||||
|
|
||||||
|
|
||||||
|
def readability(
    max_grade: float | None = None,
    min_ease: float | None = None,
) -> ReadabilityValidator:
    """Build a readability validator.

    Thin factory: both arguments are forwarded by keyword to
    ``ReadabilityValidator`` without modification.

    Args:
        max_grade: Highest Flesch-Kincaid grade level that is allowed.
        min_ease: Lowest Flesch Reading Ease score that is required.

    Returns:
        The configured ReadabilityValidator instance.
    """
    validator = ReadabilityValidator(
        max_grade=max_grade,
        min_ease=min_ease,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
def contains(
    patterns: list[str],
    case_sensitive: bool = False,
) -> ContainsValidator:
    """Build a validator requiring the given patterns to be present.

    Thin factory: both arguments are forwarded by keyword to
    ``ContainsValidator`` without modification.

    Args:
        patterns: Substrings or regex patterns that must be present.
        case_sensitive: Whether matching is case-sensitive.
            Defaults to False.

    Returns:
        The configured ContainsValidator instance.
    """
    validator = ContainsValidator(
        patterns=patterns,
        case_sensitive=case_sensitive,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
def excludes(
    patterns: list[str],
    case_sensitive: bool = False,
) -> ExcludesValidator:
    """Build a validator requiring the given patterns to be absent.

    Thin factory: both arguments are forwarded by keyword to
    ``ExcludesValidator`` without modification.

    Args:
        patterns: Substrings or regex patterns that must not be present.
        case_sensitive: Whether matching is case-sensitive.
            Defaults to False.

    Returns:
        The configured ExcludesValidator instance.
    """
    validator = ExcludesValidator(
        patterns=patterns,
        case_sensitive=case_sensitive,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
def all_of(checks: list[Check]) -> AllOf:
    """Combine checks into an AllOf composite validator.

    Args:
        checks: The checks, all of which must pass.

    Returns:
        An AllOf instance wrapping ``checks``.
    """
    composite = AllOf(checks=checks)
    return composite
|
||||||
|
|
||||||
|
|
||||||
|
def any_of(checks: list[Check]) -> AnyOf:
    """Combine checks into an AnyOf composite validator.

    Args:
        checks: The checks, at least one of which must pass.

    Returns:
        An AnyOf instance wrapping ``checks``.
    """
    composite = AnyOf(checks=checks)
    return composite
|
||||||
|
|
||||||
|
|
||||||
|
def semantic(
    min_score: float,
    model: str = "all-MiniLM-L6-v2",
    cache_embeddings: bool = True,
) -> SemanticValidator:
    """Build a semantic-similarity validator.

    The `veritext[semantic]` extra must be installed to use this validator.
    Thin factory: every argument is forwarded by keyword to
    ``SemanticValidator`` without modification.

    Args:
        min_score: Lowest acceptable semantic similarity (0.0 to 1.0).
        model: Name of the sentence-transformers model to use.
        cache_embeddings: Whether embeddings are cached for repeated texts.

    Returns:
        The configured SemanticValidator instance.
    """
    validator = SemanticValidator(
        min_score=min_score,
        model=model,
        cache_embeddings=cache_embeddings,
    )
    return validator
|
||||||
|
|
||||||
|
|
||||||
|
# Public API of the validators package. Each group is kept in alphabetical
# order: re-exported classes first, then the lowercase factory helpers.
__all__ = [
    # Classes re-exported from the base, composite, constraint and metric modules.
    "AllOf", "AnyOf",
    "BleuValidator",
    "Check",
    "ContainsValidator", "ExcludesValidator",
    "LengthValidator", "LexicalValidator",
    "ReadabilityValidator", "RougeValidator",
    "SemanticValidator",
    # Factory functions defined in this module.
    "all_of", "any_of",
    "bleu",
    "contains", "excludes",
    "length", "lexical",
    "readability", "rouge",
    "semantic",
]
|
||||||
Reference in New Issue
Block a user