From 8fd1dc4cd3db459aec7235cda9c64ac643f60f96 Mon Sep 17 00:00:00 2001
From: Kai Chappell
Date: Sat, 29 Mar 2025 13:12:53 +0000
Subject: [PATCH] validator factory functions

Export all validators and provide factory functions for clean API:
bleu(), rouge(), lexical(), length(), readability(), contains(),
excludes(), semantic(), all_of(), any_of().
---
 src/veritext/validators/__init__.py | 238 ++++++++++++++++++++++++++++
 1 file changed, 238 insertions(+)
 create mode 100644 src/veritext/validators/__init__.py

diff --git a/src/veritext/validators/__init__.py b/src/veritext/validators/__init__.py
new file mode 100644
index 0000000..bf4fd42
--- /dev/null
+++ b/src/veritext/validators/__init__.py
@@ -0,0 +1,238 @@
+"""Validators module: composable validation checks for text quality.
+
+This module provides validators that apply thresholds to metrics and return
+pass/fail decisions with diagnostics.
+
+Example:
+    >>> from veritext.validators import bleu, length, all_of
+    >>> from veritext.core.types import ValidationContext
+    >>>
+    >>> validator = all_of([
+    ...     bleu(min_score=0.5),
+    ...     length(min_words=10),
+    ... ])
+    >>> context = ValidationContext(reference="The quick brown fox.")
+    >>> result = validator.check("The quick brown fox jumps.", context)
+    >>> print(result.passed)
+"""
+
+from typing import Literal
+
+from veritext.core.tokenisation import WordTokeniser
+from veritext.validators.base import Check
+from veritext.validators.composite import AllOf, AnyOf
+from veritext.validators.constraint import (
+    ContainsValidator,
+    ExcludesValidator,
+    LengthValidator,
+    ReadabilityValidator,
+)
+from veritext.validators.metric import (
+    BleuValidator,
+    LexicalValidator,
+    RougeValidator,
+    SemanticValidator,
+)
+
+
+def bleu(
+    min_score: float,
+    variant: Literal[1, 2, 3, 4] = 4,
+    tokeniser: WordTokeniser | None = None,
+) -> BleuValidator:
+    """Create a BLEU validator.
+
+    Args:
+        min_score: Minimum BLEU score required (0.0 to 1.0).
+        variant: BLEU variant to use (1, 2, 3, or 4). Defaults to 4.
+        tokeniser: Tokeniser to use. Defaults to WordTokeniser().
+
+    Returns:
+        BleuValidator instance.
+    """
+    return BleuValidator(min_score=min_score, variant=variant, tokeniser=tokeniser)
+
+
+def rouge(
+    min_score: float,
+    variant: Literal["1", "2", "l"] = "l",
+    tokeniser: WordTokeniser | None = None,
+) -> RougeValidator:
+    """Create a ROUGE validator.
+
+    Args:
+        min_score: Minimum ROUGE F-measure required (0.0 to 1.0).
+        variant: ROUGE variant ("1", "2", or "l"). Defaults to "l".
+        tokeniser: Tokeniser to use. Defaults to WordTokeniser().
+
+    Returns:
+        RougeValidator instance.
+    """
+    return RougeValidator(min_score=min_score, variant=variant, tokeniser=tokeniser)
+
+
+def lexical(
+    min_jaccard: float | None = None,
+    min_overlap: float | None = None,
+    tokeniser: WordTokeniser | None = None,
+) -> LexicalValidator:
+    """Create a lexical similarity validator.
+
+    Args:
+        min_jaccard: Minimum Jaccard similarity required (0.0 to 1.0).
+        min_overlap: Minimum token overlap required (0.0 to 1.0).
+        tokeniser: Tokeniser to use. Defaults to WordTokeniser().
+
+    Returns:
+        LexicalValidator instance.
+    """
+    return LexicalValidator(
+        min_jaccard=min_jaccard, min_overlap=min_overlap, tokeniser=tokeniser
+    )
+
+
+def length(
+    min_chars: int | None = None,
+    max_chars: int | None = None,
+    min_words: int | None = None,
+    max_words: int | None = None,
+    tokeniser: WordTokeniser | None = None,
+) -> LengthValidator:
+    """Create a length validator.
+
+    Args:
+        min_chars: Minimum character count (inclusive).
+        max_chars: Maximum character count (inclusive).
+        min_words: Minimum word count (inclusive).
+        max_words: Maximum word count (inclusive).
+        tokeniser: Tokeniser to use for word counting. Defaults to WordTokeniser().
+
+    Returns:
+        LengthValidator instance.
+    """
+    return LengthValidator(
+        min_chars=min_chars,
+        max_chars=max_chars,
+        min_words=min_words,
+        max_words=max_words,
+        tokeniser=tokeniser,
+    )
+
+
+def readability(
+    max_grade: float | None = None,
+    min_ease: float | None = None,
+) -> ReadabilityValidator:
+    """Create a readability validator.
+
+    Args:
+        max_grade: Maximum Flesch-Kincaid grade level allowed.
+        min_ease: Minimum Flesch Reading Ease score required.
+
+    Returns:
+        ReadabilityValidator instance.
+    """
+    return ReadabilityValidator(max_grade=max_grade, min_ease=min_ease)
+
+
+def contains(
+    patterns: list[str],
+    case_sensitive: bool = False,
+) -> ContainsValidator:
+    """Create a contains validator.
+
+    Args:
+        patterns: List of substrings or regex patterns that must be present.
+        case_sensitive: Whether matching is case-sensitive. Defaults to False.
+
+    Returns:
+        ContainsValidator instance.
+    """
+    return ContainsValidator(patterns=patterns, case_sensitive=case_sensitive)
+
+
+def excludes(
+    patterns: list[str],
+    case_sensitive: bool = False,
+) -> ExcludesValidator:
+    """Create an excludes validator.
+
+    Args:
+        patterns: List of substrings or regex patterns that must not be present.
+        case_sensitive: Whether matching is case-sensitive. Defaults to False.
+
+    Returns:
+        ExcludesValidator instance.
+    """
+    return ExcludesValidator(patterns=patterns, case_sensitive=case_sensitive)
+
+
+def all_of(checks: list[Check]) -> AllOf:
+    """Create an AllOf composite validator.
+
+    Args:
+        checks: List of checks that must all pass.
+
+    Returns:
+        AllOf instance.
+    """
+    return AllOf(checks=checks)
+
+
+def any_of(checks: list[Check]) -> AnyOf:
+    """Create an AnyOf composite validator.
+
+    Args:
+        checks: List of checks where at least one must pass.
+
+    Returns:
+        AnyOf instance.
+    """
+    return AnyOf(checks=checks)
+
+
+def semantic(
+    min_score: float,
+    model: str = "all-MiniLM-L6-v2",
+    cache_embeddings: bool = True,
+) -> SemanticValidator:
+    """Create a semantic similarity validator.
+
+    Requires the `veritext[semantic]` extra to be installed.
+
+    Args:
+        min_score: Minimum semantic similarity required (0.0 to 1.0).
+        model: Name of the sentence-transformers model to use.
+        cache_embeddings: Whether to cache embeddings for repeated texts.
+
+    Returns:
+        SemanticValidator instance.
+    """
+    return SemanticValidator(
+        min_score=min_score, model=model, cache_embeddings=cache_embeddings
+    )
+
+
+__all__ = [
+    "AllOf",
+    "AnyOf",
+    "BleuValidator",
+    "Check",
+    "ContainsValidator",
+    "ExcludesValidator",
+    "LengthValidator",
+    "LexicalValidator",
+    "ReadabilityValidator",
+    "RougeValidator",
+    "SemanticValidator",
+    "all_of",
+    "any_of",
+    "bleu",
+    "contains",
+    "excludes",
+    "length",
+    "lexical",
+    "readability",
+    "rouge",
+    "semantic",
+]