"""Metric-based validators that require reference text.

Provides BleuValidator, RougeValidator, and LexicalValidator, which validate
text against the reference held in the validation context using metric
thresholds.
"""

from typing import Literal

from veritext.core.exceptions import InvalidThresholdError, ValidationError
from veritext.core.tokenisation import WordTokeniser
from veritext.core.types import CheckResult, ValidationContext
from veritext.metrics.bleu import Bleu
from veritext.metrics.lexical import Lexical
from veritext.metrics.rouge import Rouge


def _validate_unit_interval(label: str, value: float) -> None:
    """
    Ensure a threshold lies in the closed interval [0.0, 1.0].

    Args:
        label: Parameter name used in the error message.
        value: Threshold value to validate.

    Raises:
        InvalidThresholdError: If value is not in range [0.0, 1.0].
    """
    # Written as `not (lo <= v <= hi)` so that NaN, which fails every
    # comparison, is rejected as well.
    if not 0.0 <= value <= 1.0:
        raise InvalidThresholdError(
            f"{label} must be between 0.0 and 1.0, got {value}"
        )


def _require_reference(check_name: str, context: ValidationContext) -> None:
    """
    Ensure the validation context carries reference text.

    Args:
        check_name: Name of the check, used in the error message.
        context: Validation context to inspect.

    Raises:
        ValidationError: If context.reference is None.
    """
    if context.reference is None:
        raise ValidationError(f"{check_name} requires reference text in context")


class BleuValidator:
    """Validates that BLEU score meets minimum threshold."""

    def __init__(
        self,
        min_score: float,
        variant: Literal[1, 2, 3, 4] = 4,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the BLEU validator.

        Args:
            min_score: Minimum BLEU score required (0.0 to 1.0).
            variant: BLEU variant to use (1, 2, 3, or 4). Defaults to 4.
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().

        Raises:
            InvalidThresholdError: If min_score is not in range [0.0, 1.0],
                or if variant is not one of 1, 2, 3, or 4.
        """
        _validate_unit_interval("min_score", min_score)
        if variant not in (1, 2, 3, 4):
            raise InvalidThresholdError(f"variant must be 1, 2, 3, or 4, got {variant}")

        self._min_score = min_score
        # Membership above compares by equality, so values such as True or
        # 4.0 are accepted; store a plain int so the check name and messages
        # read "bleu-1" / "bleu-4" rather than "bleu-True" / "bleu-4.0".
        self._variant = int(variant)
        self._metric = Bleu(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return f"bleu-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Run the BLEU check.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status; passes when the selected
            BLEU score is greater than or equal to the minimum.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        _require_reference(self.name, context)

        result = self._metric.score(text, context.reference)

        # Select the appropriate BLEU variant
        score_map = {
            1: result.bleu1,
            2: result.bleu2,
            3: result.bleu3,
            4: result.bleu4,
        }
        actual_score = score_map[self._variant]
        passed = actual_score >= self._min_score

        verdict = "meets" if passed else "below"
        message = (
            f"BLEU-{self._variant} score {actual_score:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual_score,
            threshold=self._min_score,
            message=message,
        )


class RougeValidator:
    """Validates that ROUGE score meets minimum threshold."""

    def __init__(
        self,
        min_score: float,
        variant: Literal["1", "2", "l"] = "l",
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the ROUGE validator.

        Args:
            min_score: Minimum ROUGE F-measure required (0.0 to 1.0).
            variant: ROUGE variant ("1", "2", or "l"). Defaults to "l".
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().

        Raises:
            InvalidThresholdError: If min_score is not in range [0.0, 1.0],
                or if variant is not one of "1", "2", or "l".
        """
        _validate_unit_interval("min_score", min_score)
        if variant not in ("1", "2", "l"):
            raise InvalidThresholdError(
                f"variant must be '1', '2', or 'l', got '{variant}'"
            )

        self._min_score = min_score
        self._variant = variant
        self._metric = Rouge(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return f"rouge-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Run the ROUGE check.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status; passes when the selected
            ROUGE F-measure is greater than or equal to the minimum.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        _require_reference(self.name, context)

        result = self._metric.score(text, context.reference)

        # Select the appropriate ROUGE variant (use F-measure)
        score_map = {
            "1": result.rouge1.fmeasure,
            "2": result.rouge2.fmeasure,
            "l": result.rouge_l.fmeasure,
        }
        actual_score = score_map[self._variant]
        passed = actual_score >= self._min_score

        verdict = "meets" if passed else "below"
        message = (
            f"ROUGE-{self._variant.upper()} score {actual_score:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual_score,
            threshold=self._min_score,
            message=message,
        )


class LexicalValidator:
    """Validates lexical similarity meets threshold."""

    def __init__(
        self,
        min_jaccard: float | None = None,
        min_overlap: float | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the lexical validator.

        Args:
            min_jaccard: Minimum Jaccard similarity required (0.0 to 1.0).
            min_overlap: Minimum token overlap required (0.0 to 1.0).
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().

        Raises:
            InvalidThresholdError: If thresholds are invalid or none provided.
        """
        if min_jaccard is None and min_overlap is None:
            raise InvalidThresholdError(
                "At least one of min_jaccard or min_overlap must be provided"
            )

        if min_jaccard is not None:
            _validate_unit_interval("min_jaccard", min_jaccard)

        if min_overlap is not None:
            _validate_unit_interval("min_overlap", min_overlap)

        self._min_jaccard = min_jaccard
        self._min_overlap = min_overlap
        self._metric = Lexical(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "lexical"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Run the lexical similarity check.

        Only the thresholds supplied at construction are enforced; the
        check passes when none of them is violated.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status. ``actual`` always reports
            both measured values; ``threshold`` holds only the configured
            minimums.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        _require_reference(self.name, context)

        result = self._metric.score(text, context.reference)

        # Evaluate each configured threshold once, collecting the failure
        # text, the success text, and the reported threshold together.
        failures: list[str] = []
        satisfied: list[str] = []
        threshold: dict[str, float] = {}

        if self._min_jaccard is not None:
            threshold["min_jaccard"] = self._min_jaccard
            if result.jaccard < self._min_jaccard:
                failures.append(
                    f"Jaccard {result.jaccard:.2f} below minimum {self._min_jaccard:.2f}"
                )
            else:
                satisfied.append(
                    f"Jaccard {result.jaccard:.2f} >= {self._min_jaccard:.2f}"
                )

        if self._min_overlap is not None:
            threshold["min_overlap"] = self._min_overlap
            if result.token_overlap < self._min_overlap:
                failures.append(
                    f"token overlap {result.token_overlap:.2f} "
                    f"below minimum {self._min_overlap:.2f}"
                )
            else:
                satisfied.append(
                    f"overlap {result.token_overlap:.2f} >= {self._min_overlap:.2f}"
                )

        passed = not failures

        # On failure only the violated thresholds are listed.
        if passed:
            message = "Lexical similarity: " + ", ".join(satisfied)
        else:
            message = "Lexical similarity: " + "; ".join(failures)

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"jaccard": result.jaccard, "token_overlap": result.token_overlap},
            threshold=threshold,
            message=message,
        )