feat(validators): add metric-based validators

Implement BleuValidator, RougeValidator, and LexicalValidator for validating text against a reference using metric thresholds.
2026-02-03 17:14:09 +00:00
parent 9e7b0131b3
commit b2b5eb1518
1 changed files with 288 additions and 0 deletions
@@ -0,0 +1,288 @@
+"""Metric-based validators that require reference text."""
+
+from typing import Literal
+
+from veritext.core.exceptions import InvalidThresholdError, ValidationError
+from veritext.core.tokenisation import WordTokeniser
+from veritext.core.types import CheckResult, ValidationContext
+from veritext.metrics.bleu import Bleu
+from veritext.metrics.lexical import Lexical
+from veritext.metrics.rouge import Rouge
+
+
+class BleuValidator:
+    """Pass/fail check comparing a BLEU-n score against a minimum threshold."""
+
+    def __init__(
+        self,
+        min_score: float,
+        variant: Literal[1, 2, 3, 4] = 4,
+        tokeniser: WordTokeniser | None = None,
+    ) -> None:
+        """
+        Configure the BLEU validator.
+
+        Args:
+            min_score: Lowest acceptable BLEU score; must lie in [0.0, 1.0].
+            variant: Which BLEU-n score to compare (1, 2, 3, or 4). Defaults to 4.
+            tokeniser: Forwarded unchanged to Bleu, including None (the metric
+                is expected to fall back to WordTokeniser() in that case).
+
+        Raises:
+            InvalidThresholdError: If min_score is outside [0.0, 1.0], or if
+                variant is not one of 1, 2, 3, 4.
+        """
+        # The chained comparison is kept on purpose: it is False for NaN on
+        # both sides, so NaN thresholds are rejected as well.
+        threshold_ok = 0.0 <= min_score <= 1.0
+        if not threshold_ok:
+            raise InvalidThresholdError(
+                f"min_score must be between 0.0 and 1.0, got {min_score}"
+            )
+
+        allowed_variants = (1, 2, 3, 4)
+        if variant not in allowed_variants:
+            raise InvalidThresholdError(f"variant must be 1, 2, 3, or 4, got {variant}")
+
+        self._min_score = min_score
+        self._variant = variant
+        self._metric = Bleu(tokeniser=tokeniser)
+
+    @property
+    def name(self) -> str:
+        """Identifier of this check, e.g. ``bleu-4`` for the default variant."""
+        return f"bleu-{self._variant}"
+
+    def check(self, text: str, context: ValidationContext) -> CheckResult:
+        """
+        Score ``text`` against the reference and compare with the minimum.
+
+        Args:
+            text: Candidate text to score.
+            context: Supplies the reference text via ``context.reference``.
+
+        Returns:
+            CheckResult carrying this check's name, the pass flag, the selected
+            BLEU score, the configured minimum, and a summary message.
+
+        Raises:
+            ValidationError: If ``context.reference`` is None.
+        """
+        if context.reference is None:
+            raise ValidationError(f"{self.name} requires reference text in context")
+
+        scores = self._metric.score(text, context.reference)
+
+        # All four n-gram scores are read; the configured variant picks one.
+        by_variant = {
+            1: scores.bleu1,
+            2: scores.bleu2,
+            3: scores.bleu3,
+            4: scores.bleu4,
+        }
+        observed = by_variant[self._variant]
+        minimum = self._min_score
+
+        # A score exactly equal to the minimum counts as a pass.
+        passed = observed >= minimum
+        verdict = "meets" if passed else "below"
+        message = (
+            f"BLEU-{self._variant} score {observed:.2f} "
+            f"{verdict} minimum {minimum:.2f}"
+        )
+
+        return CheckResult(
+            name=self.name,
+            passed=passed,
+            actual=observed,
+            threshold=minimum,
+            message=message,
+        )
+
+
+class RougeValidator:
+    """Pass/fail check comparing a ROUGE F-measure against a minimum threshold."""
+
+    def __init__(
+        self,
+        min_score: float,
+        variant: Literal["1", "2", "l"] = "l",
+        tokeniser: WordTokeniser | None = None,
+    ) -> None:
+        """
+        Configure the ROUGE validator.
+
+        Args:
+            min_score: Lowest acceptable ROUGE F-measure; must lie in [0.0, 1.0].
+            variant: Which ROUGE score to compare ("1", "2", or "l").
+                Defaults to "l".
+            tokeniser: Forwarded unchanged to Rouge, including None (the metric
+                is expected to fall back to WordTokeniser() in that case).
+
+        Raises:
+            InvalidThresholdError: If min_score is outside [0.0, 1.0], or if
+                variant is not one of "1", "2", "l".
+        """
+        # The chained comparison is kept on purpose: it is False for NaN on
+        # both sides, so NaN thresholds are rejected as well.
+        threshold_ok = 0.0 <= min_score <= 1.0
+        if not threshold_ok:
+            raise InvalidThresholdError(
+                f"min_score must be between 0.0 and 1.0, got {min_score}"
+            )
+
+        allowed_variants = ("1", "2", "l")
+        if variant not in allowed_variants:
+            raise InvalidThresholdError(
+                f"variant must be '1', '2', or 'l', got '{variant}'"
+            )
+
+        self._min_score = min_score
+        self._variant = variant
+        self._metric = Rouge(tokeniser=tokeniser)
+
+    @property
+    def name(self) -> str:
+        """Identifier of this check, e.g. ``rouge-l`` for the default variant."""
+        return f"rouge-{self._variant}"
+
+    def check(self, text: str, context: ValidationContext) -> CheckResult:
+        """
+        Score ``text`` against the reference and compare with the minimum.
+
+        Args:
+            text: Candidate text to score.
+            context: Supplies the reference text via ``context.reference``.
+
+        Returns:
+            CheckResult carrying this check's name, the pass flag, the selected
+            ROUGE F-measure, the configured minimum, and a summary message.
+
+        Raises:
+            ValidationError: If ``context.reference`` is None.
+        """
+        if context.reference is None:
+            raise ValidationError(f"{self.name} requires reference text in context")
+
+        scores = self._metric.score(text, context.reference)
+
+        # Only the F-measure of each variant is considered; all three are
+        # read and the configured variant picks one.
+        fmeasure_by_variant = {
+            "1": scores.rouge1.fmeasure,
+            "2": scores.rouge2.fmeasure,
+            "l": scores.rouge_l.fmeasure,
+        }
+        observed = fmeasure_by_variant[self._variant]
+        minimum = self._min_score
+
+        # A score exactly equal to the minimum counts as a pass.
+        passed = observed >= minimum
+        verdict = "meets" if passed else "below"
+        message = (
+            f"ROUGE-{self._variant.upper()} score {observed:.2f} "
+            f"{verdict} minimum {minimum:.2f}"
+        )
+
+        return CheckResult(
+            name=self.name,
+            passed=passed,
+            actual=observed,
+            threshold=minimum,
+            message=message,
+        )
+
+
+class LexicalValidator:
+    """Pass/fail check on Jaccard similarity and/or token overlap minimums."""
+
+    def __init__(
+        self,
+        min_jaccard: float | None = None,
+        min_overlap: float | None = None,
+        tokeniser: WordTokeniser | None = None,
+    ) -> None:
+        """
+        Configure the lexical validator.
+
+        Args:
+            min_jaccard: Lowest acceptable Jaccard similarity in [0.0, 1.0], or
+                None to skip that comparison.
+            min_overlap: Lowest acceptable token overlap in [0.0, 1.0], or None
+                to skip that comparison.
+            tokeniser: Forwarded unchanged to Lexical, including None (the
+                metric is expected to fall back to WordTokeniser() in that case).
+
+        Raises:
+            InvalidThresholdError: If both thresholds are None, or if a
+                supplied threshold is outside [0.0, 1.0].
+        """
+        bounds = (("min_jaccard", min_jaccard), ("min_overlap", min_overlap))
+
+        if all(bound is None for _, bound in bounds):
+            raise InvalidThresholdError(
+                "At least one of min_jaccard or min_overlap must be provided"
+            )
+
+        # min_jaccard is validated first, so it is the one reported when both
+        # thresholds are out of range.
+        for label, bound in bounds:
+            if bound is None:
+                continue
+            if not 0.0 <= bound <= 1.0:
+                raise InvalidThresholdError(
+                    f"{label} must be between 0.0 and 1.0, got {bound}"
+                )
+
+        self._min_jaccard = min_jaccard
+        self._min_overlap = min_overlap
+        self._metric = Lexical(tokeniser=tokeniser)
+
+    @property
+    def name(self) -> str:
+        """Identifier of this check; always ``lexical``."""
+        return "lexical"
+
+    def check(self, text: str, context: ValidationContext) -> CheckResult:
+        """
+        Score ``text`` against the reference and test each configured minimum.
+
+        Args:
+            text: Candidate text to score.
+            context: Supplies the reference text via ``context.reference``.
+
+        Returns:
+            CheckResult whose ``actual`` holds both measured values and whose
+            ``threshold`` holds only the minimums that were configured. The
+            check passes when no configured minimum is undercut.
+
+        Raises:
+            ValidationError: If ``context.reference`` is None.
+        """
+        if context.reference is None:
+            raise ValidationError(f"{self.name} requires reference text in context")
+
+        scores = self._metric.score(text, context.reference)
+        jaccard_min = self._min_jaccard
+        overlap_min = self._min_overlap
+
+        shortfalls = []  # one entry per configured minimum that was undercut
+        satisfied = []  # one entry per configured minimum that was reached
+        threshold = {}
+
+        if jaccard_min is not None:
+            threshold["min_jaccard"] = jaccard_min
+            if scores.jaccard < jaccard_min:
+                shortfalls.append(
+                    f"Jaccard {scores.jaccard:.2f} below minimum {jaccard_min:.2f}"
+                )
+            else:
+                satisfied.append(f"Jaccard {scores.jaccard:.2f} >= {jaccard_min:.2f}")
+
+        if overlap_min is not None:
+            threshold["min_overlap"] = overlap_min
+            if scores.token_overlap < overlap_min:
+                shortfalls.append(
+                    f"token overlap {scores.token_overlap:.2f} "
+                    f"below minimum {overlap_min:.2f}"
+                )
+            else:
+                satisfied.append(
+                    f"overlap {scores.token_overlap:.2f} >= {overlap_min:.2f}"
+                )
+
+        passed = not shortfalls
+
+        # On success every configured minimum is listed (comma-separated); on
+        # failure only the undercut ones are listed (semicolon-separated).
+        details = ", ".join(satisfied) if passed else "; ".join(shortfalls)
+        message = "Lexical similarity: " + details
+
+        # Both measurements are always reported, even if only one was checked.
+        actual = {"jaccard": scores.jaccard, "token_overlap": scores.token_overlap}
+
+        return CheckResult(
+            name=self.name,
+            passed=passed,
+            actual=actual,
+            threshold=threshold,
+            message=message,
+        )