feat(validators): add metric-based validators

Implement BleuValidator, RougeValidator, and LexicalValidator for
validating text against a reference text using metric thresholds.
This commit is contained in:
2026-02-03 17:14:09 +00:00
parent 9e7b0131b3
commit b2b5eb1518

View File

@@ -0,0 +1,288 @@
"""Metric-based validators that require reference text."""
from typing import Literal
from veritext.core.exceptions import InvalidThresholdError, ValidationError
from veritext.core.tokenisation import WordTokeniser
from veritext.core.types import CheckResult, ValidationContext
from veritext.metrics.bleu import Bleu
from veritext.metrics.lexical import Lexical
from veritext.metrics.rouge import Rouge
class BleuValidator:
    """Check that a text's BLEU score against a reference reaches a minimum.

    The validator is configured once with a threshold and a BLEU variant
    and can then be run against any number of texts via ``check``.
    """

    def __init__(
        self,
        min_score: float,
        variant: Literal[1, 2, 3, 4] = 4,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the BLEU validator.

        Args:
            min_score: Lowest acceptable BLEU score, inclusive (0.0 to 1.0).
            variant: Which BLEU score to compare (1, 2, 3, or 4).
                Defaults to 4.
            tokeniser: Forwarded unchanged to ``Bleu``. ``None`` leaves the
                choice to the metric (presumably ``WordTokeniser()`` --
                confirm in ``Bleu``).

        Raises:
            InvalidThresholdError: If min_score is outside [0.0, 1.0], or
                if variant is not one of 1, 2, 3, 4.
        """
        if not 0.0 <= min_score <= 1.0:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )
        if variant not in (1, 2, 3, 4):
            raise InvalidThresholdError(f"variant must be 1, 2, 3, or 4, got {variant}")

        self._min_score = min_score
        self._variant = variant
        self._metric = Bleu(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check, e.g. ``"bleu-4"``."""
        return f"bleu-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Run the BLEU check.

        Args:
            text: The text to validate.
            context: Validation context; its ``reference`` attribute is the
                text that ``text`` is scored against.

        Returns:
            CheckResult carrying the selected BLEU score as ``actual``, the
            configured minimum as ``threshold``, and a human-readable
            message. ``passed`` is True when the score is at least the
            minimum.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        if context.reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        result = self._metric.score(text, context.reference)

        # Map the configured variant onto the matching field of the result.
        score_map = {
            1: result.bleu1,
            2: result.bleu2,
            3: result.bleu3,
            4: result.bleu4,
        }
        actual_score = score_map[self._variant]
        passed = actual_score >= self._min_score

        # Only one word differs between the pass and fail messages.
        verdict = "meets" if passed else "below"
        message = (
            f"BLEU-{self._variant} score {actual_score:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual_score,
            threshold=self._min_score,
            message=message,
        )
class RougeValidator:
    """Check that a text's ROUGE F-measure against a reference reaches a minimum.

    The validator is configured once with a threshold and a ROUGE variant
    and can then be run against any number of texts via ``check``.
    """

    def __init__(
        self,
        min_score: float,
        variant: Literal["1", "2", "l"] = "l",
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the ROUGE validator.

        Args:
            min_score: Lowest acceptable ROUGE F-measure, inclusive
                (0.0 to 1.0).
            variant: ROUGE variant ("1", "2", or "l"). Defaults to "l".
            tokeniser: Forwarded unchanged to ``Rouge``. ``None`` leaves the
                choice to the metric (presumably ``WordTokeniser()`` --
                confirm in ``Rouge``).

        Raises:
            InvalidThresholdError: If min_score is outside [0.0, 1.0], or
                if variant is not one of "1", "2", "l".
        """
        if not 0.0 <= min_score <= 1.0:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )
        if variant not in ("1", "2", "l"):
            raise InvalidThresholdError(
                f"variant must be '1', '2', or 'l', got '{variant}'"
            )

        self._min_score = min_score
        self._variant = variant
        self._metric = Rouge(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check, e.g. ``"rouge-l"``."""
        return f"rouge-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Run the ROUGE check.

        Args:
            text: The text to validate.
            context: Validation context; its ``reference`` attribute is the
                text that ``text`` is scored against.

        Returns:
            CheckResult carrying the selected F-measure as ``actual``, the
            configured minimum as ``threshold``, and a human-readable
            message. ``passed`` is True when the score is at least the
            minimum.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        if context.reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        result = self._metric.score(text, context.reference)

        # Each variant is compared on its F-measure field.
        score_map = {
            "1": result.rouge1.fmeasure,
            "2": result.rouge2.fmeasure,
            "l": result.rouge_l.fmeasure,
        }
        actual_score = score_map[self._variant]
        passed = actual_score >= self._min_score

        # Messages show the variant upper-cased ("ROUGE-L"); only one word
        # differs between the pass and fail messages.
        verdict = "meets" if passed else "below"
        message = (
            f"ROUGE-{self._variant.upper()} score {actual_score:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual_score,
            threshold=self._min_score,
            message=message,
        )
class LexicalValidator:
    """Check lexical similarity to a reference against one or two minimums.

    Either or both of a Jaccard minimum and a token-overlap minimum may be
    configured; the check passes only when every configured minimum is met.
    """

    def __init__(
        self,
        min_jaccard: float | None = None,
        min_overlap: float | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Initialise the lexical validator.

        Args:
            min_jaccard: Lowest acceptable Jaccard similarity, inclusive
                (0.0 to 1.0). ``None`` disables this threshold.
            min_overlap: Lowest acceptable token overlap, inclusive
                (0.0 to 1.0). ``None`` disables this threshold.
            tokeniser: Forwarded unchanged to ``Lexical``. ``None`` leaves
                the choice to the metric (presumably ``WordTokeniser()`` --
                confirm in ``Lexical``).

        Raises:
            InvalidThresholdError: If neither threshold is provided, or if
                a provided threshold is outside [0.0, 1.0].
        """
        if min_jaccard is None and min_overlap is None:
            raise InvalidThresholdError(
                "At least one of min_jaccard or min_overlap must be provided"
            )
        if min_jaccard is not None and not 0.0 <= min_jaccard <= 1.0:
            raise InvalidThresholdError(
                f"min_jaccard must be between 0.0 and 1.0, got {min_jaccard}"
            )
        if min_overlap is not None and not 0.0 <= min_overlap <= 1.0:
            raise InvalidThresholdError(
                f"min_overlap must be between 0.0 and 1.0, got {min_overlap}"
            )

        self._min_jaccard = min_jaccard
        self._min_overlap = min_overlap
        self._metric = Lexical(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "lexical"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Run the lexical similarity check.

        Args:
            text: The text to validate.
            context: Validation context; its ``reference`` attribute is the
                text that ``text`` is compared with.

        Returns:
            CheckResult whose ``actual`` is a dict with both measured values
            (``jaccard`` and ``token_overlap``) and whose ``threshold`` is a
            dict holding only the minimums that were configured. ``passed``
            is True when no configured minimum is undercut.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        if context.reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        result = self._metric.score(text, context.reference)

        # Visit each configured threshold once, recording the threshold
        # itself plus either a failure line or a success line for it.
        threshold = {}
        failures = []
        satisfied = []

        if self._min_jaccard is not None:
            threshold["min_jaccard"] = self._min_jaccard
            if result.jaccard < self._min_jaccard:
                failures.append(
                    f"Jaccard {result.jaccard:.2f} below minimum {self._min_jaccard:.2f}"
                )
            else:
                satisfied.append(
                    f"Jaccard {result.jaccard:.2f} >= {self._min_jaccard:.2f}"
                )

        if self._min_overlap is not None:
            threshold["min_overlap"] = self._min_overlap
            if result.token_overlap < self._min_overlap:
                failures.append(
                    f"token overlap {result.token_overlap:.2f} "
                    f"below minimum {self._min_overlap:.2f}"
                )
            else:
                satisfied.append(
                    f"overlap {result.token_overlap:.2f} >= {self._min_overlap:.2f}"
                )

        passed = not failures
        if passed:
            message = "Lexical similarity: " + ", ".join(satisfied)
        else:
            # Only the shortfalls are reported when anything fails.
            message = "Lexical similarity: " + "; ".join(failures)

        # Both measurements are always reported, even if only one is gated.
        actual = {"jaccard": result.jaccard, "token_overlap": result.token_overlap}

        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual,
            threshold=threshold,
            message=message,
        )