feat(validators): add metric-based validators
Implement BleuValidator, RougeValidator, and LexicalValidator for validating text against a reference text using metric thresholds.
This commit is contained in:
288
src/veritext/validators/metric.py
Normal file
288
src/veritext/validators/metric.py
Normal file
@@ -0,0 +1,288 @@
|
|||||||
|
"""Metric-based validators that require reference text."""
|
||||||
|
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from veritext.core.exceptions import InvalidThresholdError, ValidationError
|
||||||
|
from veritext.core.tokenisation import WordTokeniser
|
||||||
|
from veritext.core.types import CheckResult, ValidationContext
|
||||||
|
from veritext.metrics.bleu import Bleu
|
||||||
|
from veritext.metrics.lexical import Lexical
|
||||||
|
from veritext.metrics.rouge import Rouge
|
||||||
|
|
||||||
|
|
||||||
|
class BleuValidator:
    """Check that a text's BLEU score against a reference reaches a minimum."""

    def __init__(
        self,
        min_score: float,
        variant: Literal[1, 2, 3, 4] = 4,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the threshold, the BLEU variant and the underlying metric.

        Args:
            min_score: Lowest acceptable BLEU score, within [0.0, 1.0].
            variant: Which BLEU score to compare (1, 2, 3 or 4). Defaults to 4.
            tokeniser: Tokeniser forwarded to the ``Bleu`` metric. ``None`` is
                forwarded as-is; the metric is expected to fall back to its
                own default tokeniser.

        Raises:
            InvalidThresholdError: If min_score lies outside [0.0, 1.0], or if
                variant is not one of 1, 2, 3, 4.
        """
        # Chained comparison on purpose: it is False for NaN, so NaN is rejected.
        score_in_range = 0.0 <= min_score <= 1.0
        if not score_in_range:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )

        allowed_variants = (1, 2, 3, 4)
        if variant not in allowed_variants:
            raise InvalidThresholdError(f"variant must be 1, 2, 3, or 4, got {variant}")

        self._min_score = min_score
        self._variant = variant
        self._metric = Bleu(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the check identifier, e.g. ``bleu-4``."""
        return f"bleu-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score ``text`` against the context's reference and apply the threshold.

        Args:
            text: Candidate text to validate.
            context: Validation context; its ``reference`` attribute is used
                as the reference text.

        Returns:
            CheckResult carrying the selected BLEU score, the configured
            minimum, the pass/fail flag and a human-readable message.

        Raises:
            ValidationError: If ``context.reference`` is None.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)

        # Look up the score for the configured n-gram order.
        by_variant = {
            1: scores.bleu1,
            2: scores.bleu2,
            3: scores.bleu3,
            4: scores.bleu4,
        }
        achieved = by_variant[self._variant]
        floor = self._min_score
        ok = achieved >= floor

        # Both outcomes share one template; only the verb differs.
        verdict = "meets" if ok else "below"
        summary = (
            f"BLEU-{self._variant} score {achieved:.2f} "
            f"{verdict} minimum {floor:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=ok,
            actual=achieved,
            threshold=floor,
            message=summary,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class RougeValidator:
    """Check that a text's ROUGE F-measure against a reference reaches a minimum."""

    def __init__(
        self,
        min_score: float,
        variant: Literal["1", "2", "l"] = "l",
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the threshold, the ROUGE variant and the underlying metric.

        Args:
            min_score: Lowest acceptable ROUGE F-measure, within [0.0, 1.0].
            variant: Which ROUGE score to compare ("1", "2" or "l").
                Defaults to "l".
            tokeniser: Tokeniser forwarded to the ``Rouge`` metric. ``None`` is
                forwarded as-is; the metric is expected to fall back to its
                own default tokeniser.

        Raises:
            InvalidThresholdError: If min_score lies outside [0.0, 1.0], or if
                variant is not one of "1", "2", "l".
        """
        # Chained comparison on purpose: it is False for NaN, so NaN is rejected.
        score_in_range = 0.0 <= min_score <= 1.0
        if not score_in_range:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )

        allowed_variants = ("1", "2", "l")
        if variant not in allowed_variants:
            raise InvalidThresholdError(
                f"variant must be '1', '2', or 'l', got '{variant}'"
            )

        self._min_score = min_score
        self._variant = variant
        self._metric = Rouge(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the check identifier, e.g. ``rouge-l``."""
        return f"rouge-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score ``text`` against the context's reference and apply the threshold.

        Args:
            text: Candidate text to validate.
            context: Validation context; its ``reference`` attribute is used
                as the reference text.

        Returns:
            CheckResult carrying the selected ROUGE F-measure, the configured
            minimum, the pass/fail flag and a human-readable message.

        Raises:
            ValidationError: If ``context.reference`` is None.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)

        # Only the F-measure of each ROUGE variant is compared.
        by_variant = {
            "1": scores.rouge1.fmeasure,
            "2": scores.rouge2.fmeasure,
            "l": scores.rouge_l.fmeasure,
        }
        achieved = by_variant[self._variant]
        floor = self._min_score
        ok = achieved >= floor

        # Both outcomes share one template; only the verb differs.
        verdict = "meets" if ok else "below"
        summary = (
            f"ROUGE-{self._variant.upper()} score {achieved:.2f} "
            f"{verdict} minimum {floor:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=ok,
            actual=achieved,
            threshold=floor,
            message=summary,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class LexicalValidator:
    """Check Jaccard similarity and/or token overlap against minimum thresholds."""

    def __init__(
        self,
        min_jaccard: float | None = None,
        min_overlap: float | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the thresholds and the underlying metric.

        Args:
            min_jaccard: Lowest acceptable Jaccard similarity, within
                [0.0, 1.0], or None to skip that comparison.
            min_overlap: Lowest acceptable token overlap, within [0.0, 1.0],
                or None to skip that comparison.
            tokeniser: Tokeniser forwarded to the ``Lexical`` metric. ``None``
                is forwarded as-is; the metric is expected to fall back to its
                own default tokeniser.

        Raises:
            InvalidThresholdError: If both thresholds are None, or if a
                provided threshold lies outside [0.0, 1.0].
        """
        bounds = (("min_jaccard", min_jaccard), ("min_overlap", min_overlap))

        if all(bound is None for _, bound in bounds):
            raise InvalidThresholdError(
                "At least one of min_jaccard or min_overlap must be provided"
            )

        # Validate in declaration order so min_jaccard is reported first.
        for label, bound in bounds:
            if bound is None:
                continue
            # Chained comparison on purpose: it is False for NaN.
            if not 0.0 <= bound <= 1.0:
                raise InvalidThresholdError(
                    f"{label} must be between 0.0 and 1.0, got {bound}"
                )

        self._min_jaccard = min_jaccard
        self._min_overlap = min_overlap
        self._metric = Lexical(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the check identifier."""
        return "lexical"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score ``text`` against the context's reference and apply the thresholds.

        Args:
            text: Candidate text to validate.
            context: Validation context; its ``reference`` attribute is used
                as the reference text.

        Returns:
            CheckResult whose ``actual`` holds both measured values and whose
            ``threshold`` holds only the thresholds that were configured. The
            check passes when no configured threshold is undercut.

        Raises:
            ValidationError: If ``context.reference`` is None.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)
        jaccard = scores.jaccard
        overlap = scores.token_overlap

        shortfalls = []  # one entry per configured threshold that was missed
        satisfied = []  # one entry per configured threshold that was met
        threshold = {}  # only the thresholds that were configured

        floor = self._min_jaccard
        if floor is not None:
            threshold["min_jaccard"] = floor
            if jaccard < floor:
                shortfalls.append(f"Jaccard {jaccard:.2f} below minimum {floor:.2f}")
            else:
                satisfied.append(f"Jaccard {jaccard:.2f} >= {floor:.2f}")

        floor = self._min_overlap
        if floor is not None:
            threshold["min_overlap"] = floor
            if overlap < floor:
                shortfalls.append(
                    f"token overlap {overlap:.2f} below minimum {floor:.2f}"
                )
            else:
                satisfied.append(f"overlap {overlap:.2f} >= {floor:.2f}")

        passed = not shortfalls
        # Passing details are comma-separated; failures are semicolon-separated.
        detail = ", ".join(satisfied) if passed else "; ".join(shortfalls)

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"jaccard": jaccard, "token_overlap": overlap},
            threshold=threshold,
            message="Lexical similarity: " + detail,
        )
|
||||||
Reference in New Issue
Block a user