feat(validators): add metric-based validators
Implement BleuValidator, RougeValidator, and LexicalValidator, which validate text against a reference text using metric thresholds.
This commit is contained in:
288
src/veritext/validators/metric.py
Normal file
288
src/veritext/validators/metric.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""Metric-based validators that require reference text."""
|
||||
|
||||
from typing import Literal
|
||||
|
||||
from veritext.core.exceptions import InvalidThresholdError, ValidationError
|
||||
from veritext.core.tokenisation import WordTokeniser
|
||||
from veritext.core.types import CheckResult, ValidationContext
|
||||
from veritext.metrics.bleu import Bleu
|
||||
from veritext.metrics.lexical import Lexical
|
||||
from veritext.metrics.rouge import Rouge
|
||||
|
||||
|
||||
class BleuValidator:
    """Check that the BLEU score of a text against a reference reaches a floor."""

    def __init__(
        self,
        min_score: float,
        variant: Literal[1, 2, 3, 4] = 4,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the BLEU check.

        Args:
            min_score: Lowest acceptable BLEU score; must lie in [0.0, 1.0].
            variant: Which BLEU variant to compare (1, 2, 3, or 4).
                Defaults to 4.
            tokeniser: Forwarded unchanged to ``Bleu``; ``None`` leaves the
                choice of tokeniser to the metric itself.

        Raises:
            InvalidThresholdError: If min_score lies outside [0.0, 1.0], or
                if variant is not one of 1, 2, 3, 4.
        """
        score_in_range = 0.0 <= min_score <= 1.0
        if not score_in_range:
            problem = f"min_score must be between 0.0 and 1.0, got {min_score}"
            raise InvalidThresholdError(problem)

        allowed_variants = (1, 2, 3, 4)
        if variant not in allowed_variants:
            raise InvalidThresholdError(f"variant must be 1, 2, 3, or 4, got {variant}")

        self._variant = variant
        self._min_score = min_score
        self._metric = Bleu(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Identifier of this check, e.g. ``bleu-4``."""
        return f"bleu-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score ``text`` against the context's reference and compare to the floor.

        Args:
            text: Candidate text to validate.
            context: Validation context; its ``reference`` attribute supplies
                the reference text.

        Returns:
            CheckResult carrying the selected BLEU score, the configured
            minimum, the pass/fail flag and a human-readable message.

        Raises:
            ValidationError: If ``context.reference`` is None.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)

        # All four variants are read up front, then the configured one is picked.
        by_variant = dict(
            zip(
                (1, 2, 3, 4),
                (scores.bleu1, scores.bleu2, scores.bleu3, scores.bleu4),
            )
        )
        value = by_variant[self._variant]
        ok = value >= self._min_score

        # Both outcomes share one sentence template; only the verb differs.
        verdict = "meets" if ok else "below"
        summary = (
            f"BLEU-{self._variant} score {value:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=ok,
            actual=value,
            threshold=self._min_score,
            message=summary,
        )
|
||||
|
||||
|
||||
class RougeValidator:
    """Check that the ROUGE F-measure of a text against a reference reaches a floor."""

    def __init__(
        self,
        min_score: float,
        variant: Literal["1", "2", "l"] = "l",
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the ROUGE check.

        Args:
            min_score: Lowest acceptable ROUGE F-measure; must lie in
                [0.0, 1.0].
            variant: Which ROUGE variant to compare ("1", "2", or "l").
                Defaults to "l".
            tokeniser: Forwarded unchanged to ``Rouge``; ``None`` leaves the
                choice of tokeniser to the metric itself.

        Raises:
            InvalidThresholdError: If min_score lies outside [0.0, 1.0], or
                if variant is not one of "1", "2", "l".
        """
        score_in_range = 0.0 <= min_score <= 1.0
        if not score_in_range:
            problem = f"min_score must be between 0.0 and 1.0, got {min_score}"
            raise InvalidThresholdError(problem)

        allowed_variants = ("1", "2", "l")
        if variant not in allowed_variants:
            problem = f"variant must be '1', '2', or 'l', got '{variant}'"
            raise InvalidThresholdError(problem)

        self._variant = variant
        self._min_score = min_score
        self._metric = Rouge(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Identifier of this check, e.g. ``rouge-l``."""
        return f"rouge-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score ``text`` against the context's reference and compare to the floor.

        Args:
            text: Candidate text to validate.
            context: Validation context; its ``reference`` attribute supplies
                the reference text.

        Returns:
            CheckResult carrying the selected ROUGE F-measure, the configured
            minimum, the pass/fail flag and a human-readable message.

        Raises:
            ValidationError: If ``context.reference`` is None.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)

        # The F-measure of every variant is read, then the configured one is picked.
        by_variant = dict(
            zip(
                ("1", "2", "l"),
                (
                    scores.rouge1.fmeasure,
                    scores.rouge2.fmeasure,
                    scores.rouge_l.fmeasure,
                ),
            )
        )
        value = by_variant[self._variant]
        ok = value >= self._min_score

        # Both outcomes share one sentence template; only the verb differs.
        verdict = "meets" if ok else "below"
        summary = (
            f"ROUGE-{self._variant.upper()} score {value:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )

        return CheckResult(
            name=self.name,
            passed=ok,
            actual=value,
            threshold=self._min_score,
            message=summary,
        )
|
||||
|
||||
|
||||
class LexicalValidator:
    """Check Jaccard similarity and/or token overlap against configured minimums."""

    def __init__(
        self,
        min_jaccard: float | None = None,
        min_overlap: float | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the lexical similarity check.

        Args:
            min_jaccard: Lowest acceptable Jaccard similarity in [0.0, 1.0],
                or None to skip that comparison.
            min_overlap: Lowest acceptable token overlap in [0.0, 1.0], or
                None to skip that comparison.
            tokeniser: Forwarded unchanged to ``Lexical``; ``None`` leaves the
                choice of tokeniser to the metric itself.

        Raises:
            InvalidThresholdError: If both thresholds are None, or if a
                supplied threshold lies outside [0.0, 1.0].
        """
        supplied = (("min_jaccard", min_jaccard), ("min_overlap", min_overlap))

        if all(limit is None for _, limit in supplied):
            raise InvalidThresholdError(
                "At least one of min_jaccard or min_overlap must be provided"
            )

        # Range-check each supplied threshold, Jaccard first.
        for label, limit in supplied:
            if limit is None:
                continue
            if not 0.0 <= limit <= 1.0:
                raise InvalidThresholdError(
                    f"{label} must be between 0.0 and 1.0, got {limit}"
                )

        self._min_jaccard = min_jaccard
        self._min_overlap = min_overlap
        self._metric = Lexical(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Identifier of this check."""
        return "lexical"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score ``text`` against the context's reference and test each threshold.

        Args:
            text: Candidate text to validate.
            context: Validation context; its ``reference`` attribute supplies
                the reference text.

        Returns:
            CheckResult whose ``actual`` holds both measured values and whose
            ``threshold`` holds only the minimums that were configured. The
            check passes when no configured minimum is undershot.

        Raises:
            ValidationError: If ``context.reference`` is None.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)
        jaccard = scores.jaccard
        overlap = scores.token_overlap

        # One row per metric:
        # (failure label, success label, threshold key, measured value, minimum).
        table = (
            ("Jaccard", "Jaccard", "min_jaccard", jaccard, self._min_jaccard),
            ("token overlap", "overlap", "min_overlap", overlap, self._min_overlap),
        )
        # Only thresholds that were actually configured take part in the check.
        active = [row for row in table if row[4] is not None]

        shortfalls = [
            f"{fail_label} {measured:.2f} below minimum {floor:.2f}"
            for fail_label, _, _, measured, floor in active
            if measured < floor
        ]
        ok = not shortfalls

        if ok:
            confirmations = [
                f"{pass_label} {measured:.2f} >= {floor:.2f}"
                for _, pass_label, _, measured, floor in active
            ]
            summary = "Lexical similarity: " + ", ".join(confirmations)
        else:
            summary = "Lexical similarity: " + "; ".join(shortfalls)

        # Both measured values are always reported, whatever was configured.
        measured_values = {"jaccard": jaccard, "token_overlap": overlap}
        limits = {key: floor for _, _, key, _, floor in active}

        return CheckResult(
            name=self.name,
            passed=ok,
            actual=measured_values,
            threshold=limits,
            message=summary,
        )
|
||||
Reference in New Issue
Block a user