18 Commits

Author SHA1 Message Date
107fc4e275 docs(changelog): add semantic similarity entries 2026-02-03 17:31:14 +00:00
571b770281 test(semantic): add semantic similarity tests 2026-02-03 17:31:07 +00:00
8b3536873e feat(validators): add SemanticValidator 2026-02-03 17:31:01 +00:00
9a4ac359a3 feat(semantic): add SemanticSimilarity metric 2026-02-03 17:30:56 +00:00
de5ad93524 feat(metrics): add SemanticResult type 2026-02-03 17:30:50 +00:00
cab8099d06 docs(changelog): add validator entries
Document validators module with Check protocol, metric validators,
constraint validators, composite validators, and factory functions.
2026-02-03 17:14:37 +00:00
e2be3daffd test(validators): add validator tests
Add comprehensive tests for metric validators, constraint validators,
and composite validators covering pass/fail cases and error handling.
2026-02-03 17:14:32 +00:00
9239300fd9 feat(validators): add factory functions and exports
Export all validators and provide factory functions for clean API:
bleu(), rouge(), lexical(), length(), readability(), contains(),
excludes(), all_of(), any_of().
2026-02-03 17:14:26 +00:00
b9f805b2f4 feat(validators): add composite validators
Implement AllOf and AnyOf for combining multiple checks into
composite validation rules.
2026-02-03 17:14:20 +00:00
75cd7b68de feat(validators): add constraint validators
Implement LengthValidator, ReadabilityValidator, ContainsValidator, and
ExcludesValidator for text constraints without reference text.
2026-02-03 17:14:14 +00:00
b2b5eb1518 feat(validators): add metric-based validators
Implement BleuValidator, RougeValidator, and LexicalValidator for
validating text against reference using metric thresholds.
2026-02-03 17:14:09 +00:00
9e7b0131b3 feat(validators): add Check protocol and base types
Define the Check protocol for validation checks that compute a score
and return pass/fail results with diagnostics.
2026-02-03 17:14:03 +00:00
b8ab5811dd docs(changelog): add ROUGE and readability entries 2026-02-03 17:03:39 +00:00
62fac688e4 test(metrics): add ROUGE and readability tests 2026-02-03 17:03:34 +00:00
14ac7dbbb9 feat(metrics): export ROUGE and readability from module 2026-02-03 17:03:28 +00:00
aad933f9c4 feat(metrics): add readability implementation 2026-02-03 17:03:24 +00:00
2a7476046d feat(metrics): add ROUGE implementation 2026-02-03 17:03:19 +00:00
914c738013 feat(metrics): add ROUGE and readability result types 2026-02-03 17:03:14 +00:00
20 changed files with 3471 additions and 2 deletions

View File

@@ -18,4 +18,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Metrics module with `Metric` protocol, `AggregateStats`, and `BatchResult` types
- BLEU metric implementation (BLEU-1 through BLEU-4 with brevity penalty)
- Lexical similarity metric (Jaccard similarity and token overlap)
- ROUGE metric (ROUGE-1, ROUGE-2, ROUGE-L with precision/recall/F-measure)
- Flesch-Kincaid readability metrics (grade level and reading ease)
- Batch scoring with aggregate statistics for all metrics
- Validators module with `Check` protocol for validation checks
- Metric-based validators: `BleuValidator`, `RougeValidator`, `LexicalValidator`
- Constraint validators: `LengthValidator`, `ReadabilityValidator`, `ContainsValidator`, `ExcludesValidator`
- Composite validators: `AllOf` (all checks must pass), `AnyOf` (any check must pass)
- Factory functions for clean validator API (`bleu()`, `rouge()`, `lexical()`, `length()`, `readability()`, `contains()`, `excludes()`, `all_of()`, `any_of()`)
- Semantic similarity module with embedding-based text comparison (requires `veritext[semantic]` extra)
- `SemanticSimilarity` metric using sentence-transformers for semantic relatedness
- `SemanticValidator` for threshold-based semantic similarity validation
- `semantic()` factory function for creating semantic validators
- Embedding caching for performance optimisation in repeated comparisons

View File

@@ -1,9 +1,18 @@
"""Metrics module: BLEU, lexical similarity, and batch processing.""" """Metrics module: BLEU, ROUGE, lexical similarity, readability, and batch processing."""
from veritext.metrics.base import AggregateStats, BatchResult, Metric from veritext.metrics.base import AggregateStats, BatchResult, Metric
from veritext.metrics.bleu import Bleu from veritext.metrics.bleu import Bleu
from veritext.metrics.lexical import Lexical from veritext.metrics.lexical import Lexical
from veritext.metrics.results import BleuResult, LexicalResult from veritext.metrics.readability import Readability
from veritext.metrics.results import (
BleuResult,
LexicalResult,
ReadabilityResult,
RougeResult,
RougeScore,
SemanticResult,
)
from veritext.metrics.rouge import Rouge
__all__ = [ __all__ = [
"AggregateStats", "AggregateStats",
@@ -13,4 +22,10 @@ __all__ = [
"Lexical", "Lexical",
"LexicalResult", "LexicalResult",
"Metric", "Metric",
"Readability",
"ReadabilityResult",
"Rouge",
"RougeResult",
"RougeScore",
"SemanticResult",
] ]

View File

@@ -0,0 +1,195 @@
"""Readability metrics implementation (Flesch-Kincaid)."""
import re
from veritext.metrics.base import AggregateStats, BatchResult
from veritext.metrics.results import ReadabilityResult
# Sentence-ending punctuation pattern
_SENTENCE_ENDINGS = re.compile(r"[.!?]+")
# Vowel pattern for syllable counting
_VOWELS = re.compile(r"[aeiouy]+", re.IGNORECASE)


def _count_syllables(word: str) -> int:
    """
    Estimate the number of syllables in a word.

    Counts runs of consecutive vowels (a, e, i, o, u, y) and then applies
    three suffix heuristics: a trailing silent 'e', a consonant + 'le'
    ending, and a silent 'ed' ending.

    Args:
        word: The word to count syllables for.

    Returns:
        Estimated syllable count: 0 for an empty or whitespace-only word,
        otherwise at least 1.
    """
    word = word.lower().strip() if word else ""
    if not word:
        return 0

    # Each run of consecutive vowels is treated as one syllable nucleus.
    count = len(_VOWELS.findall(word))

    # A trailing 'e' is usually silent ("make"), unless it is the only vowel.
    if word.endswith("e") and count > 1:
        count -= 1

    # Consonant + 'le' forms its own syllable ("table", "able"); this restores
    # the syllable removed by the silent-'e' rule above.
    if word.endswith("le") and len(word) > 2 and word[-3] not in "aeiouy":
        count += 1

    # A final 'ed' is silent after most consonants ("jumped") but voiced after
    # 'd'/'t' ("wanted"). When a vowel precedes it ("agreed", "argued") the 'e'
    # was already merged into that vowel group, so there is no separate
    # syllable to remove and subtracting would undercount.
    if word.endswith("ed") and len(word) > 2 and word[-3] not in "dtaeiouy":
        count = max(count - 1, 1)

    # Every non-empty word has at least one syllable.
    return max(count, 1)
def _count_sentences(text: str) -> int:
    """
    Count the sentences in a text.

    The text is split on runs of sentence-ending punctuation (., !, ?) and
    every segment that contains non-whitespace characters counts as one
    sentence.

    Args:
        text: The text to count sentences in.

    Returns:
        0 for empty or whitespace-only text, otherwise the number of
        non-blank segments (never less than 1).
    """
    if not (text and text.strip()):
        return 0

    non_blank = sum(1 for part in _SENTENCE_ENDINGS.split(text) if part.strip())
    # Text made up only of punctuation still counts as a single sentence.
    return non_blank if non_blank > 1 else 1
def _count_words(text: str) -> tuple[list[str], int]:
    """
    Pull the words out of a text and count them.

    A word is a run of ASCII letters and apostrophes that contains at least
    one letter; runs made up solely of apostrophes are discarded.

    Args:
        text: The text to process.

    Returns:
        Tuple of (word list, word count).
    """
    tokens: list[str] = []
    for match in re.finditer(r"[a-zA-Z']+", text):
        token = match.group()
        # Stripping apostrophes leaves nothing only for apostrophe-only runs.
        if token.strip("'"):
            tokens.append(token)
    return tokens, len(tokens)
class Readability:
    """
    Flesch-Kincaid readability metric.

    Produces two scores for a text:
    - Flesch-Kincaid Grade Level:
      0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
    - Flesch Reading Ease:
      206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)

    No reference text is needed; any reference passed in is ignored.
    """

    @property
    def name(self) -> str:
        """Return the identifier of this metric."""
        return "readability"

    @property
    def requires_reference(self) -> bool:
        """Return False: readability is computed from the candidate alone."""
        return False

    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,  # noqa: ARG002
    ) -> ReadabilityResult:
        """
        Score a single text for readability.

        Args:
            candidate: The text to score.
            reference: Ignored (readability doesn't use reference text).

        Returns:
            ReadabilityResult with both Flesch-Kincaid scores. Text without
            any words yields 0.0 for both.
        """
        tokens, n_words = _count_words(candidate)
        if n_words == 0:
            # Nothing to measure; avoids dividing by zero below.
            return ReadabilityResult(
                flesch_kincaid_grade=0.0,
                flesch_reading_ease=0.0,
            )

        n_sentences = _count_sentences(candidate)
        n_syllables = sum(map(_count_syllables, tokens))

        avg_sentence_length = n_words / n_sentences
        avg_syllables = n_syllables / n_words

        return ReadabilityResult(
            flesch_kincaid_grade=(
                0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
            ),
            flesch_reading_ease=(
                206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables
            ),
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,  # noqa: ARG002
    ) -> BatchResult[ReadabilityResult]:
        """
        Score several texts and summarise the results.

        Args:
            candidates: List of texts to score.
            references: Ignored (readability doesn't use reference text).

        Returns:
            BatchResult with one result per candidate plus aggregate
            statistics keyed by "flesch_kincaid_grade" and
            "flesch_reading_ease".

        Raises:
            ValueError: If candidates is empty.
        """
        if not candidates:
            raise ValueError("Cannot compute batch statistics from empty list")

        scored = [self.score(text) for text in candidates]
        grades = [item.flesch_kincaid_grade for item in scored]
        eases = [item.flesch_reading_ease for item in scored]

        return BatchResult(
            results=scored,
            count=len(scored),
            stats={
                "flesch_kincaid_grade": AggregateStats.from_values(grades),
                "flesch_reading_ease": AggregateStats.from_values(eases),
            },
        )

View File

@@ -39,3 +39,72 @@ class LexicalResult(BaseModel):
token_overlap: float token_overlap: float
"""Proportion of candidate tokens found in reference.""" """Proportion of candidate tokens found in reference."""
class RougeScore(BaseModel):
    """Immutable precision / recall / F-measure triple for one ROUGE variant."""

    model_config = ConfigDict(frozen=True)

    precision: float
    """Precision: overlap / candidate length."""

    recall: float
    """Recall: overlap / reference length."""

    fmeasure: float
    """F1-measure: harmonic mean of precision and recall."""
class RougeResult(BaseModel):
    """Immutable bundle of the ROUGE-1, ROUGE-2 and ROUGE-L scores for a text."""

    model_config = ConfigDict(frozen=True)

    rouge1: RougeScore
    """ROUGE-1 (unigram) score."""

    rouge2: RougeScore
    """ROUGE-2 (bigram) score."""

    rouge_l: RougeScore
    """ROUGE-L (longest common subsequence) score."""

    @property
    def score(self) -> float:
        """Primary score of this result: the ROUGE-L F-measure."""
        primary = self.rouge_l
        return primary.fmeasure
class ReadabilityResult(BaseModel):
    """Immutable pair of Flesch-Kincaid readability scores for a text."""

    model_config = ConfigDict(frozen=True)

    flesch_kincaid_grade: float
    """US grade level (e.g., 8.0 = 8th grade reading level)."""

    flesch_reading_ease: float
    """Score 0-100, higher = easier to read."""

    @property
    def score(self) -> float:
        """Primary score of this result: the Flesch reading ease."""
        primary = self.flesch_reading_ease
        return primary
class SemanticResult(BaseModel):
    """Immutable outcome of an embedding-based similarity comparison."""

    model_config = ConfigDict(frozen=True)

    similarity: float
    """Cosine similarity score (0.0 to 1.0)."""

    model: str
    """Name of the embedding model used."""

    @property
    def score(self) -> float:
        """Primary score of this result: the similarity value."""
        primary = self.similarity
        return primary

View File

@@ -0,0 +1,281 @@
"""ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metric implementation."""
from collections import Counter
from veritext.core.tokenisation import WordTokeniser
from veritext.metrics.base import AggregateStats, BatchResult
from veritext.metrics.results import RougeResult, RougeScore
def _get_ngrams(tokens: list[str], n: int) -> Counter[tuple[str, ...]]:
    """
    Count every contiguous n-gram in a token list.

    Args:
        tokens: Tokenised text.
        n: N-gram size.

    Returns:
        Counter mapping each n-gram (as a tuple of tokens) to its number of
        occurrences; empty when the text is shorter than n.
    """
    counts: Counter[tuple[str, ...]] = Counter()
    last_start = len(tokens) - n
    # When n exceeds the token count the range is empty and nothing is added.
    for start in range(last_start + 1):
        counts[tuple(tokens[start : start + n])] += 1
    return counts
def _ngram_overlap(
    candidate_ngrams: Counter[tuple[str, ...]],
    reference_ngrams: Counter[tuple[str, ...]],
) -> int:
    """
    Count the n-grams shared by candidate and reference.

    Each candidate n-gram contributes at most as many matches as it has
    occurrences in the reference (clipped counting).
    """
    return sum(
        min(occurrences, reference_ngrams.get(gram, 0))
        for gram, occurrences in candidate_ngrams.items()
    )
def _compute_rouge_score(
    candidate_tokens: list[str],
    reference_tokens: list[str],
    n: int,
) -> RougeScore:
    """
    Score a candidate against a reference with ROUGE-n.

    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.
        n: N-gram size.

    Returns:
        RougeScore with precision, recall, and F-measure. All three are 0.0
        when neither text contains an n-gram of the requested size.
    """
    cand_grams = _get_ngrams(candidate_tokens, n)
    ref_grams = _get_ngrams(reference_tokens, n)

    cand_total = sum(cand_grams.values())
    ref_total = sum(ref_grams.values())

    if not (cand_total or ref_total):
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

    matched = _ngram_overlap(cand_grams, ref_grams)

    # Guard each ratio separately: one side may have no n-grams at all.
    precision = matched / cand_total if cand_total > 0 else 0.0
    recall = matched / ref_total if ref_total > 0 else 0.0

    fmeasure = 0.0
    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)

    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
def _lcs_length(seq1: list[str], seq2: list[str]) -> int:
    """
    Return the length of the longest common subsequence of two sequences.

    Dynamic programming over a single rolling row: O(m*n) time and
    O(min(m, n)) space.
    """
    if not (seq1 and seq2):
        return 0

    # Iterate rows over the longer sequence so the row buffer is the short one.
    if len(seq1) >= len(seq2):
        longer, shorter = seq1, seq2
    else:
        longer, shorter = seq2, seq1

    row = [0] * (len(shorter) + 1)
    for outer in longer:
        # `diagonal` holds the previous row's value at column - 1.
        diagonal = 0
        for column, inner in enumerate(shorter, start=1):
            above = row[column]
            if outer == inner:
                row[column] = diagonal + 1
            else:
                row[column] = max(above, row[column - 1])
            diagonal = above

    return row[-1]
def _compute_rouge_l(
    candidate_tokens: list[str],
    reference_tokens: list[str],
) -> RougeScore:
    """
    Score a candidate against a reference with ROUGE-L.

    Precision and recall are the longest-common-subsequence length divided by
    the candidate and reference lengths respectively.

    Args:
        candidate_tokens: Tokenised candidate text.
        reference_tokens: Tokenised reference text.

    Returns:
        RougeScore with precision, recall, and F-measure. All three are 0.0
        when either token list is empty.
    """
    if not candidate_tokens or not reference_tokens:
        return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0)

    shared = _lcs_length(candidate_tokens, reference_tokens)
    precision = shared / len(candidate_tokens)
    recall = shared / len(reference_tokens)

    fmeasure = 0.0
    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)

    return RougeScore(precision=precision, recall=recall, fmeasure=fmeasure)
def _max_rouge_scores(scores: list[RougeScore]) -> RougeScore:
    """Return the entry with the largest F-measure (the first one on ties)."""

    def _by_fmeasure(entry: RougeScore) -> float:
        return entry.fmeasure

    return max(scores, key=_by_fmeasure)
class Rouge:
    """
    ROUGE metric for judging generated text against reference text.

    Reports ROUGE-1 (unigram), ROUGE-2 (bigram), and ROUGE-L (longest common
    subsequence) scores, each with precision, recall and F-measure.
    """

    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
        """
        Initialise the ROUGE metric.

        Args:
            tokeniser: Tokeniser to use. Defaults to WordTokeniser().
        """
        self._tokeniser = tokeniser or WordTokeniser()

    @property
    def name(self) -> str:
        """Return the identifier of this metric."""
        return "rouge"

    @property
    def requires_reference(self) -> bool:
        """Return True: ROUGE always compares against reference text."""
        return True

    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> RougeResult:
        """
        Score one candidate text against its reference(s).

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison. With several
                references, each variant independently reports the score
                with the highest F-measure.

        Returns:
            RougeResult with ROUGE-1, ROUGE-2, and ROUGE-L scores. An empty
            candidate yields all-zero scores.

        Raises:
            ValueError: If reference is None or contains no tokens.
        """
        if reference is None:
            raise ValueError("ROUGE requires reference text")

        reference_texts = [reference] if isinstance(reference, str) else reference

        candidate_tokens = self._tokeniser.tokenise(candidate)
        tokenised_refs = [self._tokeniser.tokenise(text) for text in reference_texts]

        # References that tokenise to nothing cannot be scored against.
        usable_refs = [tokens for tokens in tokenised_refs if tokens]
        if not usable_refs:
            raise ValueError("Reference text cannot be empty")

        if not candidate_tokens:
            return RougeResult(
                rouge1=RougeScore(precision=0.0, recall=0.0, fmeasure=0.0),
                rouge2=RougeScore(precision=0.0, recall=0.0, fmeasure=0.0),
                rouge_l=RougeScore(precision=0.0, recall=0.0, fmeasure=0.0),
            )

        unigram_scores = [
            _compute_rouge_score(candidate_tokens, tokens, 1) for tokens in usable_refs
        ]
        bigram_scores = [
            _compute_rouge_score(candidate_tokens, tokens, 2) for tokens in usable_refs
        ]
        lcs_scores = [
            _compute_rouge_l(candidate_tokens, tokens) for tokens in usable_refs
        ]

        return RougeResult(
            rouge1=_max_rouge_scores(unigram_scores),
            rouge2=_max_rouge_scores(bigram_scores),
            rouge_l=_max_rouge_scores(lcs_scores),
        )

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[RougeResult]:
        """
        Score several candidates, each against its own reference(s).

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate.

        Returns:
            BatchResult with one result per candidate plus aggregate
            statistics keyed "<variant>_<field>" for the variants rouge1,
            rouge2, rouge_l and the fields precision, recall, fmeasure.

        Raises:
            ValueError: If references is None or length mismatch.
        """
        if references is None:
            raise ValueError("ROUGE requires reference texts")

        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )

        scored: list[RougeResult] = [
            self.score(text, refs) for text, refs in zip(candidates, references)
        ]

        # One statistics entry per (variant, field) pair, in a fixed order.
        stats = {
            f"{variant}_{field}": AggregateStats.from_values(
                [getattr(getattr(item, variant), field) for item in scored]
            )
            for variant in ("rouge1", "rouge2", "rouge_l")
            for field in ("precision", "recall", "fmeasure")
        }

        return BatchResult(results=scored, count=len(scored), stats=stats)

View File

@@ -0,0 +1,16 @@
"""Semantic similarity module: embedding-based text comparison.
This module provides semantic similarity using sentence-transformers.
It requires the `veritext[semantic]` extra to be installed.
Example:
>>> from veritext.semantic import SemanticSimilarity
>>>
>>> metric = SemanticSimilarity()
>>> result = metric.score("The cat sat on the mat", "A feline rested on the rug")
>>> print(f"Similarity: {result.similarity:.2f}")
"""
from veritext.semantic.similarity import SemanticSimilarity
__all__ = ["SemanticSimilarity"]

View File

@@ -0,0 +1,188 @@
"""Embedding-based semantic similarity using sentence-transformers."""
from typing import Any
from veritext.core.exceptions import DependencyError
from veritext.metrics.base import AggregateStats, BatchResult
from veritext.metrics.results import SemanticResult
class SemanticSimilarity:
    """
    Embedding-based semantic similarity using sentence-transformers.

    Computes cosine similarity between text embeddings to measure semantic
    relatedness. This metric captures meaning beyond lexical overlap.

    Requires the `veritext[semantic]` extra to be installed.
    """

    def __init__(
        self,
        model: str = "all-MiniLM-L6-v2",
        cache_embeddings: bool = True,
    ) -> None:
        """
        Initialise the semantic similarity metric.

        Args:
            model: Name of the sentence-transformers model to use.
                Defaults to "all-MiniLM-L6-v2".
            cache_embeddings: Whether to cache embeddings for repeated texts.
                Defaults to True. The cache is unbounded; call
                `clear_cache()` to release it.

        Raises:
            DependencyError: If sentence-transformers is not installed.
        """
        # Imported lazily so the package works without the optional extra.
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as err:
            raise DependencyError(
                "Install veritext[semantic] for semantic similarity: "
                "pip install veritext[semantic]"
            ) from err

        self._model_name = model
        self._model: Any = SentenceTransformer(model)
        # None means caching is disabled; a dict maps text -> embedding.
        self._cache: dict[str, Any] | None = {} if cache_embeddings else None

    @property
    def name(self) -> str:
        """Return the name of this metric."""
        return "semantic"

    @property
    def requires_reference(self) -> bool:
        """Return whether this metric requires reference text."""
        return True

    def _get_embedding(self, text: str) -> Any:
        """
        Get embedding for text, using cache if available.

        Args:
            text: The text to embed.

        Returns:
            The embedding produced by the model's `encode` call with
            `convert_to_tensor=True`.
        """
        if self._cache is not None and text in self._cache:
            return self._cache[text]

        embedding = self._model.encode(text, convert_to_tensor=True)

        if self._cache is not None:
            self._cache[text] = embedding

        return embedding

    def _cosine_similarity(self, embedding1: Any, embedding2: Any) -> float:
        """
        Compute cosine similarity between two embeddings.

        Args:
            embedding1: First embedding tensor.
            embedding2: Second embedding tensor.

        Returns:
            Cosine similarity clamped to the range 0.0 to 1.0.
        """
        from sentence_transformers import util

        similarity: float = util.cos_sim(embedding1, embedding2).item()
        # Clamp to [0, 1] as negative similarities are possible but not meaningful
        return max(0.0, min(1.0, similarity))

    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> SemanticResult:
        """
        Compute semantic similarity between candidate and reference.

        When multiple references are provided, returns the maximum similarity
        across all references.

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison.

        Returns:
            SemanticResult with similarity score and model name. An empty or
            whitespace-only candidate scores 0.0.

        Raises:
            ValueError: If reference is None, an empty list, or contains only
                blank text. This is checked before the candidate is looked
                at, so invalid references are reported even when the
                candidate is empty.
        """
        if reference is None:
            raise ValueError("Semantic similarity requires reference text")

        # Normalise reference to list
        references = [reference] if isinstance(reference, str) else reference

        # Strip once and drop blank references; validating here (before the
        # empty-candidate shortcut) keeps the documented ValueError contract.
        usable_references = [
            stripped for ref in references if (stripped := ref.strip())
        ]
        if not usable_references:
            raise ValueError("Reference text cannot be empty")

        # An empty candidate has nothing to embed.
        candidate_stripped = candidate.strip()
        if not candidate_stripped:
            return SemanticResult(similarity=0.0, model=self._model_name)

        candidate_embedding = self._get_embedding(candidate_stripped)

        # Similarities are already clamped to [0, 1], so the plain maximum
        # over the references is the best match.
        best_similarity = max(
            self._cosine_similarity(candidate_embedding, self._get_embedding(ref))
            for ref in usable_references
        )

        return SemanticResult(similarity=best_similarity, model=self._model_name)

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[SemanticResult]:
        """
        Compute semantic similarity for a batch of candidates.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate.

        Returns:
            BatchResult containing individual results and aggregate statistics
            under the "similarity" key.

        Raises:
            ValueError: If references is None or length mismatch.
        """
        if references is None:
            raise ValueError("Semantic similarity requires reference texts")

        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )

        results: list[SemanticResult] = [
            self.score(cand, ref) for cand, ref in zip(candidates, references)
        ]

        stats = {
            "similarity": AggregateStats.from_values([r.similarity for r in results]),
        }

        return BatchResult(results=results, count=len(results), stats=stats)

    def clear_cache(self) -> None:
        """Clear the embedding cache (no-op when caching is disabled)."""
        if self._cache is not None:
            self._cache.clear()

View File

@@ -0,0 +1,239 @@
"""Validators module: composable validation checks for text quality.
This module provides validators that apply thresholds to metrics and return
pass/fail decisions with diagnostics.
Example:
>>> from veritext.validators import bleu, length, all_of
>>> from veritext.core.types import ValidationContext
>>>
>>> validator = all_of([
... bleu(min_score=0.5),
... length(min_words=10),
... ])
>>> context = ValidationContext(reference="The quick brown fox.")
>>> result = validator.check("The quick brown fox jumps.", context)
>>> print(result.passed)
"""
from typing import Literal
from veritext.core.tokenisation import WordTokeniser
from veritext.validators.base import Check
from veritext.validators.composite import AllOf, AnyOf
from veritext.validators.constraint import (
ContainsValidator,
ExcludesValidator,
LengthValidator,
ReadabilityValidator,
)
from veritext.validators.metric import (
BleuValidator,
LexicalValidator,
RougeValidator,
SemanticValidator,
)
# Factory functions for clean API
def bleu(
    min_score: float,
    variant: Literal[1, 2, 3, 4] = 4,
    tokeniser: WordTokeniser | None = None,
) -> BleuValidator:
    """Build a BLEU validator from a score threshold.

    Args:
        min_score: Minimum BLEU score required (0.0 to 1.0).
        variant: BLEU variant to use (1, 2, 3, or 4). Defaults to 4.
        tokeniser: Tokeniser to use. Defaults to WordTokeniser().

    Returns:
        BleuValidator instance.
    """
    validator = BleuValidator(
        min_score=min_score,
        variant=variant,
        tokeniser=tokeniser,
    )
    return validator
def rouge(
    min_score: float,
    variant: Literal["1", "2", "l"] = "l",
    tokeniser: WordTokeniser | None = None,
) -> RougeValidator:
    """Build a ROUGE validator from an F-measure threshold.

    Args:
        min_score: Minimum ROUGE F-measure required (0.0 to 1.0).
        variant: ROUGE variant ("1", "2", or "l"). Defaults to "l".
        tokeniser: Tokeniser to use. Defaults to WordTokeniser().

    Returns:
        RougeValidator instance.
    """
    validator = RougeValidator(
        min_score=min_score,
        variant=variant,
        tokeniser=tokeniser,
    )
    return validator
def lexical(
    min_jaccard: float | None = None,
    min_overlap: float | None = None,
    tokeniser: WordTokeniser | None = None,
) -> LexicalValidator:
    """Build a lexical similarity validator.

    Args:
        min_jaccard: Minimum Jaccard similarity required (0.0 to 1.0).
        min_overlap: Minimum token overlap required (0.0 to 1.0).
        tokeniser: Tokeniser to use. Defaults to WordTokeniser().

    Returns:
        LexicalValidator instance.
    """
    validator = LexicalValidator(
        min_jaccard=min_jaccard,
        min_overlap=min_overlap,
        tokeniser=tokeniser,
    )
    return validator
def length(
    min_chars: int | None = None,
    max_chars: int | None = None,
    min_words: int | None = None,
    max_words: int | None = None,
    tokeniser: WordTokeniser | None = None,
) -> LengthValidator:
    """Build a validator for character and/or word count limits.

    Args:
        min_chars: Minimum character count (inclusive).
        max_chars: Maximum character count (inclusive).
        min_words: Minimum word count (inclusive).
        max_words: Maximum word count (inclusive).
        tokeniser: Tokeniser to use for word counting. Defaults to WordTokeniser().

    Returns:
        LengthValidator instance.
    """
    validator = LengthValidator(
        min_chars=min_chars,
        max_chars=max_chars,
        min_words=min_words,
        max_words=max_words,
        tokeniser=tokeniser,
    )
    return validator
def readability(
    max_grade: float | None = None,
    min_ease: float | None = None,
) -> ReadabilityValidator:
    """Build a readability validator.

    Args:
        max_grade: Maximum Flesch-Kincaid grade level allowed.
        min_ease: Minimum Flesch Reading Ease score required.

    Returns:
        ReadabilityValidator instance.
    """
    validator = ReadabilityValidator(max_grade=max_grade, min_ease=min_ease)
    return validator
def contains(
    patterns: list[str],
    case_sensitive: bool = False,
) -> ContainsValidator:
    """Build a validator requiring patterns to be present in the text.

    Args:
        patterns: List of substrings or regex patterns that must be present.
        case_sensitive: Whether matching is case-sensitive. Defaults to False.

    Returns:
        ContainsValidator instance.
    """
    validator = ContainsValidator(
        patterns=patterns,
        case_sensitive=case_sensitive,
    )
    return validator
def excludes(
    patterns: list[str],
    case_sensitive: bool = False,
) -> ExcludesValidator:
    """Build a validator requiring patterns to be absent from the text.

    Args:
        patterns: List of substrings or regex patterns that must not be present.
        case_sensitive: Whether matching is case-sensitive. Defaults to False.

    Returns:
        ExcludesValidator instance.
    """
    validator = ExcludesValidator(
        patterns=patterns,
        case_sensitive=case_sensitive,
    )
    return validator
def all_of(checks: list[Check]) -> AllOf:
    """Combine checks so that every one of them must pass.

    Args:
        checks: List of checks that must all pass.

    Returns:
        AllOf instance.
    """
    composite = AllOf(checks=checks)
    return composite
def any_of(checks: list[Check]) -> AnyOf:
    """Combine checks so that a single passing check is enough.

    Args:
        checks: List of checks where at least one must pass.

    Returns:
        AnyOf instance.
    """
    composite = AnyOf(checks=checks)
    return composite
def semantic(
    min_score: float,
    model: str = "all-MiniLM-L6-v2",
    cache_embeddings: bool = True,
) -> SemanticValidator:
    """Build a semantic similarity validator from a score threshold.

    Requires the `veritext[semantic]` extra to be installed.

    Args:
        min_score: Minimum semantic similarity required (0.0 to 1.0).
        model: Name of the sentence-transformers model to use.
        cache_embeddings: Whether to cache embeddings for repeated texts.

    Returns:
        SemanticValidator instance.
    """
    validator = SemanticValidator(
        min_score=min_score,
        model=model,
        cache_embeddings=cache_embeddings,
    )
    return validator
__all__ = [
"AllOf",
"AnyOf",
"BleuValidator",
"Check",
"ContainsValidator",
"ExcludesValidator",
"LengthValidator",
"LexicalValidator",
"ReadabilityValidator",
"RougeValidator",
"SemanticValidator",
"all_of",
"any_of",
"bleu",
"contains",
"excludes",
"length",
"lexical",
"readability",
"rouge",
"semantic",
]

View File

@@ -0,0 +1,31 @@
"""Base types and protocols for validation checks."""
from typing import Protocol, runtime_checkable
from veritext.core.types import CheckResult, ValidationContext
@runtime_checkable
class Check(Protocol):
    """Structural interface every validation check satisfies.

    An implementation exposes a `name` and a `check` method that evaluates a
    text and reports a pass/fail result. Because the protocol is runtime
    checkable, `isinstance(obj, Check)` verifies that both members exist.
    """

    @property
    def name(self) -> str:
        """Return the identifier of this check."""
        ...

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """Evaluate the text and report the outcome.

        Args:
            text: The text to validate.
            context: Validation context containing reference text and metadata.

        Returns:
            CheckResult with pass/fail status and diagnostics.
        """
        ...

View File

@@ -0,0 +1,90 @@
"""Composite validators for combining multiple checks."""
from veritext.core.types import CheckResult, ValidationContext, ValidationResult
from veritext.validators.base import Check
class AllOf:
    """Passes only if all checks pass."""

    def __init__(self, checks: list[Check]) -> None:
        """
        Initialise the AllOf composite validator.

        Args:
            checks: List of checks that must all pass.

        Raises:
            ValueError: If checks list is empty.
        """
        if not checks:
            raise ValueError("checks list cannot be empty")
        # Copy so later mutation of the caller's list (e.g. clearing it)
        # cannot bypass the non-empty guarantee established above.
        self._checks = list(checks)

    @property
    def name(self) -> str:
        """Return the name of this composite check."""
        return "all_of"

    def check(self, text: str, context: ValidationContext) -> ValidationResult:
        """
        Run all checks and return aggregate result.

        Every check is executed, even after one has failed, so the result
        carries diagnostics for all of them.

        Args:
            text: The text to validate.
            context: Validation context containing reference text and metadata.

        Returns:
            ValidationResult that passes only if all checks pass.
        """
        results: list[CheckResult] = [
            check.check(text, context) for check in self._checks
        ]
        return ValidationResult(
            passed=all(result.passed for result in results),
            checks=results,
        )
class AnyOf:
    """Composite check that passes when at least one wrapped check passes."""

    def __init__(self, checks: list[Check]) -> None:
        """
        Initialise the AnyOf composite validator.

        Args:
            checks: Checks where at least one must pass. The collection is
                copied, so mutating the caller's list afterwards has no
                effect on this validator.

        Raises:
            ValueError: If checks list is empty.
        """
        # Snapshot the input before validating it so that later mutation of
        # the caller's list cannot change (or empty) the set of checks, and
        # so that an empty iterator (which is truthy) is still rejected.
        snapshot = list(checks) if checks else []
        if not snapshot:
            raise ValueError("checks list cannot be empty")
        self._checks = snapshot

    @property
    def name(self) -> str:
        """Return the name of this composite check."""
        return "any_of"

    def check(self, text: str, context: ValidationContext) -> ValidationResult:
        """
        Run all checks and return aggregate result.

        Every check is executed (no short-circuiting) so that the returned
        result carries diagnostics for each of them.

        Args:
            text: The text to validate.
            context: Validation context containing reference text and metadata.

        Returns:
            ValidationResult that passes if any check passes.
        """
        results: list[CheckResult] = [
            member.check(text, context) for member in self._checks
        ]
        return ValidationResult(
            passed=any(outcome.passed for outcome in results),
            checks=results,
        )

View File

@@ -0,0 +1,337 @@
"""Constraint validators that do not require reference text."""
import re
from veritext.core.exceptions import InvalidThresholdError
from veritext.core.tokenisation import WordTokeniser
from veritext.core.types import CheckResult, ValidationContext
from veritext.metrics.readability import Readability
class LengthValidator:
    """Check that a text's character and word counts fall within bounds."""

    def __init__(
        self,
        min_chars: int | None = None,
        max_chars: int | None = None,
        min_words: int | None = None,
        max_words: int | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the length bounds.

        Args:
            min_chars: Minimum character count (inclusive).
            max_chars: Maximum character count (inclusive).
            min_words: Minimum word count (inclusive).
            max_words: Maximum word count (inclusive).
            tokeniser: Tokeniser to use for word counting. Defaults to
                WordTokeniser().

        Raises:
            InvalidThresholdError: If no bound is given, a bound is negative,
                or a minimum exceeds its matching maximum.
        """
        # Insertion order matters: it fixes which error is reported first.
        bounds = {
            "min_chars": min_chars,
            "max_chars": max_chars,
            "min_words": min_words,
            "max_words": max_words,
        }
        if all(bound is None for bound in bounds.values()):
            raise InvalidThresholdError("At least one length constraint must be set")
        for label, bound in bounds.items():
            if bound is not None and bound < 0:
                raise InvalidThresholdError(f"{label} must be >= 0, got {bound}")

        chars_given = min_chars is not None and max_chars is not None
        if chars_given and min_chars > max_chars:
            raise InvalidThresholdError(
                f"min_chars ({min_chars}) cannot exceed max_chars ({max_chars})"
            )
        words_given = min_words is not None and max_words is not None
        if words_given and min_words > max_words:
            raise InvalidThresholdError(
                f"min_words ({min_words}) cannot exceed max_words ({max_words})"
            )

        self._min_chars = min_chars
        self._max_chars = max_chars
        self._min_words = min_words
        self._max_words = max_words
        self._tokeniser = tokeniser or WordTokeniser()

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "length"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Compare the text's character and word counts with the bounds.

        Args:
            text: The text to validate.
            context: Validation context (not used for length checks).

        Returns:
            CheckResult with pass/fail status; ``actual`` holds both counts
            and ``threshold`` holds only the bounds that were configured.
        """
        char_count = len(text)
        word_count = len(self._tokeniser.tokenise(text))

        # One row per unit: (unit label, measured count, lower, upper).
        measurements = (
            ("chars", char_count, self._min_chars, self._max_chars),
            ("words", word_count, self._min_words, self._max_words),
        )
        problems = []
        for unit, count, lower, upper in measurements:
            if lower is not None and count < lower:
                problems.append(f"{count} {unit} < min {lower}")
            if upper is not None and count > upper:
                problems.append(f"{count} {unit} > max {upper}")

        passed = not problems
        if passed:
            message = f"Length check passed: {char_count} chars, {word_count} words"
        else:
            message = "Length check failed: " + "; ".join(problems)

        configured = (
            ("min_chars", self._min_chars),
            ("max_chars", self._max_chars),
            ("min_words", self._min_words),
            ("max_words", self._max_words),
        )
        threshold = {key: bound for key, bound in configured if bound is not None}

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"chars": char_count, "words": word_count},
            threshold=threshold,
            message=message,
        )
class ReadabilityValidator:
    """Check Flesch-Kincaid grade level and/or Flesch Reading Ease of a text."""

    def __init__(
        self,
        max_grade: float | None = None,
        min_ease: float | None = None,
    ) -> None:
        """
        Configure the readability limits.

        Args:
            max_grade: Maximum Flesch-Kincaid grade level allowed.
            min_ease: Minimum Flesch Reading Ease score required.

        Raises:
            InvalidThresholdError: If neither limit is provided.
        """
        nothing_configured = max_grade is None and min_ease is None
        if nothing_configured:
            raise InvalidThresholdError(
                "At least one of max_grade or min_ease must be provided"
            )
        self._max_grade = max_grade
        self._min_ease = min_ease
        self._metric = Readability()

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "readability"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Score the text and compare the scores with the configured limits.

        Args:
            text: The text to validate.
            context: Validation context (not used for readability checks).

        Returns:
            CheckResult with pass/fail status; ``actual`` holds both scores
            and ``threshold`` holds only the limits that were configured.
        """
        scores = self._metric.score(text)
        grade = scores.flesch_kincaid_grade
        ease = scores.flesch_reading_ease
        grade_limit = self._max_grade
        ease_limit = self._min_ease

        problems = []
        if grade_limit is not None and grade > grade_limit:
            problems.append(f"grade level {grade:.1f} > max {grade_limit:.1f}")
        if ease_limit is not None and ease < ease_limit:
            problems.append(f"reading ease {ease:.1f} < min {ease_limit:.1f}")

        passed = not problems
        if passed:
            # Summarise every configured limit that was satisfied.
            satisfied = []
            if grade_limit is not None:
                satisfied.append(f"grade {grade:.1f} <= {grade_limit:.1f}")
            if ease_limit is not None:
                satisfied.append(f"ease {ease:.1f} >= {ease_limit:.1f}")
            message = "Readability: " + ", ".join(satisfied)
        else:
            message = "Readability: " + "; ".join(problems)

        threshold = {}
        if grade_limit is not None:
            threshold["max_grade"] = grade_limit
        if ease_limit is not None:
            threshold["min_ease"] = ease_limit

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"grade": grade, "ease": ease},
            threshold=threshold,
            message=message,
        )
class ContainsValidator:
    """Validates text contains required patterns."""

    def __init__(
        self,
        patterns: list[str],
        case_sensitive: bool = False,
    ) -> None:
        """
        Initialise the contains validator.

        Each pattern is interpreted as a regular expression and compiled
        once here, so an invalid expression is reported at construction time
        rather than on the first ``check`` call. Literal substrings that
        contain regex metacharacters should be escaped by the caller
        (e.g. with ``re.escape``).

        Args:
            patterns: List of substrings or regex patterns that must be present.
            case_sensitive: Whether matching is case-sensitive. Defaults to False.

        Raises:
            InvalidThresholdError: If patterns list is empty.
            re.error: If a pattern is not a valid regular expression.
        """
        if not patterns:
            raise InvalidThresholdError("patterns list cannot be empty")
        # Copy so that later mutation of the caller's list cannot desynchronise
        # the reported patterns from the compiled ones.
        self._patterns = list(patterns)
        self._case_sensitive = case_sensitive
        self._flags = 0 if case_sensitive else re.IGNORECASE
        self._compiled = [re.compile(p, self._flags) for p in self._patterns]

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "contains"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the contains check.

        Args:
            text: The text to validate.
            context: Validation context (not used for contains checks).

        Returns:
            CheckResult with pass/fail status; ``actual`` reports how many
            patterns were found and lists the ones that were missing.
        """
        missing = [
            pattern
            for pattern, regex in zip(self._patterns, self._compiled)
            if not regex.search(text)
        ]
        passed = not missing
        if passed:
            message = f"Text contains all {len(self._patterns)} required pattern(s)"
        else:
            message = f"Text missing {len(missing)} pattern(s): {missing}"
        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"found": len(self._patterns) - len(missing), "missing": missing},
            threshold={"patterns": self._patterns},
            message=message,
        )
class ExcludesValidator:
    """Validates text excludes forbidden patterns."""

    def __init__(
        self,
        patterns: list[str],
        case_sensitive: bool = False,
    ) -> None:
        """
        Initialise the excludes validator.

        Each pattern is interpreted as a regular expression and compiled
        once here, so an invalid expression is reported at construction time
        rather than on the first ``check`` call. Literal substrings that
        contain regex metacharacters should be escaped by the caller
        (e.g. with ``re.escape``).

        Args:
            patterns: List of substrings or regex patterns that must not be present.
            case_sensitive: Whether matching is case-sensitive. Defaults to False.

        Raises:
            InvalidThresholdError: If patterns list is empty.
            re.error: If a pattern is not a valid regular expression.
        """
        if not patterns:
            raise InvalidThresholdError("patterns list cannot be empty")
        # Copy so that later mutation of the caller's list cannot desynchronise
        # the reported patterns from the compiled ones.
        self._patterns = list(patterns)
        self._case_sensitive = case_sensitive
        self._flags = 0 if case_sensitive else re.IGNORECASE
        self._compiled = [re.compile(p, self._flags) for p in self._patterns]

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "excludes"

    def check(self, text: str, context: ValidationContext) -> CheckResult:  # noqa: ARG002
        """
        Run the excludes check.

        Args:
            text: The text to validate.
            context: Validation context (not used for excludes checks).

        Returns:
            CheckResult with pass/fail status; ``actual`` reports how many
            patterns were absent and lists the forbidden ones that matched.
        """
        found = [
            pattern
            for pattern, regex in zip(self._patterns, self._compiled)
            if regex.search(text)
        ]
        passed = not found
        if passed:
            message = f"Text excludes all {len(self._patterns)} forbidden pattern(s)"
        else:
            message = f"Text contains {len(found)} forbidden pattern(s): {found}"
        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"excluded": len(self._patterns) - len(found), "found": found},
            threshold={"patterns": self._patterns},
            message=message,
        )

View File

@@ -0,0 +1,370 @@
"""Metric-based validators that require reference text."""
from typing import Literal
from veritext.core.exceptions import InvalidThresholdError, ValidationError
from veritext.core.tokenisation import WordTokeniser
from veritext.core.types import CheckResult, ValidationContext
from veritext.metrics.bleu import Bleu
from veritext.metrics.lexical import Lexical
from veritext.metrics.rouge import Rouge
class BleuValidator:
    """Check that a BLEU score against the reference reaches a minimum."""

    def __init__(
        self,
        min_score: float,
        variant: Literal[1, 2, 3, 4] = 4,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the BLEU threshold.

        Args:
            min_score: Minimum BLEU score required (0.0 to 1.0).
            variant: BLEU variant to use (1, 2, 3, or 4). Defaults to 4.
            tokeniser: Tokeniser forwarded to the Bleu metric.

        Raises:
            InvalidThresholdError: If min_score is outside [0.0, 1.0] or
                variant is not one of 1, 2, 3, 4.
        """
        # Written as a negated chained comparison so NaN is rejected too.
        if not 0.0 <= min_score <= 1.0:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )
        allowed_variants = (1, 2, 3, 4)
        if variant not in allowed_variants:
            raise InvalidThresholdError(f"variant must be 1, 2, 3, or 4, got {variant}")
        self._min_score = min_score
        self._variant = variant
        self._metric = Bleu(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return f"bleu-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score the text against the reference and compare with the minimum.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)
        # Variants 1..4 map onto positions 0..3 of this tuple.
        by_variant = (scores.bleu1, scores.bleu2, scores.bleu3, scores.bleu4)
        actual_score = by_variant[self._variant - 1]

        passed = actual_score >= self._min_score
        verdict = "meets" if passed else "below"
        message = (
            f"BLEU-{self._variant} score {actual_score:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )
        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual_score,
            threshold=self._min_score,
            message=message,
        )
class RougeValidator:
    """Check that a ROUGE F-measure against the reference reaches a minimum."""

    def __init__(
        self,
        min_score: float,
        variant: Literal["1", "2", "l"] = "l",
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the ROUGE threshold.

        Args:
            min_score: Minimum ROUGE F-measure required (0.0 to 1.0).
            variant: ROUGE variant ("1", "2", or "l"). Defaults to "l".
            tokeniser: Tokeniser forwarded to the Rouge metric.

        Raises:
            InvalidThresholdError: If min_score is outside [0.0, 1.0] or
                variant is not one of "1", "2", "l".
        """
        # Written as a negated chained comparison so NaN is rejected too.
        if not 0.0 <= min_score <= 1.0:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )
        allowed_variants = ("1", "2", "l")
        if variant not in allowed_variants:
            raise InvalidThresholdError(
                f"variant must be '1', '2', or 'l', got '{variant}'"
            )
        self._min_score = min_score
        self._variant = variant
        self._metric = Rouge(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return f"rouge-{self._variant}"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score the text against the reference and compare with the minimum.

        The F-measure of the configured variant is the value compared.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)
        fmeasures = {
            "1": scores.rouge1.fmeasure,
            "2": scores.rouge2.fmeasure,
            "l": scores.rouge_l.fmeasure,
        }
        actual_score = fmeasures[self._variant]

        passed = actual_score >= self._min_score
        verdict = "meets" if passed else "below"
        label = self._variant.upper()
        message = (
            f"ROUGE-{label} score {actual_score:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )
        return CheckResult(
            name=self.name,
            passed=passed,
            actual=actual_score,
            threshold=self._min_score,
            message=message,
        )
class LexicalValidator:
    """Check Jaccard similarity and/or token overlap against the reference."""

    def __init__(
        self,
        min_jaccard: float | None = None,
        min_overlap: float | None = None,
        tokeniser: WordTokeniser | None = None,
    ) -> None:
        """
        Configure the lexical similarity thresholds.

        Args:
            min_jaccard: Minimum Jaccard similarity required (0.0 to 1.0).
            min_overlap: Minimum token overlap required (0.0 to 1.0).
            tokeniser: Tokeniser forwarded to the Lexical metric.

        Raises:
            InvalidThresholdError: If neither threshold is provided or a
                provided threshold is outside [0.0, 1.0].
        """
        if min_jaccard is None and min_overlap is None:
            raise InvalidThresholdError(
                "At least one of min_jaccard or min_overlap must be provided"
            )
        # Same range rule for both thresholds, checked in declaration order.
        for label, limit in (("min_jaccard", min_jaccard), ("min_overlap", min_overlap)):
            if limit is not None and not 0.0 <= limit <= 1.0:
                raise InvalidThresholdError(
                    f"{label} must be between 0.0 and 1.0, got {limit}"
                )
        self._min_jaccard = min_jaccard
        self._min_overlap = min_overlap
        self._metric = Lexical(tokeniser=tokeniser)

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "lexical"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score the text against the reference and compare with the thresholds.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status; ``actual`` holds both scores
            and ``threshold`` holds only the thresholds that were configured.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        scores = self._metric.score(text, reference)
        jaccard = scores.jaccard
        overlap = scores.token_overlap
        jaccard_floor = self._min_jaccard
        overlap_floor = self._min_overlap

        # Only thresholds that were configured can produce a failure.
        problems = []
        if jaccard_floor is not None and jaccard < jaccard_floor:
            problems.append(f"Jaccard {jaccard:.2f} below minimum {jaccard_floor:.2f}")
        if overlap_floor is not None and overlap < overlap_floor:
            problems.append(
                f"token overlap {overlap:.2f} below minimum {overlap_floor:.2f}"
            )

        passed = not problems
        if passed:
            satisfied = []
            if jaccard_floor is not None:
                satisfied.append(f"Jaccard {jaccard:.2f} >= {jaccard_floor:.2f}")
            if overlap_floor is not None:
                satisfied.append(f"overlap {overlap:.2f} >= {overlap_floor:.2f}")
            message = "Lexical similarity: " + ", ".join(satisfied)
        else:
            message = "Lexical similarity: " + "; ".join(problems)

        threshold = {}
        if jaccard_floor is not None:
            threshold["min_jaccard"] = jaccard_floor
        if overlap_floor is not None:
            threshold["min_overlap"] = overlap_floor

        return CheckResult(
            name=self.name,
            passed=passed,
            actual={"jaccard": jaccard, "token_overlap": overlap},
            threshold=threshold,
            message=message,
        )
class SemanticValidator:
    """Check that semantic similarity to the reference reaches a minimum.

    Requires the `veritext[semantic]` extra to be installed.
    """

    def __init__(
        self,
        min_score: float,
        model: str = "all-MiniLM-L6-v2",
        cache_embeddings: bool = True,
    ) -> None:
        """
        Configure the semantic similarity threshold and backing metric.

        Args:
            min_score: Minimum semantic similarity required (0.0 to 1.0).
            model: Name of the sentence-transformers model to use.
            cache_embeddings: Whether to cache embeddings for repeated texts.

        Raises:
            InvalidThresholdError: If min_score is not in range [0.0, 1.0].
            DependencyError: If sentence-transformers is not installed.
        """
        # Written as a negated chained comparison so NaN is rejected too.
        if not 0.0 <= min_score <= 1.0:
            raise InvalidThresholdError(
                f"min_score must be between 0.0 and 1.0, got {min_score}"
            )
        self._min_score = min_score

        # Imported here rather than at module level so that PyTorch is only
        # loaded when a semantic validator is actually constructed.
        from veritext.semantic.similarity import SemanticSimilarity

        self._metric: SemanticSimilarity = SemanticSimilarity(
            model=model, cache_embeddings=cache_embeddings
        )

    @property
    def name(self) -> str:
        """Return the name of this check."""
        return "semantic"

    def check(self, text: str, context: ValidationContext) -> CheckResult:
        """
        Score the text against the reference and compare with the minimum.

        Args:
            text: The text to validate.
            context: Validation context containing reference text.

        Returns:
            CheckResult with pass/fail status.

        Raises:
            ValidationError: If reference text is missing from context.
        """
        reference = context.reference
        if reference is None:
            raise ValidationError(f"{self.name} requires reference text in context")

        similarity = self._metric.score(text, reference).similarity
        passed = similarity >= self._min_score
        verdict = "meets" if passed else "below"
        message = (
            f"Semantic similarity {similarity:.2f} "
            f"{verdict} minimum {self._min_score:.2f}"
        )
        return CheckResult(
            name=self.name,
            passed=passed,
            actual=similarity,
            threshold=self._min_score,
            message=message,
        )

View File

@@ -0,0 +1,274 @@
"""Tests for the readability metric."""
import pytest
from veritext.metrics import Readability, ReadabilityResult
class TestReadability:
    """Behavioural tests for the Readability metric."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Supply a fresh Readability instance to each test."""
        return Readability()

    def test_name(self, readability: Readability) -> None:
        """The metric reports its name as 'readability'."""
        assert readability.name == "readability"

    def test_requires_reference(self, readability: Readability) -> None:
        """Readability does not need reference text."""
        assert readability.requires_reference is False

    def test_simple_text(self, readability: Readability) -> None:
        """Short sentences of simple words score as easy."""
        # Children's-book style: short sentences, simple words.
        scores = readability.score("The cat sat. The dog ran. I see a bird.")
        # Expect a low grade level and a high reading ease.
        assert scores.flesch_kincaid_grade < 5.0
        assert scores.flesch_reading_ease > 80.0

    def test_complex_text(self, readability: Readability) -> None:
        """A long sentence of polysyllabic words scores as hard."""
        # Academic style: one long sentence, polysyllabic vocabulary.
        sample = (
            "The implementation of sophisticated computational methodologies "
            "necessitates comprehensive understanding of algorithmic complexity "
            "and architectural considerations."
        )
        scores = readability.score(sample)
        # Expect a high grade level and a low reading ease.
        assert scores.flesch_kincaid_grade > 12.0
        assert scores.flesch_reading_ease < 30.0

    def test_medium_text(self, readability: Readability) -> None:
        """Everyday prose lands between the two extremes."""
        sample = (
            "The weather today is quite pleasant. "
            "Many people are enjoying the sunshine in the park. "
            "Children play while parents watch nearby."
        )
        scores = readability.score(sample)
        # Expect middle-of-the-road values.
        assert 3.0 < scores.flesch_kincaid_grade < 10.0
        assert 50.0 < scores.flesch_reading_ease < 90.0

    def test_single_sentence(self, readability: Readability) -> None:
        """A single sentence can be scored."""
        scores = readability.score("The cat sat on the mat.")
        # Scoring must complete and populate both fields.
        assert scores.flesch_kincaid_grade is not None
        assert scores.flesch_reading_ease is not None

    def test_single_word(self, readability: Readability) -> None:
        """A single word can be scored."""
        scores = readability.score("Cat")
        # One word, one sentence, one syllable must still be handled.
        assert scores.flesch_kincaid_grade is not None
        assert scores.flesch_reading_ease is not None

    def test_empty_text(self, readability: Readability) -> None:
        """Empty text yields zero for both scores."""
        scores = readability.score("")
        assert scores.flesch_kincaid_grade == 0.0
        assert scores.flesch_reading_ease == 0.0

    def test_whitespace_only(self, readability: Readability) -> None:
        """Whitespace-only text yields zero for both scores."""
        scores = readability.score(" \t\n ")
        assert scores.flesch_kincaid_grade == 0.0
        assert scores.flesch_reading_ease == 0.0

    def test_reference_ignored(self, readability: Readability) -> None:
        """Passing a reference does not change the scores."""
        sample = "The cat sat on the mat."
        # No reference at all.
        baseline = readability.score(sample)
        # A single reference string (should be ignored).
        with_single_ref = readability.score(sample, "Completely different text")
        # A list of references (should be ignored).
        with_ref_list = readability.score(sample, ["ref1", "ref2"])
        # All three calls must agree.
        assert baseline.flesch_kincaid_grade == with_single_ref.flesch_kincaid_grade
        assert baseline.flesch_reading_ease == with_single_ref.flesch_reading_ease
        assert baseline.flesch_kincaid_grade == with_ref_list.flesch_kincaid_grade

    def test_punctuation_handling(self, readability: Readability) -> None:
        """Sentence-ending punctuation changes the score."""
        # Same words, different sentence structure.
        one_sentence = readability.score("The cat sat on the mat")  # 1 sentence
        two_sentences = readability.score("The cat sat. On the mat.")  # 2 sentences
        # A different sentence count must shift the grade level.
        assert one_sentence.flesch_kincaid_grade != two_sentences.flesch_kincaid_grade

    def test_question_marks_count_sentences(self, readability: Readability) -> None:
        """Text with question marks can be scored."""
        scores = readability.score("What is this? It is a test.")
        # Intended as 2 sentences; with 7 words, words_per_sentence = 3.5.
        assert scores.flesch_kincaid_grade is not None

    def test_exclamation_marks_count_sentences(self, readability: Readability) -> None:
        """Text with exclamation marks can be scored."""
        scores = readability.score("Wow! That is amazing!")
        # Intended as 2 sentences.
        assert scores.flesch_kincaid_grade is not None

    def test_multiple_punctuation(self, readability: Readability) -> None:
        """Runs of punctuation marks are handled gracefully."""
        scores = readability.score("What?! That's crazy... Well then.")
        assert scores.flesch_kincaid_grade is not None

    def test_result_score_property(self, readability: Readability) -> None:
        """The result's ``score`` equals its flesch_reading_ease."""
        scores = readability.score("The cat sat on the mat.")
        assert scores.score == scores.flesch_reading_ease

    def test_contractions(self, readability: Readability) -> None:
        """Text with contractions can be scored."""
        scores = readability.score("I'm going to the store. It's not far away.")
        # Contractions are expected to be treated as words.
        assert scores.flesch_kincaid_grade is not None
        assert scores.flesch_reading_ease is not None
class TestReadabilityBatch:
    """Behavioural tests for Readability.batch_score."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Supply a fresh Readability instance to each test."""
        return Readability()

    def test_batch_score_basic(self, readability: Readability) -> None:
        """Two candidates produce two results."""
        batch = readability.batch_score(
            [
                "The cat sat on the mat.",
                "A dog ran through the park.",
            ]
        )
        assert batch.count == 2
        assert len(batch.results) == 2

    def test_batch_score_statistics(self, readability: Readability) -> None:
        """Batch scoring exposes statistics for both scores."""
        samples = [
            "Cat sat.",  # Very simple
            "The implementation of sophisticated methodologies requires expertise.",
        ]
        batch = readability.batch_score(samples)
        # Statistics exist for both metrics.
        assert "flesch_kincaid_grade" in batch.stats
        assert "flesch_reading_ease" in batch.stats
        # The first sample must be easier to read than the second.
        easy, hard = batch.results[0], batch.results[1]
        assert easy.flesch_reading_ease > hard.flesch_reading_ease

    def test_batch_score_percentiles(self, readability: Readability) -> None:
        """Batch statistics include the 25/50/75/95 percentiles."""
        batch = readability.batch_score(["a", "b", "c", "d", "e"])
        ease_stats = batch.stats["flesch_reading_ease"]
        assert 25 in ease_stats.percentiles
        assert 50 in ease_stats.percentiles
        assert 75 in ease_stats.percentiles
        assert 95 in ease_stats.percentiles

    def test_batch_score_references_ignored(self, readability: Readability) -> None:
        """Supplying references does not change batch results."""
        samples = ["The cat sat.", "A dog ran."]
        without_refs = readability.batch_score(samples)
        with_refs = readability.batch_score(samples, ["ref1", "ref2"])
        # The first result must be identical either way.
        assert without_refs.results[0].flesch_kincaid_grade == (
            with_refs.results[0].flesch_kincaid_grade
        )

    def test_batch_score_empty_list_raises(self, readability: Readability) -> None:
        """An empty candidate list raises ValueError mentioning 'empty'."""
        with pytest.raises(ValueError, match="empty"):
            readability.batch_score([])
class TestReadabilityResult:
    """Tests for the ReadabilityResult value type."""

    def test_frozen(self) -> None:
        """Assigning to a field raises pydantic's ValidationError."""
        from pydantic import ValidationError

        frozen = ReadabilityResult(flesch_kincaid_grade=5.0, flesch_reading_ease=70.0)
        with pytest.raises(ValidationError):
            frozen.flesch_kincaid_grade = 6.0  # type: ignore[misc]

    def test_values(self) -> None:
        """Constructor arguments are readable back unchanged."""
        stored = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
        assert stored.flesch_kincaid_grade == 8.5
        assert stored.flesch_reading_ease == 65.0

    def test_score_property(self) -> None:
        """``score`` returns the flesch_reading_ease value."""
        stored = ReadabilityResult(flesch_kincaid_grade=8.5, flesch_reading_ease=65.0)
        assert stored.score == 65.0
class TestSyllableCounting:
    """Tests that syllable counts influence the reading-ease score."""

    @pytest.fixture
    def readability(self) -> Readability:
        """Supply a fresh Readability instance to each test."""
        return Readability()

    def test_monosyllabic_words(self, readability: Readability) -> None:
        """A sentence of one-syllable words scores as very easy."""
        # Every word here has a single syllable.
        scores = readability.score("The cat sat on the mat.")
        assert scores.flesch_reading_ease > 90.0

    def test_polysyllabic_words(self, readability: Readability) -> None:
        """A sentence of multi-syllable words scores as harder."""
        # Every word here has several syllables.
        scores = readability.score(
            "International communication facilitates understanding."
        )
        assert scores.flesch_reading_ease < 50.0

View File

@@ -0,0 +1,295 @@
"""Tests for the ROUGE metric."""
import pytest
from veritext.metrics import Rouge, RougeResult, RougeScore
class TestRouge:
"""Tests for the Rouge metric class."""
@pytest.fixture
def rouge(self) -> Rouge:
"""Provide a ROUGE metric instance."""
return Rouge()
def test_name(self, rouge: Rouge) -> None:
"""Test that name returns 'rouge'."""
assert rouge.name == "rouge"
def test_requires_reference(self, rouge: Rouge) -> None:
"""Test that ROUGE requires reference text."""
assert rouge.requires_reference is True
def test_identical_texts(self, rouge: Rouge) -> None:
"""Test that identical texts produce perfect scores."""
text = "The cat sat on the mat"
result = rouge.score(text, text)
assert result.rouge1.precision == 1.0
assert result.rouge1.recall == 1.0
assert result.rouge1.fmeasure == 1.0
assert result.rouge2.fmeasure == 1.0
assert result.rouge_l.fmeasure == 1.0
def test_no_overlap(self, rouge: Rouge) -> None:
"""Test that texts with no overlap produce zero scores."""
candidate = "apple banana cherry"
reference = "dog elephant fox"
result = rouge.score(candidate, reference)
assert result.rouge1.precision == 0.0
assert result.rouge1.recall == 0.0
assert result.rouge1.fmeasure == 0.0
assert result.rouge2.fmeasure == 0.0
assert result.rouge_l.fmeasure == 0.0
def test_partial_overlap_rouge1(self, rouge: Rouge) -> None:
"""Test ROUGE-1 with partial overlap."""
candidate = "the cat sat"
reference = "the dog sat"
result = rouge.score(candidate, reference)
# Candidate: {the, cat, sat}, Reference: {the, dog, sat}
# Overlap: {the, sat} = 2
# Precision = 2/3, Recall = 2/3
assert abs(result.rouge1.precision - 2 / 3) < 1e-10
assert abs(result.rouge1.recall - 2 / 3) < 1e-10
def test_partial_overlap_rouge2(self, rouge: Rouge) -> None:
"""Test ROUGE-2 (bigram) with partial overlap."""
candidate = "the cat sat on the mat"
reference = "the cat lay on the mat"
result = rouge.score(candidate, reference)
# Bigrams in candidate: (the, cat), (cat, sat), (sat, on), (on, the), (the, mat)
# Bigrams in reference: (the, cat), (cat, lay), (lay, on), (on, the), (the, mat)
# Overlap: (the, cat), (on, the), (the, mat) = 3
# Precision = 3/5, Recall = 3/5
assert abs(result.rouge2.precision - 3 / 5) < 1e-10
assert abs(result.rouge2.recall - 3 / 5) < 1e-10
def test_rouge_l_basic(self, rouge: Rouge) -> None:
"""Test ROUGE-L (LCS) computation."""
candidate = "the cat sat on the mat"
reference = "the cat sat"
result = rouge.score(candidate, reference)
# LCS = "the cat sat" = 3 tokens
# Precision = 3/6 = 0.5, Recall = 3/3 = 1.0
assert result.rouge_l.precision == 0.5
assert result.rouge_l.recall == 1.0
def test_rouge_l_non_contiguous(self, rouge: Rouge) -> None:
"""Test ROUGE-L with non-contiguous LCS."""
candidate = "the big cat sat"
reference = "the cat sat"
result = rouge.score(candidate, reference)
# LCS = "the cat sat" = 3 (skipping "big")
# Precision = 3/4, Recall = 3/3 = 1.0
assert result.rouge_l.precision == 0.75
assert result.rouge_l.recall == 1.0
def test_precision_vs_recall(self, rouge: Rouge) -> None:
"""Test that precision and recall differ appropriately."""
# Short candidate, long reference
candidate = "the cat"
reference = "the cat sat on the mat"
result = rouge.score(candidate, reference)
# Precision should be high (all candidate tokens in reference)
assert result.rouge1.precision == 1.0
# Recall should be lower (not all reference tokens in candidate)
assert result.rouge1.recall < 1.0
def test_empty_candidate(self, rouge: Rouge) -> None:
"""Test that empty candidate returns zero scores."""
result = rouge.score("", "The cat sat")
assert result.rouge1.fmeasure == 0.0
assert result.rouge2.fmeasure == 0.0
assert result.rouge_l.fmeasure == 0.0
def test_whitespace_only_candidate(self, rouge: Rouge) -> None:
"""Test that whitespace-only candidate returns zero scores."""
result = rouge.score(" \t\n ", "The cat sat")
assert result.rouge1.fmeasure == 0.0
assert result.rouge_l.fmeasure == 0.0
def test_empty_reference_raises(self, rouge: Rouge) -> None:
"""Test that empty reference raises ValueError."""
with pytest.raises(ValueError, match="cannot be empty"):
rouge.score("The cat sat", "")
def test_none_reference_raises(self, rouge: Rouge) -> None:
    """Check that scoring with ``None`` as the reference raises ValueError."""
    expected_failure = pytest.raises(ValueError, match="requires reference")
    with expected_failure:
        rouge.score("The cat sat", None)
def test_multiple_references_uses_max(self, rouge: Rouge) -> None:
    """Check that an exact-match reference among several gives perfect scores."""
    low_overlap = "a dog ran across the room"
    exact_match = "the cat sat on the mat"
    scored = rouge.score("the cat sat on the mat", [low_overlap, exact_match])
    # The exact match is expected to win over the low-overlap reference.
    assert scored.rouge1.fmeasure == 1.0
    assert scored.rouge_l.fmeasure == 1.0
def test_multiple_references_partial(self, rouge: Rouge) -> None:
    """Test that the closest partially-matching reference determines the score.

    Neither reference equals the candidate, so the expected ROUGE-1
    F-measure is the one obtained against the closer of the two.
    """
    candidate = "the quick brown fox"
    references = [
        "the fast brown fox",  # shares "the", "brown", "fox": 3/4 unigrams
        "a quick brown dog",  # shares "quick", "brown": 2/4 unigrams
    ]
    result = rouge.score(candidate, references)
    # Against the closer reference P = R = 3/4, hence F = 3/4. Pinning the
    # value (rather than merely checking > 0) catches an implementation that
    # scores against the weaker reference (F = 1/2) or averages the two.
    assert abs(result.rouge1.fmeasure - 0.75) < 1e-10
def test_result_score_property(self, rouge: Rouge) -> None:
    """Check that ``result.score`` equals the ROUGE-L F-measure."""
    sentence = "The cat sat"
    scored = rouge.score(sentence, sentence)
    assert scored.score == scored.rouge_l.fmeasure
def test_case_insensitivity(self, rouge: Rouge) -> None:
    """Check that texts differing only in letter case score as identical."""
    scored = rouge.score("THE CAT SAT", "the cat sat")
    for variant in (scored.rouge1, scored.rouge_l):
        assert variant.fmeasure == 1.0
def test_punctuation_ignored(self, rouge: Rouge) -> None:
    """Check that texts differing only in trailing punctuation match fully."""
    unigram = rouge.score("The cat sat.", "The cat sat!").rouge1
    assert unigram.fmeasure == 1.0
def test_single_word(self, rouge: Rouge) -> None:
    """Check ROUGE scores when both texts are the same single word."""
    scored = rouge.score("cat", "cat")
    assert scored.rouge1.fmeasure == 1.0
    # A one-token text contains no bigrams, so ROUGE-2 has nothing to match.
    assert scored.rouge2.fmeasure == 0.0
    assert scored.rouge_l.fmeasure == 1.0
def test_fmeasure_calculation(self, rouge: Rouge) -> None:
    """Check the F-measure against the harmonic mean of precision and recall."""
    # The candidate is twice as long as the reference, so P != R:
    # P = 2/4 = 0.5 and R = 2/2 = 1.0, giving F = 2PR / (P + R) = 2/3.
    precision, recall = 0.5, 1.0
    harmonic_mean = 2 * precision * recall / (precision + recall)
    scored = rouge.score("the cat sat on", "the cat")
    assert abs(scored.rouge1.fmeasure - harmonic_mean) < 1e-10
class TestRougeBatch:
    """Batch-scoring behaviour of the ROUGE metric."""

    @pytest.fixture
    def rouge(self) -> Rouge:
        """Build a default-configured ROUGE metric for each test."""
        return Rouge()

    def test_batch_score_basic(self, rouge: Rouge) -> None:
        """Check count, result length and scores for identical pairs."""
        hypotheses = ["The cat sat", "A dog runs"]
        targets = ["The cat sat", "A dog runs"]
        batch = rouge.batch_score(hypotheses, targets)
        assert batch.count == 2
        assert len(batch.results) == 2
        for item in batch.results:
            assert item.rouge_l.fmeasure == 1.0

    def test_batch_score_statistics(self, rouge: Rouge) -> None:
        """Check that the expected statistic keys are present in a batch result."""
        hypotheses = ["The cat sat", "Completely different words"]
        targets = ["The cat sat", "The cat sat"]
        batch = rouge.batch_score(hypotheses, targets)
        # One statistics entry is expected per listed metric component.
        expected_keys = (
            "rouge1_fmeasure",
            "rouge2_fmeasure",
            "rouge_l_fmeasure",
            "rouge1_precision",
            "rouge1_recall",
        )
        for key in expected_keys:
            assert key in batch.stats
        # The first pair is identical; the second shares no words.
        matching, disjoint = batch.results[0], batch.results[1]
        assert matching.rouge1.fmeasure == 1.0
        assert disjoint.rouge1.fmeasure == 0.0

    def test_batch_score_percentiles(self, rouge: Rouge) -> None:
        """Check that the 25/50/75/95 percentiles are reported."""
        hypotheses = ["a", "b", "c", "d", "e"]
        targets = ["a", "b", "c", "d", "e"]
        batch = rouge.batch_score(hypotheses, targets)
        reported = batch.stats["rouge1_fmeasure"].percentiles
        for percentile in (25, 50, 75, 95):
            assert percentile in reported

    def test_batch_score_none_references_raises(self, rouge: Rouge) -> None:
        """Check that ``None`` references are rejected with ValueError."""
        expected_failure = pytest.raises(ValueError, match="requires reference")
        with expected_failure:
            rouge.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(self, rouge: Rouge) -> None:
        """Check that unequal candidate/reference counts raise ValueError."""
        expected_failure = pytest.raises(ValueError, match="must match")
        with expected_failure:
            rouge.batch_score(["a", "b"], ["a"])

    def test_batch_score_with_multiple_references(self, rouge: Rouge) -> None:
        """Check batch scoring when each candidate has several references."""
        hypotheses = ["The cat sat on the mat", "A quick brown fox"]
        target_sets = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown fox", "The fast brown fox"],
        ]
        batch = rouge.batch_score(hypotheses, target_sets)
        assert batch.count == 2
        # Each candidate has an exact match among its references.
        first, second = batch.results[0], batch.results[1]
        assert first.rouge_l.fmeasure == 1.0
        assert second.rouge_l.fmeasure == 1.0
class TestRougeResult:
    """Behaviour of the RougeResult and RougeScore value types."""

    def test_rouge_score_frozen(self) -> None:
        """Check that assigning to a RougeScore field raises ValidationError."""
        from pydantic import ValidationError

        frozen_score = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        with pytest.raises(ValidationError):
            frozen_score.precision = 0.7  # type: ignore[misc]

    def test_rouge_result_frozen(self) -> None:
        """Check that assigning to a RougeResult field raises ValidationError."""
        from pydantic import ValidationError

        shared = RougeScore(precision=0.5, recall=0.6, fmeasure=0.55)
        frozen_result = RougeResult(rouge1=shared, rouge2=shared, rouge_l=shared)
        with pytest.raises(ValidationError):
            frozen_result.rouge1 = shared  # type: ignore[misc]

    def test_score_property(self) -> None:
        """Check that ``score`` reports the ROUGE-L F-measure."""
        # Distinct values per variant make it clear which one ``score`` reads.
        unigram = RougeScore(precision=0.9, recall=0.9, fmeasure=0.9)
        bigram = RougeScore(precision=0.8, recall=0.8, fmeasure=0.8)
        lcs = RougeScore(precision=0.7, recall=0.7, fmeasure=0.7)
        combined = RougeResult(rouge1=unigram, rouge2=bigram, rouge_l=lcs)
        assert combined.score == 0.7

View File

@@ -0,0 +1 @@
"""Tests for semantic similarity module."""

View File

@@ -0,0 +1,240 @@
"""Tests for the semantic similarity metric."""
import pytest
# Skip all tests if sentence-transformers is not installed
pytest.importorskip("sentence_transformers")
from veritext.metrics.results import SemanticResult
from veritext.semantic import SemanticSimilarity
class TestSemanticSimilarity:
    """Single-pair scoring behaviour of the SemanticSimilarity metric."""

    @pytest.fixture
    def semantic(self) -> SemanticSimilarity:
        """Build a default-configured SemanticSimilarity metric for each test."""
        return SemanticSimilarity()

    def test_name(self, semantic: SemanticSimilarity) -> None:
        """Check that the metric identifies itself as 'semantic'."""
        assert semantic.name == "semantic"

    def test_requires_reference(self, semantic: SemanticSimilarity) -> None:
        """Check that the metric declares a reference text as mandatory."""
        assert semantic.requires_reference is True

    def test_identical_texts(self, semantic: SemanticSimilarity) -> None:
        """Check similarity and recorded model name for identical texts."""
        sentence = "The cat sat on the mat"
        scored = semantic.score(sentence, sentence)
        # A text compared with itself should score very close to 1.0.
        assert scored.similarity >= 0.99
        assert scored.model == "all-MiniLM-L6-v2"

    def test_semantically_similar_texts(self, semantic: SemanticSimilarity) -> None:
        """Check that a paraphrase scores above a modest similarity floor."""
        scored = semantic.score(
            "The cat sat on the mat",
            "A feline rested on the rug",
        )
        # Texts with close meanings should clear this floor.
        assert scored.similarity > 0.3

    def test_unrelated_texts(self, semantic: SemanticSimilarity) -> None:
        """Check that texts on unrelated topics stay below a similarity ceiling."""
        scored = semantic.score(
            "The quick brown fox",
            "Quantum physics describes particle behaviour",
        )
        # Texts with unrelated meanings should stay under this ceiling.
        assert scored.similarity < 0.5

    def test_empty_candidate(self, semantic: SemanticSimilarity) -> None:
        """Check that an empty candidate yields zero similarity."""
        scored = semantic.score("", "The cat sat on the mat")
        assert scored.similarity == 0.0

    def test_whitespace_only_candidate(self, semantic: SemanticSimilarity) -> None:
        """Check that a whitespace-only candidate yields zero similarity."""
        scored = semantic.score(" \t\n ", "The cat sat on the mat")
        assert scored.similarity == 0.0

    def test_none_reference_raises(self, semantic: SemanticSimilarity) -> None:
        """Check that ``None`` as the reference raises ValueError."""
        expected_failure = pytest.raises(ValueError, match="requires reference")
        with expected_failure:
            semantic.score("The cat sat", None)

    def test_empty_reference_raises(self, semantic: SemanticSimilarity) -> None:
        """Check that an empty reference raises ValueError."""
        expected_failure = pytest.raises(ValueError, match="cannot be empty")
        with expected_failure:
            semantic.score("The cat sat", "")

    def test_whitespace_reference_raises(self, semantic: SemanticSimilarity) -> None:
        """Check that a whitespace-only reference raises ValueError."""
        expected_failure = pytest.raises(ValueError, match="cannot be empty")
        with expected_failure:
            semantic.score("The cat sat", " \t\n ")

    def test_multiple_references(self, semantic: SemanticSimilarity) -> None:
        """Check that an exact-match reference among several dominates the score."""
        unrelated = "A dog ran through the park"
        exact_match = "The cat sat on the mat"
        scored = semantic.score("The cat sat on the mat", [unrelated, exact_match])
        # The exact-match reference should drive the similarity close to 1.0.
        assert scored.similarity >= 0.99

    def test_multiple_references_takes_max(self, semantic: SemanticSimilarity) -> None:
        """Check that the closer of two inexact references sets the score."""
        distant = "Quantum physics is complex"
        paraphrase = "A feline rested on the rug"
        scored = semantic.score("The cat sat on the mat", [distant, paraphrase])
        # The paraphrase, not the distant text, should determine the result.
        assert scored.similarity > 0.3

    def test_result_score_property(self, semantic: SemanticSimilarity) -> None:
        """Check that ``result.score`` equals ``result.similarity``."""
        sentence = "The cat sat"
        scored = semantic.score(sentence, sentence)
        assert scored.score == scored.similarity

    def test_caching_behaviour(self) -> None:
        """Check that scores are stable with caching on, also after a cache clear."""
        metric = SemanticSimilarity(cache_embeddings=True)
        sentence = "The cat sat on the mat"
        # Scoring the same pair twice must give the same similarity.
        first = metric.score(sentence, sentence)
        second = metric.score(sentence, sentence)
        assert first.similarity == second.similarity
        # Emptying the cache must not change the score either.
        metric.clear_cache()
        third = metric.score(sentence, sentence)
        assert third.similarity == first.similarity

    def test_caching_disabled(self) -> None:
        """Check that scoring and ``clear_cache`` work with caching turned off."""
        metric = SemanticSimilarity(cache_embeddings=False)
        sentence = "The cat sat on the mat"
        first = metric.score(sentence, sentence)
        second = metric.score(sentence, sentence)
        # Without a cache the two scores must still agree.
        assert first.similarity == second.similarity
        # Clearing must be a harmless call even when caching is disabled.
        metric.clear_cache()

    def test_custom_model(self) -> None:
        """Check that an explicitly passed model name is recorded in the result."""
        # The model passed here is the same one the identical-texts test
        # expects from a default instance; only the recorded name is checked.
        metric = SemanticSimilarity(model="all-MiniLM-L6-v2")
        scored = metric.score("Test text", "Test text")
        assert scored.model == "all-MiniLM-L6-v2"
class TestSemanticSimilarityBatch:
    """Batch-scoring behaviour of the SemanticSimilarity metric."""

    @pytest.fixture
    def semantic(self) -> SemanticSimilarity:
        """Build a default-configured SemanticSimilarity metric for each test."""
        return SemanticSimilarity()

    def test_batch_score_basic(self, semantic: SemanticSimilarity) -> None:
        """Check count, result length and scores for identical pairs."""
        hypotheses = ["The cat sat on the mat", "A quick brown dog runs fast"]
        targets = ["The cat sat on the mat", "A quick brown dog runs fast"]
        batch = semantic.batch_score(hypotheses, targets)
        assert batch.count == 2
        assert len(batch.results) == 2
        # Each candidate equals its reference, so every score should be near 1.0.
        for item in batch.results:
            assert item.similarity >= 0.99

    def test_batch_score_statistics(self, semantic: SemanticSimilarity) -> None:
        """Check that similarity statistics exist and are internally ordered."""
        hypotheses = ["The cat sat", "Quantum physics is complex"]
        targets = ["The cat sat", "The cat sat"]
        batch = semantic.batch_score(hypotheses, targets)
        assert "similarity" in batch.stats
        # The mean must lie between the reported extremes.
        summary = batch.stats["similarity"]
        assert summary.min <= summary.mean <= summary.max

    def test_batch_score_percentiles(self, semantic: SemanticSimilarity) -> None:
        """Check that the 25/50/75/95 percentiles are reported."""
        hypotheses = ["a", "b", "c", "d", "e"]
        targets = ["a", "b", "c", "d", "e"]
        batch = semantic.batch_score(hypotheses, targets)
        reported = batch.stats["similarity"].percentiles
        for percentile in (25, 50, 75, 95):
            assert percentile in reported

    def test_batch_score_none_references_raises(
        self, semantic: SemanticSimilarity
    ) -> None:
        """Check that ``None`` references are rejected with ValueError."""
        expected_failure = pytest.raises(ValueError, match="requires reference")
        with expected_failure:
            semantic.batch_score(["text"], None)

    def test_batch_score_length_mismatch_raises(
        self, semantic: SemanticSimilarity
    ) -> None:
        """Check that unequal candidate/reference counts raise ValueError."""
        expected_failure = pytest.raises(ValueError, match="must match")
        with expected_failure:
            semantic.batch_score(["a", "b"], ["a"])

    def test_batch_score_with_multiple_references(
        self, semantic: SemanticSimilarity
    ) -> None:
        """Check batch scoring when each candidate has several references."""
        hypotheses = ["The cat sat on the mat", "A quick brown dog runs fast"]
        target_sets = [
            ["The cat sat on the mat", "A cat rests on floor"],
            ["A quick brown dog runs fast", "Dogs run very quickly"],
        ]
        batch = semantic.batch_score(hypotheses, target_sets)
        assert batch.count == 2
        # Both candidates have an exact match among their references.
        first, second = batch.results[0], batch.results[1]
        assert first.similarity >= 0.99
        assert second.similarity >= 0.99
class TestSemanticResult:
    """Behaviour of the SemanticResult value type."""

    def test_frozen(self) -> None:
        """Check that assigning to a SemanticResult field raises ValidationError."""
        from pydantic import ValidationError

        frozen_result = SemanticResult(similarity=0.85, model="test-model")
        with pytest.raises(ValidationError):
            frozen_result.similarity = 0.9  # type: ignore[misc]

    def test_score_property(self) -> None:
        """Check that ``score`` mirrors the ``similarity`` field."""
        outcome = SemanticResult(similarity=0.75, model="test-model")
        assert outcome.score == 0.75

View File

@@ -0,0 +1 @@
"""Tests for the validators module."""

View File

@@ -0,0 +1,198 @@
"""Tests for composite validators."""
import pytest
from veritext.core.types import ValidationContext
from veritext.validators import all_of, any_of, bleu, contains, excludes, length
from veritext.validators.composite import AllOf, AnyOf
class TestAllOf:
    """Behaviour of the AllOf composite validator."""

    def test_all_of_passes_when_all_checks_pass(self) -> None:
        """Check that AllOf passes when every sub-check passes."""
        sub_checks = [length(min_words=2), contains(patterns=["hello"])]
        combined = AllOf(checks=sub_checks)
        outcome = combined.check("hello world", ValidationContext())
        assert outcome.passed is True
        assert len(outcome.checks) == 2
        for sub_result in outcome.checks:
            assert sub_result.passed

    def test_all_of_fails_when_one_check_fails(self) -> None:
        """Check that a single failing sub-check fails the composite."""
        sub_checks = [length(min_words=2), contains(patterns=["goodbye"])]
        combined = AllOf(checks=sub_checks)
        outcome = combined.check("hello world", ValidationContext())
        assert outcome.passed is False
        assert len(outcome.checks) == 2
        assert len(outcome.failed_checks) == 1

    def test_all_of_fails_when_all_checks_fail(self) -> None:
        """Check that both failures are reported when every sub-check fails."""
        sub_checks = [length(min_words=10), contains(patterns=["goodbye"])]
        combined = AllOf(checks=sub_checks)
        outcome = combined.check("hello", ValidationContext())
        assert outcome.passed is False
        assert len(outcome.failed_checks) == 2

    def test_all_of_with_metric_validators(self) -> None:
        """Check AllOf when a reference-based metric check is included."""
        sub_checks = [bleu(min_score=0.5), length(min_words=3)]
        combined = AllOf(checks=sub_checks)
        with_reference = ValidationContext(reference="the quick brown fox")
        outcome = combined.check("the quick brown fox jumps", with_reference)
        assert outcome.passed is True
        assert len(outcome.checks) == 2

    def test_all_of_failure_summary(self) -> None:
        """Check that the failure summary names each failed sub-check."""
        sub_checks = [length(min_words=10), contains(patterns=["goodbye"])]
        combined = AllOf(checks=sub_checks)
        outcome = combined.check("hello", ValidationContext())
        report = outcome.failure_summary
        assert "failed" in report.lower()
        for check_name in ("length", "contains"):
            assert check_name in report

    def test_all_of_raises_on_empty_checks(self) -> None:
        """Check that constructing AllOf without checks raises ValueError."""
        expected_failure = pytest.raises(ValueError, match="cannot be empty")
        with expected_failure:
            AllOf(checks=[])

    def test_all_of_name_property(self) -> None:
        """Check that the composite reports the name 'all_of'."""
        combined = AllOf(checks=[length(min_chars=1)])
        assert combined.name == "all_of"

    def test_all_of_factory_function(self) -> None:
        """Check that ``all_of()`` builds an AllOf instance."""
        built = all_of(checks=[length(min_chars=1)])
        assert isinstance(built, AllOf)
class TestAnyOf:
    """Behaviour of the AnyOf composite validator."""

    def test_any_of_passes_when_any_check_passes(self) -> None:
        """Check that one passing sub-check is enough for AnyOf to pass."""
        failing = length(min_words=10)
        passing = contains(patterns=["hello"])
        combined = AnyOf(checks=[failing, passing])
        outcome = combined.check("hello world", ValidationContext())
        assert outcome.passed is True
        assert len(outcome.checks) == 2
        # At least one of the recorded sub-results must be a pass.
        assert any(sub_result.passed for sub_result in outcome.checks)

    def test_any_of_passes_when_all_checks_pass(self) -> None:
        """Check that AnyOf passes when every sub-check passes."""
        sub_checks = [length(min_words=2), contains(patterns=["hello"])]
        combined = AnyOf(checks=sub_checks)
        outcome = combined.check("hello world", ValidationContext())
        assert outcome.passed is True
        for sub_result in outcome.checks:
            assert sub_result.passed

    def test_any_of_fails_when_all_checks_fail(self) -> None:
        """Check that AnyOf fails only when no sub-check passes."""
        sub_checks = [length(min_words=10), contains(patterns=["goodbye"])]
        combined = AnyOf(checks=sub_checks)
        outcome = combined.check("hello", ValidationContext())
        assert outcome.passed is False
        for sub_result in outcome.checks:
            assert not sub_result.passed

    def test_any_of_with_metric_validators(self) -> None:
        """Check AnyOf when a reference-based metric check is included."""
        strict_metric = bleu(min_score=0.9)  # may fail against this reference
        word_floor = length(min_words=3)  # expected to pass
        combined = AnyOf(checks=[strict_metric, word_floor])
        with_reference = ValidationContext(reference="different text entirely")
        outcome = combined.check("the quick brown fox jumps", with_reference)
        # The length check alone is enough for the composite to pass.
        assert outcome.passed is True

    def test_any_of_with_excludes(self) -> None:
        """Check AnyOf over two exclusion checks for zero, one and two hits."""
        combined = AnyOf(
            checks=[
                excludes(patterns=["error"]),
                excludes(patterns=["warning"]),
            ]
        )
        shared_context = ValidationContext()
        scenarios = [
            ("All is well", True),  # neither pattern present
            ("This is an error", True),  # one pattern present, the other absent
            ("error and warning", False),  # both patterns present
        ]
        for text, should_pass in scenarios:
            outcome = combined.check(text, shared_context)
            assert outcome.passed is should_pass

    def test_any_of_raises_on_empty_checks(self) -> None:
        """Check that constructing AnyOf without checks raises ValueError."""
        expected_failure = pytest.raises(ValueError, match="cannot be empty")
        with expected_failure:
            AnyOf(checks=[])

    def test_any_of_name_property(self) -> None:
        """Check that the composite reports the name 'any_of'."""
        combined = AnyOf(checks=[length(min_chars=1)])
        assert combined.name == "any_of"

    def test_any_of_factory_function(self) -> None:
        """Check that ``any_of()`` builds an AnyOf instance."""
        built = any_of(checks=[length(min_chars=1)])
        assert isinstance(built, AnyOf)

View File

@@ -0,0 +1,334 @@
"""Tests for constraint validators."""
import pytest
from veritext.core.exceptions import InvalidThresholdError
from veritext.core.types import ValidationContext
from veritext.validators import contains, excludes, length, readability
from veritext.validators.constraint import (
ContainsValidator,
ExcludesValidator,
LengthValidator,
ReadabilityValidator,
)
class TestLengthValidator:
    """Behaviour of the LengthValidator constraint."""

    def test_length_validator_min_chars_passes(self) -> None:
        """Check a pass when the character count reaches the minimum."""
        checker = LengthValidator(min_chars=10)
        outcome = checker.check("hello world!", ValidationContext())
        assert outcome.passed is True
        assert outcome.name == "length"
        assert outcome.actual["chars"] == 12

    def test_length_validator_min_chars_fails(self) -> None:
        """Check a failure when the character count is under the minimum."""
        checker = LengthValidator(min_chars=20)
        outcome = checker.check("hello", ValidationContext())
        assert outcome.passed is False
        assert "< min" in outcome.message

    def test_length_validator_max_chars_passes(self) -> None:
        """Check a pass when the character count stays within the maximum."""
        checker = LengthValidator(max_chars=20)
        outcome = checker.check("hello world", ValidationContext())
        assert outcome.passed is True
        assert outcome.actual["chars"] == 11

    def test_length_validator_max_chars_fails(self) -> None:
        """Check a failure when the character count is over the maximum."""
        checker = LengthValidator(max_chars=5)
        outcome = checker.check("hello world", ValidationContext())
        assert outcome.passed is False
        assert "> max" in outcome.message

    def test_length_validator_min_words_passes(self) -> None:
        """Check a pass when the word count reaches the minimum."""
        checker = LengthValidator(min_words=3)
        outcome = checker.check("the quick brown fox", ValidationContext())
        assert outcome.passed is True
        assert outcome.actual["words"] == 4

    def test_length_validator_min_words_fails(self) -> None:
        """Check a failure when the word count is under the minimum."""
        checker = LengthValidator(min_words=10)
        outcome = checker.check("hello world", ValidationContext())
        assert outcome.passed is False
        assert "words < min" in outcome.message

    def test_length_validator_max_words_passes(self) -> None:
        """Check a pass when the word count stays within the maximum."""
        checker = LengthValidator(max_words=5)
        outcome = checker.check("hello world", ValidationContext())
        assert outcome.passed is True

    def test_length_validator_max_words_fails(self) -> None:
        """Check a failure when the word count is over the maximum."""
        checker = LengthValidator(max_words=2)
        outcome = checker.check("the quick brown fox", ValidationContext())
        assert outcome.passed is False
        assert "words > max" in outcome.message

    def test_length_validator_combined_constraints(self) -> None:
        """Check that all four bounds can be combined and appear in the threshold."""
        checker = LengthValidator(
            min_chars=5,
            max_chars=50,
            min_words=2,
            max_words=10,
        )
        outcome = checker.check("the quick brown fox", ValidationContext())
        assert outcome.passed is True
        for bound in ("min_chars", "max_chars", "min_words", "max_words"):
            assert bound in outcome.threshold

    def test_length_validator_raises_when_no_constraints(self) -> None:
        """Check that constructing without any bound raises InvalidThresholdError."""
        expected_failure = pytest.raises(InvalidThresholdError, match="At least one")
        with expected_failure:
            LengthValidator()

    def test_length_validator_raises_on_negative_values(self) -> None:
        """Check that each bound rejects a negative value."""
        for bound in ("min_chars", "max_chars", "min_words", "max_words"):
            with pytest.raises(InvalidThresholdError, match=f"{bound} must be >= 0"):
                LengthValidator(**{bound: -1})

    def test_length_validator_raises_on_invalid_range(self) -> None:
        """Check that a minimum above its maximum is rejected."""
        chars_failure = pytest.raises(
            InvalidThresholdError, match="cannot exceed max_chars"
        )
        with chars_failure:
            LengthValidator(min_chars=100, max_chars=50)
        words_failure = pytest.raises(
            InvalidThresholdError, match="cannot exceed max_words"
        )
        with words_failure:
            LengthValidator(min_words=20, max_words=5)

    def test_length_factory_function(self) -> None:
        """Check that ``length()`` builds a LengthValidator named 'length'."""
        built = length(min_chars=10, max_words=100)
        assert isinstance(built, LengthValidator)
        assert built.name == "length"
class TestReadabilityValidator:
    """Behaviour of the ReadabilityValidator constraint."""

    def test_readability_validator_max_grade_passes(self) -> None:
        """Check a pass when the grade level stays within the maximum."""
        checker = ReadabilityValidator(max_grade=12.0)
        # Short, plain sentences are expected to rate a low grade level.
        plain_text = "The cat sat on the mat. It was a nice day."
        outcome = checker.check(plain_text, ValidationContext())
        assert outcome.passed is True
        assert outcome.name == "readability"
        assert "grade" in outcome.actual

    def test_readability_validator_max_grade_fails(self) -> None:
        """Check a failure when the grade level is over the maximum."""
        checker = ReadabilityValidator(max_grade=1.0)
        # Long, polysyllabic wording to push the grade level up.
        dense_text = (
            "The implementation of sophisticated methodologies necessitates "
            "comprehensive analytical frameworks for systematic evaluation."
        )
        outcome = checker.check(dense_text, ValidationContext())
        assert outcome.passed is False
        assert "grade level" in outcome.message
        assert "> max" in outcome.message

    def test_readability_validator_min_ease_passes(self) -> None:
        """Check a pass when the reading ease reaches the minimum."""
        checker = ReadabilityValidator(min_ease=30.0)
        # Very short sentences are expected to rate a high reading ease.
        plain_text = "The cat sat. The dog ran. It was fun."
        outcome = checker.check(plain_text, ValidationContext())
        assert outcome.passed is True
        assert "ease" in outcome.actual

    def test_readability_validator_min_ease_fails(self) -> None:
        """Check a failure when the reading ease is under the minimum."""
        checker = ReadabilityValidator(min_ease=100.0)
        dense_text = (
            "The implementation of sophisticated methodologies necessitates "
            "comprehensive analytical frameworks."
        )
        outcome = checker.check(dense_text, ValidationContext())
        assert outcome.passed is False
        assert "reading ease" in outcome.message
        assert "< min" in outcome.message

    def test_readability_validator_combined_constraints(self) -> None:
        """Check that both bounds appear in the threshold when combined."""
        checker = ReadabilityValidator(max_grade=12.0, min_ease=30.0)
        outcome = checker.check("The cat sat on the mat.", ValidationContext())
        for bound in ("max_grade", "min_ease"):
            assert bound in outcome.threshold

    def test_readability_validator_raises_when_no_constraints(self) -> None:
        """Check that constructing without any bound raises InvalidThresholdError."""
        expected_failure = pytest.raises(InvalidThresholdError, match="At least one")
        with expected_failure:
            ReadabilityValidator()

    def test_readability_factory_function(self) -> None:
        """Check that ``readability()`` builds a ReadabilityValidator."""
        built = readability(max_grade=8.0, min_ease=60.0)
        assert isinstance(built, ReadabilityValidator)
        assert built.name == "readability"
class TestContainsValidator:
    """Behaviour of the ContainsValidator constraint."""

    def test_contains_validator_passes_when_pattern_found(self) -> None:
        """Check a pass, and the reported counts, when every pattern is present."""
        checker = ContainsValidator(patterns=["hello", "world"])
        outcome = checker.check("Hello World!", ValidationContext())
        assert outcome.passed is True
        assert outcome.name == "contains"
        assert outcome.actual["found"] == 2
        assert outcome.actual["missing"] == []

    def test_contains_validator_fails_when_pattern_missing(self) -> None:
        """Check a failure, and the reported gap, when one pattern is absent."""
        checker = ContainsValidator(patterns=["hello", "goodbye"])
        outcome = checker.check("Hello World!", ValidationContext())
        assert outcome.passed is False
        assert "goodbye" in outcome.actual["missing"]
        assert "missing" in outcome.message

    def test_contains_validator_case_insensitive_by_default(self) -> None:
        """Check that an upper-case pattern matches lower-case text by default."""
        checker = ContainsValidator(patterns=["HELLO"])
        outcome = checker.check("hello world", ValidationContext())
        assert outcome.passed is True

    def test_contains_validator_case_sensitive(self) -> None:
        """Check that ``case_sensitive=True`` stops a case-mismatched match."""
        checker = ContainsValidator(patterns=["HELLO"], case_sensitive=True)
        outcome = checker.check("hello world", ValidationContext())
        assert outcome.passed is False

    def test_contains_validator_regex_patterns(self) -> None:
        """Check that a pattern is applied as a regular expression."""
        phone_fragment = r"\d{3}-\d{4}"
        checker = ContainsValidator(patterns=[phone_fragment])
        outcome = checker.check("Call 555-1234 for info", ValidationContext())
        assert outcome.passed is True

    def test_contains_validator_raises_on_empty_patterns(self) -> None:
        """Check that an empty pattern list raises InvalidThresholdError."""
        expected_failure = pytest.raises(
            InvalidThresholdError, match="cannot be empty"
        )
        with expected_failure:
            ContainsValidator(patterns=[])

    def test_contains_factory_function(self) -> None:
        """Check that ``contains()`` builds a ContainsValidator named 'contains'."""
        built = contains(patterns=["test"], case_sensitive=True)
        assert isinstance(built, ContainsValidator)
        assert built.name == "contains"
class TestExcludesValidator:
    """Behaviour of the ExcludesValidator constraint."""

    def test_excludes_validator_passes_when_pattern_absent(self) -> None:
        """Check a pass when none of the forbidden patterns occur."""
        checker = ExcludesValidator(patterns=["bad", "forbidden"])
        outcome = checker.check("This is good text.", ValidationContext())
        assert outcome.passed is True
        assert outcome.name == "excludes"
        assert outcome.actual["found"] == []

    def test_excludes_validator_fails_when_pattern_found(self) -> None:
        """Check a failure, and the reported hit, when a forbidden pattern occurs."""
        checker = ExcludesValidator(patterns=["bad", "forbidden"])
        outcome = checker.check("This is bad text.", ValidationContext())
        assert outcome.passed is False
        assert "bad" in outcome.actual["found"]
        assert "forbidden" in outcome.message

    def test_excludes_validator_case_insensitive_by_default(self) -> None:
        """Check that an upper-case pattern matches lower-case text by default."""
        checker = ExcludesValidator(patterns=["BAD"])
        outcome = checker.check("This is bad text.", ValidationContext())
        assert outcome.passed is False

    def test_excludes_validator_case_sensitive(self) -> None:
        """Check that ``case_sensitive=True`` stops a case-mismatched match."""
        checker = ExcludesValidator(patterns=["BAD"], case_sensitive=True)
        outcome = checker.check("This is bad text.", ValidationContext())
        assert outcome.passed is True

    def test_excludes_validator_regex_patterns(self) -> None:
        """Check that a pattern is applied as a regular expression."""
        four_digit_number = r"\b\d{4}\b"
        checker = ExcludesValidator(patterns=[four_digit_number])
        shared_context = ValidationContext()
        scenarios = [
            ("PIN is 1234", False),  # pattern present -> check fails
            ("No numbers here", True),  # pattern absent -> check passes
        ]
        for text, should_pass in scenarios:
            outcome = checker.check(text, shared_context)
            assert outcome.passed is should_pass

    def test_excludes_validator_raises_on_empty_patterns(self) -> None:
        """Check that an empty pattern list raises InvalidThresholdError."""
        expected_failure = pytest.raises(
            InvalidThresholdError, match="cannot be empty"
        )
        with expected_failure:
            ExcludesValidator(patterns=[])

    def test_excludes_factory_function(self) -> None:
        """Check that ``excludes()`` builds an ExcludesValidator named 'excludes'."""
        built = excludes(patterns=["test"], case_sensitive=True)
        assert isinstance(built, ExcludesValidator)
        assert built.name == "excludes"

View File

@@ -0,0 +1,283 @@
"""Tests for metric-based validators."""
import pytest
from veritext.core.exceptions import InvalidThresholdError, ValidationError
from veritext.core.types import ValidationContext
from veritext.validators import bleu, lexical, rouge
from veritext.validators.metric import BleuValidator, LexicalValidator, RougeValidator
class TestBleuValidator:
    """Tests for BleuValidator.

    Covers threshold pass/fail outcomes, variant naming, missing-reference
    handling, constructor argument validation, and the ``bleu()`` factory.
    """

    def test_bleu_validator_passes_when_score_meets_threshold(self) -> None:
        """Test that validator passes when BLEU score meets threshold."""
        validator = BleuValidator(min_score=0.5, variant=4)
        context = ValidationContext(reference="the cat sat on the mat")

        result = validator.check("the cat sat on the mat", context)

        assert result.passed is True
        assert result.name == "bleu-4"
        # Identical text is expected to score 1.0; compare with a tolerance
        # because exact equality on a computed float is brittle.
        assert result.actual == pytest.approx(1.0)
        assert result.threshold == 0.5

    def test_bleu_validator_fails_when_score_below_threshold(self) -> None:
        """Test that validator fails when BLEU score is below threshold."""
        validator = BleuValidator(min_score=0.9, variant=4)
        context = ValidationContext(reference="the cat sat on the mat")

        result = validator.check("a dog ran through the park", context)

        assert result.passed is False
        assert result.name == "bleu-4"
        assert result.actual < 0.9
        assert "below minimum" in result.message

    # Parametrized so each variant is reported separately instead of the first
    # failure hiding the remaining ones.
    @pytest.mark.parametrize("variant", [1, 2, 3, 4])
    def test_bleu_validator_variant_selection(self, variant: int) -> None:
        """Test that each BLEU variant is reflected in the result name."""
        context = ValidationContext(reference="the quick brown fox jumps")
        validator = BleuValidator(min_score=0.0, variant=variant)  # type: ignore[arg-type]

        result = validator.check("the quick brown fox", context)

        assert result.name == f"bleu-{variant}"

    def test_bleu_validator_raises_on_missing_reference(self) -> None:
        """Test that validator raises when reference is missing."""
        validator = BleuValidator(min_score=0.5)
        context = ValidationContext()

        with pytest.raises(ValidationError, match="requires reference text"):
            validator.check("some text", context)

    # One case above the valid range and one below it.
    @pytest.mark.parametrize("min_score", [1.5, -0.1])
    def test_bleu_validator_raises_on_invalid_min_score(
        self, min_score: float
    ) -> None:
        """Test that invalid min_score raises error."""
        with pytest.raises(InvalidThresholdError, match=r"between 0\.0 and 1\.0"):
            BleuValidator(min_score=min_score)

    def test_bleu_validator_raises_on_invalid_variant(self) -> None:
        """Test that invalid variant raises error."""
        with pytest.raises(InvalidThresholdError, match="variant must be"):
            BleuValidator(min_score=0.5, variant=5)  # type: ignore[arg-type]

    def test_bleu_factory_function(self) -> None:
        """Test the bleu() factory function."""
        validator = bleu(min_score=0.6, variant=2)

        assert isinstance(validator, BleuValidator)
        assert validator.name == "bleu-2"
class TestRougeValidator:
    """Tests for RougeValidator.

    Covers threshold pass/fail outcomes, variant naming, missing-reference
    handling, constructor argument validation, and the ``rouge()`` factory.
    """

    def test_rouge_validator_passes_when_score_meets_threshold(self) -> None:
        """Test that validator passes when ROUGE score meets threshold."""
        validator = RougeValidator(min_score=0.5, variant="l")
        context = ValidationContext(reference="the cat sat on the mat")

        result = validator.check("the cat sat on the mat", context)

        assert result.passed is True
        assert result.name == "rouge-l"
        # Identical text is expected to score 1.0; compare with a tolerance
        # because exact equality on a computed float is brittle.
        assert result.actual == pytest.approx(1.0)
        assert result.threshold == 0.5

    def test_rouge_validator_fails_when_score_below_threshold(self) -> None:
        """Test that validator fails when ROUGE score is below threshold."""
        validator = RougeValidator(min_score=0.9, variant="l")
        context = ValidationContext(reference="the cat sat on the mat")

        result = validator.check("a dog ran through the park", context)

        assert result.passed is False
        assert result.actual < 0.9
        assert "below minimum" in result.message

    # Parametrized so each variant is reported separately instead of the first
    # failure hiding the remaining ones.
    @pytest.mark.parametrize("variant", ["1", "2", "l"])
    def test_rouge_validator_variant_selection(self, variant: str) -> None:
        """Test that each ROUGE variant is reflected in the result name."""
        context = ValidationContext(reference="the quick brown fox jumps")
        validator = RougeValidator(min_score=0.0, variant=variant)  # type: ignore[arg-type]

        result = validator.check("the quick brown fox", context)

        assert result.name == f"rouge-{variant}"

    def test_rouge_validator_raises_on_missing_reference(self) -> None:
        """Test that validator raises when reference is missing."""
        validator = RougeValidator(min_score=0.5)
        context = ValidationContext()

        with pytest.raises(ValidationError, match="requires reference text"):
            validator.check("some text", context)

    # One case above the valid range and one below it, so both bounds of the
    # "between 0.0 and 1.0" check are exercised.
    @pytest.mark.parametrize("min_score", [1.5, -0.1])
    def test_rouge_validator_raises_on_invalid_min_score(
        self, min_score: float
    ) -> None:
        """Test that invalid min_score raises error."""
        with pytest.raises(InvalidThresholdError, match=r"between 0\.0 and 1\.0"):
            RougeValidator(min_score=min_score)

    def test_rouge_validator_raises_on_invalid_variant(self) -> None:
        """Test that invalid variant raises error."""
        with pytest.raises(InvalidThresholdError, match="variant must be"):
            RougeValidator(min_score=0.5, variant="3")  # type: ignore[arg-type]

    def test_rouge_factory_function(self) -> None:
        """Test the rouge() factory function."""
        validator = rouge(min_score=0.6, variant="2")

        assert isinstance(validator, RougeValidator)
        assert validator.name == "rouge-2"
class TestLexicalValidator:
    """Exercise LexicalValidator thresholds, argument checks and its factory."""

    def test_lexical_validator_passes_on_jaccard(self) -> None:
        """Identical texts satisfy a Jaccard minimum of 0.5."""
        outcome = LexicalValidator(min_jaccard=0.5).check(
            "the cat sat on the mat",
            ValidationContext(reference="the cat sat on the mat"),
        )
        assert outcome.passed is True
        assert outcome.name == "lexical"
        assert outcome.actual["jaccard"] == 1.0

    def test_lexical_validator_fails_on_jaccard(self) -> None:
        """Unrelated text misses a Jaccard minimum of 0.9 and says so."""
        outcome = LexicalValidator(min_jaccard=0.9).check(
            "a dog ran through the park",
            ValidationContext(reference="the cat sat on the mat"),
        )
        assert outcome.passed is False
        # The failure message must name the metric and the kind of failure.
        for fragment in ("Jaccard", "below minimum"):
            assert fragment in outcome.message

    def test_lexical_validator_passes_on_overlap(self) -> None:
        """Identical texts satisfy a token-overlap minimum of 0.5."""
        outcome = LexicalValidator(min_overlap=0.5).check(
            "the cat sat on the mat",
            ValidationContext(reference="the cat sat on the mat"),
        )
        assert outcome.passed is True
        assert outcome.actual["token_overlap"] == 1.0

    def test_lexical_validator_fails_on_overlap(self) -> None:
        """Unrelated text misses a token-overlap minimum of 0.9."""
        outcome = LexicalValidator(min_overlap=0.9).check(
            "a dog ran through",
            ValidationContext(reference="the cat sat on the mat"),
        )
        assert outcome.passed is False
        assert "overlap" in outcome.message

    def test_lexical_validator_with_both_thresholds(self) -> None:
        """Both configured minimums appear in the reported threshold."""
        outcome = LexicalValidator(min_jaccard=0.3, min_overlap=0.5).check(
            "the cat sat",
            ValidationContext(reference="the cat sat on the mat"),
        )
        # Should check both thresholds
        for key in ("min_jaccard", "min_overlap"):
            assert key in outcome.threshold

    def test_lexical_validator_raises_when_no_threshold(self) -> None:
        """Constructing the validator with no threshold at all is rejected."""
        expect_rejection = pytest.raises(InvalidThresholdError, match="At least one")
        with expect_rejection:
            LexicalValidator()

    def test_lexical_validator_raises_on_invalid_jaccard(self) -> None:
        """A Jaccard minimum of 1.5 is rejected with a message naming it."""
        expect_rejection = pytest.raises(InvalidThresholdError, match="min_jaccard")
        with expect_rejection:
            LexicalValidator(min_jaccard=1.5)

    def test_lexical_validator_raises_on_invalid_overlap(self) -> None:
        """An overlap minimum of -0.1 is rejected with a message naming it."""
        expect_rejection = pytest.raises(InvalidThresholdError, match="min_overlap")
        with expect_rejection:
            LexicalValidator(min_overlap=-0.1)

    def test_lexical_validator_raises_on_missing_reference(self) -> None:
        """Checking text against a context without a reference raises."""
        lexical_check = LexicalValidator(min_jaccard=0.5)
        empty_context = ValidationContext()
        with pytest.raises(ValidationError, match="requires reference text"):
            lexical_check.check("some text", empty_context)

    def test_lexical_factory_function(self) -> None:
        """``lexical()`` builds a LexicalValidator named "lexical"."""
        built = lexical(min_jaccard=0.5, min_overlap=0.6)
        assert isinstance(built, LexicalValidator)
        assert built.name == "lexical"
# SemanticValidator tests - conditionally run if sentence-transformers is installed
class TestSemanticValidator:
    """Tests for SemanticValidator.

    Tests that call ``check()`` or the ``semantic()`` factory skip themselves
    when ``sentence_transformers`` cannot be imported. ``SemanticValidator`` is
    imported inside each test rather than at module level.
    """

    @staticmethod
    def _skip_if_no_transformers() -> None:
        """Skip test if sentence-transformers is not installed."""
        pytest.importorskip("sentence_transformers")

    def test_semantic_validator_passes_when_score_meets_threshold(self) -> None:
        """Test that validator passes when semantic similarity meets threshold."""
        self._skip_if_no_transformers()
        from veritext.validators.metric import SemanticValidator

        validator = SemanticValidator(min_score=0.5)
        context = ValidationContext(reference="the cat sat on the mat")

        result = validator.check("the cat sat on the mat", context)

        assert result.passed is True
        assert result.name == "semantic"
        assert result.actual >= 0.99  # Identical text
        assert result.threshold == 0.5

    def test_semantic_validator_fails_when_score_below_threshold(self) -> None:
        """Test that validator fails when semantic similarity is below threshold."""
        self._skip_if_no_transformers()
        from veritext.validators.metric import SemanticValidator

        validator = SemanticValidator(min_score=0.99)
        context = ValidationContext(reference="the cat sat on the mat")

        result = validator.check(
            "quantum physics describes particle behaviour", context
        )

        assert result.passed is False
        assert result.name == "semantic"
        assert result.actual < 0.99
        assert "below minimum" in result.message

    def test_semantic_validator_raises_on_missing_reference(self) -> None:
        """Test that validator raises when reference is missing."""
        self._skip_if_no_transformers()
        from veritext.validators.metric import SemanticValidator

        validator = SemanticValidator(min_score=0.5)
        context = ValidationContext()

        with pytest.raises(ValidationError, match="requires reference text"):
            validator.check("some text", context)

    # One case above the valid range and one below it.
    @pytest.mark.parametrize("min_score", [1.5, -0.1])
    def test_semantic_validator_raises_on_invalid_min_score(
        self, min_score: float
    ) -> None:
        """Test that invalid min_score raises error without loading model."""
        # This test doesn't need sentence-transformers since validation happens first
        from veritext.validators.metric import SemanticValidator

        # Only the constructor call belongs inside ``pytest.raises``; the import
        # is done once, outside the block, so the context wraps exactly the
        # statement expected to raise.
        with pytest.raises(InvalidThresholdError, match=r"between 0\.0 and 1\.0"):
            SemanticValidator(min_score=min_score)

    def test_semantic_factory_function(self) -> None:
        """Test the semantic() factory function."""
        self._skip_if_no_transformers()
        from veritext.validators import semantic
        from veritext.validators.metric import SemanticValidator

        validator = semantic(min_score=0.6)

        assert isinstance(validator, SemanticValidator)
        assert validator.name == "semantic"