lexical similarity (jaccard, overlap, cosine)
Implement Jaccard similarity and token overlap metrics with batch scoring support.
This commit is contained in:
107
src/veritext/metrics/lexical.py
Normal file
107
src/veritext/metrics/lexical.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
"""Lexical similarity metrics."""
|
||||||
|
|
||||||
|
from veritext.core.tokenisation import WordTokeniser
|
||||||
|
from veritext.metrics.base import AggregateStats, BatchResult
|
||||||
|
from veritext.metrics.results import LexicalResult
|
||||||
|
|
||||||
|
|
||||||
|
class Lexical:
    """
    Lexical similarity metrics.

    Computes Jaccard similarity and token overlap between candidate and reference.
    Both scores are calculated over the sets of unique tokens produced by the
    configured tokeniser, so repeated tokens do not affect the result.
    """

    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
        """
        Create the metric.

        Args:
            tokeniser: Tokeniser used to split texts into tokens. A default
                ``WordTokeniser()`` is created when omitted.
        """
        # Compare against None explicitly so that a caller-supplied tokeniser
        # that happens to evaluate as falsy is not silently replaced.
        self._tokeniser = tokeniser if tokeniser is not None else WordTokeniser()

    @property
    def name(self) -> str:
        """Identifier of this metric."""
        return "lexical"

    @property
    def requires_reference(self) -> bool:
        """Whether scoring needs a reference text (always ``True`` here)."""
        return True

    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> LexicalResult:
        """
        Compute lexical similarity scores.

        Args:
            candidate: The text to score.
            reference: Reference text for comparison. If multiple references
                provided, uses the first one.

        Returns:
            LexicalResult with Jaccard similarity and token overlap. Both are
            0.0 when the candidate yields no tokens.

        Raises:
            ValueError: If reference is None or empty (including an empty
                list of references or a reference that yields no tokens).
        """
        if reference is None:
            raise ValueError("Lexical similarity requires reference text")

        if isinstance(reference, list):
            # An empty list would otherwise surface as an IndexError, which
            # contradicts the documented ValueError contract.
            if not reference:
                raise ValueError("Reference text cannot be empty")
            ref_text = reference[0]
        else:
            ref_text = reference

        candidate_tokens = self._tokeniser.tokenise(candidate)
        reference_tokens = self._tokeniser.tokenise(ref_text)

        if not reference_tokens:
            raise ValueError("Reference text cannot be empty")

        if not candidate_tokens:
            return LexicalResult(jaccard=0.0, token_overlap=0.0)

        candidate_set = set(candidate_tokens)
        reference_set = set(reference_tokens)

        shared = candidate_set & reference_set
        combined = candidate_set | reference_set

        # Both sets are non-empty at this point, so neither division can be
        # by zero.
        jaccard = len(shared) / len(combined)
        # Fraction of the candidate's unique tokens that occur in the reference.
        token_overlap = len(shared) / len(candidate_set)

        return LexicalResult(jaccard=jaccard, token_overlap=token_overlap)

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[LexicalResult]:
        """
        Compute lexical similarity scores for a batch of candidates.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, aligned by
                position with ``candidates``.

        Returns:
            BatchResult containing individual results and aggregate statistics
            for the ``"jaccard"`` and ``"token_overlap"`` scores.

        Raises:
            ValueError: If references is None or length mismatch. Errors
                raised by ``score`` for an individual pair also propagate.
        """
        if references is None:
            raise ValueError("Lexical similarity requires reference texts")

        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )

        # Lengths were validated above, so zip pairs every candidate with
        # exactly one reference entry.
        results: list[LexicalResult] = [
            self.score(cand, ref) for cand, ref in zip(candidates, references)
        ]

        stats = {
            "jaccard": AggregateStats.from_values([r.jaccard for r in results]),
            "token_overlap": AggregateStats.from_values(
                [r.token_overlap for r in results]
            ),
        }

        return BatchResult(results=results, count=len(results), stats=stats)
|
||||||
Reference in New Issue
Block a user