class Lexical:
    """
    Lexical similarity metrics.

    Computes Jaccard similarity and token overlap between a candidate text
    and a reference text. Both scores are measured over the sets of distinct
    tokens produced by the configured tokeniser.
    """

    def __init__(self, tokeniser: WordTokeniser | None = None) -> None:
        """
        Create the metric.

        Args:
            tokeniser: Tokeniser used to split texts into tokens. When
                omitted, a default ``WordTokeniser`` is created.
        """
        # Compare against None explicitly so that a caller-supplied tokeniser
        # is never discarded merely because it evaluates as falsy.
        self._tokeniser = WordTokeniser() if tokeniser is None else tokeniser

    @property
    def name(self) -> str:
        """Identifier of this metric (``"lexical"``)."""
        return "lexical"

    @property
    def requires_reference(self) -> bool:
        """Whether scoring needs a reference text; always ``True`` here."""
        return True

    def score(
        self, candidate: str, reference: str | list[str] | None = None
    ) -> LexicalResult:
        """
        Compute lexical similarity scores.

        Args:
            candidate: The text to score.
            reference: Reference text for comparison. If multiple references
                are provided, only the first one is used.

        Returns:
            LexicalResult with two fields:
            ``jaccard`` is ``|C & R| / |C | R|`` and ``token_overlap`` is
            ``|C & R| / |C|``, where ``C`` and ``R`` are the sets of distinct
            candidate and reference tokens. Both are ``0.0`` when the
            candidate yields no tokens.

        Raises:
            ValueError: If reference is None, is an empty list, or yields no
                tokens.
        """
        if reference is None:
            raise ValueError("Lexical similarity requires reference text")

        if isinstance(reference, list):
            # An empty list has no first element; surface it as the documented
            # ValueError instead of letting an IndexError escape.
            if not reference:
                raise ValueError("Reference text cannot be empty")
            ref_text = reference[0]
        else:
            ref_text = reference

        candidate_set = set(self._tokeniser.tokenise(candidate))
        reference_set = set(self._tokeniser.tokenise(ref_text))

        if not reference_set:
            raise ValueError("Reference text cannot be empty")

        if not candidate_set:
            return LexicalResult(jaccard=0.0, token_overlap=0.0)

        shared = candidate_set & reference_set
        combined = candidate_set | reference_set

        # Both sets are non-empty at this point, so neither divisor is zero.
        jaccard = len(shared) / len(combined)
        token_overlap = len(shared) / len(candidate_set)

        return LexicalResult(jaccard=jaccard, token_overlap=token_overlap)

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[LexicalResult]:
        """
        Compute lexical similarity scores for a batch of candidates.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, aligned by
                position with ``candidates``.

        Returns:
            BatchResult containing the individual results, their count, and
            aggregate statistics keyed by ``"jaccard"`` and
            ``"token_overlap"``.

        Raises:
            ValueError: If references is None, if the two lists differ in
                length, or if scoring any individual pair raises it.
        """
        if references is None:
            raise ValueError("Lexical similarity requires reference texts")

        if len(candidates) != len(references):
            raise ValueError(
                f"Number of candidates ({len(candidates)}) must match "
                f"number of references ({len(references)})"
            )

        # Lengths were validated above, so zip pairs every candidate with
        # its own reference without truncation.
        results: list[LexicalResult] = [
            self.score(cand, ref) for cand, ref in zip(candidates, references)
        ]

        stats = {
            "jaccard": AggregateStats.from_values([r.jaccard for r in results]),
            "token_overlap": AggregateStats.from_values(
                [r.token_overlap for r in results]
            ),
        }

        return BatchResult(results=results, count=len(results), stats=stats)