# From 7832fa3d5969c605434b0e9a289aa8b7a04a75c2 Mon Sep 17 00:00:00 2001
# From: Kai Chappell
# Date: Wed, 12 Mar 2025 20:13:11 +0000
# Subject: [PATCH] metric protocol and batch scoring types
#
# Add Metric protocol, AggregateStats for statistical summaries, and
# BatchResult for batch processing support.
#
# --- src/veritext/metrics/base.py ---
"""Base types and protocols for metrics."""

import math
from typing import Generic, Protocol, TypeVar

from pydantic import BaseModel, ConfigDict

T = TypeVar("T")


class MetricResult(Protocol):
    """Protocol for metric result types."""


class AggregateStats(BaseModel):
    """Aggregate statistics for a batch of metric scores."""

    model_config = ConfigDict(frozen=True)

    mean: float
    """Mean of the scores."""

    std: float
    """Sample standard deviation of the scores (0.0 for a single score)."""

    min: float
    """Minimum score."""

    max: float
    """Maximum score."""

    percentiles: dict[int, float]
    """Percentile values keyed by percentile (25, 50, 75, 95 by default)."""

    @classmethod
    def from_values(
        cls,
        values: list[float],
        percentiles: tuple[int, ...] = (25, 50, 75, 95),
    ) -> "AggregateStats":
        """
        Compute aggregate statistics from a collection of values.

        The standard deviation uses the sample (n - 1) denominator and is
        0.0 when only one value is given. Percentiles are linearly
        interpolated between the two closest ranks of the sorted values.

        NaN inputs are not filtered: they propagate into ``mean``/``std``
        and make the order-based statistics (min, max, percentiles)
        unreliable.

        Args:
            values: Numeric values to aggregate. Any iterable is accepted;
                it is consumed exactly once.
            percentiles: Integer percentiles (each within 0-100) to report.

        Returns:
            AggregateStats with computed statistics.

        Raises:
            ValueError: If values is empty, or a requested percentile lies
                outside the 0-100 range.
        """
        # Sorting up front materialises the input once, so generators and
        # other non-list iterables work and emptiness can be checked safely.
        ordered = sorted(values)
        if not ordered:
            raise ValueError("Cannot compute statistics from empty list")

        wanted = tuple(percentiles)
        for p in wanted:
            if not 0 <= p <= 100:
                raise ValueError(f"Percentile must be between 0 and 100, got {p}")

        n = len(ordered)
        # fsum avoids the rounding error a naive running sum accumulates.
        mean = math.fsum(ordered) / n

        if n == 1:
            # The sample variance is undefined for one observation.
            std = 0.0
        else:
            variance = math.fsum((v - mean) ** 2 for v in ordered) / (n - 1)
            std = math.sqrt(variance)

        def percentile(p: int) -> float:
            # Fractional rank into the sorted data; n == 1 yields rank 0.0
            # and therefore takes the exact-hit branch below.
            rank = (n - 1) * p / 100
            lower = math.floor(rank)
            upper = math.ceil(rank)
            if lower == upper:
                return ordered[lower]
            # Weight each neighbour by its distance to the opposite rank.
            return ordered[lower] * (upper - rank) + ordered[upper] * (rank - lower)

        return cls(
            mean=mean,
            std=std,
            min=ordered[0],
            max=ordered[-1],
            percentiles={p: percentile(p) for p in wanted},
        )


class BatchResult(BaseModel, Generic[T]):
    """Result of batch metric computation."""

    # arbitrary_types_allowed lets ``T`` be a type pydantic has no schema for.
    model_config = ConfigDict(frozen=True, arbitrary_types_allowed=True)

    results: list[T]
    """Individual results for each input."""

    count: int
    """Number of results."""

    stats: dict[str, AggregateStats]
    """Aggregate statistics keyed by score name."""


class Metric(Protocol[T]):
    """Protocol for metrics that compute scores from text."""

    @property
    def name(self) -> str:
        """Name of the metric."""
        ...

    @property
    def requires_reference(self) -> bool:
        """Whether reference text(s) must be supplied when scoring."""
        ...

    def score(self, candidate: str, reference: str | list[str] | None = None) -> T:
        """
        Compute the metric score for a candidate text.

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison, if required.

        Returns:
            The computed metric result.
        """
        ...

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[T]:
        """
        Compute metric scores for a batch of candidates.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, if required.

        Returns:
            BatchResult containing individual results and aggregate statistics.
        """
        ...
# --- src/veritext/metrics/results.py ---
"""Result types for metrics."""

from pydantic import BaseModel, ConfigDict

# Every result model is immutable; the same frozen configuration is reused.
_FROZEN = ConfigDict(frozen=True)


class BleuResult(BaseModel):
    """BLEU scores for n-gram orders 1 through 4, plus the brevity penalty."""

    model_config = _FROZEN

    bleu1: float
    """BLEU score at the unigram level (precision)."""

    bleu2: float
    """BLEU score at the bigram level (precision)."""

    bleu3: float
    """BLEU score at the trigram level (precision)."""

    bleu4: float
    """BLEU score at the 4-gram level (precision)."""

    brevity_penalty: float
    """Brevity penalty applied to the score."""

    @property
    def score(self) -> float:
        """Headline score: the 4-gram BLEU value."""
        return self.bleu4


class LexicalResult(BaseModel):
    """Lexical similarity measurements between candidate and reference."""

    model_config = _FROZEN

    jaccard: float
    """Jaccard similarity of the token sets: |intersection| / |union|."""

    token_overlap: float
    """Share of candidate tokens that also occur in the reference."""

    @property
    def score(self) -> float:
        """Headline score: the Jaccard similarity."""
        return self.jaccard


class RougeScore(BaseModel):
    """Precision, recall and F-measure for one ROUGE variant."""

    model_config = _FROZEN

    precision: float
    """Overlap divided by candidate length."""

    recall: float
    """Overlap divided by reference length."""

    fmeasure: float
    """F1-measure, the harmonic mean of precision and recall."""


class RougeResult(BaseModel):
    """ROUGE scores for the unigram, bigram and LCS variants."""

    model_config = _FROZEN

    rouge1: RougeScore
    """Score for ROUGE-1 (unigrams)."""

    rouge2: RougeScore
    """Score for ROUGE-2 (bigrams)."""

    rouge_l: RougeScore
    """Score for ROUGE-L (longest common subsequence)."""

    @property
    def score(self) -> float:
        """Headline score: the ROUGE-L F-measure."""
        return self.rouge_l.fmeasure


class ReadabilityResult(BaseModel):
    """Readability measurements for a text."""

    model_config = _FROZEN

    flesch_kincaid_grade: float
    """US school grade level (e.g., 8.0 = 8th grade reading level)."""

    flesch_reading_ease: float
    """Reading ease on a 0-100 scale; higher means easier to read."""

    @property
    def score(self) -> float:
        """Headline score: the Flesch reading ease."""
        return self.flesch_reading_ease


class SemanticResult(BaseModel):
    """Semantic similarity measurement and the model that produced it."""

    model_config = _FROZEN

    similarity: float
    """Cosine similarity score (0.0 to 1.0)."""

    model: str
    """Name of the embedding model used."""

    @property
    def score(self) -> float:
        """Headline score: the cosine similarity."""
        return self.similarity