metric protocol and batch scoring types
Add Metric protocol, AggregateStats for statistical summaries, and BatchResult for batch processing support.
This commit is contained in:
136
src/veritext/metrics/base.py
Normal file
136
src/veritext/metrics/base.py
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
"""Base types and protocols for metrics."""
|
||||||
|
|
||||||
|
import math
|
||||||
|
from typing import Generic, Protocol, TypeVar
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
|
|
||||||
|
class MetricResult(Protocol):
    """Structural type for metric result objects.

    The protocol declares no members of its own; it serves purely as a
    typing marker for result types.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class AggregateStats(BaseModel):
    """Immutable statistical summary of a batch of metric scores."""

    model_config = ConfigDict(frozen=True)

    mean: float
    """Arithmetic mean of the scores."""

    std: float
    """Sample standard deviation of the scores (n - 1 denominator)."""

    min: float
    """Smallest score."""

    max: float
    """Largest score."""

    percentiles: dict[int, float]
    """Percentile values keyed by percentile (typically 25, 50, 75, 95)."""

    @classmethod
    def from_values(cls, values: list[float]) -> "AggregateStats":
        """
        Build the summary statistics for a list of values.

        The standard deviation uses the sample (n - 1) denominator and is
        0.0 when only one value is given. Percentiles 25, 50, 75 and 95 are
        obtained by linear interpolation between the two nearest ranks of
        the sorted values.

        Args:
            values: List of numeric values to aggregate.

        Returns:
            AggregateStats with computed statistics.

        Raises:
            ValueError: If values list is empty.
        """
        if not values:
            raise ValueError("Cannot compute statistics from empty list")

        count = len(values)
        average = sum(values) / count

        # A single observation has no spread; the n - 1 denominator would
        # otherwise be zero.
        spread = 0.0
        if count > 1:
            squared_deviations = sum((value - average) ** 2 for value in values)
            spread = math.sqrt(squared_deviations / (count - 1))

        ordered = sorted(values)
        last_index = count - 1

        def interpolate(pct: int) -> float:
            # Fractional rank of the requested percentile in the sorted
            # data; with one value this is always rank 0.
            position = last_index * pct / 100
            lower = math.floor(position)
            upper = math.ceil(position)
            if lower == upper:
                # The rank lands exactly on an element.
                return ordered[lower]
            # Weight the two neighbours by their distance to the rank.
            return ordered[lower] * (upper - position) + ordered[upper] * (position - lower)

        return cls(
            mean=average,
            std=spread,
            min=ordered[0],
            max=ordered[-1],
            percentiles={pct: interpolate(pct) for pct in (25, 50, 75, 95)},
        )
|
||||||
|
|
||||||
|
|
||||||
|
class BatchResult(BaseModel, Generic[T]):
    """Immutable container for the outcome of a batch metric computation.

    Generic over ``T``, the type of each individual result. The model
    configuration allows arbitrary types and freezes instances.
    """

    model_config = ConfigDict(frozen=True, arbitrary_types_allowed=True)

    results: list[T]
    """Individual results, one for each input."""

    count: int
    """Number of results."""

    stats: dict[str, AggregateStats]
    """Aggregate statistics, keyed by score name."""
|
||||||
|
|
||||||
|
|
||||||
|
class Metric(Protocol[T]):
    """Structural interface for metrics that score text.

    ``T`` is the result type returned by :meth:`score` and collected in the
    :class:`BatchResult` returned by :meth:`batch_score`.
    """

    @property
    def name(self) -> str:
        """Name of the metric."""
        ...

    @property
    def requires_reference(self) -> bool:
        """Whether the metric requires reference text."""
        ...

    def score(self, candidate: str, reference: str | list[str] | None = None) -> T:
        """
        Score a single candidate text.

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison, if required.

        Returns:
            The computed metric result.
        """
        ...

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[T]:
        """
        Score a batch of candidate texts.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, if required.

        Returns:
            BatchResult containing individual results and aggregate statistics.
        """
        ...
|
||||||
110
src/veritext/metrics/results.py
Normal file
110
src/veritext/metrics/results.py
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
"""Result types for metrics."""
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class BleuResult(BaseModel):
    """Immutable result of a BLEU score computation."""

    model_config = ConfigDict(frozen=True)

    bleu1: float
    """Unigram BLEU score (precision)."""

    bleu2: float
    """Bigram BLEU score (precision)."""

    bleu3: float
    """Trigram BLEU score (precision)."""

    bleu4: float
    """4-gram BLEU score (precision)."""

    brevity_penalty: float
    """Brevity penalty applied to the score."""

    @property
    def score(self) -> float:
        """Headline score for this result: the value of ``bleu4``."""
        return self.bleu4
|
||||||
|
|
||||||
|
|
||||||
|
class LexicalResult(BaseModel):
    """Immutable result of a lexical similarity computation."""

    model_config = ConfigDict(frozen=True)

    jaccard: float
    """Jaccard similarity: |intersection| / |union| of token sets."""

    token_overlap: float
    """Proportion of candidate tokens found in reference."""

    @property
    def score(self) -> float:
        """Headline score for this result: the value of ``jaccard``."""
        return self.jaccard
|
||||||
|
|
||||||
|
|
||||||
|
class RougeScore(BaseModel):
    """Immutable precision / recall / F-measure triple for one ROUGE variant."""

    model_config = ConfigDict(frozen=True)

    precision: float
    """Precision: overlap / candidate length."""

    recall: float
    """Recall: overlap / reference length."""

    fmeasure: float
    """F1-measure: harmonic mean of precision and recall."""
|
||||||
|
|
||||||
|
|
||||||
|
class RougeResult(BaseModel):
    """Immutable result of a ROUGE score computation."""

    model_config = ConfigDict(frozen=True)

    rouge1: RougeScore
    """ROUGE-1 (unigram) score."""

    rouge2: RougeScore
    """ROUGE-2 (bigram) score."""

    rouge_l: RougeScore
    """ROUGE-L (longest common subsequence) score."""

    @property
    def score(self) -> float:
        """Headline score for this result: the F-measure of ``rouge_l``."""
        return self.rouge_l.fmeasure
|
||||||
|
|
||||||
|
|
||||||
|
class ReadabilityResult(BaseModel):
    """Immutable result of a readability computation."""

    model_config = ConfigDict(frozen=True)

    flesch_kincaid_grade: float
    """US grade level (e.g., 8.0 = 8th grade reading level)."""

    flesch_reading_ease: float
    """Score 0-100, higher = easier to read."""

    @property
    def score(self) -> float:
        """Headline score for this result: the value of ``flesch_reading_ease``."""
        return self.flesch_reading_ease
|
||||||
|
|
||||||
|
|
||||||
|
class SemanticResult(BaseModel):
    """Immutable result of a semantic similarity computation."""

    model_config = ConfigDict(frozen=True)

    similarity: float
    """Cosine similarity score (0.0 to 1.0)."""

    model: str
    """Name of the embedding model used."""

    @property
    def score(self) -> float:
        """Headline score for this result: the value of ``similarity``."""
        return self.similarity
|
||||||
Reference in New Issue
Block a user