metric protocol and batch scoring types

Add Metric protocol, AggregateStats for statistical summaries, and
BatchResult for batch processing support.
This commit is contained in:
2025-03-12 20:13:11 +00:00
parent c53cdd2536
commit 7832fa3d59
2 changed files with 246 additions and 0 deletions

View File

@@ -0,0 +1,136 @@
"""Base types and protocols for metrics."""
import math
from typing import Generic, Protocol, TypeVar
from pydantic import BaseModel, ConfigDict
T = TypeVar("T")
class MetricResult(Protocol):
    """Protocol for metric result types.

    No members are declared on this protocol here; it serves as a
    structural marker type for annotations.
    """
class AggregateStats(BaseModel):
    """Aggregate statistics for a batch of metric scores.

    Instances are frozen (immutable after construction). Use
    ``from_values`` to build one from a list of raw scores.
    """

    model_config = ConfigDict(frozen=True)

    mean: float
    """Mean of the scores."""
    std: float
    """Sample standard deviation of the scores (n - 1 denominator; 0.0 for a single value)."""
    min: float
    """Minimum score."""
    max: float
    """Maximum score."""
    percentiles: dict[int, float]
    """Percentile values keyed by percentile rank (25, 50, 75, 95 by default)."""

    @classmethod
    def from_values(
        cls,
        values: list[float],
        *,
        percentiles: tuple[int, ...] = (25, 50, 75, 95),
    ) -> "AggregateStats":
        """
        Compute aggregate statistics from a list of values.

        Percentiles are computed on the sorted values with linear
        interpolation between the two closest ranks.

        Args:
            values: List of numeric values to aggregate.
            percentiles: Percentile ranks (each within 0-100) to compute.
                Defaults to the quartiles plus the 95th percentile.

        Returns:
            AggregateStats with computed statistics.

        Raises:
            ValueError: If values list is empty, or if any requested
                percentile rank lies outside 0-100.
        """
        if not values:
            raise ValueError("Cannot compute statistics from empty list")

        # Materialize once so any iterable of ranks can be validated and reused.
        requested = tuple(percentiles)
        out_of_range = [p for p in requested if not 0 <= p <= 100]
        if out_of_range:
            raise ValueError(
                f"Percentiles must be within 0-100, got {out_of_range}"
            )

        n = len(values)
        mean = sum(values) / n

        if n == 1:
            # A single observation has no spread; avoids dividing by n - 1 == 0.
            std = 0.0
        else:
            variance = sum((v - mean) ** 2 for v in values) / (n - 1)
            std = math.sqrt(variance)

        sorted_values = sorted(values)

        def percentile(p: int) -> float:
            # Fractional rank of the p-th percentile within the sorted values.
            k = (n - 1) * p / 100
            f = math.floor(k)
            c = math.ceil(k)
            if f == c:
                # Rank lands exactly on an element (always the case when n == 1).
                return sorted_values[f]
            # Weight the two neighbouring elements by their distance to k.
            return sorted_values[f] * (c - k) + sorted_values[c] * (k - f)

        return cls(
            mean=mean,
            std=std,
            min=sorted_values[0],
            max=sorted_values[-1],
            percentiles={p: percentile(p) for p in requested},
        )
class BatchResult(BaseModel, Generic[T]):
    """Result of batch metric computation.

    Generic over ``T``, the type of each individual result. Instances are
    frozen; ``arbitrary_types_allowed`` is enabled in the model config so
    that ``T`` is not restricted to types pydantic can validate natively.
    """

    # NOTE(review): nothing in this class ties ``count`` to ``len(results)``;
    # presumably the producer keeps the two consistent — confirm.
    model_config = ConfigDict(frozen=True, arbitrary_types_allowed=True)

    results: list[T]
    """Individual results for each input."""
    count: int
    """Number of results."""
    stats: dict[str, AggregateStats]
    """Aggregate statistics keyed by score name."""
class Metric(Protocol[T]):
    """Protocol for metrics that compute scores from text.

    ``T`` is the result type returned by ``score`` and collected inside the
    ``BatchResult`` returned by ``batch_score``.
    """

    @property
    def name(self) -> str:
        """Name of the metric."""
        ...

    @property
    def requires_reference(self) -> bool:
        """Whether the metric needs reference text(s) in order to score."""
        ...

    def score(self, candidate: str, reference: str | list[str] | None = None) -> T:
        """
        Compute the metric score for a candidate text.

        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison, if required.
                May be a single string, a list of strings, or None.

        Returns:
            The computed metric result.
        """
        ...

    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[T]:
        """
        Compute metric scores for a batch of candidates.

        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, if required.
                Either one string per candidate or a list of strings per
                candidate; None when no references are supplied.

        Returns:
            BatchResult containing individual results and aggregate statistics.
        """
        ...

View File

@@ -0,0 +1,110 @@
"""Result types for metrics."""
from pydantic import BaseModel, ConfigDict
class BleuResult(BaseModel):
    """Result of BLEU score computation.

    Frozen model holding the 1- to 4-gram BLEU values and the brevity
    penalty. ``score`` exposes ``bleu4`` as the single headline value.
    """

    model_config = ConfigDict(frozen=True)

    bleu1: float
    """Unigram BLEU score (precision)."""
    bleu2: float
    """Bigram BLEU score (precision)."""
    bleu3: float
    """Trigram BLEU score (precision)."""
    bleu4: float
    """4-gram BLEU score (precision)."""
    brevity_penalty: float
    """Brevity penalty applied to the score."""

    @property
    def score(self) -> float:
        """Return the headline score for this result: ``bleu4``."""
        return self.bleu4
class LexicalResult(BaseModel):
    """Result of lexical similarity computation.

    Frozen model holding two token-level overlap measures. ``score``
    exposes ``jaccard`` as the single headline value.
    """

    model_config = ConfigDict(frozen=True)

    jaccard: float
    """Jaccard similarity: |intersection| / |union| of token sets."""
    token_overlap: float
    """Proportion of candidate tokens found in reference."""

    @property
    def score(self) -> float:
        """Return the headline score for this result: ``jaccard``."""
        return self.jaccard
class RougeScore(BaseModel):
    """Individual ROUGE variant score with precision, recall, F-measure.

    Frozen model; unlike the other result types in this file it defines
    no ``score`` property of its own.
    """

    model_config = ConfigDict(frozen=True)

    precision: float
    """Precision: overlap / candidate length."""
    recall: float
    """Recall: overlap / reference length."""
    fmeasure: float
    """F1-measure: harmonic mean of precision and recall."""
class RougeResult(BaseModel):
    """Result of ROUGE score computation.

    Frozen model bundling one ``RougeScore`` per ROUGE variant. ``score``
    exposes the ROUGE-L F-measure as the single headline value.
    """

    model_config = ConfigDict(frozen=True)

    rouge1: RougeScore
    """ROUGE-1 (unigram) score."""
    rouge2: RougeScore
    """ROUGE-2 (bigram) score."""
    rouge_l: RougeScore
    """ROUGE-L (longest common subsequence) score."""

    @property
    def score(self) -> float:
        """Return the headline score for this result: ``rouge_l.fmeasure``."""
        return self.rouge_l.fmeasure
class ReadabilityResult(BaseModel):
    """Result of readability computation.

    Frozen model holding two Flesch-family readability values. ``score``
    exposes ``flesch_reading_ease`` as the single headline value.
    """

    model_config = ConfigDict(frozen=True)

    flesch_kincaid_grade: float
    """US grade level (e.g., 8.0 = 8th grade reading level)."""
    # NOTE(review): the 0-100 range stated below is not enforced by this
    # model; presumably the producer clamps or accepts outliers — confirm.
    flesch_reading_ease: float
    """Score 0-100, higher = easier to read."""

    @property
    def score(self) -> float:
        """Return the headline score for this result: ``flesch_reading_ease``."""
        return self.flesch_reading_ease
class SemanticResult(BaseModel):
    """Result of semantic similarity computation.

    Frozen model holding the similarity value together with the name of
    the embedding model that produced it. ``score`` exposes ``similarity``.
    """

    model_config = ConfigDict(frozen=True)

    # NOTE(review): cosine similarity can in general be negative; the
    # 0.0-1.0 range stated below is not enforced here — confirm the producer
    # guarantees it.
    similarity: float
    """Cosine similarity score (0.0 to 1.0)."""
    model: str
    """Name of the embedding model used."""

    @property
    def score(self) -> float:
        """Return the headline score for this result: ``similarity``."""
        return self.similarity