feat(metrics): add metric protocol and batch types

Add Metric protocol, AggregateStats for statistical summaries, and BatchResult for batch processing support.
2026-02-03 16:45:38 +00:00
parent 14dcddcbba
commit e6167005e5
2 changed files with 180 additions and 0 deletions
@@ -0,0 +1,139 @@
 """Base types and protocols for metrics."""
 import math
 from typing import Generic, Protocol, TypeVar
 from pydantic import BaseModel, ConfigDict
 T = TypeVar("T")
 class MetricResult(Protocol):
    """Protocol for metric result types."""
 class AggregateStats(BaseModel):
    """Aggregate statistics for a batch of metric scores."""
    model_config = ConfigDict(frozen=True)
    mean: float
    """Mean of the scores."""
    std: float
    """Standard deviation of the scores."""
    min: float
    """Minimum score."""
    max: float
    """Maximum score."""
    percentiles: dict[int, float]
    """Percentile values (typically 25, 50, 75, 95)."""
    @classmethod
    def from_values(cls, values: list[float]) -> "AggregateStats":
        """
        Compute aggregate statistics from a list of values.
        Args:
            values: List of numeric values to aggregate.
        Returns:
            AggregateStats with computed statistics.
        Raises:
            ValueError: If values list is empty.
        """
        if not values:
            raise ValueError("Cannot compute statistics from empty list")
        n = len(values)
        mean = sum(values) / n
        if n == 1:
            std = 0.0
        else:
            variance = sum((v - mean) ** 2 for v in values) / (n - 1)
            std = math.sqrt(variance)
        sorted_values = sorted(values)
        def percentile(p: int) -> float:
            """Compute percentile using linear interpolation."""
            if n == 1:
                return sorted_values[0]
            k = (n - 1) * p / 100
            f = math.floor(k)
            c = math.ceil(k)
            if f == c:
                return sorted_values[int(k)]
            return sorted_values[f] * (c - k) + sorted_values[c] * (k - f)
        return cls(
            mean=mean,
            std=std,
            min=sorted_values[0],
            max=sorted_values[-1],
            percentiles={p: percentile(p) for p in (25, 50, 75, 95)},
        )
 class BatchResult(BaseModel, Generic[T]):
    """Result of batch metric computation."""
    model_config = ConfigDict(frozen=True, arbitrary_types_allowed=True)
    results: list[T]
    """Individual results for each input."""
    count: int
    """Number of results."""
    stats: dict[str, AggregateStats]
    """Aggregate statistics keyed by score name."""
 class Metric(Protocol[T]):
    """Protocol for metrics that compute scores from text."""
    @property
    def name(self) -> str:
        """Return the name of this metric."""
        ...
    @property
    def requires_reference(self) -> bool:
        """Return whether this metric requires reference text."""
        ...
    def score(self, candidate: str, reference: str | list[str] | None = None) -> T:
        """
        Compute the metric score for a candidate text.
        Args:
            candidate: The text to score.
            reference: Reference text(s) for comparison, if required.
        Returns:
            The computed metric result.
        """
        ...
    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[T]:
        """
        Compute metric scores for a batch of candidates.
        Args:
            candidates: List of texts to score.
            references: Reference text(s) for each candidate, if required.
        Returns:
            BatchResult containing individual results and aggregate statistics.
        """
        ...
@@ -0,0 +1,41 @@
 """Result types for metrics."""
 from pydantic import BaseModel, ConfigDict
 class BleuResult(BaseModel):
    """Result of BLEU score computation."""
    model_config = ConfigDict(frozen=True)
    bleu1: float
    """Unigram BLEU score (precision)."""
    bleu2: float
    """Bigram BLEU score (precision)."""
    bleu3: float
    """Trigram BLEU score (precision)."""
    bleu4: float
    """4-gram BLEU score (precision)."""
    brevity_penalty: float
    """Brevity penalty applied to the score."""
    @property
    def score(self) -> float:
        """Return the composite BLEU-4 score with brevity penalty."""
        return self.bleu4
 class LexicalResult(BaseModel):
    """Result of lexical similarity computation."""
    model_config = ConfigDict(frozen=True)
    jaccard: float
    """Jaccard similarity: |intersection| / |union| of token sets."""
    token_overlap: float
    """Proportion of candidate tokens found in reference."""