metric protocol and batch scoring types

Add Metric protocol, AggregateStats for statistical summaries, and BatchResult for batch processing support.
2025-03-12 20:13:11 +00:00
parent c53cdd2536
commit 7832fa3d59
2 changed files with 246 additions and 0 deletions
@@ -0,0 +1,136 @@
 """Base types and protocols for metrics."""
 import math
 from typing import Generic, Protocol, TypeVar
 from pydantic import BaseModel, ConfigDict
 T = TypeVar("T")
 class MetricResult(Protocol):
    """
    Structural marker type for the result objects that metrics return.

    The protocol declares no members, so it places no structural
    requirements on the classes used as metric results.
    """
    # NOTE(review): if every concrete result is expected to expose a common
    # attribute (e.g. a headline score), consider declaring it here - confirm.
 class AggregateStats(BaseModel):
    """
    Immutable summary statistics for a batch of metric scores.

    Instances are frozen. Use :meth:`from_values` to derive every field
    from a list of raw scores.
    """
    model_config = ConfigDict(frozen=True)
    mean: float
    """Mean of the scores."""
    std: float
    """Standard deviation of the scores."""
    min: float
    """Minimum score."""
    max: float
    """Maximum score."""
    percentiles: dict[int, float]
    """Percentile values keyed by percentile rank (typically 25, 50, 75, 95)."""
    @classmethod
    def from_values(
        cls,
        values: list[float],
        *,
        percentiles: tuple[int, ...] = (25, 50, 75, 95),
    ) -> "AggregateStats":
        """
        Compute aggregate statistics from a list of values.

        Args:
            values: List of numeric values to aggregate.
            percentiles: Percentile ranks to report, each within 0..100.
                Defaults to 25, 50, 75 and 95.

        Returns:
            AggregateStats holding the mean, the sample standard deviation
            (``n - 1`` denominator; ``0.0`` for a single value), the
            minimum, the maximum, and one linearly interpolated value per
            requested percentile rank.

        Raises:
            ValueError: If values list is empty, or if any requested
                percentile rank lies outside 0..100.
        """
        if not values:
            raise ValueError("Cannot compute statistics from empty list")
        # Materialize once so any iterable of ranks can be validated and reused.
        ranks = tuple(percentiles)
        invalid = [p for p in ranks if not 0 <= p <= 100]
        if invalid:
            # Out-of-range ranks would index past the end of the data or wrap
            # around via a negative index, so reject them up front.
            raise ValueError(f"Percentiles must be between 0 and 100, got {invalid}")
        n = len(values)
        mean = sum(values) / n
        if n == 1:
            # A single observation has no spread; also avoids dividing by n - 1 == 0.
            std = 0.0
        else:
            # Sample variance: divide by n - 1 rather than n.
            variance = sum((v - mean) ** 2 for v in values) / (n - 1)
            std = math.sqrt(variance)
        sorted_values = sorted(values)
        def percentile(p: int) -> float:
            # Fractional position of the p-th percentile in the sorted data.
            k = (n - 1) * p / 100
            f = math.floor(k)
            c = math.ceil(k)
            if f == c:
                # Lands exactly on an element (always the case when n == 1).
                return sorted_values[f]
            # Linear interpolation between the two neighbouring elements.
            return sorted_values[f] * (c - k) + sorted_values[c] * (k - f)
        return cls(
            mean=mean,
            std=std,
            min=sorted_values[0],
            max=sorted_values[-1],
            percentiles={p: percentile(p) for p in ranks},
        )
 class BatchResult(BaseModel, Generic[T]):
    """
    Immutable container for the outcome of scoring a batch of inputs.

    Generic over ``T``, the per-item result type. The model is frozen and
    configured with ``arbitrary_types_allowed`` so that ``T`` does not have
    to be a pydantic-native type.
    """
    model_config = ConfigDict(arbitrary_types_allowed=True, frozen=True)
    results: list[T]
    """Per-input results, one entry for each scored input."""
    count: int
    """How many results the batch holds."""
    stats: dict[str, AggregateStats]
    """Aggregate statistics, keyed by the name of the score they summarize."""
 class Metric(Protocol[T]):
    """
    Structural interface for metrics that score text.

    ``T`` is the result type returned by :meth:`score`; :meth:`batch_score`
    returns the same result type wrapped in a ``BatchResult``.
    """
    @property
    def name(self) -> str:
        """Name of the metric."""
        ...
    @property
    def requires_reference(self) -> bool:
        """Flag for reference text(s); presumably True when scoring needs them - confirm."""
        ...
    def score(
        self,
        candidate: str,
        reference: str | list[str] | None = None,
    ) -> T:
        """
        Score a single candidate text.

        Args:
            candidate: Text to be scored.
            reference: One reference text, several reference texts, or
                ``None`` (the default) when no reference is supplied.

        Returns:
            The metric result computed for ``candidate``.
        """
        ...
    def batch_score(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]] | None = None,
    ) -> BatchResult[T]:
        """
        Score a batch of candidate texts.

        Args:
            candidates: Texts to be scored.
            references: Reference text(s) for each candidate, if required;
                ``None`` (the default) when no references are supplied.

        Returns:
            A BatchResult with the individual results and aggregate
            statistics.
        """
        ...
@@ -0,0 +1,110 @@
 """Result types for metrics."""
 from pydantic import BaseModel, ConfigDict
 class BleuResult(BaseModel):
    """
    Immutable record of the values produced by a BLEU computation.

    The ``score`` property exposes ``bleu4`` as the headline number.
    """
    model_config = ConfigDict(frozen=True)
    bleu1: float
    """BLEU score over unigrams (precision)."""
    bleu2: float
    """BLEU score over bigrams (precision)."""
    bleu3: float
    """BLEU score over trigrams (precision)."""
    bleu4: float
    """BLEU score over 4-grams (precision)."""
    brevity_penalty: float
    """Brevity penalty that was applied to the score."""
    @property
    def score(self) -> float:
        """Headline score for this result: the 4-gram value ``bleu4``."""
        return self.bleu4
 class LexicalResult(BaseModel):
    """
    Immutable record of the values produced by a lexical similarity check.

    The ``score`` property exposes ``jaccard`` as the headline number.
    """
    model_config = ConfigDict(frozen=True)
    jaccard: float
    """Jaccard similarity of the token sets: |intersection| / |union|."""
    token_overlap: float
    """Share of candidate tokens that also occur in the reference."""
    @property
    def score(self) -> float:
        """Headline score for this result: the ``jaccard`` value."""
        return self.jaccard
 class RougeScore(BaseModel):
    """
    Immutable precision / recall / F-measure triple for one ROUGE variant.
    """
    model_config = ConfigDict(frozen=True)
    precision: float
    """Precision, i.e. overlap divided by candidate length."""
    recall: float
    """Recall, i.e. overlap divided by reference length."""
    fmeasure: float
    """F1-measure, the harmonic mean of precision and recall."""
 class RougeResult(BaseModel):
    """
    Immutable record of the values produced by a ROUGE computation.

    Holds one ``RougeScore`` per variant; the ``score`` property exposes the
    ROUGE-L F-measure as the headline number.
    """
    model_config = ConfigDict(frozen=True)
    rouge1: RougeScore
    """Score for ROUGE-1 (unigrams)."""
    rouge2: RougeScore
    """Score for ROUGE-2 (bigrams)."""
    rouge_l: RougeScore
    """Score for ROUGE-L (longest common subsequence)."""
    @property
    def score(self) -> float:
        """Headline score for this result: ``rouge_l.fmeasure``."""
        return self.rouge_l.fmeasure
 class ReadabilityResult(BaseModel):
    """
    Immutable record of the values produced by a readability computation.

    The ``score`` property exposes ``flesch_reading_ease`` as the headline
    number.
    """
    model_config = ConfigDict(frozen=True)
    flesch_kincaid_grade: float
    """US school grade level (e.g., 8.0 = 8th grade reading level)."""
    flesch_reading_ease: float
    """Reading-ease score, nominally 0-100; higher = easier to read."""
    @property
    def score(self) -> float:
        """Headline score for this result: ``flesch_reading_ease``."""
        return self.flesch_reading_ease
 class SemanticResult(BaseModel):
    """
    Immutable record of the values produced by a semantic similarity check.

    The ``score`` property exposes ``similarity`` as the headline number.
    """
    model_config = ConfigDict(frozen=True)
    similarity: float
    """Cosine similarity score (documented as 0.0 to 1.0; not validated here)."""
    model: str
    """Name of the embedding model that was used."""
    @property
    def score(self) -> float:
        """Headline score for this result: the ``similarity`` value."""
        return self.similarity