From 7832fa3d5969c605434b0e9a289aa8b7a04a75c2 Mon Sep 17 00:00:00 2001
From: Kai Chappell <git@kschappell.com>
Date: Wed, 12 Mar 2025 20:13:11 +0000
Subject: [PATCH] metric protocol and batch scoring types

Add Metric protocol, AggregateStats for statistical summaries, and
BatchResult for batch processing support. Also add the result models
(BLEU, lexical, ROUGE, readability, semantic) in metrics/results.py.
---
 src/veritext/metrics/base.py    | 136 ++++++++++++++++++++++++++++++++
 src/veritext/metrics/results.py | 110 ++++++++++++++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 src/veritext/metrics/base.py
 create mode 100644 src/veritext/metrics/results.py

diff --git a/src/veritext/metrics/base.py b/src/veritext/metrics/base.py
new file mode 100644
index 0000000..21dede8
--- /dev/null
+++ b/src/veritext/metrics/base.py
@@ -0,0 +1,136 @@
+"""Base types and protocols for metrics."""
+
+import math
+from typing import Generic, Protocol, TypeVar
+
+from pydantic import BaseModel, ConfigDict
+
+T = TypeVar("T")
+
+
+class MetricResult(Protocol):
+    """Protocol for metric result types; it currently declares no members."""
+
+
+class AggregateStats(BaseModel):
+    """Aggregate statistics for a batch of metric scores."""
+
+    model_config = ConfigDict(frozen=True)
+
+    mean: float
+    """Arithmetic mean of the scores."""
+
+    std: float
+    """Standard deviation; from_values computes the sample (n - 1) form."""
+
+    min: float
+    """Minimum score."""
+
+    max: float
+    """Maximum score."""
+
+    percentiles: dict[int, float]
+    """Percentile values keyed by percentile (typically 25, 50, 75, 95)."""
+
+    @classmethod
+    def from_values(cls, values: list[float]) -> "AggregateStats":
+        """
+        Compute aggregate statistics from a list of values.
+
+        Args:
+            values: Non-empty list of numeric values to aggregate.
+
+        Returns:
+            AggregateStats with mean, sample std, min, max and percentiles.
+
+        Raises:
+            ValueError: If values list is empty.
+        """
+        if not values:
+            raise ValueError("Cannot compute statistics from empty list")
+
+        n = len(values)
+        # fsum avoids the rounding error a naive running sum accumulates.
+        mean = math.fsum(values) / n
+
+        if n == 1:
+            std = 0.0
+        else:
+            variance = math.fsum((v - mean) ** 2 for v in values) / (n - 1)
+            std = math.sqrt(variance)
+
+        sorted_values = sorted(values)
+
+        def percentile(p: int) -> float:
+            # Linear interpolation at fractional index k; n == 1 gives k == 0.
+            k = (n - 1) * p / 100
+            f = math.floor(k)
+            c = math.ceil(k)
+            if f == c:
+                return sorted_values[f]
+            return sorted_values[f] * (c - k) + sorted_values[c] * (k - f)
+
+        return cls(
+            mean=mean,
+            std=std,
+            min=sorted_values[0],
+            max=sorted_values[-1],
+            percentiles={p: percentile(p) for p in (25, 50, 75, 95)},
+        )
+
+
+class BatchResult(BaseModel, Generic[T]):
+    """Result of batch metric computation, generic over the result type T."""
+
+    model_config = ConfigDict(frozen=True, arbitrary_types_allowed=True)
+
+    results: list[T]
+    """Individual results, one for each input."""
+
+    count: int
+    """Number of results; a plain field, not checked against ``results`` here."""
+
+    stats: dict[str, AggregateStats]
+    """Aggregate statistics keyed by score name."""
+
+
+class Metric(Protocol[T]):
+    """Protocol for metrics that score text; T is the result type."""
+
+    @property
+    def name(self) -> str:
+        ...
+
+    @property
+    def requires_reference(self) -> bool:
+        ...
+
+    def score(self, candidate: str, reference: str | list[str] | None = None) -> T:
+        """
+        Compute the metric score for a single candidate text.
+
+        Args:
+            candidate: The text to score.
+            reference: A reference text or a list of them; None if not required.
+
+        Returns:
+            The computed metric result.
+        """
+        ...
+
+    def batch_score(
+        self,
+        candidates: list[str],
+        references: list[str] | list[list[str]] | None = None,
+    ) -> BatchResult[T]:
+        """
+        Compute metric scores for a batch of candidates.
+
+        Args:
+            candidates: List of texts to score.
+            references: Reference text(s) for each candidate; None if not required.
+
+        Returns:
+            BatchResult containing individual results and aggregate statistics.
+        """
+        ...
diff --git a/src/veritext/metrics/results.py b/src/veritext/metrics/results.py
new file mode 100644
index 0000000..4ea49fc
--- /dev/null
+++ b/src/veritext/metrics/results.py
@@ -0,0 +1,110 @@
+"""Result types for metrics."""
+
+from pydantic import BaseModel, ConfigDict
+
+
+class BleuResult(BaseModel):
+    """Result of BLEU score computation; ``score`` returns ``bleu4``."""
+
+    model_config = ConfigDict(frozen=True)
+
+    bleu1: float
+    """Unigram BLEU score (precision)."""
+
+    bleu2: float
+    """Bigram BLEU score (precision)."""
+
+    bleu3: float
+    """Trigram BLEU score (precision)."""
+
+    bleu4: float
+    """4-gram BLEU score (precision)."""
+
+    brevity_penalty: float
+    """Brevity penalty applied to the score."""
+
+    @property
+    def score(self) -> float:
+        return self.bleu4
+
+
+class LexicalResult(BaseModel):
+    """Result of lexical similarity computation; ``score`` returns ``jaccard``."""
+
+    model_config = ConfigDict(frozen=True)
+
+    jaccard: float
+    """Jaccard similarity: |intersection| / |union| of token sets."""
+
+    token_overlap: float
+    """Proportion of candidate tokens found in reference."""
+
+    @property
+    def score(self) -> float:
+        return self.jaccard
+
+
+class RougeScore(BaseModel):
+    """Precision, recall, and F-measure for a single ROUGE variant."""
+
+    model_config = ConfigDict(frozen=True)
+
+    precision: float
+    """Precision: overlap / candidate length."""
+
+    recall: float
+    """Recall: overlap / reference length."""
+
+    fmeasure: float
+    """F1-measure: harmonic mean of precision and recall."""
+
+
+class RougeResult(BaseModel):
+    """Result of ROUGE score computation; ``score`` returns ROUGE-L F-measure."""
+
+    model_config = ConfigDict(frozen=True)
+
+    rouge1: RougeScore
+    """ROUGE-1 (unigram) score."""
+
+    rouge2: RougeScore
+    """ROUGE-2 (bigram) score."""
+
+    rouge_l: RougeScore
+    """ROUGE-L (longest common subsequence) score."""
+
+    @property
+    def score(self) -> float:
+        return self.rouge_l.fmeasure
+
+
+class ReadabilityResult(BaseModel):
+    """Result of readability computation; ``score`` returns Flesch reading ease."""
+
+    model_config = ConfigDict(frozen=True)
+
+    flesch_kincaid_grade: float
+    """US grade level (e.g., 8.0 = 8th grade reading level)."""
+
+    flesch_reading_ease: float
+    """Score 0-100, higher = easier to read."""
+
+    @property
+    def score(self) -> float:
+        return self.flesch_reading_ease
+
+
+class SemanticResult(BaseModel):
+    """Result of semantic similarity computation; ``score`` is ``similarity``."""
+
+    model_config = ConfigDict(frozen=True)
+
+    similarity: float
+    """Cosine similarity score (0.0 to 1.0)."""
+
+    model: str
+    """Name of the embedding model used."""
+
+    @property
+    def score(self) -> float:
+        return self.similarity