docs(changelog): add benchmark entries

Document benchmark module features in changelog.
test(benchmark): add benchmark module tests
2026-02-03 18:10:19 +00:00 · 2026-02-03 18:10:13 +00:00 · 2026-02-03 18:10:07 +00:00 · 2026-02-03 18:10:01 +00:00 · 2026-02-03 18:09:55 +00:00 · 2026-02-03 18:09:49 +00:00
20 changed files with 2162 additions and 0 deletions
@@ -31,3 +31,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `SemanticValidator` for threshold-based semantic similarity validation
 - `semantic()` factory function for creating semantic validators
 - Embedding caching for performance optimisation in repeated comparisons
 - Native pytest plugin for CI/CD integration (entry point: `pytest11`)
 - `validate_text()` assertion function for expressive test assertions
 - `text_validation` marker for filtering validation tests
 - Pytest fixtures: `text_validator` factory and `validation_context` helper
 - Detailed failure messages with text preview and check diagnostics
 - Benchmark module for quality tracking and regression detection
 - `Benchmark` class for evaluating text quality over time with metric storage
 - `BenchmarkRun` and `RegressionReport` data models for tracking runs
 - SQLite storage backend with WAL mode for concurrent access
 - Rolling window baseline computation for historical comparison
 - `check_regression()` for statistical comparison against baseline
 - `assert_no_regression()` raises `RegressionDetectedError` for CI integration
 - Customisable tolerance threshold and window size for regression detection
 - Metadata support for tracking git SHA, model versions, etc.
@@ -0,0 +1,12 @@
 """Benchmark module for quality tracking and regression detection."""
 from veritext.benchmark.models import BenchmarkRun, RegressionReport
 from veritext.benchmark.runner import Benchmark
 from veritext.benchmark.storage import BenchmarkStorage
 # Public names of the benchmark package; each one is imported from its
 # submodule above so callers can import them from the package directly.
 __all__ = [
    "Benchmark",
    "BenchmarkRun",
    "BenchmarkStorage",
    "RegressionReport",
 ]
@@ -0,0 +1,72 @@
 """Benchmark data models."""
 from datetime import datetime
 from typing import Any
 from pydantic import BaseModel, ConfigDict, Field
 class BenchmarkRun(BaseModel):
    """Record of a single benchmark execution.
    The model is configured as frozen, so fields cannot be reassigned after
    construction. ``metadata`` is the only optional field and defaults to an
    empty dict.
    """
    # frozen=True: instances are immutable once validated.
    model_config = ConfigDict(frozen=True)
    id: str
    """UUID for this run."""
    benchmark_name: str
    """Name identifying this benchmark suite."""
    timestamp: datetime
    """When the benchmark was executed."""
    veritext_version: str
    """Version of veritext used."""
    metrics: dict[str, float]
    """Metric results, e.g. {"rouge_l": 0.82, "bleu4": 0.71}."""
    sample_count: int
    """Number of samples evaluated."""
    # default_factory gives every instance its own dict instead of a shared one.
    metadata: dict[str, Any] = Field(default_factory=dict)
    """Optional metadata (git_sha, model version, etc.)."""
 class RegressionReport(BaseModel):
    """Immutable result of comparing a run's metrics with a baseline."""
    model_config = ConfigDict(frozen=True)
    detected: bool
    """Whether a regression was detected."""
    baseline: dict[str, float]
    """Baseline metric values (rolling average)."""
    current: dict[str, float]
    """Current run metric values."""
    deltas: dict[str, float]
    """Difference from baseline (negative = regression)."""
    tolerance: float
    """Tolerance threshold used for detection."""
    @property
    def summary(self) -> str:
        """Describe the report as human-readable text.
        Returns:
            A fixed one-line message when ``detected`` is false. Otherwise a
            header quoting the tolerance, followed by one line for every
            metric whose delta is below ``-tolerance``; metrics missing from
            ``current`` or ``baseline`` are printed as 0.0.
        """
        if self.detected:
            floor = -self.tolerance
            offenders = []
            for name, change in self.deltas.items():
                if change < floor:
                    now = self.current.get(name, 0.0)
                    before = self.baseline.get(name, 0.0)
                    offenders.append(
                        f"  {name}: {now:.4f} "
                        f"(baseline: {before:.4f}, "
                        f"delta: {change:+.4f})"
                    )
            header = f"Regression detected (tolerance: {self.tolerance:.2%}):\n"
            return header + "\n".join(offenders)
        return "No regression detected. All metrics within tolerance."
@@ -0,0 +1,87 @@
 """Regression detection using rolling window comparison."""
 from veritext.benchmark.models import BenchmarkRun, RegressionReport
 def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
 ) -> dict[str, float]:
    """
    Compute rolling average baseline from recent runs.
    Each metric is averaged over the runs in the window that report it, so a
    metric absent from some runs is averaged over fewer values.
    Args:
        runs: List of benchmark runs (most recent first).
        window: Number of runs to include in the baseline. Values below 1
            select no runs and yield an empty baseline.
    Returns:
        Dictionary of metric names to their average values; empty when no
        runs fall inside the window.
    """
    if not runs:
        return {}
    # A negative slice bound counts from the END of the list, which would
    # silently drop the oldest runs instead of limiting to the newest ones.
    recent_runs = runs[: max(window, 0)]
    # Group the values of each metric across the selected runs.
    metric_values: dict[str, list[float]] = {}
    for run in recent_runs:
        for metric_name, value in run.metrics.items():
            metric_values.setdefault(metric_name, []).append(value)
    return {
        metric: sum(values) / len(values) for metric, values in metric_values.items()
    }
 def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
 ) -> RegressionReport:
    """
    Compare current metrics against baseline.
    For every baseline metric the delta is ``current - baseline``; a metric
    missing from ``current`` counts as 0.0. A regression is flagged when any
    delta is below ``-tolerance``.
    NOTE(review): the comparison is on the absolute difference, not on a
    fraction of the baseline value - confirm this is the intended meaning of
    "tolerance".
    Args:
        current: Current metric values.
        baseline: Baseline metric values.
        tolerance: Largest drop that is still accepted (e.g., 0.05).
    Returns:
        RegressionReport with the inputs, the per-metric deltas and the
        verdict. An empty baseline always yields ``detected=False`` with no
        deltas.
    """
    deltas: dict[str, float] = {}
    if baseline:
        deltas = {
            metric: current.get(metric, 0.0) - expected
            for metric, expected in baseline.items()
        }
    # With no deltas (empty baseline) nothing can regress.
    regressed = any(change < -tolerance for change in deltas.values())
    return RegressionReport(
        detected=regressed,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )
@@ -0,0 +1,186 @@
 """Benchmark execution and tracking."""
 import uuid
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
 import veritext
 from veritext.benchmark.models import BenchmarkRun, RegressionReport
 from veritext.benchmark.regression import compute_baseline, detect_regression
 from veritext.benchmark.storage import BenchmarkStorage
 from veritext.core.exceptions import RegressionDetectedError
 from veritext.metrics.bleu import Bleu
 from veritext.metrics.rouge import Rouge
 # Metrics computed by Benchmark.evaluate() when the caller passes no metric
 # list (or an empty one).
 DEFAULT_METRICS = ["rouge_l", "bleu4"]
 class Benchmark:
    """Track text quality over time."""
    def __init__(
        self,
        name: str,
        storage_path: str | Path = "benchmarks/",
    ) -> None:
        """
        Set up a benchmark tracker and its backing storage.
        Args:
            name: Name identifying this benchmark suite; also used as the
                stem of the database file name.
            storage_path: Directory for storing benchmark data.
        """
        self._name = name
        storage_dir = Path(storage_path)
        self._storage_path = storage_dir
        # Each benchmark name gets its own database file in the directory.
        db_file = storage_dir / f"{name}.db"
        self._storage = BenchmarkStorage(db_file)
        # Scorers are created once here and reused by _compute_metrics.
        self._bleu = Bleu()
        self._rouge = Rouge()
    @property
    def name(self) -> str:
        """Return the benchmark name passed to the constructor."""
        return self._name
    def _compute_metrics(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]],
        metric_names: list[str],
    ) -> dict[str, float]:
        """Compute requested metrics for the given samples."""
        results: dict[str, float] = {}
        for metric_name in metric_names:
            if metric_name in ("bleu1", "bleu2", "bleu3", "bleu4"):
                batch_result = self._bleu.batch_score(candidates, references)
                stats = batch_result.stats.get(metric_name)
                if stats:
                    results[metric_name] = stats.mean
            elif metric_name in (
                "rouge1",
                "rouge2",
                "rouge_l",
                "rouge1_fmeasure",
                "rouge2_fmeasure",
                "rouge_l_fmeasure",
            ):
                rouge_result = self._rouge.batch_score(candidates, references)
                # Map short names to stat names
                stat_name = metric_name
                if metric_name == "rouge1":
                    stat_name = "rouge1_fmeasure"
                elif metric_name == "rouge2":
                    stat_name = "rouge2_fmeasure"
                elif metric_name == "rouge_l":
                    stat_name = "rouge_l_fmeasure"
                stats = rouge_result.stats.get(stat_name)
                if stats:
                    results[metric_name] = stats.mean
        return results
    def evaluate(
        self,
        candidates: list[str],
        references: list[str] | list[list[str]],
        metrics: list[str] | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> BenchmarkRun:
        """
        Score candidates against references and persist the result as a run.
        Args:
            candidates: List of candidate texts to evaluate.
            references: Reference text(s) for each candidate.
            metrics: List of metrics to compute. ``None`` or an empty list
                selects the defaults, ["rouge_l", "bleu4"].
            metadata: Optional metadata (git_sha, model version, etc.).
        Returns:
            The BenchmarkRun record that was created and stored.
        """
        # Falsy covers both None and an empty list.
        selected = metrics if metrics else DEFAULT_METRICS
        scores = self._compute_metrics(candidates, references, selected)
        record = BenchmarkRun(
            id=str(uuid.uuid4()),
            benchmark_name=self._name,
            timestamp=datetime.now(UTC),
            veritext_version=veritext.__version__,
            metrics=scores,
            sample_count=len(candidates),
            metadata=metadata if metadata else {},
        )
        self._storage.save_run(record)
        return record
    def check_regression(
        self,
        tolerance: float = 0.05,
        window: int = 10,
    ) -> RegressionReport:
        """
        Compare latest run against historical baseline.
        The newest stored run is the "current" run; the baseline is computed
        from the runs that precede it, so the current run never influences
        its own baseline.
        Args:
            tolerance: Maximum allowed metric drop before regression is flagged.
            window: Number of historical runs to include in baseline.
        Returns:
            RegressionReport with comparison results. When nothing has been
            stored yet the report is empty with ``detected=False``.
        """
        # Only the newest run plus `window` predecessors are ever used, so do
        # not load the whole history. A negative window keeps the unbounded
        # fetch so its existing slicing behaviour is unchanged.
        limit = window + 1 if window >= 0 else None
        runs = self._storage.get_runs(self._name, limit=limit)
        if not runs:
            # No runs at all
            return RegressionReport(
                detected=False,
                baseline={},
                current={},
                deltas={},
                tolerance=tolerance,
            )
        current_run = runs[0]
        # Baseline excludes the current run
        historical_runs = runs[1:]
        baseline = compute_baseline(historical_runs, window=window)
        return detect_regression(current_run.metrics, baseline, tolerance)
    def assert_no_regression(
        self,
        tolerance: float = 0.05,
        window: int = 10,
    ) -> None:
        """
        Raise RegressionDetectedError if quality dropped.
        Runs ``check_regression`` with the same arguments and returns
        silently when the report does not flag a regression.
        Args:
            tolerance: Maximum allowed metric drop before regression is flagged.
            window: Number of historical runs to include in baseline.
        Raises:
            RegressionDetectedError: If a regression is detected; the message
                is the report's ``summary``.
        """
        report = self.check_regression(tolerance=tolerance, window=window)
        if report.detected:
            raise RegressionDetectedError(report.summary)
    def get_history(self, limit: int = 20) -> list[BenchmarkRun]:
        """
        Get recent benchmark runs.
        Delegates to the storage backend, filtered by this benchmark's name.
        Args:
            limit: Maximum number of runs to return.
        Returns:
            List of BenchmarkRun objects, most recent first.
        """
        return self._storage.get_runs(self._name, limit=limit)
@@ -0,0 +1,179 @@
 """SQLite storage for benchmark history."""
 import json
 import sqlite3
 from datetime import datetime
 from pathlib import Path
 from veritext.benchmark.models import BenchmarkRun
 from veritext.core.exceptions import StorageError
 class BenchmarkStorage:
    """SQLite-backed storage for benchmark runs."""
    def __init__(self, db_path: Path) -> None:
        """
        Initialise storage, creating tables if needed.
        The parent directory of ``db_path`` is created first, then the schema.
        Args:
            db_path: Path to the SQLite database file.
        """
        self._db_path = db_path
        # The directory has to exist before the database file is opened in it.
        self._ensure_parent_exists()
        self._init_database()
    def _ensure_parent_exists(self) -> None:
        """Ensure the parent directory exists."""
        # parents=True creates missing ancestors; exist_ok=True makes the call
        # a no-op when the directory is already there.
        self._db_path.parent.mkdir(parents=True, exist_ok=True)
    def _get_connection(self) -> sqlite3.Connection:
        """Get a database connection with WAL mode enabled."""
        conn = sqlite3.connect(str(self._db_path), timeout=30.0)
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA foreign_keys=ON")
        conn.row_factory = sqlite3.Row
        return conn
    def _init_database(self) -> None:
        """Create tables if they don't exist."""
        try:
            with self._get_connection() as conn:
                conn.executescript("""
                    CREATE TABLE IF NOT EXISTS benchmark_runs (
                        id TEXT PRIMARY KEY,
                        benchmark_name TEXT NOT NULL,
                        timestamp TEXT NOT NULL,
                        veritext_version TEXT NOT NULL,
                        sample_count INTEGER NOT NULL,
                        metadata TEXT
                    );
                    CREATE TABLE IF NOT EXISTS benchmark_metrics (
                        run_id TEXT REFERENCES benchmark_runs(id) ON DELETE CASCADE,
                        metric_name TEXT NOT NULL,
                        value REAL NOT NULL,
                        PRIMARY KEY (run_id, metric_name)
                    );
                    CREATE INDEX IF NOT EXISTS idx_benchmark_name
                    ON benchmark_runs(benchmark_name, timestamp DESC);
                """)
        except sqlite3.Error as e:
            raise StorageError(f"Failed to initialise database: {e}") from e
    def save_run(self, run: BenchmarkRun) -> None:
        """
        Persist a benchmark run.
        Args:
            run: The benchmark run to save.
        Raises:
            StorageError: If the save operation fails.
        """
        try:
            with self._get_connection() as conn:
                # Insert the run
                conn.execute(
                    """
                    INSERT INTO benchmark_runs
                    (id, benchmark_name, timestamp, veritext_version, sample_count, metadata)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        run.id,
                        run.benchmark_name,
                        run.timestamp.isoformat(),
                        run.veritext_version,
                        run.sample_count,
                        json.dumps(run.metadata) if run.metadata else None,
                    ),
                )
                # Insert metrics
                for metric_name, value in run.metrics.items():
                    conn.execute(
                        """
                        INSERT INTO benchmark_metrics (run_id, metric_name, value)
                        VALUES (?, ?, ?)
                        """,
                        (run.id, metric_name, value),
                    )
        except sqlite3.IntegrityError as e:
            raise StorageError(f"Run with id '{run.id}' already exists") from e
        except sqlite3.Error as e:
            raise StorageError(f"Failed to save benchmark run: {e}") from e
    def get_runs(
        self,
        benchmark_name: str,
        limit: int | None = None,
    ) -> list[BenchmarkRun]:
        """
        Retrieve runs for a benchmark, most recent first.
        Args:
            benchmark_name: Name of the benchmark to retrieve runs for.
            limit: Maximum number of runs to return.
        Returns:
            List of BenchmarkRun objects, most recent first.
        Raises:
            StorageError: If the retrieval fails.
        """
        try:
            with self._get_connection() as conn:
                query = """
                    SELECT id, benchmark_name, timestamp, veritext_version,
                           sample_count, metadata
                    FROM benchmark_runs
                    WHERE benchmark_name = ?
                    ORDER BY timestamp DESC
                """
                if limit is not None:
                    query += " LIMIT ?"
                    rows = conn.execute(query, (benchmark_name, limit)).fetchall()
                else:
                    rows = conn.execute(query, (benchmark_name,)).fetchall()
                runs = []
                for row in rows:
                    # Get metrics for this run
                    metrics_rows = conn.execute(
                        "SELECT metric_name, value FROM benchmark_metrics WHERE run_id = ?",
                        (row["id"],),
                    ).fetchall()
                    metrics = {m["metric_name"]: m["value"] for m in metrics_rows}
                    metadata = json.loads(row["metadata"]) if row["metadata"] else {}
                    runs.append(
                        BenchmarkRun(
                            id=row["id"],
                            benchmark_name=row["benchmark_name"],
                            timestamp=datetime.fromisoformat(row["timestamp"]),
                            veritext_version=row["veritext_version"],
                            sample_count=row["sample_count"],
                            metrics=metrics,
                            metadata=metadata,
                        )
                    )
                return runs
        except sqlite3.Error as e:
            raise StorageError(f"Failed to retrieve benchmark runs: {e}") from e
    def get_latest_run(self, benchmark_name: str) -> BenchmarkRun | None:
        """
        Get the most recent run for a benchmark.
        Args:
            benchmark_name: Name of the benchmark.
        Returns:
            The most recent BenchmarkRun, or None if no runs exist.
        """
        runs = self.get_runs(benchmark_name, limit=1)
        return runs[0] if runs else None
@@ -0,0 +1,22 @@
 """Pytest plugin for text validation.
 This plugin provides native pytest integration for Veritext, enabling
 text validation assertions in test suites.
 Example:
    >>> from veritext.pytest_plugin import validate_text
    >>>
    >>> def test_summary_quality():
    ...     text = "The quick brown fox jumps over the lazy dog."
    ...     validate_text(
    ...         text,
    ...         min_length=10,
    ...         max_length=100,
    ...         max_reading_grade=8.0,
    ...     )
 """
 from veritext.pytest_plugin.assertions import validate_text
 from veritext.pytest_plugin.plugin import pytest_configure
 # Names re-exported by the plugin package: the marker-registration hook and
 # the assertion helper imported above.
 __all__ = ["pytest_configure", "validate_text"]
@@ -0,0 +1,141 @@
 """Assertion functions for text validation in pytest."""
 from typing import TYPE_CHECKING
 from veritext.core.types import ValidationContext, ValidationResult
 from veritext.validators import all_of
 if TYPE_CHECKING:
    from veritext.validators.base import Check
 def validate_text(
    text: str,
    *,
    reference: str | list[str] | None = None,
    min_bleu: float | None = None,
    min_rouge: float | None = None,
    min_semantic: float | None = None,
    max_length: int | None = None,
    min_length: int | None = None,
    max_reading_grade: float | None = None,
    must_contain: list[str] | None = None,
    must_exclude: list[str] | None = None,
 ) -> None:
    """Assert text passes all specified validation criteria.
    This is the primary assertion function for text validation in pytest.
    It builds validators from keyword arguments and raises AssertionError
    with detailed failure information if validation fails.
    A criterion counts as requested whenever its argument is not ``None``, so
    a threshold of ``0.0`` is still a request and still needs a reference.
    Args:
        text: The text to validate.
        reference: Reference text for comparison metrics (BLEU, ROUGE, semantic).
        min_bleu: Minimum BLEU-4 score required (0.0 to 1.0).
        min_rouge: Minimum ROUGE-L F-measure required (0.0 to 1.0).
        min_semantic: Minimum semantic similarity required (0.0 to 1.0).
        max_length: Maximum character count allowed.
        min_length: Minimum character count required.
        max_reading_grade: Maximum Flesch-Kincaid grade level.
        must_contain: Patterns that must be present in the text.
        must_exclude: Patterns that must not be present in the text.
    Raises:
        AssertionError: With detailed failure information if validation fails.
        ValueError: If comparison metrics requested but reference not provided,
            or if no validation criteria are specified.
    Example:
        >>> validate_text(
        ...     "The quick brown fox jumps over the lazy dog.",
        ...     min_length=10,
        ...     max_length=100,
        ...     max_reading_grade=8.0,
        ... )
    """
    # Validate that reference is provided for comparison metrics. The test is
    # ``is not None`` rather than truthiness: a threshold of 0.0 is falsy but
    # still adds a comparison check below.
    needs_reference = any(
        threshold is not None for threshold in (min_bleu, min_rouge, min_semantic)
    )
    if needs_reference and reference is None:
        raise ValueError(
            "Reference text required for comparison metrics "
            "(min_bleu, min_rouge, min_semantic)"
        )
    # Build list of validators from kwargs. Validator factories are imported
    # lazily so only the ones actually requested get loaded.
    checks: list[Check] = []
    if min_bleu is not None:
        from veritext.validators import bleu
        checks.append(bleu(min_score=min_bleu))
    if min_rouge is not None:
        from veritext.validators import rouge
        checks.append(rouge(min_score=min_rouge))
    if min_semantic is not None:
        # Lazy import to avoid loading sentence-transformers unless needed
        from veritext.validators import semantic
        checks.append(semantic(min_score=min_semantic))
    if max_length is not None or min_length is not None:
        from veritext.validators import length
        checks.append(length(min_chars=min_length, max_chars=max_length))
    if max_reading_grade is not None:
        from veritext.validators import readability
        checks.append(readability(max_grade=max_reading_grade))
    if must_contain is not None:
        from veritext.validators import contains
        checks.append(contains(patterns=must_contain))
    if must_exclude is not None:
        from veritext.validators import excludes
        checks.append(excludes(patterns=must_exclude))
    if not checks:
        raise ValueError("At least one validation criterion must be specified")
    # Run validation: every check must pass.
    context = ValidationContext(reference=reference)
    validator = all_of(checks)
    result = validator.check(text, context)
    if not result.passed:
        raise AssertionError(_format_failure(text, result))
 def _format_failure(text: str, result: ValidationResult) -> str:
    """Format a detailed failure message for pytest output.
    Args:
        text: The text that was validated.
        result: The validation result containing check failures.
    Returns:
        Formatted failure message with check details.
    """
    lines = ["Text validation failed:"]
    lines.append("")
    # Show a preview of the text (truncated if long)
    preview = text[:100] + "..." if len(text) > 100 else text
    lines.append(f"  Text: {preview!r}")
    lines.append("")
    # List all failed checks with details
    lines.append("  Failed checks:")
    for check in result.failed_checks:
        lines.append(f"    - {check.name}:")
        lines.append(f"        {check.message}")
        if check.threshold is not None:
            lines.append(f"        Expected: >= {check.threshold}")
            lines.append(f"        Actual:   {check.actual}")
    return "\n".join(lines)
@@ -0,0 +1,80 @@
 """Pytest fixtures for text validation."""
 from typing import TYPE_CHECKING, Any
 import pytest
 from veritext.core.types import ValidationContext, ValidationResult
 from veritext.validators import all_of
 from veritext.validators.base import Check
 if TYPE_CHECKING:
    from collections.abc import Callable
 class ValidatorFactory:
    """Callable factory that turns a list of checks into one validation function."""
    def __call__(
        self,
        checks: list[Check],
        reference: str | list[str] | None = None,
    ) -> "Callable[[str], ValidationResult]":
        """Create a validator function from a list of checks.
        The checks are combined with ``all_of`` and a single
        ``ValidationContext`` is built once and reused for every call of the
        returned function.
        Args:
            checks: List of validation checks to apply.
            reference: Optional reference text for comparison metrics.
        Returns:
            A callable that takes text and returns a ValidationResult.
        """
        validator = all_of(checks)
        context = ValidationContext(reference=reference)
        # Closure over the combined validator and the shared context.
        def validate(text: str) -> ValidationResult:
            return validator.check(text, context)
        return validate
@pytest.fixture
 def text_validator() -> ValidatorFactory:
    """Provide a factory for building validators.
    A new ``ValidatorFactory`` is created each time the fixture is set up.
    Example:
        >>> def test_with_factory(text_validator):
        ...     from veritext.validators import bleu, length
        ...     validate = text_validator(
        ...         checks=[bleu(min_score=0.5), length(min_words=10)],
        ...         reference="The reference text.",
        ...     )
        ...     result = validate("Some candidate text.")
        ...     assert result.passed
    Returns:
        ValidatorFactory instance.
    """
    return ValidatorFactory()
@pytest.fixture
 def validation_context() -> "Callable[..., ValidationContext]":
    """Provide a factory for creating ValidationContext objects.
    The returned callable takes an optional ``reference`` and forwards any
    other keyword arguments as the context's ``metadata`` dict.
    Example:
        >>> def test_with_context(validation_context):
        ...     ctx = validation_context(reference="The reference text.")
        ...     assert ctx.reference == "The reference text."
    Returns:
        A callable that creates ValidationContext objects.
    """
    def _create(
        reference: str | list[str] | None = None,
        **metadata: Any,
    ) -> ValidationContext:
        # Extra keyword arguments are collected into the metadata mapping.
        return ValidationContext(reference=reference, metadata=metadata)
    return _create
@@ -0,0 +1,18 @@
 """Pytest hooks for Veritext plugin."""
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
    import pytest
 def pytest_configure(config: "pytest.Config") -> None:
    """Declare the ``text_validation`` marker with pytest.
    Args:
        config: Pytest configuration object.
    """
    marker_spec = "text_validation: mark test as a text validation test"
    config.addinivalue_line("markers", marker_spec)
@@ -0,0 +1 @@
 """Tests for the benchmark module."""
@@ -0,0 +1,145 @@
 """Tests for benchmark data models."""
 from datetime import UTC, datetime
 import pytest
 from pydantic import ValidationError
 from veritext.benchmark.models import BenchmarkRun, RegressionReport
 class TestBenchmarkRun:
    """Tests for BenchmarkRun model."""
    # Fixed, timezone-aware timestamp shared by every run built in this class.
    _WHEN = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
    def test_create_benchmark_run(self) -> None:
        """Required fields are stored as given and metadata defaults to empty."""
        record = BenchmarkRun(
            id="test-id-123",
            benchmark_name="test-benchmark",
            timestamp=self._WHEN,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75, "rouge_l": 0.82},
            sample_count=100,
        )
        assert record.id == "test-id-123"
        assert record.benchmark_name == "test-benchmark"
        assert record.veritext_version == "0.1.0-dev"
        assert record.metrics == {"bleu4": 0.75, "rouge_l": 0.82}
        assert record.sample_count == 100
        assert record.metadata == {}
    def test_create_with_metadata(self) -> None:
        """Metadata passed at construction is kept unchanged."""
        extra = {"git_sha": "abc123", "model_version": "gpt-4"}
        record = BenchmarkRun(
            id="test-id-456",
            benchmark_name="test-benchmark",
            timestamp=self._WHEN,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75},
            sample_count=50,
            metadata=extra,
        )
        assert record.metadata == {"git_sha": "abc123", "model_version": "gpt-4"}
    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing run is rejected."""
        record = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=self._WHEN,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )
        with pytest.raises(ValidationError):
            record.id = "new-id"  # type: ignore[misc]
    def test_serialisation(self) -> None:
        """model_dump() exposes the fields as a plain dict."""
        dumped = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=self._WHEN,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        ).model_dump()
        assert dumped["id"] == "test-id"
        assert dumped["benchmark_name"] == "test"
        assert dumped["metrics"] == {"bleu4": 0.5}
 class TestRegressionReport:
    """Tests for RegressionReport model."""
    @staticmethod
    def _bleu_regression() -> RegressionReport:
        """Report in which bleu4 drops past tolerance while rouge_l stays within it."""
        return RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )
    def test_no_regression_summary(self) -> None:
        """A report with detected=False summarises as 'no regression'."""
        clean = RegressionReport(
            detected=False,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.76, "rouge_l": 0.81},
            deltas={"bleu4": 0.01, "rouge_l": 0.01},
            tolerance=0.05,
        )
        assert "No regression detected" in clean.summary
    def test_regression_summary(self) -> None:
        """The summary names the regressed metric with current and baseline values."""
        text = self._bleu_regression().summary
        for fragment in ("Regression detected", "bleu4", "0.6500", "baseline: 0.7500"):
            assert fragment in text
    def test_regression_excludes_within_tolerance(self) -> None:
        """Only metrics beyond tolerance are listed in the summary."""
        text = self._bleu_regression().summary
        # rouge_l is -0.02, within tolerance of 0.05, so shouldn't appear
        assert "rouge_l" not in text
        # bleu4 is -0.10, exceeds tolerance, so should appear
        assert "bleu4" in text
    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing report is rejected."""
        empty = RegressionReport(
            detected=False,
            baseline={},
            current={},
            deltas={},
            tolerance=0.05,
        )
        with pytest.raises(ValidationError):
            empty.detected = True  # type: ignore[misc]
    def test_tolerance_in_summary(self) -> None:
        """The tolerance appears in the summary formatted as a percentage."""
        steep = RegressionReport(
            detected=True,
            baseline={"metric": 0.80},
            current={"metric": 0.50},
            deltas={"metric": -0.30},
            tolerance=0.10,
        )
        assert "10.00%" in steep.summary
@@ -0,0 +1,229 @@
 """Tests for regression detection."""
 from datetime import UTC, datetime
 import pytest
 from veritext.benchmark.models import BenchmarkRun
 from veritext.benchmark.regression import compute_baseline, detect_regression
 def make_run(run_id: str, metrics: dict[str, float], day: int = 1) -> BenchmarkRun:
    """Build a ``BenchmarkRun`` for the "test" benchmark.

    Args:
        run_id: Identifier stored on the run.
        metrics: Metric name to score mapping stored on the run.
        day: Day of January 2025 used for the noon-UTC timestamp.

    Returns:
        A run with a fixed version string and a sample count of 10.
    """
    noon_utc = datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC)
    fixed_fields = {
        "benchmark_name": "test",
        "veritext_version": "0.1.0",
        "sample_count": 10,
    }
    return BenchmarkRun(id=run_id, timestamp=noon_utc, metrics=metrics, **fixed_fields)
 class TestComputeBaseline:
    """Behaviour of ``compute_baseline`` across history sizes and metric sets."""
    def test_empty_runs(self) -> None:
        """An empty history yields an empty baseline mapping."""
        assert compute_baseline([]) == {}
    def test_single_run(self) -> None:
        """With one run, the baseline mirrors that run's metrics."""
        only_run = make_run("r1", {"bleu4": 0.75, "rouge_l": 0.80})
        result = compute_baseline([only_run])
        assert result["bleu4"] == 0.75
        assert result["rouge_l"] == 0.80
    def test_multiple_runs_average(self) -> None:
        """Three runs inside a window of three are averaged together."""
        specs = (("r1", 0.70, 3), ("r2", 0.80, 2), ("r3", 0.90, 1))
        history = [
            make_run(run_id, {"bleu4": score}, day=day) for run_id, score, day in specs
        ]
        result = compute_baseline(history, window=3)
        # (0.70 + 0.80 + 0.90) / 3
        assert result["bleu4"] == pytest.approx(0.80)
    def test_window_limits_runs(self) -> None:
        """Runs beyond the window size do not contribute to the baseline."""
        specs = (
            ("r1", 0.70, 5),  # most recent
            ("r2", 0.80, 4),
            ("r3", 0.90, 3),
            ("r4", 0.60, 2),  # excluded
            ("r5", 0.50, 1),  # excluded
        )
        history = [
            make_run(run_id, {"bleu4": score}, day=day) for run_id, score, day in specs
        ]
        result = compute_baseline(history, window=3)
        # Only the first three entries count: (0.70 + 0.80 + 0.90) / 3 = 0.80
        assert result["bleu4"] == pytest.approx(0.80)
    def test_partial_history(self) -> None:
        """A history shorter than the window is averaged as-is."""
        history = [
            make_run(run_id, {"bleu4": score})
            for run_id, score in (("r1", 0.70), ("r2", 0.80))
        ]
        result = compute_baseline(history, window=10)
        # Two runs available: (0.70 + 0.80) / 2 = 0.75
        assert result["bleu4"] == pytest.approx(0.75)
    def test_multiple_metrics(self) -> None:
        """Every metric present in the runs gets its own average."""
        first = make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75})
        second = make_run("r2", {"bleu4": 0.80, "rouge_l": 0.85})
        result = compute_baseline([first, second])
        expected = {"bleu4": 0.75, "rouge_l": 0.80}
        for metric, value in expected.items():
            assert result[metric] == pytest.approx(value)
    def test_varying_metrics(self) -> None:
        """Runs may report different metric sets."""
        with_both = make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75})
        bleu_only = make_run("r2", {"bleu4": 0.80})  # No rouge_l
        result = compute_baseline([with_both, bleu_only])
        # bleu4 is reported by both runs, so it is averaged over the two.
        assert result["bleu4"] == pytest.approx(0.75)
        # rouge_l is reported by one run only, so that single value is the baseline.
        assert result["rouge_l"] == pytest.approx(0.75)
 class TestDetectRegression:
    """Tests for ``detect_regression``.

    Deltas asserted here are ``current - baseline`` per metric; a report is
    expected to be flagged when some delta falls below ``-tolerance``.
    """
    def test_no_baseline(self) -> None:
        """No regression when baseline is empty."""
        report = detect_regression(
            current={"bleu4": 0.70},
            baseline={},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas == {}
    def test_no_regression_stable(self) -> None:
        """No regression when metrics are stable."""
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.0)
    def test_no_regression_improved(self) -> None:
        """No regression when metrics improved."""
        report = detect_regression(
            current={"bleu4": 0.85},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.10)
    def test_no_regression_within_tolerance(self) -> None:
        """No regression when drop is within tolerance."""
        report = detect_regression(
            current={"bleu4": 0.73},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.02)
    def test_regression_detected(self) -> None:
        """Regression detected when metric drops beyond tolerance."""
        report = detect_regression(
            current={"bleu4": 0.65},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
    def test_regression_at_tolerance_boundary(self) -> None:
        """A drop exactly equal to the tolerance is not a regression."""
        # 0.50, 0.75 and 0.25 are all exactly representable in binary floating
        # point, so the delta is exactly -tolerance with no rounding noise.
        # The implementation checks delta < -tolerance (strictly less than),
        # so a drop sitting precisely on the boundary must not be flagged.
        report = detect_regression(
            current={"bleu4": 0.50},
            baseline={"bleu4": 0.75},
            tolerance=0.25,
        )
        assert not report.detected
        assert report.deltas["bleu4"] == -0.25
    def test_regression_just_beyond_tolerance(self) -> None:
        """Just beyond tolerance is a regression."""
        report = detect_regression(
            current={"bleu4": 0.6999},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        # Delta is -0.0501, which is < -tolerance
        assert report.detected
    def test_multiple_metrics_any_regresses(self) -> None:
        """Regression detected if any metric exceeds tolerance."""
        report = detect_regression(
            current={"bleu4": 0.65, "rouge_l": 0.80},
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            tolerance=0.05,
        )
        assert report.detected
        # Only bleu4 regressed
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
        assert report.deltas["rouge_l"] == pytest.approx(0.0)
    def test_report_contains_all_values(self) -> None:
        """Report includes baseline, current, and deltas."""
        baseline = {"bleu4": 0.75, "rouge_l": 0.80}
        current = {"bleu4": 0.65, "rouge_l": 0.82}
        report = detect_regression(current, baseline, tolerance=0.05)
        assert report.baseline == baseline
        assert report.current == current
        assert report.tolerance == 0.05
        assert "bleu4" in report.deltas
        assert "rouge_l" in report.deltas
    def test_missing_metric_in_current(self) -> None:
        """Missing metric in current treated as zero."""
        report = detect_regression(
            current={},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )
        # 0.0 - 0.75 = -0.75, which is a regression
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.75)
@@ -0,0 +1,247 @@
 """Tests for benchmark runner."""
 from pathlib import Path
 import pytest
 from veritext.benchmark.models import BenchmarkRun
 from veritext.benchmark.runner import Benchmark
 from veritext.core.exceptions import RegressionDetectedError
@pytest.fixture
 def benchmark(tmp_path: Path) -> Benchmark:
    """Create a Benchmark instance with temporary storage."""
    return Benchmark("test-suite", storage_path=tmp_path / "benchmarks")
@pytest.fixture
 def sample_data() -> tuple[list[str], list[str]]:
    """Sample candidates and references for testing."""
    candidates = [
        "The quick brown fox jumps over the lazy dog.",
        "A fast auburn fox leaps above the sleepy hound.",
    ]
    references = [
        "The quick brown fox jumps over the lazy dog.",
        "The swift brown fox jumps over the lazy dog.",
    ]
    return candidates, references
 class TestBenchmarkInit:
    """Construction-time behaviour of ``Benchmark``."""
    def test_creates_storage_directory(self, tmp_path: Path) -> None:
        """Constructing a ``Benchmark`` leaves the storage path present on disk."""
        target = tmp_path / "benchmarks"
        Benchmark("my-suite", storage_path=target)
        assert target.exists()
    def test_name_property(self, benchmark: Benchmark) -> None:
        """``name`` reports the suite name the fixture was built with."""
        expected_name = "test-suite"
        assert benchmark.name == expected_name
 class TestEvaluate:
    """Exercises ``Benchmark.evaluate``: the returned run, its metrics, and persistence."""
    def test_evaluate_stores_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """The returned run is a ``BenchmarkRun`` carrying suite name and sample count."""
        result = benchmark.evaluate(*sample_data)
        assert isinstance(result, BenchmarkRun)
        assert (result.benchmark_name, result.sample_count) == ("test-suite", 2)
    def test_evaluate_returns_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without a ``metrics`` argument, rouge_l and bleu4 are reported in [0, 1]."""
        scores = benchmark.evaluate(*sample_data).metrics
        # Default metrics are rouge_l and bleu4
        default_names = ("rouge_l", "bleu4")
        for name in default_names:
            assert name in scores
        for name in default_names:
            assert 0.0 <= scores[name] <= 1.0
    def test_evaluate_custom_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Only the explicitly requested metrics are computed."""
        requested = ("bleu1", "bleu2", "rouge1")
        scores = benchmark.evaluate(*sample_data, metrics=list(requested)).metrics
        for name in requested:
            assert name in scores
        assert "bleu4" not in scores  # Not requested
    def test_evaluate_with_metadata(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Metadata passed to ``evaluate`` is carried on the returned run."""
        cands, refs = sample_data
        result = benchmark.evaluate(
            cands, refs, metadata={"git_sha": "abc123", "model": "gpt-4"}
        )
        assert result.metadata == {"git_sha": "abc123", "model": "gpt-4"}
    def test_evaluate_stores_retrievable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A run produced by ``evaluate`` shows up in ``get_history``."""
        produced = benchmark.evaluate(*sample_data)
        stored = benchmark.get_history()
        assert len(stored) == 1
        assert stored[0].id == produced.id
 class TestCheckRegression:
    """Tests for ``Benchmark.check_regression``."""
    def test_check_no_runs(self, benchmark: Benchmark) -> None:
        """No regression when no runs exist."""
        report = benchmark.check_regression()
        assert not report.detected
        assert report.baseline == {}
        assert report.current == {}
    def test_check_single_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """No regression with single run (no baseline)."""
        candidates, references = sample_data
        benchmark.evaluate(candidates, references)
        report = benchmark.check_regression()
        # First run has no baseline to compare against
        assert not report.detected
    def test_check_stable_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """No regression when metrics are stable."""
        candidates, references = sample_data
        # Run multiple times with same data
        for _ in range(3):
            benchmark.evaluate(candidates, references)
        report = benchmark.check_regression()
        assert not report.detected
    def test_check_reports_regression(self, tmp_path: Path) -> None:
        """Reports regression when metrics drop significantly."""
        tolerance = 0.05
        benchmark = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        # First run with good metrics
        good_candidates = ["The quick brown fox jumps."]
        good_references = ["The quick brown fox jumps."]
        benchmark.evaluate(good_candidates, good_references)
        # Second run with worse metrics (different text)
        bad_candidates = ["Something completely different here."]
        benchmark.evaluate(bad_candidates, good_references)
        report = benchmark.check_regression(tolerance=tolerance)
        # Both conditions are required. Joining them with `or` would let the
        # test pass when the deltas show a drop but the `detected` flag is
        # wrongly left False, which is exactly the bug this test must catch.
        assert report.detected
        assert any(delta < -tolerance for delta in report.deltas.values())
 class TestAssertNoRegression:
    """Covers ``Benchmark.assert_no_regression`` for stable and degraded histories."""
    def test_passes_when_stable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Three identical evaluations must not trigger an exception."""
        repeat = 3
        while repeat:
            benchmark.evaluate(*sample_data)
            repeat -= 1
        # Returning normally is the assertion: any raise fails the test.
        benchmark.assert_no_regression()
    def test_raises_on_regression(self, tmp_path: Path) -> None:
        """A perfect run followed by an unrelated one raises ``RegressionDetectedError``."""
        suite = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")
        reference_text = ["The quick brown fox."]
        # Baseline: candidate identical to the reference.
        suite.evaluate(reference_text, reference_text)
        # Follow-up: candidate shares nothing with the reference.
        suite.evaluate(["Completely unrelated text."], reference_text)
        with pytest.raises(RegressionDetectedError):
            suite.assert_no_regression(tolerance=0.05)
 class TestGetHistory:
    """Covers ``Benchmark.get_history``: emptiness, ordering, and limits."""
    def test_empty_history(self, benchmark: Benchmark) -> None:
        """A fresh benchmark has no history."""
        assert benchmark.get_history() == []
    def test_returns_runs(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Both stored runs come back, newest first."""
        earlier = benchmark.evaluate(*sample_data)
        later = benchmark.evaluate(*sample_data)
        stored = benchmark.get_history()
        assert len(stored) == 2
        assert stored[0].id == later.id  # Most recent first
        assert stored[1].id == earlier.id
    def test_respects_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """An explicit ``limit`` caps the number of runs returned."""
        cands, refs = sample_data
        for _attempt in range(5):
            benchmark.evaluate(cands, refs)
        capped = benchmark.get_history(limit=3)
        assert len(capped) == 3
    def test_default_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """With 25 stored runs and no ``limit``, 20 are returned."""
        cands, refs = sample_data
        for _attempt in range(25):
            benchmark.evaluate(cands, refs)
        assert len(benchmark.get_history()) == 20
@@ -0,0 +1,297 @@
 """Tests for benchmark SQLite storage."""
 import sqlite3
 import threading
 from datetime import UTC, datetime
 from pathlib import Path
 import pytest
 from veritext.benchmark.models import BenchmarkRun
 from veritext.benchmark.storage import BenchmarkStorage
 from veritext.core.exceptions import StorageError
@pytest.fixture
 def db_path(tmp_path: Path) -> Path:
    """Return a temporary database path."""
    return tmp_path / "benchmarks" / "test.db"
@pytest.fixture
 def storage(db_path: Path) -> BenchmarkStorage:
    """Create a BenchmarkStorage instance."""
    return BenchmarkStorage(db_path)
@pytest.fixture
 def sample_run() -> BenchmarkRun:
    """Create a sample benchmark run."""
    return BenchmarkRun(
        id="run-001",
        benchmark_name="test-suite",
        timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC),
        veritext_version="0.1.0-dev",
        metrics={"bleu4": 0.75, "rouge_l": 0.82},
        sample_count=100,
        metadata={"git_sha": "abc123"},
    )
 class TestDatabaseCreation:
    """Tests for database initialisation."""
    @staticmethod
    def _schema_object_names(db_path: Path, object_type: str) -> set[str]:
        """Return the names of ``sqlite_master`` entries of the given type.

        Args:
            db_path: Path of the SQLite database file to inspect.
            object_type: Value matched against ``sqlite_master.type``
                (for example ``"table"`` or ``"index"``).

        Returns:
            The set of matching object names.
        """
        conn = sqlite3.connect(str(db_path))
        try:
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type=?", (object_type,)
            )
            return {row[0] for row in cursor.fetchall()}
        finally:
            # Close even when the query raises so the handle is never leaked.
            conn.close()
    def test_creates_database_file(self, db_path: Path) -> None:
        """Storage creates the database file on init."""
        assert not db_path.exists()
        BenchmarkStorage(db_path)
        assert db_path.exists()
    def test_creates_parent_directories(self, tmp_path: Path) -> None:
        """Storage creates parent directories if needed."""
        nested_path = tmp_path / "deep" / "nested" / "path" / "test.db"
        BenchmarkStorage(nested_path)
        assert nested_path.exists()
    def test_creates_tables(self, db_path: Path) -> None:
        """Storage creates required tables."""
        BenchmarkStorage(db_path)
        tables = self._schema_object_names(db_path, "table")
        assert "benchmark_runs" in tables
        assert "benchmark_metrics" in tables
    def test_creates_index(self, db_path: Path) -> None:
        """Storage creates the ``idx_benchmark_name`` index."""
        BenchmarkStorage(db_path)
        indices = self._schema_object_names(db_path, "index")
        assert "idx_benchmark_name" in indices
 class TestSaveRun:
    """Covers ``BenchmarkStorage.save_run``: round-trips, duplicates, and defaults."""
    def test_save_run(
        self, storage: BenchmarkStorage, sample_run: BenchmarkRun
    ) -> None:
        """A saved run is returned by ``get_runs`` for its benchmark name."""
        storage.save_run(sample_run)
        stored = storage.get_runs("test-suite")
        assert len(stored) == 1
        assert stored[0].id == "run-001"
    def test_save_preserves_all_fields(
        self, storage: BenchmarkStorage, sample_run: BenchmarkRun
    ) -> None:
        """Every field of the run survives a save/load round-trip unchanged."""
        storage.save_run(sample_run)
        loaded = storage.get_runs("test-suite")[0]
        round_tripped_fields = (
            "id",
            "benchmark_name",
            "timestamp",
            "veritext_version",
            "metrics",
            "sample_count",
            "metadata",
        )
        for field_name in round_tripped_fields:
            assert getattr(loaded, field_name) == getattr(sample_run, field_name)
    def test_save_duplicate_id_raises(
        self, storage: BenchmarkStorage, sample_run: BenchmarkRun
    ) -> None:
        """Saving the same run twice raises ``StorageError`` mentioning "already exists"."""
        storage.save_run(sample_run)
        with pytest.raises(StorageError, match="already exists"):
            # Second save re-uses the same run ID.
            storage.save_run(sample_run)
    def test_save_run_empty_metadata(self, storage: BenchmarkStorage) -> None:
        """A run built without metadata loads back with an empty metadata dict."""
        bare_run = BenchmarkRun(
            id="run-no-meta",
            benchmark_name="test-suite",
            timestamp=datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC),
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )
        storage.save_run(bare_run)
        latest = storage.get_latest_run("test-suite")
        assert latest is not None
        assert latest.metadata == {}
 class TestGetRuns:
    """Covers ``BenchmarkStorage.get_runs``: filtering, ordering, and limiting."""
    def test_get_runs_empty_database(self, storage: BenchmarkStorage) -> None:
        """An unknown benchmark name yields an empty list."""
        assert storage.get_runs("nonexistent") == []
    def test_get_runs_filters_by_name(self, storage: BenchmarkStorage) -> None:
        """Each suite name returns only its own run."""
        shared_stamp = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        fixtures = (("run-1", "suite-a", 0.5), ("run-2", "suite-b", 0.6))
        for run_id, suite, score in fixtures:
            storage.save_run(
                BenchmarkRun(
                    id=run_id,
                    benchmark_name=suite,
                    timestamp=shared_stamp,
                    veritext_version="0.1.0",
                    metrics={"bleu4": score},
                    sample_count=10,
                )
            )
        for run_id, suite, _score in fixtures:
            found = storage.get_runs(suite)
            assert len(found) == 1
            assert found[0].id == run_id
    def test_get_runs_ordered_by_timestamp(self, storage: BenchmarkStorage) -> None:
        """Results come back newest first regardless of insertion order."""
        # Insert the newer run first so that insertion order cannot explain
        # a newest-first result.
        insertion_order = (("run-new", 20, 0.6), ("run-old", 10, 0.5))
        for run_id, day, score in insertion_order:
            storage.save_run(
                BenchmarkRun(
                    id=run_id,
                    benchmark_name="test",
                    timestamp=datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": score},
                    sample_count=10,
                )
            )
        fetched = storage.get_runs("test")
        assert fetched[0].id == "run-new"
        assert fetched[1].id == "run-old"
    def test_get_runs_with_limit(self, storage: BenchmarkStorage) -> None:
        """With five runs stored, ``limit=3`` returns three."""
        for index in range(5):
            storage.save_run(
                BenchmarkRun(
                    id=f"run-{index}",
                    benchmark_name="test",
                    timestamp=datetime(2025, 1, index + 1, 12, 0, 0, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": 0.5 + index * 0.1},
                    sample_count=10,
                )
            )
        limited = storage.get_runs("test", limit=3)
        assert len(limited) == 3
 class TestGetLatestRun:
    """Covers ``BenchmarkStorage.get_latest_run``."""
    def test_get_latest_run_empty(self, storage: BenchmarkStorage) -> None:
        """An unknown benchmark name yields ``None``."""
        assert storage.get_latest_run("nonexistent") is None
    def test_get_latest_run(self, storage: BenchmarkStorage) -> None:
        """Of two stored runs, the one with the later timestamp is returned."""
        # Saved oldest first: (run id, day of January 2025, bleu4 score).
        for run_id, day, score in (("run-old", 10, 0.5), ("run-new", 20, 0.6)):
            storage.save_run(
                BenchmarkRun(
                    id=run_id,
                    benchmark_name="test",
                    timestamp=datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": score},
                    sample_count=10,
                )
            )
        newest = storage.get_latest_run("test")
        assert newest is not None
        assert newest.id == "run-new"
 class TestConcurrentAccess:
    """Tests for concurrent database access."""
    def test_concurrent_writes(self, db_path: Path) -> None:
        """Ten threads, each with its own storage instance, all save successfully."""
        errors: list[Exception] = []
        def write_run(run_id: int) -> None:
            """Save one run from a worker thread, recording any failure."""
            try:
                storage = BenchmarkStorage(db_path)
                run = BenchmarkRun(
                    id=f"run-{run_id}",
                    benchmark_name="test",
                    # run_id doubles as the seconds field so timestamps differ.
                    timestamp=datetime(2025, 1, 15, 12, 0, run_id, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": 0.5},
                    sample_count=10,
                )
                storage.save_run(run)
            except Exception as exc:
                # Deliberately broad: an exception raised inside a worker
                # thread would not fail the test on its own, so every failure
                # is collected and asserted on from the main thread.
                errors.append(exc)
        threads = [threading.Thread(target=write_run, args=(i,)) for i in range(10)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        assert not errors, f"Concurrent writes failed: {errors}"
        storage = BenchmarkStorage(db_path)
        runs = storage.get_runs("test")
        assert len(runs) == 10
    def test_wal_mode_enabled(self, db_path: Path) -> None:
        """Database uses WAL journal mode."""
        BenchmarkStorage(db_path)
        conn = sqlite3.connect(str(db_path))
        try:
            mode = conn.execute("PRAGMA journal_mode").fetchone()[0]
        finally:
            # Close even when the query raises so the handle is never leaked.
            conn.close()
        assert mode.lower() == "wal"
@@ -0,0 +1 @@
 """Tests for the Veritext pytest plugin."""
@@ -0,0 +1,32 @@
 """Pytest configuration for pytest_plugin tests."""
 from collections.abc import Callable
 from typing import Any
 import pytest
 from veritext.core.types import ValidationContext
 from veritext.pytest_plugin.fixtures import ValidatorFactory
 # Enable the pytester fixture for plugin testing
 pytest_plugins = ["pytester"]
 # Re-export fixtures from the plugin module for testing
@pytest.fixture
 def text_validator() -> ValidatorFactory:
    """Provide a factory for building validators."""
    return ValidatorFactory()
@pytest.fixture
 def validation_context() -> type:
    """Provide a factory for creating ValidationContext objects."""
    from typing import Any
    from veritext.core.types import ValidationContext
    def _create(
        reference: str | list[str] | None = None,
        **metadata: Any,
    ) -> ValidationContext:
        return ValidationContext(reference=reference, metadata=metadata)
    return _create
@@ -0,0 +1,211 @@
 """Tests for the validate_text assertion function."""
 import pytest
 from veritext.pytest_plugin import validate_text
 class TestValidateTextBasicValidation:
    """Length-bound checks performed by ``validate_text``."""
    def test_passes_with_valid_length(self) -> None:
        """Text inside both length bounds validates without raising."""
        sentence = "The quick brown fox jumps over the lazy dog."
        validate_text(sentence, min_length=10, max_length=100)
    def test_fails_when_too_short(self) -> None:
        """Text under ``min_length`` raises an ``AssertionError`` mentioning length."""
        with pytest.raises(AssertionError) as failure:
            validate_text("Short.", min_length=50)
        message = str(failure.value).lower()
        assert "length" in message
    def test_fails_when_too_long(self) -> None:
        """Text over ``max_length`` raises an ``AssertionError`` mentioning length."""
        oversized = "A" * 100
        with pytest.raises(AssertionError) as failure:
            validate_text(oversized, max_length=50)
        message = str(failure.value).lower()
        assert "length" in message
 class TestValidateTextReadability:
    """Reading-grade checks performed by ``validate_text``."""
    def test_passes_with_simple_text(self) -> None:
        """Short, plain sentences pass a reading-grade ceiling of 10."""
        plain = "The cat sat on the mat. It was a nice day."
        validate_text(plain, max_reading_grade=10.0)
    def test_fails_with_complex_text(self) -> None:
        """Dense academic prose fails a reading-grade ceiling of 3."""
        dense = " ".join(
            [
                "The implementation of sophisticated metacognitive strategies",
                "necessitates the comprehensive understanding of epistemological",
                "frameworks and their corresponding methodological implications.",
            ]
        )
        with pytest.raises(AssertionError) as failure:
            validate_text(dense, max_reading_grade=3.0)
        message = str(failure.value).lower()
        assert "readability" in message
 class TestValidateTextPatterns:
    """Required and forbidden pattern checks performed by ``validate_text``."""
    def test_passes_when_contains_pattern(self) -> None:
        """A required pattern that appears in the text validates cleanly."""
        required = ["support@example.com"]
        validate_text(
            "Please contact support@example.com for assistance.",
            must_contain=required,
        )
    def test_fails_when_missing_required_pattern(self) -> None:
        """A required pattern absent from the text fails with a "contains" message."""
        with pytest.raises(AssertionError) as failure:
            validate_text(
                "Please contact us for assistance.", must_contain=["@example.com"]
            )
        message = str(failure.value).lower()
        assert "contains" in message
    def test_passes_when_excludes_pattern(self) -> None:
        """Text free of every forbidden pattern validates cleanly."""
        forbidden = ["TODO", "FIXME"]
        validate_text("The report is complete and reviewed.", must_exclude=forbidden)
    def test_fails_when_contains_forbidden_pattern(self) -> None:
        """A forbidden pattern present in the text fails with an "excludes" message."""
        draft = "The report is almost done. TODO: add conclusion."
        with pytest.raises(AssertionError) as failure:
            validate_text(draft, must_exclude=["TODO"])
        message = str(failure.value).lower()
        assert "excludes" in message
 class TestValidateTextComparisonMetrics:
    """Reference-based checks (BLEU and ROUGE thresholds) in ``validate_text``."""
    def test_passes_with_high_bleu_score(self) -> None:
        """Text identical to the reference clears a BLEU floor of 0.9."""
        sentence = "The quick brown fox jumps over the lazy dog."
        validate_text(sentence, reference=sentence, min_bleu=0.9)
    def test_fails_with_low_bleu_score(self) -> None:
        """Text unlike the reference fails a BLEU floor of 0.5 with a "bleu" message."""
        expected_text = "The quick brown fox jumps over the lazy dog."
        actual_text = "A slow red cat sleeps under the active mouse."
        with pytest.raises(AssertionError) as failure:
            validate_text(actual_text, reference=expected_text, min_bleu=0.5)
        message = str(failure.value).lower()
        assert "bleu" in message
    def test_passes_with_high_rouge_score(self) -> None:
        """A one-word paraphrase of the reference clears a ROUGE floor of 0.5."""
        expected_text = "Machine learning models require extensive training data."
        actual_text = "Machine learning models need extensive training data."
        validate_text(actual_text, reference=expected_text, min_rouge=0.5)
    def test_fails_with_low_rouge_score(self) -> None:
        """Unrelated text fails a ROUGE floor of 0.5 with a "rouge" message."""
        expected_text = "The algorithm processes input data efficiently."
        actual_text = "Cats enjoy sleeping in sunny spots."
        with pytest.raises(AssertionError) as failure:
            validate_text(actual_text, reference=expected_text, min_rouge=0.5)
        message = str(failure.value).lower()
        assert "rouge" in message
 class TestValidateTextErrorHandling:
    """Argument errors: ``validate_text`` raises ``ValueError`` on unusable input."""
    def test_raises_value_error_when_no_criteria(self) -> None:
        """Calling with text alone and no criteria is rejected."""
        expected_message = "At least one validation criterion"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text")
    def test_raises_value_error_when_bleu_without_reference(self) -> None:
        """A BLEU threshold without a reference is rejected."""
        expected_message = "Reference text required"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text", min_bleu=0.5)
    def test_raises_value_error_when_rouge_without_reference(self) -> None:
        """A ROUGE threshold without a reference is rejected."""
        expected_message = "Reference text required"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text", min_rouge=0.5)
    def test_raises_value_error_when_semantic_without_reference(self) -> None:
        """A semantic threshold without a reference is rejected."""
        expected_message = "Reference text required"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text", min_semantic=0.5)
 class TestValidateTextMultipleCriteria:
    """Combining several criteria in a single ``validate_text`` call."""
    def test_passes_all_criteria(self) -> None:
        """Validation succeeds when the BLEU and both length criteria all hold."""
        sentence = "The quick brown fox jumps over the lazy dog."
        criteria = {"min_bleu": 0.9, "min_length": 10, "max_length": 100}
        validate_text(sentence, reference=sentence, **criteria)
    def test_fails_when_one_criterion_fails(self) -> None:
        """One unmet criterion fails validation even when another is satisfied."""
        sentence = "The quick brown fox jumps over the lazy dog."
        # BLEU is satisfied by the exact match; the 10-character cap is not.
        criteria = {"min_bleu": 0.9, "max_length": 10}
        with pytest.raises(AssertionError):
            validate_text(sentence, reference=sentence, **criteria)
 class TestValidateTextFailureMessage:
    """Content of the ``AssertionError`` message produced on failure."""
    def test_failure_message_includes_text_preview(self) -> None:
        """A short failing text appears verbatim in the message."""
        snippet = "Short text"
        with pytest.raises(AssertionError) as failure:
            validate_text(snippet, min_length=100)
        message = str(failure.value)
        assert snippet in message
    def test_failure_message_truncates_long_text(self) -> None:
        """A 200-character text is abbreviated with an ellipsis, not echoed whole."""
        long_text = "A" * 200
        with pytest.raises(AssertionError) as failure:
            validate_text(long_text, max_length=50)
        message = str(failure.value)
        assert long_text not in message
        assert "..." in message
    def test_failure_message_includes_check_details(self) -> None:
        """The message has a "Failed checks:" section naming the length check."""
        with pytest.raises(AssertionError) as failure:
            validate_text("Short", min_length=100)
        message = str(failure.value)
        assert "Failed checks:" in message
        assert "length" in message.lower()
 class TestValidateTextListReference:
    """Passing a list of reference texts to ``validate_text``."""
    def test_bleu_with_multiple_references(self) -> None:
        """BLEU validation passes when the text equals one of two references."""
        candidate = "The quick brown fox jumps over the lazy dog."
        alternatives = [
            "The quick brown fox jumps over the lazy dog.",
            "A fast brown fox leaps over a sleepy dog.",
        ]
        validate_text(candidate, reference=alternatives, min_bleu=0.9)
    def test_rouge_with_multiple_references(self) -> None:
        """ROUGE validation accepts a list of references with a 0.3 floor."""
        candidate = "Machine learning models require training data."
        alternatives = [
            "Machine learning requires data.",
            "ML models need training data.",
        ]
        validate_text(candidate, reference=alternatives, min_rouge=0.3)
@@ -0,0 +1,88 @@
 """Tests for the pytest plugin fixtures."""
 from veritext.core.types import ValidationContext
 from veritext.pytest_plugin.fixtures import ValidatorFactory
 from veritext.validators import bleu, length
 class TestValidatorFactory:
    """Exercise ``ValidatorFactory`` by instantiating it directly."""
    def test_creates_validator_from_checks(self) -> None:
        """Calling the factory with checks yields a callable that validates text."""
        validator = ValidatorFactory()(checks=[length(min_chars=5)])
        outcome = validator("Hello, World!")
        assert outcome.passed
    def test_validator_uses_provided_reference(self) -> None:
        """A reference given to the factory is used by the BLEU check."""
        validator = ValidatorFactory()(
            checks=[bleu(min_score=0.5)],
            reference="The quick brown fox.",
        )
        # The candidate is identical to the reference, so the check should pass.
        outcome = validator("The quick brown fox.")
        assert outcome.passed
    def test_validator_returns_validation_result(self) -> None:
        """The result exposes ``passed`` and one entry per configured check.

        "Short" is validated against ``min_chars=100``, so the result must be
        failing and carry exactly one check named ``"length"``.
        """
        validator = ValidatorFactory()(checks=[length(min_chars=100)])
        outcome = validator("Short")
        assert not outcome.passed
        assert len(outcome.checks) == 1
        first_check = outcome.checks[0]
        assert first_check.name == "length"
 class TestTextValidatorFixture:
    """Exercise the ``text_validator`` fixture."""
    def test_fixture_returns_factory(self, text_validator: ValidatorFactory) -> None:
        """The fixture value must be a ``ValidatorFactory`` instance."""
        assert isinstance(text_validator, ValidatorFactory)
    def test_fixture_can_create_validators(self, text_validator: ValidatorFactory) -> None:
        """A validator built from the fixture accepts and rejects text by length.

        With a 5-50 character window, "Hello, World!" must pass and "Hi"
        must fail.
        """
        check_length = text_validator(checks=[length(min_chars=5, max_chars=50)])
        in_range = check_length("Hello, World!")
        assert in_range.passed
        too_short = check_length("Hi")
        assert not too_short.passed
 class TestValidationContextFixture:
    """Exercise the ``validation_context`` fixture.

    NOTE(review): the fixture is annotated as ``type`` but is only ever used
    as a callable returning ``ValidationContext``; confirm against the
    fixture definition whether a callable annotation would be more accurate.
    """
    def test_fixture_creates_context(self, validation_context: type) -> None:
        """Calling the fixture with a reference yields a ``ValidationContext``."""
        context = validation_context(reference="Test reference")
        assert isinstance(context, ValidationContext)
        assert context.reference == "Test reference"
    def test_fixture_accepts_metadata(self, validation_context: type) -> None:
        """Extra keyword arguments are readable from ``ctx.metadata``."""
        context = validation_context(reference="Test", source="unit_test", version=1)
        assert context.metadata["source"] == "unit_test"
        assert context.metadata["version"] == 1
    def test_fixture_allows_no_reference(self, validation_context: type) -> None:
        """Calling the fixture with no arguments leaves ``reference`` as ``None``."""
        context = validation_context()
        assert context.reference is None
@@ -0,0 +1,100 @@
 """Tests for the pytest plugin hooks."""
 import pytest
@pytest.fixture
def plugin_pytester(pytester: pytest.Pytester) -> pytest.Pytester:
    """Return ``pytester`` after writing a conftest that loads the plugin.

    The generated conftest sets ``pytest_plugins`` to
    ``['veritext.pytest_plugin']``.
    """
    conftest_source = """
        pytest_plugins = ['veritext.pytest_plugin']
        """
    pytester.makeconftest(conftest_source)
    return pytester
 def test_plugin_registers_marker(plugin_pytester: pytest.Pytester) -> None:
    """Check that ``text_validation`` is a registered marker.

    A one-test module using the marker is run with ``--strict-markers`` and
    must report exactly one passing test.
    """
    module_source = """
        import pytest
        @pytest.mark.text_validation
        def test_example():
            pass
        """
    plugin_pytester.makepyfile(module_source)
    # --strict-markers turns an unregistered marker into an error, so a clean
    # pass here implies the plugin registered it.
    run = plugin_pytester.runpytest("--strict-markers")
    run.assert_outcomes(passed=1)
 def test_marker_can_be_used(plugin_pytester: pytest.Pytester) -> None:
    """Check that ``-m text_validation`` selects only the marked test.

    The generated module holds one marked and one unmarked test; the run
    must report exactly one passing test.
    """
    module_source = """
        import pytest
        @pytest.mark.text_validation
        def test_marked():
            pass
        def test_unmarked():
            pass
        """
    plugin_pytester.makepyfile(module_source)
    # Filter by marker expression so the unmarked test is not run.
    run = plugin_pytester.runpytest("-m", "text_validation")
    run.assert_outcomes(passed=1)
 def test_validate_text_is_importable(plugin_pytester: pytest.Pytester) -> None:
    """Check that ``validate_text`` imports from ``veritext.pytest_plugin``.

    The generated module imports the name and asserts it is callable; the
    run must report exactly one passing test.
    """
    module_source = """
        from veritext.pytest_plugin import validate_text
        def test_import():
            assert callable(validate_text)
        """
    plugin_pytester.makepyfile(module_source)
    run = plugin_pytester.runpytest()
    run.assert_outcomes(passed=1)
 def test_validate_text_works_in_tests(plugin_pytester: pytest.Pytester) -> None:
    """Check that a passing ``validate_text`` call yields a passing test.

    The generated test validates a sentence with ``min_length=10`` and
    ``max_length=100``; the run must report exactly one passing test.
    """
    module_source = """
        from veritext.pytest_plugin import validate_text
        def test_validation_passes():
            validate_text(
                "The quick brown fox jumps over the lazy dog.",
                min_length=10,
                max_length=100,
            )
        """
    plugin_pytester.makepyfile(module_source)
    run = plugin_pytester.runpytest()
    run.assert_outcomes(passed=1)
 def test_validate_text_failure_in_tests(plugin_pytester: pytest.Pytester) -> None:
    """Check that a failing ``validate_text`` call is reported as a failure.

    The generated test validates "Short" with ``min_length=100``; the run
    must report exactly one failed test, and its stdout must contain a line
    matching ``*Text validation failed*``.
    """
    module_source = """
        from veritext.pytest_plugin import validate_text
        def test_validation_fails():
            validate_text(
                "Short",
                min_length=100,
            )
        """
    plugin_pytester.makepyfile(module_source)
    run = plugin_pytester.runpytest()
    run.assert_outcomes(failed=1)
    # The report itself must carry the validation failure banner.
    expected_lines = ["*Text validation failed*"]
    run.stdout.fnmatch_lines(expected_lines)
Author	SHA1	Message	Date
kschappell	07ac70e835	docs(changelog): add benchmark entries Document benchmark module features in changelog.	2026-02-03 18:10:19 +00:00
kschappell	6d1bece815	test(benchmark): add benchmark module tests Comprehensive tests for models, storage, regression detection, and runner.	2026-02-03 18:10:13 +00:00
kschappell	40fa39485e	feat(benchmark): add module exports Public API exports for the benchmark module.	2026-02-03 18:10:07 +00:00
kschappell	9115f0c25b	feat(benchmark): add Benchmark runner class Main Benchmark class for evaluating text quality and tracking regressions.	2026-02-03 18:10:01 +00:00
kschappell	83c4b4bee5	feat(benchmark): add regression detection Rolling window baseline computation and statistical regression detection.	2026-02-03 18:09:55 +00:00
kschappell	44e3e8f4ea	feat(benchmark): add SQLite storage backend Persistent storage for benchmark history with WAL mode for concurrent access.	2026-02-03 18:09:49 +00:00
kschappell	45dfe07772	feat(benchmark): add BenchmarkRun and RegressionReport models Data models for benchmark runs and regression reports using Pydantic.	2026-02-03 18:09:43 +00:00
kschappell	6bafc43754	docs(changelog): add pytest plugin entries	2026-02-03 17:40:52 +00:00
kschappell	012b306749	test(pytest-plugin): add plugin tests Cover validate_text assertions, fixture factories, marker registration, and pytest integration using pytester for subprocess testing.	2026-02-03 17:40:46 +00:00
kschappell	ac7c5c69cf	feat(pytest-plugin): add validate_text assertion Primary API for text validation in pytest with keyword arguments for BLEU, ROUGE, semantic similarity, length, readability, and pattern matching. Includes detailed failure formatting.	2026-02-03 17:40:40 +00:00
kschappell	cd36c54e22	feat(pytest-plugin): add plugin hooks and markers Register text_validation marker via pytest_configure hook.	2026-02-03 17:40:33 +00:00
		`@@ -0,0 +1 @@`
							`"""Tests for the Veritext pytest plugin."""`