Compare commits
11 Commits
feat/valid
...
feat/bench
| Author | SHA1 | Date | |
|---|---|---|---|
|
07ac70e835
|
|||
|
6d1bece815
|
|||
|
40fa39485e
|
|||
|
9115f0c25b
|
|||
|
83c4b4bee5
|
|||
|
44e3e8f4ea
|
|||
|
45dfe07772
|
|||
|
6bafc43754
|
|||
|
012b306749
|
|||
|
ac7c5c69cf
|
|||
|
cd36c54e22
|
14
changelog.md
14
changelog.md
@@ -31,3 +31,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- `SemanticValidator` for threshold-based semantic similarity validation
|
- `SemanticValidator` for threshold-based semantic similarity validation
|
||||||
- `semantic()` factory function for creating semantic validators
|
- `semantic()` factory function for creating semantic validators
|
||||||
- Embedding caching for performance optimisation in repeated comparisons
|
- Embedding caching for performance optimisation in repeated comparisons
|
||||||
|
- Native pytest plugin for CI/CD integration (entry point: `pytest11`)
|
||||||
|
- `validate_text()` assertion function for expressive test assertions
|
||||||
|
- `text_validation` marker for filtering validation tests
|
||||||
|
- Pytest fixtures: `text_validator` factory and `validation_context` helper
|
||||||
|
- Detailed failure messages with text preview and check diagnostics
|
||||||
|
- Benchmark module for quality tracking and regression detection
|
||||||
|
- `Benchmark` class for evaluating text quality over time with metric storage
|
||||||
|
- `BenchmarkRun` and `RegressionReport` data models for tracking runs
|
||||||
|
- SQLite storage backend with WAL mode for concurrent access
|
||||||
|
- Rolling window baseline computation for historical comparison
|
||||||
|
- `check_regression()` for statistical comparison against baseline
|
||||||
|
- `assert_no_regression()` raises `RegressionDetectedError` for CI integration
|
||||||
|
- Customisable tolerance threshold and window size for regression detection
|
||||||
|
- Metadata support for tracking git SHA, model versions, etc.
|
||||||
|
|||||||
12
src/veritext/benchmark/__init__.py
Normal file
12
src/veritext/benchmark/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
"""Benchmark module for quality tracking and regression detection."""
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
from veritext.benchmark.runner import Benchmark
|
||||||
|
from veritext.benchmark.storage import BenchmarkStorage
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"Benchmark",
|
||||||
|
"BenchmarkRun",
|
||||||
|
"BenchmarkStorage",
|
||||||
|
"RegressionReport",
|
||||||
|
]
|
||||||
72
src/veritext/benchmark/models.py
Normal file
72
src/veritext/benchmark/models.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""Benchmark data models."""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkRun(BaseModel):
    """Record of a single benchmark execution.

    The model is frozen (see ``model_config``), so fields cannot be
    reassigned once an instance has been created.
    """

    # frozen=True makes instances immutable after construction.
    model_config = ConfigDict(frozen=True)

    id: str
    """UUID for this run."""

    benchmark_name: str
    """Name identifying this benchmark suite."""

    timestamp: datetime
    """When the benchmark was executed."""

    veritext_version: str
    """Version of veritext used."""

    metrics: dict[str, float]
    """Metric results, e.g. {"rouge_l": 0.82, "bleu4": 0.71}."""

    sample_count: int
    """Number of samples evaluated."""

    # default_factory gives every instance its own empty dict when the
    # caller does not pass metadata.
    metadata: dict[str, Any] = Field(default_factory=dict)
    """Optional metadata (git_sha, model version, etc.)."""
|
||||||
|
|
||||||
|
|
||||||
|
class RegressionReport(BaseModel):
    """Frozen result of comparing one run's metrics against a baseline."""

    model_config = ConfigDict(frozen=True)

    detected: bool
    """Whether a regression was detected."""

    baseline: dict[str, float]
    """Baseline metric values (rolling average)."""

    current: dict[str, float]
    """Current run metric values."""

    deltas: dict[str, float]
    """Difference from baseline (negative = regression)."""

    tolerance: float
    """Tolerance threshold used for detection."""

    @property
    def summary(self) -> str:
        """Render the report as text.

        Returns:
            A fixed "no regression" sentence when ``detected`` is false.
            Otherwise a header line followed by one line per metric whose
            delta is below ``-tolerance``; metrics missing from ``current``
            or ``baseline`` are printed as 0.0.
        """
        if self.detected:
            header = f"Regression detected (tolerance: {self.tolerance:.2%}):\n"
            entries: list[str] = []
            for metric, delta in self.deltas.items():
                # Only metrics that dropped by more than the tolerance are listed.
                if delta < -self.tolerance:
                    now = self.current.get(metric, 0.0)
                    before = self.baseline.get(metric, 0.0)
                    entries.append(
                        f" {metric}: {now:.4f} "
                        f"(baseline: {before:.4f}, "
                        f"delta: {delta:+.4f})"
                    )
            return header + "\n".join(entries)

        return "No regression detected. All metrics within tolerance."
|
||||||
87
src/veritext/benchmark/regression.py
Normal file
87
src/veritext/benchmark/regression.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
"""Regression detection using rolling window comparison."""
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
|
||||||
|
|
||||||
|
def compute_baseline(
    runs: list[BenchmarkRun],
    window: int = 10,
) -> dict[str, float]:
    """
    Average each metric over the first ``window`` runs.

    Args:
        runs: Benchmark runs, expected most recent first; only the leading
            ``window`` entries are used.
        window: Number of runs to include in the baseline.

    Returns:
        Mapping of metric name to its mean value. A metric that is absent
        from some runs is averaged over the runs that reported it. An empty
        mapping is returned when there are no runs.
    """
    if not runs:
        return {}

    # Group the observed values per metric across the selected runs.
    samples: dict[str, list[float]] = {}
    for run in runs[:window]:
        for name, score in run.metrics.items():
            samples.setdefault(name, []).append(score)

    baseline: dict[str, float] = {}
    for name, scores in samples.items():
        baseline[name] = sum(scores) / len(scores)
    return baseline
|
||||||
|
|
||||||
|
|
||||||
|
def detect_regression(
    current: dict[str, float],
    baseline: dict[str, float],
    tolerance: float = 0.05,
) -> RegressionReport:
    """
    Compare current metrics against baseline.

    For every metric in ``baseline`` the delta ``current - baseline`` is
    computed, and a regression is flagged when any delta is below
    ``-tolerance``. The comparison uses the absolute difference between the
    two values (it is not scaled by the baseline value).

    Args:
        current: Current metric values. A metric present in ``baseline`` but
            missing here is treated as 0.0.
        baseline: Baseline metric values. Metrics that appear only in
            ``current`` are not compared.
        tolerance: Maximum allowed drop before a regression is flagged
            (e.g. 0.05 allows a metric to fall by up to 0.05).

    Returns:
        RegressionReport with comparison results. With an empty baseline the
        report has no deltas and ``detected`` is false.
    """
    # An empty baseline yields no deltas, so nothing can be flagged.
    deltas: dict[str, float] = {
        metric: current.get(metric, 0.0) - reference
        for metric, reference in baseline.items()
    }
    detected = any(delta < -tolerance for delta in deltas.values())

    return RegressionReport(
        detected=detected,
        baseline=baseline,
        current=current,
        deltas=deltas,
        tolerance=tolerance,
    )
|
||||||
186
src/veritext/benchmark/runner.py
Normal file
186
src/veritext/benchmark/runner.py
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
"""Benchmark execution and tracking."""
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import veritext
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
from veritext.benchmark.regression import compute_baseline, detect_regression
|
||||||
|
from veritext.benchmark.storage import BenchmarkStorage
|
||||||
|
from veritext.core.exceptions import RegressionDetectedError
|
||||||
|
from veritext.metrics.bleu import Bleu
|
||||||
|
from veritext.metrics.rouge import Rouge
|
||||||
|
|
||||||
|
# Default metrics to use for evaluation
|
||||||
|
DEFAULT_METRICS = ["rouge_l", "bleu4"]
|
||||||
|
|
||||||
|
|
||||||
|
class Benchmark:
|
||||||
|
"""Track text quality over time."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
storage_path: str | Path = "benchmarks/",
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialise a benchmark tracker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Name identifying this benchmark suite.
|
||||||
|
storage_path: Directory for storing benchmark data.
|
||||||
|
"""
|
||||||
|
self._name = name
|
||||||
|
self._storage_path = Path(storage_path)
|
||||||
|
self._storage = BenchmarkStorage(self._storage_path / f"{name}.db")
|
||||||
|
|
||||||
|
# Initialise metrics
|
||||||
|
self._bleu = Bleu()
|
||||||
|
self._rouge = Rouge()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def name(self) -> str:
|
||||||
|
"""Return the benchmark name."""
|
||||||
|
return self._name
|
||||||
|
|
||||||
|
def _compute_metrics(
|
||||||
|
self,
|
||||||
|
candidates: list[str],
|
||||||
|
references: list[str] | list[list[str]],
|
||||||
|
metric_names: list[str],
|
||||||
|
) -> dict[str, float]:
|
||||||
|
"""Compute requested metrics for the given samples."""
|
||||||
|
results: dict[str, float] = {}
|
||||||
|
|
||||||
|
for metric_name in metric_names:
|
||||||
|
if metric_name in ("bleu1", "bleu2", "bleu3", "bleu4"):
|
||||||
|
batch_result = self._bleu.batch_score(candidates, references)
|
||||||
|
stats = batch_result.stats.get(metric_name)
|
||||||
|
if stats:
|
||||||
|
results[metric_name] = stats.mean
|
||||||
|
|
||||||
|
elif metric_name in (
|
||||||
|
"rouge1",
|
||||||
|
"rouge2",
|
||||||
|
"rouge_l",
|
||||||
|
"rouge1_fmeasure",
|
||||||
|
"rouge2_fmeasure",
|
||||||
|
"rouge_l_fmeasure",
|
||||||
|
):
|
||||||
|
rouge_result = self._rouge.batch_score(candidates, references)
|
||||||
|
# Map short names to stat names
|
||||||
|
stat_name = metric_name
|
||||||
|
if metric_name == "rouge1":
|
||||||
|
stat_name = "rouge1_fmeasure"
|
||||||
|
elif metric_name == "rouge2":
|
||||||
|
stat_name = "rouge2_fmeasure"
|
||||||
|
elif metric_name == "rouge_l":
|
||||||
|
stat_name = "rouge_l_fmeasure"
|
||||||
|
|
||||||
|
stats = rouge_result.stats.get(stat_name)
|
||||||
|
if stats:
|
||||||
|
results[metric_name] = stats.mean
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def evaluate(
|
||||||
|
self,
|
||||||
|
candidates: list[str],
|
||||||
|
references: list[str] | list[list[str]],
|
||||||
|
metrics: list[str] | None = None,
|
||||||
|
metadata: dict[str, Any] | None = None,
|
||||||
|
) -> BenchmarkRun:
|
||||||
|
"""
|
||||||
|
Evaluate candidates against references, store results, and return the run.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
candidates: List of candidate texts to evaluate.
|
||||||
|
references: Reference text(s) for each candidate.
|
||||||
|
metrics: List of metrics to compute. Defaults to ["rouge_l", "bleu4"].
|
||||||
|
metadata: Optional metadata (git_sha, model version, etc.).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The BenchmarkRun record that was created and stored.
|
||||||
|
"""
|
||||||
|
metric_names = metrics or DEFAULT_METRICS
|
||||||
|
metric_results = self._compute_metrics(candidates, references, metric_names)
|
||||||
|
|
||||||
|
run = BenchmarkRun(
|
||||||
|
id=str(uuid.uuid4()),
|
||||||
|
benchmark_name=self._name,
|
||||||
|
timestamp=datetime.now(UTC),
|
||||||
|
veritext_version=veritext.__version__,
|
||||||
|
metrics=metric_results,
|
||||||
|
sample_count=len(candidates),
|
||||||
|
metadata=metadata or {},
|
||||||
|
)
|
||||||
|
|
||||||
|
self._storage.save_run(run)
|
||||||
|
return run
|
||||||
|
|
||||||
|
def check_regression(
|
||||||
|
self,
|
||||||
|
tolerance: float = 0.05,
|
||||||
|
window: int = 10,
|
||||||
|
) -> RegressionReport:
|
||||||
|
"""
|
||||||
|
Compare latest run against historical baseline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tolerance: Maximum allowed metric drop before regression is flagged.
|
||||||
|
window: Number of historical runs to include in baseline.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
RegressionReport with comparison results.
|
||||||
|
"""
|
||||||
|
runs = self._storage.get_runs(self._name)
|
||||||
|
|
||||||
|
if not runs:
|
||||||
|
# No runs at all
|
||||||
|
return RegressionReport(
|
||||||
|
detected=False,
|
||||||
|
baseline={},
|
||||||
|
current={},
|
||||||
|
deltas={},
|
||||||
|
tolerance=tolerance,
|
||||||
|
)
|
||||||
|
|
||||||
|
current_run = runs[0]
|
||||||
|
# Baseline excludes the current run
|
||||||
|
historical_runs = runs[1:]
|
||||||
|
baseline = compute_baseline(historical_runs, window=window)
|
||||||
|
|
||||||
|
return detect_regression(current_run.metrics, baseline, tolerance)
|
||||||
|
|
||||||
|
def assert_no_regression(
|
||||||
|
self,
|
||||||
|
tolerance: float = 0.05,
|
||||||
|
window: int = 10,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Raise RegressionDetectedError if quality dropped.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tolerance: Maximum allowed metric drop before regression is flagged.
|
||||||
|
window: Number of historical runs to include in baseline.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RegressionDetectedError: If a regression is detected.
|
||||||
|
"""
|
||||||
|
report = self.check_regression(tolerance=tolerance, window=window)
|
||||||
|
if report.detected:
|
||||||
|
raise RegressionDetectedError(report.summary)
|
||||||
|
|
||||||
|
def get_history(self, limit: int = 20) -> list[BenchmarkRun]:
|
||||||
|
"""
|
||||||
|
Get recent benchmark runs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
limit: Maximum number of runs to return.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of BenchmarkRun objects, most recent first.
|
||||||
|
"""
|
||||||
|
return self._storage.get_runs(self._name, limit=limit)
|
||||||
179
src/veritext/benchmark/storage.py
Normal file
179
src/veritext/benchmark/storage.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
"""SQLite storage for benchmark history."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun
|
||||||
|
from veritext.core.exceptions import StorageError
|
||||||
|
|
||||||
|
|
||||||
|
class BenchmarkStorage:
    """SQLite-backed storage for benchmark runs.

    Every operation opens its own connection and closes it when done. Note
    that ``with connection:`` in ``sqlite3`` only commits or rolls back the
    transaction; it does not close the connection, so each method closes it
    explicitly in a ``finally`` block.
    """

    def __init__(self, db_path: Path) -> None:
        """
        Initialise storage, creating tables if needed.

        Args:
            db_path: Path to the SQLite database file. Missing parent
                directories are created.

        Raises:
            StorageError: If the schema cannot be created.
        """
        self._db_path = db_path
        self._ensure_parent_exists()
        self._init_database()

    def _ensure_parent_exists(self) -> None:
        """Ensure the parent directory exists."""
        self._db_path.parent.mkdir(parents=True, exist_ok=True)

    def _get_connection(self) -> sqlite3.Connection:
        """Get a database connection with WAL mode enabled.

        The caller owns the returned connection and must close it.
        """
        conn = sqlite3.connect(str(self._db_path), timeout=30.0)
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA foreign_keys=ON")
        except sqlite3.Error:
            # Do not leak the handle when the connection cannot be configured.
            conn.close()
            raise
        conn.row_factory = sqlite3.Row
        return conn

    def _init_database(self) -> None:
        """Create tables if they don't exist.

        Raises:
            StorageError: If the schema cannot be created.
        """
        conn = None
        try:
            conn = self._get_connection()
            with conn:
                conn.executescript("""
                    CREATE TABLE IF NOT EXISTS benchmark_runs (
                        id TEXT PRIMARY KEY,
                        benchmark_name TEXT NOT NULL,
                        timestamp TEXT NOT NULL,
                        veritext_version TEXT NOT NULL,
                        sample_count INTEGER NOT NULL,
                        metadata TEXT
                    );

                    CREATE TABLE IF NOT EXISTS benchmark_metrics (
                        run_id TEXT REFERENCES benchmark_runs(id) ON DELETE CASCADE,
                        metric_name TEXT NOT NULL,
                        value REAL NOT NULL,
                        PRIMARY KEY (run_id, metric_name)
                    );

                    CREATE INDEX IF NOT EXISTS idx_benchmark_name
                    ON benchmark_runs(benchmark_name, timestamp DESC);
                """)
        except sqlite3.Error as e:
            raise StorageError(f"Failed to initialise database: {e}") from e
        finally:
            if conn is not None:
                conn.close()

    def save_run(self, run: BenchmarkRun) -> None:
        """
        Persist a benchmark run.

        The run row and its metric rows are written in one transaction, so a
        failure leaves neither behind.

        Args:
            run: The benchmark run to save.

        Raises:
            StorageError: If the save operation fails.
        """
        conn = None
        try:
            conn = self._get_connection()
            with conn:
                # Insert the run
                conn.execute(
                    """
                    INSERT INTO benchmark_runs
                    (id, benchmark_name, timestamp, veritext_version, sample_count, metadata)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    (
                        run.id,
                        run.benchmark_name,
                        run.timestamp.isoformat(),
                        run.veritext_version,
                        run.sample_count,
                        # Empty metadata is stored as NULL.
                        json.dumps(run.metadata) if run.metadata else None,
                    ),
                )

                # Insert metrics
                for metric_name, value in run.metrics.items():
                    conn.execute(
                        """
                        INSERT INTO benchmark_metrics (run_id, metric_name, value)
                        VALUES (?, ?, ?)
                        """,
                        (run.id, metric_name, value),
                    )
        except sqlite3.IntegrityError as e:
            raise StorageError(f"Run with id '{run.id}' already exists") from e
        except sqlite3.Error as e:
            raise StorageError(f"Failed to save benchmark run: {e}") from e
        finally:
            if conn is not None:
                conn.close()

    def get_runs(
        self,
        benchmark_name: str,
        limit: int | None = None,
    ) -> list[BenchmarkRun]:
        """
        Retrieve runs for a benchmark, most recent first.

        Args:
            benchmark_name: Name of the benchmark to retrieve runs for.
            limit: Maximum number of runs to return; ``None`` returns all.

        Returns:
            List of BenchmarkRun objects, most recent first.

        Raises:
            StorageError: If the retrieval fails.
        """
        conn = None
        try:
            conn = self._get_connection()
            with conn:
                query = """
                    SELECT id, benchmark_name, timestamp, veritext_version,
                    sample_count, metadata
                    FROM benchmark_runs
                    WHERE benchmark_name = ?
                    ORDER BY timestamp DESC
                """
                if limit is not None:
                    query += " LIMIT ?"
                    rows = conn.execute(query, (benchmark_name, limit)).fetchall()
                else:
                    rows = conn.execute(query, (benchmark_name,)).fetchall()

                runs = []
                for row in rows:
                    # Get metrics for this run
                    metrics_rows = conn.execute(
                        "SELECT metric_name, value FROM benchmark_metrics WHERE run_id = ?",
                        (row["id"],),
                    ).fetchall()
                    metrics = {m["metric_name"]: m["value"] for m in metrics_rows}

                    # NULL metadata (stored for empty dicts) reads back as {}.
                    metadata = json.loads(row["metadata"]) if row["metadata"] else {}

                    runs.append(
                        BenchmarkRun(
                            id=row["id"],
                            benchmark_name=row["benchmark_name"],
                            timestamp=datetime.fromisoformat(row["timestamp"]),
                            veritext_version=row["veritext_version"],
                            sample_count=row["sample_count"],
                            metrics=metrics,
                            metadata=metadata,
                        )
                    )

                return runs
        except sqlite3.Error as e:
            raise StorageError(f"Failed to retrieve benchmark runs: {e}") from e
        finally:
            if conn is not None:
                conn.close()

    def get_latest_run(self, benchmark_name: str) -> BenchmarkRun | None:
        """
        Get the most recent run for a benchmark.

        Args:
            benchmark_name: Name of the benchmark.

        Returns:
            The most recent BenchmarkRun, or None if no runs exist.

        Raises:
            StorageError: If the retrieval fails.
        """
        runs = self.get_runs(benchmark_name, limit=1)
        return runs[0] if runs else None
|
||||||
22
src/veritext/pytest_plugin/__init__.py
Normal file
22
src/veritext/pytest_plugin/__init__.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
"""Pytest plugin for text validation.
|
||||||
|
|
||||||
|
This plugin provides native pytest integration for Veritext, enabling
|
||||||
|
text validation assertions in test suites.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> from veritext.pytest_plugin import validate_text
|
||||||
|
>>>
|
||||||
|
>>> def test_summary_quality():
|
||||||
|
... text = "The quick brown fox jumps over the lazy dog."
|
||||||
|
... validate_text(
|
||||||
|
... text,
|
||||||
|
... min_length=10,
|
||||||
|
... max_length=100,
|
||||||
|
... max_reading_grade=8.0,
|
||||||
|
... )
|
||||||
|
"""
|
||||||
|
|
||||||
|
from veritext.pytest_plugin.assertions import validate_text
|
||||||
|
from veritext.pytest_plugin.plugin import pytest_configure
|
||||||
|
|
||||||
|
__all__ = ["pytest_configure", "validate_text"]
|
||||||
141
src/veritext/pytest_plugin/assertions.py
Normal file
141
src/veritext/pytest_plugin/assertions.py
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
"""Assertion functions for text validation in pytest."""
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from veritext.core.types import ValidationContext, ValidationResult
|
||||||
|
from veritext.validators import all_of
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from veritext.validators.base import Check
|
||||||
|
|
||||||
|
|
||||||
|
def validate_text(
|
||||||
|
text: str,
|
||||||
|
*,
|
||||||
|
reference: str | list[str] | None = None,
|
||||||
|
min_bleu: float | None = None,
|
||||||
|
min_rouge: float | None = None,
|
||||||
|
min_semantic: float | None = None,
|
||||||
|
max_length: int | None = None,
|
||||||
|
min_length: int | None = None,
|
||||||
|
max_reading_grade: float | None = None,
|
||||||
|
must_contain: list[str] | None = None,
|
||||||
|
must_exclude: list[str] | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""Assert text passes all specified validation criteria.
|
||||||
|
|
||||||
|
This is the primary assertion function for text validation in pytest.
|
||||||
|
It builds validators from keyword arguments and raises AssertionError
|
||||||
|
with detailed failure information if validation fails.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: The text to validate.
|
||||||
|
reference: Reference text for comparison metrics (BLEU, ROUGE, semantic).
|
||||||
|
min_bleu: Minimum BLEU-4 score required (0.0 to 1.0).
|
||||||
|
min_rouge: Minimum ROUGE-L F-measure required (0.0 to 1.0).
|
||||||
|
min_semantic: Minimum semantic similarity required (0.0 to 1.0).
|
||||||
|
max_length: Maximum character count allowed.
|
||||||
|
min_length: Minimum character count required.
|
||||||
|
max_reading_grade: Maximum Flesch-Kincaid grade level.
|
||||||
|
must_contain: Patterns that must be present in the text.
|
||||||
|
must_exclude: Patterns that must not be present in the text.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
AssertionError: With detailed failure information if validation fails.
|
||||||
|
ValueError: If comparison metrics requested but reference not provided,
|
||||||
|
or if no validation criteria are specified.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> validate_text(
|
||||||
|
... "The quick brown fox jumps over the lazy dog.",
|
||||||
|
... min_length=10,
|
||||||
|
... max_length=100,
|
||||||
|
... max_reading_grade=8.0,
|
||||||
|
... )
|
||||||
|
"""
|
||||||
|
# Validate that reference is provided for comparison metrics
|
||||||
|
if any([min_bleu, min_rouge, min_semantic]) and reference is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Reference text required for comparison metrics "
|
||||||
|
"(min_bleu, min_rouge, min_semantic)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build list of validators from kwargs
|
||||||
|
checks: list[Check] = []
|
||||||
|
|
||||||
|
if min_bleu is not None:
|
||||||
|
from veritext.validators import bleu
|
||||||
|
|
||||||
|
checks.append(bleu(min_score=min_bleu))
|
||||||
|
|
||||||
|
if min_rouge is not None:
|
||||||
|
from veritext.validators import rouge
|
||||||
|
|
||||||
|
checks.append(rouge(min_score=min_rouge))
|
||||||
|
|
||||||
|
if min_semantic is not None:
|
||||||
|
# Lazy import to avoid loading sentence-transformers unless needed
|
||||||
|
from veritext.validators import semantic
|
||||||
|
|
||||||
|
checks.append(semantic(min_score=min_semantic))
|
||||||
|
|
||||||
|
if max_length is not None or min_length is not None:
|
||||||
|
from veritext.validators import length
|
||||||
|
|
||||||
|
checks.append(length(min_chars=min_length, max_chars=max_length))
|
||||||
|
|
||||||
|
if max_reading_grade is not None:
|
||||||
|
from veritext.validators import readability
|
||||||
|
|
||||||
|
checks.append(readability(max_grade=max_reading_grade))
|
||||||
|
|
||||||
|
if must_contain is not None:
|
||||||
|
from veritext.validators import contains
|
||||||
|
|
||||||
|
checks.append(contains(patterns=must_contain))
|
||||||
|
|
||||||
|
if must_exclude is not None:
|
||||||
|
from veritext.validators import excludes
|
||||||
|
|
||||||
|
checks.append(excludes(patterns=must_exclude))
|
||||||
|
|
||||||
|
if not checks:
|
||||||
|
raise ValueError("At least one validation criterion must be specified")
|
||||||
|
|
||||||
|
# Run validation
|
||||||
|
context = ValidationContext(reference=reference)
|
||||||
|
validator = all_of(checks)
|
||||||
|
result = validator.check(text, context)
|
||||||
|
|
||||||
|
if not result.passed:
|
||||||
|
raise AssertionError(_format_failure(text, result))
|
||||||
|
|
||||||
|
|
||||||
|
def _format_failure(text: str, result: ValidationResult) -> str:
|
||||||
|
"""Format a detailed failure message for pytest output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: The text that was validated.
|
||||||
|
result: The validation result containing check failures.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted failure message with check details.
|
||||||
|
"""
|
||||||
|
lines = ["Text validation failed:"]
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
# Show a preview of the text (truncated if long)
|
||||||
|
preview = text[:100] + "..." if len(text) > 100 else text
|
||||||
|
lines.append(f" Text: {preview!r}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
# List all failed checks with details
|
||||||
|
lines.append(" Failed checks:")
|
||||||
|
for check in result.failed_checks:
|
||||||
|
lines.append(f" - {check.name}:")
|
||||||
|
lines.append(f" {check.message}")
|
||||||
|
if check.threshold is not None:
|
||||||
|
lines.append(f" Expected: >= {check.threshold}")
|
||||||
|
lines.append(f" Actual: {check.actual}")
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
80
src/veritext/pytest_plugin/fixtures.py
Normal file
80
src/veritext/pytest_plugin/fixtures.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
"""Pytest fixtures for text validation."""
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.core.types import ValidationContext, ValidationResult
|
||||||
|
from veritext.validators import all_of
|
||||||
|
from veritext.validators.base import Check
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Callable
|
||||||
|
|
||||||
|
|
||||||
|
class ValidatorFactory:
    """Callable that turns a list of checks into a one-argument validator."""

    def __call__(
        self,
        checks: list[Check],
        reference: str | list[str] | None = None,
    ) -> "Callable[[str], ValidationResult]":
        """Bundle ``checks`` and ``reference`` into a reusable validator.

        The checks are combined with ``all_of`` and a single
        ``ValidationContext`` is built up front; both are shared by every
        call of the returned function.

        Args:
            checks: List of validation checks to apply.
            reference: Optional reference text for comparison metrics.

        Returns:
            A callable that takes text and returns a ValidationResult.
        """
        combined = all_of(checks)
        shared_context = ValidationContext(reference=reference)

        def validate(text: str) -> ValidationResult:
            return combined.check(text, shared_context)

        return validate
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): this fixture is only available to test suites if pytest loads
# this module as part of the plugin — confirm the plugin entry point (or a
# conftest) imports it.
@pytest.fixture
def text_validator() -> ValidatorFactory:
    """Provide a factory for building validators.

    A new ``ValidatorFactory`` is created each time the fixture is set up.

    Example:
        >>> def test_with_factory(text_validator):
        ...     from veritext.validators import bleu, length
        ...     validate = text_validator(
        ...         checks=[bleu(min_score=0.5), length(min_words=10)],
        ...         reference="The reference text.",
        ...     )
        ...     result = validate("Some candidate text.")
        ...     assert result.passed

    Returns:
        ValidatorFactory instance.
    """
    return ValidatorFactory()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def validation_context() -> "Callable[..., ValidationContext]":
    """Provide a factory for creating ValidationContext objects.

    The returned callable accepts an optional ``reference`` plus arbitrary
    keyword arguments; the extra keyword arguments are collected into a dict
    and passed to ``ValidationContext`` as ``metadata``.

    Example:
        >>> def test_with_context(validation_context):
        ...     ctx = validation_context(reference="The reference text.")
        ...     assert ctx.reference == "The reference text."

    Returns:
        A callable that creates ValidationContext objects.
    """

    def _create(
        reference: str | list[str] | None = None,
        **metadata: Any,
    ) -> ValidationContext:
        # Every keyword other than `reference` ends up in the metadata dict.
        return ValidationContext(reference=reference, metadata=metadata)

    return _create
|
||||||
18
src/veritext/pytest_plugin/plugin.py
Normal file
18
src/veritext/pytest_plugin/plugin.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
"""Pytest hooks for Veritext plugin."""
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_configure(config: "pytest.Config") -> None:
    """Declare the ``text_validation`` marker with pytest.

    Args:
        config: Pytest configuration object; one line is appended to its
            ``markers`` ini value.
    """
    marker_line = "text_validation: mark test as a text validation test"
    config.addinivalue_line("markers", marker_line)
|
||||||
1
tests/test_benchmark/__init__.py
Normal file
1
tests/test_benchmark/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Tests for the benchmark module."""
|
||||||
145
tests/test_benchmark/test_models.py
Normal file
145
tests/test_benchmark/test_models.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
"""Tests for benchmark data models."""
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkRun:
    """Behavioural checks for the BenchmarkRun data model."""

    def test_create_benchmark_run(self) -> None:
        """Required fields are stored as given; metadata defaults to empty."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)

        created = BenchmarkRun(
            id="test-id-123",
            benchmark_name="test-benchmark",
            timestamp=when,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75, "rouge_l": 0.82},
            sample_count=100,
        )

        assert created.id == "test-id-123"
        assert created.benchmark_name == "test-benchmark"
        assert created.veritext_version == "0.1.0-dev"
        assert created.metrics == {"bleu4": 0.75, "rouge_l": 0.82}
        assert created.sample_count == 100
        # No metadata was supplied, so the model falls back to an empty dict.
        assert created.metadata == {}

    def test_create_with_metadata(self) -> None:
        """Optional metadata passed at construction is kept verbatim."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)

        created = BenchmarkRun(
            id="test-id-456",
            benchmark_name="test-benchmark",
            timestamp=when,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.75},
            sample_count=50,
            metadata={"git_sha": "abc123", "model_version": "gpt-4"},
        )

        assert created.metadata == {"git_sha": "abc123", "model_version": "gpt-4"}

    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing run is rejected."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        frozen_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=when,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        with pytest.raises(ValidationError):
            frozen_run.id = "new-id"  # type: ignore[misc]

    def test_serialisation(self) -> None:
        """model_dump() exposes the fields as a plain dictionary."""
        when = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        original = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=when,
            veritext_version="0.1.0",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        dumped = original.model_dump()

        assert dumped["id"] == "test-id"
        assert dumped["benchmark_name"] == "test"
        assert dumped["metrics"] == {"bleu4": 0.5}
|
||||||
|
|
||||||
|
|
||||||
|
class TestRegressionReport:
    """Behavioural checks for the RegressionReport data model."""

    def test_no_regression_summary(self) -> None:
        """With detected=False the summary states that nothing regressed."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.76, "rouge_l": 0.81},
            deltas={"bleu4": 0.01, "rouge_l": 0.01},
            tolerance=0.05,
        )

        assert "No regression detected" in clean_report.summary

    def test_regression_summary(self) -> None:
        """With detected=True the summary names the metric and its values."""
        failing_report = RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

        text = failing_report.summary
        for fragment in (
            "Regression detected",
            "bleu4",
            "0.6500",
            "baseline: 0.7500",
        ):
            assert fragment in text

    def test_regression_excludes_within_tolerance(self) -> None:
        """Metrics whose drop stays inside the tolerance are left out."""
        failing_report = RegressionReport(
            detected=True,
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            current={"bleu4": 0.65, "rouge_l": 0.78},
            deltas={"bleu4": -0.10, "rouge_l": -0.02},
            tolerance=0.05,
        )

        text = failing_report.summary
        # rouge_l moved by -0.02, inside the 0.05 tolerance: must be absent.
        assert "rouge_l" not in text
        # bleu4 moved by -0.10, outside the tolerance: must be listed.
        assert "bleu4" in text

    def test_frozen_model(self) -> None:
        """Assigning to a field of an existing report is rejected."""
        empty_report = RegressionReport(
            detected=False,
            baseline={},
            current={},
            deltas={},
            tolerance=0.05,
        )

        with pytest.raises(ValidationError):
            empty_report.detected = True  # type: ignore[misc]

    def test_tolerance_in_summary(self) -> None:
        """The tolerance is rendered in the summary as a percentage."""
        failing_report = RegressionReport(
            detected=True,
            baseline={"metric": 0.80},
            current={"metric": 0.50},
            deltas={"metric": -0.30},
            tolerance=0.10,
        )

        assert "10.00%" in failing_report.summary
|
||||||
229
tests/test_benchmark/test_regression.py
Normal file
229
tests/test_benchmark/test_regression.py
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
"""Tests for regression detection."""
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun
|
||||||
|
from veritext.benchmark.regression import compute_baseline, detect_regression
|
||||||
|
|
||||||
|
|
||||||
|
def make_run(
    run_id: str,
    metrics: dict[str, float],
    day: int = 1,
) -> BenchmarkRun:
    """Build a BenchmarkRun for the "test" benchmark.

    Args:
        run_id: Identifier given to the run.
        metrics: Metric name to score mapping stored on the run.
        day: Day of January 2025 used for the noon-UTC timestamp.

    Returns:
        A run with a fixed version string and a sample count of 10.
    """
    recorded_at = datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC)
    return BenchmarkRun(
        id=run_id,
        benchmark_name="test",
        timestamp=recorded_at,
        veritext_version="0.1.0",
        metrics=metrics,
        sample_count=10,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeBaseline:
    """Checks for compute_baseline()."""

    def test_empty_runs(self) -> None:
        """An empty run list yields an empty baseline."""
        assert compute_baseline([]) == {}

    def test_single_run(self) -> None:
        """With one run the baseline mirrors that run's metrics."""
        only_run = make_run("r1", {"bleu4": 0.75, "rouge_l": 0.80})

        result = compute_baseline([only_run])

        assert result["bleu4"] == 0.75
        assert result["rouge_l"] == 0.80

    def test_multiple_runs_average(self) -> None:
        """The baseline is the mean over every run inside the window."""
        scores_by_day = (("r1", 0.70, 3), ("r2", 0.80, 2), ("r3", 0.90, 1))
        history = [
            make_run(run_id, {"bleu4": score}, day=day)
            for run_id, score, day in scores_by_day
        ]

        result = compute_baseline(history, window=3)

        # (0.70+0.80+0.90)/3
        assert result["bleu4"] == pytest.approx(0.80)

    def test_window_limits_runs(self) -> None:
        """Runs beyond the window size do not contribute."""
        history = [
            make_run("r1", {"bleu4": 0.70}, day=5),  # most recent
            make_run("r2", {"bleu4": 0.80}, day=4),
            make_run("r3", {"bleu4": 0.90}, day=3),
            make_run("r4", {"bleu4": 0.60}, day=2),  # excluded
            make_run("r5", {"bleu4": 0.50}, day=1),  # excluded
        ]

        result = compute_baseline(history, window=3)

        # Only the first three runs count: (0.70 + 0.80 + 0.90) / 3 = 0.80
        assert result["bleu4"] == pytest.approx(0.80)

    def test_partial_history(self) -> None:
        """A window larger than the history uses whatever runs exist."""
        history = [
            make_run("r1", {"bleu4": 0.70}),
            make_run("r2", {"bleu4": 0.80}),
        ]

        result = compute_baseline(history, window=10)

        # Two runs available: (0.70 + 0.80) / 2 = 0.75
        assert result["bleu4"] == pytest.approx(0.75)

    def test_multiple_metrics(self) -> None:
        """Every metric present in the runs gets its own baseline."""
        history = [
            make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75}),
            make_run("r2", {"bleu4": 0.80, "rouge_l": 0.85}),
        ]

        result = compute_baseline(history)

        assert result["bleu4"] == pytest.approx(0.75)
        assert result["rouge_l"] == pytest.approx(0.80)

    def test_varying_metrics(self) -> None:
        """Runs may report different metric sets."""
        history = [
            make_run("r1", {"bleu4": 0.70, "rouge_l": 0.75}),
            make_run("r2", {"bleu4": 0.80}),  # No rouge_l
        ]

        result = compute_baseline(history)

        # bleu4 is reported by both runs.
        assert result["bleu4"] == pytest.approx(0.75)
        # rouge_l is reported by a single run only.
        assert result["rouge_l"] == pytest.approx(0.75)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetectRegression:
    """Tests for regression detection via detect_regression()."""

    def test_no_baseline(self) -> None:
        """No regression when baseline is empty."""
        report = detect_regression(
            current={"bleu4": 0.70},
            baseline={},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas == {}

    def test_no_regression_stable(self) -> None:
        """No regression when metrics are stable."""
        report = detect_regression(
            current={"bleu4": 0.75},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.0)

    def test_no_regression_improved(self) -> None:
        """No regression when metrics improved."""
        report = detect_regression(
            current={"bleu4": 0.85},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(0.10)

    def test_no_regression_within_tolerance(self) -> None:
        """No regression when drop is within tolerance."""
        report = detect_regression(
            current={"bleu4": 0.73},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert not report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.02)

    def test_regression_detected(self) -> None:
        """Regression detected when metric drops beyond tolerance."""
        report = detect_regression(
            current={"bleu4": 0.65},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.10)

    def test_regression_at_tolerance_boundary(self) -> None:
        """Drop at tolerance boundary is not a regression."""
        # 0.75, 0.50 and 0.25 are exactly representable in binary floating
        # point, so the delta equals -tolerance with no rounding error and the
        # boundary itself is exercised (a delta of 0.0 would not do that).
        # The implementation checks delta < -tolerance (strictly less than).
        report = detect_regression(
            current={"bleu4": 0.50},
            baseline={"bleu4": 0.75},
            tolerance=0.25,
        )

        # Delta is exactly -0.25, i.e. on the boundary, which must not count.
        assert not report.detected
        assert report.deltas["bleu4"] == -0.25

    def test_regression_just_beyond_tolerance(self) -> None:
        """Just beyond tolerance is a regression."""
        report = detect_regression(
            current={"bleu4": 0.6999},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        # Delta is -0.0501, which is < -tolerance
        assert report.detected

    def test_multiple_metrics_any_regresses(self) -> None:
        """Regression detected if any metric exceeds tolerance."""
        report = detect_regression(
            current={"bleu4": 0.65, "rouge_l": 0.80},
            baseline={"bleu4": 0.75, "rouge_l": 0.80},
            tolerance=0.05,
        )

        assert report.detected
        # Only bleu4 regressed
        assert report.deltas["bleu4"] == pytest.approx(-0.10)
        assert report.deltas["rouge_l"] == pytest.approx(0.0)

    def test_report_contains_all_values(self) -> None:
        """Report includes baseline, current, and deltas."""
        baseline = {"bleu4": 0.75, "rouge_l": 0.80}
        current = {"bleu4": 0.65, "rouge_l": 0.82}

        report = detect_regression(current, baseline, tolerance=0.05)

        assert report.baseline == baseline
        assert report.current == current
        assert report.tolerance == 0.05
        assert "bleu4" in report.deltas
        assert "rouge_l" in report.deltas

    def test_missing_metric_in_current(self) -> None:
        """Missing metric in current treated as zero."""
        report = detect_regression(
            current={},
            baseline={"bleu4": 0.75},
            tolerance=0.05,
        )

        # 0.0 - 0.75 = -0.75, which is a regression
        assert report.detected
        assert report.deltas["bleu4"] == pytest.approx(-0.75)
|
||||||
247
tests/test_benchmark/test_runner.py
Normal file
247
tests/test_benchmark/test_runner.py
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
"""Tests for benchmark runner."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun
|
||||||
|
from veritext.benchmark.runner import Benchmark
|
||||||
|
from veritext.core.exceptions import RegressionDetectedError
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def benchmark(tmp_path: Path) -> Benchmark:
    """Provide a "test-suite" Benchmark whose storage lives under tmp_path."""
    storage_dir = tmp_path / "benchmarks"
    return Benchmark("test-suite", storage_path=storage_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_data() -> tuple[list[str], list[str]]:
    """Return two candidate sentences paired with two reference sentences."""
    paired_sentences = [
        (
            "The quick brown fox jumps over the lazy dog.",
            "The quick brown fox jumps over the lazy dog.",
        ),
        (
            "A fast auburn fox leaps above the sleepy hound.",
            "The swift brown fox jumps over the lazy dog.",
        ),
    ]
    # Split the (candidate, reference) pairs into two parallel lists.
    candidates = [candidate for candidate, _ in paired_sentences]
    references = [reference for _, reference in paired_sentences]
    return candidates, references
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkInit:
    """Checks for constructing a Benchmark."""

    def test_creates_storage_directory(self, tmp_path: Path) -> None:
        """Constructing a Benchmark makes its storage path exist."""
        target_dir = tmp_path / "benchmarks"

        Benchmark("my-suite", storage_path=target_dir)

        assert target_dir.exists()

    def test_name_property(self, benchmark: Benchmark) -> None:
        """The name given at construction is readable via ``name``."""
        suite_name = benchmark.name
        assert suite_name == "test-suite"
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvaluate:
    """Checks for Benchmark.evaluate()."""

    def test_evaluate_stores_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """evaluate() returns a BenchmarkRun describing the evaluated batch."""
        result = benchmark.evaluate(*sample_data)

        assert isinstance(result, BenchmarkRun)
        assert result.benchmark_name == "test-suite"
        assert result.sample_count == 2

    def test_evaluate_returns_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without an explicit metric list the defaults are computed."""
        result = benchmark.evaluate(*sample_data)

        # Default metrics are rouge_l and bleu4; both must lie in [0, 1].
        scores = result.metrics
        assert "rouge_l" in scores
        assert "bleu4" in scores
        for metric_name in ("rouge_l", "bleu4"):
            assert 0.0 <= scores[metric_name] <= 1.0

    def test_evaluate_custom_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Only the explicitly requested metrics are computed."""
        hypotheses, targets = sample_data
        requested = ["bleu1", "bleu2", "rouge1"]

        result = benchmark.evaluate(hypotheses, targets, metrics=requested)

        for metric_name in ("bleu1", "bleu2", "rouge1"):
            assert metric_name in result.metrics
        assert "bleu4" not in result.metrics  # Not requested

    def test_evaluate_with_metadata(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Metadata passed to evaluate() is attached to the run."""
        hypotheses, targets = sample_data

        result = benchmark.evaluate(
            hypotheses, targets, metadata={"git_sha": "abc123", "model": "gpt-4"}
        )

        assert result.metadata == {"git_sha": "abc123", "model": "gpt-4"}

    def test_evaluate_stores_retrievable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """A run produced by evaluate() shows up in the history."""
        result = benchmark.evaluate(*sample_data)

        stored_runs = benchmark.get_history()

        assert len(stored_runs) == 1
        assert stored_runs[0].id == result.id
|
||||||
|
|
||||||
|
|
||||||
|
class TestCheckRegression:
    """Tests for regression checking via Benchmark.check_regression()."""

    def test_check_no_runs(self, benchmark: Benchmark) -> None:
        """No regression when no runs exist."""
        report = benchmark.check_regression()

        assert not report.detected
        assert report.baseline == {}
        assert report.current == {}

    def test_check_single_run(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """No regression with single run (no baseline)."""
        candidates, references = sample_data
        benchmark.evaluate(candidates, references)

        report = benchmark.check_regression()

        # First run has no baseline to compare against
        assert not report.detected

    def test_check_stable_metrics(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """No regression when metrics are stable."""
        candidates, references = sample_data

        # Run multiple times with same data
        for _ in range(3):
            benchmark.evaluate(candidates, references)

        report = benchmark.check_regression()
        assert not report.detected

    def test_check_reports_regression(self, tmp_path: Path) -> None:
        """Reports regression when metrics drop significantly."""
        benchmark = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")

        # First run with good metrics
        good_candidates = ["The quick brown fox jumps."]
        good_references = ["The quick brown fox jumps."]
        benchmark.evaluate(good_candidates, good_references)

        # Second run with worse metrics (different text)
        bad_candidates = ["Something completely different here."]
        benchmark.evaluate(bad_candidates, good_references)

        report = benchmark.check_regression(tolerance=0.05)

        # The flag and the deltas are asserted separately: a disjunction of
        # the two would still pass if ``detected`` were wrongly left False
        # while the deltas showed the drop.
        assert report.detected
        assert any(delta < -0.05 for delta in report.deltas.values())
|
||||||
|
|
||||||
|
|
||||||
|
class TestAssertNoRegression:
    """Checks for Benchmark.assert_no_regression()."""

    def test_passes_when_stable(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Repeated identical evaluations never trip the assertion."""
        hypotheses, targets = sample_data

        repetitions = 3
        for _ in range(repetitions):
            benchmark.evaluate(hypotheses, targets)

        # Completing without an exception is the success condition.
        benchmark.assert_no_regression()

    def test_raises_on_regression(self, tmp_path: Path) -> None:
        """A sharp quality drop surfaces as RegressionDetectedError."""
        suite = Benchmark("regress-test", storage_path=tmp_path / "benchmarks")

        # A candidate identical to its reference sets the baseline.
        matching_text = ["The quick brown fox."]
        suite.evaluate(matching_text, matching_text)

        # The follow-up run scores an unrelated candidate against it.
        unrelated_text = ["Completely unrelated text."]
        suite.evaluate(unrelated_text, matching_text)

        with pytest.raises(RegressionDetectedError):
            suite.assert_no_regression(tolerance=0.05)
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetHistory:
    """Checks for Benchmark.get_history()."""

    def test_empty_history(self, benchmark: Benchmark) -> None:
        """A fresh benchmark has an empty history list."""
        assert benchmark.get_history() == []

    def test_returns_runs(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Stored runs come back newest first."""
        hypotheses, targets = sample_data

        earlier = benchmark.evaluate(hypotheses, targets)
        later = benchmark.evaluate(hypotheses, targets)

        stored_runs = benchmark.get_history()

        assert len(stored_runs) == 2
        newest, oldest = stored_runs
        assert newest.id == later.id  # Most recent first
        assert oldest.id == earlier.id

    def test_respects_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """An explicit limit caps the number of runs returned."""
        hypotheses, targets = sample_data

        for _ in range(5):
            benchmark.evaluate(hypotheses, targets)

        capped = benchmark.get_history(limit=3)
        assert len(capped) == 3

    def test_default_limit(
        self, benchmark: Benchmark, sample_data: tuple[list[str], list[str]]
    ) -> None:
        """Without an explicit limit at most 20 runs are returned."""
        hypotheses, targets = sample_data

        # Store more runs than the expected default cap.
        for _ in range(25):
            benchmark.evaluate(hypotheses, targets)

        capped = benchmark.get_history()
        assert len(capped) == 20
|
||||||
297
tests/test_benchmark/test_storage.py
Normal file
297
tests/test_benchmark/test_storage.py
Normal file
@@ -0,0 +1,297 @@
|
|||||||
|
"""Tests for benchmark SQLite storage."""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import threading
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun
|
||||||
|
from veritext.benchmark.storage import BenchmarkStorage
|
||||||
|
from veritext.core.exceptions import StorageError
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def db_path(tmp_path: Path) -> Path:
    """Give the location of a database file inside the temporary directory."""
    parent_dir = tmp_path / "benchmarks"
    return parent_dir / "test.db"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def storage(db_path: Path) -> BenchmarkStorage:
    """Provide a BenchmarkStorage opened on the temporary database path."""
    backend = BenchmarkStorage(db_path)
    return backend
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_run() -> BenchmarkRun:
    """Provide a fully populated run (including metadata) for "test-suite"."""
    recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
    return BenchmarkRun(
        id="run-001",
        benchmark_name="test-suite",
        timestamp=recorded_at,
        veritext_version="0.1.0-dev",
        metrics={"bleu4": 0.75, "rouge_l": 0.82},
        sample_count=100,
        metadata={"git_sha": "abc123"},
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TestDatabaseCreation:
    """Tests for database initialisation performed by BenchmarkStorage."""

    def test_creates_database_file(self, db_path: Path) -> None:
        """Storage creates the database file on init."""
        assert not db_path.exists()
        BenchmarkStorage(db_path)
        assert db_path.exists()

    def test_creates_parent_directories(self, tmp_path: Path) -> None:
        """Storage creates parent directories if needed."""
        nested_path = tmp_path / "deep" / "nested" / "path" / "test.db"
        BenchmarkStorage(nested_path)
        assert nested_path.exists()

    def test_creates_tables(self, db_path: Path) -> None:
        """Storage creates required tables."""
        BenchmarkStorage(db_path)

        conn = sqlite3.connect(str(db_path))
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = {row[0] for row in cursor.fetchall()}
        finally:
            # Close even if the query fails so the temporary database file
            # is not left open by a failing test.
            conn.close()

        assert "benchmark_runs" in tables
        assert "benchmark_metrics" in tables

    def test_creates_index(self, db_path: Path) -> None:
        """Storage creates index on benchmark_name and timestamp."""
        BenchmarkStorage(db_path)

        conn = sqlite3.connect(str(db_path))
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='index'")
            indices = {row[0] for row in cursor.fetchall()}
        finally:
            # Close even if the query fails so the temporary database file
            # is not left open by a failing test.
            conn.close()

        assert "idx_benchmark_name" in indices
|
||||||
|
|
||||||
|
|
||||||
|
class TestSaveRun:
    """Checks for BenchmarkStorage.save_run()."""

    def test_save_run(
        self, storage: BenchmarkStorage, sample_run: BenchmarkRun
    ) -> None:
        """A saved run can be listed again under its benchmark name."""
        storage.save_run(sample_run)

        stored = storage.get_runs("test-suite")

        assert len(stored) == 1
        assert stored[0].id == "run-001"

    def test_save_preserves_all_fields(
        self, storage: BenchmarkStorage, sample_run: BenchmarkRun
    ) -> None:
        """Every field of the run survives the save/load round trip."""
        storage.save_run(sample_run)

        loaded = storage.get_runs("test-suite")[0]

        assert loaded.id == sample_run.id
        assert loaded.benchmark_name == sample_run.benchmark_name
        assert loaded.timestamp == sample_run.timestamp
        assert loaded.veritext_version == sample_run.veritext_version
        assert loaded.metrics == sample_run.metrics
        assert loaded.sample_count == sample_run.sample_count
        assert loaded.metadata == sample_run.metadata

    def test_save_duplicate_id_raises(
        self, storage: BenchmarkStorage, sample_run: BenchmarkRun
    ) -> None:
        """Saving the same run id twice is reported as a StorageError."""
        storage.save_run(sample_run)

        # The second save reuses the id that is already stored.
        with pytest.raises(StorageError, match="already exists"):
            storage.save_run(sample_run)

    def test_save_run_empty_metadata(self, storage: BenchmarkStorage) -> None:
        """A run created without metadata loads back with an empty dict."""
        recorded_at = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        bare_run = BenchmarkRun(
            id="run-no-meta",
            benchmark_name="test-suite",
            timestamp=recorded_at,
            veritext_version="0.1.0-dev",
            metrics={"bleu4": 0.5},
            sample_count=10,
        )

        storage.save_run(bare_run)
        loaded = storage.get_latest_run("test-suite")

        assert loaded is not None
        assert loaded.metadata == {}
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetRuns:
    """Checks for BenchmarkStorage.get_runs()."""

    def test_get_runs_empty_database(self, storage: BenchmarkStorage) -> None:
        """An unknown benchmark name yields an empty list."""
        assert storage.get_runs("nonexistent") == []

    def test_get_runs_filters_by_name(self, storage: BenchmarkStorage) -> None:
        """Runs are returned only for the benchmark name asked for."""
        shared_timestamp = datetime(2025, 1, 15, 12, 0, 0, tzinfo=UTC)
        fixtures = (("run-1", "suite-a", 0.5), ("run-2", "suite-b", 0.6))
        for run_id, suite_name, score in fixtures:
            storage.save_run(
                BenchmarkRun(
                    id=run_id,
                    benchmark_name=suite_name,
                    timestamp=shared_timestamp,
                    veritext_version="0.1.0",
                    metrics={"bleu4": score},
                    sample_count=10,
                )
            )

        from_suite_a = storage.get_runs("suite-a")
        from_suite_b = storage.get_runs("suite-b")

        assert len(from_suite_a) == 1
        assert from_suite_a[0].id == "run-1"
        assert len(from_suite_b) == 1
        assert from_suite_b[0].id == "run-2"

    def test_get_runs_ordered_by_timestamp(self, storage: BenchmarkStorage) -> None:
        """Results are sorted newest first regardless of insertion order."""
        # The newer run is inserted first so insertion order and timestamp
        # order disagree.
        fixtures = (("run-new", 20, 0.6), ("run-old", 10, 0.5))
        for run_id, day, score in fixtures:
            storage.save_run(
                BenchmarkRun(
                    id=run_id,
                    benchmark_name="test",
                    timestamp=datetime(2025, 1, day, 12, 0, 0, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": score},
                    sample_count=10,
                )
            )

        ordered = storage.get_runs("test")

        assert ordered[0].id == "run-new"
        assert ordered[1].id == "run-old"

    def test_get_runs_with_limit(self, storage: BenchmarkStorage) -> None:
        """The limit argument caps how many runs come back."""
        for index in range(5):
            storage.save_run(
                BenchmarkRun(
                    id=f"run-{index}",
                    benchmark_name="test",
                    timestamp=datetime(2025, 1, index + 1, 12, 0, 0, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": 0.5 + index * 0.1},
                    sample_count=10,
                )
            )

        capped = storage.get_runs("test", limit=3)
        assert len(capped) == 3
|
||||||
|
|
||||||
|
|
||||||
|
class TestGetLatestRun:
    """Behaviour of ``BenchmarkStorage.get_latest_run``."""

    def test_get_latest_run_empty(self, storage: BenchmarkStorage) -> None:
        """A benchmark name with no stored runs yields ``None``."""
        assert storage.get_latest_run("nonexistent") is None

    def test_get_latest_run(self, storage: BenchmarkStorage) -> None:
        """The run with the most recent timestamp is returned."""
        shared_fields = {
            "benchmark_name": "test",
            "veritext_version": "0.1.0",
            "sample_count": 10,
        }
        older = BenchmarkRun(
            id="run-old",
            timestamp=datetime(2025, 1, 10, 12, 0, 0, tzinfo=UTC),
            metrics={"bleu4": 0.5},
            **shared_fields,
        )
        newer = BenchmarkRun(
            id="run-new",
            timestamp=datetime(2025, 1, 20, 12, 0, 0, tzinfo=UTC),
            metrics={"bleu4": 0.6},
            **shared_fields,
        )

        # Saved in chronological order: older first, then newer.
        for run in (older, newer):
            storage.save_run(run)

        most_recent = storage.get_latest_run("test")
        assert most_recent is not None
        assert most_recent.id == "run-new"
|
||||||
|
|
||||||
|
|
||||||
|
class TestConcurrentAccess:
    """Tests for concurrent database access."""

    def test_concurrent_writes(self, db_path: Path) -> None:
        """Multiple threads can write concurrently with WAL mode.

        Ten worker threads each open their own ``BenchmarkStorage`` on the
        same database file and save one run. The test passes when no worker
        raised and all ten runs can be read back afterwards.
        """
        errors: list[Exception] = []

        def write_run(run_id: int) -> None:
            # An exception raised inside a worker thread would not fail the
            # test on its own, so collect it and assert on the list below.
            try:
                storage = BenchmarkStorage(db_path)
                run = BenchmarkRun(
                    id=f"run-{run_id}",
                    benchmark_name="test",
                    # run_id doubles as the seconds field, giving each run a
                    # distinct timestamp.
                    timestamp=datetime(2025, 1, 15, 12, 0, run_id, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"bleu4": 0.5},
                    sample_count=10,
                )
                storage.save_run(run)
            except Exception as e:
                errors.append(e)

        threads = [threading.Thread(target=write_run, args=(i,)) for i in range(10)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert not errors, f"Concurrent writes failed: {errors}"

        storage = BenchmarkStorage(db_path)
        runs = storage.get_runs("test")
        assert len(runs) == 10

    def test_wal_mode_enabled(self, db_path: Path) -> None:
        """Database uses WAL journal mode.

        Constructs a ``BenchmarkStorage`` on ``db_path`` and then queries the
        journal mode over an independent ``sqlite3`` connection.
        """
        BenchmarkStorage(db_path)

        conn = sqlite3.connect(str(db_path))
        try:
            cursor = conn.execute("PRAGMA journal_mode")
            mode = cursor.fetchone()[0]
        finally:
            # Close even when the query or row access raises, so a failing
            # test does not leave an open handle on the database file.
            conn.close()

        assert mode.lower() == "wal"
|
||||||
1
tests/test_pytest_plugin/__init__.py
Normal file
1
tests/test_pytest_plugin/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Tests for the Veritext pytest plugin."""
|
||||||
32
tests/test_pytest_plugin/conftest.py
Normal file
32
tests/test_pytest_plugin/conftest.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Pytest configuration for pytest_plugin tests."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.pytest_plugin.fixtures import ValidatorFactory
|
||||||
|
|
||||||
|
# Enable the pytester fixture for plugin testing
|
||||||
|
pytest_plugins = ["pytester"]
|
||||||
|
|
||||||
|
# Local definitions of the plugin's fixtures, declared here for testing
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def text_validator() -> ValidatorFactory:
    """Return a newly constructed ``ValidatorFactory`` for building validators."""
    factory = ValidatorFactory()
    return factory
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def validation_context() -> type:
    """Return a helper that builds ``ValidationContext`` objects.

    The helper takes an optional ``reference`` (a string, a list of strings,
    or ``None``) and stores any extra keyword arguments as the context's
    ``metadata`` mapping.

    NOTE(review): the return annotation is ``type``, but the value returned
    is a plain function rather than a class.
    """
    from typing import Any

    from veritext.core.types import ValidationContext

    def _create(
        reference: str | list[str] | None = None,
        **metadata: Any,
    ) -> ValidationContext:
        # Keyword arguments other than ``reference`` become the metadata dict.
        context = ValidationContext(reference=reference, metadata=metadata)
        return context

    return _create
|
||||||
211
tests/test_pytest_plugin/test_assertions.py
Normal file
211
tests/test_pytest_plugin/test_assertions.py
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
"""Tests for the validate_text assertion function."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.pytest_plugin import validate_text
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextBasicValidation:
    """Length-bound checks performed by ``validate_text``."""

    def test_passes_with_valid_length(self) -> None:
        """No error is raised when the text satisfies both length bounds."""
        validate_text(
            "The quick brown fox jumps over the lazy dog.",
            min_length=10,
            max_length=100,
        )

    def test_fails_when_too_short(self) -> None:
        """Text below ``min_length`` raises an ``AssertionError`` mentioning length."""
        too_short = "Short."
        with pytest.raises(AssertionError) as raised:
            validate_text(too_short, min_length=50)
        failure_text = str(raised.value).lower()
        assert "length" in failure_text

    def test_fails_when_too_long(self) -> None:
        """Text above ``max_length`` raises an ``AssertionError`` mentioning length."""
        too_long = "A" * 100
        with pytest.raises(AssertionError) as raised:
            validate_text(too_long, max_length=50)
        failure_text = str(raised.value).lower()
        assert "length" in failure_text
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextReadability:
    """Reading-grade checks performed by ``validate_text``."""

    def test_passes_with_simple_text(self) -> None:
        """Short, plain sentences are accepted under ``max_reading_grade=10.0``."""
        validate_text(
            "The cat sat on the mat. It was a nice day.",
            max_reading_grade=10.0,
        )

    def test_fails_with_complex_text(self) -> None:
        """Dense academic prose is rejected under ``max_reading_grade=3.0``."""
        dense_prose = (
            "The implementation of sophisticated metacognitive strategies "
            "necessitates the comprehensive understanding of epistemological "
            "frameworks and their corresponding methodological implications."
        )
        with pytest.raises(AssertionError) as raised:
            validate_text(dense_prose, max_reading_grade=3.0)
        failure_text = str(raised.value).lower()
        assert "readability" in failure_text
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextPatterns:
    """``must_contain`` / ``must_exclude`` checks performed by ``validate_text``."""

    def test_passes_when_contains_pattern(self) -> None:
        """No error is raised when the required pattern occurs in the text."""
        validate_text(
            "Please contact support@example.com for assistance.",
            must_contain=["support@example.com"],
        )

    def test_fails_when_missing_required_pattern(self) -> None:
        """A missing required pattern raises an error mentioning "contains"."""
        without_address = "Please contact us for assistance."
        with pytest.raises(AssertionError) as raised:
            validate_text(without_address, must_contain=["@example.com"])
        failure_text = str(raised.value).lower()
        assert "contains" in failure_text

    def test_passes_when_excludes_pattern(self) -> None:
        """No error is raised when none of the forbidden patterns occur."""
        validate_text(
            "The report is complete and reviewed.",
            must_exclude=["TODO", "FIXME"],
        )

    def test_fails_when_contains_forbidden_pattern(self) -> None:
        """A forbidden pattern in the text raises an error mentioning "excludes"."""
        with_marker = "The report is almost done. TODO: add conclusion."
        with pytest.raises(AssertionError) as raised:
            validate_text(with_marker, must_exclude=["TODO"])
        failure_text = str(raised.value).lower()
        assert "excludes" in failure_text
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextComparisonMetrics:
    """Reference-based checks (BLEU, ROUGE) performed by ``validate_text``."""

    def test_passes_with_high_bleu_score(self) -> None:
        """Text identical to the reference satisfies ``min_bleu=0.9``."""
        sentence = "The quick brown fox jumps over the lazy dog."
        validate_text(
            sentence,
            reference="The quick brown fox jumps over the lazy dog.",
            min_bleu=0.9,
        )

    def test_fails_with_low_bleu_score(self) -> None:
        """Text unlike the reference fails ``min_bleu=0.5`` with a BLEU message."""
        expected = "The quick brown fox jumps over the lazy dog."
        candidate = "A slow red cat sleeps under the active mouse."
        with pytest.raises(AssertionError) as raised:
            validate_text(candidate, reference=expected, min_bleu=0.5)
        failure_text = str(raised.value).lower()
        assert "bleu" in failure_text

    def test_passes_with_high_rouge_score(self) -> None:
        """A near-paraphrase of the reference satisfies ``min_rouge=0.5``."""
        expected = "Machine learning models require extensive training data."
        candidate = "Machine learning models need extensive training data."
        validate_text(candidate, reference=expected, min_rouge=0.5)

    def test_fails_with_low_rouge_score(self) -> None:
        """Unrelated text fails ``min_rouge=0.5`` with a ROUGE message."""
        expected = "The algorithm processes input data efficiently."
        candidate = "Cats enjoy sleeping in sunny spots."
        with pytest.raises(AssertionError) as raised:
            validate_text(candidate, reference=expected, min_rouge=0.5)
        failure_text = str(raised.value).lower()
        assert "rouge" in failure_text
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextErrorHandling:
    """Argument errors reported by ``validate_text`` as ``ValueError``."""

    def test_raises_value_error_when_no_criteria(self) -> None:
        """Calling with text only, and no criterion, raises ``ValueError``."""
        expected_message = "At least one validation criterion"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text")

    def test_raises_value_error_when_bleu_without_reference(self) -> None:
        """``min_bleu`` without a reference raises ``ValueError``."""
        expected_message = "Reference text required"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text", min_bleu=0.5)

    def test_raises_value_error_when_rouge_without_reference(self) -> None:
        """``min_rouge`` without a reference raises ``ValueError``."""
        expected_message = "Reference text required"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text", min_rouge=0.5)

    def test_raises_value_error_when_semantic_without_reference(self) -> None:
        """``min_semantic`` without a reference raises ``ValueError``."""
        expected_message = "Reference text required"
        with pytest.raises(ValueError, match=expected_message):
            validate_text("Some text", min_semantic=0.5)
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextMultipleCriteria:
    """``validate_text`` called with several criteria at once."""

    def test_passes_all_criteria(self) -> None:
        """No error is raised when the BLEU and length criteria all hold."""
        sentence = "The quick brown fox jumps over the lazy dog."
        criteria = {
            "reference": "The quick brown fox jumps over the lazy dog.",
            "min_bleu": 0.9,
            "min_length": 10,
            "max_length": 100,
        }
        validate_text(sentence, **criteria)

    def test_fails_when_one_criterion_fails(self) -> None:
        """One unmet criterion is enough to raise ``AssertionError``."""
        sentence = "The quick brown fox jumps over the lazy dog."
        criteria = {
            "reference": "The quick brown fox jumps over the lazy dog.",
            "min_bleu": 0.9,
            # The sentence is longer than 10, so this bound is the one that fails.
            "max_length": 10,
        }
        with pytest.raises(AssertionError):
            validate_text(sentence, **criteria)
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextFailureMessage:
    """Content of the ``AssertionError`` message raised by ``validate_text``."""

    def test_failure_message_includes_text_preview(self) -> None:
        """A short input appears verbatim in the failure message."""
        sample = "Short text"
        with pytest.raises(AssertionError) as raised:
            validate_text(sample, min_length=100)
        report = str(raised.value)
        assert "Short text" in report

    def test_failure_message_truncates_long_text(self) -> None:
        """A 200-character input is abbreviated with an ellipsis."""
        sample = "A" * 200
        with pytest.raises(AssertionError) as raised:
            validate_text(sample, max_length=50)
        report = str(raised.value)
        assert "..." in report
        # The full input must not be echoed back in the message.
        assert "A" * 200 not in report

    def test_failure_message_includes_check_details(self) -> None:
        """The message has a "Failed checks:" section naming the length check."""
        sample = "Short"
        with pytest.raises(AssertionError) as raised:
            validate_text(sample, min_length=100)
        report = str(raised.value)
        assert "Failed checks:" in report
        assert "length" in report.lower()
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTextListReference:
    """``validate_text`` called with a list of reference texts."""

    def test_bleu_with_multiple_references(self) -> None:
        """BLEU validation accepts a list of references."""
        candidate = "The quick brown fox jumps over the lazy dog."
        # The first reference is identical to the candidate text.
        alternatives = [
            "The quick brown fox jumps over the lazy dog.",
            "A fast brown fox leaps over a sleepy dog.",
        ]
        validate_text(candidate, reference=alternatives, min_bleu=0.9)

    def test_rouge_with_multiple_references(self) -> None:
        """ROUGE validation accepts a list of references."""
        candidate = "Machine learning models require training data."
        alternatives = [
            "Machine learning requires data.",
            "ML models need training data.",
        ]
        validate_text(candidate, reference=alternatives, min_rouge=0.3)
|
||||||
88
tests/test_pytest_plugin/test_fixtures.py
Normal file
88
tests/test_pytest_plugin/test_fixtures.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
"""Tests for the pytest plugin fixtures."""
|
||||||
|
|
||||||
|
from veritext.core.types import ValidationContext
|
||||||
|
from veritext.pytest_plugin.fixtures import ValidatorFactory
|
||||||
|
from veritext.validators import bleu, length
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidatorFactory:
    """Behaviour of ``ValidatorFactory`` when called directly."""

    def test_creates_validator_from_checks(self) -> None:
        """Calling the factory with checks yields a callable validator."""
        run_checks = ValidatorFactory()(checks=[length(min_chars=5)])

        outcome = run_checks("Hello, World!")
        assert outcome.passed

    def test_validator_uses_provided_reference(self) -> None:
        """A reference given to the factory is used by reference-based checks."""
        expected = "The quick brown fox."
        run_checks = ValidatorFactory()(
            checks=[bleu(min_score=0.5)],
            reference=expected,
        )

        # The input is identical to the reference, so the BLEU check passes.
        outcome = run_checks("The quick brown fox.")
        assert outcome.passed

    def test_validator_returns_validation_result(self) -> None:
        """The result exposes ``passed`` and one named entry per check."""
        run_checks = ValidatorFactory()(checks=[length(min_chars=100)])

        outcome = run_checks("Short")
        assert not outcome.passed
        assert len(outcome.checks) == 1
        assert outcome.checks[0].name == "length"
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextValidatorFixture:
    """Exercise the ``text_validator`` fixture."""

    def test_fixture_returns_factory(self, text_validator: ValidatorFactory) -> None:
        """The fixture value is a ``ValidatorFactory`` instance."""
        assert isinstance(text_validator, ValidatorFactory)

    def test_fixture_can_create_validators(
        self, text_validator: ValidatorFactory
    ) -> None:
        """A validator built from the fixture accepts and rejects as configured."""
        run_checks = text_validator(checks=[length(min_chars=5, max_chars=50)])

        accepted = run_checks("Hello, World!")
        assert accepted.passed
        rejected = run_checks("Hi")
        assert not rejected.passed
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidationContextFixture:
    """Exercise the ``validation_context`` fixture."""

    def test_fixture_creates_context(self, validation_context: type) -> None:
        """The fixture builds a ``ValidationContext`` holding the reference."""
        context = validation_context(reference="Test reference")
        assert isinstance(context, ValidationContext)
        assert context.reference == "Test reference"

    def test_fixture_accepts_metadata(self, validation_context: type) -> None:
        """Extra keyword arguments end up in the context's ``metadata``."""
        context = validation_context(
            reference="Test",
            source="unit_test",
            version=1,
        )
        assert context.metadata["source"] == "unit_test"
        assert context.metadata["version"] == 1

    def test_fixture_allows_no_reference(self, validation_context: type) -> None:
        """Calling the fixture with no arguments leaves ``reference`` as ``None``."""
        context = validation_context()
        assert context.reference is None
|
||||||
100
tests/test_pytest_plugin/test_plugin.py
Normal file
100
tests/test_pytest_plugin/test_plugin.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
"""Tests for the pytest plugin hooks."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def plugin_pytester(pytester: pytest.Pytester) -> pytest.Pytester:
    """Return ``pytester`` with a conftest that loads ``veritext.pytest_plugin``."""
    # The generated conftest.py makes the inner pytest runs load the plugin.
    pytester.makeconftest(
        """
        pytest_plugins = ['veritext.pytest_plugin']
        """
    )
    return pytester
|
||||||
|
|
||||||
|
|
||||||
|
def test_plugin_registers_marker(plugin_pytester: pytest.Pytester) -> None:
    """The plugin registers the ``text_validation`` marker."""
    plugin_pytester.makepyfile(
        """
        import pytest

        @pytest.mark.text_validation
        def test_example():
            pass
        """
    )
    # --strict-markers turns an unregistered marker into an error, so one
    # passing test shows the marker is known to pytest.
    plugin_pytester.runpytest("--strict-markers").assert_outcomes(passed=1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_marker_can_be_used(plugin_pytester: pytest.Pytester) -> None:
    """Selecting with ``-m text_validation`` runs only the marked test."""
    plugin_pytester.makepyfile(
        """
        import pytest

        @pytest.mark.text_validation
        def test_marked():
            pass

        def test_unmarked():
            pass
        """
    )
    # Of the two tests in the module, only the marked one should be run.
    plugin_pytester.runpytest("-m", "text_validation").assert_outcomes(passed=1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_text_is_importable(plugin_pytester: pytest.Pytester) -> None:
    """``validate_text`` is importable from ``veritext.pytest_plugin`` and callable."""
    plugin_pytester.makepyfile(
        """
        from veritext.pytest_plugin import validate_text

        def test_import():
            assert callable(validate_text)
        """
    )
    plugin_pytester.runpytest().assert_outcomes(passed=1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_text_works_in_tests(plugin_pytester: pytest.Pytester) -> None:
    """A generated test that calls ``validate_text`` with met criteria passes."""
    plugin_pytester.makepyfile(
        """
        from veritext.pytest_plugin import validate_text

        def test_validation_passes():
            validate_text(
                "The quick brown fox jumps over the lazy dog.",
                min_length=10,
                max_length=100,
            )
        """
    )
    plugin_pytester.runpytest().assert_outcomes(passed=1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_text_failure_in_tests(plugin_pytester: pytest.Pytester) -> None:
    """A failing ``validate_text`` call fails the test with a readable message."""
    plugin_pytester.makepyfile(
        """
        from veritext.pytest_plugin import validate_text

        def test_validation_fails():
            validate_text(
                "Short",
                min_length=100,
            )
        """
    )
    inner_run = plugin_pytester.runpytest()
    inner_run.assert_outcomes(failed=1)
    # The captured output of the inner run must contain the failure headline.
    inner_run.stdout.fnmatch_lines(["*Text validation failed*"])
|
||||||
Reference in New Issue
Block a user