Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
119 lines
3.9 KiB
Python
119 lines
3.9 KiB
Python
"""Tests for CLI output formatters."""
|
|
|
|
from datetime import UTC, datetime
|
|
|
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
|
from veritext.cli.formatters import (
|
|
format_benchmark_history,
|
|
format_regression_report,
|
|
format_validation_json,
|
|
format_validation_simple,
|
|
format_validation_table,
|
|
)
|
|
|
|
|
|
class TestFormatValidationTable:
    """Tests for ``format_validation_table``."""

    def test_format_empty_results(self) -> None:
        """An empty result mapping yields a titled table with no rows."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """A single metric produces exactly one row."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Each metric in the mapping produces its own row."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Passing a threshold keeps one row per metric and adds a status column."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        # Should have 3 columns: Metric, Score, Status.
        # NOTE(review): assumes the formatter returns a rich ``Table`` (it
        # already exposes ``title``/``row_count``), whose ``columns`` list
        # holds one entry per column -- confirm against the formatter.
        assert len(table.columns) == 3
        assert table.row_count == 2
|
|
|
|
|
|
class TestFormatValidationJson:
    """Tests for ``format_validation_json``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as the literal string ``{}``."""
        assert format_validation_json({}) == "{}"

    def test_format_results(self) -> None:
        """Every metric shows up in the output as a ``"name": value`` pair."""
        rendered = format_validation_json({"bleu4": 0.85, "rouge_l": 0.92})
        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in rendered
|
|
|
|
|
|
class TestFormatValidationSimple:
    """Tests for ``format_validation_simple``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping is rendered as an empty string."""
        assert format_validation_simple({}) == ""

    def test_format_results(self) -> None:
        """Every metric shows up in the output as a ``name: value`` pair."""
        rendered = format_validation_simple({"bleu4": 0.8523, "rouge_l": 0.9234})
        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in rendered
|
|
|
|
|
|
class TestFormatBenchmarkHistory:
    """Tests for ``format_benchmark_history``."""

    def test_format_empty_history(self) -> None:
        """An empty run list still yields a table with the expected title."""
        assert format_benchmark_history([]).title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """A single benchmark run is rendered as one row."""
        only_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=datetime(2024, 1, 15, 10, 30, tzinfo=UTC),
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )
        history_table = format_benchmark_history([only_run])
        assert history_table.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three benchmark runs are rendered as three rows."""
        history = []
        # Build three runs on consecutive days with a slightly rising score.
        for i in range(3):
            history.append(
                BenchmarkRun(
                    id=f"test-id-{i}",
                    benchmark_name="test",
                    timestamp=datetime(2024, 1, i + 1, 10, 30, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"rouge_l": 0.8 + i * 0.01},
                    sample_count=100,
                )
            )
        history_table = format_benchmark_history(history)
        assert history_table.row_count == 3
|
|
|
|
|
|
class TestFormatRegressionReport:
    """Tests for ``format_regression_report``."""

    def test_format_no_regression(self) -> None:
        """A report with no regression detected is rendered with a green border."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )
        rendered = format_regression_report(clean_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with a regression detected is rendered with a red border."""
        regressed_report = RegressionReport(
            detected=True,
            baseline={"rouge_l": 0.85, "bleu4": 0.72},
            current={"rouge_l": 0.70, "bleu4": 0.70},
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )
        rendered = format_regression_report(regressed_report)
        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"
|