cli tests
Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
This commit is contained in:
1
tests/test_cli/__init__.py
Normal file
1
tests/test_cli/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""CLI test suite."""
|
||||||
316
tests/test_cli/test_benchmark.py
Normal file
316
tests/test_cli/test_benchmark.py
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
"""Tests for CLI benchmark commands."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from veritext.cli.main import app
|
||||||
|
|
||||||
|
# Module-level Typer test runner shared by every CLI invocation in this file.
runner = CliRunner()
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkRun:
    """Drive ``benchmark run`` through the shared CLI runner."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """A two-sample file with explicit metrics produces a completed summary."""
        samples = tmp_path / "data.jsonl"
        samples.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}'
        )
        store = tmp_path / "benchmarks"
        argv = [
            "benchmark",
            "run",
            "test_bench",
            "-f",
            str(samples),
            "-m",
            "rouge_l,bleu4",
            "-s",
            str(store),
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        expected_fragments = (
            "Benchmark 'test_bench' completed",
            "Samples: 2",
            "rouge_l:",
            "bleu4:",
        )
        for fragment in expected_fragments:
            assert fragment in outcome.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Pointing ``-f`` at a missing file exits with code 1 and reports an error."""
        store = tmp_path / "benchmarks"
        argv = [
            "benchmark",
            "run",
            "test_bench",
            "-f",
            "/nonexistent/file.jsonl",
            "-s",
            str(store),
        ]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """The storage path passed via ``-s`` exists after a successful run."""
        samples = tmp_path / "data.jsonl"
        samples.write_text('{"candidate": "hello", "reference": "hello"}')
        store = tmp_path / "new_benchmarks"
        argv = ["benchmark", "run", "test_bench", "-f", str(samples), "-s", str(store)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert store.exists()
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkShow:
    """Tests for the ``benchmark show`` command."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Invoke ``benchmark run`` ``count`` times, failing fast if a run errors.

        ``benchmark show`` exits with code 0 even when nothing is stored (see
        ``test_benchmark_show_no_runs``), so a silently failing setup would
        otherwise let the tests below pass without exercising any history.

        Args:
            data_file: JSONL file passed to ``-f``.
            storage_path: Benchmark storage location passed to ``-s``.
            count: Number of runs to record under the name ``test_bench``.
        """
        for _ in range(count):
            setup = runner.invoke(
                app,
                [
                    "benchmark",
                    "run",
                    "test_bench",
                    "-f",
                    str(data_file),
                    "-s",
                    str(storage_path),
                ],
            )
            # Surface the CLI output so a broken setup is easy to diagnose.
            assert setup.exit_code == 0, setup.stdout

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Showing an unknown benchmark succeeds and reports that nothing was found."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "nonexistent_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """After two recorded runs the history view is rendered."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        self._record_runs(data_file, storage_path, count=2)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """``--last 2`` is accepted and still renders history for recorded runs."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_runs(data_file, storage_path, count=3)

        result = runner.invoke(
            app,
            [
                "benchmark",
                "show",
                "test_bench",
                "--last",
                "2",
                "-s",
                str(storage_path),
            ],
        )
        assert result.exit_code == 0
        # Exit code alone cannot distinguish "history shown" from "no runs";
        # require the history table so the option is exercised against real data.
        assert "Benchmark History" in result.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkCheck:
    """Drive ``benchmark check`` against runs recorded under ``test_bench``."""

    @staticmethod
    def _record(samples: Path, store: Path) -> None:
        """Record one ``test_bench`` run from ``samples`` into ``store``."""
        runner.invoke(
            app,
            ["benchmark", "run", "test_bench", "-f", str(samples), "-s", str(store)],
        )

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two runs over the same data yield exit code 0 and a no-regression message."""
        samples = tmp_path / "data.jsonl"
        samples.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        store = tmp_path / "benchmarks"

        # Identical input twice, so the second run cannot score lower.
        self._record(samples, store)
        self._record(samples, store)

        outcome = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "-s", str(store)],
        )

        assert outcome.exit_code == 0
        assert "No regression detected" in outcome.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A matching run followed by a mismatching one exits 1 with a regression message."""
        store = tmp_path / "benchmarks"

        # Baseline: candidate equals reference.
        good_samples = tmp_path / "good.jsonl"
        good_samples.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record(good_samples, store)

        # Follow-up: candidate shares no words with the reference.
        bad_samples = tmp_path / "bad.jsonl"
        bad_samples.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record(bad_samples, store)

        outcome = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "-t", "0.05", "-s", str(store)],
        )

        assert outcome.exit_code == 1
        assert "Regression detected" in outcome.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """``--tolerance 0.10`` is echoed as ``10.00%`` in the check output."""
        samples = tmp_path / "data.jsonl"
        samples.write_text('{"candidate": "hello", "reference": "hello"}')
        store = tmp_path / "benchmarks"

        self._record(samples, store)

        outcome = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "--tolerance", "0.10", "-s", str(store)],
        )

        assert outcome.exit_code == 0
        assert "10.00%" in outcome.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestBenchmarkHelp:
    """Check the ``--help`` text of the benchmark command group and its subcommands."""

    def test_benchmark_help(self) -> None:
        """The group help lists the run, show and check subcommands."""
        outcome = runner.invoke(app, ["benchmark", "--help"])

        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``benchmark run --help`` mentions ``--file`` and ``--metrics``."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])

        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``benchmark show --help`` mentions ``--last``."""
        outcome = runner.invoke(app, ["benchmark", "show", "--help"])

        assert outcome.exit_code == 0
        assert "--last" in outcome.stdout

    def test_benchmark_check_help(self) -> None:
        """``benchmark check --help`` mentions ``--tolerance`` and ``--window``."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])

        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout
|
||||||
118
tests/test_cli/test_formatters.py
Normal file
118
tests/test_cli/test_formatters.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
"""Tests for CLI output formatters."""
|
||||||
|
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
|
||||||
|
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||||
|
from veritext.cli.formatters import (
|
||||||
|
format_benchmark_history,
|
||||||
|
format_regression_report,
|
||||||
|
format_validation_json,
|
||||||
|
format_validation_simple,
|
||||||
|
format_validation_table,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatValidationTable:
    """Row-count and title checks for ``format_validation_table``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping yields a titled table with no rows."""
        rendered = format_validation_table({})

        assert rendered.title == "Validation Results"
        assert rendered.row_count == 0

    def test_format_single_metric(self) -> None:
        """One metric produces exactly one row."""
        scores = {"bleu4": 0.8523}

        rendered = format_validation_table(scores)

        assert rendered.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Three metrics produce three rows."""
        scores = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}

        rendered = format_validation_table(scores)

        assert rendered.row_count == 3

    def test_format_with_threshold(self) -> None:
        """Passing a threshold still produces one row per metric."""
        scores = {"bleu4": 0.85, "rouge_l": 0.45}

        rendered = format_validation_table(scores, threshold=0.5)

        # Intended layout is 3 columns (Metric, Score, Status); only the row
        # count is verified here.
        assert rendered.row_count == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatValidationJson:
    """String-level checks for ``format_validation_json``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping serialises to ``{}``."""
        serialised = format_validation_json({})

        assert serialised == "{}"

    def test_format_results(self) -> None:
        """Each metric appears as a ``"name": value`` fragment in the output."""
        scores = {"bleu4": 0.85, "rouge_l": 0.92}

        serialised = format_validation_json(scores)

        for fragment in ('"bleu4": 0.85', '"rouge_l": 0.92'):
            assert fragment in serialised
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatValidationSimple:
    """String-level checks for ``format_validation_simple``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as the empty string."""
        text = format_validation_simple({})

        assert text == ""

    def test_format_results(self) -> None:
        """Each metric appears as a ``name: value`` fragment in the output."""
        scores = {"bleu4": 0.8523, "rouge_l": 0.9234}

        text = format_validation_simple(scores)

        for fragment in ("bleu4: 0.8523", "rouge_l: 0.9234"):
            assert fragment in text
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatBenchmarkHistory:
    """Title and row-count checks for ``format_benchmark_history``."""

    def test_format_empty_history(self) -> None:
        """An empty run list still yields a table titled ``Benchmark History``."""
        rendered = format_benchmark_history([])

        assert rendered.title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """A single run produces a single row."""
        only_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=datetime(2024, 1, 15, 10, 30, tzinfo=UTC),
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )

        rendered = format_benchmark_history([only_run])

        assert rendered.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs on consecutive days produce three rows."""
        history = []
        for i in range(3):
            history.append(
                BenchmarkRun(
                    id=f"test-id-{i}",
                    benchmark_name="test",
                    timestamp=datetime(2024, 1, i + 1, 10, 30, tzinfo=UTC),
                    veritext_version="0.1.0",
                    metrics={"rouge_l": 0.8 + i * 0.01},
                    sample_count=100,
                )
            )

        rendered = format_benchmark_history(history)

        assert rendered.row_count == 3
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatRegressionReport:
    """Title and border-colour checks for ``format_regression_report``."""

    def test_format_no_regression(self) -> None:
        """A report with ``detected=False`` is rendered with a green border."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )

        rendered = format_regression_report(clean_report)

        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with ``detected=True`` is rendered with a red border."""
        regressed_report = RegressionReport(
            detected=True,
            baseline={"rouge_l": 0.85, "bleu4": 0.72},
            current={"rouge_l": 0.70, "bleu4": 0.70},
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )

        rendered = format_regression_report(regressed_report)

        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"
|
||||||
126
tests/test_cli/test_readers.py
Normal file
126
tests/test_cli/test_readers.py
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
"""Tests for CLI input readers."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextPair:
    """Construction check for ``TextPair``."""

    def test_create_text_pair(self) -> None:
        """Keyword arguments are exposed unchanged as attributes."""
        sample = TextPair(candidate="hello", reference="world")

        assert sample.candidate == "hello"
        assert sample.reference == "world"
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadJsonl:
    """Happy-path and error-path checks for ``read_jsonl``."""

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two well-formed records are returned in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        source = tmp_path / "data.jsonl"
        encoded_lines = [json.dumps(record) for record in records]
        source.write_text("\n".join(encoded_lines))

        loaded = read_jsonl(source)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "bar"
        assert second.candidate == "baz"
        assert second.reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """An empty file yields an empty list."""
        source = tmp_path / "empty.jsonl"
        source.write_text("")

        loaded = read_jsonl(source)

        assert loaded == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """A blank line between two records does not change the record count."""
        source = tmp_path / "data.jsonl"
        body = '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
        source.write_text(body)

        loaded = read_jsonl(source)

        assert len(loaded) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A missing path raises ``FileNotFoundError``."""
        missing = tmp_path / "nonexistent.jsonl"
        with pytest.raises(FileNotFoundError):
            read_jsonl(missing)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Unparseable content raises ``ValueError`` naming line 1."""
        source = tmp_path / "invalid.jsonl"
        source.write_text("not valid json")

        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(source)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without ``candidate`` raises ``ValueError`` naming the key and line."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"reference": "bar"}')

        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(source)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without ``reference`` raises ``ValueError`` naming the key and line."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"candidate": "foo"}')

        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(source)
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadPairedJsonl:
    """Happy-path and error-path checks for ``read_paired_jsonl``."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Records from the two files are paired up position by position."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        loaded = read_paired_jsonl(cand_path, ref_path)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "baz"
        assert second.candidate == "bar"
        assert second.reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Two candidates against one reference raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file raises ``FileNotFoundError`` naming that file role."""
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(missing, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file raises ``FileNotFoundError`` naming that file role."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, missing)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidate record lacking ``text`` raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"value": "foo"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)
|
||||||
214
tests/test_cli/test_validate.py
Normal file
214
tests/test_cli/test_validate.py
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
"""Tests for CLI validate command."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from veritext.cli.main import app
|
||||||
|
|
||||||
|
# Module-level Typer test runner shared by every CLI invocation in this file.
runner = CliRunner()
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateInline:
    """Drive ``validate`` with the candidate text given on the command line."""

    @staticmethod
    def _validate(*cli_args: str):
        """Invoke ``validate`` with ``cli_args`` and return the runner result."""
        return runner.invoke(app, ["validate", *cli_args])

    def test_validate_inline_basic(self) -> None:
        """The ``bleu`` metric selection reports a ``bleu4`` score."""
        outcome = self._validate(
            "The quick brown fox jumps",
            "-r",
            "The quick brown fox jumps",
            "-m",
            "bleu",
        )

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """The ``rouge`` metric selection reports a ``rouge_l`` score."""
        outcome = self._validate(
            "hello world today",
            "-r",
            "hello world here",
            "-m",
            "rouge",
        )

        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """The ``lexical`` metric selection reports jaccard and token_overlap."""
        outcome = self._validate(
            "hello world",
            "-r",
            "hello everyone",
            "-m",
            "lexical",
        )

        assert outcome.exit_code == 0
        for metric_name in ("jaccard", "token_overlap"):
            assert metric_name in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """``-o json`` emits parseable JSON containing a ``bleu4`` key."""
        outcome = self._validate(
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-o",
            "json",
        )

        assert outcome.exit_code == 0
        payload = json.loads(outcome.stdout)
        assert "bleu4" in payload

    def test_validate_inline_simple_output(self) -> None:
        """``-o simple`` emits a ``rouge_l:`` line."""
        outcome = self._validate(
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "rouge",
            "-o",
            "simple",
        )

        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting the reference exits with code 1 and reports an error."""
        outcome = self._validate("hello world", "-m", "bleu")

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name exits with code 1 and is called out."""
        outcome = self._validate("hello", "-r", "world", "-m", "invalid_metric")

        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateFile:
    """Drive ``validate`` with input read from JSONL files."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A two-record combined file is scored and the pair count is reported."""
        samples = tmp_path / "data.jsonl"
        samples.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
        )
        argv = ["validate", "-f", str(samples), "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for fragment in ("bleu4", "Evaluated 2 text pairs"):
            assert fragment in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """Pointing ``-f`` at a missing file exits with code 1 and reports an error."""
        argv = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate (``-f``) and reference (``-R``) files are evaluated together."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )
        ref_path.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )
        argv = ["validate", "-f", str(cand_path), "-R", str(ref_path), "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateOptions:
    """Drive ``validate`` with its threshold, output-format and multi-metric options."""

    @staticmethod
    def _validate(*cli_args: str):
        """Invoke ``validate`` with ``cli_args`` and return the runner result."""
        return runner.invoke(app, ["validate", *cli_args])

    def test_validate_with_threshold(self) -> None:
        """``-t 0.5`` on identical texts succeeds and shows a status marker."""
        outcome = self._validate(
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-t",
            "0.5",
        )

        assert outcome.exit_code == 0
        # Either the column header or a PASS cell is accepted as evidence
        # that the threshold was applied to the table output.
        assert any(marker in outcome.stdout for marker in ("Status", "PASS"))

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits with code 1 and is called out."""
        outcome = self._validate(
            "hello",
            "-r",
            "world",
            "-m",
            "bleu",
            "-o",
            "invalid",
        )

        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated metric list reports a score from each selection."""
        outcome = self._validate(
            "The quick brown fox",
            "-r",
            "The quick brown fox",
            "-m",
            "bleu,rouge,lexical",
        )

        assert outcome.exit_code == 0
        for metric_name in ("bleu4", "rouge_l", "jaccard"):
            assert metric_name in outcome.stdout
|
||||||
Reference in New Issue
Block a user