cli tests
Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
This commit is contained in:
1
tests/test_cli/__init__.py
Normal file
1
tests/test_cli/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""CLI test suite."""
|
||||
316
tests/test_cli/test_benchmark.py
Normal file
316
tests/test_cli/test_benchmark.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Tests for CLI benchmark commands."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from veritext.cli.main import app
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
class TestBenchmarkRun:
    """Exercise ``benchmark run`` through the Typer test runner."""

    def test_benchmark_run_basic(self, tmp_path: Path) -> None:
        """Two samples with explicit metrics: summary names the run, count and metrics."""
        samples = [
            '{"candidate": "hello world today", "reference": "hello world today"}',
            '{"candidate": "foo bar baz qux", "reference": "foo bar baz qux"}',
        ]
        dataset = tmp_path / "data.jsonl"
        dataset.write_text("\n".join(samples))
        store_dir = tmp_path / "benchmarks"

        argv = ["benchmark", "run", "test_bench"]
        argv += ["-f", str(dataset)]
        argv += ["-m", "rouge_l,bleu4"]
        argv += ["-s", str(store_dir)]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "Benchmark 'test_bench' completed" in outcome.stdout
        assert "Samples: 2" in outcome.stdout
        assert "rouge_l:" in outcome.stdout
        assert "bleu4:" in outcome.stdout

    def test_benchmark_run_file_not_found(self, tmp_path: Path) -> None:
        """Pointing ``-f`` at a path that does not exist exits 1 with an error."""
        store_dir = tmp_path / "benchmarks"
        argv = ["benchmark", "run", "test_bench", "-f", "/nonexistent/file.jsonl"]
        argv += ["-s", str(store_dir)]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_benchmark_run_creates_storage(self, tmp_path: Path) -> None:
        """The storage path is present after a successful run that started without it."""
        dataset = tmp_path / "data.jsonl"
        dataset.write_text('{"candidate": "hello", "reference": "hello"}')
        store_dir = tmp_path / "new_benchmarks"  # deliberately not created beforehand

        argv = ["benchmark", "run", "test_bench", "-f", str(dataset), "-s", str(store_dir)]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert store_dir.exists()
|
||||
|
||||
|
||||
class TestBenchmarkShow:
    """Exercise ``benchmark show`` through the Typer test runner."""

    @staticmethod
    def _record_runs(data_file: Path, storage_path: Path, count: int) -> None:
        """Invoke ``benchmark run test_bench`` ``count`` times as test setup.

        Each setup invocation is asserted to succeed so that a broken ``run``
        command is reported here, at its source, instead of surfacing as a
        confusing failure in the ``show`` assertions that follow.

        Args:
            data_file: JSONL dataset passed via ``-f``.
            storage_path: Benchmark storage location passed via ``-s``.
            count: Number of consecutive runs to record.
        """
        argv = ["benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path)]
        for attempt in range(1, count + 1):
            setup = runner.invoke(app, argv)
            assert setup.exit_code == 0, f"setup run {attempt} failed: {setup.output}"

    def test_benchmark_show_no_runs(self, tmp_path: Path) -> None:
        """Showing a benchmark with no recorded runs exits 0 with a notice."""
        storage_path = tmp_path / "benchmarks"
        storage_path.mkdir()

        result = runner.invoke(
            app,
            ["benchmark", "show", "nonexistent_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "No benchmark runs found" in result.stdout

    def test_benchmark_show_with_runs(self, tmp_path: Path) -> None:
        """After two recorded runs, ``show`` prints the history heading."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello world", "reference": "hello world"}')
        storage_path = tmp_path / "benchmarks"

        self._record_runs(data_file, storage_path, count=2)

        result = runner.invoke(
            app,
            ["benchmark", "show", "test_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "Benchmark History" in result.stdout

    def test_benchmark_show_limit(self, tmp_path: Path) -> None:
        """``--last 2`` is accepted when three runs have been recorded."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_runs(data_file, storage_path, count=3)

        result = runner.invoke(
            app,
            ["benchmark", "show", "test_bench", "--last", "2", "-s", str(storage_path)],
        )

        # NOTE(review): only the exit code is checked; the output is not inspected
        # to confirm that exactly two runs are listed. The row format is not
        # visible from this test module, so no stricter assertion is added here.
        assert result.exit_code == 0
|
||||
|
||||
|
||||
class TestBenchmarkCheck:
    """Exercise ``benchmark check`` through the Typer test runner."""

    @staticmethod
    def _record_run(data_file: Path, storage_path: Path) -> None:
        """Invoke ``benchmark run test_bench`` once as test setup.

        The setup invocation is asserted to succeed so that a failing ``run``
        is reported directly rather than as a misleading ``check`` failure.

        Args:
            data_file: JSONL dataset passed via ``-f``.
            storage_path: Benchmark storage location passed via ``-s``.
        """
        setup = runner.invoke(
            app,
            ["benchmark", "run", "test_bench", "-f", str(data_file), "-s", str(storage_path)],
        )
        assert setup.exit_code == 0, f"setup run failed: {setup.output}"

    def test_benchmark_check_no_regression(self, tmp_path: Path) -> None:
        """Two runs over identical data produce a clean check (exit 0)."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        storage_path = tmp_path / "benchmarks"

        # Same data twice, so the second run cannot be worse than the first.
        for _ in range(2):
            self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "No regression detected" in result.stdout

    def test_benchmark_check_with_regression(self, tmp_path: Path) -> None:
        """A matching run followed by a mismatching run fails the check (exit 1)."""
        storage_path = tmp_path / "benchmarks"

        # First run: candidate equals reference.
        good_file = tmp_path / "good.jsonl"
        good_file.write_text(
            '{"candidate": "hello world today", "reference": "hello world today"}'
        )
        self._record_run(good_file, storage_path)

        # Second run: candidate shares no words with the reference.
        bad_file = tmp_path / "bad.jsonl"
        bad_file.write_text(
            '{"candidate": "completely different", "reference": "hello world today"}'
        )
        self._record_run(bad_file, storage_path)

        result = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "-t", "0.05", "-s", str(storage_path)],
        )

        assert result.exit_code == 1
        assert "Regression detected" in result.stdout

    def test_benchmark_check_custom_tolerance(self, tmp_path: Path) -> None:
        """``--tolerance 0.10`` is echoed back as ``10.00%`` in the report."""
        data_file = tmp_path / "data.jsonl"
        data_file.write_text('{"candidate": "hello", "reference": "hello"}')
        storage_path = tmp_path / "benchmarks"

        self._record_run(data_file, storage_path)

        result = runner.invoke(
            app,
            ["benchmark", "check", "test_bench", "--tolerance", "0.10", "-s", str(storage_path)],
        )

        assert result.exit_code == 0
        assert "10.00%" in result.stdout
|
||||
|
||||
|
||||
class TestBenchmarkHelp:
    """Check that ``--help`` output advertises the expected commands and options."""

    def test_benchmark_help(self) -> None:
        """The group help mentions each of the three subcommands."""
        outcome = runner.invoke(app, ["benchmark", "--help"])

        assert outcome.exit_code == 0
        for subcommand in ("run", "show", "check"):
            assert subcommand in outcome.stdout

    def test_benchmark_run_help(self) -> None:
        """``run --help`` mentions the file and metrics options."""
        outcome = runner.invoke(app, ["benchmark", "run", "--help"])

        assert outcome.exit_code == 0
        for option in ("--file", "--metrics"):
            assert option in outcome.stdout

    def test_benchmark_show_help(self) -> None:
        """``show --help`` mentions the ``--last`` option."""
        outcome = runner.invoke(app, ["benchmark", "show", "--help"])

        assert outcome.exit_code == 0
        assert "--last" in outcome.stdout

    def test_benchmark_check_help(self) -> None:
        """``check --help`` mentions the tolerance and window options."""
        outcome = runner.invoke(app, ["benchmark", "check", "--help"])

        assert outcome.exit_code == 0
        for option in ("--tolerance", "--window"):
            assert option in outcome.stdout
|
||||
118
tests/test_cli/test_formatters.py
Normal file
118
tests/test_cli/test_formatters.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Tests for CLI output formatters."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from veritext.benchmark.models import BenchmarkRun, RegressionReport
|
||||
from veritext.cli.formatters import (
|
||||
format_benchmark_history,
|
||||
format_regression_report,
|
||||
format_validation_json,
|
||||
format_validation_simple,
|
||||
format_validation_table,
|
||||
)
|
||||
|
||||
|
||||
class TestFormatValidationTable:
    """Tests for ``format_validation_table``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping yields a titled table with no rows."""
        table = format_validation_table({})
        assert table.title == "Validation Results"
        assert table.row_count == 0

    def test_format_single_metric(self) -> None:
        """One metric yields one row."""
        results = {"bleu4": 0.8523}
        table = format_validation_table(results)
        assert table.row_count == 1

    def test_format_multiple_metrics(self) -> None:
        """Three metrics yield three rows."""
        results = {"bleu4": 0.85, "rouge_l": 0.92, "jaccard": 0.75}
        table = format_validation_table(results)
        assert table.row_count == 3

    def test_format_with_threshold(self) -> None:
        """With a threshold, the table has one row per metric and three columns."""
        results = {"bleu4": 0.85, "rouge_l": 0.45}
        table = format_validation_table(results, threshold=0.5)
        assert table.row_count == 2
        # Should have 3 columns: Metric, Score, Status. The expectation was
        # previously stated here but never asserted.
        assert len(table.columns) == 3
|
||||
|
||||
|
||||
class TestFormatValidationJson:
    """Tests for ``format_validation_json``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as the literal string ``{}``."""
        rendered = format_validation_json({})

        assert rendered == "{}"

    def test_format_results(self) -> None:
        """Each metric appears as a ``"name": value`` fragment in the output."""
        scores = {"bleu4": 0.85, "rouge_l": 0.92}

        rendered = format_validation_json(scores)

        assert '"bleu4": 0.85' in rendered
        assert '"rouge_l": 0.92' in rendered
|
||||
|
||||
|
||||
class TestFormatValidationSimple:
    """Tests for ``format_validation_simple``."""

    def test_format_empty_results(self) -> None:
        """An empty mapping renders as the empty string."""
        rendered = format_validation_simple({})

        assert rendered == ""

    def test_format_results(self) -> None:
        """Each metric appears as a ``name: value`` fragment in the output."""
        scores = {"bleu4": 0.8523, "rouge_l": 0.9234}

        rendered = format_validation_simple(scores)

        assert "bleu4: 0.8523" in rendered
        assert "rouge_l: 0.9234" in rendered
|
||||
|
||||
|
||||
class TestFormatBenchmarkHistory:
    """Tests for ``format_benchmark_history``."""

    def test_format_empty_history(self) -> None:
        """An empty run list still yields a table with the history title."""
        rendered = format_benchmark_history([])

        assert rendered.title == "Benchmark History"

    def test_format_single_run(self) -> None:
        """A single run yields a single row."""
        recorded_at = datetime(2024, 1, 15, 10, 30, tzinfo=UTC)
        only_run = BenchmarkRun(
            id="test-id",
            benchmark_name="test",
            timestamp=recorded_at,
            veritext_version="0.1.0",
            metrics={"rouge_l": 0.85, "bleu4": 0.72},
            sample_count=100,
        )

        rendered = format_benchmark_history([only_run])

        assert rendered.row_count == 1

    def test_format_multiple_runs(self) -> None:
        """Three runs on consecutive days yield three rows."""
        # Fields that are identical across the three generated runs.
        shared = {
            "benchmark_name": "test",
            "veritext_version": "0.1.0",
            "sample_count": 100,
        }
        history = [
            BenchmarkRun(
                id=f"test-id-{n}",
                timestamp=datetime(2024, 1, n + 1, 10, 30, tzinfo=UTC),
                metrics={"rouge_l": 0.8 + n * 0.01},
                **shared,
            )
            for n in range(3)
        ]

        rendered = format_benchmark_history(history)

        assert rendered.row_count == 3
|
||||
|
||||
|
||||
class TestFormatRegressionReport:
    """Tests for ``format_regression_report``."""

    def test_format_no_regression(self) -> None:
        """A report with ``detected=False`` renders a green-bordered panel."""
        clean_report = RegressionReport(
            detected=False,
            baseline={"rouge_l": 0.85},
            current={"rouge_l": 0.86},
            deltas={"rouge_l": 0.01},
            tolerance=0.05,
        )

        rendered = format_regression_report(clean_report)

        assert rendered.title == "Regression Check"
        assert rendered.border_style == "green"

    def test_format_with_regression(self) -> None:
        """A report with ``detected=True`` renders a red-bordered panel."""
        failing_report = RegressionReport(
            detected=True,
            baseline={"rouge_l": 0.85, "bleu4": 0.72},
            current={"rouge_l": 0.70, "bleu4": 0.70},
            deltas={"rouge_l": -0.15, "bleu4": -0.02},
            tolerance=0.05,
        )

        rendered = format_regression_report(failing_report)

        assert rendered.title == "Regression Check"
        assert rendered.border_style == "red"
|
||||
126
tests/test_cli/test_readers.py
Normal file
126
tests/test_cli/test_readers.py
Normal file
@@ -0,0 +1,126 @@
|
||||
"""Tests for CLI input readers."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl
|
||||
|
||||
|
||||
class TestTextPair:
    """Tests for the ``TextPair`` container."""

    def test_create_text_pair(self) -> None:
        """Keyword arguments are exposed as same-named attributes."""
        sample = TextPair(candidate="hello", reference="world")

        assert sample.candidate == "hello"
        assert sample.reference == "world"
|
||||
|
||||
|
||||
class TestReadJsonl:
    """Tests for ``read_jsonl``."""

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Two well-formed records are returned in file order."""
        records = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        source = tmp_path / "data.jsonl"
        source.write_text("\n".join(json.dumps(record) for record in records))

        loaded = read_jsonl(source)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "bar"
        assert second.candidate == "baz"
        assert second.reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """A zero-byte file yields an empty list."""
        source = tmp_path / "empty.jsonl"
        source.write_text("")

        loaded = read_jsonl(source)

        assert loaded == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """A blank line between records does not count as a record."""
        source = tmp_path / "data.jsonl"
        first_line = '{"candidate": "a", "reference": "b"}'
        second_line = '{"candidate": "c", "reference": "d"}'
        # One empty line between the records, plus a trailing newline.
        source.write_text(first_line + "\n\n" + second_line + "\n")

        loaded = read_jsonl(source)

        assert len(loaded) == 2

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """A missing path raises ``FileNotFoundError``."""
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError):
            read_jsonl(missing)

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Unparseable content raises ``ValueError`` naming line 1."""
        source = tmp_path / "invalid.jsonl"
        source.write_text("not valid json")

        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(source)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """A record without ``candidate`` raises ``ValueError`` naming the key."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"reference": "bar"}')

        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(source)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """A record without ``reference`` raises ``ValueError`` naming the key."""
        source = tmp_path / "data.jsonl"
        source.write_text('{"candidate": "foo"}')

        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(source)
|
||||
|
||||
|
||||
class TestReadPairedJsonl:
    """Tests for ``read_paired_jsonl``."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Records at the same position in both files are paired together."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        loaded = read_paired_jsonl(cand_path, ref_path)

        assert len(loaded) == 2
        first, second = loaded[0], loaded[1]
        assert first.candidate == "foo"
        assert first.reference == "baz"
        assert second.candidate == "bar"
        assert second.reference == "qux"

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Two candidates against one reference raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """A missing candidates file raises ``FileNotFoundError`` naming it."""
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(missing, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """A missing references file raises ``FileNotFoundError`` naming it."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')
        missing = tmp_path / "nonexistent.jsonl"

        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, missing)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidate record lacking ``text`` raises ``ValueError``."""
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text('{"value": "foo"}')  # wrong key on purpose
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)
|
||||
214
tests/test_cli/test_validate.py
Normal file
214
tests/test_cli/test_validate.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""Tests for CLI validate command."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from veritext.cli.main import app
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
class TestValidateInline:
    """Exercise ``validate`` with candidate and reference given on the command line."""

    def test_validate_inline_basic(self) -> None:
        """The ``bleu`` metric group reports a ``bleu4`` score."""
        sentence = "The quick brown fox jumps"
        argv = ["validate", sentence, "-r", sentence, "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """The ``rouge`` metric group reports a ``rouge_l`` score."""
        argv = ["validate", "hello world today", "-r", "hello world here", "-m", "rouge"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """The ``lexical`` metric group reports jaccard and token overlap."""
        argv = ["validate", "hello world", "-r", "hello everyone", "-m", "lexical"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for metric in ("jaccard", "token_overlap"):
            assert metric in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """``-o json`` prints parseable JSON containing ``bleu4``."""
        sentence = "hello world today"
        argv = ["validate", sentence, "-r", sentence, "-m", "bleu", "-o", "json"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        parsed = json.loads(outcome.stdout)
        assert "bleu4" in parsed

    def test_validate_inline_simple_output(self) -> None:
        """``-o simple`` prints the metric as a ``name:`` line."""
        sentence = "hello world today"
        argv = ["validate", sentence, "-r", sentence, "-m", "rouge", "-o", "simple"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting ``-r`` for inline input exits 1 with an error."""
        argv = ["validate", "hello world", "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised metric name exits 1 and is reported as unknown."""
        argv = ["validate", "hello", "-r", "world", "-m", "invalid_metric"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
|
||||
|
||||
|
||||
class TestValidateFile:
    """Exercise ``validate`` with input read from JSONL files."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A combined file with two records is evaluated as two pairs."""
        records = [
            '{"candidate": "hello world today", "reference": "hello world today"}',
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}',
        ]
        dataset = tmp_path / "data.jsonl"
        dataset.write_text("\n".join(records))

        outcome = runner.invoke(app, ["validate", "-f", str(dataset), "-m", "bleu"])

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "Evaluated 2 text pairs" in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """Pointing ``-f`` at a path that does not exist exits 1 with an error."""
        argv = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate (``-f``) and reference (``-R``) files are paired up."""
        # Both files carry the same two records.
        payload = '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        cand_path = tmp_path / "candidates.jsonl"
        ref_path = tmp_path / "references.jsonl"
        cand_path.write_text(payload)
        ref_path.write_text(payload)

        argv = ["validate", "-f", str(cand_path), "-R", str(ref_path), "-m", "bleu"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
|
||||
|
||||
|
||||
class TestValidateOptions:
    """Exercise the threshold, output-format and multi-metric options of ``validate``."""

    def test_validate_with_threshold(self) -> None:
        """With ``-t``, the output shows a status heading or a PASS marker."""
        sentence = "hello world today"
        argv = ["validate", sentence, "-r", sentence, "-m", "bleu", "-t", "0.5"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        # Either marker is accepted as evidence that the threshold was applied.
        assert any(marker in outcome.stdout for marker in ("Status", "PASS"))

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits 1 and is reported as invalid."""
        argv = ["validate", "hello", "-r", "world", "-m", "bleu", "-o", "invalid"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated metric list reports a score from every group."""
        sentence = "The quick brown fox"
        argv = ["validate", sentence, "-r", sentence, "-m", "bleu,rouge,lexical"]

        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        for metric in ("bleu4", "rouge_l", "jaccard"):
            assert metric in outcome.stdout
|
||||
Reference in New Issue
Block a user