Add comprehensive test suite for validate command, benchmark commands, input readers, and output formatters using Typer CliRunner.
215 lines
5.8 KiB
Python
215 lines
5.8 KiB
Python
"""Tests for CLI validate command."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from typer.testing import CliRunner
|
|
|
|
from veritext.cli.main import app
|
|
|
|
# Module-level CliRunner reused by every test below to invoke `app` in-process.
runner = CliRunner()
|
|
|
|
|
|
class TestValidateInline:
    """Run ``validate`` with the candidate text passed as a positional argument."""

    def test_validate_inline_basic(self) -> None:
        """Identical texts with ``-m bleu`` exit 0 and print ``bleu4``."""
        argv = [
            "validate",
            "The quick brown fox jumps",
            "-r",
            "The quick brown fox jumps",
            "-m",
            "bleu",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout

    def test_validate_inline_with_rouge(self) -> None:
        """Partially overlapping texts with ``-m rouge`` exit 0 and print ``rouge_l``."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world here",
            "-m",
            "rouge",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l" in outcome.stdout

    def test_validate_inline_with_lexical(self) -> None:
        """``-m lexical`` exits 0 and prints both ``jaccard`` and ``token_overlap``."""
        argv = [
            "validate",
            "hello world",
            "-r",
            "hello everyone",
            "-m",
            "lexical",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "jaccard" in outcome.stdout
        assert "token_overlap" in outcome.stdout

    def test_validate_inline_json_output(self) -> None:
        """With ``-o json`` the stdout parses as JSON and contains a ``bleu4`` key."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-o",
            "json",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        payload = json.loads(outcome.stdout)
        assert "bleu4" in payload

    def test_validate_inline_simple_output(self) -> None:
        """With ``-o simple`` the stdout contains a ``rouge_l:`` entry."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "rouge",
            "-o",
            "simple",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "rouge_l:" in outcome.stdout

    def test_validate_inline_missing_reference(self) -> None:
        """Omitting ``-r`` exits with code 1 and prints ``Error``."""
        argv = ["validate", "hello world", "-m", "bleu"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_inline_invalid_metric(self) -> None:
        """An unrecognised ``-m`` value exits with code 1 and prints ``Unknown metrics``."""
        argv = ["validate", "hello", "-r", "world", "-m", "invalid_metric"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Unknown metrics" in outcome.stdout
|
|
|
|
|
|
class TestValidateFile:
    """Run ``validate`` with input supplied through ``-f`` (and optionally ``-R``)."""

    def test_validate_file_basic(self, tmp_path: Path) -> None:
        """A two-record JSONL file with candidate/reference keys reports two pairs."""
        records = (
            '{"candidate": "hello world today", "reference": "hello world today"}\n'
            '{"candidate": "foo bar baz", "reference": "foo bar baz"}'
        )
        data_file = tmp_path / "data.jsonl"
        data_file.write_text(records)

        argv = ["validate", "-f", str(data_file), "-m", "bleu"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "Evaluated 2 text pairs" in outcome.stdout

    def test_validate_file_not_found(self) -> None:
        """A path that does not exist exits with code 1 and prints ``Error``."""
        argv = ["validate", "-f", "/nonexistent/file.jsonl", "-m", "bleu"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Error" in outcome.stdout

    def test_validate_paired_files(self, tmp_path: Path) -> None:
        """Separate candidate (``-f``) and reference (``-R``) files report two pairs."""
        candidates_file = tmp_path / "candidates.jsonl"
        references_file = tmp_path / "references.jsonl"

        # Both files deliberately hold the same two "text" records.
        candidates_file.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )
        references_file.write_text(
            '{"text": "hello world today"}\n{"text": "foo bar baz"}'
        )

        argv = [
            "validate",
            "-f",
            str(candidates_file),
            "-R",
            str(references_file),
            "-m",
            "bleu",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "Evaluated 2 text pairs" in outcome.stdout
|
|
|
|
|
|
class TestValidateOptions:
    """Run ``validate`` with the ``-t``, ``-o`` and multi-valued ``-m`` options."""

    def test_validate_with_threshold(self) -> None:
        """Passing ``-t 0.5`` exits 0 and the output mentions ``Status`` or ``PASS``."""
        argv = [
            "validate",
            "hello world today",
            "-r",
            "hello world today",
            "-m",
            "bleu",
            "-t",
            "0.5",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        # The table output is expected to carry a Status column; either marker
        # is accepted as evidence of it.
        printed = outcome.stdout
        assert any(marker in printed for marker in ("Status", "PASS"))

    def test_validate_invalid_output_format(self) -> None:
        """An unsupported ``-o`` value exits with code 1 and names the problem."""
        argv = [
            "validate",
            "hello",
            "-r",
            "world",
            "-m",
            "bleu",
            "-o",
            "invalid",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 1
        assert "Invalid output format" in outcome.stdout

    def test_validate_multiple_metrics(self) -> None:
        """A comma-separated ``-m`` list prints one result name per requested metric."""
        argv = [
            "validate",
            "The quick brown fox",
            "-r",
            "The quick brown fox",
            "-m",
            "bleu,rouge,lexical",
        ]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert "bleu4" in outcome.stdout
        assert "rouge_l" in outcome.stdout
        assert "jaccard" in outcome.stdout
|