# veritext/tests/test_cli/test_readers.py

"""Tests for CLI input readers."""

import json
from pathlib import Path

import pytest

from veritext.cli.readers import TextPair, read_jsonl, read_paired_jsonl


class TestTextPair:
    """Construction checks for the TextPair dataclass."""

    def test_create_text_pair(self) -> None:
        """A TextPair exposes the candidate and reference it was built with."""
        built = TextPair(candidate="hello", reference="world")
        # Compare both fields at once so a failure shows the full pair.
        assert (built.candidate, built.reference) == ("hello", "world")


class TestReadJsonl:
    """Tests for read_jsonl function.

    Each test writes its fixture file under pytest's ``tmp_path`` directory
    and passes the resulting path to ``read_jsonl``.
    """

    def test_read_valid_jsonl(self, tmp_path: Path) -> None:
        """Test reading a valid JSONL file with two records."""
        data = [
            {"candidate": "foo", "reference": "bar"},
            {"candidate": "baz", "reference": "qux"},
        ]
        jsonl_file = tmp_path / "data.jsonl"
        # One JSON object per line; no trailing newline after the last record.
        jsonl_file.write_text("\n".join(json.dumps(d) for d in data))

        pairs = read_jsonl(jsonl_file)

        assert len(pairs) == 2
        assert pairs[0].candidate == "foo"
        assert pairs[0].reference == "bar"
        assert pairs[1].candidate == "baz"
        assert pairs[1].reference == "qux"

    def test_read_empty_file(self, tmp_path: Path) -> None:
        """Test that an empty JSONL file yields an empty list."""
        jsonl_file = tmp_path / "empty.jsonl"
        jsonl_file.write_text("")

        pairs = read_jsonl(jsonl_file)

        assert pairs == []

    def test_read_file_with_blank_lines(self, tmp_path: Path) -> None:
        """Test that blank lines between records are ignored.

        The fixture holds two records separated by an empty line and ends
        with a trailing newline.
        """
        jsonl_file = tmp_path / "data.jsonl"
        content = '{"candidate": "a", "reference": "b"}\n\n{"candidate": "c", "reference": "d"}\n'
        jsonl_file.write_text(content)

        pairs = read_jsonl(jsonl_file)

        assert len(pairs) == 2
        # Check the content too: a count alone would still pass if a record
        # were dropped and another duplicated around the blank line.
        assert pairs[0].candidate == "a"
        assert pairs[0].reference == "b"
        assert pairs[1].candidate == "c"
        assert pairs[1].reference == "d"

    def test_read_file_not_found(self, tmp_path: Path) -> None:
        """Test that a non-existent path raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            read_jsonl(tmp_path / "nonexistent.jsonl")

    def test_read_invalid_json(self, tmp_path: Path) -> None:
        """Test that malformed JSON raises ValueError naming the line."""
        jsonl_file = tmp_path / "invalid.jsonl"
        jsonl_file.write_text("not valid json")

        with pytest.raises(ValueError, match="Invalid JSON on line 1"):
            read_jsonl(jsonl_file)

    def test_read_missing_candidate_key(self, tmp_path: Path) -> None:
        """Test that a record without 'candidate' raises ValueError."""
        jsonl_file = tmp_path / "data.jsonl"
        jsonl_file.write_text('{"reference": "bar"}')

        with pytest.raises(ValueError, match="Missing 'candidate' key on line 1"):
            read_jsonl(jsonl_file)

    def test_read_missing_reference_key(self, tmp_path: Path) -> None:
        """Test that a record without 'reference' raises ValueError."""
        jsonl_file = tmp_path / "data.jsonl"
        jsonl_file.write_text('{"candidate": "foo"}')

        with pytest.raises(ValueError, match="Missing 'reference' key on line 1"):
            read_jsonl(jsonl_file)


class TestReadPairedJsonl:
    """Exercise read_paired_jsonl with separate candidate/reference files."""

    def test_read_paired_valid(self, tmp_path: Path) -> None:
        """Two files of equal length are combined into pairs, line by line."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}\n{"text": "qux"}')

        result = read_paired_jsonl(cand_path, ref_path)

        assert len(result) == 2
        first = result[0]
        second = result[1]
        assert (first.candidate, first.reference) == ("foo", "baz")
        assert (second.candidate, second.reference) == ("bar", "qux")

    def test_read_paired_length_mismatch(self, tmp_path: Path) -> None:
        """Files with differing record counts are rejected with ValueError."""
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}\n{"text": "bar"}')
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="does not match"):
            read_paired_jsonl(cand_path, ref_path)

    def test_read_paired_candidates_not_found(self, tmp_path: Path) -> None:
        """An absent candidates file raises FileNotFoundError."""
        missing_path = tmp_path / "nonexistent.jsonl"
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(FileNotFoundError, match="Candidates file not found"):
            read_paired_jsonl(missing_path, ref_path)

    def test_read_paired_references_not_found(self, tmp_path: Path) -> None:
        """An absent references file raises FileNotFoundError."""
        missing_path = tmp_path / "nonexistent.jsonl"
        cand_path = tmp_path / "candidates.jsonl"
        cand_path.write_text('{"text": "foo"}')

        with pytest.raises(FileNotFoundError, match="References file not found"):
            read_paired_jsonl(cand_path, missing_path)

    def test_read_paired_missing_text_key(self, tmp_path: Path) -> None:
        """A candidates record lacking the 'text' key raises ValueError."""
        cand_path = tmp_path / "candidates.jsonl"
        # The candidates record uses "value" instead of the expected "text".
        cand_path.write_text('{"value": "foo"}')
        ref_path = tmp_path / "references.jsonl"
        ref_path.write_text('{"text": "baz"}')

        with pytest.raises(ValueError, match="Missing 'text' key in candidates file"):
            read_paired_jsonl(cand_path, ref_path)