diff --git a/src/veritext/cli/readers.py b/src/veritext/cli/readers.py new file mode 100644 index 0000000..85a83dc --- /dev/null +++ b/src/veritext/cli/readers.py @@ -0,0 +1,118 @@ +"""Input readers for CLI operations.""" + +import json +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class TextPair: + + candidate: str + reference: str + + +def read_jsonl(path: Path) -> list[TextPair]: + """ + Read text pairs from a JSONL file. + + Each line must be a JSON object with 'candidate' and 'reference' keys. + + Args: + path: Path to the JSONL file. + + Returns: + List of TextPair objects. + + Raises: + FileNotFoundError: If the file does not exist. + ValueError: If any line is malformed or missing required keys. + """ + if not path.exists(): + raise FileNotFoundError(f"File not found: {path}") + + pairs: list[TextPair] = [] + with path.open() as f: + for line_num, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + + try: + data = json.loads(line) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e + + if "candidate" not in data: + raise ValueError(f"Missing 'candidate' key on line {line_num}") + if "reference" not in data: + raise ValueError(f"Missing 'reference' key on line {line_num}") + + pairs.append( + TextPair( + candidate=str(data["candidate"]), + reference=str(data["reference"]), + ) + ) + + return pairs + + +def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]: + """ + Read text pairs from separate candidate and reference JSONL files. + + Each file should contain one JSON object per line with a 'text' key. + + Args: + candidates_path: Path to the candidates JSONL file. + references_path: Path to the references JSONL file. + + Returns: + List of TextPair objects. + + Raises: + FileNotFoundError: If either file does not exist. + ValueError: If files have different lengths or are malformed. + """ + candidates = _read_text_jsonl(candidates_path, "candidates") + references = _read_text_jsonl(references_path, "references") + + if len(candidates) != len(references): + raise ValueError( + f"Number of candidates ({len(candidates)}) does not match " + f"number of references ({len(references)})" + ) + + return [ + TextPair(candidate=c, reference=r) + for c, r in zip(candidates, references, strict=True) + ] + + +def _read_text_jsonl(path: Path, label: str) -> list[str]: + if not path.exists(): + raise FileNotFoundError(f"{label.capitalize()} file not found: {path}") + + texts: list[str] = [] + with path.open() as f: + for line_num, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + + try: + data = json.loads(line) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in {label} file on line {line_num}: {e}" + ) from e + + if "text" not in data: + raise ValueError( + f"Missing 'text' key in {label} file on line {line_num}" + ) + + texts.append(str(data["text"])) + + return texts