feat: JSONL and directory input readers

Add TextPair dataclass and read_jsonl/read_paired_jsonl functions
for parsing candidate-reference pairs from JSONL files.
This commit is contained in:
2025-05-07 21:17:16 +00:00
parent ffa8658189
commit 7f2e82494c

118
src/veritext/cli/readers.py Normal file
View File

@@ -0,0 +1,118 @@
"""Input readers for CLI operations."""
import json
from dataclasses import dataclass
from pathlib import Path
@dataclass
class TextPair:
    """A candidate text together with the reference text it is paired with."""

    # Candidate half of the pair.
    candidate: str
    # Reference half of the pair.
    reference: str
def read_jsonl(path: Path) -> list[TextPair]:
    """
    Read text pairs from a JSONL file.

    Each non-blank line must be a JSON object with 'candidate' and
    'reference' keys. Blank lines are skipped (but still counted for the
    line numbers used in error messages), and both values are converted
    with ``str()`` before being stored.

    Args:
        path: Path to the JSONL file (read as UTF-8).

    Returns:
        List of TextPair objects, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If any line is not valid JSON, is not a JSON object,
            or is missing a required key.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    pairs: list[TextPair] = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with path.open(encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e

            # Valid JSON is not necessarily an object: a number or null would
            # make the key checks below raise TypeError, and a JSON string
            # would be substring-matched. Reject those up front.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(data).__name__}"
                )

            # 'candidate' is checked before 'reference' so the first missing
            # key is the one reported.
            for key in ("candidate", "reference"):
                if key not in data:
                    raise ValueError(f"Missing '{key}' key on line {line_num}")

            pairs.append(
                TextPair(
                    candidate=str(data["candidate"]),
                    reference=str(data["reference"]),
                )
            )
    return pairs
def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]:
    """
    Build text pairs from two parallel JSONL files.

    Both files are loaded with ``_read_text_jsonl`` (candidates first, then
    references), which expects one JSON object with a 'text' key per line.
    The resulting texts are paired by position.

    Args:
        candidates_path: Path to the candidates JSONL file.
        references_path: Path to the references JSONL file.

    Returns:
        List of TextPair objects, one per position.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If the files yield a different number of texts or
            either file is malformed.
    """
    candidate_texts = _read_text_jsonl(candidates_path, "candidates")
    reference_texts = _read_text_jsonl(references_path, "references")

    n_candidates = len(candidate_texts)
    n_references = len(reference_texts)
    if n_candidates != n_references:
        raise ValueError(
            f"Number of candidates ({n_candidates}) does not match "
            f"number of references ({n_references})"
        )

    paired = zip(candidate_texts, reference_texts, strict=True)
    return [TextPair(candidate=cand, reference=ref) for cand, ref in paired]
def _read_text_jsonl(path: Path, label: str) -> list[str]:
    """
    Read the 'text' value from every non-blank line of a JSONL file.

    Blank lines are skipped (but still counted for the line numbers used in
    error messages), and each value is converted with ``str()``.

    Args:
        path: Path to the JSONL file (read as UTF-8).
        label: Name of the file's role (e.g. "candidates"), used only to
            make error messages identify which file failed.

    Returns:
        List of text values, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If any line is not valid JSON, is not a JSON object,
            or has no 'text' key.
    """
    if not path.exists():
        raise FileNotFoundError(f"{label.capitalize()} file not found: {path}")

    texts: list[str] = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with path.open(encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(
                    f"Invalid JSON in {label} file on line {line_num}: {e}"
                ) from e

            # Valid JSON is not necessarily an object: a number or null would
            # make the key check below raise TypeError, and a JSON string
            # would be substring-matched. Reject those up front.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a JSON object in {label} file on line "
                    f"{line_num}, got {type(data).__name__}"
                )

            if "text" not in data:
                raise ValueError(
                    f"Missing 'text' key in {label} file on line {line_num}"
                )
            texts.append(str(data["text"]))
    return texts