feat(cli): add JSONL and directory input readers
Add TextPair dataclass and read_jsonl/read_paired_jsonl functions for parsing candidate-reference pairs from JSONL files.
This commit is contained in:
120
src/veritext/cli/readers.py
Normal file
120
src/veritext/cli/readers.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""Input readers for CLI operations."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@dataclass
class TextPair:
    """One unit of validation input: a candidate text and its reference.

    Attributes:
        candidate: The text being validated.
        reference: The text the candidate is compared against.
    """

    # Both fields are required and positional, in this order.
    candidate: str
    reference: str
|
||||
|
||||
|
||||
def read_jsonl(path: Path) -> list[TextPair]:
    """
    Read text pairs from a JSONL file.

    Each non-blank line must be a JSON object with 'candidate' and
    'reference' keys. Blank lines are skipped (but still counted for the
    line numbers used in error messages), and both values are converted
    with ``str()`` before being stored.

    Args:
        path: Path to the JSONL file.

    Returns:
        List of TextPair objects, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If any line is not valid JSON, is not a JSON object,
            or is missing a required key.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    pairs: list[TextPair] = []
    # Decode explicitly instead of relying on the platform's locale default;
    # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM.
    with path.open(encoding="utf-8-sig") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e

            # json.loads also accepts scalars, strings and arrays. Without
            # this guard the key checks below would raise TypeError (or
            # match a substring / list element) instead of ValueError.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(data).__name__}"
                )

            if "candidate" not in data:
                raise ValueError(f"Missing 'candidate' key on line {line_num}")
            if "reference" not in data:
                raise ValueError(f"Missing 'reference' key on line {line_num}")

            pairs.append(
                TextPair(
                    candidate=str(data["candidate"]),
                    reference=str(data["reference"]),
                )
            )

    return pairs
|
||||
|
||||
|
||||
def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]:
    """
    Build text pairs from two parallel JSONL files.

    The candidates file and the references file are each read with
    ``_read_text_jsonl`` (one JSON object with a 'text' key per line), and
    their entries are paired up by position.

    Args:
        candidates_path: Path to the candidates JSONL file.
        references_path: Path to the references JSONL file.

    Returns:
        List of TextPair objects, one per candidate/reference position.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If the files yield different numbers of texts or are
            malformed.
    """
    candidate_texts = _read_text_jsonl(candidates_path, "candidates")
    reference_texts = _read_text_jsonl(references_path, "references")

    # Fail up front with a message that reports both counts.
    n_candidates = len(candidate_texts)
    n_references = len(reference_texts)
    if n_candidates != n_references:
        raise ValueError(
            f"Number of candidates ({n_candidates}) does not match "
            f"number of references ({n_references})"
        )

    paired: list[TextPair] = []
    for cand_text, ref_text in zip(candidate_texts, reference_texts, strict=True):
        paired.append(TextPair(candidate=cand_text, reference=ref_text))
    return paired
|
||||
|
||||
|
||||
def _read_text_jsonl(path: Path, label: str) -> list[str]:
    """Read text values from a JSONL file with 'text' key per line.

    Blank lines are skipped (but still counted for the line numbers used in
    error messages), and each 'text' value is converted with ``str()``.

    Args:
        path: Path to the JSONL file.
        label: Name for this file in error messages (e.g. "candidates").

    Returns:
        The 'text' values as strings, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If any line is not valid JSON, is not a JSON object,
            or has no 'text' key.
    """
    if not path.exists():
        raise FileNotFoundError(f"{label.capitalize()} file not found: {path}")

    texts: list[str] = []
    # Decode explicitly instead of relying on the platform's locale default;
    # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM.
    with path.open(encoding="utf-8-sig") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(
                    f"Invalid JSON in {label} file on line {line_num}: {e}"
                ) from e

            # json.loads also accepts scalars, strings and arrays. Without
            # this guard the key check below would raise TypeError (or
            # match a substring / list element) instead of ValueError.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a JSON object in {label} file on line "
                    f"{line_num}, got {type(data).__name__}"
                )

            if "text" not in data:
                raise ValueError(
                    f"Missing 'text' key in {label} file on line {line_num}"
                )

            texts.append(str(data["text"]))

    return texts
|
||||
Reference in New Issue
Block a user