feat: JSONL and directory input readers
Add TextPair dataclass and read_jsonl/read_paired_jsonl functions for parsing candidate-reference pairs from JSONL files.
This commit is contained in:
118
src/veritext/cli/readers.py
Normal file
118
src/veritext/cli/readers.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
"""Input readers for CLI operations."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TextPair:
    """A candidate text together with its reference text."""

    # Text being evaluated.
    candidate: str
    # Text the candidate is paired with.
    reference: str
|
||||||
|
|
||||||
|
|
||||||
|
def read_jsonl(path: Path) -> list[TextPair]:
    """
    Read text pairs from a JSONL file.

    Each non-blank line must be a JSON object with 'candidate' and
    'reference' keys. Whitespace-only lines are skipped. Both values are
    converted with ``str()`` before being stored.

    Args:
        path: Path to the JSONL file (read as UTF-8).

    Returns:
        List of TextPair objects, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If any line is malformed or missing required keys.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    pairs: list[TextPair] = []
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding. A decoding failure raises UnicodeDecodeError, which is a
    # ValueError subclass, so the documented contract still holds.
    with path.open(encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e

            # Scalars, null and arrays are valid JSON but not records. Reject
            # them here so the key checks below cannot raise TypeError or do
            # a substring test on a string.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a JSON object on line {line_num}, "
                    f"got {type(data).__name__}"
                )

            if "candidate" not in data:
                raise ValueError(f"Missing 'candidate' key on line {line_num}")
            if "reference" not in data:
                raise ValueError(f"Missing 'reference' key on line {line_num}")

            pairs.append(
                TextPair(
                    candidate=str(data["candidate"]),
                    reference=str(data["reference"]),
                )
            )

    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
def read_paired_jsonl(candidates_path: Path, references_path: Path) -> list[TextPair]:
    """
    Build text pairs from two parallel JSONL files.

    Both files are loaded with ``_read_text_jsonl`` and the resulting texts
    are paired by position: the i-th candidate goes with the i-th reference.

    Args:
        candidates_path: Path to the candidates JSONL file.
        references_path: Path to the references JSONL file.

    Returns:
        List of TextPair objects, one per position.

    Raises:
        FileNotFoundError: If either file does not exist.
        ValueError: If files have different lengths or are malformed.
    """
    cand_texts = _read_text_jsonl(candidates_path, "candidates")
    ref_texts = _read_text_jsonl(references_path, "references")

    n_cand, n_ref = len(cand_texts), len(ref_texts)
    if n_cand != n_ref:
        raise ValueError(
            f"Number of candidates ({n_cand}) does not match "
            f"number of references ({n_ref})"
        )

    paired: list[TextPair] = []
    for cand, ref in zip(cand_texts, ref_texts, strict=True):
        paired.append(TextPair(candidate=cand, reference=ref))
    return paired
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text_jsonl(path: Path, label: str) -> list[str]:
    """
    Read the 'text' value from every record of a JSONL file.

    Each non-blank line must be a JSON object with a 'text' key.
    Whitespace-only lines are skipped. Values are converted with ``str()``.

    Args:
        path: Path to the JSONL file (read as UTF-8).
        label: Name of the file's role (e.g. "candidates"), used only to
            build error messages.

    Returns:
        List of text strings, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If a line is not valid JSON, is not a JSON object, or
            has no 'text' key.
    """
    if not path.exists():
        raise FileNotFoundError(f"{label.capitalize()} file not found: {path}")

    texts: list[str] = []
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding. A decoding failure raises UnicodeDecodeError, which is a
    # ValueError subclass.
    with path.open(encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(
                    f"Invalid JSON in {label} file on line {line_num}: {e}"
                ) from e

            # Scalars, null and arrays are valid JSON but not records. Reject
            # them here so the key check below cannot raise TypeError or do
            # a substring test on a string.
            if not isinstance(data, dict):
                raise ValueError(
                    f"Expected a JSON object in {label} file on line "
                    f"{line_num}, got {type(data).__name__}"
                )

            if "text" not in data:
                raise ValueError(
                    f"Missing 'text' key in {label} file on line {line_num}"
                )

            texts.append(str(data["text"]))

    return texts
|
||||||
Reference in New Issue
Block a user