From 0699e97e1d08b1659c0fcb4ac5ab0515a010de36 Mon Sep 17 00:00:00 2001 From: Kai Chappell Date: Wed, 4 Feb 2026 15:38:46 +0000 Subject: [PATCH] refactor: CLI cleanup and documentation updates - Refactor CLI metric computation to eliminate code duplication - Update version format to PEP 440 compliance (0.1.0.dev0) - Cache Settings instance via @lru_cache for performance - Document composite validators' protocol deviation - Consolidate redundant empty checks in ROUGE-L computation - Add Phase 10 (Portfolio Demos) to implementation plan --- changelog.md | 13 +++ docs/implementation-plan.md | 53 ++++++++++ docs/project-plan.md | 44 ++++++++ pyproject.toml | 2 +- src/veritext/cli/validate.py | 153 +++++++++++++++++---------- src/veritext/core/config.py | 4 +- src/veritext/metrics/rouge.py | 3 - src/veritext/validators/composite.py | 18 +++- 8 files changed, 224 insertions(+), 66 deletions(-) diff --git a/changelog.md b/changelog.md index 22e1dd1..a373c73 100644 --- a/changelog.md +++ b/changelog.md @@ -7,9 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Refactored CLI metric computation to eliminate code duplication +- Version format updated from `0.1.0-dev` to `0.1.0.dev0` (PEP 440 compliance) +- Settings instance is now cached via `@lru_cache` for better performance +- Documented composite validators' intentional deviation from `Check` protocol return type + ### Fixed +- Consolidated redundant empty checks in ROUGE-L computation - Fixed README example using incorrect property names (`grade_level` → `flesch_kincaid_grade`, `reading_ease` → `flesch_reading_ease`) + +### Documentation + +- Added Phase 10 (Portfolio Demos) to implementation plan: Streamlit demo and Jupyter notebooks +- Updated project plan with portfolio demo section - Fixed potential crash in ROUGE metric when all references are empty after tokenisation - Fixed potential division by zero in readability metric when text has no 
sentence endings - Fixed unbounded cache growth in `SemanticSimilarity` by implementing LRU eviction with configurable max size diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md index 66f5fe1..2cffdd0 100644 --- a/docs/implementation-plan.md +++ b/docs/implementation-plan.md @@ -871,6 +871,59 @@ uv run pytest --cov=src/veritext --cov-report=term-missing --- +### Phase 10: Portfolio Demos + +**Goal:** Interactive demos for showcasing Veritext without installation. + +**Step 1 — Streamlit Demo:** + +Build a quick interactive web UI for general visitors. + +- [ ] Create `demo/streamlit_app.py` +- [ ] Text input boxes (candidate + reference) +- [ ] Metric selector (BLEU, ROUGE, lexical, readability) +- [ ] Threshold sliders for pass/fail validation +- [ ] Results table with scores and status +- [ ] Deploy to homeserver (e.g., `veritext.kschappell.com`) + +**Step 2 — Jupyter Notebook Collection:** + +Deep-dive notebooks targeting data science and ML recruiters. + +- [ ] Create `notebooks/` directory +- [ ] `01-metrics-overview.ipynb` — Introduction to each metric with visualisations +- [ ] `02-batch-evaluation.ipynb` — Evaluating model outputs at scale +- [ ] `03-regression-detection.ipynb` — Tracking quality over time +- [ ] `04-chatbot-validation.ipynb` — Real-world use case + +**Step 3 — JupyterLite Deployment:** + +Host notebooks as static files running in the browser. 
+ +- [ ] Configure JupyterLite build with veritext pre-installed +- [ ] Bundle notebooks into static site +- [ ] Deploy alongside Streamlit demo + +**Files:** +- `demo/streamlit_app.py` +- `notebooks/01-metrics-overview.ipynb` +- `notebooks/02-batch-evaluation.ipynb` +- `notebooks/03-regression-detection.ipynb` +- `notebooks/04-chatbot-validation.ipynb` +- `notebooks/jupyterlite-config.json` + +**Verification:** +```bash +# Streamlit +uv run streamlit run demo/streamlit_app.py + +# JupyterLite (local preview) +jupyter lite build --contents notebooks/ +jupyter lite serve +``` + +--- + ## Dependencies ```toml diff --git a/docs/project-plan.md b/docs/project-plan.md index 88fbaa7..0954101 100644 --- a/docs/project-plan.md +++ b/docs/project-plan.md @@ -488,3 +488,47 @@ benchmark.assert_no_regression(tolerance=0.03) 5. **Natural portfolio narrative** — "I was building X and needed a better way to test it, so I built this tool." Every interviewer has faced similar problems. + +--- + +## Portfolio Demos (Future) + +Interactive demos to showcase Veritext without requiring installation. + +### Streamlit Demo + +A quick interactive web UI for general visitors and recruiters. + +**Features:** +- Text input boxes (candidate + reference) +- Metric selector (BLEU, ROUGE, lexical, readability) +- Threshold sliders for pass/fail validation +- Results table with scores and status + +**Deployment:** Self-hosted on homeserver (e.g., `veritext.kschappell.com`) + +**Effort:** ~30 minutes + +### Jupyter Notebook Collection + +Deep-dive notebooks targeting data science and ML recruiters. 
+ +**Notebooks:** + +| Notebook | Purpose | +|----------|---------| +| `01-metrics-overview.ipynb` | Introduction to each metric with visualisations | +| `02-batch-evaluation.ipynb` | Evaluating model outputs at scale, statistical analysis | +| `03-regression-detection.ipynb` | Tracking quality over time, detecting degradation | +| `04-chatbot-validation.ipynb` | Real-world use case: validating chatbot responses | + +**Hosting:** JupyterLite (static files, runs in browser via WebAssembly) + +**Deployment:** Self-hosted alongside Streamlit demo + +**Why both:** + +| Demo Type | Audience | Value | +|-----------|----------|-------| +| Streamlit | General visitors | Quick, interactive, no friction | +| Notebooks | Data/ML recruiters | Shows analytical depth, speaks their language | diff --git a/pyproject.toml b/pyproject.toml index 6042295..89dab4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "veritext" -version = "0.1.0-dev" +version = "0.1.0.dev0" description = "Semantic text validation framework" readme = "readme.md" requires-python = ">=3.11" diff --git a/src/veritext/cli/validate.py b/src/veritext/cli/validate.py index b07455c..6e15506 100644 --- a/src/veritext/cli/validate.py +++ b/src/veritext/cli/validate.py @@ -11,11 +11,91 @@ from veritext.metrics.bleu import Bleu from veritext.metrics.lexical import Lexical from veritext.metrics.rouge import Rouge -# Available metrics mapped to their computation functions +# Available metrics AVAILABLE_METRICS = frozenset( {"bleu", "bleu1", "bleu2", "bleu3", "bleu4", "rouge", "rouge_l", "lexical"} ) +# Lazily-initialised metric instances +_bleu: Bleu | None = None +_rouge: Rouge | None = None +_lexical: Lexical | None = None + + +def _get_bleu() -> Bleu: + """Get or create the BLEU metric instance.""" + global _bleu + if _bleu is None: + _bleu = Bleu() + return _bleu + + +def _get_rouge() -> Rouge: + """Get or create the ROUGE metric instance.""" + global _rouge + if _rouge is None: + 
_rouge = Rouge() + return _rouge + + +def _get_lexical() -> Lexical: + """Get or create the lexical metric instance.""" + global _lexical + if _lexical is None: + _lexical = Lexical() + return _lexical + + +# Per-metric extractor helpers; each returns a dict of output key -> score. +# - *_single(candidate, reference): scores for one text pair +# - *_batch(candidates, references): batch means (keys without stats are omitted) +# - the BLEU helpers take an extra `key` argument selecting bleu1..bleu4 +def _bleu_single(candidate: str, reference: str, key: str) -> dict[str, float]: + """Extract a BLEU score for single mode.""" + result = _get_bleu().score(candidate, reference) + return {key: getattr(result, key)} + + +def _bleu_batch( + candidates: list[str], references: list[str], key: str +) -> dict[str, float]: + """Extract a BLEU score for batch mode.""" + batch = _get_bleu().batch_score(candidates, references) + stats = batch.stats.get(key) + return {key: stats.mean} if stats else {} + + +def _rouge_single(candidate: str, reference: str) -> dict[str, float]: + """Extract ROUGE-L F-measure for single mode.""" + result = _get_rouge().score(candidate, reference) + return {"rouge_l": result.rouge_l.fmeasure} + + +def _rouge_batch(candidates: list[str], references: list[str]) -> dict[str, float]: + """Extract ROUGE-L F-measure for batch mode.""" + batch = _get_rouge().batch_score(candidates, references) + stats = batch.stats.get("rouge_l_fmeasure") + return {"rouge_l": stats.mean} if stats else {} + + +def _lexical_single(candidate: str, reference: str) -> dict[str, float]: + """Extract lexical scores for single mode.""" + result = _get_lexical().score(candidate, reference) + return {"jaccard": result.jaccard, "token_overlap": result.token_overlap} + + +def _lexical_batch(candidates: list[str], references: list[str]) -> dict[str, float]: + """Extract lexical scores for batch mode.""" + batch = _get_lexical().batch_score(candidates, references) + results: dict[str, float] = {} + 
jaccard_stats = batch.stats.get("jaccard") + overlap_stats = batch.stats.get("token_overlap") + if jaccard_stats: + results["jaccard"] = jaccard_stats.mean + if overlap_stats: + results["token_overlap"] = overlap_stats.mean + return results + def _compute_metrics( candidate: str, @@ -24,30 +104,16 @@ def _compute_metrics( ) -> dict[str, float]: """Compute requested metrics for a single text pair.""" results: dict[str, float] = {} - bleu = Bleu() - rouge = Rouge() - lexical = Lexical() for metric in metric_names: - if metric == "bleu" or metric == "bleu4": - bleu_result = bleu.score(candidate, reference) - results["bleu4"] = bleu_result.bleu4 - elif metric == "bleu1": - bleu_result = bleu.score(candidate, reference) - results["bleu1"] = bleu_result.bleu1 - elif metric == "bleu2": - bleu_result = bleu.score(candidate, reference) - results["bleu2"] = bleu_result.bleu2 - elif metric == "bleu3": - bleu_result = bleu.score(candidate, reference) - results["bleu3"] = bleu_result.bleu3 - elif metric == "rouge" or metric == "rouge_l": - rouge_result = rouge.score(candidate, reference) - results["rouge_l"] = rouge_result.rouge_l.fmeasure + if metric in ("bleu", "bleu4"): + results.update(_bleu_single(candidate, reference, "bleu4")) + elif metric in ("bleu1", "bleu2", "bleu3"): + results.update(_bleu_single(candidate, reference, metric)) + elif metric in ("rouge", "rouge_l"): + results.update(_rouge_single(candidate, reference)) elif metric == "lexical": - lexical_result = lexical.score(candidate, reference) - results["jaccard"] = lexical_result.jaccard - results["token_overlap"] = lexical_result.token_overlap + results.update(_lexical_single(candidate, reference)) return results @@ -58,46 +124,17 @@ def _compute_batch_metrics( metric_names: list[str], ) -> dict[str, float]: """Compute average metrics for a batch of text pairs.""" - bleu = Bleu() - rouge = Rouge() - lexical = Lexical() - results: dict[str, float] = {} for metric in metric_names: - if metric == "bleu" or metric 
== "bleu4": - bleu_batch = bleu.batch_score(candidates, references) - stats = bleu_batch.stats.get("bleu4") - if stats: - results["bleu4"] = stats.mean - elif metric == "bleu1": - bleu_batch = bleu.batch_score(candidates, references) - stats = bleu_batch.stats.get("bleu1") - if stats: - results["bleu1"] = stats.mean - elif metric == "bleu2": - bleu_batch = bleu.batch_score(candidates, references) - stats = bleu_batch.stats.get("bleu2") - if stats: - results["bleu2"] = stats.mean - elif metric == "bleu3": - bleu_batch = bleu.batch_score(candidates, references) - stats = bleu_batch.stats.get("bleu3") - if stats: - results["bleu3"] = stats.mean - elif metric == "rouge" or metric == "rouge_l": - rouge_batch = rouge.batch_score(candidates, references) - stats = rouge_batch.stats.get("rouge_l_fmeasure") - if stats: - results["rouge_l"] = stats.mean + if metric in ("bleu", "bleu4"): + results.update(_bleu_batch(candidates, references, "bleu4")) + elif metric in ("bleu1", "bleu2", "bleu3"): + results.update(_bleu_batch(candidates, references, metric)) + elif metric in ("rouge", "rouge_l"): + results.update(_rouge_batch(candidates, references)) elif metric == "lexical": - lexical_batch = lexical.batch_score(candidates, references) - jaccard_stats = lexical_batch.stats.get("jaccard") - overlap_stats = lexical_batch.stats.get("token_overlap") - if jaccard_stats: - results["jaccard"] = jaccard_stats.mean - if overlap_stats: - results["token_overlap"] = overlap_stats.mean + results.update(_lexical_batch(candidates, references)) return results diff --git a/src/veritext/core/config.py b/src/veritext/core/config.py index f794acc..f7e97dc 100644 --- a/src/veritext/core/config.py +++ b/src/veritext/core/config.py @@ -1,5 +1,6 @@ """Configuration management using pydantic-settings.""" +from functools import lru_cache from pathlib import Path from typing import Literal @@ -54,6 +55,7 @@ class VeritextSettings(BaseSettings): ) +@lru_cache def get_settings() -> VeritextSettings: - 
"""Get the current settings instance.""" + """Get the cached settings instance.""" return VeritextSettings() diff --git a/src/veritext/metrics/rouge.py b/src/veritext/metrics/rouge.py index e5d6dfa..18a2297 100644 --- a/src/veritext/metrics/rouge.py +++ b/src/veritext/metrics/rouge.py @@ -107,9 +107,6 @@ def _compute_rouge_l( Returns: RougeScore with precision, recall, and F-measure. """ - if not candidate_tokens and not reference_tokens: - return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0) - if not candidate_tokens or not reference_tokens: return RougeScore(precision=0.0, recall=0.0, fmeasure=0.0) diff --git a/src/veritext/validators/composite.py b/src/veritext/validators/composite.py index fa5349b..5a36b98 100644 --- a/src/veritext/validators/composite.py +++ b/src/veritext/validators/composite.py @@ -1,11 +1,20 @@ -"""Composite validators for combining multiple checks.""" +"""Composite validators for combining multiple checks. + +Note: CompositeCheck classes (AllOf, AnyOf) intentionally return ValidationResult +rather than CheckResult. This allows callers to inspect individual check results +for detailed error reporting. They implement a compatible interface but are not +substitutable where Check is expected as a type constraint. +""" from veritext.core.types import CheckResult, ValidationContext, ValidationResult from veritext.validators.base import Check class AllOf: - """Passes only if all checks pass.""" + """Passes only if all checks pass. + + Note: Returns ValidationResult (not CheckResult) to expose child results. + """ def __init__(self, checks: list[Check]) -> None: """ @@ -48,7 +57,10 @@ class AllOf: class AnyOf: - """Passes if any check passes.""" + """Passes if any check passes. + + Note: Returns ValidationResult (not CheckResult) to expose child results. + """ def __init__(self, checks: list[Check]) -> None: """