From 3e88705404635176317063449552a012b2decbc2 Mon Sep 17 00:00:00 2001
From: Kai Chappell <git@kschappell.com>
Date: Sun, 9 Mar 2025 10:06:28 +0000
Subject: [PATCH] tokeniser with unicode handling

Implement the Tokeniser protocol and the WordTokeniser class with
configurable Unicode normalisation (NFC by default), optional
lowercasing, and optional punctuation removal.
---
 src/veritext/core/tokenisation.py | 66 +++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 src/veritext/core/tokenisation.py

diff --git a/src/veritext/core/tokenisation.py b/src/veritext/core/tokenisation.py
new file mode 100644
index 0000000..57e796c
--- /dev/null
+++ b/src/veritext/core/tokenisation.py
@@ -0,0 +1,66 @@
+"""Tokenisation utilities for text processing."""
+
+import re
+import unicodedata
+from typing import Literal, Protocol
+
+NormalisationForm = Literal["NFC", "NFD", "NFKC", "NFKD"]  # forms for unicodedata.normalize
+
+
+class Tokeniser(Protocol):
+    """Structural interface for objects exposing ``tokenise(text) -> list[str]``."""
+
+    def tokenise(self, text: str) -> list[str]: ...
+
+
+class WordTokeniser:
+    """
+    Word-level tokeniser with Unicode normalisation.
+
+    Splits text into words, with options for lowercasing and punctuation removal.
+    """
+
+    def __init__(
+        self,
+        lowercase: bool = True,
+        remove_punctuation: bool = True,
+        normalisation_form: NormalisationForm = "NFC",
+    ) -> None:
+        """
+        Initialise the tokeniser.
+
+        Args:
+            lowercase: Whether to convert tokens to lowercase.
+            remove_punctuation: Whether to strip punctuation (combining marks stay).
+            normalisation_form: Unicode normalisation form (NFC, NFD, NFKC, NFKD).
+        """
+        self.lowercase = lowercase
+        self.remove_punctuation = remove_punctuation
+        self.normalisation_form: NormalisationForm = normalisation_form
+        # \w excludes combining marks, so they match here; _keep_marks restores them.
+        self._punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)
+
+    @staticmethod
+    def _keep_marks(match: re.Match[str]) -> str:
+        """Return a combining mark (category M*) unchanged; blank anything else."""
+        char = match.group()
+        return char if unicodedata.category(char).startswith("M") else " "
+
+    def tokenise(self, text: str) -> list[str]:
+        """
+        Tokenise text into words.
+
+        Args:
+            text: The text to tokenise.
+
+        Returns:
+            List of word tokens. Empty list if text is empty or whitespace-only.
+        """
+        if not text or text.isspace():
+            return []
+        normalised = unicodedata.normalize(self.normalisation_form, text)
+        if self.lowercase:
+            normalised = normalised.lower()
+        if self.remove_punctuation:
+            normalised = self._punctuation_pattern.sub(self._keep_marks, normalised)
+        return normalised.split()