"""Tokenisation utilities for text processing."""

import re
import unicodedata
from typing import Literal, Protocol

NormalisationForm = Literal["NFC", "NFD", "NFKC", "NFKD"]


class Tokeniser(Protocol):
    """Protocol for text tokenisers."""

    def tokenise(self, text: str) -> list[str]:
        """Split ``text`` into a list of string tokens."""
        ...


class WordTokeniser:
    """
    Word-level tokeniser with Unicode normalisation.

    Splits text on whitespace, with options for lowercasing and punctuation
    removal. Combining marks (accents, vowel signs, viramas) are treated as
    part of the word they belong to and are never stripped as punctuation.
    """

    def __init__(
        self,
        lowercase: bool = True,
        remove_punctuation: bool = True,
        normalisation_form: NormalisationForm = "NFC",
    ) -> None:
        """
        Initialise the tokeniser.

        Args:
            lowercase: Whether to convert tokens to lowercase.
            remove_punctuation: Whether to remove punctuation from tokens.
            normalisation_form: Unicode normalisation form (NFC, NFD, NFKC, NFKD).
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.normalisation_form: NormalisationForm = normalisation_form

        # Candidate characters for removal: anything that is neither a word
        # character nor whitespace. NOTE: ``\w`` matches "_" so underscores
        # are kept, and it does NOT match combining marks, which is why
        # matches are filtered again in ``_blank_unless_mark``.
        self._punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)

    @staticmethod
    def _blank_unless_mark(match: "re.Match[str]") -> str:
        """Return the matched character if it is a combining mark, else a space."""
        char = match.group()
        # Unicode general categories Mn, Mc and Me are combining marks. They
        # carry accents (after NFD/NFKD) and dependent vowels in many scripts;
        # replacing them with a space would split words apart.
        if unicodedata.category(char).startswith("M"):
            return char
        return " "

    def tokenise(self, text: str) -> list[str]:
        """
        Tokenise text into words.

        The text is normalised to ``normalisation_form``, optionally
        lowercased, optionally stripped of punctuation (each removed
        character becomes a space, so ``"a,b"`` yields two tokens), and
        finally split on whitespace.

        Args:
            text: The text to tokenise.

        Returns:
            List of word tokens. Empty list if text is empty or whitespace-only.
        """
        if not text or not text.strip():
            return []

        normalised = unicodedata.normalize(self.normalisation_form, text)

        if self.lowercase:
            normalised = normalised.lower()

        if self.remove_punctuation:
            normalised = self._punctuation_pattern.sub(
                self._blank_unless_mark, normalised
            )

        return normalised.split()