"""Tokenisation utilities for text processing."""

import re
import unicodedata
from typing import Literal, Protocol

NormalisationForm = Literal["NFC", "NFD", "NFKC", "NFKD"]


class Tokeniser(Protocol):
    """Structural interface implemented by every text tokeniser."""

    def tokenise(self, text: str) -> list[str]:
        """
        Tokenise text into a list of tokens.

        Args:
            text: The text to tokenise.

        Returns:
            List of tokens.
        """
        ...


class WordTokeniser:
    """
    Word-level tokeniser with Unicode normalisation.

    Processing order is: Unicode normalisation, optional lowercasing,
    optional punctuation removal, then splitting on whitespace.

    Punctuation removal replaces each punctuation/symbol character with a
    space, so a token containing inner punctuation is split at that point
    (``"don't"`` becomes ``["don", "t"]``). Word characters (letters, digits
    and the underscore), whitespace and combining marks are kept.
    """

    def __init__(
        self,
        lowercase: bool = True,
        remove_punctuation: bool = True,
        normalisation_form: NormalisationForm = "NFC",
    ) -> None:
        """
        Initialise the tokeniser.

        Args:
            lowercase: Whether to convert tokens to lowercase.
            remove_punctuation: Whether to replace punctuation with spaces
                before splitting.
            normalisation_form: Unicode normalisation form (NFC, NFD, NFKC,
                NFKD) passed to ``unicodedata.normalize``.
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.normalisation_form: NormalisationForm = normalisation_form

        # Candidate characters for removal: anything that is neither a word
        # character (\w: alphanumerics plus "_") nor whitespace. Combining
        # marks also match this pattern and are filtered back in by
        # _strip_punctuation.
        self._punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)

    def _strip_punctuation(self, text: str) -> str:
        """Replace punctuation in *text* with spaces, keeping combining marks."""

        def _replacement(match: "re.Match[str]") -> str:
            char = match.group()
            # \w does not cover combining marks (categories Mn, Mc, Me), yet
            # they belong to the preceding letter: decomposed accents under
            # NFD/NFKD, vowel signs in Indic scripts, etc. Blanking them out
            # would split words apart, so they are left in place.
            if unicodedata.category(char).startswith("M"):
                return char
            return " "

        return self._punctuation_pattern.sub(_replacement, text)

    def tokenise(self, text: str) -> list[str]:
        """
        Tokenise text into words.

        Args:
            text: The text to tokenise.

        Returns:
            List of word tokens. Empty list if text is empty or
            whitespace-only.
        """
        if not text or not text.strip():
            return []

        # Unicode normalisation
        normalised = unicodedata.normalize(self.normalisation_form, text)

        # Lowercase if requested
        if self.lowercase:
            normalised = normalised.lower()

        # Remove punctuation if requested
        if self.remove_punctuation:
            normalised = self._strip_punctuation(normalised)

        # str.split() with no argument splits on runs of whitespace and never
        # yields empty strings.
        return normalised.split()