feat(core): add tokenisation with unicode support

Implement Tokeniser protocol and WordTokeniser class with NFC Unicode
normalisation, optional lowercasing, and punctuation removal.
This commit is contained in:
2026-02-03 16:16:07 +00:00
parent efc6a031a3
commit 697b1ddfeb

View File

@@ -0,0 +1,81 @@
"""Tokenisation utilities for text processing."""
import re
import unicodedata
from typing import Literal, Protocol
# Names of the Unicode normalisation forms that WordTokeniser accepts and
# forwards to ``unicodedata.normalize``.
NormalisationForm = Literal["NFC", "NFD", "NFKC", "NFKD"]
class Tokeniser(Protocol):
    """Structural interface satisfied by any object that can tokenise text."""

    def tokenise(self, text: str) -> list[str]:
        """
        Break a piece of text up into tokens.

        Args:
            text: Input string to be split.

        Returns:
            The tokens extracted from ``text``, as a list of strings.
        """
class WordTokeniser:
    """
    Word-level tokeniser with Unicode normalisation.

    Text is normalised to the configured Unicode form, optionally lowercased,
    optionally stripped of punctuation, and finally split on whitespace.

    Combining marks (Unicode general categories ``Mn``, ``Mc`` and ``Me``) are
    treated as part of the word they attach to. Without that, decomposed
    accents (NFD/NFKD output, or the result of lowercasing characters such as
    U+0130) and scripts that rely on vowel signs would be split into fragments
    whenever punctuation removal is enabled.
    """

    def __init__(
        self,
        lowercase: bool = True,
        remove_punctuation: bool = True,
        normalisation_form: "NormalisationForm" = "NFC",
    ) -> None:
        """
        Initialise the tokeniser.

        Args:
            lowercase: Whether to convert tokens to lowercase.
            remove_punctuation: Whether to remove punctuation from tokens.
            normalisation_form: Unicode normalisation form (NFC, NFD, NFKC, NFKD).
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.normalisation_form: NormalisationForm = normalisation_form
        # Candidate characters for removal: anything that is neither a word
        # character nor whitespace. ``\w`` covers alphanumerics and the
        # underscore (so underscores are kept) but NOT combining marks; those
        # are filtered back in by ``_blank_unless_mark``.
        self._punctuation_pattern = re.compile(r"[^\w\s]", re.UNICODE)

    @staticmethod
    def _blank_unless_mark(match: re.Match[str]) -> str:
        """Return the matched character if it is a combining mark, else a space."""
        char = match.group()
        # General categories Mn / Mc / Me all start with "M".
        return char if unicodedata.category(char).startswith("M") else " "

    def tokenise(self, text: str) -> list[str]:
        """
        Tokenise text into words.

        Args:
            text: The text to tokenise.

        Returns:
            List of word tokens. Empty list if text is empty or whitespace-only.

        Raises:
            ValueError: If ``normalisation_form`` is not a form accepted by
                ``unicodedata.normalize``.
        """
        if not text or not text.strip():
            return []

        normalised = unicodedata.normalize(self.normalisation_form, text)

        if self.lowercase:
            normalised = normalised.lower()

        if self.remove_punctuation:
            # Punctuation becomes a space (so "a,b" yields two tokens), while
            # combining marks stay attached to their base character.
            normalised = self._punctuation_pattern.sub(
                self._blank_unless_mark, normalised
            )

        # str.split() with no arguments splits on runs of whitespace and never
        # yields empty strings.
        return normalised.split()