From 1e3618e637f3201dbf4a5f0ede5ad321cfe0f389 Mon Sep 17 00:00:00 2001 From: Kai Chappell Date: Tue, 3 Feb 2026 16:16:20 +0000 Subject: [PATCH] test(core): add tokenisation and types tests Cover WordTokeniser (Unicode, empty input, punctuation, multiple scripts) and validation types (immutability, edge cases, failure summary). --- tests/__init__.py | 0 tests/__pycache__/__init__.cpython-314.pyc | Bin 0 -> 149 bytes .../conftest.cpython-314-pytest-9.0.2.pyc | Bin 0 -> 1610 bytes tests/conftest.py | 23 ++ tests/test_core/__init__.py | 0 .../__pycache__/__init__.cpython-314.pyc | Bin 0 -> 159 bytes ..._tokenisation.cpython-314-pytest-9.0.2.pyc | Bin 0 -> 22000 bytes .../test_types.cpython-314-pytest-9.0.2.pyc | Bin 0 -> 34228 bytes tests/test_core/test_tokenisation.py | 124 +++++++++++ tests/test_core/test_types.py | 201 ++++++++++++++++++ 10 files changed, 348 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/__pycache__/__init__.cpython-314.pyc create mode 100644 tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc create mode 100644 tests/conftest.py create mode 100644 tests/test_core/__init__.py create mode 100644 tests/test_core/__pycache__/__init__.cpython-314.pyc create mode 100644 tests/test_core/__pycache__/test_tokenisation.cpython-314-pytest-9.0.2.pyc create mode 100644 tests/test_core/__pycache__/test_types.cpython-314-pytest-9.0.2.pyc create mode 100644 tests/test_core/test_tokenisation.py create mode 100644 tests/test_core/test_types.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b2028c04485b4f11f9f9c1f9a4fbce979ccc13a GIT binary patch literal 149 zcmdPq-z+nT5%Cf>$*+-T%U(LYM7>{PUmKcwrMO}_BC3qqO>#>y!y~PH7 zF){^3Z3?QBi=mQFoZvZzp3xD{7k&ngxpwqD=i#N&F|+2%eY zo?fO*UnR`pWSi^4s8TCPjE#ffRM1g;}K?>veY`~g*&{UyYQl|4QN!jwNT*okNo%iVEWk&nJ#2sRB6hT^en0*hH zLZvWxNfa$Oib8Vw{euH-M$S1MUZ&xiAygI(1HP(qEb}vgq)C0WcSv9&L|bV>F^5mxn8AByg`=dxUce$@!S(V1(88B zfn*5X>?k_ZrRHG5S@ccg=ZSPejvsFyPGk?X?0-XfQ3NnRSr7~hU#`gRG$`j!M9C`_ z*S83%dwSh>OWe1(LtRPp93wT_Am)(%pI9@;u!_}2CW)*>nHom>`O)FT!hyCRv&yD} zm}%OyWST*I(|4-ealIh WordTokeniser: + """Provide a default word tokeniser.""" + return WordTokeniser() + + +@pytest.fixture +def word_tokeniser_no_lowercase() -> WordTokeniser: + """Provide a word tokeniser without lowercasing.""" + return WordTokeniser(lowercase=False) + + +@pytest.fixture +def word_tokeniser_keep_punctuation() -> WordTokeniser: + """Provide a word tokeniser that keeps punctuation.""" + return WordTokeniser(remove_punctuation=False) diff --git a/tests/test_core/__init__.py b/tests/test_core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_core/__pycache__/__init__.cpython-314.pyc b/tests/test_core/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..472ee1d5d7f323acf79bd9bc69c3730a8d6f5ab3 GIT binary patch literal 159 zcmdPq-r;dL@CAUNumJcv5;#xxtMnw7xN9cbRo5#cz)ms7Ox@3cXN?IzU!>4sERn2WukgX7S-&Fawet3 z)l@bkj%Sk-X*oI);kAade|YAEbo|tb<4>O+`SL6<&ShEhtS&7xkO)|jlJ_;rz}KWzw`Iu`Pmlk4fpiQcmI3D7_y8y>W93OGuO@QMk zjzF9MaRlN7iBm_M0CDPx(?FacaTvz`y}18R+v@U{XnG&iNy zOHw?O$*OTxmL%<=ib{w&e{Y0GD;+?7JIyU|ZtrpaBciv)@6!EEj~>4VB3#03r5FK3 zVI|ey`z2HxR1ts1tUmnJxMKW(!Ogj%fU9Tu%Lhl@kDTJT%TJovym$*M^}Ln$8La-Y z3$Pj(^+4+np6l*+4Hyzw{pWqA=o!Hb+4oLvz=%8w_lrBqo<#kREsdd0Zq#L#*(_ml zTUomL(lq7izQV`cF^}Vos>{30=k&RHtheB}oHyo;`9{5cd5z9;!ckRth9f}b1&`89mxgm+>hAcEU~;es$+XdRvl4KE7D zUr`vIxnjkvfP*)0ODs@wOUM}jDHew3ezM+@AyADjks&l#0}!JZq)KAg)ETzzP58@p zAV&YxgWXZj?6bbp{VtSFR{sV;9+&5uQ5_R2o-xQ?2sjj4N^2EHK)4mCM7e^1W z*1pLf6V|jov;fCY=P?nkEgS=&q727onwKD~c?rX-Ms38;_Csp+ z7!p*@tOrRik|>fsAeE@zM2K~?Wd!yvEN&HGmx@t4HnH8a7(O%?KJ?mnG5qB354P8$ zc(IdoP7H=ZcGfIuB{7(@1Nbophrh8D&O2hCZSH~D;z0B zWb%0SN~!kEw-I!Ea0-!5c8aI6lccMsvJ+3#26Dv94#CT^!$36mrR=2U#?ibRQmsxi zw-AOM#e<>(M97Bz4 zGxV^7H*ZTUQ1VJ?;DFRRYUz$LE&ZY_=OoALu1QmOL3JotF{G)z-sndhkazly+;zkK z-9W=fUbwFV;5SCwP|4P{EZ`sXXuFE4#x?|M(b)k+^9?0X&--Yo-IMp}(65d5GDw?h z8y0}pqPDjXQkpMP&$Jeeqclb!IxHs8Nku zHB@WNi^Bv(%2LvwqjlGSp;L%jWR@UkEm_@cE{1ymfUVmL!uIRUfBLOOVY~iU6t*ME zjF=U0@aAoa1x#$oI0GQ`DFY9u9#w#MR0rPK4d9h5^jBYcvXN z>@UNkzI=R(@Yr*k@Yu7AB3!L$YhyqQ^=w_A>43J6FnKASNv2T;X2-;FSSe2B(y~bK zRKyFiIw{K;GJGk~d?O8I(LO_CS$lH2KKL}BF6mFNiO(`QYPeUSN=xRaj~2s^eP+u| zXka0$_(Kq=WfhOP`oLjg)=i7Z`s{NSEZ-|F5 zWDoY3nVxZ2Z%rOptQ$ssV6kpIcfw(@M!T?Or#@5F{%n@rVX?NKR9$6yOgpbxLSsqU zPKPGLvI}5pr2|z9_A0Oot$*3P_gOY89hA*iW+C^-a9->$vycnt(KbofdHc|2Xej`ALTEc&S-9v8osNhPvLSan$uZZ-rDneki}42I~ba8L^y+@x&s^G@f50o{3FV5H1iNb47o zN?(P85rs^Mvq6U43+qIJ>kO%KNJ6k$ug1_3_`Ipbm1n#tTFShl}we{TmgJRp^nb4i~U5o9} zx%TMIe6jtpg3tNTu+)*NPwLRnXpymW}YsX!@-5#LW z#t7HxURGU&TIwFu&F9;?N`45UbA$`#k8h;J#HBckCsdg!(Ft^UpcPy3?%)6U5A~5P z1h4(s`VDWJOS%&(g0^#oE`+40#pa=auo~;*nnlxp=~yv5^qCEAQMY49zG|1xwP5ra z$k@iH+Y6`!a_f0q6&kp$QfvdGUnGpul4uD=Guceon*MENf)1UwP zZ+`iI|N3|0```Qi`!Bum{`Y1k0!}`~tlvKAoOEjmXrW9}8q1{ZT>-e3BsJxVdFkWD z@Dm8#T{pg2XzfRJ+}dBR*?}0-1IfSmm zKYEzlAcpxjK*74b!~C0s<>3uCh%eB>Zk;jvaH`nx)KzePu%L?`sk(ms)$2d`;K$d* z;cVs`;J_dj{=M+}+;nZ2f_oi$KhUH_PpbJZ=xK>(q(>|KiY|(6C19o!N=6Aaa^zTM zFkd1~ja-38mMk`%EQY^~&JaTdVW`kLG#46L6pX*3FobBe8G6{ko3|wvD0!tca6sx) z#`7q=rBf9TqVgToRq>#%iU-$06>qpSK|Nd7vJ1j6*j{&kV&`555mlRRq{&Q-Ux48; zIVt6s3_BOVqbZtiIt`loiYyKs6yrHqC{~t(ug0e|4~lE9frFaQNU$or>dMzBgk%+< zrt+*2Z3;YGptD+-4ZlQu&SJ9zW9!~!L+lm-q+eablf zIy4?z?|Nuo5>oVGz&koI;C%zDYa2FyzV{_PL0EO_-9$E{;;a_7>L}vgI4tO?ESrP< zNU)D54c6AAh@Qo|X(tlRZ#oV8mNI)(F_}d=iF6q>`xy>paida3<6*B7q+;JC4y2?| zng7zVP7%(s)^v(+3Q;4kz_UJ>e(#I5k0*|J#!(X)csyRvyFhMz1<^csJ4vw(EPs)( zT)9kjX$pV~o9hs?u(Qti!-^}>F^-Z)zhs~*iu(y*Cu%-Bqb309?fb^k&8 z`4T<9rvG5MAT{0!oa%b_y-!{5uE7q={s<7qu$x)g1bYz?YMEetUQn<=&;$!NTl<3) zU4EdjU{-(cKVTj{&Do~c7?@%QfrnR)hi$>QtX6i#ED!gbE9L={TdhsOIJc*(hKaR~ z+z9JS+z20x8Pi%HUZaWCN{435n14l+n^iMqCO2we_z9 zya+MVIXDU7754O0+O_m@mzuPwNHy(RHB%50{sam!&4DfJwb8dWADIc@QuX%5aQ9re z`^M>FIEu?%UDw+RLihEyxls3_VEh$@ZX90Q3_a}N&D#x>E@$u7b1ag$el7@Vv0& zT!RE`#+0O4UfuhPXL#h6JHsR9!Baa_ zf6NOo^l4jg41Cr+G2{%(s@X&~ou?IA{Qwf{M=@voDxxJhor-4?a_LDH>#r;HZLn4m z%<`2SDuJrfRwV%(>;ZXwGu2P)>ek+3HW`^XZn1jvTjjjx0WOJ3LhOK_b@&!D>UxaQ1dY%SoM~x4l9c{Iq zZ49c{*p4dFLpy4Ob`)AuSy$KdSU!7J8CTcySm~f_p;%Kayr#0Pt_Qcj;r-QPMS3V} zIM(cFgVlRgYuMda+VlC>(DV7tc4^)XZ$&tDYt-9hopF>C&f^jbUABkgWvi`T`~|ht z&CpV}Jb)T*GoNji(rN{(>+vemLk(|A(ME7XK?rbFg)ar7ElbG~}Lcn+^JS?gjqtf8F z;c?);2q!YdFQjG7gA=`yQiD6Co&f}El^R?qF`HmYf%BVG9J}IqX$pAjvlpPSng@5R zu|I|&9XAAby3p~nAO7zR1L@*AxM}VzcM>#xj<**fMtR4G%NUnm<)*zI+|y7I0q!9< zQLvjkpvk6YlZy_ zl79md3Ft>|P@C!jhVFx^muG-HPuV}0vHRH=CP(*G1>IUKuT=dktUcIpXfba2O4E*h zqB^Jf$Zlu`Frn}0`Pp}70`GS9z<%FrlRug&hH&Taf@9mSzK0h!|6ZH4w)4UkUPsJg zL&_gbSuqeYW3U_e#y>#{WHG* z4z*t2{p!wQXiq`dvl!a52-NXc6!u)pTa5XVn8k*a*LPbn5OTyUHVD1C(~5zR8Cwwc z6e^zkmayj?L`Sh1O`X>ifJ!lh!3BrOQ@3A&;q#henKHqAX~xjV*=*QD--2*ON=W2f zcoEnu*noeGh2*c1{0xYKLknY><8P6PB!@rX{KQ!1IJo42+3;_0 zN~}K#g#e_OsV0~Q+7VKY)0U&qXJkP(w?frn` mUgqBB#Q)_6K5#evk7r=XcY)_&>C@g-4@BPCHhOFq6uStczjl5IJ$WqG5V+*H=}M4m{-B=5L) z)Z<*cYLm8f;l>V{Hca{m5wrl67L6L9t((6Zpg>w61&$Aq%pxiBM-bO&|DYhl4E#^~ z`(}4%ckh-jnUvLn%j35*v)|0_&dkny-*3K|IZ{?yEa3S5zTX-8UcDgnkw-t=n&bWi z9Jd5X$OxAO$??1+<78*oS@*bezi`%*@$Ofz_%c3RaXv4eEy@(LFxT@XXZ;!f+0snu z*|JR8exX`u5F~f4AbC2SYWkS~3-cn($HU55n24|<9#$ba4hZ4mZ0JHFok@qTCgo6O zFcIoY%85|sW-5^mjU@ZWhZB)=VTTfk#)pUclr9v{vq z6~^_`$b6;ZfH*{+L7E&bnGsRIMQzDfj56YBk z$^+whwO7dQF|wxMVx-T3C#Qcp=yhE9zrPQp$8k%D3YVS7zaj|z2;C#R>=<-Nc$F4p z`EAz+#I1PcL}pwbJ$G9OI~6aoe~tUD0(*OM&{2ecyr=*EH0UkilAbF{HZz>bZeG(Gv(5*qe7JdhNWUL5Mm!wfE7Vu)j7hURhowc+K%%bVx4AZF(bn zey4sP$+Ir6OXqEI9W~`o$|HFtU)0m$u!MRqRp?J>O1V-GGtPKwub#4grkeYtBjtS3 zA)iJmQmx24QqC5EO7ZwvVe{o!%*wH3ryMOMw$0_pntp%wF=`1yeR2W~`{+O@j>ez0 z-J$D4nZeL#G9)LiCgj9uA36cWmAcug_%sgkwG-+3xD3rzLe7K^HK%TNhtm|;ecPcp zhtgq3wv9sD!|CGJHAkqIF}8-&5uSMQ8=;{z3+@W1Z#zDA_Vyxb>l#r<5}7!9qc~#r zstF=yxI{+tBPc~YVWs%#)J^(ZBDTQ%3(%$`oOr$hCqsU9! zdQIJ-eW$#7N9%(Kveg}Jk>1ftMUYGV*>3uHS8br&| z8FgYaZ4Q;N(dKwONA3wAclz1McnWZ_gBmaNsQ&wv9?B|eX(G;)!w~PI5NvtdRp(v7yx4fMzfilr)UtdzbOVlK>BkI$Ykd%1YIW`$BVm0ZzdO1Dg80gqw zT`GpMSbgy=OI`ABstnM4S=3TnW$_YMkMt+kpS2%tswKW{e^VGAb;~c6KEn0BY~6a# zOHm4}%PZ>nCY7(tTcq>0cr40OiBus~MoaRPVE-nRV85QSeuff^I-Bz0Uqs!#EtQ~u zuE`!>vwt1|GB%X5>7Swdw4;J&>rb&h`g%N#o;<^JF!a^&5cSkDYIc4tjb^5(Bh7BEpBrXL=W;RTq-wHmJza}iZ)?#9RV?GPhDnDx9U-I zrColE^Qs}1%N8B%2~~GgwX?RiGwY;Xpb|!QG3&gEPT{Iq|W#)4AZmxxgVfrz;ql2|TtaYQK5$u^g#+ zMu_TuJI%vpDGl!94kYz%H}`=__bnjF;6e3@IeECJD|iI612GJ7Xq$yHl9KOJx1nLJ zM+sy-s$#82ji&$uJGdhFMc}EbP)6utOTJ*{**3wfj0t7|i(m%SaZ{b?#*kMoVDV0L zW5_Fyht88kEw`R|=WRRPC|B|%2s_*G9$O)p6&DC*C6a&31+!8T%-EnvdA4co&P{{h zB8oSi9GCkNiklicr6@Brf^&Q%l`U>>ZSQ=%wGF?pLvBJLL9Qo3xbnF< z{Es>#P4{H!ZBX|N`_xW@#bNp|?=L~ z8l1W0)qT2z_oO4{7w)4?W$}Ofr>3nVk?7HrmxOdc1`$xS{i)k)jxQ3zP^32wBv4R$5QCX z#B=lhy(=!EtP?@_&6ak2oF>%td*UgF=ZB7xQXgwX}B?> zIW`_2CR852KrD83NKR*lhei{lNd)*}vHoOVEG9>gs@zHBc_J?pc?CqN#^@@>Lx=ie z@eDAtE8}Rq5V68E{@Rzoa6t@+83S>t%0I1QxKwv!BGebBspb0B@8g|+;%-`TW0-WA z#!6TGfN_@z%Uvl3#2Asi_#Q{APpXCjxM~0dvu@EiifolbR|mZQ`3o?^i%{f6>Pr`aw zQq0=p)spi8tqych(rk6GQ;HsGn`BX5iZnoxcffYU^}tpK>epDk^EFz4v+XZO8eYmf zP>$|TSdKM2<@hkmQ7Ri7JHI=x7~vi3j=Bha8=K%F@sILAZ!UHlu`$$B8x;Z1*Ii7T z;Q95TVgpt#e{_da9?8BDFt+?V$p$C|?31j4`pCQ!L(62}8ByF&OvWz^ zbg<*UPQHJY9lt!84SnW?XTA_hGqcgw$nmbwQ@t2%Mzq1?aDS*Z(tX>bxS92gmr_pj zE8cit2C_Sl0-a6q0WwI(2NDb)Aj?R)sqN5o)=f8&dr^Pnr-;x{^=(J?=o;2Sbaw+| z21G6IQX@syh@?Jjo@r8ctr+TSS#E>}Tgw89XFy&g7*CA?T5qAp16Up!8r7|%@=JcsPir)yA$)q>PUm~(W07QO`m`Tf!M~IAq+%8ge#UurUi4ZW(kaYPdxgG~GBj>I1 z8HA)s$mSrps$K5RKy`>|hDln@0`$#nhMD!5S-xMLmr47-hL?k8S+QrSqIN0Bel@$6 zs*ZhF;;k&6@Gh4Ly;t0eD^Z)iFirQ~ZJWR=v@2=tGmlwca!+o=ck1p)v7tD@%@vxyXZX>ppzz0Gtm!Y`&J%{fJ zAf&kBWIP49*ufRoGeEvR6~$GD`*&0xlkfsV>+M!~2&v0EASyqo;#$t&hl(9R`C$sm zFqhwI#Z~6bstQ$}tMMTW9ptM#URco#WCox8 z;1vj-AAAV8;NyKSu%OVf*49wsMqeV)k6Ci@8$%=GBcZO=R(S{wwmQbiWam|FYvmL% zk`^M55s`_cL14u4Nk#bE+K}d&(tF{MrbSZMGGs%Nx|NrSHy%vTsaYBCU9U;)P8f~0ZPOOP~CJ7Eo>GX zns32io~ML3kJ$l!-0@*c>d13LX5D7V`E&R_49U5OhI~NCdKo9|13Z#~Ab}S(lfa)f5c+ zz@l}_O;qS);*gXAMFoe5*bjjvGE}}(I3hr$Td~?B zFmz_4pw2CnVqp_+u|`yLBSy9;$buNp0;QAkZ8cbCwOH)ioWxzVX1|^QnyUKAi@D&T zrJCJKRS|RBueVZ%r#hkCgIKkDh9?;MzN)EHW2l8er`LVZ={YyF_^D3LWOh7E{XR-r zPr4g3=PU9wQuEX}eUP0BTPbMusy)jcmU)G(l;r8wzDdlyAlZN&pUA4kYq4ta!9>ny zvTE_IYt;e{&8kJr9%Ww{vuZKbnYHX$E)1xzuzWIG!Xm3k>utA+TN-#IL6OgrwF{Go zz5t(0nJZr;LTtA+n--FEEH*858A2Vkx@s|LX|}E0Hk_;&pGLwK#Ia_ns^P;T55bKe z`h~J8t7XfchKXn2e$mdb<@@3(x95lMl2g^XVM~epMIM&Po zP%S^Ay2&afOTyj+SeWKU=e*l&#rNER|(SV5Z`i0@S`MRaBIQ zsguM&V4zJ|A+)oxQ`ZYFb3+bqb<% ze#A!eI1F~4i!nw_jONuuJs8caz1V3Q%_E_wSm!$L*ImZhc4Y?RPGttC?!sCHTo}=x zP+OV7h*z$28)tJo*awr)vmQSeWTcXHdEL4}VV=8}e$BZTVi}po{*-yqwj^RK}CqmUCfzly^WXe}wB|g{cgRoP6Y+IYRSn@mOZ789vbZkkMmx@%)ika9%?YJu34zsT%0$xH0A5oi19q~n8=iu0w<$!{Y_ z{!I{w=_IWyUNy#g_-GHsGweG~lJ{vMr$9cgIAy(cqTd9gYPKLRXsUC*u>>YOHq)h5 zrX;G52Qv!88UynYVZzZxkycs6hS1MMe@u^ST8>7};CT_dn_{nT#X$*;%ZXTed}Jgp z-^61CRKqO%FN?4yUd#+U8|iWn`sqh?S{)%BDkwvw_vGn1K5L0r8heCQQQ>n5YqXWk3`bN{^m zb3gUhPQEhT|6a$xy662bE_+-|F%yN##_6g%SLZ8wC%iv@_!b($UEk>a8kUb}dOQ7Y z?_8jPOeS7t#EfzSIn4zc3gK#+*=X2Jn0!1KNxgfF`+#lhJ{VWL+pAtMrw;ej%befr zz!ioW9{@C+6T?$7BQt^UqNx4m#Bh#aYa>K;6aJ!w&5B{&w_xI)r}Q|J*?|a4fySKJ zv)I%#)6@e;4vs+2B0|`24k3X?)qU6G<_rsB&(u}!n>Bp6fVB7(vtwTDF%&G)pXnea zh!mp@EZeuiSqt(XAvqe(G1R66JDFxRR zEDSZUYF*yd)xc^f$Xl%j);^#b==ua|U?Wol4_MT|3aM^W<@@QWfpyZZN4PHTUblX6 zWnS7uU6ktAO?P!QaF4Wi%hf=tx9h2a+W}#yb@$*--POAwwqWU2z-g7zPTl31dRaJX z0yf5eW~UsdM9Dk$pV6|sHL~8x>-p>z=qc-GK%dlhKw)L;@tm`VkS;qml>h#n{MW0j-SWPGp4ZKP zsTQC?Q}%Jv>u8#~9cYkFj$lnfnpJ}F8ECs2Y9RkHZZ7``5e^wVE5AYhH;Mcf5i+b& zO0ZCEfBXt;wK6veaKI6MT62V+&K`VDj`t@*!-;E&;ZR$9q^rP0>v*J9{sv-wA_#)5 zQjs;UdJB{w4}_4vMbGBQe*VFb2#eO94@e+cHrZtr6vH~%1#3cW0gfR5Ib|A6D+!lx z4Ps!PJ<_EvvO#dS6+;MVOBB>HV1TNR4kM@;q_Dx~f?>W@xS3Msf&OEQBP><#U8>!) zRJ(g=_ui$t0ACh+*8sQu=}$8)s4uo0rV2@KJ~W+@Hs;|?T6 zV2Ht_`xcO75auIv#hg6c)2Da@vjbN+HZpbUZlEbAb}t5wVD%L^rf)DZ6To^?+HX$m z&e6&&ryH-*DJeM1e-qqvfPw;ug&J=2(A1X2Kz;BJI z7DQvC1KZ-*8EvAfF|71=8hQaNhi^s#TXi1Cn;S3eY&q#rt{Acyz)5?FxPX2$Y z`S01u|4%jlX@Ijbdw}UGwB?byif1T!m8m5dgbQhdaH@v#tfpf8{G>dH1~$9TGUL4~ z)MMBW(|mW;8kp3KtZ7#ymNW({`M{>)({5r>C1`Rw+3}pq7PqOoQgb@YMiL?RSLTf* zLIc2_7tA`+jX3gO5HSV`50E!R1Td?%^7=t?9Rdl9`GWjyxR_)Xp1J)HcFUS}A|+g>(BDOj7tqh$EVSzEoXbAd)Y;L?(GxS|snvy2Z-o znaXBbDh~rok53Jl#mPAmCo!Bvvfo{kn@f8T4yOjV52B^X$AjU(m8C#EhTkTyn@936 z1A5{&)nT=}hLex8VN`DNI`?6?&gA34aOjhsLzA`GYGYBciBNa>~FH4w6*#L0LHaIu3=1nKeRim$y_Q{ zE#Vm2&4lS|V76PEsf1)-YINI(_W3WnKsA@fgoI(HfVg*1&d&>K2`9J zc*^Pdp|j*vu&9spsBPAz{uZ||YE8Z8*gXM+wjso+u-0&0D(cCvHP{?XkoLaB_PW() zo~^!;vda6w68SzRk?*%i!NLu_%?_~^|L?eQLw ze;2p7V6vg1$$5%T?Vs9g*HKl(er(`Eb5|4(E$!Mi@b5pShM8$XX= zE3+He?S+ahiktR8QCzPjZYmyV2;&J=wm*(%kxvkL5+v+W9C5{Q zMRD}8rCZt3M`mnW+e=j4&xK*z!xVRe0=&Kn*gsI5FpKU&p#l^I5b#NvTX?YH46o>CxDgL=0sSqq0zrtqyxLon@7HtRMb+rVU`0OK{&( z1udbW1RIyC>agNKuz13^>=(*w-+A+`H*;d^cd#pR>l9YyY+V$!-<;Sw@h10i2W|eM zam<24^DP+6^OO+hF+0F-dKOyFddrtYJ(+;IfC|lyV-IhdnVIUrzgKO-E^nxy~DZo&*XxqZ60)T@NNJ$ zC|%R1?*yk$kmJNmpleaoesf|MTaJXhcTMhjV&q(A2d*$dN!zldboG@TzD?EeuZa9L zk?#@tDv^I7viceiU!%)JuHZj?21Hxq;n0d3>jx|gAS=5u#blZ0nXDXxfyFXeS*$bx z5ML%#e5D*p0Bt?exZ;Ok_Ys%wShC?tw(^7`>U*KHVIyT%Ox``*nFJ zgPqWM5Pe51O#MVjuvWyBwiAs^(kolALkV6>P|O>d2(=#(`c~5F*cHq@1XWe1)}Ady z!Jdlcr<$|WNGoEGy8X*j@{i%Al-b@Z%Pxn*@e`rxeL?t&@a58<2`Aqd&d&(vSG@Zj c*h>v$t<8 literal 0 HcmV?d00001 diff --git a/tests/test_core/test_tokenisation.py b/tests/test_core/test_tokenisation.py new file mode 100644 index 0000000..0139c11 --- /dev/null +++ b/tests/test_core/test_tokenisation.py @@ -0,0 +1,124 @@ +"""Tests for the tokenisation module.""" + +from typing import TYPE_CHECKING + +from veritext.core.tokenisation import WordTokeniser + +if TYPE_CHECKING: + from veritext.core.tokenisation import Tokeniser + + +class TestWordTokeniser: + """Tests for WordTokeniser.""" + + def test_basic_tokenisation(self, word_tokeniser: WordTokeniser) -> None: + """Test basic word tokenisation.""" + tokens = word_tokeniser.tokenise("The cat sat on the mat") + assert tokens == ["the", "cat", "sat", "on", "the", "mat"] + + def test_lowercasing(self, word_tokeniser: WordTokeniser) -> None: + """Test that tokens are lowercased by default.""" + tokens = word_tokeniser.tokenise("Hello WORLD") + assert tokens == ["hello", "world"] + + def test_no_lowercasing(self, word_tokeniser_no_lowercase: WordTokeniser) -> None: + """Test tokenisation without lowercasing.""" + tokens = word_tokeniser_no_lowercase.tokenise("Hello WORLD") + assert tokens == ["Hello", "WORLD"] + + def test_punctuation_removal(self, word_tokeniser: WordTokeniser) -> None: + """Test that punctuation is removed by default.""" + tokens = word_tokeniser.tokenise("Hello, world! How are you?") + assert tokens == ["hello", "world", "how", "are", "you"] + + def test_keep_punctuation( + self, word_tokeniser_keep_punctuation: WordTokeniser + ) -> None: + """Test tokenisation keeping punctuation.""" + tokens = word_tokeniser_keep_punctuation.tokenise("Hello, world!") + assert tokens == ["hello,", "world!"] + + def test_empty_string(self, word_tokeniser: WordTokeniser) -> None: + """Test that empty string returns empty list.""" + tokens = word_tokeniser.tokenise("") + assert tokens == [] + + def test_whitespace_only(self, word_tokeniser: WordTokeniser) -> None: + """Test that whitespace-only string returns empty list.""" + tokens = word_tokeniser.tokenise(" \t\n ") + assert tokens == [] + + def test_multiple_spaces(self, word_tokeniser: WordTokeniser) -> None: + """Test handling of multiple spaces between words.""" + tokens = word_tokeniser.tokenise("hello world") + assert tokens == ["hello", "world"] + + def test_unicode_nfc_normalisation(self) -> None: + """Test NFC Unicode normalisation.""" + # 'é' can be composed (U+00E9) or decomposed (e + U+0301) + composed = "caf\u00e9" # café with composed é + decomposed = "cafe\u0301" # café with decomposed é + + tokeniser = WordTokeniser() + tokens_composed = tokeniser.tokenise(composed) + tokens_decomposed = tokeniser.tokenise(decomposed) + + # After NFC normalisation, both should be the same + assert tokens_composed == tokens_decomposed + assert tokens_composed == ["café"] + + def test_unicode_emoji(self, word_tokeniser: WordTokeniser) -> None: + """Test handling of emoji characters.""" + # Emoji are removed as punctuation by default + tokens = word_tokeniser.tokenise("Hello 👋 world 🌍") + assert tokens == ["hello", "world"] + + def test_unicode_non_latin(self, word_tokeniser: WordTokeniser) -> None: + """Test handling of non-Latin scripts.""" + tokens = word_tokeniser.tokenise("日本語 テスト") + assert tokens == ["日本語", "テスト"] + + def test_unicode_mixed_scripts(self, word_tokeniser: WordTokeniser) -> None: + """Test handling of mixed scripts.""" + tokens = word_tokeniser.tokenise("Hello 世界 Bonjour мир") + assert tokens == ["hello", "世界", "bonjour", "мир"] + + def test_numbers_preserved(self, word_tokeniser: WordTokeniser) -> None: + """Test that numbers are preserved.""" + tokens = word_tokeniser.tokenise("I have 42 apples") + assert tokens == ["i", "have", "42", "apples"] + + def test_contractions(self, word_tokeniser: WordTokeniser) -> None: + """Test handling of contractions (apostrophe replaced with space).""" + tokens = word_tokeniser.tokenise("I can't don't won't") + # Apostrophe is replaced with space, splitting the words + assert tokens == ["i", "can", "t", "don", "t", "won", "t"] + + def test_hyphenated_words(self, word_tokeniser: WordTokeniser) -> None: + """Test handling of hyphenated words.""" + tokens = word_tokeniser.tokenise("state-of-the-art") + # Hyphens are removed as punctuation + assert tokens == ["state", "of", "the", "art"] + + def test_custom_normalisation_form(self) -> None: + """Test custom Unicode normalisation form.""" + tokeniser = WordTokeniser(normalisation_form="NFKC") + # NFKC normalises compatibility characters + # ™ (U+2122) is compatibility equivalent to 'TM' + tokens = tokeniser.tokenise("test™") + assert tokens == ["testtm"] + + +class TestTokeniserProtocol: + """Tests for Tokeniser protocol compliance.""" + + def test_word_tokeniser_implements_protocol(self) -> None: + """Test that WordTokeniser implements the Tokeniser protocol.""" + tokeniser: Tokeniser = WordTokeniser() + # Check that it has the required method + assert hasattr(tokeniser, "tokenise") + assert callable(tokeniser.tokenise) + # Check return type + result = tokeniser.tokenise("test") + assert isinstance(result, list) + assert all(isinstance(t, str) for t in result) diff --git a/tests/test_core/test_types.py b/tests/test_core/test_types.py new file mode 100644 index 0000000..7fbb1bd --- /dev/null +++ b/tests/test_core/test_types.py @@ -0,0 +1,201 @@ +"""Tests for the core types module.""" + +import pytest +from pydantic import ValidationError as PydanticValidationError + +from veritext.core.types import CheckResult, ValidationContext, ValidationResult + + +class TestValidationContext: + """Tests for ValidationContext.""" + + def test_create_empty_context(self) -> None: + """Test creating a context with no reference.""" + context = ValidationContext() + assert context.reference is None + assert context.metadata == {} + + def test_create_with_single_reference(self) -> None: + """Test creating a context with a single reference string.""" + context = ValidationContext(reference="The cat sat on the mat.") + assert context.reference == "The cat sat on the mat." + + def test_create_with_multiple_references(self) -> None: + """Test creating a context with multiple reference strings.""" + references = ["The cat sat on the mat.", "A cat was sitting on a mat."] + context = ValidationContext(reference=references) + assert context.reference == references + assert len(context.reference) == 2 + + def test_create_with_metadata(self) -> None: + """Test creating a context with metadata.""" + metadata = {"source": "test", "timestamp": "2024-01-01"} + context = ValidationContext(metadata=metadata) + assert context.metadata == metadata + + def test_context_is_immutable(self) -> None: + """Test that ValidationContext is immutable (frozen).""" + context = ValidationContext(reference="test") + with pytest.raises(PydanticValidationError): + context.reference = "new value" # type: ignore[misc] + + +class TestCheckResult: + """Tests for CheckResult.""" + + def test_create_passing_result(self) -> None: + """Test creating a passing check result.""" + result = CheckResult( + name="bleu", + passed=True, + actual=0.85, + threshold=0.7, + message="BLEU score 0.85 >= threshold 0.7", + ) + assert result.name == "bleu" + assert result.passed is True + assert result.actual == 0.85 + assert result.threshold == 0.7 + assert "0.85" in result.message + + def test_create_failing_result(self) -> None: + """Test creating a failing check result.""" + result = CheckResult( + name="length", + passed=False, + actual=600, + threshold=500, + message="Length 600 exceeds maximum 500", + ) + assert result.name == "length" + assert result.passed is False + assert result.actual == 600 + assert result.threshold == 500 + + def test_create_result_without_threshold(self) -> None: + """Test creating a result for checks without thresholds.""" + result = CheckResult( + name="contains", + passed=True, + actual=["hello", "world"], + threshold=None, + message="Found all required terms", + ) + assert result.threshold is None + + def test_result_is_immutable(self) -> None: + """Test that CheckResult is immutable (frozen).""" + result = CheckResult( + name="test", + passed=True, + actual=1.0, + message="Test passed", + ) + with pytest.raises(PydanticValidationError): + result.passed = False # type: ignore[misc] + + +class TestValidationResult: + """Tests for ValidationResult.""" + + def test_all_checks_passed(self) -> None: + """Test validation result when all checks pass.""" + checks = [ + CheckResult(name="bleu", passed=True, actual=0.8, message="OK"), + CheckResult(name="length", passed=True, actual=100, message="OK"), + ] + result = ValidationResult(passed=True, checks=checks) + + assert result.passed is True + assert len(result.checks) == 2 + assert result.failed_checks == [] + assert "All checks passed" in result.failure_summary + + def test_some_checks_failed(self) -> None: + """Test validation result when some checks fail.""" + checks = [ + CheckResult(name="bleu", passed=True, actual=0.8, message="OK"), + CheckResult( + name="length", + passed=False, + actual=600, + threshold=500, + message="Length 600 exceeds maximum 500", + ), + CheckResult( + name="readability", + passed=False, + actual=12.5, + threshold=8.0, + message="Grade level 12.5 exceeds maximum 8.0", + ), + ] + result = ValidationResult(passed=False, checks=checks) + + assert result.passed is False + assert len(result.checks) == 3 + assert len(result.failed_checks) == 2 + assert result.failed_checks[0].name == "length" + assert result.failed_checks[1].name == "readability" + + def test_failure_summary_format(self) -> None: + """Test the format of failure summary.""" + checks = [ + CheckResult( + name="bleu", + passed=False, + actual=0.5, + threshold=0.7, + message="BLEU score 0.5 < threshold 0.7", + ), + ] + result = ValidationResult(passed=False, checks=checks) + + summary = result.failure_summary + assert "Validation failed" in summary + assert "1 check(s)" in summary + assert "bleu" in summary + assert "BLEU score 0.5 < threshold 0.7" in summary + + def test_empty_checks_list(self) -> None: + """Test validation result with no checks.""" + result = ValidationResult(passed=True, checks=[]) + assert result.passed is True + assert result.checks == [] + assert result.failed_checks == [] + assert "All checks passed" in result.failure_summary + + def test_result_is_immutable(self) -> None: + """Test that ValidationResult is immutable (frozen).""" + result = ValidationResult(passed=True, checks=[]) + with pytest.raises(PydanticValidationError): + result.passed = False # type: ignore[misc] + + def test_check_actual_can_be_any_type(self) -> None: + """Test that CheckResult.actual can hold any type.""" + # List + result1 = CheckResult( + name="contains", + passed=True, + actual=["a", "b", "c"], + message="OK", + ) + assert result1.actual == ["a", "b", "c"] + + # Dict + result2 = CheckResult( + name="detailed", + passed=True, + actual={"bleu1": 0.9, "bleu2": 0.8}, + message="OK", + ) + assert result2.actual == {"bleu1": 0.9, "bleu2": 0.8} + + # Nested structure + result3 = CheckResult( + name="complex", + passed=True, + actual={"scores": [0.1, 0.2], "meta": {"key": "value"}}, + message="OK", + ) + assert result3.actual["scores"] == [0.1, 0.2]