diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl new file mode 100644 index 00000000..279ac6f6 --- /dev/null +++ b/algorithms/czech.sbl @@ -0,0 +1,248 @@ +routines ( + RV R1 + palatalise + mark_regions + do_possessive + do_case + do_comparative + do_diminutive + do_augmentative + do_derivational + do_deriv_single + do_aggressive +) + +externals ( stem ) + +integers ( pV p1 ) + +groupings ( v ) + +stringescapes {} + +stringdef a' '{U+00E1}' +stringdef c^ '{U+010D}' +stringdef d^ '{U+010F}' +stringdef e' '{U+00E9}' +stringdef e^ '{U+011B}' +stringdef i' '{U+00ED}' +stringdef n^ '{U+0148}' +stringdef o' '{U+00F3}' +stringdef r^ '{U+0159}' +stringdef s^ '{U+0161}' +stringdef t^ '{U+0165}' +stringdef u' '{U+00FA}' +stringdef u* '{U+016F}' +stringdef y' '{U+00FD}' +stringdef z^ '{U+017E}' + +define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + + do ( + gopast non-v setmark pV + gopast non-v gopast v setmark p1 + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + + define palatalise as ( + [substring] RV among ( + 'ci' 'ce' '{c^}i' '{c^}' + (<- 'k') + 'zi' 'ze' '{z^}i' '{z^}e' + (<- 'h') + '{c^}t{e^}' '{c^}ti' '{c^}t{e'}' + (<- 'ck') + '{s^}t{e^}' '{s^}ti' '{s^}t{e'}' + (<- 'sk') + ) + ) + + define do_possessive as ( + [substring] RV among ( + 'ov' '{u*}v' + (delete) + 'in' + ( + delete + try palatalise + ) + ) + ) + + define do_case as ( + [substring] among ( + 'atech' + '{e^}tem' 'at{u*}m' + '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' + 'ata' 'aty' 'ama' 'ami' 'ovi' + 'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou' + 'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}' + (delete) + 'ech' 'ich' '{i'}ch' + '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi' + 'emi' 'iho' 'imu' + '{e'}m' '{i'}m' 'es' + 'e' 'i' '{i'}' '{e^}' + ( + delete + try palatalise + ) + 'em' + ( + <- 'e' + try palatalise + ) + ) + ) + + define do_derivational as ( + [substring] R1 among ( + 'obinec' + 'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k' + '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin' + '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k' + 'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv' + '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as' + 'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn' + '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk' + (delete) + 'ion{a'}{r^}' + 'inec' 'itel' + 'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb' + 'ic' 'in' 'it' 'iv' + ( + <- 'i' + palatalise + ) + 'enic' 'ec' 'en' + ( + <- 'e' + palatalise + ) + '{e'}{r^}' + ( + <- '{e'}' + palatalise + ) + '{e^}n' + ( + <- '{e^}' + palatalise + ) + '{i'}rn' + '{i'}{r^}' '{i'}n' + ( + <- '{i'}' + palatalise + ) + ) + ) + define do_deriv_single as ( + [substring] among ( + 'c' '{c^}' 'k' 'l' 'n' 't' + (delete) + ) + ) + + + define do_augmentative as ( + [substring] among ( + 'ajzn' '{a'}k' + (delete) + 'izn' 'isk' + ( + <- 'i' + palatalise + ) + ) + ) + + define do_diminutive as ( + [substring] among ( + 'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek' + 'anek' 'onek' 'unek' '{a'}nek' + 'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk' + '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk' + '{a'}tk' '{a'}nk' 'u{s^}k' + 'k' + (delete) + 'e{c^}ek' 'enek' 'ek' + ( + <- 'e' + palatalise + ) + '{e'}{c^}ek' '{e'}k' + ( + <- '{e'}' + palatalise + ) + 'i{c^}ek' 'inek' 'ik' + ( + <- 'i' + palatalise + ) + '{i'}{c^}ek' '{i'}k' + ( + <- '{i'}' + palatalise + ) + '{a'}k' + (<- '{a'}') + 'ak' + (<- 'a') + 'ok' + (<- 'o') + 'uk' + (<- 'u') + ) + ) + + define do_comparative as ( + [substring] among ( + '{e^}j{s^}' + ( + <- '{e^}' + palatalise + ) + 'ej{s^}' + ( + <- 'e' + palatalise + ) + ) + ) + + define do_aggressive as ( + do do_comparative + do do_diminutive + do do_augmentative + do_derivational or do_deriv_single + ) +) + +define stem as ( + do mark_regions + backwards ( + do_case + do_possessive + // light and aggressive are the same to this point + // comment next line for light stemmer + do_aggressive + ) +) + +// Ljiljana Dolamic and Jacques Savoy. 2009. +// Indexing and stemming approaches for the Czech language. +// Inf. Process. Manage. 45, 6 (November 2009), 714-720. +// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt +// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt \ No newline at end of file diff --git a/algorithms/slovak.sbl b/algorithms/slovak.sbl new file mode 100644 index 00000000..a863e263 --- /dev/null +++ b/algorithms/slovak.sbl @@ -0,0 +1,246 @@ +routines ( + R2 + lower_case + un_accent + exception + mark_p1 + mark_p2 + prefixes + suffixes + end_vowel +) + +externals ( stem ) +integers ( p1 p2 ) +groupings ( vowel ) + +stringescapes {} + +stringdef A" '{U+00C1}' // dlhe a +stringdef A: '{U+00C4}' // siroke a +stringdef C< '{U+010C}' // c s makcenom +stringdef D< '{U+010E}' // d s makcenom +stringdef E" '{U+00C9}' // e s dlznom +stringdef E< '{U+011A}' // e s makcenom +stringdef I" '{U+00CD}' // i s dlznom +stringdef L" '{U+0139}' // l s dlznom +stringdef L< '{U+013E}' // l s makcenom +stringdef N< '{U+013D}' // n s makcenom +stringdef O> '{U+00D4}' // o s vokanom +stringdef O" '{U+00D3}' // o s dlznom +stringdef R" '{U+0154}' // r s dlznom +stringdef R< '{U+0158}' // r s makcenom +stringdef S< '{U+0160}' // s s makcenom +stringdef T< '{U+0164}' // t s makcenom +stringdef U" '{U+00DA}' // u s dlznom +stringdef U^ '{U+016E}' // u s kruzkom +stringdef Y" '{U+00DD}' // y s dlznom +stringdef Z< '{U+017D}' // z s makcenom + +stringdef a" '{U+00E1}' // dlhe a +stringdef a: '{U+00E4}' // siroke a +stringdef c< '{U+010D}' // c s makcenom +stringdef d< '{U+010F}' // d s makcenom +stringdef e" '{U+00E9}' // e s dlznom +stringdef e< '{U+011B}' // e s makcenom +stringdef i" '{U+00ED}' // i s dlznom +stringdef l" '{U+013A}' // l s dlznom +stringdef l< '{U+013E}' // l s makcenom +stringdef n< '{U+0148}' // n s makcenom +stringdef o> '{U+00F4}' // o s vokanom +stringdef o" '{U+00F3}' // o s dlznom +stringdef r" '{U+0155}' // r s dlznom +stringdef r< '{U+0159}' // r s makcenom +stringdef s< '{U+0161}' // s s makcenom +stringdef t< '{U+0165}' // t s makcenom +stringdef u" '{U+00FA}' // u s dlznom +stringdef u^ '{U+016F}' // u s kruzkom +stringdef y" '{U+00FD}' // y s dlznom +stringdef z< '{U+017E}' // z s makcenom +stringdef ia 'ia' // ia +stringdef ie 'ie' // ie +stringdef iu 'iu' // iu + +define vowel 'aeiyou' + +define un_accent as repeat ( + [substring] among ( + '{a"}' (<-'a') + '{a:}' (<-'a') + '{c<}' (<-'c') + '{d<}' (<-'d') + '{e"}' (<-'e') + '{e<}' (<-'e') + '{i"}' (<-'i') + '{l"}' (<-'l') + '{l<}' (<-'l') + '{n<}' (<-'n') + '{o>}' (<-'o') + '{o"}' (<-'o') + '{r"}' (<-'r') + '{r<}' (<-'r') + '{s<}' (<-'s') + '{t<}' (<-'t') + '{u"}' (<-'u') + '{u^}' (<-'u') + '{y"}' (<-'y') + '{z<}' (<-'z') + '' (next) + ) +) + +define lower_case as repeat ( + [substring] among ( + '{A"}' (<-'{a"}') + '{A:}' (<-'{a:}') + '{C<}' (<-'{c<}') + '{D<}' (<-'{d<}') + '{E"}' (<-'{e"}') + '{E<}' (<-'{e<}') + '{I"}' (<-'{i"}') + '{L"}' (<-'{l"}') + '{L<}' (<-'{l<}') + '{N<}' (<-'{n<}') + '{O>}' (<-'{o>}') + '{O"}' (<-'{o"}') + '{R"}' (<-'{r"}') + '{R<}' (<-'{r<}') + '{S<}' (<-'{s<}') + '{T<}' (<-'{t<}') + '{U"}' (<-'{u"}') + '{U^}' (<-'{u^}') + '{Y"}' (<-'{y"}') + '{Z<}' (<-'{z<}') + '' (next) + ) +) + +define exception as ( + [substring] atlimit among ( + /* specialne */ + 'som' 'si' 'je' + 'sme' 'ste' 'su' + 'bol' 'bola' 'bolo' + 'bud' 'budte' 'budme' 'budu' + 'budem' 'budes' 'bude' 'budeme' 'budete' + (<-'byt') + + 'mam' 'mas' 'ma' 'mame' 'mate' 'maju' + 'mal' 'mala' 'malo' 'mali' + 'maj' 'majme' 'majte' + (<-'mat') + + 'idem' 'ides' 'ide' 'ideme' 'idete' 'idu' + 'isiel' 'siel' 'isla' 'sla' 'islo' 'slo' 'isli' 'sli' + 'chod' 'chodte' 'pod' 'podme' 'podte' + 'pojdem' 'pojdes' 'pojde' 'pojdeme' 'pojdete' 'pojdu' + 'iduc' 'iduca' 'iduce' 'iduci' + (<-'ist') + + 'jem' 'jes' 'jedia' 'jeme' 'jete' + 'jedol' 'jedla' 'jedlo' 'jedli' + 'jedz' 'jedzte' 'jedzme' + (<-'jest') + + 'babiek' 'babicka' + (<- 'babk') + + 'dlan' + (<- 'dlan') + + 'matka' 'maticka' 'mamka' 'mamicka' + (<- 'mam') + + 'majetok' + (<- 'majetk') + + 'meste' 'miest' + (<- 'mest') + + 'sral' 'srali' + (<- 'ser') + + 'zien' + (<- 'zen') + ) +) + +define mark_p1 as ( + $p1 = limit + gopast vowel gopast non-vowel setmark p1 +) + +define prefixes as ( + do ( + [substring] among ( + 'proti' + ( + $p1 > 4 delete + ) + 'bez' 'cez' 'naj' 'nad' 'pod' 'pre' 'pri' 'roz' + ( + $p1 > 2 delete + ) + 'do' 'od' 'ob' 'po' 'ne' 'vy' 'vz' 'za' + ( + $p1 > 1 delete + ) + ) + ) +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define mark_p2 as ( + $p2 = limit + ) + + define suffixes as ( + do mark_p2 + do ( + [substring] R2 among ( + 'ejsieho' 'ejsiemu' + 'ejsich' 'ejsimi' 'inovia' 'encoch' 'encami' + 'ejsom' 'ejsim' 'ejsia' 'ejsie' 'ejsej' 'eniec' + 'ejsiu' 'ejsou' 'inych' 'inymi' 'ovych' 'ovymi' + 'inami' 'inovi' 'inoch' 'atami' 'atach' 'encom' + 'avame' 'avate' 'avaju' 'ovala' 'ovalo' 'ovali' + 'ujeme' 'ujete' + 'ejsi' 'iemu' 'ieho' 'inmu' 'inho' 'inej' 'ence' + 'inou' 'inov' 'inom' 'inym' 'ovia' 'iach' 'atom' + 'ovho' 'ovej' 'ovou' 'ovym' 'ovmu' 'ovom' 'atam' + 'ieme' 'iete' 'avam' 'avas' 'oval' 'ujem' 'ujes' + 'ujme' 'ujte' 'ujuc' + 'ovi' 'ovy' 'och' 'ami' 'ach' 'iam' 'eho' 'ete' + 'ych' 'ich' 'ymi' 'imi' 'ini' 'ata' 'atu' 'uju' + 'ina' 'inu' 'ino' 'ine' 'sie' 'emu' 'ati' 'uje' + 'ime' 'ite' 'ila' 'ilo' 'ili' 'iem' 'ies' + 'ali' 'ala' 'alo' 'ame' 'ate' 'aju' 'eme' + 'ove' 'ovu' 'ovo' 'jte' 'jme' 'ska' + 'ym' 'im' 'ia' 'ie' 'ej' 'iu' 'om' 'ou' + 'ho' 'mu' 'ov' 'mi' 'am' 'in' 'at' 'is' 'il' + 'va' 'vu' 've' 'vo' 'al' 'es' 'it' 'as' 'te' + 'me' + 'v' 'n' 'a' 'm' 'i' 'u' 'e' 'y' + (delete) + ) + ) + ) + + define end_vowel as ( + [vowel] (delete) + ) +) + +define stem as ( + do lower_case + do un_accent + do mark_p1 + do prefixes + exception or backwards ( + do suffixes + do end_vowel + ) +) diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt index 6940fd17..de4e48b1 100644 --- a/libstemmer/modules.txt +++ b/libstemmer/modules.txt @@ -13,6 +13,7 @@ arabic UTF_8 arabic,ar,ara armenian UTF_8 armenian,hy,hye,arm basque UTF_8,ISO_8859_1 basque,eu,eus,baq catalan UTF_8,ISO_8859_1 catalan,ca,cat +czech UTF_8,ISO_8859_2 czech,cz,cze danish UTF_8,ISO_8859_1 danish,da,dan dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld english UTF_8,ISO_8859_1 english,en,eng @@ -32,6 +33,7 @@ portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron russian UTF_8,KOI8_R russian,ru,rus serbian UTF_8 serbian,sr,srp +slovak UTF_8,ISO_8859_2 slovak,sk,svk spanish UTF_8,ISO_8859_1 spanish,es,esl,spa swedish UTF_8,ISO_8859_1 swedish,sv,swe tamil UTF_8 tamil,ta,tam