Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Czech and Slovak algorithms. #149

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
248 changes: 248 additions & 0 deletions algorithms/czech.sbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
// Routines private to this stemmer (declared here, defined further down).
routines (
    RV R1 palatalise mark_regions
    do_possessive do_case
    do_comparative do_diminutive do_augmentative
    do_derivational do_deriv_single
    do_aggressive
)

// The single routine exported to callers.
externals ( stem )

// Region marks computed by mark_regions and tested by RV / R1.
integers ( pV p1 )

// v is the vowel grouping.
groupings ( v )

// Inside string literals, {name} expands a stringdef or a U+XXXX code point.
stringescapes {}

// Named escapes for the accented lower-case letters used by the suffix
// tables. Each name is the base letter followed by a mark for the diacritic.
stringdef a' '{U+00E1}' // a with acute
stringdef c^ '{U+010D}' // c with caron
stringdef d^ '{U+010F}' // d with caron
stringdef e' '{U+00E9}' // e with acute
stringdef e^ '{U+011B}' // e with caron
stringdef i' '{U+00ED}' // i with acute
stringdef n^ '{U+0148}' // n with caron
stringdef o' '{U+00F3}' // o with acute
stringdef r^ '{U+0159}' // r with caron
stringdef s^ '{U+0161}' // s with caron
stringdef t^ '{U+0165}' // t with caron
stringdef u' '{U+00FA}' // u with acute
stringdef u* '{U+016F}' // u with ring above
stringdef y' '{U+00FD}' // y with acute
stringdef z^ '{U+017E}' // z with caron

// Vowel grouping: the six plain vowels plus every accented vowel defined
// above. It is used by mark_regions to locate pV and p1.
define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'

// Set the two region marks used by RV and R1.
//   pV - position just after the first character that is not in v
//   p1 - from pV, position just after the next non-v character and then
//        the next v character
// A mark that cannot be located keeps its initial value, limit (the end of
// the word), which makes the corresponding region empty.
define mark_regions as (
    $pV = limit  $p1 = limit

    do (
        gopast non-v
        setmark pV

        gopast non-v
        gopast v
        setmark p1
    )
)

backwardmode (

// Region tests: each one succeeds only when the cursor is at or to the right
// of the corresponding mark set by mark_regions.
define RV as ( $pV <= cursor )
define R1 as ( $p1 <= cursor )

// Reverse consonant palatalisation at the end of the word.
//
// The longest matching ending that starts inside RV is replaced by its
// unpalatalised consonant(s):
//   c / {c^}  + i, e              ->  k
//   z / {z^}  + i, e              ->  h
//   {c^}t     + {e^}, i, {i'}     ->  ck
//   {s^}t     + {e^}, i, {i'}     ->  sk
// Every pattern consists of the palatalised consonant(s) followed by the
// vowel that triggers the alternation; a bare consonant is never matched.
// The routine signals failure (and leaves the word unchanged) when no
// pattern matches or when the match is not inside RV, so callers that can
// tolerate that must wrap the call in "try".
define palatalise as (
    [substring] RV among (
        'ci' 'ce' '{c^}i' '{c^}e'
            (<- 'k')
        'zi' 'ze' '{z^}i' '{z^}e'
            (<- 'h')
        '{c^}t{e^}' '{c^}ti' '{c^}t{i'}'
            (<- 'ck')
        '{s^}t{e^}' '{s^}ti' '{s^}t{i'}'
            (<- 'sk')
    )
)

// Remove a possessive suffix that starts inside RV.
//   ov, {u*}v : removed
//   in        : removed, then palatalise is attempted on what is left
// Signals failure when no suffix matches or the match is outside RV.
define do_possessive as (
    [substring] RV among (
        '{u*}v' 'ov'
            ( delete )

        'in'
            ( delete  try palatalise )
    )
)

// Remove a case ending (longest match wins; no region test is applied).
// Three kinds of action:
//   1. endings that are simply deleted
//   2. endings that are deleted, after which palatalise is attempted
//   3. "em", which is reduced to "e" before palatalise is attempted
// Signals failure when the word ends in none of the listed endings.
define do_case as (
    [substring] among (

        // 1. plain removal
        // five letters
        'atech'
        // four letters
        '{e^}tem' 'at{u*}m'
        // three letters
        '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
        'ata' 'aty' 'ama' 'ami' 'ovi'
        // two letters
        'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
        // one letter
        'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
            ( delete )

        // 2. removal followed by an optional palatalise
        // three letters
        'ech' 'ich' '{i'}ch'
        '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
        'emi' 'iho' 'imu'
        // two letters
        '{e'}m' '{i'}m' 'es'
        // one letter
        'e' 'i' '{i'}' '{e^}'
            ( delete  try palatalise )

        // 3. keep the vowel, then an optional palatalise
        'em'
            ( <- 'e'  try palatalise )
    )
)

// Remove a derivational suffix that starts inside R1 (longest match wins).
//
// Suffixes in the first group are deleted outright. Every other suffix is
// reduced to its leading vowel and palatalise is then required to succeed;
// if it does not, this routine signals failure with the vowel left in place.
// Also signals failure when nothing matches or the match is outside R1.
define do_derivational as (
    [substring] R1 among (

        // deleted outright
        // six letters
        'obinec'
        // five letters
        'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
        // four letters
        '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
        // three letters
        '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
        'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
        // two letters
        '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
        'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
        '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
            ( delete )

        // reduced to "i"
        'ion{a'}{r^}'
        'inec' 'itel'
        'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
        'ic' 'in' 'it' 'iv'
            ( <- 'i'  palatalise )

        // reduced to "e"
        'enic' 'ec' 'en'
            ( <- 'e'  palatalise )

        // reduced to e with acute
        '{e'}{r^}'
            ( <- '{e'}'  palatalise )

        // reduced to e with caron
        '{e^}n'
            ( <- '{e^}'  palatalise )

        // reduced to i with acute
        '{i'}rn'
        '{i'}{r^}' '{i'}n'
            ( <- '{i'}'  palatalise )
    )
)
// Remove one trailing consonant out of c, c-caron, k, l, n, t.
// No region test is applied. Signals failure when the word ends in
// none of them.
define do_deriv_single as (
    [substring] among (
        'c' '{c^}' 'k' 'l' 'n' 't'  ( delete )
    )
)


// Remove an augmentative suffix (no region test is applied).
//   ajzn, {a'}k : deleted
//   izn, isk    : reduced to "i", after which palatalise must succeed
// Signals failure when nothing matches or when the required palatalise
// fails (in that case the "i" stays in place).
define do_augmentative as (
    [substring] among (
        'ajzn' '{a'}k'
            ( delete )

        'izn' 'isk'
            ( <- 'i'  palatalise )
    )
)

// Remove a diminutive suffix (longest match wins; no region test is applied).
//
// Three kinds of action:
//   - suffixes that are deleted outright
//   - suffixes beginning with e / e-acute / i / i-acute, which are reduced
//     to that vowel and then require palatalise to succeed
//   - a-acute / a / o / u + k, which are reduced to the vowel alone
// Signals failure when nothing matches or a required palatalise fails.
define do_diminutive as (
    [substring] among (

        // deleted outright
        // five letters
        'ou{s^}ek'
        // four letters
        '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
        'anek' 'onek' 'unek' '{a'}nek'
        // three letters
        'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
        '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
        '{a'}tk' '{a'}nk' 'u{s^}k'
        // one letter
        'k'
            ( delete )

        // vowel kept, palatalise required
        'e{c^}ek' 'enek' 'ek'
            ( <- 'e'  palatalise )

        '{e'}{c^}ek' '{e'}k'
            ( <- '{e'}'  palatalise )

        'i{c^}ek' 'inek' 'ik'
            ( <- 'i'  palatalise )

        '{i'}{c^}ek' '{i'}k'
            ( <- '{i'}'  palatalise )

        // vowel kept, no palatalise
        '{a'}k'  ( <- '{a'}' )
        'ak'     ( <- 'a' )
        'ok'     ( <- 'o' )
        'uk'     ( <- 'u' )
    )
)

// Remove a comparative suffix (no region test is applied).
// Both forms keep their leading vowel and then require palatalise to
// succeed:
//   {e^}j{s^}  ->  {e^}
//   ej{s^}     ->  e
// Signals failure when neither form matches or palatalise fails.
define do_comparative as (
    [substring] among (
        '{e^}j{s^}'
            ( <- '{e^}'  palatalise )

        'ej{s^}'
            ( <- 'e'  palatalise )
    )
)

// Extra steps that make up the aggressive variant of the stemmer.
// The first three steps are optional (wrapped in "do"); the routine's own
// result is that of the final step, which tries do_derivational and falls
// back to do_deriv_single when that fails.
define do_aggressive as (
    do do_comparative
    do do_diminutive
    do do_augmentative
    ( do_derivational or do_deriv_single )
)
)

// Entry point: stem the current word in place.
//
// mark_regions runs forwards to set pV and p1; the suffix-stripping steps
// then run backwards from the end of the word. Every step is wrapped in
// "do" so that a step which finds nothing to remove does not prevent the
// following steps from running, and so that stem itself always succeeds.
define stem as (
    do mark_regions
    backwards (
        do do_case
        do do_possessive
        // The light and aggressive stemmers are identical up to this point.
        // Comment out the next line to get the light stemmer.
        do do_aggressive
    )
)

// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt
Loading