From 918e0cbe79bbf97f8e1a17e9f9f2fff0e8f5dbe0 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 31 Aug 2021 15:30:22 +1200 Subject: [PATCH 01/22] Merge Czech stemmer This has been on the web site since 2012, but never actually got included in the code distribution. --- algorithms/czech.sbl | 248 +++++++++++++++++++++++++++++++++++++++++ libstemmer/modules.txt | 1 + 2 files changed, 249 insertions(+) create mode 100644 algorithms/czech.sbl diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl new file mode 100644 index 00000000..30cadaf0 --- /dev/null +++ b/algorithms/czech.sbl @@ -0,0 +1,248 @@ +routines ( + RV R1 + palatalise + mark_regions + do_possessive + do_case + do_comparative + do_diminutive + do_augmentative + do_derivational + do_deriv_single + do_aggressive +) + +externals ( stem ) + +integers ( pV p1 ) + +groupings ( v ) + +stringescapes {} + +stringdef a' '{U+00E1}' +stringdef c^ '{U+010D}' +stringdef d^ '{U+010F}' +stringdef e' '{U+00E9}' +stringdef e^ '{U+011B}' +stringdef i' '{U+00ED}' +stringdef n^ '{U+0148}' +stringdef o' '{U+00F3}' +stringdef r^ '{U+0159}' +stringdef s^ '{U+0161}' +stringdef t^ '{U+0165}' +stringdef u' '{U+00FA}' +stringdef u* '{U+016F}' +stringdef y' '{U+00FD}' +stringdef z^ '{U+017E}' + +define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + + do ( + gopast non-v setmark pV + gopast non-v gopast v setmark p1 + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + + define palatalise as ( + [substring] RV among ( + 'ci' 'ce' '{c^}i' '{c^}' + (<- 'k') + 'zi' 'ze' '{z^}i' '{z^}e' + (<- 'h') + '{c^}t{e^}' '{c^}ti' '{c^}t{e'}' + (<- 'ck') + '{s^}t{e^}' '{s^}ti' '{s^}t{e'}' + (<- 'sk') + ) + ) + + define do_possessive as ( + [substring] RV among ( + 'ov' '{u*}v' + (delete) + 'in' + ( + delete + try palatalise + ) + ) + ) + + define do_case as ( + [substring] among ( + 'atech' + '{e^}tem' 'at{u*}m' + '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' + 'ata' 
'aty' 'ama' 'ami' 'ovi' + 'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou' + 'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}' + (delete) + 'ech' 'ich' '{i'}ch' + '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi' + 'emi' 'iho' 'imu' + '{e'}m' '{i'}m' 'es' + 'e' 'i' '{i'}' '{e^}' + ( + delete + try palatalise + ) + 'em' + ( + <- 'e' + try palatalise + ) + ) + ) + + define do_derivational as ( + [substring] R1 among ( + 'obinec' + 'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k' + '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin' + '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k' + 'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv' + '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as' + 'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn' + '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk' + (delete) + 'ion{a'}{r^}' + 'inec' 'itel' + 'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb' + 'ic' 'in' 'it' 'iv' + ( + <- 'i' + palatalise + ) + 'enic' 'ec' 'en' + ( + <- 'e' + palatalise + ) + '{e'}{r^}' + ( + <- '{e'}' + palatalise + ) + '{e^}n' + ( + <- '{e^}' + palatalise + ) + '{i'}rn' + '{i'}{r^}' '{i'}n' + ( + <- '{i'}' + palatalise + ) + ) + ) + define do_deriv_single as ( + [substring] among ( + 'c' '{c^}' 'k' 'l' 'n' 't' + (delete) + ) + ) + + + define do_augmentative as ( + [substring] among ( + 'ajzn' '{a'}k' + (delete) + 'izn' 'isk' + ( + <- 'i' + palatalise + ) + ) + ) + + define do_diminutive as ( + [substring] among ( + 'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek' + 'anek' 'onek' 'unek' '{a'}nek' + 'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk' + '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk' + '{a'}tk' '{a'}nk' 'u{s^}k' + 'k' + (delete) + 'e{c^}ek' 'enek' 'ek' + ( + <- 'e' + palatalise + ) + '{e'}{c^}ek' '{e'}k' + ( + <- '{e'}' + palatalise + ) + 'i{c^}ek' 'inek' 'ik' + ( + <- 'i' + palatalise + ) + '{i'}{c^}ek' '{i'}k' + ( + <- '{i'}' + palatalise + ) + '{a'}k' + (<- '{a'}') + 'ak' + (<- 'a') + 'ok' + (<- 'o') + 'uk' + (<- 'u') + ) + ) 
+ + define do_comparative as ( + [substring] among ( + '{e^}j{s^}' + ( + <- '{e^}' + palatalise + ) + 'ej{s^}' + ( + <- 'e' + palatalise + ) + ) + ) + + define do_aggressive as ( + do do_comparative + do do_diminutive + do do_augmentative + do_derivational or do_deriv_single + ) +) + +define stem as ( + do mark_regions + backwards ( + do_case + do_possessive + // light and aggressive are the same to this point + // comment next line for light stemmer + do_aggressive + ) +) + +// Ljiljana Dolamic and Jacques Savoy. 2009. +// Indexing and stemming approaches for the Czech language. +// Inf. Process. Manage. 45, 6 (November 2009), 714-720. +// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt +// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt index cd36a219..58df1464 100644 --- a/libstemmer/modules.txt +++ b/libstemmer/modules.txt @@ -13,6 +13,7 @@ arabic UTF_8 arabic,ar,ara armenian UTF_8 armenian,hy,hye,arm basque UTF_8,ISO_8859_1 basque,eu,eus,baq catalan UTF_8,ISO_8859_1 catalan,ca,cat +czech UTF_8,ISO_8859_2 czech,cs,ces,cze danish UTF_8,ISO_8859_1 danish,da,dan dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld english UTF_8,ISO_8859_1 english,en,eng From 17d85272eef8857462b61ea57aef6ccc1fd96eea Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 1 Sep 2021 11:56:32 +1200 Subject: [PATCH 02/22] Only apply do_case in R1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helps avoid overstemming. 
Co-authored-by: Jim O’Regan --- algorithms/czech.sbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 30cadaf0..5bab9f9e 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -80,7 +80,7 @@ backwardmode ( ) define do_case as ( - [substring] among ( + [substring] R1 among ( 'atech' '{e^}tem' 'at{u*}m' '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' From 18016f58f2c81bbb212a4ae0f4599cf33947f91c Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 1 Sep 2021 12:01:15 +1200 Subject: [PATCH 03/22] Implement the "light" version of the stemmer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "aggressive" version is known to overstem. According to the original paper, the aggressive version performs slightly better, but the difference isn't statistically significant and conflation from overstemming can be problematic. Co-authored-by: Jim O’Regan --- algorithms/czech.sbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 5bab9f9e..c5bd04f9 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -237,7 +237,7 @@ define stem as ( do_possessive // light and aggressive are the same to this point // comment next line for light stemmer - do_aggressive + // do_aggressive ) ) From fe8fe84955b9652df98c188bf2f0c313ca522812 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 1 Sep 2021 12:01:58 +1200 Subject: [PATCH 04/22] Improve comment about origin of algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jim O’Regan --- algorithms/czech.sbl | 1 + 1 file changed, 1 insertion(+) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index c5bd04f9..033025bc 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -244,5 +244,6 @@ define stem as ( // Ljiljana Dolamic and Jacques Savoy. 2009. 
// Indexing and stemming approaches for the Czech language. // Inf. Process. Manage. 45, 6 (November 2009), 714-720. +// based on Java code by Ljiljana Dolamic: // http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt // http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt From d39135766075fe99b5a8253c6bfa5654b54244a4 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 1 Sep 2021 12:31:31 +1200 Subject: [PATCH 05/22] czech: Strip out unused "aggressive" code Avoids snowball and C compiler warnings. --- algorithms/czech.sbl | 134 ------------------------------------------- 1 file changed, 134 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 033025bc..5d777231 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -4,12 +4,6 @@ routines ( mark_regions do_possessive do_case - do_comparative - do_diminutive - do_augmentative - do_derivational - do_deriv_single - do_aggressive ) externals ( stem ) @@ -104,130 +98,6 @@ backwardmode ( ) ) ) - - define do_derivational as ( - [substring] R1 among ( - 'obinec' - 'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k' - '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin' - '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k' - 'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv' - '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as' - 'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn' - '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk' - (delete) - 'ion{a'}{r^}' - 'inec' 'itel' - 'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb' - 'ic' 'in' 'it' 'iv' - ( - <- 'i' - palatalise - ) - 'enic' 'ec' 'en' - ( - <- 'e' - palatalise - ) - '{e'}{r^}' - ( - <- '{e'}' - palatalise - ) - '{e^}n' - ( - <- '{e^}' - palatalise - ) - '{i'}rn' - '{i'}{r^}' '{i'}n' - ( - <- '{i'}' - palatalise - ) - ) - ) - define do_deriv_single as ( - [substring] among ( - 'c' '{c^}' 'k' 'l' 'n' 't' - (delete) - ) - ) - - - define do_augmentative as ( - [substring] among ( - 'ajzn' '{a'}k' - (delete) - 'izn' 'isk' - ( - <- 'i' - 
palatalise - ) - ) - ) - - define do_diminutive as ( - [substring] among ( - 'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek' - 'anek' 'onek' 'unek' '{a'}nek' - 'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk' - '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk' - '{a'}tk' '{a'}nk' 'u{s^}k' - 'k' - (delete) - 'e{c^}ek' 'enek' 'ek' - ( - <- 'e' - palatalise - ) - '{e'}{c^}ek' '{e'}k' - ( - <- '{e'}' - palatalise - ) - 'i{c^}ek' 'inek' 'ik' - ( - <- 'i' - palatalise - ) - '{i'}{c^}ek' '{i'}k' - ( - <- '{i'}' - palatalise - ) - '{a'}k' - (<- '{a'}') - 'ak' - (<- 'a') - 'ok' - (<- 'o') - 'uk' - (<- 'u') - ) - ) - - define do_comparative as ( - [substring] among ( - '{e^}j{s^}' - ( - <- '{e^}' - palatalise - ) - 'ej{s^}' - ( - <- 'e' - palatalise - ) - ) - ) - - define do_aggressive as ( - do do_comparative - do do_diminutive - do do_augmentative - do_derivational or do_deriv_single - ) ) define stem as ( @@ -235,9 +105,6 @@ define stem as ( backwards ( do_case do_possessive - // light and aggressive are the same to this point - // comment next line for light stemmer - // do_aggressive ) ) @@ -246,4 +113,3 @@ define stem as ( // Inf. Process. Manage. 45, 6 (November 2009), 714-720. // based on Java code by Ljiljana Dolamic: // http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt -// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt From 1ee3d2f239264c77740fdbf97ddfb2b683eac68b Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 2 Nov 2023 13:57:46 +1300 Subject: [PATCH 06/22] =?UTF-8?q?czech:=20Remove=20-=C5=AFm=20ending=20in?= =?UTF-8?q?=20do=5Fcase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Java code removes this ending but it was missing from the Snowball version. Looking at the changes resulting from this, it seems a clear improvement so I've concluded it was an accidental omission. 
See #151 --- algorithms/czech.sbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 5d777231..52488868 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -79,7 +79,7 @@ backwardmode ( '{e^}tem' 'at{u*}m' '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' 'ata' 'aty' 'ama' 'ami' 'ovi' - 'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou' + 'at' '{a'}m' 'os' 'us' '{u*}m' '{y'}m' 'mi' 'ou' 'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}' (delete) 'ech' 'ich' '{i'}ch' From 17e83a1cc2d6f1387b3f754d825c15a3c87b6f4c Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 4 Sep 2024 17:37:00 +1200 Subject: [PATCH 07/22] Add initial version of CzechStemmerLight.java Temporary addition to allow easy comparison with Snowball implementation. As downloaded, except for comment and whitespace tweaks, plus addition of main() to allow testing. --- CzechStemmerLight.java | 297 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 CzechStemmerLight.java diff --git a/CzechStemmerLight.java b/CzechStemmerLight.java new file mode 100644 index 00000000..6ef5779b --- /dev/null +++ b/CzechStemmerLight.java @@ -0,0 +1,297 @@ +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.io.Writer; +import java.nio.charset.StandardCharsets; + +/** + * @author Dolamic Ljiljana University of Neuchatel + * + * Czech stemmer-removes case endings form nouns and adjectives, possessive adj. 
+ * endings from names + * and takes care of palatalisation + */ +public class CzechStemmerLight { + + /** + * A buffer of the current word being stemmed + */ + private StringBuffer sb=new StringBuffer(); + + + /** + * Default constructor + */ + public CzechStemmerLight(){} // constructor + + public String stem(String input){ + + // + input=input.toLowerCase(); + + //reset string buffer + sb.delete(0,sb.length()); + sb.insert(0,input); + + // stemming... + //removes case endings from nouns and adjectives + removeCase(sb); + + //removes possessive endings from names -ov- and -in- + removePossessives(sb); + + String result = sb.toString(); + + + return result; + } + private void palatalise(StringBuffer buffer){ + int len=buffer.length(); + + if( buffer.substring( len- 2 ,len).equals("ci")|| + buffer.substring( len- 2 ,len).equals("ce")|| + buffer.substring( len- 2 ,len).equals("\u010di")|| //-či + buffer.substring( len- 2 ,len).equals("\u010de")){ //-če + + buffer.replace(len- 2 ,len, "k"); + return; + } + if( buffer.substring( len- 2 ,len).equals("zi")|| + buffer.substring( len- 2 ,len).equals("ze")|| + buffer.substring( len- 2 ,len).equals("\u017ei")|| //-ži + buffer.substring( len- 2 ,len).equals("\u017ee")){ //-že + + buffer.replace(len- 2 ,len, "h"); + return; + } + if( buffer.substring( len- 3 ,len).equals("\u010dt\u011b")|| //-čtě + buffer.substring( len- 3 ,len).equals("\u010dti")|| //-čti + buffer.substring( len- 3 ,len).equals("\u010dt\u00ed")){ //-čtí + + buffer.replace(len- 3 ,len, "ck"); + return; + } + if( buffer.substring( len- 2 ,len).equals("\u0161t\u011b")|| //-ště + buffer.substring( len- 2 ,len).equals("\u0161ti")|| //-šti + buffer.substring( len- 2 ,len).equals("\u0161t\u00ed")){ //-ští + + buffer.replace(len- 2 ,len, "sk"); + return; + } + buffer.delete( len- 1 , len); + return; + }//palatalise + + private void removePossessives(StringBuffer buffer) { + int len=buffer.length(); + + if( len> 5 ){ + if( buffer.substring( len- 2 
,len).equals("ov")){ + + buffer.delete( len- 2 , len); + return; + } + if( buffer.substring( len-2,len).equals("\u016fv")){ //-ův + + buffer.delete( len- 2 , len); + return; + } + if( buffer.substring( len- 2 ,len).equals("in")){ + + buffer.delete( len- 1 , len); + palatalise(buffer); + return; + } + } + return; + }//removePossessives + + private void removeCase(StringBuffer buffer) { + int len=buffer.length(); + // + if( (len> 7 )&& + buffer.substring( len- 5 ,len).equals("atech")){ + + buffer.delete( len- 5 , len); + return; + }//len>7 + if( len> 6 ){ + if(buffer.substring( len- 4 ,len).equals("\u011btem")){ //-ětem + + buffer.delete( len- 3 , len); + palatalise(buffer); + return; + } + if(buffer.substring( len- 4 ,len).equals("at\u016fm")){ //-atům + buffer.delete( len- 4 , len); + return; + } + + } + if( len> 5 ){ + if(buffer.substring( len-3,len).equals("ech")|| + buffer.substring( len-3,len).equals("ich")|| + buffer.substring( len-3,len).equals("\u00edch")){ //-ích + + buffer.delete( len-2 , len); + palatalise(buffer); + return; + } + if(buffer.substring( len-3,len).equals("\u00e9ho")|| //-ého + buffer.substring( len-3,len).equals("\u011bmi")|| //-ěmi + buffer.substring( len-3,len).equals("emi")|| + buffer.substring( len-3,len).equals("\u00e9mu")|| //-ému + buffer.substring( len-3,len).equals("\u011bte")|| //-ěte + buffer.substring( len-3,len).equals("\u011bti")|| //-ěti + buffer.substring( len-3,len).equals("iho")|| + buffer.substring( len-3,len).equals("\u00edho")|| //-ího + buffer.substring( len-3,len).equals("\u00edmi")|| //-ími + buffer.substring( len-3,len).equals("imu")){ + + buffer.delete( len- 2 , len); + palatalise(buffer); + return; + } + if( buffer.substring( len-3,len).equals("\u00e1ch")|| //-ách + buffer.substring( len-3,len).equals("ata")|| + buffer.substring( len-3,len).equals("aty")|| + buffer.substring( len-3,len).equals("\u00fdch")|| //-ých + buffer.substring( len-3,len).equals("ama")|| + buffer.substring( len-3,len).equals("ami")|| + 
buffer.substring( len-3,len).equals("ov\u00e9")|| //-ové + buffer.substring( len-3,len).equals("ovi")|| + buffer.substring( len-3,len).equals("\u00fdmi")){ //-ými + + buffer.delete( len- 3 , len); + return; + } + } + if( len> 4){ + if(buffer.substring( len-2,len).equals("em")){ + + buffer.delete( len- 1 , len); + palatalise(buffer); + return; + + } + if( buffer.substring( len-2,len).equals("es")|| + buffer.substring( len-2,len).equals("\u00e9m")|| //-ém + buffer.substring( len-2,len).equals("\u00edm")){ //-ím + + buffer.delete( len- 2 , len); + palatalise(buffer); + return; + } + if( buffer.substring( len-2,len).equals("\u016fm")){ //-ům + + buffer.delete( len- 2 , len); + return; + } + if( buffer.substring( len-2,len).equals("at")|| + buffer.substring( len-2,len).equals("\u00e1m")|| //-ám + buffer.substring( len-2,len).equals("os")|| + buffer.substring( len-2,len).equals("us")|| + buffer.substring( len-2,len).equals("\u00fdm")|| //-ým + buffer.substring( len-2,len).equals("mi")|| + buffer.substring( len-2,len).equals("ou")){ + + buffer.delete( len- 2 , len); + return; + } + }//len>4 + if( len> 3){ + if( buffer.substring( len-1,len).equals("e")|| + buffer.substring( len-1,len).equals("i")){ + + palatalise(buffer); + return; + } + if( buffer.substring( len-1,len).equals("\u00ed")|| //-í + buffer.substring( len-1,len).equals("\u011b")){ //-ě + + palatalise(buffer); + return; + } + if( buffer.substring( len-1,len).equals("u")|| + buffer.substring( len-1,len).equals("y")|| + buffer.substring( len-1,len).equals("\u016f")){ //-ů + + buffer.delete( len- 1 , len); + return; + + } + if( buffer.substring( len-1,len).equals("a")|| + buffer.substring( len-1,len).equals("o")|| + buffer.substring( len-1,len).equals("\u00e1")|| // -á + buffer.substring( len-1,len).equals("\u00e9")|| //-é + buffer.substring( len-1,len).equals("\u00fd")){ //-ý + + buffer.delete( len- 1 , len); + return; + } + }//len>3 + } + + + private static void usage() + { + System.err.println("Usage: TestApp [] 
[-o ]"); + } + + public static void main(String [] args) throws Throwable { + if (args.length < 1) { + usage(); + return; + } + + CzechStemmerLight stemmer = new CzechStemmerLight(); + + int arg = 1; + + InputStream instream; + if (args.length > arg && !args[arg].equals("-o")) { + instream = new FileInputStream(args[arg++]); + } else { + instream = System.in; + } + + OutputStream outstream; + if (args.length > arg) { + if (args.length != arg + 2 || !args[arg].equals("-o")) { + usage(); + return; + } + outstream = new FileOutputStream(args[arg + 1]); + } else { + outstream = System.out; + } + + Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8); + reader = new BufferedReader(reader); + + Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8); + output = new BufferedWriter(output); + + StringBuffer input = new StringBuffer(); + int character; + while ((character = reader.read()) != -1) { + char ch = (char) character; + if (Character.isWhitespace(ch)) { + String result = stemmer.stem(input.toString()); + output.write(result); + output.write('\n'); + input.delete(0, input.length()); + } else { + input.append(ch < 127 ? 
Character.toLowerCase(ch) : ch); + } + } + output.flush(); + } + +}//CzechStemmer_1 From 5ef5479a1fa395f07e9a97047f3cf37bcea6abca Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 5 Sep 2024 12:23:02 +1200 Subject: [PATCH 08/22] =?UTF-8?q?CzechStemmerLight:=20Fix=20length=20check?= =?UTF-8?q?=20for=20=C5=A1t=C4=9B/=C5=A1ti/=C5=A1t=C3=AD=20removal?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CzechStemmerLight.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CzechStemmerLight.java b/CzechStemmerLight.java index 6ef5779b..57b05595 100644 --- a/CzechStemmerLight.java +++ b/CzechStemmerLight.java @@ -77,11 +77,11 @@ private void palatalise(StringBuffer buffer){ buffer.replace(len- 3 ,len, "ck"); return; } - if( buffer.substring( len- 2 ,len).equals("\u0161t\u011b")|| //-ště - buffer.substring( len- 2 ,len).equals("\u0161ti")|| //-šti - buffer.substring( len- 2 ,len).equals("\u0161t\u00ed")){ //-ští + if( buffer.substring( len- 3 ,len).equals("\u0161t\u011b")|| //-ště + buffer.substring( len- 3 ,len).equals("\u0161ti")|| //-šti + buffer.substring( len- 3 ,len).equals("\u0161t\u00ed")){ //-ští - buffer.replace(len- 2 ,len, "sk"); + buffer.replace(len- 3 ,len, "sk"); return; } buffer.delete( len- 1 , len); From 8c88ddf68ab470d0a59499292729e4428e32cd9a Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 5 Sep 2024 13:16:00 +1200 Subject: [PATCH 09/22] =?UTF-8?q?Change=20=C4=8D=20suffix=20check=20to=20?= =?UTF-8?q?=C4=8De?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Java implementation removes če but has an incorrect comment saying it removes č. Comparing before and after on the test vocabulary, this is a clear improvement. 
--- algorithms/czech.sbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 52488868..52fcd75e 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -50,7 +50,7 @@ backwardmode ( define palatalise as ( [substring] RV among ( - 'ci' 'ce' '{c^}i' '{c^}' + 'ci' 'ce' '{c^}i' '{c^}e' (<- 'k') 'zi' 'ze' '{z^}i' '{z^}e' (<- 'h') From ae495981c013a5a606aa4fe1c1dcd430c6758816 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 5 Sep 2024 13:46:45 +1200 Subject: [PATCH 10/22] =?UTF-8?q?czech:=20Change=20-=C4=8Dt=C3=A9/-=C5=A1t?= =?UTF-8?q?=C3=A9=20to=20-=C4=8Dt=C3=AD/-=C5=A1t=C3=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Java implementation removes the latter but has incorrect comments saying it removes the former. Changing the Snowball implementation makes no difference here (probably due to the oddness around when to remove a character vs calling do_palatalise) but changing Java to use the Snowball suffixes here leads to a clear regression, so adjust the Snowball implementation to match Java implementation. 
--- algorithms/czech.sbl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 52fcd75e..f4e6d55b 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -54,9 +54,9 @@ backwardmode ( (<- 'k') 'zi' 'ze' '{z^}i' '{z^}e' (<- 'h') - '{c^}t{e^}' '{c^}ti' '{c^}t{e'}' + '{c^}t{e^}' '{c^}ti' '{c^}t{i'}' (<- 'ck') - '{s^}t{e^}' '{s^}ti' '{s^}t{e'}' + '{s^}t{e^}' '{s^}ti' '{s^}t{i'}' (<- 'sk') ) ) From 67a58634eaf57248c5a712ce90e71ff9ef38309b Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 5 Sep 2024 14:57:45 +1200 Subject: [PATCH 11/22] =?UTF-8?q?CzechStemmerLight:=20Remove=20one=20char?= =?UTF-8?q?=20for=20-es/-=C3=A9m/-=C3=ADm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This case was inconsistent with all the other cases where we call palatalise as we remove the whole suffix here but leave the first character in every other case. Checking the vocabulary list, this means palatalise will almost never match one of the suffixes, as the only words with this as an ending in the list are these, which look like they're actually English words (except "abies"): abies cookies hippies series studies This means palatalise will just remove the last character, which seems odd. This change changes a lot of stems but seems to be an improvement in pretty much every instance I checked in google translate. 
--- CzechStemmerLight.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CzechStemmerLight.java b/CzechStemmerLight.java index 57b05595..c5018a15 100644 --- a/CzechStemmerLight.java +++ b/CzechStemmerLight.java @@ -184,7 +184,7 @@ private void removeCase(StringBuffer buffer) { buffer.substring( len-2,len).equals("\u00e9m")|| //-ém buffer.substring( len-2,len).equals("\u00edm")){ //-ím - buffer.delete( len- 2 , len); + buffer.delete( len- 1 , len); palatalise(buffer); return; } From 7f2e79733968a8444d64f8a8fa7f2957f8e397b9 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Fri, 6 Sep 2024 13:46:46 +1200 Subject: [PATCH 12/22] Fix handling of possessive removal There are two issues here: One seems clearly unintentional, which is that the cursor position from do_case wasn't reset. The other is that do_possessive was only called if do_case did something which does not match the Java implementation. It seems likely this was not intended, and testing suggests it's not a helpful change. 
--- algorithms/czech.sbl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index f4e6d55b..085e4dc5 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -2,8 +2,8 @@ routines ( RV R1 palatalise mark_regions - do_possessive - do_case + possessive_suffix + case_suffix ) externals ( stem ) @@ -61,7 +61,7 @@ backwardmode ( ) ) - define do_possessive as ( + define possessive_suffix as ( [substring] RV among ( 'ov' '{u*}v' (delete) @@ -73,7 +73,7 @@ backwardmode ( ) ) - define do_case as ( + define case_suffix as ( [substring] R1 among ( 'atech' '{e^}tem' 'at{u*}m' @@ -103,8 +103,8 @@ backwardmode ( define stem as ( do mark_regions backwards ( - do_case - do_possessive + do case_suffix + do possessive_suffix ) ) From ac701356066dff03ab07f34779b7bbdefe6a484a Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 10 Sep 2024 11:20:33 +1200 Subject: [PATCH 13/22] Adjust palatalise to work like the Java version For the test vocabulary, this results in 1877 merges of groups of stems (all seem reasonable), 427 splits (all seem unhelpful) and 300 reshufflings of stems between existing groups (all seem neutral). Overall this seems a very clear improvement, but we should see if we can address the splits. 
--- algorithms/czech.sbl | 73 ++++++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 085e4dc5..a4ce107e 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -1,6 +1,9 @@ routines ( RV R1 - palatalise + palatalise_e + palatalise_ecaron + palatalise_i + palatalise_iacute mark_regions possessive_suffix case_suffix @@ -48,16 +51,33 @@ backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor - define palatalise as ( - [substring] RV among ( - 'ci' 'ce' '{c^}i' '{c^}e' - (<- 'k') - 'zi' 'ze' '{z^}i' '{z^}e' - (<- 'h') - '{c^}t{e^}' '{c^}ti' '{c^}t{i'}' - (<- 'ck') - '{s^}t{e^}' '{s^}ti' '{s^}t{i'}' - (<- 'sk') + define palatalise_e as ( + [substring] among ( + 'c' '{c^}' (<- 'k') + 'z' '{z^}' (<- 'h') + ) + ) + + define palatalise_ecaron as ( + [substring] among ( + '{c^}t' (<- 'ck') + '{s^}t' (<- 'sk') + ) + ) + + define palatalise_i as ( + [substring] among ( + 'c' '{c^}' (<- 'k') + 'z' '{z^}' (<- 'h') + '{c^}t' (<- 'ck') + '{s^}t' (<- 'sk') + ) + ) + + define palatalise_iacute as ( + [substring] among ( + '{c^}t' (<- 'ck') + '{s^}t' (<- 'sk') ) ) @@ -68,33 +88,40 @@ backwardmode ( 'in' ( delete - try palatalise + try palatalise_i ) ) ) define case_suffix as ( - [substring] R1 among ( + setlimit tomark p1 for ( [substring] ) among ( 'atech' - '{e^}tem' 'at{u*}m' + 'at{u*}m' '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' 'ata' 'aty' 'ama' 'ami' 'ovi' 'at' '{a'}m' 'os' 'us' '{u*}m' '{y'}m' 'mi' 'ou' + '{e'}ho' '{e'}m' '{e'}mu' 'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}' (delete) - 'ech' 'ich' '{i'}ch' - '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi' - 'emi' 'iho' 'imu' - '{e'}m' '{i'}m' 'es' - 'e' 'i' '{i'}' '{e^}' + '{e^}' '{e^}tem' '{e^}mi' '{e^}te' '{e^}ti' + ( + delete + try palatalise_ecaron + ) + 'e' 'ech' 'em' 'emi' 'es' 'ete' 'etem' // 'eti' ( delete - try palatalise + try palatalise_e ) - 'em' + 'i' 'ich' 'iho' 'imu' ( - <- 'e' - try 
palatalise + delete + try palatalise_i + ) + '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' + ( + delete + try palatalise_iacute ) ) ) From 6fdd8fa170977ba5f961a7a28c76e5dccff460a0 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 11 Sep 2024 09:47:23 +1200 Subject: [PATCH 14/22] czech: Comment out unused R1 routine for now --- algorithms/czech.sbl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index a4ce107e..65651547 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -1,5 +1,5 @@ routines ( - RV R1 + RV // R1 palatalise_e palatalise_ecaron palatalise_i @@ -49,7 +49,7 @@ define mark_regions as ( backwardmode ( define RV as $pV <= cursor - define R1 as $p1 <= cursor + // define R1 as $p1 <= cursor define palatalise_e as ( [substring] among ( From 401a2c9e3e995cb21d9459cbd7afbf75555e3d26 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Mon, 7 Oct 2024 16:23:56 +1300 Subject: [PATCH 15/22] czech: Don't remove -os suffix Testing seems to show this was never helpful and sometimes harmful. --- algorithms/czech.sbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 65651547..bda49299 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -99,7 +99,7 @@ backwardmode ( 'at{u*}m' '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi' 'ata' 'aty' 'ama' 'ami' 'ovi' - 'at' '{a'}m' 'os' 'us' '{u*}m' '{y'}m' 'mi' 'ou' + 'at' '{a'}m' 'us' '{u*}m' '{y'}m' 'mi' 'ou' '{e'}ho' '{e'}m' '{e'}mu' 'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}' (delete) From d3fbcd993976b19a3e82185e670a6407ceef57c2 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 8 Oct 2024 16:47:46 +1300 Subject: [PATCH 16/22] czech: Remove more suffixes -es seems to be a valid suffix (e.g. diabetes) but there seem to be more cases where it is harmful to remove. -ich seems to only be a suffix for two pronouns. 
-iho doesn't seem to be a valid suffix and removing it makes no difference on the test vocabulary. --- algorithms/czech.sbl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index bda49299..08c89ad2 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -108,12 +108,12 @@ backwardmode ( delete try palatalise_ecaron ) - 'e' 'ech' 'em' 'emi' 'es' 'ete' 'etem' // 'eti' + 'e' 'ech' 'em' 'emi' 'ete' 'etem' ( delete try palatalise_e ) - 'i' 'ich' 'iho' 'imu' + 'i' ( delete try palatalise_i From baaa66d249a75cdc9b208a540edaa749d7df71c4 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 8 Oct 2024 17:34:55 +1300 Subject: [PATCH 17/22] czech: Remove -'{i'}mu' This is a valid Czech suffix and removing it seems beneficial (88 cases in the sample vocabulary, all seem to be improvements). --- algorithms/czech.sbl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 08c89ad2..d954052a 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -118,7 +118,7 @@ backwardmode ( delete try palatalise_i ) - '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' + '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' '{i'}mu' ( delete try palatalise_iacute From c2d63e91a0ffc6a998a005e44f2622dbd876584b Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 8 Oct 2024 17:42:05 +1300 Subject: [PATCH 18/22] czech: Use a better definition of R1 Use a definition of R1 more like the usual Snowball one, but take syllabic consonants 'l' and 'r' into account. It seems 'm' and 'n' can also be syllabic consonants but are much rarer so we ignore these for now at least. Testing suggests enforcing a minimum of 3 characters before R1 (like the Danish, Dutch and German stemmers do) helps so we do that here too. 
See #151 --- algorithms/czech.sbl | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index d954052a..42786bd4 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -11,9 +11,9 @@ routines ( externals ( stem ) -integers ( pV p1 ) +integers ( pV p1 x ) -groupings ( v ) +groupings ( v syllabic_c ) stringescapes {} @@ -35,14 +35,29 @@ stringdef z^ '{U+017E}' define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}' +define syllabic_c 'lr' //mn' + define mark_regions as ( $pV = limit $p1 = limit + test(hop 3 setmark x) do ( gopast non-v setmark pV - gopast non-v gopast v setmark p1 + try($pV < x $pV = x) // at least 3 + ) + + do ( + // A syllabic consonant must occur between two consonants, or be + // preceded by a consonant and at the end of the word. + // + // However, we don't actually need to check the character after, since + // if it's a vowel then that vowel means we'd end up at the same + // position after `gopast non-v` anyway. + gopast ( v or (non-v syllabic_c) ) gopast non-v + setmark p1 + try($p1 < x $p1 = x) // at least 3 ) ) From fffc540103dd0d62dabf653fafb9fc7ffa69e0dc Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 9 Oct 2024 07:14:36 +1300 Subject: [PATCH 19/22] czech: Optimise R1 check We can just handle the first character specially - after that we know the previous character is a consonant because otherwise we'd have already stopped. 
See #151 --- algorithms/czech.sbl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index 42786bd4..ed9aa448 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -13,7 +13,7 @@ externals ( stem ) integers ( pV p1 x ) -groupings ( v syllabic_c ) +groupings ( v v_or_syllabic_c ) stringescapes {} @@ -35,7 +35,7 @@ stringdef z^ '{U+017E}' define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}' -define syllabic_c 'lr' //mn' +define v_or_syllabic_c v + 'lr' //mn' define mark_regions as ( @@ -52,10 +52,14 @@ define mark_regions as ( // A syllabic consonant must occur between two consonants, or be // preceded by a consonant and at the end of the word. // - // However, we don't actually need to check the character after, since + // Instead of literally testing that, we check handle the first + // character specially, then we know that the character before is + // a consonant because otherwise we'd have stopped already. + // + // We also don't actually need to check the character after, since // if it's a vowel then that vowel means we'd end up at the same // position after `gopast non-v` anyway. 
- gopast ( v or (non-v syllabic_c) ) gopast non-v + (v or (next gopast v_or_syllabic_c)) gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 ) From 360d722bbc42cf9526b890d42772d60647864cca Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 9 Oct 2024 12:37:58 +1300 Subject: [PATCH 20/22] Improve comments --- algorithms/czech.sbl | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index ed9aa448..c152d01c 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -35,7 +35,14 @@ stringdef z^ '{U+017E}' define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}' -define v_or_syllabic_c v + 'lr' //mn' +// Some consonants in Czech can be syllabic - if these occur between two other +// consonants then they act in a vowel-like way and it is helpful to include +// them in the definition of R1. +// +// Some sources also list 'm' and 'n' as syllabic consonants for Czech but they +// seem to be much rarer and including them makes no difference to the results +// of stemming any words in our sample vocabulary list. +define v_or_syllabic_c v + 'lr' define mark_regions as ( @@ -52,13 +59,15 @@ define mark_regions as ( // A syllabic consonant must occur between two consonants, or be // preceded by a consonant and at the end of the word. // - // Instead of literally testing that, we check handle the first - // character specially, then we know that the character before is - // a consonant because otherwise we'd have stopped already. + // Instead of literally testing that, we handle the first character + // specially by only checking if it's a vowel; for subsequent + // characters we know that the character before is a consonant because + // otherwise we'd have stopped already. // // We also don't actually need to check the character after, since // if it's a vowel then that vowel means we'd end up at the same - // position after `gopast non-v` anyway. 
+ // position after `gopast non-v` anyway, and if it's the end of the + // word then there's no non-v after it. (v or (next gopast v_or_syllabic_c)) gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 From 4b233624e2151f25660aca342e0813becec2e791 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Wed, 9 Oct 2024 13:22:04 +1300 Subject: [PATCH 21/22] czech: Use R1 instead of RV There seems no benefit from having a separate region we can remove possessive suffixes in. See #151 --- algorithms/czech.sbl | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index c152d01c..b3623ecf 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -1,5 +1,5 @@ routines ( - RV // R1 + R1 palatalise_e palatalise_ecaron palatalise_i @@ -11,7 +11,7 @@ routines ( externals ( stem ) -integers ( pV p1 x ) +integers ( p1 x ) groupings ( v v_or_syllabic_c ) @@ -46,15 +46,9 @@ define v_or_syllabic_c v + 'lr' define mark_regions as ( - $pV = limit $p1 = limit test(hop 3 setmark x) - do ( - gopast non-v setmark pV - try($pV < x $pV = x) // at least 3 - ) - do ( // A syllabic consonant must occur between two consonants, or be // preceded by a consonant and at the end of the word. 
@@ -76,8 +70,7 @@ define mark_regions as ( backwardmode ( - define RV as $pV <= cursor - // define R1 as $p1 <= cursor + define R1 as $p1 <= cursor define palatalise_e as ( [substring] among ( @@ -110,7 +103,7 @@ backwardmode ( ) define possessive_suffix as ( - [substring] RV among ( + [substring] R1 among ( 'ov' '{u*}v' (delete) 'in' From bfccdb29951bce2e531b7333b3b6e3139edd8deb Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 10 Oct 2024 16:36:16 +1300 Subject: [PATCH 22/22] czech: Merge two identical routines --- algorithms/czech.sbl | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl index b3623ecf..49cb4419 100644 --- a/algorithms/czech.sbl +++ b/algorithms/czech.sbl @@ -1,9 +1,8 @@ routines ( R1 palatalise_e - palatalise_ecaron + palatalise_ecaron_or_iacute palatalise_i - palatalise_iacute mark_regions possessive_suffix case_suffix @@ -79,7 +78,7 @@ backwardmode ( ) ) - define palatalise_ecaron as ( + define palatalise_ecaron_or_iacute as ( [substring] among ( '{c^}t' (<- 'ck') '{s^}t' (<- 'sk') @@ -95,13 +94,6 @@ backwardmode ( ) ) - define palatalise_iacute as ( - [substring] among ( - '{c^}t' (<- 'ck') - '{s^}t' (<- 'sk') - ) - ) - define possessive_suffix as ( [substring] R1 among ( 'ov' '{u*}v' @@ -127,7 +119,7 @@ backwardmode ( '{e^}' '{e^}tem' '{e^}mi' '{e^}te' '{e^}ti' ( delete - try palatalise_ecaron + try palatalise_ecaron_or_iacute ) 'e' 'ech' 'em' 'emi' 'ete' 'etem' ( @@ -142,7 +134,7 @@ backwardmode ( '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' '{i'}mu' ( delete - try palatalise_iacute + try palatalise_ecaron_or_iacute ) ) )