From 918e0cbe79bbf97f8e1a17e9f9f2fff0e8f5dbe0 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 31 Aug 2021 15:30:22 +1200
Subject: [PATCH 01/22] Merge Czech stemmer

This has been on the web site since 2012, but never actually got
included in the code distribution.
---
 algorithms/czech.sbl   | 248 +++++++++++++++++++++++++++++++++++++++++
 libstemmer/modules.txt |   1 +
 2 files changed, 249 insertions(+)
 create mode 100644 algorithms/czech.sbl

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
new file mode 100644
index 00000000..30cadaf0
--- /dev/null
+++ b/algorithms/czech.sbl
@@ -0,0 +1,248 @@
+routines (
+  RV R1
+  palatalise
+  mark_regions
+  do_possessive
+  do_case
+  do_comparative
+  do_diminutive
+  do_augmentative
+  do_derivational
+  do_deriv_single
+  do_aggressive
+)
+
+externals ( stem )
+
+integers ( pV p1 )
+
+groupings ( v )
+
+stringescapes {}
+
+stringdef a' '{U+00E1}'
+stringdef c^ '{U+010D}'
+stringdef d^ '{U+010F}'
+stringdef e' '{U+00E9}'
+stringdef e^ '{U+011B}'
+stringdef i' '{U+00ED}'
+stringdef n^ '{U+0148}'
+stringdef o' '{U+00F3}'
+stringdef r^ '{U+0159}'
+stringdef s^ '{U+0161}'
+stringdef t^ '{U+0165}'
+stringdef u' '{U+00FA}'
+stringdef u* '{U+016F}'
+stringdef y' '{U+00FD}'
+stringdef z^ '{U+017E}'
+
+define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'
+
+define mark_regions as (
+
+    $pV = limit
+    $p1 = limit
+
+    do (
+        gopast non-v setmark pV
+        gopast non-v gopast v setmark p1
+    )
+)
+
+backwardmode (
+
+  define RV as $pV <= cursor
+  define R1 as $p1 <= cursor
+
+  define palatalise as (
+    [substring] RV among (
+      'ci' 'ce' '{c^}i' '{c^}'
+      (<- 'k')
+      'zi' 'ze' '{z^}i' '{z^}e'
+      (<- 'h')
+      '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
+      (<- 'ck')
+      '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
+      (<- 'sk')
+    )
+  )
+
+  define do_possessive as (
+    [substring] RV among (
+      'ov' '{u*}v'
+      (delete)
+      'in'
+      (
+        delete
+        try palatalise
+      )
+    )
+  )
+
+  define do_case as (
+    [substring] among (
+      'atech'
+      '{e^}tem' 'at{u*}m'
+      '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
+      'ata' 'aty' 'ama' 'ami' 'ovi'
+      'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
+      'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
+      (delete)
+      'ech' 'ich' '{i'}ch'
+      '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
+      'emi' 'iho' 'imu'
+      '{e'}m' '{i'}m' 'es'
+      'e' 'i' '{i'}' '{e^}'
+      (
+        delete
+        try palatalise
+      )
+      'em'
+      (
+        <- 'e'
+        try palatalise
+      )
+    )
+  )
+
+  define do_derivational as (
+    [substring] R1 among (
+      'obinec'
+      'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
+      '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
+      '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
+      'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
+      '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
+      'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
+      '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
+      (delete)
+      'ion{a'}{r^}'
+      'inec' 'itel'
+      'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
+      'ic' 'in' 'it' 'iv'
+      (
+        <- 'i'
+        palatalise
+      )
+      'enic' 'ec' 'en'
+      (
+        <- 'e'
+        palatalise
+      )
+      '{e'}{r^}'
+      (
+        <- '{e'}'
+        palatalise
+      )
+      '{e^}n'
+      (
+        <- '{e^}'
+        palatalise
+      )
+      '{i'}rn'
+      '{i'}{r^}' '{i'}n'
+      (
+        <- '{i'}'
+        palatalise
+      )
+    )
+  )
+  define do_deriv_single as (
+    [substring] among (
+      'c' '{c^}' 'k' 'l' 'n' 't'
+      (delete)
+    )
+  )
+
+
+  define do_augmentative as (
+    [substring] among (
+      'ajzn' '{a'}k'
+      (delete)
+      'izn' 'isk'
+      (
+        <- 'i'
+        palatalise
+      )
+    )
+  )
+
+  define do_diminutive as (
+    [substring] among (
+      'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
+      'anek' 'onek' 'unek' '{a'}nek'
+      'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
+      '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
+      '{a'}tk' '{a'}nk' 'u{s^}k'
+      'k'
+      (delete)
+      'e{c^}ek' 'enek' 'ek'
+      (
+        <- 'e'
+        palatalise
+      )
+      '{e'}{c^}ek' '{e'}k'
+      (
+        <- '{e'}'
+        palatalise
+      )
+      'i{c^}ek' 'inek' 'ik'
+      (
+        <- 'i'
+        palatalise
+      )
+      '{i'}{c^}ek' '{i'}k'
+      (
+        <- '{i'}'
+        palatalise
+      )
+      '{a'}k'
+       (<- '{a'}')
+      'ak'
+       (<- 'a')
+      'ok'
+       (<- 'o')
+      'uk'
+       (<- 'u')
+    )
+  )
+
+  define do_comparative as (
+    [substring] among (
+      '{e^}j{s^}'
+      (
+        <- '{e^}'
+        palatalise
+      )
+      'ej{s^}'
+      (
+        <- 'e'
+        palatalise
+      )
+    )
+  )
+
+  define do_aggressive as (
+    do do_comparative
+    do do_diminutive
+    do do_augmentative
+    do_derivational or do_deriv_single
+  )
+)
+
+define stem as (
+  do mark_regions
+  backwards (
+    do_case
+    do_possessive
+    // light and aggressive are the same to this point
+    // comment next line for light stemmer
+    do_aggressive
+  )
+)
+
+// Ljiljana Dolamic and Jacques Savoy. 2009.
+// Indexing and stemming approaches for the Czech language.
+// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
+// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt
diff --git a/libstemmer/modules.txt b/libstemmer/modules.txt
index cd36a219..58df1464 100644
--- a/libstemmer/modules.txt
+++ b/libstemmer/modules.txt
@@ -13,6 +13,7 @@ arabic          UTF_8                   arabic,ar,ara
 armenian        UTF_8                   armenian,hy,hye,arm
 basque          UTF_8,ISO_8859_1        basque,eu,eus,baq
 catalan         UTF_8,ISO_8859_1        catalan,ca,cat
+czech           UTF_8,ISO_8859_2        czech,cs,ces,cze
 danish          UTF_8,ISO_8859_1        danish,da,dan
 dutch           UTF_8,ISO_8859_1        dutch,nl,dut,nld
 english         UTF_8,ISO_8859_1        english,en,eng

From 17d85272eef8857462b61ea57aef6ccc1fd96eea Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 1 Sep 2021 11:56:32 +1200
Subject: [PATCH 02/22] Only apply do_case in R1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helps avoid overstemming.

Co-authored-by: Jim O’Regan <jaoregan@tcd.ie>
---
 algorithms/czech.sbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 30cadaf0..5bab9f9e 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -80,7 +80,7 @@ backwardmode (
   )
 
   define do_case as (
-    [substring] among (
+    [substring] R1 among (
       'atech'
       '{e^}tem' 'at{u*}m'
       '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'

From 18016f58f2c81bbb212a4ae0f4599cf33947f91c Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 1 Sep 2021 12:01:15 +1200
Subject: [PATCH 03/22] Implement the "light" version of the stemmer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "aggressive" version is known to overstem.  According to the
original paper, the aggressive version performs slightly better, but
the difference isn't statistically significant and conflation from
overstemming can be problematic.

Co-authored-by: Jim O’Regan <jaoregan@tcd.ie>
---
 algorithms/czech.sbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 5bab9f9e..c5bd04f9 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -237,7 +237,7 @@ define stem as (
     do_possessive
     // light and aggressive are the same to this point
     // comment next line for light stemmer
-    do_aggressive
+    // do_aggressive
   )
 )
 

From fe8fe84955b9652df98c188bf2f0c313ca522812 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 1 Sep 2021 12:01:58 +1200
Subject: [PATCH 04/22] Improve comment about origin of algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jim O’Regan <jaoregan@tcd.ie>
---
 algorithms/czech.sbl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index c5bd04f9..033025bc 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -244,5 +244,6 @@ define stem as (
 // Ljiljana Dolamic and Jacques Savoy. 2009.
 // Indexing and stemming approaches for the Czech language.
 // Inf. Process. Manage. 45, 6 (November 2009), 714-720.
+// based on Java code by Ljiljana Dolamic:
 // http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
 // http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt

From d39135766075fe99b5a8253c6bfa5654b54244a4 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 1 Sep 2021 12:31:31 +1200
Subject: [PATCH 05/22] czech: Strip out unused "aggressive" code

Avoids snowball and C compiler warnings.
---
 algorithms/czech.sbl | 134 -------------------------------------------
 1 file changed, 134 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 033025bc..5d777231 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -4,12 +4,6 @@ routines (
   mark_regions
   do_possessive
   do_case
-  do_comparative
-  do_diminutive
-  do_augmentative
-  do_derivational
-  do_deriv_single
-  do_aggressive
 )
 
 externals ( stem )
@@ -104,130 +98,6 @@ backwardmode (
       )
     )
   )
-
-  define do_derivational as (
-    [substring] R1 among (
-      'obinec'
-      'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
-      '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
-      '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
-      'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
-      '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
-      'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
-      '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
-      (delete)
-      'ion{a'}{r^}'
-      'inec' 'itel'
-      'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
-      'ic' 'in' 'it' 'iv'
-      (
-        <- 'i'
-        palatalise
-      )
-      'enic' 'ec' 'en'
-      (
-        <- 'e'
-        palatalise
-      )
-      '{e'}{r^}'
-      (
-        <- '{e'}'
-        palatalise
-      )
-      '{e^}n'
-      (
-        <- '{e^}'
-        palatalise
-      )
-      '{i'}rn'
-      '{i'}{r^}' '{i'}n'
-      (
-        <- '{i'}'
-        palatalise
-      )
-    )
-  )
-  define do_deriv_single as (
-    [substring] among (
-      'c' '{c^}' 'k' 'l' 'n' 't'
-      (delete)
-    )
-  )
-
-
-  define do_augmentative as (
-    [substring] among (
-      'ajzn' '{a'}k'
-      (delete)
-      'izn' 'isk'
-      (
-        <- 'i'
-        palatalise
-      )
-    )
-  )
-
-  define do_diminutive as (
-    [substring] among (
-      'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
-      'anek' 'onek' 'unek' '{a'}nek'
-      'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
-      '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
-      '{a'}tk' '{a'}nk' 'u{s^}k'
-      'k'
-      (delete)
-      'e{c^}ek' 'enek' 'ek'
-      (
-        <- 'e'
-        palatalise
-      )
-      '{e'}{c^}ek' '{e'}k'
-      (
-        <- '{e'}'
-        palatalise
-      )
-      'i{c^}ek' 'inek' 'ik'
-      (
-        <- 'i'
-        palatalise
-      )
-      '{i'}{c^}ek' '{i'}k'
-      (
-        <- '{i'}'
-        palatalise
-      )
-      '{a'}k'
-       (<- '{a'}')
-      'ak'
-       (<- 'a')
-      'ok'
-       (<- 'o')
-      'uk'
-       (<- 'u')
-    )
-  )
-
-  define do_comparative as (
-    [substring] among (
-      '{e^}j{s^}'
-      (
-        <- '{e^}'
-        palatalise
-      )
-      'ej{s^}'
-      (
-        <- 'e'
-        palatalise
-      )
-    )
-  )
-
-  define do_aggressive as (
-    do do_comparative
-    do do_diminutive
-    do do_augmentative
-    do_derivational or do_deriv_single
-  )
 )
 
 define stem as (
@@ -235,9 +105,6 @@ define stem as (
   backwards (
     do_case
     do_possessive
-    // light and aggressive are the same to this point
-    // comment next line for light stemmer
-    // do_aggressive
   )
 )
 
@@ -246,4 +113,3 @@ define stem as (
 // Inf. Process. Manage. 45, 6 (November 2009), 714-720.
 // based on Java code by Ljiljana Dolamic:
 // http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
-// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt

From 1ee3d2f239264c77740fdbf97ddfb2b683eac68b Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 2 Nov 2023 13:57:46 +1300
Subject: [PATCH 06/22] =?UTF-8?q?czech:=20Remove=20-=C5=AFm=20ending=20in?=
 =?UTF-8?q?=20do=5Fcase?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Java code removes this ending but it was missing from the Snowball
version.  Looking at the changes resulting from this, it seems a clear
improvement so I've concluded it was an accidental omission.

See #151
---
 algorithms/czech.sbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 5d777231..52488868 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -79,7 +79,7 @@ backwardmode (
       '{e^}tem' 'at{u*}m'
       '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
       'ata' 'aty' 'ama' 'ami' 'ovi'
-      'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
+      'at' '{a'}m' 'os' 'us' '{u*}m' '{y'}m' 'mi' 'ou'
       'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
       (delete)
       'ech' 'ich' '{i'}ch'

From 17e83a1cc2d6f1387b3f754d825c15a3c87b6f4c Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 4 Sep 2024 17:37:00 +1200
Subject: [PATCH 07/22] Add initial version of CzechStemmerLight.java

Temporary addition to allow easy comparison with Snowball
implementation.

As downloaded, except for comment and whitespace tweaks, plus addition
of main() to allow testing.
---
 CzechStemmerLight.java | 297 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)
 create mode 100644 CzechStemmerLight.java

diff --git a/CzechStemmerLight.java b/CzechStemmerLight.java
new file mode 100644
index 00000000..6ef5779b
--- /dev/null
+++ b/CzechStemmerLight.java
@@ -0,0 +1,297 @@
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * @author Dolamic Ljiljana  University of Neuchatel
+ *
+ * Czech stemmer-removes case endings form nouns and adjectives, possessive adj.
+ * endings from names
+ * and takes care of palatalisation 
+ */
+public class CzechStemmerLight {
+	
+	/**
+	 * A buffer of the current word being stemmed
+	 */
+	private StringBuffer sb=new StringBuffer();
+
+	
+	/**
+	 * Default constructor
+	 */
+    public CzechStemmerLight(){} // constructor
+    
+	public String stem(String input){
+		
+		//
+		input=input.toLowerCase();
+
+		//reset string buffer
+		sb.delete(0,sb.length());
+		sb.insert(0,input);
+
+		// stemming...
+		//removes case endings from nouns and adjectives
+		removeCase(sb);
+
+		//removes possessive endings from names -ov- and -in-
+		removePossessives(sb);
+
+		String result = sb.toString();
+		
+
+		return result;
+	}
+	private void palatalise(StringBuffer buffer){
+		int len=buffer.length();
+		
+		if( buffer.substring( len- 2 ,len).equals("ci")||
+		    buffer.substring( len- 2 ,len).equals("ce")||		
+		    buffer.substring( len- 2 ,len).equals("\u010di")||  //-či
+		    buffer.substring( len- 2 ,len).equals("\u010de")){  //-če
+				
+		    buffer.replace(len- 2 ,len, "k");
+		    return;
+		}
+		if( buffer.substring( len- 2 ,len).equals("zi")||
+		    buffer.substring( len- 2 ,len).equals("ze")||		
+		    buffer.substring( len- 2 ,len).equals("\u017ei")||  //-ži
+		    buffer.substring( len- 2 ,len).equals("\u017ee")){  //-že
+					
+		    buffer.replace(len- 2 ,len, "h");
+		    return;
+		}
+		if( buffer.substring( len- 3 ,len).equals("\u010dt\u011b")||  //-čtě
+		    buffer.substring( len- 3 ,len).equals("\u010dti")||       //-čti
+		    buffer.substring( len- 3 ,len).equals("\u010dt\u00ed")){  //-čtí
+						
+		    buffer.replace(len- 3 ,len, "ck");
+		    return;
+		}
+		if( buffer.substring( len- 2 ,len).equals("\u0161t\u011b")||  //-ště
+		    buffer.substring( len- 2 ,len).equals("\u0161ti")||       //-šti
+		    buffer.substring( len- 2 ,len).equals("\u0161t\u00ed")){  //-ští
+						
+		    buffer.replace(len- 2 ,len, "sk");
+		    return;
+		}
+		buffer.delete( len- 1 , len);
+		return;
+	}//palatalise
+	
+	private void removePossessives(StringBuffer buffer) {
+		int len=buffer.length();
+		
+		if( len> 5 ){
+			if( buffer.substring( len- 2 ,len).equals("ov")){
+				
+			    buffer.delete( len- 2 , len);
+			    return;
+			}
+			if( buffer.substring( len-2,len).equals("\u016fv")){ //-ův
+			 	
+	        	    buffer.delete( len- 2 , len);
+		            return;
+			}
+		        if( buffer.substring( len- 2 ,len).equals("in")){
+			 	
+			    buffer.delete( len- 1 , len);
+			    palatalise(buffer);
+			    return;
+			}
+		}
+		return;
+	}//removePossessives
+
+	private void removeCase(StringBuffer buffer) {
+		int len=buffer.length();
+		// 
+		if( (len> 7 )&&
+		    buffer.substring( len- 5 ,len).equals("atech")){
+			
+		    buffer.delete( len- 5 , len);
+		    return;
+		}//len>7
+		if( len> 6 ){
+		      if(buffer.substring( len- 4 ,len).equals("\u011btem")){   //-ětem
+		
+		         buffer.delete( len- 3 , len);
+		         palatalise(buffer);
+		         return;
+		      }
+		       if(buffer.substring( len- 4 ,len).equals("at\u016fm")){  //-atům
+		    	      buffer.delete( len- 4 , len);
+		    	      return;
+		      }
+		      
+		}
+		if( len> 5 ){
+			    if(buffer.substring( len-3,len).equals("ech")|| 
+				  buffer.substring( len-3,len).equals("ich")|| 
+				  buffer.substring( len-3,len).equals("\u00edch")){ //-ích
+				
+				  buffer.delete( len-2 , len);
+				  palatalise(buffer);
+				  return;
+				}
+		                if(buffer.substring( len-3,len).equals("\u00e9ho")|| //-ého
+				   buffer.substring( len-3,len).equals("\u011bmi")||  //-ěmi
+				   buffer.substring( len-3,len).equals("emi")||
+				   buffer.substring( len-3,len).equals("\u00e9mu")||  //-ému
+				   buffer.substring( len-3,len).equals("\u011bte")||  //-ěte
+				   buffer.substring( len-3,len).equals("\u011bti")||  //-ěti
+				   buffer.substring( len-3,len).equals("iho")||
+				   buffer.substring( len-3,len).equals("\u00edho")||  //-ího
+				   buffer.substring( len-3,len).equals("\u00edmi")||  //-ími
+				   buffer.substring( len-3,len).equals("imu")){
+				
+				   buffer.delete( len- 2 , len);
+				   palatalise(buffer);
+				   return;
+			        }
+		                if( buffer.substring( len-3,len).equals("\u00e1ch")|| //-ách
+				    buffer.substring( len-3,len).equals("ata")||
+				    buffer.substring( len-3,len).equals("aty")||
+				    buffer.substring( len-3,len).equals("\u00fdch")||   //-ých
+				    buffer.substring( len-3,len).equals("ama")||
+				    buffer.substring( len-3,len).equals("ami")||
+				    buffer.substring( len-3,len).equals("ov\u00e9")||   //-ové
+				    buffer.substring( len-3,len).equals("ovi")||
+				    buffer.substring( len-3,len).equals("\u00fdmi")){  //-ými
+				
+		                    buffer.delete( len- 3 , len);
+		                    return;
+				}
+		}  
+		if( len> 4){
+				if(buffer.substring( len-2,len).equals("em")){
+			
+			         buffer.delete( len- 1 , len);
+			         palatalise(buffer);
+			         return;
+			         
+			      }
+		                if( buffer.substring( len-2,len).equals("es")|| 
+				    buffer.substring( len-2,len).equals("\u00e9m")||    //-ém
+				    buffer.substring( len-2,len).equals("\u00edm")){   //-ím
+			
+			            buffer.delete( len- 2 , len);
+			            palatalise(buffer);
+			            return;
+			        }
+		                if( buffer.substring( len-2,len).equals("\u016fm")){  //-ům
+			
+			            buffer.delete( len- 2 , len);
+			            return;
+			        }
+		                if( buffer.substring( len-2,len).equals("at")|| 
+				    buffer.substring( len-2,len).equals("\u00e1m")||    //-ám
+				    buffer.substring( len-2,len).equals("os")||
+				    buffer.substring( len-2,len).equals("us")||   
+				    buffer.substring( len-2,len).equals("\u00fdm")||     //-ým
+				    buffer.substring( len-2,len).equals("mi")||   
+				    buffer.substring( len-2,len).equals("ou")){
+				
+				    buffer.delete( len- 2 , len);
+				    return;
+				}
+		}//len>4
+		if( len> 3){
+			 if( buffer.substring( len-1,len).equals("e")||
+			    buffer.substring( len-1,len).equals("i")){
+			
+			     palatalise(buffer);
+			     return;
+			}
+		            if( buffer.substring( len-1,len).equals("\u00ed")||  //-í
+				    buffer.substring( len-1,len).equals("\u011b")){      //-ě
+				
+				    palatalise(buffer);
+				    return;
+				}
+		            if( buffer.substring( len-1,len).equals("u")||
+				    buffer.substring( len-1,len).equals("y")||
+				    buffer.substring( len-1,len).equals("\u016f")){  //-ů
+					
+				    buffer.delete( len- 1 , len);
+				    return;
+				           
+				}
+		          if( buffer.substring( len-1,len).equals("a")||
+		        	  buffer.substring( len-1,len).equals("o")||
+				    buffer.substring( len-1,len).equals("\u00e1")||  // -á
+				    buffer.substring( len-1,len).equals("\u00e9")||  //-é
+				    buffer.substring( len-1,len).equals("\u00fd")){   //-ý
+				
+				    buffer.delete( len- 1 , len);
+				    return;
+				}
+		}//len>3
+	}
+	
+
+    private static void usage()
+    {
+        System.err.println("Usage: TestApp <algorithm> [<input file>] [-o <output file>]");
+    }
+
+    public static void main(String [] args) throws Throwable {
+        if (args.length < 1) {
+            usage();
+            return;
+        }
+
+        CzechStemmerLight stemmer = new CzechStemmerLight();
+
+	int arg = 1;
+
+	InputStream instream;
+	if (args.length > arg && !args[arg].equals("-o")) {
+	    instream = new FileInputStream(args[arg++]);
+	} else {
+	    instream = System.in;
+	}
+
+        OutputStream outstream;
+	if (args.length > arg) {
+            if (args.length != arg + 2 || !args[arg].equals("-o")) {
+                usage();
+                return;
+            }
+	    outstream = new FileOutputStream(args[arg + 1]);
+	} else {
+	    outstream = System.out;
+	}
+
+	Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8);
+	reader = new BufferedReader(reader);
+
+	Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8);
+	output = new BufferedWriter(output);
+
+	StringBuffer input = new StringBuffer();
+	int character;
+	while ((character = reader.read()) != -1) {
+	    char ch = (char) character;
+	    if (Character.isWhitespace(ch)) {
+		String result = stemmer.stem(input.toString());
+		output.write(result);
+		output.write('\n');
+		input.delete(0, input.length());
+	    } else {
+		input.append(ch < 127 ? Character.toLowerCase(ch) : ch);
+	    }
+	}
+	output.flush();
+    }
+	
+}//CzechStemmer_1

From 5ef5479a1fa395f07e9a97047f3cf37bcea6abca Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 5 Sep 2024 12:23:02 +1200
Subject: [PATCH 08/22] =?UTF-8?q?CzechStemmerLight:=20Fix=20length=20check?=
 =?UTF-8?q?=20for=20=C5=A1t=C4=9B/=C5=A1ti/=C5=A1t=C3=AD=20removal?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CzechStemmerLight.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CzechStemmerLight.java b/CzechStemmerLight.java
index 6ef5779b..57b05595 100644
--- a/CzechStemmerLight.java
+++ b/CzechStemmerLight.java
@@ -77,11 +77,11 @@ private void palatalise(StringBuffer buffer){
 		    buffer.replace(len- 3 ,len, "ck");
 		    return;
 		}
-		if( buffer.substring( len- 2 ,len).equals("\u0161t\u011b")||  //-ště
-		    buffer.substring( len- 2 ,len).equals("\u0161ti")||       //-šti
-		    buffer.substring( len- 2 ,len).equals("\u0161t\u00ed")){  //-ští
+		if( buffer.substring( len- 3 ,len).equals("\u0161t\u011b")||  //-ště
+		    buffer.substring( len- 3 ,len).equals("\u0161ti")||       //-šti
+		    buffer.substring( len- 3 ,len).equals("\u0161t\u00ed")){  //-ští
 						
-		    buffer.replace(len- 2 ,len, "sk");
+		    buffer.replace(len- 3 ,len, "sk");
 		    return;
 		}
 		buffer.delete( len- 1 , len);

From 8c88ddf68ab470d0a59499292729e4428e32cd9a Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 5 Sep 2024 13:16:00 +1200
Subject: [PATCH 09/22] =?UTF-8?q?Change=20=C4=8D=20suffix=20check=20to=20?=
 =?UTF-8?q?=C4=8De?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Java implementation removes če but has an incorrect comment
saying it removes č.  Comparing before and after on the test vocabulary,
this is a clear improvement.
---
 algorithms/czech.sbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 52488868..52fcd75e 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -50,7 +50,7 @@ backwardmode (
 
   define palatalise as (
     [substring] RV among (
-      'ci' 'ce' '{c^}i' '{c^}'
+      'ci' 'ce' '{c^}i' '{c^}e'
       (<- 'k')
       'zi' 'ze' '{z^}i' '{z^}e'
       (<- 'h')

From ae495981c013a5a606aa4fe1c1dcd430c6758816 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 5 Sep 2024 13:46:45 +1200
Subject: [PATCH 10/22] =?UTF-8?q?czech:=20Change=20-=C4=8Dt=C3=A9/-=C5=A1t?=
 =?UTF-8?q?=C3=A9=20to=20-=C4=8Dt=C3=AD/-=C5=A1t=C3=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Java implementation removes the latter but has incorrect comments
saying it removes the former.

Changing the Snowball implementation makes no difference here (probably
due to the oddness around when to remove a character vs calling
palatalise) but changing Java to use the Snowball suffixes here leads
to a clear regression, so adjust the Snowball implementation to match
the Java implementation.
---
 algorithms/czech.sbl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 52fcd75e..f4e6d55b 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -54,9 +54,9 @@ backwardmode (
       (<- 'k')
       'zi' 'ze' '{z^}i' '{z^}e'
       (<- 'h')
-      '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
+      '{c^}t{e^}' '{c^}ti' '{c^}t{i'}'
       (<- 'ck')
-      '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
+      '{s^}t{e^}' '{s^}ti' '{s^}t{i'}'
       (<- 'sk')
     )
   )

From 67a58634eaf57248c5a712ce90e71ff9ef38309b Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 5 Sep 2024 14:57:45 +1200
Subject: [PATCH 11/22] =?UTF-8?q?CzechStemmerLight:=20Remove=20one=20char?=
 =?UTF-8?q?=20for=20-es/-=C3=A9m/-=C3=ADm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This case was inconsistent with all the other cases where we call
palatalise as we remove the whole suffix here but leave the first
character in every other case.

Checking the vocabulary list, this means palatalise will almost never
match one of the suffixes, as the only words with this as an ending in
the list are these, which look like they're actually English words
(except "abies"):

abies
cookies
hippies
series
studies

This means palatalise will just remove the last character, which seems
odd.

This change changes a lot of stems but seems to be an improvement in
pretty much every instance I checked in google translate.
---
 CzechStemmerLight.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CzechStemmerLight.java b/CzechStemmerLight.java
index 57b05595..c5018a15 100644
--- a/CzechStemmerLight.java
+++ b/CzechStemmerLight.java
@@ -184,7 +184,7 @@ private void removeCase(StringBuffer buffer) {
 				    buffer.substring( len-2,len).equals("\u00e9m")||    //-ém
 				    buffer.substring( len-2,len).equals("\u00edm")){   //-ím
 			
-			            buffer.delete( len- 2 , len);
+			            buffer.delete( len- 1 , len);
 			            palatalise(buffer);
 			            return;
 			        }

From 7f2e79733968a8444d64f8a8fa7f2957f8e397b9 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Fri, 6 Sep 2024 13:46:46 +1200
Subject: [PATCH 12/22] Fix handling of possessive removal

There are two issues here:

One seems clearly unintentional, which is that the cursor position from
do_case wasn't reset.

The other is that do_possessive was only called if do_case did something,
which does not match the Java implementation.  It seems likely this
was not intended, and testing suggests this deviation is not helpful.
---
 algorithms/czech.sbl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index f4e6d55b..085e4dc5 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -2,8 +2,8 @@ routines (
   RV R1
   palatalise
   mark_regions
-  do_possessive
-  do_case
+  possessive_suffix
+  case_suffix
 )
 
 externals ( stem )
@@ -61,7 +61,7 @@ backwardmode (
     )
   )
 
-  define do_possessive as (
+  define possessive_suffix as (
     [substring] RV among (
       'ov' '{u*}v'
       (delete)
@@ -73,7 +73,7 @@ backwardmode (
     )
   )
 
-  define do_case as (
+  define case_suffix as (
     [substring] R1 among (
       'atech'
       '{e^}tem' 'at{u*}m'
@@ -103,8 +103,8 @@ backwardmode (
 define stem as (
   do mark_regions
   backwards (
-    do_case
-    do_possessive
+    do case_suffix
+    do possessive_suffix
   )
 )
 

From ac701356066dff03ab07f34779b7bbdefe6a484a Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 10 Sep 2024 11:20:33 +1200
Subject: [PATCH 13/22] Adjust palatalise to work like the Java version

For the test vocabulary, this results in 1877 merges of groups of
stems (all seem reasonable), 427 splits (all seem unhelpful) and
300 reshufflings of stems between existing groups (all seem
neutral).

Overall this seems a very clear improvement, but we should see if we can
address the splits.
---
 algorithms/czech.sbl | 73 ++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 23 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 085e4dc5..a4ce107e 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -1,6 +1,9 @@
 routines (
   RV R1
-  palatalise
+  palatalise_e
+  palatalise_ecaron
+  palatalise_i
+  palatalise_iacute
   mark_regions
   possessive_suffix
   case_suffix
@@ -48,16 +51,33 @@ backwardmode (
   define RV as $pV <= cursor
   define R1 as $p1 <= cursor
 
-  define palatalise as (
-    [substring] RV among (
-      'ci' 'ce' '{c^}i' '{c^}e'
-      (<- 'k')
-      'zi' 'ze' '{z^}i' '{z^}e'
-      (<- 'h')
-      '{c^}t{e^}' '{c^}ti' '{c^}t{i'}'
-      (<- 'ck')
-      '{s^}t{e^}' '{s^}ti' '{s^}t{i'}'
-      (<- 'sk')
+  define palatalise_e as (
+    [substring] among (
+      'c' '{c^}' (<- 'k')
+      'z' '{z^}' (<- 'h')
+    )
+  )
+
+  define palatalise_ecaron as (
+    [substring] among (
+      '{c^}t' (<- 'ck')
+      '{s^}t' (<- 'sk')
+    )
+  )
+
+  define palatalise_i as (
+    [substring] among (
+      'c' '{c^}' (<- 'k')
+      'z' '{z^}' (<- 'h')
+      '{c^}t' (<- 'ck')
+      '{s^}t' (<- 'sk')
+    )
+  )
+
+  define palatalise_iacute as (
+    [substring] among (
+      '{c^}t' (<- 'ck')
+      '{s^}t' (<- 'sk')
     )
   )
 
@@ -68,33 +88,40 @@ backwardmode (
       'in'
       (
         delete
-        try palatalise
+        try palatalise_i
       )
     )
   )
 
   define case_suffix as (
-    [substring] R1 among (
+    setlimit tomark p1 for ( [substring] ) among (
       'atech'
-      '{e^}tem' 'at{u*}m'
+      'at{u*}m'
       '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
       'ata' 'aty' 'ama' 'ami' 'ovi'
       'at' '{a'}m' 'os' 'us' '{u*}m' '{y'}m' 'mi' 'ou'
+      '{e'}ho' '{e'}m' '{e'}mu'
       'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
       (delete)
-      'ech' 'ich' '{i'}ch'
-      '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
-      'emi' 'iho' 'imu'
-      '{e'}m' '{i'}m' 'es'
-      'e' 'i' '{i'}' '{e^}'
+      '{e^}' '{e^}tem' '{e^}mi' '{e^}te' '{e^}ti'
+      (
+        delete
+        try palatalise_ecaron
+      )
+      'e' 'ech' 'em' 'emi' 'es' 'ete' 'etem' // 'eti'
       (
         delete
-        try palatalise
+        try palatalise_e
       )
-      'em'
+      'i' 'ich' 'iho' 'imu'
       (
-        <- 'e'
-        try palatalise
+        delete
+        try palatalise_i
+      )
+      '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi'
+      (
+        delete
+        try palatalise_iacute
       )
     )
   )

From 6fdd8fa170977ba5f961a7a28c76e5dccff460a0 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 11 Sep 2024 09:47:23 +1200
Subject: [PATCH 14/22] czech: Comment out unused R1 routine for now

---
 algorithms/czech.sbl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index a4ce107e..65651547 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -1,5 +1,5 @@
 routines (
-  RV R1
+  RV // R1
   palatalise_e
   palatalise_ecaron
   palatalise_i
@@ -49,7 +49,7 @@ define mark_regions as (
 backwardmode (
 
   define RV as $pV <= cursor
-  define R1 as $p1 <= cursor
+  // define R1 as $p1 <= cursor
 
   define palatalise_e as (
     [substring] among (

From 401a2c9e3e995cb21d9459cbd7afbf75555e3d26 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Mon, 7 Oct 2024 16:23:56 +1300
Subject: [PATCH 15/22] czech: Don't remove -os suffix

Testing seems to show this was never helpful and sometimes harmful.
---
 algorithms/czech.sbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 65651547..bda49299 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -99,7 +99,7 @@ backwardmode (
       'at{u*}m'
       '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
       'ata' 'aty' 'ama' 'ami' 'ovi'
-      'at' '{a'}m' 'os' 'us' '{u*}m' '{y'}m' 'mi' 'ou'
+      'at' '{a'}m' 'us' '{u*}m' '{y'}m' 'mi' 'ou'
       '{e'}ho' '{e'}m' '{e'}mu'
       'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
       (delete)

From d3fbcd993976b19a3e82185e670a6407ceef57c2 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 8 Oct 2024 16:47:46 +1300
Subject: [PATCH 16/22] czech: Don't remove -es, -ich, -iho or -imu suffixes

-es seems to be a valid suffix (e.g. diabetes) but there seem to be
more cases where it is harmful to remove.

-ich seems to only be a suffix for two pronouns.

-iho doesn't seem to be a valid suffix and removing it makes no
difference on the test vocabulary.

-imu is also dropped from the list of suffixes the stemmer removes.
---
 algorithms/czech.sbl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index bda49299..08c89ad2 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -108,12 +108,12 @@ backwardmode (
         delete
         try palatalise_ecaron
       )
-      'e' 'ech' 'em' 'emi' 'es' 'ete' 'etem' // 'eti'
+      'e' 'ech' 'em' 'emi' 'ete' 'etem'
       (
         delete
         try palatalise_e
       )
-      'i' 'ich' 'iho' 'imu'
+      'i'
       (
         delete
         try palatalise_i

From baaa66d249a75cdc9b208a540edaa749d7df71c4 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 8 Oct 2024 17:34:55 +1300
Subject: [PATCH 17/22] czech: Remove -'{i'}mu'

This is a valid Czech suffix and removing it seems beneficial (88
cases in the sample vocabulary, all seem to be improvements).
---
 algorithms/czech.sbl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 08c89ad2..d954052a 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -118,7 +118,7 @@ backwardmode (
         delete
         try palatalise_i
       )
-      '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi'
+      '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' '{i'}mu'
       (
         delete
         try palatalise_iacute

From c2d63e91a0ffc6a998a005e44f2622dbd876584b Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 8 Oct 2024 17:42:05 +1300
Subject: [PATCH 18/22] czech: Use a better definition of R1

Use a definition of R1 more like the usual Snowball one, but take
syllabic consonants 'l' and 'r' into account.

It seems 'm' and 'n' can also be syllabic consonants but are much
rarer so we ignore these for now at least.

Testing suggests enforcing a minimum of 3 characters before R1 (like
the Danish, Dutch and German stemmers do) helps so we do that here
too.

See #151
---
 algorithms/czech.sbl | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index d954052a..42786bd4 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -11,9 +11,9 @@ routines (
 
 externals ( stem )
 
-integers ( pV p1 )
+integers ( pV p1 x )
 
-groupings ( v )
+groupings ( v syllabic_c )
 
 stringescapes {}
 
@@ -35,14 +35,29 @@ stringdef z^ '{U+017E}'
 
 define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'
 
+define syllabic_c 'lr' //mn'
+
 define mark_regions as (
 
     $pV = limit
     $p1 = limit
+    test(hop 3 setmark x)
 
     do (
         gopast non-v setmark pV
-        gopast non-v gopast v setmark p1
+        try($pV < x  $pV = x)  // at least 3
+    )
+
+    do (
+        // A syllabic consonant must occur between two consonants, or be
+        // preceded by a consonant and at the end of the word.
+        //
+        // However, we don't actually need to check the character after, since
+        // if it's a vowel then that vowel means we'd end up at the same
+        // position after `gopast non-v` anyway.
+        gopast ( v or (non-v syllabic_c) ) gopast non-v
+        setmark p1
+        try($p1 < x  $p1 = x)  // at least 3
     )
 )
 

From fffc540103dd0d62dabf653fafb9fc7ffa69e0dc Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 9 Oct 2024 07:14:36 +1300
Subject: [PATCH 19/22] czech: Optimise R1 check

We can just handle the first character specially - after that we
know the previous character is a consonant because otherwise we'd
have already stopped.

See #151
---
 algorithms/czech.sbl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index 42786bd4..ed9aa448 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -13,7 +13,7 @@ externals ( stem )
 
 integers ( pV p1 x )
 
-groupings ( v syllabic_c )
+groupings ( v v_or_syllabic_c )
 
 stringescapes {}
 
@@ -35,7 +35,7 @@ stringdef z^ '{U+017E}'
 
 define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'
 
-define syllabic_c 'lr' //mn'
+define v_or_syllabic_c v + 'lr' //mn'
 
 define mark_regions as (
 
@@ -52,10 +52,14 @@ define mark_regions as (
         // A syllabic consonant must occur between two consonants, or be
         // preceded by a consonant and at the end of the word.
         //
-        // However, we don't actually need to check the character after, since
+        // Instead of literally testing that, we check handle the first
+        // character specially, then we know that the character before is
+        // a consonant because otherwise we'd have stopped already.
+        //
+        // We also don't actually need to check the character after, since
         // if it's a vowel then that vowel means we'd end up at the same
         // position after `gopast non-v` anyway.
-        gopast ( v or (non-v syllabic_c) ) gopast non-v
+        (v or (next gopast v_or_syllabic_c)) gopast non-v
         setmark p1
         try($p1 < x  $p1 = x)  // at least 3
     )

From 360d722bbc42cf9526b890d42772d60647864cca Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 9 Oct 2024 12:37:58 +1300
Subject: [PATCH 20/22] Improve comments

---
 algorithms/czech.sbl | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index ed9aa448..c152d01c 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -35,7 +35,14 @@ stringdef z^ '{U+017E}'
 
 define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'
 
-define v_or_syllabic_c v + 'lr' //mn'
+// Some consonants in Czech can be syllabic - if these occur between two other
+// consonants then they act in a vowel-like way and it is helpful to include
+// them in the definition of R1.
+//
+// Some sources also list 'm' and 'n' as syllabic consonants for Czech but they
+// seem to be much rarer and including them makes no difference to the results
+// of stemming any words in our sample vocabulary list.
+define v_or_syllabic_c v + 'lr'
 
 define mark_regions as (
 
@@ -52,13 +59,15 @@ define mark_regions as (
         // A syllabic consonant must occur between two consonants, or be
         // preceded by a consonant and at the end of the word.
         //
-        // Instead of literally testing that, we check handle the first
-        // character specially, then we know that the character before is
-        // a consonant because otherwise we'd have stopped already.
+        // Instead of literally testing that, we handle the first character
+        // specially by only checking if it's a vowel; for subsequent
+        // characters we know that the character before is a consonant because
+        // otherwise we'd have stopped already.
         //
         // We also don't actually need to check the character after, since
         // if it's a vowel then that vowel means we'd end up at the same
-        // position after `gopast non-v` anyway.
+        // position after `gopast non-v` anyway, and if it's the end of the
+        // word then there's no non-v after it.
         (v or (next gopast v_or_syllabic_c)) gopast non-v
         setmark p1
         try($p1 < x  $p1 = x)  // at least 3

From 4b233624e2151f25660aca342e0813becec2e791 Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Wed, 9 Oct 2024 13:22:04 +1300
Subject: [PATCH 21/22] czech: Use R1 instead of RV

There seems no benefit from having a separate region we can remove
possessive suffixes in.

See #151
---
 algorithms/czech.sbl | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index c152d01c..b3623ecf 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -1,5 +1,5 @@
 routines (
-  RV // R1
+  R1
   palatalise_e
   palatalise_ecaron
   palatalise_i
@@ -11,7 +11,7 @@ routines (
 
 externals ( stem )
 
-integers ( pV p1 x )
+integers ( p1 x )
 
 groupings ( v v_or_syllabic_c )
 
@@ -46,15 +46,9 @@ define v_or_syllabic_c v + 'lr'
 
 define mark_regions as (
 
-    $pV = limit
     $p1 = limit
     test(hop 3 setmark x)
 
-    do (
-        gopast non-v setmark pV
-        try($pV < x  $pV = x)  // at least 3
-    )
-
     do (
         // A syllabic consonant must occur between two consonants, or be
         // preceded by a consonant and at the end of the word.
@@ -76,8 +70,7 @@ define mark_regions as (
 
 backwardmode (
 
-  define RV as $pV <= cursor
-  // define R1 as $p1 <= cursor
+  define R1 as $p1 <= cursor
 
   define palatalise_e as (
     [substring] among (
@@ -110,7 +103,7 @@ backwardmode (
   )
 
   define possessive_suffix as (
-    [substring] RV among (
+    [substring] R1 among (
       'ov' '{u*}v'
       (delete)
       'in'

From bfccdb29951bce2e531b7333b3b6e3139edd8deb Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Thu, 10 Oct 2024 16:36:16 +1300
Subject: [PATCH 22/22] czech: Merge two identical routines

---
 algorithms/czech.sbl | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/algorithms/czech.sbl b/algorithms/czech.sbl
index b3623ecf..49cb4419 100644
--- a/algorithms/czech.sbl
+++ b/algorithms/czech.sbl
@@ -1,9 +1,8 @@
 routines (
   R1
   palatalise_e
-  palatalise_ecaron
+  palatalise_ecaron_or_iacute
   palatalise_i
-  palatalise_iacute
   mark_regions
   possessive_suffix
   case_suffix
@@ -79,7 +78,7 @@ backwardmode (
     )
   )
 
-  define palatalise_ecaron as (
+  define palatalise_ecaron_or_iacute as (
     [substring] among (
       '{c^}t' (<- 'ck')
       '{s^}t' (<- 'sk')
@@ -95,13 +94,6 @@ backwardmode (
     )
   )
 
-  define palatalise_iacute as (
-    [substring] among (
-      '{c^}t' (<- 'ck')
-      '{s^}t' (<- 'sk')
-    )
-  )
-
   define possessive_suffix as (
     [substring] R1 among (
       'ov' '{u*}v'
@@ -127,7 +119,7 @@ backwardmode (
       '{e^}' '{e^}tem' '{e^}mi' '{e^}te' '{e^}ti'
       (
         delete
-        try palatalise_ecaron
+        try palatalise_ecaron_or_iacute
       )
       'e' 'ech' 'em' 'emi' 'ete' 'etem'
       (
@@ -142,7 +134,7 @@ backwardmode (
       '{i'}' '{i'}ch' '{i'}ho' '{i'}m' '{i'}mi' '{i'}mu'
       (
         delete
-        try palatalise_iacute
+        try palatalise_ecaron_or_iacute
       )
     )
   )