
Commit ee534f3
Add support for ICU tokenizer
eu9ene committed Nov 22, 2024
1 parent 7be3b4d commit ee534f3
Showing 16 changed files with 281 additions and 47 deletions.
1 change: 1 addition & 0 deletions .github/workflows/release.yaml
@@ -18,6 +18,7 @@ jobs:

     - name: Run tests
       run: |-
+        apt-get install python3-icu
         pip install .
         python -m unittest discover -s tests
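As a quick sanity check that the new PyICU dependency is usable, the same `BreakIterator` API the new tokenizer relies on can be exercised directly; a minimal sketch, not part of this commit (assumes PyICU is installed as above):

```python
# Hypothetical smoke test for the PyICU dependency; mirrors the calls
# IcuTokenizer makes further down in this commit.
from icu import BreakIterator, Locale

text = "Hello, world!"
bi = BreakIterator.createWordInstance(Locale("en"))
bi.setText(text)

tokens = []
start = bi.first()
for end in bi:  # iterating yields successive boundary offsets
    tokens.append(text[start:end])
    start = end

print(tokens)  # expected: ['Hello', ',', ' ', 'world', '!']
```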
12 changes: 6 additions & 6 deletions README.md
@@ -198,8 +198,8 @@ This modifier needs a third column in the training data with per-word (technical

```yaml
 - Tags: 0.05
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   spm_vocab: path/to/vocab.enzh.spm
   template: "__source__ {src} __target__ {trg} __done__"
```
@@ -218,8 +218,8 @@ Sometimes we want to just replace the source token with the target token directl
```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   replace: 0.4 # 0.4 of the time Tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
```

@@ -229,8 +229,8 @@ If alignment information is present, we can augment the training data with inlin
```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 of the time Tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality use augment: 1
```

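The README examples above use the new `tokenizer:language` spec strings. A minimal usage sketch of how the `Tags` modifier consumes them (the tab-separated source/target/alignments line format follows the modifier's own docstring; the tagged word is chosen at random, so output varies by seed):

```python
# Hedged usage sketch; requires OpusTrainer and its sacremoses dependency.
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

modifier = PlaceholderTagModifier(
    probability=1.0,              # always trigger, for demonstration
    custom_detok_trg='moses:zh',  # new "tokenizer:language" spec string
)

# Tokenized source and target plus word alignments in the third column.
line = 'I like cake\t我 喜欢 蛋糕\t0-0 1-1 2-2'
for output in modifier([line]):
    # e.g. 'I __source__ like __target__ 喜欢 __done__ cake\t我喜欢蛋糕'
    print(output)
```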
4 changes: 2 additions & 2 deletions contrib/test_enzh_config.yml
@@ -14,8 +14,8 @@ modifiers:
 - TitleCase: 0.2
 - Typos: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
# We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
# unless you really know what you are doing.
4 changes: 2 additions & 2 deletions contrib/test_enzh_noise_config.yml
@@ -13,8 +13,8 @@ modifiers:
 - UpperCase: 0.2
 - TitleCase: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 of the time Tags is triggered, instead augment the source and the target with random noise
   replace: 0.4 # =====//=====, instead replace the target token with random noise, and use that random noise to tag a corresponding source word
# template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
8 changes: 4 additions & 4 deletions contrib/test_enzh_tags_advanced_config.yml
@@ -15,8 +15,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"
 - *modifiers

@@ -26,8 +26,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"
 - *modifiers

8 changes: 4 additions & 4 deletions contrib/test_enzh_tags_stage_config.yml
@@ -12,8 +12,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"

end:
@@ -22,8 +22,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"

seed: 1111
4 changes: 2 additions & 2 deletions contrib/test_full_config.yml
@@ -52,8 +52,8 @@ modifiers:
     repeated_char: 0.1 # Repeats a random word character.
     unichar: 0.1 # Replaces a random consecutive repeated letter with a single letter.
 - Tags: 0.08
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "__source__ {src} __target__ {trg} __done__"

seed: 1111
2 changes: 1 addition & 1 deletion contrib/test_zhen_config.yml
@@ -12,7 +12,7 @@ start:
 modifiers:
 # No UpperCase or TitleCase modifier when the source is Chinese as we can't upper or lowercase Chinese
 - Tags: 0.1
-  custom_detok_src: zh
+  custom_detok_src: "moses:zh"

seed: 1111
trainer: cat
4 changes: 2 additions & 2 deletions contrib/train_config.yml
@@ -31,8 +31,8 @@ modifiers:
 - UpperCase: 0.05
 - TitleCase: 0.05
 #- Tags: 0.08 # Requires dataset augmented with alignment info
-#  custom_detok_src: null # Null value for the src detokenizer
-#  custom_detok_trg: zh
+#  custom_detok_src: "moses:null" # Null value for the src detokenizer
+#  custom_detok_trg: "moses:zh"
# template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
# We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
# unless you really know what you are doing.
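All of the config updates above keep the Moses detokenizer (`moses:` prefix). The commit also registers an `icu:` detokenizer, but per the new guard in `PlaceholderTagModifier.__init__` below it can only be combined with augment-only noise insertion. A sketch of what that constraint means in practice (parameter values are illustrative):

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

# Allowed: ICU detokenization with augment-only noise, i.e. neither the
# 'tag' nor the 'replace' mode is active.
noise_only = PlaceholderTagModifier(
    probability=0.1,
    custom_detok_trg='icu:zh',
    augment=1.0,  # always augment when the modifier triggers
    tag=0.0,      # new parameter: disables the default 'tag' mode
)

# Not allowed: the default tag mode together with an ICU detokenizer.
try:
    PlaceholderTagModifier(probability=0.1, custom_detok_trg='icu:zh')
except ValueError as err:
    print(err)  # ICU tokenization is not supported with "tag" and "replace" modes
```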
77 changes: 63 additions & 14 deletions src/opustrainer/modifiers/placeholders.py
@@ -5,7 +5,8 @@

 from opustrainer.alignments import Pair, parse_alignments, format_alignments
 from opustrainer.modifiers import Modifier
-from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, MosesDetokenizer, SentencePieceTokenizer
+from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, SentencePieceTokenizer, \
+    make_detokenizer, ICU_WHITESPACE_TOKEN
 from opustrainer.modifiers.retokenize import Retokenizer, remap_alignment_pairs
 from opustrainer import logger

@@ -231,8 +232,8 @@ class PlaceholderTagModifier(Modifier):
     ```yaml
     modifiers:
     - Tags: 0.02
-      custom_detok_src: 'zh'
-      custom_detok_trg: null
+      custom_detok_src: 'moses:zh'
+      custom_detok_trg: "moses:null"
       template: "__source__ {src} __target__ {trg} __done__"
       augment: 0.0 # 0% chance to just insert a random string on both sides
       replace: 0.0 # 0% chance to use tags to force translate to a random string
@@ -252,18 +253,20 @@

     def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None, custom_detok_trg: Optional[str]=None,
                  spm_vocab: Optional[Path]=None,
-                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0):
+                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0, tag:float=1):
         super().__init__(probability)

         self.template = template
         self.custom_detok_src = custom_detok_src
         self.custom_detok_trg = custom_detok_trg

         self.src_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )

         self.trg_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )

@@ -281,7 +284,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
         if replace > 0:
             self.modes.append(('replace', replace))

-        self.modes.append(('tag', 1.0)) # Weight doesn't matter as long as cumsum => 1.0, it's last on the list anyway
+        # the modifier can also be used for inline noise augmentation only
+        if tag > 0:
+            self.modes.append(('tag', tag))
+
+        if ({'replace', 'tag'} & {mode for mode, _ in self.modes}) and \
+            'icu' in ((self.custom_detok_src or '') + (self.custom_detok_trg or '')):
+            raise ValueError('ICU tokenization is not supported with "tag" and "replace" modes')

     def __call__(self, batch: List[str]) -> Iterable[str]:
         for line in batch:
@@ -293,7 +302,7 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
     def apply(self, line:str) -> str:
         """Applies tag to words in a line based on alignment info, and then removes the alignment info from the line.
         This is used to enable terminology support by tagging random words with their translation.
         eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake".
         By default the detokenizer used is the trivial detokenizer, but we can instead have separate detokenizers on src and trg.
         """

@@ -333,7 +342,7 @@ def apply(self, line:str) -> str:
                 continue

             # Select mode (skip random_weighted_choice() when there is only one mode)
-            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else 'tag'
+            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else self.modes[0][0]

             if mode == "tag" or mode == "replace":
                 if mode == "tag":
@@ -375,19 +384,19 @@ def apply(self, line:str) -> str:
             # Augment mode adds random noise both on the source and the target without any
             # tagging encouraging the model to copy crap from one side to the other.
             augment_tokens = get_random_unicode_words()
-            source = source[:candidate.src+1] + augment_tokens + source[candidate.src+1:]
-            target = target[:candidate.trg+1] + augment_tokens + target[candidate.trg+1:]
+            source, num_src_aug_tokens, pos_aug_src = self.insert_augmented(augment_tokens, source, candidate.src+1, self.custom_detok_src)
+            target, num_trg_aug_tokens, pos_aug_trg = self.insert_augmented(augment_tokens, target, candidate.trg+1, self.custom_detok_trg)

             # Fix up alignment pairs
             alignments = (
                 # pairs before and including the candidate stay the same
                 alignments[:candidate_index+1]
                 # fill in the gap created by the added random noise
-                + [Pair(candidate.src + n, candidate.trg + n) for n in range(1, len(augment_tokens) + 1)]
+                + [Pair(candidate.src + n_src, candidate.trg + n_trg) for n_src, n_trg in zip(pos_aug_src, pos_aug_trg)]
                 # pairs after the replaced bit have to be offset by the length of the replacement bit
-                + [Pair(pair.src + len(augment_tokens), pair.trg + len(augment_tokens)) for pair in alignments[candidate_index+1:]]
+                + [Pair(pair.src + num_src_aug_tokens, pair.trg + num_trg_aug_tokens) for pair in alignments[candidate_index+1:]]
             )
-            candidate_offset = candidate_index + len(augment_tokens) + 1
+            candidate_offset = candidate_index + min(num_src_aug_tokens, num_trg_aug_tokens) + 1

         source_detok, _, source_mapping = self.src_retokenizer.retokenize(source)
         target_detok, _, target_mapping = self.trg_retokenizer.retokenize(target)
@@ -398,6 +407,46 @@ def apply(self, line:str) -> str:
         else:
             return source_detok + "\t" + target_detok

+    def insert_augmented(self, augment_tokens: List[str], tokens: List[str], position: int, detokenization: Optional[str]) -> Tuple[List[str], int, List[int]]:
+        """
+        Inserts augmented tokens.
+        Accounts for possible ICU detokenization, which uses the special symbol "▁" for whitespace tokens.
+        Such tokens will also be inserted to separate the augmented words.
+        Returns:
+            new tokens
+            number of augmented tokens, including whitespace tokens in the ICU case
+            alignment positions for the augmented tokens (whitespaces are excluded, we don't need alignments for them)
+        """
+        prefix = tokens[:position]
+        postfix = tokens[position:]
+        aug_aln_offset = []
+
+        if detokenization is not None and "icu" in detokenization:
+            new_aug_tokens = []
+            aug_pos_index = 1
+
+            if len(prefix) > 0 and prefix[-1] != ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 1
+
+            for token in augment_tokens:
+                new_aug_tokens.append(token)
+                # save the offset of the augmented words to use in alignments
+                aug_aln_offset.append(aug_pos_index)
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 2
+
+            if len(postfix) > 0 and postfix[0] == ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.pop()
+
+            augment_tokens = new_aug_tokens
+        else:
+            aug_aln_offset = list(range(1, len(augment_tokens) + 1))
+
+        tokens = prefix + augment_tokens + postfix
+        return tokens, len(augment_tokens), aug_aln_offset

     def validate(self, context:List[Modifier]) -> None:
         """Current limitation of the tags modifier is that any other modifier might modify the
         inserted tags, which we don't want. So warn users about that if we notice it.
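To make the ▁ bookkeeping of `insert_augmented` concrete, here is a small worked example (a hypothetical call on an augment-only instance, since ICU detokenization forbids the tag/replace modes):

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier
from opustrainer.tokenizers import ICU_WHITESPACE_TOKEN as WS

mod = PlaceholderTagModifier(probability=1.0, custom_detok_trg='icu:en',
                             augment=1.0, tag=0.0)

tokens = ['Hello', WS, 'world']  # ICU tokenization of "Hello world"
new_tokens, num_added, offsets = mod.insert_augmented(
    ['NOISE1', 'NOISE2'], tokens, 1, 'icu:en')

print(new_tokens)  # ['Hello', '▁', 'NOISE1', '▁', 'NOISE2', '▁', 'world']
print(num_added)   # 4: two noise words plus two inserted '▁' separators
print(offsets)     # [2, 4]: word offsets used to build new alignment pairs
```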
8 changes: 6 additions & 2 deletions src/opustrainer/modifiers/retokenize.py
@@ -33,6 +33,10 @@ def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:

         prev_j = 0
         for i, old_token_span in enumerate(old_token_spans):
+            # the ICU detokenizer yields a None span for whitespace tokens; skip them (their mapping stays empty)
+            if old_token_span is None:
+                continue
+
             for j, new_token_span in enumerate(new_token_spans[prev_j:], start=prev_j):
                 prev_j = j
                 overlap = slice_cmp(old_token_span, new_token_span)
@@ -59,8 +63,8 @@ def remap_alignment_pairs(src_mapping:TokenMapping, trg_mapping:TokenMapping, al
     sentence pair.
     E.g. if you have
-    source-mapping: [0 => [3,4], 1 => [5]],
-    target-mapping: [0 => [0], 1 => [1]]
+    source-mapping: [0 => [3,4], 1 => [5], 2 => []],
+    target-mapping: [0 => [0], 1 => [1], 2 => []]
alignments: [(0,1), (1,1)]
it will return [
(3,1), (4,1), # the [0 => [3,4]] mapping
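The docstring example can be reproduced directly; a minimal sketch (assuming, as the docstring suggests, that a `TokenMapping` is a list of new-token index lists indexed by old token position):

```python
from opustrainer.alignments import parse_alignments, format_alignments
from opustrainer.modifiers.retokenize import remap_alignment_pairs

src_mapping = [[3, 4], [5], []]  # old src token 2 maps to nothing, e.g. a '▁' token
trg_mapping = [[0], [1], []]
alignments = parse_alignments('0-1 1-1')

remapped = remap_alignment_pairs(src_mapping, trg_mapping, alignments)
print(format_alignments(remapped))  # expected: '3-1 4-1 5-1'
```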
72 changes: 72 additions & 0 deletions src/opustrainer/tokenizers.py
@@ -11,12 +11,15 @@
 DETOKENIZERS = {
     'moses': lambda lang: MosesDetokenizer(lang),
     'spaces': lambda: SpaceDetokenizer(),
+    'icu': lambda lang: IcuDetokenizer(lang),
 }

 TOKENIZERS = {
     'moses': lambda lang: MosesTokenizer(lang),
     'spm': lambda vocab: SentencePieceTokenizer(vocab),
     'spaces': lambda: SpaceTokenizer(),
+    'icu': lambda lang: IcuTokenizer(lang),
 }


@@ -126,3 +129,72 @@ def tokenize(self, text:str) -> Tuple[TokenList,TokenSpanList]:
         tokens = [text[span] for span in spans]
         return tokens, spans

+# The same character as in SentencePiece
+ICU_WHITESPACE_TOKEN = "▁"
+
+class IcuTokenizer:
+    """
+    Tokenizes text by splitting words and punctuation using the ICU segmenter.
+    Whitespace is preserved as the special token ▁ for lossless detokenization.
+    Requires installation with the steps specified in https://pypi.org/project/PyICU/
+    """
+
+    def __init__(self, lang: str):
+        self.lang = lang
+
+    def tokenize(self, text:str) -> Tuple[TokenList, TokenSpanList]:
+        from icu import BreakIterator, Locale
+
+        bi = BreakIterator.createWordInstance(Locale(self.lang))
+        bi.setText(text)
+
+        tokens = []
+        start = bi.first()
+        for end in bi:
+            token = text[start:end]
+            # exclude empty tokens and newlines, but keep whitespace tokens
+            # (they are replaced with the special token below)
+            if token and token != "\n":
+                tokens.append(token)
+            start = end
+
+        spans: TokenSpanList = []
+        offset = 0
+        for token in tokens:
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in original text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        tokens = [token.replace(" ", ICU_WHITESPACE_TOKEN) for token in tokens]
+        return tokens, spans

+class IcuDetokenizer:
+    """
+    Detokenizes tokens back into the original text, preserving whitespace as well.
+    Spans for whitespace tokens will be None.
+    """
+
+    # For compatibility with the MosesDetokenizer interface
+    def __init__(self, lang):
+        self.lang = lang
+
+    def detokenize(self, tokens:TokenList) -> Tuple[str,TokenSpanList]:
+        text = "".join(tokens).replace(ICU_WHITESPACE_TOKEN, " ")
+
+        spans = []
+        offset = 0
+
+        for token in tokens:
+            if token == ICU_WHITESPACE_TOKEN:
+                spans.append(None)
+                continue
+            # there are some edge cases where a whitespace can appear inside a token
+            token = token.replace(ICU_WHITESPACE_TOKEN, " ")
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in detokenized text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        return text, spans
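A round trip through the two new classes illustrates the lossless behavior (assumes PyICU is installed; the expected values follow from the code above):

```python
from opustrainer.tokenizers import IcuTokenizer, IcuDetokenizer

text = "Hello, world!"
tokens, spans = IcuTokenizer("en").tokenize(text)
print(tokens)  # expected: ['Hello', ',', '▁', 'world', '!']

detok, detok_spans = IcuDetokenizer("en").detokenize(tokens)
print(detok == text)   # expected: True; detokenization restores the original text
print(detok_spans[2])  # expected: None; whitespace tokens get no span
```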
