diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 3ac4451..786c184 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -18,6 +18,7 @@ jobs:
     - name: Run tests
       run: |-
+        apt-get install python3-icu
         pip install .
         python -m unittest discover -s tests
diff --git a/README.md b/README.md
index e49136f..4ff0ebc 100644
--- a/README.md
+++ b/README.md
@@ -198,8 +198,8 @@ This modifier needs a third column in the training data with per-word (technical
 ```yaml
 - Tags: 0.05
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   spm_vocab: path/to/vocab.enzh.spm
   template: "__source__ {src} __target__ {trg} __done__"
 ```
@@ -218,8 +218,8 @@ Sometimes we want to just replace the source token with the target token directl
 ```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   replace: 0.4 # 0.4 out of the time tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
 ```
@@ -229,8 +229,8 @@ If alignment information is present, we can augment the training data with inlin
 ```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 out of the time tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality use augment: 1
 ```
diff --git a/contrib/test_enzh_config.yml b/contrib/test_enzh_config.yml
index e3eb124..804643e 100644
--- a/contrib/test_enzh_config.yml
+++ b/contrib/test_enzh_config.yml
@@ -14,8 +14,8 @@ modifiers:
 - TitleCase: 0.2
 - Typos: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
   # We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
   # unless you really know what you are doing.
diff --git a/contrib/test_enzh_noise_config.yml b/contrib/test_enzh_noise_config.yml
index 20cde0f..8fd7dbd 100644
--- a/contrib/test_enzh_noise_config.yml
+++ b/contrib/test_enzh_noise_config.yml
@@ -13,8 +13,8 @@ modifiers:
 - UpperCase: 0.2
 - TitleCase: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 out of the time tags is triggered, instead augment the source and the target with random noise
   replace: 0.4 # =====//=====, instead replace the target token with random noise, and use that random noise to tag a corresponding source word
   # template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
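For orientation, the `"moses:zh"` / `"moses:null"` values introduced in the README and config changes above are detokenizer spec strings of the form `<detokenizer>:<language>`. A minimal sketch of how such a config maps onto the modifier's constructor (illustrative only, not part of the patch; the keyword arguments come from `PlaceholderTagModifier.__init__` as changed further down):

```python
# Hedged sketch: constructing the Tags modifier directly with the new
# "<detokenizer>:<language>" spec strings used in the YAML examples above.
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

tags = PlaceholderTagModifier(
    probability=0.1,
    custom_detok_src="moses:null",  # "null" language for the source detokenizer, as in the configs
    custom_detok_trg="moses:zh",    # Moses detokenizer for Chinese on the target side
    augment=0.4,                    # 40% of triggers insert random noise on both sides
)
```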
diff --git a/contrib/test_enzh_tags_advanced_config.yml b/contrib/test_enzh_tags_advanced_config.yml
index bfdc0dd..e2cae55 100644
--- a/contrib/test_enzh_tags_advanced_config.yml
+++ b/contrib/test_enzh_tags_advanced_config.yml
@@ -15,8 +15,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"
 - *modifiers
@@ -26,8 +26,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"
 - *modifiers
diff --git a/contrib/test_enzh_tags_stage_config.yml b/contrib/test_enzh_tags_stage_config.yml
index 720c13b..dbe44a9 100644
--- a/contrib/test_enzh_tags_stage_config.yml
+++ b/contrib/test_enzh_tags_stage_config.yml
@@ -12,8 +12,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"
 end:
@@ -22,8 +22,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"
 seed: 1111
diff --git a/contrib/test_full_config.yml b/contrib/test_full_config.yml
index 0f0582d..6d0b9d2 100644
--- a/contrib/test_full_config.yml
+++ b/contrib/test_full_config.yml
@@ -52,8 +52,8 @@ modifiers:
     repeated_char: 0.1 # Repeats a random word character.
     unichar: 0.1 # Replaces a random consecutive repeated letter with a single letter.
 - Tags: 0.08
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "__source__ {src} __target__ {trg} __done__"
 seed: 1111
diff --git a/contrib/test_zhen_config.yml b/contrib/test_zhen_config.yml
index 5132790..4715710 100644
--- a/contrib/test_zhen_config.yml
+++ b/contrib/test_zhen_config.yml
@@ -12,7 +12,7 @@ start:
 modifiers:
 # No UpperCase or TitleCase modifier when the source is Chinese as we can't upper or lowercase Chinese
 - Tags: 0.1
-  custom_detok_src: zh
+  custom_detok_src: "moses:zh"
 seed: 1111
 trainer: cat
diff --git a/contrib/train_config.yml b/contrib/train_config.yml
index b06b699..627f51c 100644
--- a/contrib/train_config.yml
+++ b/contrib/train_config.yml
@@ -31,8 +31,8 @@ modifiers:
 - UpperCase: 0.05
 - TitleCase: 0.05
 #- Tags: 0.08 # Requires dataset augmented with alignment info
-#  custom_detok_src: null # Null value for the src detokenizer
-#  custom_detok_trg: zh
+#  custom_detok_src: "moses:null" # Null value for the src detokenizer
+#  custom_detok_trg: "moses:zh"
 #  template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
 #  # We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
 #  # unless you really know what you are doing.
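The source changes below resolve these spec strings through `make_detokenizer`/`make_tokenizer` and the `DETOKENIZERS`/`TOKENIZERS` tables in `src/opustrainer/tokenizers.py`. A rough usage sketch of the resolution and of the ICU detokenizer's whitespace handling (assumes the patch is applied; not part of the diff):

```python
# Rough sketch assuming this patch is applied: "provider:language" specs resolve
# via the DETOKENIZERS table, and the ICU detokenizer treats "▁" as a whitespace token.
from opustrainer.tokenizers import make_detokenizer

moses_detok = make_detokenizer("moses:zh")  # existing Moses path
icu_detok = make_detokenizer("icu:en")      # new ICU path added below

text, spans = icu_detok.detokenize(["Hello", "▁", "world", "!"])
assert text == "Hello world!"
assert spans[1] is None  # whitespace tokens get a None span
```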
diff --git a/src/opustrainer/modifiers/placeholders.py b/src/opustrainer/modifiers/placeholders.py
index 4af71b0..df86233 100644
--- a/src/opustrainer/modifiers/placeholders.py
+++ b/src/opustrainer/modifiers/placeholders.py
@@ -5,7 +5,8 @@
 from opustrainer.alignments import Pair, parse_alignments, format_alignments
 from opustrainer.modifiers import Modifier
-from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, MosesDetokenizer, SentencePieceTokenizer
+from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, SentencePieceTokenizer, \
+    make_detokenizer, ICU_WHITESPACE_TOKEN
 from opustrainer.modifiers.retokenize import Retokenizer, remap_alignment_pairs
 from opustrainer import logger
@@ -231,8 +232,8 @@ class PlaceholderTagModifier(Modifier):
     ```yaml
     modifiers:
     - Tags: 0.02
-      custom_detok_src: 'zh'
-      custom_detok_trg: null
+      custom_detok_src: 'moses:zh'
+      custom_detok_trg: "moses:null"
       template: "__source__ {src} __target__ {trg} __done__"
       augment: 0.0 # 0% chance to just insert a random string on both sides
       replace: 0.0 # 0% change to use tags to force translate to a random string
@@ -252,18 +253,20 @@ class PlaceholderTagModifier(Modifier):
     def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
                  custom_detok_trg: Optional[str]=None, spm_vocab: Optional[Path]=None,
-                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0):
+                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0, tag:float=1):
         super().__init__(probability)

         self.template = template
+        self.custom_detok_src = custom_detok_src
+        self.custom_detok_trg = custom_detok_trg

         self.src_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )

         self.trg_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )
@@ -281,7 +284,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
         if replace > 0:
             self.modes.append(('replace', replace))

-        self.modes.append(('tag', 1.0)) # Weight doesn't matter as long as cumsum => 1.0, it's last on the list anyway
+        # the modifier can also be used for inline noise augmentation only
+        if tag > 0:
+            self.modes.append(('tag', tag))
+
+        if ({'replace', 'tag'} & {mode for mode, _ in self.modes}) and \
+            'icu' in ((self.custom_detok_src or '') + (self.custom_detok_trg or '')):
+            raise ValueError('ICU tokenization is not supported with "tag" and "replace" modes')

     def __call__(self, batch: List[str]) -> Iterable[str]:
         for line in batch:
@@ -293,7 +302,7 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
     def apply(self, line:str) -> str:
         """Applies tag to words in a line based on alignment info, and then removes the alignment info from the line.
         This is used to enable terminology support by tagging random words with their translation.
-        eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake. 
+        eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake.
         By default the detokenizer used is the trivial detokenizer, but we can instead have separate detokenizers on src and trg."
         """
@@ -333,7 +342,7 @@ def apply(self, line:str) -> str:
                 continue

             # Select mode (skip random_weighted_choices*() when 'tag' is the only mode)
-            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else 'tag'
+            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else self.modes[0][0]

             if mode == "tag" or mode == "replace":
                 if mode == "tag":
@@ -375,19 +384,19 @@ def apply(self, line:str) -> str:
                 # Augment mode adds random noise both on the source and the target without any
                 # tagging encouraging the model to copy crap from one side to the other.
                 augment_tokens = get_random_unicode_words()
-                source = source[:candidate.src+1] + augment_tokens + source[candidate.src+1:]
-                target = target[:candidate.trg+1] + augment_tokens + target[candidate.trg+1:]
+                source, num_src_aug_tokens, pos_aug_src = self.insert_augmented(augment_tokens, source, candidate.src+1, self.custom_detok_src)
+                target, num_trg_aug_tokens, pos_aug_trg = self.insert_augmented(augment_tokens, target, candidate.trg+1, self.custom_detok_trg)

                 # Fix up alignment pairs
                 alignments = (
                     # pairs before and including the candidate stay the same
                     alignments[:candidate_index+1]
                     # fill in the gap created by the added random noise
-                    + [Pair(candidate.src + n, candidate.trg + n) for n in range(1, len(augment_tokens) + 1)]
+                    + [Pair(candidate.src + n_src, candidate.trg + n_trg) for n_src, n_trg in zip(pos_aug_src, pos_aug_trg)]
                     # pairs after the replaced bit have to be offset by the length of the replacement bit
-                    + [Pair(pair.src + len(augment_tokens), pair.trg + len(augment_tokens)) for pair in alignments[candidate_index+1:]]
+                    + [Pair(pair.src + num_src_aug_tokens, pair.trg + num_trg_aug_tokens) for pair in alignments[candidate_index+1:]]
                 )
-                candidate_offset = candidate_index + len(augment_tokens) + 1
+                candidate_offset = candidate_index + min(num_src_aug_tokens, num_trg_aug_tokens) + 1

         source_detok, _, source_mapping = self.src_retokenizer.retokenize(source)
         target_detok, _, target_mapping = self.trg_retokenizer.retokenize(target)
@@ -398,6 +407,46 @@ def apply(self, line:str) -> str:
         else:
             return source_detok + "\t" + target_detok

+    def insert_augmented(self, augment_tokens: List[str], tokens: List[str], position: int, detokenization: Optional[str]) -> Tuple[List[str], int, List[int]]:
+        """
+        Inserts augmented tokens.
+        Accounts for possible ICU detokenization which uses the special symbol "▁" for whitespace tokens.
+        Such tokens will also be inserted to separate the augmented words.
+
+        Returns:
+            new tokens
+            number of augmented tokens, including whitespace tokens in the ICU case
+            alignment positions for the augmented tokens (whitespaces are excluded, we don't need alignments for them)
+        """
+        prefix = tokens[:position]
+        postfix = tokens[position:]
+        aug_aln_offset = []
+
+        if detokenization is not None and "icu" in detokenization:
+            new_aug_tokens = []
+            aug_pos_index = 1
+
+            if len(prefix) > 0 and prefix[-1] != ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 1
+
+            for token in augment_tokens:
+                new_aug_tokens.append(token)
+                # save the offset of the augmented words to use in alignments
+                aug_aln_offset.append(aug_pos_index)
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 2
+
+            if len(postfix) > 0 and postfix[0] == ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.pop()
+
+            augment_tokens = new_aug_tokens
+        else:
+            aug_aln_offset = list(range(1, len(augment_tokens) + 1))
+
+        tokens = prefix + augment_tokens + postfix
+        return tokens, len(augment_tokens), aug_aln_offset
+
     def validate(self, context:List[Modifier]) -> None:
         """Current limitation of the tags modifier is that any other modifier might modify the inserted tags,
         which we don't want. So warn users about that if we notice it.
diff --git a/src/opustrainer/modifiers/retokenize.py b/src/opustrainer/modifiers/retokenize.py
index 8d42ca0..9c47363 100644
--- a/src/opustrainer/modifiers/retokenize.py
+++ b/src/opustrainer/modifiers/retokenize.py
@@ -33,6 +33,10 @@ def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:
         prev_j = 0

         for i, old_token_span in enumerate(old_token_spans):
+            # the ICU detokenizer yields None spans for whitespace tokens; skip them (their mapping stays empty)
+            if old_token_span is None:
+                continue
+
             for j, new_token_span in enumerate(new_token_spans[prev_j:], start=prev_j):
                 prev_j = j
                 overlap = slice_cmp(old_token_span, new_token_span)
@@ -59,8 +63,8 @@ def remap_alignment_pairs(src_mapping:TokenMapping, trg_mapping:TokenMapping, al
     sentence pair.

     E.g. if you have
-      source-mapping: [0 => [3,4], 1 => [5]],
-      target-mapping: [0 => [0], 1 => [1]]
+      source-mapping: [0 => [3,4], 1 => [5], 2 => []],
+      target-mapping: [0 => [0], 1 => [1], 2 => []]
       alignments: [(0,1), (1,1)]
     it will return [
       (3,1), (4,1), # the [0 => [3,4]] mapping
diff --git a/src/opustrainer/tokenizers.py b/src/opustrainer/tokenizers.py
index f063fb8..2dc4d5f 100644
--- a/src/opustrainer/tokenizers.py
+++ b/src/opustrainer/tokenizers.py
@@ -11,12 +11,15 @@
 DETOKENIZERS = {
     'moses': lambda lang: MosesDetokenizer(lang),
     'spaces': lambda: SpaceDetokenizer(),
+    'icu': lambda lang: IcuDetokenizer(lang),
+
 }

 TOKENIZERS = {
     'moses': lambda lang: MosesTokenizer(lang),
     'spm': lambda vocab: SentencePieceTokenizer(vocab),
     'spaces': lambda: SpaceTokenizer(),
+    'icu': lambda lang: IcuTokenizer(lang),
 }
@@ -126,3 +129,72 @@ def tokenize(self, text:str) -> Tuple[TokenList,TokenSpanList]:
         tokens = [text[span] for span in spans]
         return tokens, spans

+# The same character as in SentencePiece
+ICU_WHITESPACE_TOKEN = "▁"
+
+class IcuTokenizer:
+    """
+    Tokenizes text by splitting words and punctuation using the ICU segmenter.
+    Whitespaces will be preserved as a special token ▁ for lossless detokenization.
+    Requires installation with the steps specified in https://pypi.org/project/PyICU/
+    """
+
+    def __init__(self, lang: str):
+        self.lang = lang
+
+    def tokenize(self, text:str) -> Tuple[TokenList, TokenSpanList]:
+        from icu import BreakIterator, Locale
+
+        bi = BreakIterator.createWordInstance(Locale(self.lang))
+        bi.setText(text)
+
+        tokens = []
+        start = bi.first()
+        for end in bi:
+            token = text[start:end]
+            if (
+                token and token != "\n"
+            ):  # exclude empty tokens, but leave whitespaces and replace them with a special token
+                tokens.append(token)
+            start = end
+
+        spans: TokenSpanList = []
+        offset = 0
+        for token in tokens:
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in original text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        tokens = [token.replace(" ", ICU_WHITESPACE_TOKEN) for token in tokens]
+        return tokens, spans
+
+
+class IcuDetokenizer:
+    """
+    Detokenizes tokens back into the original text preserving whitespaces as well.
+    Spans for whitespaces will be None.
+    """
+
+    # For compatibility with MosesDetokenizer interface
+    def __init__(self, lang):
+        self.lang = lang
+
+    def detokenize(self, tokens:TokenList) -> Tuple[str,TokenSpanList]:
+        text = "".join(tokens).replace(ICU_WHITESPACE_TOKEN, " ")
+
+        spans = []
+        offset = 0
+
+        for token in tokens:
+            if token == ICU_WHITESPACE_TOKEN:
+                spans.append(None)
+                continue
+            # there are some edge cases where a whitespace can appear inside a token
+            token = token.replace(ICU_WHITESPACE_TOKEN, " ")
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in detokenized text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        return text, spans
diff --git a/src/opustrainer/trainer.py b/src/opustrainer/trainer.py
index f75baaf..ba5c43a 100755
--- a/src/opustrainer/trainer.py
+++ b/src/opustrainer/trainer.py
@@ -504,8 +504,8 @@ def _load_modifiers(self, ymldata:dict, basepath:str) -> List[Modifier]:
         - TitleCase: 0.05
         - Tags: 0.02
           num_tags: 6
-          custom_detok_src: null
-          custom_detok_trg: zh
+          custom_detok_src: "moses:null"
+          custom_detok_trg: "moses:zh"
         ```
     """
     modifiers = [
diff --git a/tests/test_placeholders.py b/tests/test_placeholders.py
index 5cd146d..32ef0b6 100644
--- a/tests/test_placeholders.py
+++ b/tests/test_placeholders.py
@@ -40,13 +40,22 @@ def test_tagger_augment(self):
         output = tagger(['Hello world\tHallo Welt\t0-0 1-1'])
         self.assertEqual(first(output), 'Hello িৡহ world ЇӤӕѣѮ қӃӄЀҲ\tHallo িৡহ Welt ЇӤӕѣѮ қӃӄЀҲ\t0-0 1-1 2-2 3-3 4-4')

+    def test_tagger_augment_icu(self):
+        """Augment mode will add random noise without tags to both source and target
+        sentence, teaching the model to copy strings it doesn't understand."""
+        tagger = PlaceholderTagModifier(probability=1, augment=1, tag=0, custom_detok_src='icu:en', custom_detok_trg='icu:de')
+        tagger.print_alignments = True
+        output = tagger(['Hello ▁ world\tHallo ▁ Welt\t0-0 1-1 2-2'])
+        self.assertEqual(first(output), 'Hello িৡহ world ټ؇ۤە٣ٮڛۃ \tHallo িৡহ Welt ټ؇ۤە٣ٮڛۃ \t0-0 1-1 2-2 3-3')
+
+
     def test_retokenize(self):
         """Pass the spm vocab to the placeholder tag generator so that it can
         retokenize the input, and update the alignments accordingly."""
         tagger = PlaceholderTagModifier(
             probability=0.25,
-            custom_detok_src='en',
-            custom_detok_trg='zh',
+            custom_detok_src='moses:en',
+            custom_detok_trg='moses:zh',
             spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue
         output = tagger(['\t'.join([
@@ -80,13 +89,43 @@ def test_retokenize(self):
             # 7-9 [.] [。] 18-16
         ])

+    def test_augment_icu(self):
+        """Pass the spm vocab to the placeholder tag generator so that it can
+        retokenize the input, and update the alignments accordingly."""
+        tagger = PlaceholderTagModifier(
+            probability=0.2,
+            augment=1,
+            tag=0,
+            custom_detok_src='icu:en',
+            custom_detok_trg='icu:zh',
+            spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue
+
+        output = tagger(['\t'.join([
+            'This ▁ is ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣 .',
+            #^0   ^1^2 ^3^4^5^6      ^7^8    ^9^10       ^11^12^13
+            '这 是 一个 简单 的 测试 语 句 ▁ 🤣 ▁ 。',
+            #^0 ^1 ^2  ^3  ^4 ^5  ^6 ^7^8 ^9^10^11
+            '0-0 2-1 4-2 6-3 6-4 8-5 10-6 10-7 12-9 13-11',
+        ])])
+
+        self.assertEqual(first(output).split('\t'), [
+            'This িৡহ is a simple test statement 🤣.',
+            # ['This', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' is', ' a', ' simple', ' test', ' statement', ' ', '', '', '', '🤣', '.']
+            '这 িৡহ 是一个简单的测试语句 🤣 。',
+            # ['这', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' 是', '一', '个', '简', '单', '的', '测', '试', '语', '句', ' ', '', '', '', '🤣', ' 。']
+            '0-0 4-4 4-5 4-6 4-7 4-8 4-9 4-10 5-4 5-5 5-6 5-7 5-8 5-9 5-10 6-4 6-5 6-6 '
+            '6-7 6-8 6-9 6-10 7-4 7-5 7-6 7-7 7-8 7-9 7-10 8-4 8-5 8-6 8-7 8-8 8-9 8-10 '
+            '9-4 9-5 9-6 9-7 9-8 9-9 9-10 10-4 10-5 10-6 10-7 10-8 10-9 10-10 11-11 12-12 '
+            '12-13 13-14 13-15 13-16 14-17 14-18 15-19 15-20 20-25 21-26'
+        ])
+
     def test_retokenize_on_non_trigger(self):
         """Pass the spm vocab to the placeholder tag generator so that it can
         retokenize the input, even if probability is 0."""
         tagger = PlaceholderTagModifier(
             probability=0.0,
-            custom_detok_src='en',
-            custom_detok_trg='zh',
+            custom_detok_src='moses:en',
+            custom_detok_trg='moses:zh',
             spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue

         output = tagger(['\t'.join([
@@ -109,7 +148,7 @@ def test_mode(self):
         multiple modes are enabled."""
         tagger = PlaceholderTagModifier(
             probability=1.0,
-            custom_detok_src='zh',
+            custom_detok_src='moses:zh',
             augment=0.33,
             replace=0.33,
             # tag=0.33 is implicit
diff --git a/tests/test_retokenizer.py b/tests/test_retokenizer.py
index c74920e..4ac987a 100644
--- a/tests/test_retokenizer.py
+++ b/tests/test_retokenizer.py
@@ -10,7 +10,7 @@ def first(it):
     return next(iter(it))

-class TestTokenizer(unittest.TestCase):
+class TestRetokenizer(unittest.TestCase):
     maxDiff = None

     def test_identity(self):
@@ -81,4 +81,36 @@ def test_retokenize(self):
             # 7-9 [.] [。] 11-16
         ]))

+    def test_retokenize_icu(self):
+        tokenizer = RetokenizeModifier(
+            src=dict(detokenize='icu:en', tokenize=f'spm:{VOCAB}'),
+            trg=dict(detokenize='icu:zh', tokenize=f'spm:{VOCAB}'))
+
+        out = tokenizer(['\t'.join([
+            'This ▁ is ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣 .',
+            #^0   ^1^2 ^3^4^5^6      ^7^8    ^9^10       ^11^12^13
+            '这 是 一个 简单 的 测试 语 句 ▁ 🤣 ▁ 。',
+            #^0 ^1 ^2  ^3  ^4 ^5  ^6 ^7^8 ^9^10^11
+            '0-0 2-1 4-2 6-3 6-4 8-5 10-6 10-7 12-9 13-11',
+        ])])
+
+        self.assertEqual(first(out), '\t'.join([
+            'This is a simple test statement 🤣.',
+            #[This][ is][ a][ simple][ test][ statement][ ][] [] [] [🤣][.]
+            #^0    ^1   ^2  ^3        ^4     ^5          ^6  ^7 ^8 ^9 ^10 ^11
+            '这是一个简单的测试语句 🤣 。',
+            #[这][是][一][个][简][单][的][测][试][语][句] [ ] [] [] [] [🤣][ 。]
+            #^0  ^1  ^2  ^3  ^4  ^5  ^6  ^7  ^8  ^9  ^10 ^11 ^12 ^13 ^14 ^15 ^16
+            '0-0 1-1 2-2 2-3 3-4 3-5 3-6 4-7 4-8 5-9 5-10 10-15 11-16',
+            # 0-0 [This]      [这]   0-0
+            # 1-1 [is]        [是]   1-1
+            # 2-2 [a]         [一个] 2-2 2-3
+            # 3-3 [simple]    [简单] 3-4 3-5
+            # 3-4 [simple]    [的]   3-6
+            # 4-5 [test]      [测试] 4-7 4-8
+            # 5-6 [statement] [语]   5-9
+            # 5-7 [statement] [句]   5-10 (6-11)
+            # 6-8 [🤣]        [🤣]   (7-12 8-13 9-14) 10-15
+            # 7-9 [.]         [。]   11-16
+        ]))
diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py
new file mode 100644
index 0000000..eed06ed
--- /dev/null
+++ b/tests/test_tokenizers.py
@@ -0,0 +1,37 @@
+import unittest
+
+from opustrainer.tokenizers import make_tokenizer, make_detokenizer
+
+
+class TestTokenizers(unittest.TestCase):
+
+    def test_tokenize_detokenize_icu_en(self):
+        """
+        Tests lossless text reconstruction by the ICU tokenizer for English.
+        Requires installation with the steps specified in https://pypi.org/project/PyICU/
+        """
+        tokenizer = make_tokenizer('icu:en')
+        detokenizer = make_detokenizer('icu:en')
+        text = '“This is,” a simple test statement 🤣.'
+
+        tokens, _ = tokenizer.tokenize(text)
+        detokenized, _ = detokenizer.detokenize(tokens)
+
+        self.assertEqual(text, detokenized)
+        self.assertEqual("“ This ▁ is , ” ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣.", " ".join(tokens))
+
+
+    def test_tokenize_detokenize_icu_zh(self):
+        """
+        Tests lossless text reconstruction by the ICU tokenizer for Chinese.
+        Requires installation with the steps specified in https://pypi.org/project/PyICU/
+        """
+        tokenizer = make_tokenizer('icu:zh')
+        detokenizer = make_detokenizer('icu:zh')
+        text = '这是一个简单的测试语句 🤣 。'
+
+        tokens, _ = tokenizer.tokenize(text)
+        detokenized, _ = detokenizer.detokenize(tokens)
+
+        self.assertEqual(text, detokenized)
+        self.assertEqual("这 是 一个 简单 的 测试 语 句 ▁ 🤣▁ 。", " ".join(tokens))
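As a quick sanity check of the approach (outside this patch), the ICU word segmentation that `IcuTokenizer` builds on can be exercised directly with PyICU. The sketch below assumes PyICU is installed (e.g. via `apt-get install python3-icu`, as added to the workflow above); the helper name is illustrative only:

```python
# Standalone sketch of the PyICU word-boundary segmentation IcuTokenizer relies on.
# Keeping whitespace pieces makes the segmentation lossless: joining the pieces
# reproduces the original string exactly.
from icu import BreakIterator, Locale

def icu_word_pieces(text: str, lang: str = "en"):
    bi = BreakIterator.createWordInstance(Locale(lang))
    bi.setText(text)
    pieces = []
    start = bi.first()
    for end in bi:  # iterating a BreakIterator yields successive boundary offsets
        piece = text[start:end]
        if piece:
            pieces.append(piece)
        start = end
    return pieces

pieces = icu_word_pieces("This is a simple test statement 🤣.")
assert "".join(pieces) == "This is a simple test statement 🤣."
```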