diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 3ac4451..786c184 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -18,6 +18,7 @@ jobs:
     - name: Run tests
       run: |-
+        apt-get install python3-icu
         pip install .
         python -m unittest discover -s tests
diff --git a/README.md b/README.md
index e49136f..4ff0ebc 100644
--- a/README.md
+++ b/README.md
@@ -198,8 +198,8 @@ This modifier needs a third column in the training data with per-word (technical
 ```yaml
 - Tags: 0.05
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   spm_vocab: path/to/vocab.enzh.spm
   template: "__source__ {src} __target__ {trg} __done__"
 ```
@@ -218,8 +218,8 @@ Sometimes we want to just replace the source token with the target token directl
 ```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   replace: 0.4 # 0.4 out of the time tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
 ```
@@ -229,8 +229,8 @@ If alignment information is present, we can augment the training data with inlin
 ```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 out of the time tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality use augment: 1
 ```
diff --git a/contrib/test_enzh_config.yml b/contrib/test_enzh_config.yml
index e3eb124..804643e 100644
--- a/contrib/test_enzh_config.yml
+++ b/contrib/test_enzh_config.yml
@@ -14,8 +14,8 @@ modifiers:
 - TitleCase: 0.2
 - Typos: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
   # We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
   # unless you really know what you are doing.
diff --git a/contrib/test_enzh_noise_config.yml b/contrib/test_enzh_noise_config.yml
index 20cde0f..8fd7dbd 100644
--- a/contrib/test_enzh_noise_config.yml
+++ b/contrib/test_enzh_noise_config.yml
@@ -13,8 +13,8 @@ modifiers:
 - UpperCase: 0.2
 - TitleCase: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 out of the time tags is triggered, instead augment the source and the target with random noise
   replace: 0.4 # =====//=====, instead replace the target token with random noise, and use that random noise to tag a corresponding source word
   # template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
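For orientation, the `"moses:zh"` / `"moses:null"` values introduced in the README and config changes above are detokenizer spec strings of the form `<detokenizer>:<language>`. A minimal sketch of how such a config maps onto the modifier's constructor (illustrative only, not part of the patch; the keyword arguments come from `PlaceholderTagModifier.__init__` as changed further down):

```python
# Hedged sketch: constructing the Tags modifier directly with the new
# "<detokenizer>:<language>" spec strings used in the YAML examples above.
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

tags = PlaceholderTagModifier(
    probability=0.1,
    custom_detok_src="moses:null",  # "null" language for the source detokenizer, as in the configs
    custom_detok_trg="moses:zh",    # Moses detokenizer for Chinese on the target side
    augment=0.4,                    # 40% of triggers insert random noise on both sides
)
```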
diff --git a/contrib/test_enzh_tags_advanced_config.yml b/contrib/test_enzh_tags_advanced_config.yml
index bfdc0dd..e2cae55 100644
--- a/contrib/test_enzh_tags_advanced_config.yml
+++ b/contrib/test_enzh_tags_advanced_config.yml
@@ -15,8 +15,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"
 - *modifiers
@@ -26,8 +26,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"
 - *modifiers
diff --git a/contrib/test_enzh_tags_stage_config.yml b/contrib/test_enzh_tags_stage_config.yml
index 720c13b..dbe44a9 100644
--- a/contrib/test_enzh_tags_stage_config.yml
+++ b/contrib/test_enzh_tags_stage_config.yml
@@ -12,8 +12,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"
 end:
@@ -22,8 +22,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"
 seed: 1111
diff --git a/contrib/test_full_config.yml b/contrib/test_full_config.yml
index 0f0582d..6d0b9d2 100644
--- a/contrib/test_full_config.yml
+++ b/contrib/test_full_config.yml
@@ -52,8 +52,8 @@ modifiers:
     repeated_char: 0.1 # Repeats a random word character.
     unichar: 0.1 # Replaces a random consecutive repeated letter with a single letter.
 - Tags: 0.08
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "__source__ {src} __target__ {trg} __done__"
 seed: 1111
diff --git a/contrib/test_zhen_config.yml b/contrib/test_zhen_config.yml
index 5132790..4715710 100644
--- a/contrib/test_zhen_config.yml
+++ b/contrib/test_zhen_config.yml
@@ -12,7 +12,7 @@ start:
 modifiers:
 # No UpperCase or TitleCase modifier when the source is Chinese as we can't upper or lowercase Chinese
 - Tags: 0.1
-  custom_detok_src: zh
+  custom_detok_src: "moses:zh"
 seed: 1111
 trainer: cat
diff --git a/contrib/train_config.yml b/contrib/train_config.yml
index b06b699..627f51c 100644
--- a/contrib/train_config.yml
+++ b/contrib/train_config.yml
@@ -31,8 +31,8 @@ modifiers:
 - UpperCase: 0.05
 - TitleCase: 0.05
 #- Tags: 0.08 # Requires dataset augmented with alignment info
-#  custom_detok_src: null # Null value for the src detokenizer
-#  custom_detok_trg: zh
+#  custom_detok_src: "moses:null" # Null value for the src detokenizer
+#  custom_detok_trg: "moses:zh"
 #  template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
 #  # We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
 #  # unless you really know what you are doing.
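The source changes below resolve these spec strings through `make_detokenizer`/`make_tokenizer` and the `DETOKENIZERS`/`TOKENIZERS` tables in `src/opustrainer/tokenizers.py`. A rough usage sketch of the resolution and of the ICU detokenizer's whitespace handling (assumes the patch is applied; not part of the diff):

```python
# Rough sketch assuming this patch is applied: "provider:language" specs resolve
# via the DETOKENIZERS table, and the ICU detokenizer treats "▁" as a whitespace token.
from opustrainer.tokenizers import make_detokenizer

moses_detok = make_detokenizer("moses:zh")  # existing Moses path
icu_detok = make_detokenizer("icu:en")      # new ICU path added below

text, spans = icu_detok.detokenize(["Hello", "▁", "world", "!"])
assert text == "Hello world!"
assert spans[1] is None  # whitespace tokens get a None span
```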
diff --git a/src/opustrainer/modifiers/placeholders.py b/src/opustrainer/modifiers/placeholders.py
index 4af71b0..df86233 100644
--- a/src/opustrainer/modifiers/placeholders.py
+++ b/src/opustrainer/modifiers/placeholders.py
@@ -5,7 +5,8 @@
 from opustrainer.alignments import Pair, parse_alignments, format_alignments
 from opustrainer.modifiers import Modifier
-from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, MosesDetokenizer, SentencePieceTokenizer
+from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, SentencePieceTokenizer, \
+    make_detokenizer, ICU_WHITESPACE_TOKEN
 from opustrainer.modifiers.retokenize import Retokenizer, remap_alignment_pairs
 from opustrainer import logger
@@ -231,8 +232,8 @@ class PlaceholderTagModifier(Modifier):
     ```yaml
     modifiers:
     - Tags: 0.02
-      custom_detok_src: 'zh'
-      custom_detok_trg: null
+      custom_detok_src: 'moses:zh'
+      custom_detok_trg: "moses:null"
       template: "__source__ {src} __target__ {trg} __done__"
       augment: 0.0 # 0% chance to just insert a random string on both sides
       replace: 0.0 # 0% change to use tags to force translate to a random string
@@ -252,18 +253,20 @@ class PlaceholderTagModifier(Modifier):
     def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
                  custom_detok_trg: Optional[str]=None, spm_vocab: Optional[Path]=None,
-                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0):
+                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0, tag:float=1):
         super().__init__(probability)

         self.template = template
+        self.custom_detok_src = custom_detok_src
+        self.custom_detok_trg = custom_detok_trg

         self.src_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )

         self.trg_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )
@@ -281,7 +284,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
         if replace > 0:
             self.modes.append(('replace', replace))

-        self.modes.append(('tag', 1.0)) # Weight doesn't matter as long as cumsum => 1.0, it's last on the list anyway
+        # the modifier can also be used for inline noise augmentation only
+        if tag > 0:
+            self.modes.append(('tag', tag))
+
+        if ({'replace', 'tag'} & {mode for mode, _ in self.modes}) and \
+            'icu' in ((self.custom_detok_src or '') + (self.custom_detok_trg or '')):
+            raise ValueError('ICU tokenization is not supported with "tag" and "replace" modes')

     def __call__(self, batch: List[str]) -> Iterable[str]:
         for line in batch:
@@ -293,7 +302,7 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
     def apply(self, line:str) -> str:
         """Applies tag to words in a line based on alignment info, and then removes the alignment info from the line.
         This is used to enable terminology support by tagging random words with their translation.
-        eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake. 
+        eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake.
         By default the detokenizer used is the trivial detokenizer, but we can instead have separate detokenizers on src and trg."
         """
@@ -333,7 +342,7 @@ def apply(self, line:str) -> str:
                 continue

             # Select mode (skip random_weighted_choices*() when 'tag' is the only mode)
-            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else 'tag'
+            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else self.modes[0][0]

             if mode == "tag" or mode == "replace":
                 if mode == "tag":
@@ -375,19 +384,19 @@ def apply(self, line:str) -> str:
                 # Augment mode adds random noise both on the source and the target without any
                 # tagging encouraging the model to copy crap from one side to the other.
                 augment_tokens = get_random_unicode_words()
-                source = source[:candidate.src+1] + augment_tokens + source[candidate.src+1:]
-                target = target[:candidate.trg+1] + augment_tokens + target[candidate.trg+1:]
+                source, num_src_aug_tokens, pos_aug_src = self.insert_augmented(augment_tokens, source, candidate.src+1, self.custom_detok_src)
+                target, num_trg_aug_tokens, pos_aug_trg = self.insert_augmented(augment_tokens, target, candidate.trg+1, self.custom_detok_trg)

                 # Fix up alignment pairs
                 alignments = (
                     # pairs before and including the candidate stay the same
                     alignments[:candidate_index+1]
                     # fill in the gap created by the added random noise
-                    + [Pair(candidate.src + n, candidate.trg + n) for n in range(1, len(augment_tokens) + 1)]
+                    + [Pair(candidate.src + n_src, candidate.trg + n_trg) for n_src, n_trg in zip(pos_aug_src, pos_aug_trg)]
                     # pairs after the replaced bit have to be offset by the length of the replacement bit
-                    + [Pair(pair.src + len(augment_tokens), pair.trg + len(augment_tokens)) for pair in alignments[candidate_index+1:]]
+                    + [Pair(pair.src + num_src_aug_tokens, pair.trg + num_trg_aug_tokens) for pair in alignments[candidate_index+1:]]
                 )
-                candidate_offset = candidate_index + len(augment_tokens) + 1
+                candidate_offset = candidate_index + min(num_src_aug_tokens, num_trg_aug_tokens) + 1

         source_detok, _, source_mapping = self.src_retokenizer.retokenize(source)
         target_detok, _, target_mapping = self.trg_retokenizer.retokenize(target)
@@ -398,6 +407,46 @@ def apply(self, line:str) -> str:
         else:
             return source_detok + "\t" + target_detok

+    def insert_augmented(self, augment_tokens: List[str], tokens: List[str], position: int, detokenization: Optional[str]) -> Tuple[List[str], int, List[int]]:
+        """
+        Inserts augmented tokens.
+        Accounts for possible ICU detokenization which uses the special symbol "▁" for whitespace tokens.
+        Such tokens will also be inserted to separate the augmented words.
+
+        Returns:
+            new tokens
+            number of augmented tokens, including whitespace tokens in the ICU case
+            alignment positions for the augmented tokens (whitespaces are excluded, we don't need alignments for them)
+        """
+        prefix = tokens[:position]
+        postfix = tokens[position:]
+        aug_aln_offset = []
+
+        if detokenization is not None and "icu" in detokenization:
+            new_aug_tokens = []
+            aug_pos_index = 1
+
+            if len(prefix) > 0 and prefix[-1] != ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 1
+
+            for token in augment_tokens:
+                new_aug_tokens.append(token)
+                # save the offset of the augmented words to use in alignments
+                aug_aln_offset.append(aug_pos_index)
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 2
+
+            if len(postfix) > 0 and postfix[0] == ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.pop()
+
+            augment_tokens = new_aug_tokens
+        else:
+            aug_aln_offset = list(range(1, len(augment_tokens) + 1))
+
+        tokens = prefix + augment_tokens + postfix
+        return tokens, len(augment_tokens), aug_aln_offset
+
     def validate(self, context:List[Modifier]) -> None:
         """Current limitation of the tags modifier is that any other modifier might modify the inserted tags,
         which we don't want. So warn users about that if we notice it.
diff --git a/src/opustrainer/modifiers/retokenize.py b/src/opustrainer/modifiers/retokenize.py
index 8d42ca0..9c47363 100644
--- a/src/opustrainer/modifiers/retokenize.py
+++ b/src/opustrainer/modifiers/retokenize.py
@@ -33,6 +33,10 @@ def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:
         prev_j = 0

         for i, old_token_span in enumerate(old_token_spans):
+            # the ICU detokenizer yields None spans for whitespace tokens; skip them (their mapping stays empty)
+            if old_token_span is None:
+                continue
+
             for j, new_token_span in enumerate(new_token_spans[prev_j:], start=prev_j):
                 prev_j = j
                 overlap = slice_cmp(old_token_span, new_token_span)
@@ -59,8 +63,8 @@ def remap_alignment_pairs(src_mapping:TokenMapping, trg_mapping:TokenMapping, al
     sentence pair.

     E.g. if you have
-      source-mapping: [0 => [3,4], 1 => [5]],
-      target-mapping: [0 => [0], 1 => [1]]
+      source-mapping: [0 => [3,4], 1 => [5], 2 => []],
+      target-mapping: [0 => [0], 1 => [1], 2 => []]
       alignments: [(0,1), (1,1)]
     it will return [
       (3,1), (4,1), # the [0 => [3,4]] mapping
diff --git a/src/opustrainer/tokenizers.py b/src/opustrainer/tokenizers.py
index f063fb8..2dc4d5f 100644
--- a/src/opustrainer/tokenizers.py
+++ b/src/opustrainer/tokenizers.py
@@ -11,12 +11,15 @@
 DETOKENIZERS = {
     'moses': lambda lang: MosesDetokenizer(lang),
     'spaces': lambda: SpaceDetokenizer(),
+    'icu': lambda lang: IcuDetokenizer(lang),
+
 }

 TOKENIZERS = {
     'moses': lambda lang: MosesTokenizer(lang),
     'spm': lambda vocab: SentencePieceTokenizer(vocab),
     'spaces': lambda: SpaceTokenizer(),
+    'icu': lambda lang: IcuTokenizer(lang),
 }
@@ -126,3 +129,72 @@ def tokenize(self, text:str) -> Tuple[TokenList,TokenSpanList]:
         tokens = [text[span] for span in spans]
         return tokens, spans

+# The same character as in SentencePiece
+ICU_WHITESPACE_TOKEN = "▁"
+
+class IcuTokenizer:
+    """
+    Tokenizes text by splitting words and punctuation using the ICU segmenter.
+    Whitespaces will be preserved as a special token ▁ for lossless detokenization.
+    Requires installation with the steps specified in https://pypi.org/project/PyICU/
+    """
+
+    def __init__(self, lang: str):
+        self.lang = lang
+
+    def tokenize(self, text:str) -> Tuple[TokenList, TokenSpanList]:
+        from icu import BreakIterator, Locale
+
+        bi = BreakIterator.createWordInstance(Locale(self.lang))
+        bi.setText(text)
+
+        tokens = []
+        start = bi.first()
+        for end in bi:
+            token = text[start:end]
+            if (
+                token and token != "\n"
+            ):  # exclude empty tokens, but leave whitespaces and replace them with a special token
+                tokens.append(token)
+            start = end
+
+        spans: TokenSpanList = []
+        offset = 0
+        for token in tokens:
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in original text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        tokens = [token.replace(" ", ICU_WHITESPACE_TOKEN) for token in tokens]
+        return tokens, spans
+
+
+class IcuDetokenizer:
+    """
+    Detokenizes tokens back into the original text preserving whitespaces as well.
+    Spans for whitespaces will be None.
+    """
+
+    # For compatibility with MosesDetokenizer interface
+    def __init__(self, lang):
+        self.lang = lang
+
+    def detokenize(self, tokens:TokenList) -> Tuple[str,TokenSpanList]:
+        text = "".join(tokens).replace(ICU_WHITESPACE_TOKEN, " ")
+
+        spans = []
+        offset = 0
+
+        for token in tokens:
+            if token == ICU_WHITESPACE_TOKEN:
+                spans.append(None)
+                continue
+            # there are some edge cases where a whitespace can appear inside a token
+            token = token.replace(ICU_WHITESPACE_TOKEN, " ")
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in detokenized text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        return text, spans
diff --git a/src/opustrainer/trainer.py b/src/opustrainer/trainer.py
index f75baaf..ba5c43a 100755
--- a/src/opustrainer/trainer.py
+++ b/src/opustrainer/trainer.py
@@ -504,8 +504,8 @@ def _load_modifiers(self, ymldata:dict, basepath:str) -> List[Modifier]:
         - TitleCase: 0.05
         - Tags: 0.02
           num_tags: 6
-          custom_detok_src: null
-          custom_detok_trg: zh
+          custom_detok_src: "moses:null"
+          custom_detok_trg: "moses:zh"
         ```
     """
     modifiers = [
diff --git a/tests/test_placeholders.py b/tests/test_placeholders.py
index 5cd146d..32ef0b6 100644
--- a/tests/test_placeholders.py
+++ b/tests/test_placeholders.py
@@ -40,13 +40,22 @@ def test_tagger_augment(self):
         output = tagger(['Hello world\tHallo Welt\t0-0 1-1'])
         self.assertEqual(first(output), 'Hello িৡহ world ЇӤӕѣѮ қӃӄЀҲ\tHallo িৡহ Welt ЇӤӕѣѮ қӃӄЀҲ\t0-0 1-1 2-2 3-3 4-4')

+    def test_tagger_augment_icu(self):
+        """Augment mode will add random noise without tags to both source and target
+        sentence, teaching the model to copy strings it doesn't understand."""
+        tagger = PlaceholderTagModifier(probability=1, augment=1, tag=0, custom_detok_src='icu:en', custom_detok_trg='icu:de')
+        tagger.print_alignments = True
+        output = tagger(['Hello ▁ world\tHallo ▁ Welt\t0-0 1-1 2-2'])
+        self.assertEqual(first(output), 'Hello িৡহ world ټ؇ۤە٣ٮڛۃ \tHallo িৡহ Welt ټ؇ۤە٣ٮڛۃ \t0-0 1-1 2-2 3-3')
+
+
     def test_retokenize(self):
         """Pass the spm vocab to the placeholder tag generator so that it can
         retokenize the input, and update the alignments accordingly."""
         tagger = PlaceholderTagModifier(
             probability=0.25,
-            custom_detok_src='en',
-            custom_detok_trg='zh',
+            custom_detok_src='moses:en',
+            custom_detok_trg='moses:zh',
             spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue
         output = tagger(['\t'.join([
@@ -80,13 +89,43 @@ def test_retokenize(self):
             # 7-9 [.] [。] 18-16
         ])

+    def test_augment_icu(self):
+        """Pass the spm vocab to the placeholder tag generator so that it can
+        retokenize the input, and update the alignments accordingly."""
+        tagger = PlaceholderTagModifier(
+            probability=0.2,
+            augment=1,
+            tag=0,
+            custom_detok_src='icu:en',
+            custom_detok_trg='icu:zh',
+            spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue
+
+        output = tagger(['\t'.join([
+            'This ▁ is ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣 .',
+            #^0   ^1^2 ^3^4^5^6      ^7^8    ^9^10       ^11^12^13
+            '这 是 一个 简单 的 测试 语 句 ▁ 🤣 ▁ 。',
+            #^0 ^1 ^2  ^3  ^4 ^5  ^6 ^7^8 ^9^10^11
+            '0-0 2-1 4-2 6-3 6-4 8-5 10-6 10-7 12-9 13-11',
+        ])])
+
+        self.assertEqual(first(output).split('\t'), [
+            'This িৡহ is a simple test statement 🤣.',
+            # ['This', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' is', ' a', ' simple', ' test', ' statement', ' ', '', '', '', '🤣', '.']
+            '这 িৡহ 是一个简单的测试语句 🤣 。',
+            # ['这', ' ', '', '', 'ি', '', '', 'ৡ', '', '', 'হ', ' 是', '一', '个', '简', '单', '的', '测', '试', '语', '句', ' ', '', '', '', '🤣', ' 。']
+            '0-0 4-4 4-5 4-6 4-7 4-8 4-9 4-10 5-4 5-5 5-6 5-7 5-8 5-9 5-10 6-4 6-5 6-6 '
+            '6-7 6-8 6-9 6-10 7-4 7-5 7-6 7-7 7-8 7-9 7-10 8-4 8-5 8-6 8-7 8-8 8-9 8-10 '
+            '9-4 9-5 9-6 9-7 9-8 9-9 9-10 10-4 10-5 10-6 10-7 10-8 10-9 10-10 11-11 12-12 '
+            '12-13 13-14 13-15 13-16 14-17 14-18 15-19 15-20 20-25 21-26'
+        ])
+
     def test_retokenize_on_non_trigger(self):
         """Pass the spm vocab to the placeholder tag generator so that it can
         retokenize the input, even if probability is 0."""
         tagger = PlaceholderTagModifier(
             probability=0.0,
-            custom_detok_src='en',
-            custom_detok_trg='zh',
+            custom_detok_src='moses:en',
+            custom_detok_trg='moses:zh',
             spm_vocab='contrib/test-data/vocab.zhen.spm') # type: ignore Path vs String type issue

         output = tagger(['\t'.join([
@@ -109,7 +148,7 @@ def test_mode(self):
         multiple modes are enabled."""
         tagger = PlaceholderTagModifier(
             probability=1.0,
-            custom_detok_src='zh',
+            custom_detok_src='moses:zh',
             augment=0.33,
             replace=0.33,
             # tag=0.33 is implicit
diff --git a/tests/test_retokenizer.py b/tests/test_retokenizer.py
index c74920e..4ac987a 100644
--- a/tests/test_retokenizer.py
+++ b/tests/test_retokenizer.py
@@ -10,7 +10,7 @@ def first(it):
     return next(iter(it))

-class TestTokenizer(unittest.TestCase):
+class TestRetokenizer(unittest.TestCase):
     maxDiff = None

     def test_identity(self):
@@ -81,4 +81,36 @@ def test_retokenize(self):
             # 7-9 [.] [。] 11-16
         ]))

+    def test_retokenize_icu(self):
+        tokenizer = RetokenizeModifier(
+            src=dict(detokenize='icu:en', tokenize=f'spm:{VOCAB}'),
+            trg=dict(detokenize='icu:zh', tokenize=f'spm:{VOCAB}'))
+
+        out = tokenizer(['\t'.join([
+            'This ▁ is ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣 .',
+            #^0   ^1^2 ^3^4^5^6      ^7^8    ^9^10       ^11^12^13
+            '这 是 一个 简单 的 测试 语 句 ▁ 🤣 ▁ 。',
+            #^0 ^1 ^2  ^3  ^4 ^5  ^6 ^7^8 ^9^10^11
+            '0-0 2-1 4-2 6-3 6-4 8-5 10-6 10-7 12-9 13-11',
+        ])])
+
+        self.assertEqual(first(out), '\t'.join([
+            'This is a simple test statement 🤣.',
+            #[This][ is][ a][ simple][ test][ statement][ ][] [] [] [🤣][.]
+            #^0    ^1   ^2  ^3        ^4     ^5          ^6  ^7 ^8 ^9 ^10 ^11
+            '这是一个简单的测试语句 🤣 。',
+            #[这][是][一][个][简][单][的][测][试][语][句] [ ] [] [] [] [🤣][ 。]
+            #^0  ^1  ^2  ^3  ^4  ^5  ^6  ^7  ^8  ^9  ^10 ^11 ^12 ^13 ^14 ^15 ^16
+            '0-0 1-1 2-2 2-3 3-4 3-5 3-6 4-7 4-8 5-9 5-10 10-15 11-16',
+            # 0-0 [This]      [这]   0-0
+            # 1-1 [is]        [是]   1-1
+            # 2-2 [a]         [一个] 2-2 2-3
+            # 3-3 [simple]    [简单] 3-4 3-5
+            # 3-4 [simple]    [的]   3-6
+            # 4-5 [test]      [测试] 4-7 4-8
+            # 5-6 [statement] [语]   5-9
+            # 5-7 [statement] [句]   5-10 (6-11)
+            # 6-8 [🤣]        [🤣]   (7-12 8-13 9-14) 10-15
+            # 7-9 [.]         [。]   11-16
+        ]))
diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py
new file mode 100644
index 0000000..eed06ed
--- /dev/null
+++ b/tests/test_tokenizers.py
@@ -0,0 +1,37 @@
+import unittest
+
+from opustrainer.tokenizers import make_tokenizer, make_detokenizer
+
+
+class TestTokenizers(unittest.TestCase):
+
+    def test_tokenize_detokenize_icu_en(self):
+        """
+        Tests lossless text reconstruction by the ICU tokenizer for English.
+        Requires installation with the steps specified in https://pypi.org/project/PyICU/
+        """
+        tokenizer = make_tokenizer('icu:en')
+        detokenizer = make_detokenizer('icu:en')
+        text = '“This is,” a simple test statement 🤣.'
+
+        tokens, _ = tokenizer.tokenize(text)
+        detokenized, _ = detokenizer.detokenize(tokens)
+
+        self.assertEqual(text, detokenized)
+        self.assertEqual("“ This ▁ is , ” ▁ a ▁ simple ▁ test ▁ statement ▁ 🤣.", " ".join(tokens))
+
+
+    def test_tokenize_detokenize_icu_zh(self):
+        """
+        Tests lossless text reconstruction by the ICU tokenizer for Chinese.
+        Requires installation with the steps specified in https://pypi.org/project/PyICU/
+        """
+        tokenizer = make_tokenizer('icu:zh')
+        detokenizer = make_detokenizer('icu:zh')
+        text = '这是一个简单的测试语句 🤣 。'
+
+        tokens, _ = tokenizer.tokenize(text)
+        detokenized, _ = detokenizer.detokenize(tokens)
+
+        self.assertEqual(text, detokenized)
+        self.assertEqual("这 是 一个 简单 的 测试 语 句 ▁ 🤣▁ 。", " ".join(tokens))
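As a quick sanity check of the approach (outside this patch), the ICU word segmentation that `IcuTokenizer` builds on can be exercised directly with PyICU. The sketch below assumes PyICU is installed (e.g. via `apt-get install python3-icu`, as added to the workflow above); the helper name is illustrative only:

```python
# Standalone sketch of the PyICU word-boundary segmentation IcuTokenizer relies on.
# Keeping whitespace pieces makes the segmentation lossless: joining the pieces
# reproduces the original string exactly.
from icu import BreakIterator, Locale

def icu_word_pieces(text: str, lang: str = "en"):
    bi = BreakIterator.createWordInstance(Locale(lang))
    bi.setText(text)
    pieces = []
    start = bi.first()
    for end in bi:  # iterating a BreakIterator yields successive boundary offsets
        piece = text[start:end]
        if piece:
            pieces.append(piece)
        start = end
    return pieces

pieces = icu_word_pieces("This is a simple test statement 🤣.")
assert "".join(pieces) == "This is a simple test statement 🤣."
```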