
Commit ee534f3
Add support for ICU tokenizer
eu9ene committed Nov 22, 2024
1 parent 7be3b4d commit ee534f3
Showing 16 changed files with 281 additions and 47 deletions.
1 change: 1 addition & 0 deletions .github/workflows/release.yaml
@@ -18,6 +18,7 @@ jobs:

     - name: Run tests
       run: |-
+        apt-get install python3-icu
         pip install .
         python -m unittest discover -s tests
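As a quick sanity check that the new PyICU dependency is usable, the same `BreakIterator` API the new tokenizer relies on can be exercised directly; a minimal sketch, not part of this commit (assumes PyICU is installed as above):

```python
# Hypothetical smoke test for the PyICU dependency; mirrors the calls
# IcuTokenizer makes further down in this commit.
from icu import BreakIterator, Locale

text = "Hello, world!"
bi = BreakIterator.createWordInstance(Locale("en"))
bi.setText(text)

tokens = []
start = bi.first()
for end in bi:  # iterating yields successive boundary offsets
    tokens.append(text[start:end])
    start = end

print(tokens)  # expected: ['Hello', ',', ' ', 'world', '!']
```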
12 changes: 6 additions & 6 deletions README.md
@@ -198,8 +198,8 @@ This modifier needs a third column in the training data with per-word (technical

```yaml
 - Tags: 0.05
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   spm_vocab: path/to/vocab.enzh.spm
   template: "__source__ {src} __target__ {trg} __done__"
```
@@ -218,8 +218,8 @@ Sometimes we want to just replace the source token with the target token directl
```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   replace: 0.4 # 0.4 of the time Tags is triggered, instead replace the target token with random noise, and use that random noise to tag a corresponding source word.
```

@@ -229,8 +229,8 @@ If alignment information is present, we can augment the training data with inlin
```yml
 modifiers:
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 of the time Tags is triggered, instead augment the source and the target with random noise. If you want 100% only noise without tag functionality use augment: 1
```

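The README examples above use the new `tokenizer:language` spec strings. A minimal usage sketch of how the `Tags` modifier consumes them (the tab-separated source/target/alignments line format follows the modifier's own docstring; the tagged word is chosen at random, so output varies by seed):

```python
# Hedged usage sketch; requires OpusTrainer and its sacremoses dependency.
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

modifier = PlaceholderTagModifier(
    probability=1.0,              # always trigger, for demonstration
    custom_detok_trg='moses:zh',  # new "tokenizer:language" spec string
)

# Tokenized source and target plus word alignments in the third column.
line = 'I like cake\t我 喜欢 蛋糕\t0-0 1-1 2-2'
for output in modifier([line]):
    # e.g. 'I __source__ like __target__ 喜欢 __done__ cake\t我喜欢蛋糕'
    print(output)
```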
4 changes: 2 additions & 2 deletions contrib/test_enzh_config.yml
@@ -14,8 +14,8 @@ modifiers:
 - TitleCase: 0.2
 - Typos: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
# We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
# unless you really know what you are doing.
4 changes: 2 additions & 2 deletions contrib/test_enzh_noise_config.yml
@@ -13,8 +13,8 @@ modifiers:
 - UpperCase: 0.2
 - TitleCase: 0.2
 - Tags: 0.1
-  custom_detok_src: null # Null value for the src detokenizer
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null" # Null value for the src detokenizer
+  custom_detok_trg: "moses:zh"
   augment: 0.4 # 0.4 of the time Tags is triggered, instead augment the source and the target with random noise
   replace: 0.4 # =====//=====, instead replace the target token with random noise, and use that random noise to tag a corresponding source word
# template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
8 changes: 4 additions & 4 deletions contrib/test_enzh_tags_advanced_config.yml
@@ -15,8 +15,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"
 - *modifiers

@@ -26,8 +26,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"
 - *modifiers

8 changes: 4 additions & 4 deletions contrib/test_enzh_tags_stage_config.yml
@@ -12,8 +12,8 @@ start:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: null
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:null"
   template: "{src} __target__ {trg} __done__"

end:
@@ -22,8 +22,8 @@ end:
 - until clean 1
 modifiers:
 - Tags: 0.5
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "{src} __target__ {trg} __done__"

seed: 1111
4 changes: 2 additions & 2 deletions contrib/test_full_config.yml
@@ -52,8 +52,8 @@ modifiers:
     repeated_char: 0.1 # Repeats a random word character.
     unichar: 0.1 # Replaces a random consecutive repeated letter with a single letter.
 - Tags: 0.08
-  custom_detok_src: null
-  custom_detok_trg: zh
+  custom_detok_src: "moses:null"
+  custom_detok_trg: "moses:zh"
   template: "__source__ {src} __target__ {trg} __done__"

seed: 1111
2 changes: 1 addition & 1 deletion contrib/test_zhen_config.yml
@@ -12,7 +12,7 @@ start:
 modifiers:
 # No UpperCase or TitleCase modifier when the source is Chinese as we can't upper or lowercase Chinese
 - Tags: 0.1
-  custom_detok_src: zh
+  custom_detok_src: "moses:zh"

seed: 1111
trainer: cat
4 changes: 2 additions & 2 deletions contrib/train_config.yml
@@ -31,8 +31,8 @@ modifiers:
 - UpperCase: 0.05
 - TitleCase: 0.05
 #- Tags: 0.08 # Requires dataset augmented with alignment info
-#  custom_detok_src: null # Null value for the src detokenizer
-#  custom_detok_trg: zh
+#  custom_detok_src: "moses:null" # Null value for the src detokenizer
+#  custom_detok_trg: "moses:zh"
# template: "__source__ {src} __target__ {trg} __done__" # This is the default way of inserting tags.
# We STRONGLY DISCOURAGE the modification of this line and in fact it shouldn't be included in the config
# unless you really know what you are doing.
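All of the config updates above keep the Moses detokenizer (`moses:` prefix). The commit also registers an `icu:` detokenizer, but per the new guard in `PlaceholderTagModifier.__init__` below it can only be combined with augment-only noise insertion. A sketch of what that constraint means in practice (parameter values are illustrative):

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier

# Allowed: ICU detokenization with augment-only noise, i.e. neither the
# 'tag' nor the 'replace' mode is active.
noise_only = PlaceholderTagModifier(
    probability=0.1,
    custom_detok_trg='icu:zh',
    augment=1.0,  # always augment when the modifier triggers
    tag=0.0,      # new parameter: disables the default 'tag' mode
)

# Not allowed: the default tag mode together with an ICU detokenizer.
try:
    PlaceholderTagModifier(probability=0.1, custom_detok_trg='icu:zh')
except ValueError as err:
    print(err)  # ICU tokenization is not supported with "tag" and "replace" modes
```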
77 changes: 63 additions & 14 deletions src/opustrainer/modifiers/placeholders.py
@@ -5,7 +5,8 @@

 from opustrainer.alignments import Pair, parse_alignments, format_alignments
 from opustrainer.modifiers import Modifier
-from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, MosesDetokenizer, SentencePieceTokenizer
+from opustrainer.tokenizers import SpaceDetokenizer, SpaceTokenizer, SentencePieceTokenizer, \
+    make_detokenizer, ICU_WHITESPACE_TOKEN
 from opustrainer.modifiers.retokenize import Retokenizer, remap_alignment_pairs
 from opustrainer import logger

@@ -231,8 +232,8 @@ class PlaceholderTagModifier(Modifier):
     ```yaml
     modifiers:
     - Tags: 0.02
-      custom_detok_src: 'zh'
-      custom_detok_trg: null
+      custom_detok_src: 'moses:zh'
+      custom_detok_trg: "moses:null"
       template: "__source__ {src} __target__ {trg} __done__"
       augment: 0.0 # 0% chance to just insert a random string on both sides
       replace: 0.0 # 0% chance to use tags to force translate to a random string
@@ -252,18 +253,20 @@

     def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None, custom_detok_trg: Optional[str]=None,
                  spm_vocab: Optional[Path]=None,
-                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0):
+                 template: str="__source__ {src} __target__ {trg} __done__", augment: float=0, replace:float=0, tag:float=1):
         super().__init__(probability)

         self.template = template
         self.custom_detok_src = custom_detok_src
         self.custom_detok_trg = custom_detok_trg

         self.src_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_src) if custom_detok_src else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )

         self.trg_retokenizer = Retokenizer(
-            detokenizer=MosesDetokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
+            detokenizer=make_detokenizer(custom_detok_trg) if custom_detok_trg else SpaceDetokenizer(),
             tokenizer=SentencePieceTokenizer(spm_vocab) if spm_vocab else SpaceTokenizer()
         )

@@ -281,7 +284,13 @@ def __init__(self, probability: float=0.0, custom_detok_src: Optional[str]=None,
         if replace > 0:
             self.modes.append(('replace', replace))

-        self.modes.append(('tag', 1.0)) # Weight doesn't matter as long as cumsum => 1.0, it's last on the list anyway
+        # the modifier can also be used for inline noise augmentation only
+        if tag > 0:
+            self.modes.append(('tag', tag))
+
+        if ({'replace', 'tag'} & {mode for mode, _ in self.modes}) and \
+            'icu' in ((self.custom_detok_src or '') + (self.custom_detok_trg or '')):
+            raise ValueError('ICU tokenization is not supported with "tag" and "replace" modes')

     def __call__(self, batch: List[str]) -> Iterable[str]:
         for line in batch:
@@ -293,7 +302,7 @@ def __call__(self, batch: List[str]) -> Iterable[str]:
     def apply(self, line:str) -> str:
         """Applies tag to words in a line based on alignment info, and then removes the alignment info from the line.
         This is used to enable terminology support by tagging random words with their translation.
         eg "I like cake" would become "I __source__ like __target__ gusta __done__ cake".
         By default the detokenizer used is the trivial detokenizer, but we can instead have separate detokenizers on src and trg.
         """

@@ -333,7 +342,7 @@ def apply(self, line:str) -> str:
                 continue

             # Select mode (skip random_weighted_choice() when there is only one mode)
-            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else 'tag'
+            mode = random_weighted_choice(self.modes) if len(self.modes) > 1 else self.modes[0][0]

             if mode == "tag" or mode == "replace":
                 if mode == "tag":
@@ -375,19 +384,19 @@ def apply(self, line:str) -> str:
             # Augment mode adds random noise both on the source and the target without any
             # tagging encouraging the model to copy crap from one side to the other.
             augment_tokens = get_random_unicode_words()
-            source = source[:candidate.src+1] + augment_tokens + source[candidate.src+1:]
-            target = target[:candidate.trg+1] + augment_tokens + target[candidate.trg+1:]
+            source, num_src_aug_tokens, pos_aug_src = self.insert_augmented(augment_tokens, source, candidate.src+1, self.custom_detok_src)
+            target, num_trg_aug_tokens, pos_aug_trg = self.insert_augmented(augment_tokens, target, candidate.trg+1, self.custom_detok_trg)

             # Fix up alignment pairs
             alignments = (
                 # pairs before and including the candidate stay the same
                 alignments[:candidate_index+1]
                 # fill in the gap created by the added random noise
-                + [Pair(candidate.src + n, candidate.trg + n) for n in range(1, len(augment_tokens) + 1)]
+                + [Pair(candidate.src + n_src, candidate.trg + n_trg) for n_src, n_trg in zip(pos_aug_src, pos_aug_trg)]
                 # pairs after the replaced bit have to be offset by the length of the replacement bit
-                + [Pair(pair.src + len(augment_tokens), pair.trg + len(augment_tokens)) for pair in alignments[candidate_index+1:]]
+                + [Pair(pair.src + num_src_aug_tokens, pair.trg + num_trg_aug_tokens) for pair in alignments[candidate_index+1:]]
             )
-            candidate_offset = candidate_index + len(augment_tokens) + 1
+            candidate_offset = candidate_index + min(num_src_aug_tokens, num_trg_aug_tokens) + 1

         source_detok, _, source_mapping = self.src_retokenizer.retokenize(source)
         target_detok, _, target_mapping = self.trg_retokenizer.retokenize(target)
@@ -398,6 +407,46 @@ def apply(self, line:str) -> str:
         else:
             return source_detok + "\t" + target_detok

+    def insert_augmented(self, augment_tokens: List[str], tokens: List[str], position: int, detokenization: Optional[str]) -> Tuple[List[str], int, List[int]]:
+        """
+        Inserts augmented tokens.
+        Accounts for possible ICU detokenization, which uses the special symbol "▁" for whitespace tokens.
+        Such tokens will also be inserted to separate the augmented words.
+        Returns:
+            new tokens
+            number of augmented tokens, including whitespace tokens in the ICU case
+            alignment positions for the augmented tokens (whitespaces are excluded, we don't need alignments for them)
+        """
+        prefix = tokens[:position]
+        postfix = tokens[position:]
+        aug_aln_offset = []
+
+        if detokenization is not None and "icu" in detokenization:
+            new_aug_tokens = []
+            aug_pos_index = 1
+
+            if len(prefix) > 0 and prefix[-1] != ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 1
+
+            for token in augment_tokens:
+                new_aug_tokens.append(token)
+                # save the offset of the augmented words to use in alignments
+                aug_aln_offset.append(aug_pos_index)
+                new_aug_tokens.append(ICU_WHITESPACE_TOKEN)
+                aug_pos_index += 2
+
+            if len(postfix) > 0 and postfix[0] == ICU_WHITESPACE_TOKEN:
+                new_aug_tokens.pop()
+
+            augment_tokens = new_aug_tokens
+        else:
+            aug_aln_offset = list(range(1, len(augment_tokens) + 1))
+
+        tokens = prefix + augment_tokens + postfix
+        return tokens, len(augment_tokens), aug_aln_offset

     def validate(self, context:List[Modifier]) -> None:
         """Current limitation of the tags modifier is that any other modifier might modify the
         inserted tags, which we don't want. So warn users about that if we notice it.
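To make the ▁ bookkeeping of `insert_augmented` concrete, here is a small worked example (a hypothetical call on an augment-only instance, since ICU detokenization forbids the tag/replace modes):

```python
from opustrainer.modifiers.placeholders import PlaceholderTagModifier
from opustrainer.tokenizers import ICU_WHITESPACE_TOKEN as WS

mod = PlaceholderTagModifier(probability=1.0, custom_detok_trg='icu:en',
                             augment=1.0, tag=0.0)

tokens = ['Hello', WS, 'world']  # ICU tokenization of "Hello world"
new_tokens, num_added, offsets = mod.insert_augmented(
    ['NOISE1', 'NOISE2'], tokens, 1, 'icu:en')

print(new_tokens)  # ['Hello', '▁', 'NOISE1', '▁', 'NOISE2', '▁', 'world']
print(num_added)   # 4: two noise words plus two inserted '▁' separators
print(offsets)     # [2, 4]: word offsets used to build new alignment pairs
```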
8 changes: 6 additions & 2 deletions src/opustrainer/modifiers/retokenize.py
@@ -33,6 +33,10 @@ def retokenize(self, tokens:TokenList) -> Tuple[str,TokenList,TokenMapping]:

         prev_j = 0
         for i, old_token_span in enumerate(old_token_spans):
+            # the ICU detokenizer yields a None span for whitespace tokens; skip them (their mapping stays empty)
+            if old_token_span is None:
+                continue
+
             for j, new_token_span in enumerate(new_token_spans[prev_j:], start=prev_j):
                 prev_j = j
                 overlap = slice_cmp(old_token_span, new_token_span)
@@ -59,8 +63,8 @@ def remap_alignment_pairs(src_mapping:TokenMapping, trg_mapping:TokenMapping, al
     sentence pair.
     E.g. if you have
-    source-mapping: [0 => [3,4], 1 => [5]],
-    target-mapping: [0 => [0], 1 => [1]]
+    source-mapping: [0 => [3,4], 1 => [5], 2 => []],
+    target-mapping: [0 => [0], 1 => [1], 2 => []]
alignments: [(0,1), (1,1)]
it will return [
(3,1), (4,1), # the [0 => [3,4]] mapping
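The docstring example can be reproduced directly; a minimal sketch (assuming, as the docstring suggests, that a `TokenMapping` is a list of new-token index lists indexed by old token position):

```python
from opustrainer.alignments import parse_alignments, format_alignments
from opustrainer.modifiers.retokenize import remap_alignment_pairs

src_mapping = [[3, 4], [5], []]  # old src token 2 maps to nothing, e.g. a '▁' token
trg_mapping = [[0], [1], []]
alignments = parse_alignments('0-1 1-1')

remapped = remap_alignment_pairs(src_mapping, trg_mapping, alignments)
print(format_alignments(remapped))  # expected: '3-1 4-1 5-1'
```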
72 changes: 72 additions & 0 deletions src/opustrainer/tokenizers.py
@@ -11,12 +11,15 @@
 DETOKENIZERS = {
     'moses': lambda lang: MosesDetokenizer(lang),
     'spaces': lambda: SpaceDetokenizer(),
+    'icu': lambda lang: IcuDetokenizer(lang),
 }

 TOKENIZERS = {
     'moses': lambda lang: MosesTokenizer(lang),
     'spm': lambda vocab: SentencePieceTokenizer(vocab),
     'spaces': lambda: SpaceTokenizer(),
+    'icu': lambda lang: IcuTokenizer(lang),
 }


@@ -126,3 +129,72 @@ def tokenize(self, text:str) -> Tuple[TokenList,TokenSpanList]:
         tokens = [text[span] for span in spans]
         return tokens, spans

+# The same character as in SentencePiece
+ICU_WHITESPACE_TOKEN = "▁"
+
+class IcuTokenizer:
+    """
+    Tokenizes text by splitting words and punctuation using the ICU segmenter.
+    Whitespace is preserved as the special token ▁ for lossless detokenization.
+    Requires installation with the steps specified in https://pypi.org/project/PyICU/
+    """
+
+    def __init__(self, lang: str):
+        self.lang = lang
+
+    def tokenize(self, text:str) -> Tuple[TokenList, TokenSpanList]:
+        from icu import BreakIterator, Locale
+
+        bi = BreakIterator.createWordInstance(Locale(self.lang))
+        bi.setText(text)
+
+        tokens = []
+        start = bi.first()
+        for end in bi:
+            token = text[start:end]
+            # exclude empty tokens and newlines, but keep whitespace tokens
+            # (they are replaced with the special token below)
+            if token and token != "\n":
+                tokens.append(token)
+            start = end
+
+        spans: TokenSpanList = []
+        offset = 0
+        for token in tokens:
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in original text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        tokens = [token.replace(" ", ICU_WHITESPACE_TOKEN) for token in tokens]
+        return tokens, spans

+class IcuDetokenizer:
+    """
+    Detokenizes tokens back into the original text, preserving whitespace as well.
+    Spans for whitespace tokens will be None.
+    """
+
+    # For compatibility with the MosesDetokenizer interface
+    def __init__(self, lang):
+        self.lang = lang
+
+    def detokenize(self, tokens:TokenList) -> Tuple[str,TokenSpanList]:
+        text = "".join(tokens).replace(ICU_WHITESPACE_TOKEN, " ")
+
+        spans = []
+        offset = 0
+
+        for token in tokens:
+            if token == ICU_WHITESPACE_TOKEN:
+                spans.append(None)
+                continue
+            # there are some edge cases where a whitespace can appear inside a token
+            token = token.replace(ICU_WHITESPACE_TOKEN, " ")
+            offset = text.find(token, offset)
+            if offset == -1:
+                raise RuntimeError(f"Could not find token '{token}' in detokenized text")
+            spans.append(slice(offset, offset + len(token)))
+            offset += len(token)
+
+        return text, spans
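A round trip through the two new classes illustrates the lossless behavior (assumes PyICU is installed; the expected values follow from the code above):

```python
from opustrainer.tokenizers import IcuTokenizer, IcuDetokenizer

text = "Hello, world!"
tokens, spans = IcuTokenizer("en").tokenize(text)
print(tokens)  # expected: ['Hello', ',', '▁', 'world', '!']

detok, detok_spans = IcuDetokenizer("en").detokenize(tokens)
print(detok == text)   # expected: True; detokenization restores the original text
print(detok_spans[2])  # expected: None; whitespace tokens get no span
```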
