diff --git a/decepticonlp/transforms/keys_in_proximity.json b/decepticonlp/transforms/keys_in_proximity.json new file mode 100644 index 0000000..60615c4 --- /dev/null +++ b/decepticonlp/transforms/keys_in_proximity.json @@ -0,0 +1 @@ +{"a": ["q", "w", "s", "x", "z"], "b": ["v", "g", "h", "n"], "c": ["x", "d", "f", "v"], "d": ["s", "e", "r", "f", "c", "x"], "e": ["w", "s", "d", "r"], "f": ["d", "r", "t", "g", "v", "c"], "g": ["f", "t", "y", "h", "b", "v"], "h": ["g", "y", "u", "j", "n", "b"], "i": ["u", "j", "k", "o"], "j": ["h", "u", "i", "k", "n", "m"], "k": ["j", "i", "o", "l", "m"], "l": ["k", "o", "p"], "m": ["n", "j", "k", "l"], "n": ["b", "h", "j", "m"], "o": ["i", "k", "l", "p"], "p": ["o", "l"], "q": ["w", "a", "s"], "r": ["e", "d", "f", "t"], "s": ["w", "e", "d", "x", "z", "a"], "t": ["r", "f", "g", "y"], "u": ["y", "h", "j", "i"], "v": ["c", "f", "g", "b"], "w": ["q", "a", "s", "e"], "x": ["z", "s", "d", "c"], "y": ["t", "g", "h", "u"], "z": ["a", "s", "x"], "A": ["Q", "W", "S", "X", "Z"], "B": ["V", "G", "H", "N"], "C": ["X", "D", "F", "V"], "D": ["S", "E", "R", "F", "C", "X"], "E": ["W", "S", "D", "R"], "F": ["D", "R", "T", "G", "V", "C"], "G": ["F", "T", "Y", "H", "B", "V"], "H": ["G", "Y", "U", "J", "N", "B"], "I": ["U", "J", "K", "O"], "J": ["H", "U", "I", "K", "N", "M"], "K": ["J", "I", "O", "L", "M"], "L": ["K", "O", "P"], "M": ["N", "J", "K", "L"], "N": ["B", "H", "J", "M"], "O": ["I", "K", "L", "P"], "P": ["O", "L"], "Q": ["W", "A", "S"], "R": ["E", "D", "F", "T"], "S": ["W", "E", "D", "X", "Z", "A"], "T": ["R", "F", "G", "Y"], "U": ["Y", "H", "J", "I"], "V": ["C", "F", "G", "B"], "W": ["Q", "A", "S", "E"], "X": ["Z", "S", "D", "C"], "Y": ["T", "G", "H", "U"], "Z": ["A", "S", "X"]} \ No newline at end of file diff --git a/decepticonlp/transforms/perturbations.py b/decepticonlp/transforms/perturbations.py index 5df7b80..4cdd0a9 100644 --- a/decepticonlp/transforms/perturbations.py +++ b/decepticonlp/transforms/perturbations.py @@ -1,8 
+1,10 @@ import abc import math import random +import json import string import numpy as np +from pathlib import Path class CharacterPerturbations(metaclass=abc.ABCMeta): @@ -215,71 +217,26 @@ def apply(self, word: str, **kwargs): assert " " not in word, self.get_string_not_a_word_error_msg() - # convert word to list (string is immutable) word = list(word) - - num_chars_to_shift = math.ceil(len(word) * kwargs.get("probability", 0.1)) - - # checking for capitalizations - capitalization = [False] * len(word) - - # convert to lowercase and record capitalization - for i in range(len(word)): - capitalization[i] = word[i].isupper() - word[i] = word[i].lower() + chars = len(word) + num_chars_to_shift = math.ceil(chars * kwargs.get("probability", 0.1)) # list of characters to be switched - positions_to_shift = [] - for i in range(num_chars_to_shift): - positions_to_shift.append(random.randint(0, len(word) - 1)) + positions_to_shift = random.sample(range(chars), num_chars_to_shift) # defining a dictionary of keys located close to each character - keys_in_proximity = { - "a": ["q", "w", "s", "x", "z"], - "b": ["v", "g", "h", "n"], - "c": ["x", "d", "f", "v"], - "d": ["s", "e", "r", "f", "c", "x"], - "e": ["w", "s", "d", "r"], - "f": ["d", "r", "t", "g", "v", "c"], - "g": ["f", "t", "y", "h", "b", "v"], - "h": ["g", "y", "u", "j", "n", "b"], - "i": ["u", "j", "k", "o"], - "j": ["h", "u", "i", "k", "n", "m"], - "k": ["j", "i", "o", "l", "m"], - "l": ["k", "o", "p"], - "m": ["n", "j", "k", "l"], - "n": ["b", "h", "j", "m"], - "o": ["i", "k", "l", "p"], - "p": ["o", "l"], - "q": ["w", "a", "s"], - "r": ["e", "d", "f", "t"], - "s": ["w", "e", "d", "x", "z", "a"], - "t": ["r", "f", "g", "y"], - "u": ["y", "h", "j", "i"], - "v": ["c", "f", "g", "v", "b"], - "w": ["q", "a", "s", "e"], - "x": ["z", "s", "d", "c"], - "y": ["t", "g", "h", "u"], - "z": ["a", "s", "x"], - } - - # insert typo - for pos in positions_to_shift: - # no typo insertion for special characters - try: - typo_list 
= keys_in_proximity[word[pos]] - word[pos] = random.choice(typo_list) - except: - break - - # reinsert capitalization - for i in range(len(word)): - if capitalization[i]: - word[i] = word[i].upper() + json_path = Path(__file__).parent / "keys_in_proximity.json" + keys_in_proximity = json.loads(json_path.read_text(encoding="utf-8")) + + for i, c in enumerate(word): + # Case is preserved: the JSON maps upper- and lowercase keys separately + + # Only perturb sampled positions whose character has known neighbours + if i in positions_to_shift and c in keys_in_proximity: + word[i] = random.choice(keys_in_proximity[c]) # recombine word = "".join(word) - return word