-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtools.py
168 lines (122 loc) · 4.34 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import genealloy
try:
import rapidfuzz
except ImportError:
_has_rapidfuzz = False
else:
_has_rapidfuzz = True
def differentiate_sequences(seq, ref=None, verbose=True):
"""Replace each codon in a sequence with a synonymous codon that does not match the
reference sequence.
Return a dictionary with a list of possible codons for each position, and also a
possible solution.
Parameters
----------
seq
A string of ATGC characters, with length divisible by 3.
ref
A string of ATGC characters, with same length as seq. If None, use seq.
verbose
Print aa sequence and Levenshtein distances.
"""
if ref is None:
ref = seq
verboseprint = print if verbose else lambda *a, **k: None
if len(seq) % 3 != 0:
raise ValueError("Sequence length must be divisible by 3")
if len(seq) != len(ref):
raise ValueError("`seq` and `ref` must be the same length")
if set(seq + ref) - {"A", "C", "G", "T"} != set():
raise ValueError("The sequences must contain only A,T,G,C")
seq_codons = genealloy.convert_seq_to_codons(seq)
seq_aa = [genealloy.codon_to_aa[triplet] for triplet in seq_codons]
verboseprint("The aa sequence is:", "".join(seq_aa))
aa_to_codon = generate_aa_to_codon(genealloy.codon_to_aa)
modified_seq_codons = []
for i, triplet in enumerate(seq_codons):
ref_triplet = ref[i * 3 : i * 3 + 3]
if triplet != ref_triplet:
replacement = [triplet]
else:
replacement = []
aa = genealloy.codon_to_aa[triplet]
for codon in aa_to_codon[aa]:
if codon != ref_triplet:
replacement.append(codon)
if len(replacement) == 0:
replacement = [triplet] # no alternative codon
modified_seq_codons.append(replacement)
modified_seq = "".join([codons[0] for codons in modified_seq_codons])
if _has_rapidfuzz:
verboseprint(
"Levenshtein distance (seq vs ref):",
rapidfuzz.levenshtein.distance(seq, ref),
)
verboseprint(
"Levenshtein distance (modified seq vs ref):",
rapidfuzz.levenshtein.distance(modified_seq, ref),
)
else:
verboseprint("Levenshtein distance requires the rapidfuzz package")
return {"solution": modified_seq, "codons": modified_seq_codons}
def generate_aa_to_codon(codon_to_aa):
aa_to_codon = {}
for aa in set(codon_to_aa.values()):
aa_to_codon[aa] = []
for codon, aa in codon_to_aa.items():
aa_to_codon[aa].append(codon)
return aa_to_codon
def find_best_match(seq, ref):
"""Find alignment with smallest Levenshtein distance between seq and ref.
Return a dictionary of distances and positions of best matches.
Parameters
----------
seq
A string
ref
A string, not shorter than seq
"""
if not _has_rapidfuzz:
raise ImportError("Function requires the rapidfuzz package.")
if len(seq) > len(ref):
raise ValueError("`ref` is shorter than `seq`")
distances = {}
for i in range(0, len(ref) - len(seq) + 1): # moving window of comparison
distance = rapidfuzz.levenshtein.distance(seq, ref[i : (i + len(seq))])
distances[(i, i + len(seq))] = distance
shortest_distance = min(distances.values())
best_matches = {distance: shortest_distance, "locations": []}
for location, distance in distances.items():
if distance == shortest_distance:
best_matches["locations"].append(location)
results = {"distances": distances, "best_matches": best_matches}
return results
def modify_seq(seq, skip_first=False, skip_last=False):
if skip_first:
start = 3
else:
start = 0
if skip_last:
stop = -3
else:
stop = len(seq)
modified_seq = seq[start:stop]
return modified_seq
def find_longest_repeat(seq):
"""Find the longest repeat in a string,
then return the length and the character in a tuple.
"""
maximum = 0
count = 0
current = ''
letter = ''
for nt in seq:
if nt == current:
count += 1
else:
count = 1
current = nt
if count > maximum:
letter = nt
maximum = max(count, maximum)
return (maximum, letter)