-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFeaturePreparation.py
242 lines (224 loc) · 11.4 KB
/
FeaturePreparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# This file contains functions for preparing the features for the network merging process
# Created by: Leonid Chindelevitch
# Last modified: November 30, 2016
from functools import reduce
from urllib.request import urlopen
import ClassDefinitions
funcWords = ['al.', 'and', 'are', 'but', 'can', 'et.', 'for', 'has', 'is', 'not', 'of', 'or', 'out', 'see', 'the', 'to', 'via']
CASRE = ClassDefinitions.re.compile("[0-9]*-[0-9]*-[0-9]")
ProteinRE = ClassDefinitions.re.compile('[A-Z]{1}[a-z]{2,3}[A-Z]{0,3}[0-9]{0,2}')
atomRE = ClassDefinitions.re.compile('[A-Z]{1}[a-z]*[0-9]*')
def flattenFeature(reactions, featureName):
# For each specified reaction, transforms a list corresponding to featureName into a string
for reaction in reactions:
if reaction.description and featureName in reaction.description:
reaction.description[featureName] = ' '.join(reaction.description[featureName])
return
def collectSpeciesNames(species):
# For each specified species, adds the species name into the description of the species
for spec in species:
if spec.description:
spec.description['Species name'] = spec.name
return
def collectReactionNames(reactions):
# For each specified reaction, adds the reaction name into the description of the reaction
for reaction in reactions:
if reaction.description:
reaction.description['Reaction name'] = reaction.name
return
def fixGeneCombinations(reactions):
# For each specified reaction, changes a missing gene combination into an empty one
for reaction in reactions:
if reaction.geneCombination is None:
reaction.geneCombination = ClassDefinitions.CNF([[]])
return
def collectAllGeneNames(reactions):
# For each specified reaction, collects the gene names into a list
for reaction in reactions:
if reaction.geneCombination and reaction.geneCombination.clauses and reaction.description:
reaction.description['Gene names'] = sorted(list(reduce(lambda x,y:set(x).union(y), reaction.geneCombination.clauses, [])))
return
def splitAllProteinNames(reactions, andSymbol = '+', orSymbol = ','):
# For each specified reaction, splits the protein combination into individual protein names
for reaction in reactions:
if reaction.description and 'Protein' in reaction.description:
curProteins = reaction.description['Protein']
splitProteins = [x.strip() for x in curProteins.split(orSymbol)]
splitProteins = sum([[x.strip() for x in protein.split(andSymbol)] for protein in splitProteins], [])
reaction.description['Protein names'] = splitProteins
return
def findAllProteinNames(reactions):
# For each specified reaction, finds the likely protein names
for reaction in reactions:
if reaction.name:
if reaction.description:
reaction.description['Protein names'] = findProteinNames(reaction.name)
return
def findProteinNames(Line):
# Returns the substrings that are likely to be protein names
if not Line:
return []
else:
line = Line.replace(',','')
line = line.replace('(','')
line = line.replace(')','')
line = line.replace('/',' ')
line = line.split(' ')
line = [x.title() for x in line]
found = sorted([x for x in line if ClassDefinitions.re.match(ProteinRE, x) and len(ClassDefinitions.re.match(ProteinRE, x).group(0)) == len(x)])
return found
def findAllEnzymeNames(reactions, Map = {}, delay = 0.5):
# For each specified reaction, finds the names of enzymes based on their EC identifiers via a web search
# Saves the results into an optionally specified dictionary; delay is the time between consecutive queries
allECIDs = sum([reaction.description['EC numbers'] for reaction in reactions if 'EC numbers' in reaction.description], [])
allECIDs = sorted(list(set(allECIDs)))
print(('There are ' + str(len(allECIDs)) + ' enzymes to process'))
for ind, ECID in enumerate(allECIDs):
if ind % 10 == 0:
print(('Processed ' + str(ind) + ' enzymes so far'))
Map[ECID] = findEnzymeName(ECID)
ClassDefinitions.time.sleep(delay)
for reaction in reactions:
if 'EC numbers' in reaction.description:
curNumbers = reaction.description['EC numbers']
names = [_f for _f in [Map[x] for x in curNumbers] if _f]
reaction.description['Enzyme names'] = names
return
def findEnzymeName(ECID):
# This function returns the accepted name of an enzyme given by its E.C. number
if not ECID:
return ''
else:
url = 'http://www.expasy.org/enzyme/' + ECID
page = urlopen(url).read()
index = page.find("<strong>Accepted Name</strong>")
if index == -1:
return ''
else:
index1 = page.find("<strong>", index + len("<strong>"))
index2 = page.find("</strong>", index1)
return page[index1 + len("<strong>") : index2].strip().strip('.')
def getAllSpeciesInfo(species):
# Finds the formula and CAS number for each given species via a web search
# If information is already available for a given species, it has priority
for ind, spec in enumerate(species):
if ind % 10 == 0:
print(('Processed ' + str(ind) + ' species so far'))
curDescription = spec.description
if curDescription is not None:
descriptors = list(curDescription.keys())
foundCAS, foundFormula = True, True
if not ('CAS number' in descriptors and curDescription['CAS number']):
foundCAS = False
if not ('Formula' in descriptors and curDescription['Formula']):
foundFormula = False
if not (foundCAS and foundFormula):
if 'Biocyc' in descriptors:
(formula, CAS) = getMoreInfo(curDescription['Biocyc'], 'Bio')
if formula:
if 'Formula' in descriptors and curDescription['Formula'] != formula:
print(('Error: discrepancy between the formula of ' + spec.name + ' and ' + formula + ' using KEGG ID'))
else:
curDescription['Formula'] = formula
foundFormula = True
if CAS:
if 'CAS number' in descriptors and curDescription['CAS number'] != CAS:
print(('Error: discrepancy between the CAS number of ' + spec.name + ' and ' + CAS + ' using KEGG ID'))
else:
curDescription['CAS number'] = CAS
foundCAS = True
if not (foundCAS and foundFormula) and 'kegg-id' in descriptors:
(formula, CAS) = getMoreInfo(curDescription['kegg-id'], 'KEGG')
if formula:
if 'Formula' in descriptors and curDescription['Formula'] != formula:
print(('Error: discrepancy between the formula of ' + spec.name + ' and ' + formula + ' using Biocyc ID'))
else:
curDescription['Formula'] = formula
if CAS:
if 'CAS number' in descriptors and curDescription['CAS number'] != CAS:
print(('Error: discrepancy between the CAS number of ' + spec.name + ' and ' + CAS + ' using Biocyc ID'))
else:
curDescription['CAS number'] = CAS
return
def getMoreInfo(CompoundID, option = 'KEGG'):
# This function returns the empirical formula and CAS number of a given compound from its ID.
# The possible options are 'KEGG' if this is the KEGG ID and 'Bio' if it is the BioCyc ID.
Formula, CASNum = '', ''
if not CompoundID:
return ('', '')
elif option == 'KEGG':
url = 'http://www.genome.jp/dbget-bin/www_bget?cpd:' + CompoundID
page = urlopen(url).read()
cleanPage = ClassDefinitions.cleanupTags(page)
if 'Formula' in cleanPage:
index0 = cleanPage.index('Formula')
Formula = cleanPage[index0 + 1]
if 'CAS:' in cleanPage:
index1 = cleanPage.index('CAS:')
CASNum = ClassDefinitions.re.match(CASRE, cleanPage[index1 + 1])
if CASNum:
CASNum = CASNum.group(0)
else:
CASNum = ''
elif option == 'Bio':
url = 'http://biocyc.org/META/NEW-IMAGE?type=COMPOUND&object=' + CompoundID
page = urlopen(url).read()
index = page.find("Formula:")
if index != -1:
index1 = page.find("<p", index)
Formula = page[index + len("Formula:") : index1].strip()
Formula = Formula.replace('<SUB>','').replace('</SUB>','')
index2 = page.find("CAS:")
if index2 != -1:
CASNum = ClassDefinitions.re.match(CASRE, page[index2 + len("CAS:"):])
if CASNum:
CASNum = CASNum.group(0)
else:
CASNum = ''
else:
print(('Error: unrecognized option ' + option))
return (Formula, CASNum)
def convertAllFormulas(species):
# For each given species, converts its formula into a dictionary format for further comparisons
for ind, spec in enumerate(species):
if ind % 10 == 0:
print(('Processed ' + str(ind) + ' species so far'))
if spec.description is not None and 'Formula' in spec.description and spec.description['Formula']:
spec.description['Formula'] = convertFormula(spec.description['Formula'])
return
def convertFormula(Formula):
# Converts a string encoding a chemical formula into a dictionary with key = atom name, value = multiplicity
# Note that strings such as (C2H5)n cannot be dealt with even if n is a specified integer, not just a letter
Dico = {}
if type(Formula) == type(Dico):
return Formula
else:
# start by removing groups!
other = [x for x in range(len(Formula)) if not (Formula[x].isalpha() or Formula[x].isdigit())]
if other:
Formula = Formula[:other[0]]
for item in ClassDefinitions.re.findall(atomRE, Formula):
curLast = max([i for i,x in enumerate(item) if x.isalpha()]) + 1
ClassDefinitions.myIncrement(Dico, item[:curLast], int(item[curLast:]) if curLast < len(item) else 1)
return Dico
def countFeatures(speciesList):
# Tabulates the features present in a given list of species
Features = {}
for species in speciesList:
if species.description is not None:
curDescription = species.description
for feature in curDescription:
if feature not in Features:
Features[feature] = 0
if curDescription[feature] or curDescription[feature] == 0:
Features[feature] += 1
return Features
def convertFeaturesToMatrix(featureDicts, featureNames):
# Converts a list of feature dictionaries into a matrix (using featureNames as header row)
# Note: assumes that each feature in featureNames is present in each feature dictionary!
Matrix = [[]]*len(featureDicts)
for row, featureDict in enumerate(featureDicts):
Matrix[row] = [0]*len(featureNames)
for col, featureName in enumerate(featureNames):
Matrix[row][col] = featureDict[featureName]
return Matrix