-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathbindEmbed21DL.py
142 lines (109 loc) · 5.11 KB
/
bindEmbed21DL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from data_preparation import ProteinInformation
from ml_trainer import MLTrainer
from ml_predictor import MLPredictor
from config import FileSetter, FileManager
from architectures import CNN2Layers
import numpy as np
from sklearn.model_selection import PredefinedSplit
class BindEmbed21DL(object):
    """Training, hyperparameter-optimization, and inference pipelines for bindEmbed21DL.

    All pipelines read split ids / embeddings through FileSetter & FileManager and
    delegate the actual ML work to MLTrainer / MLPredictor.
    """

    @staticmethod
    def cross_train_pipeline(params, model_output, predictions_output, ri, num_splits=5):
        """
        Run cross-training pipeline for a specific set of parameters.

        :param params: hyperparameter dict; params['weights'] supplies the positive-class weights
        :param model_output: path prefix for per-split models; if None, trained models are not written
        :param predictions_output: output folder for predictions; if None, predictions are not written
        :param ri: Should RI or raw probabilities be written?
        :param num_splits: number of pre-defined CV splits to read (default: 5, backward compatible)
        :return: dict mapping protein id -> protein object with predictions
        """
        print("Prepare data")
        ids = []
        fold_array = []
        for s in range(1, num_splits + 1):
            ids_in = '{}{}.txt'.format(FileSetter.split_ids_in(), s)
            split_ids = FileManager.read_ids(ids_in)
            ids += split_ids
            # remember which split each id belongs to so PredefinedSplit can reproduce the folds
            fold_array += [s] * len(split_ids)

        ps = PredefinedSplit(fold_array)

        # get sequences + maximum length + labels
        sequences, max_length, labels = ProteinInformation.get_data(ids)
        embeddings = FileManager.read_embeddings(FileSetter.embeddings_input())

        proteins = dict()
        trainer = MLTrainer(pos_weights=params['weights'])
        for train_index, test_index in ps.split():
            # all test indices share one fold, so the first one identifies the split
            split_counter = fold_array[test_index[0]]
            train_ids = [ids[train_idx] for train_idx in train_index]
            validation_ids = [ids[test_idx] for test_idx in test_index]

            print("Train model")
            model_split = trainer.train_validate(params, train_ids, validation_ids, sequences, embeddings, labels,
                                                 max_length, verbose=False)

            if model_output is not None:
                model_path = '{}{}.pt'.format(model_output, split_counter)
                FileManager.save_classifier_torch(model_split, model_path)

            print("Calculate predictions per protein")
            ml_predictor = MLPredictor(model_split)
            curr_proteins = ml_predictor.predict_per_protein(validation_ids, sequences, embeddings, labels, max_length)
            # validation sets are disjoint, so merging never overwrites an existing prediction
            proteins = {**proteins, **curr_proteins}

        if predictions_output is not None:
            FileManager.write_predictions(proteins, predictions_output, 0.5, ri)

        return proteins

    @staticmethod
    def hyperparameter_optimization_pipeline(params, num_splits, result_file):
        """
        Development pipeline used to optimize hyperparameters.

        :param params: hyperparameter search space; params['weights'] supplies the positive-class
            weights and is excluded from the search. The caller's dict is NOT modified.
        :param num_splits: number of pre-defined CV splits to read
        :param result_file: file to write cross-validation results to
        :return: best model found by MLTrainer.cross_validate
        """
        print("Prepare data")
        ids = []
        fold_array = []
        for s in range(1, num_splits + 1):
            ids_in = '{}{}.txt'.format(FileSetter.split_ids_in(), s)
            split_ids = FileManager.read_ids(ids_in)
            ids += split_ids
            fold_array += [s] * len(split_ids)

        ids = np.array(ids)

        # get sequences + maximum length + labels
        sequences, max_length, labels = ProteinInformation.get_data(ids)
        embeddings = FileManager.read_embeddings(FileSetter.embeddings_input())

        print("Perform hyperparameter optimization")
        # BUGFIX: work on a shallow copy so removing 'weights' does not mutate the caller's dict
        search_params = dict(params)
        trainer = MLTrainer(pos_weights=search_params['weights'])
        del search_params['weights']  # remove weights to not consider as parameter for optimization
        model = trainer.cross_validate(search_params, ids, fold_array, sequences, embeddings, labels, max_length,
                                       result_file)

        return model

    @staticmethod
    def prediction_pipeline(model_prefix, cutoff, result_folder, ids, fasta_file, ri, num_models=5):
        """
        Run predictions with bindEmbed21DL for a given list of proteins, averaging an
        ensemble of per-split models.

        :param model_prefix: path prefix of the trained per-split models ('{prefix}{i}.pt')
        :param cutoff: Cutoff to use to define prediction as binding (default: 0.5)
        :param result_folder: output folder; if None, predictions are not written
        :param ids: protein ids to predict
        :param fasta_file: FASTA file with the corresponding sequences
        :param ri: Should RI or raw probabilities be written?
        :param num_models: ensemble size / number of CV models to load (default: 5, backward compatible)
        :return: dict mapping protein id -> protein object with ensemble-averaged predictions
        """
        print("Prepare data")
        sequences, max_length, labels = ProteinInformation.get_data_predictions(ids, fasta_file)
        embeddings = FileManager.read_embeddings(FileSetter.embeddings_input())

        proteins = dict()
        for i in range(num_models):
            print("Load model")
            model_path = '{}{}.pt'.format(model_prefix, i + 1)
            model = FileManager.load_classifier_torch(model_path)

            print("Calculate predictions")
            ml_predictor = MLPredictor(model)
            curr_proteins = ml_predictor.predict_per_protein(ids, sequences, embeddings, labels, max_length)
            # accumulate predictions across ensemble members
            for prot_id, curr_prot in curr_proteins.items():
                if prot_id in proteins:
                    proteins[prot_id].add_predictions(curr_prot.predictions)
                else:
                    proteins[prot_id] = curr_prot

        # average accumulated predictions over the ensemble
        for prot in proteins.values():
            prot.normalize_predictions(num_models)

        if result_folder is not None:
            FileManager.write_predictions(proteins, result_folder, cutoff, ri)

        return proteins