-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsdf_overlap.py
53 lines (43 loc) · 2.35 KB
/
sdf_overlap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import argparse
import pybel as pb
parser = argparse.ArgumentParser(description = 'Python script to clean the databases from compounds that fail to be docked by one of the software, useful for using mol_rmsd.svl. Remove compound duplicates in the original sdf or from the resulting sdf files as the program does not do that. Currently supports two software only.')
parser.add_argument('-i1', dest='input_1', help='First input SDF file')
parser.add_argument('-i2', dest='input_2', help='Second input SDF file')
parser.add_argument('-o1', dest='output_1', help='Name of first output SDF file')
parser.add_argument('-o2', dest='output_2', help='Name of second output SDF file')
args = parser.parse_args()
info = open("info.dat", 'w')
mols_1 = list(pb.readfile('sdf', args.input_1))
mols_2 = list(pb.readfile('sdf', args.input_2))
names_1 = list([mol.title for mol in mols_1]) #get names of compounds
names_2 = list([mol.title for mol in mols_2])
dkd = list(set(names_1) & set(names_2)) #names of compounds docked by both programs
n_dkd = list((set(names_1) | set(names_2)) - (set(names_1) & set(names_2))) #names of compounds not docked by both programs
len_dkd = len(dkd)
len_n_dkd = len(n_dkd)
info.write("compounds in 1: %i\nunique compounds in 1: %i\n\ncompounds in 2: %i\nunique compounds in 2: %i\n\nunique compounds docked by both programs: %i\nunique compounds not docked by both programs: %i\n\n"%(len(names_1), len(set(names_1)), len(names_2), len(set(names_2)), len_dkd, len_n_dkd))
info.close()
names_1 *= 0 #list cleaning to not occupy memory
names_2 *= 0
if len_dkd >= len_n_dkd: #choose most efficient list to go through depending on the size of n_dkd and dkd
mols_1_dkd = [mol for mol in mols_1 if mol.title not in n_dkd]
mols_2_dkd = [mol for mol in mols_2 if mol.title not in n_dkd]
else:
mols_1_dkd = []
mols_2_dkd = []
for mol in mols_1:
if mol.title in dkd:
mols_1_dkd.append(mol)
# print "%s %s"%(args.input_1, mol.title)
for mol in mols_2:
if mol.title in dkd:
mols_2_dkd.append(mol)
# print "%s %s"%(args.input_2, mol.title
out_1 = pb.Outputfile('sdf',args.output_1,overwrite=True)
out_2 = pb.Outputfile('sdf',args.output_2,overwrite=True)
for i in mols_1_dkd:
out_1.write(i)
for j in mols_2_dkd:
out_2.write(j)
out_1.close()
out_2.close()