Skip to content

Commit

Permalink
Added compositional model, CandyCrumbs scoring
Browse files Browse the repository at this point in the history
-Improved graph generation
-Added vectorised mz_to_composition
-Introduced CandyCrumbs annotation cutoff for spectral clusters
-Model now takes vector of monosaccharides instead of precursor mass
  • Loading branch information
urbj committed Nov 6, 2024
1 parent 45babde commit c2b1e4d
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 27 deletions.
12 changes: 7 additions & 5 deletions CandyCrunch/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
'atoms': {'02X': [1, 2, 3], '04X': [1, 2, 3, 4, 5], '24X': [1, 2, 3, 6, 7, 8, 9], '02A': [4, 5, 6, 7, 8, 9],
'04A': [6, 7, 8, 9], '24A': [4, 5], 'Neu5Gc': [1, 2, 3, 4, 5, 6, 7, 8, 9]}},
'Neu5Ac8S': {'mass': {'02X': 70.0055, '04X': 170.0453, '24X': 271.0124, '02A': 301.0467,
'04A': 201.0069 , '24A': 100.0398, 'Neu5Ac8S': 371.0522 },
'04A': 201.0069 , '24A': 100.0398, 'Neu5Ac8S': 371.0522},
'atoms': {'02X': [1, 2, 3], '04X': [1, 2, 3, 4, 5], '24X': [1, 2, 3, 6, 7, 8, 9], '02A': [4, 5, 6, 7, 8, 9],
'04A': [6, 7, 8, 9], '24A': [4, 5], 'Neu5Ac8S': [1, 2, 3, 4, 5, 6, 7, 8, 9]}},
'Neu5Gc8S': {'mass': {'02X': 70.0055, '04X': 186.0402, '24X': 271.0124 , '02A': 317.0416,
Expand Down Expand Up @@ -115,7 +115,7 @@

AA_masses = {'A':71.0371,'R':156.1011,'N':114.0429,'D':115.0269,
'C':103.0091,'E':129.0425,'Q':128.0585,'G':57.0214,'H':137.0589,
'I':113.0840,'L':113.0840,'K':128.0949,'M':131.0404,'F':147.0684,
'I':113.0840,'L':113.0840,'K':128.0949,'k':357.25783,'M':131.0404,'F':147.0684,
'P':97.0527,'S':87.0320,'T':101.0476,'W':186.0793,'Y':163.0633,'V':99.0684}
tester_ma_addition = {k:{'mass':{k:v},'atoms':{k:[1,2,3,4,5,6]}} for k,v in AA_masses.items()}
mono_attributes = mono_attributes|tester_ma_addition
Expand Down Expand Up @@ -165,9 +165,11 @@ def glycan_to_graph_monos(glycan):
adj_matrix = np.zeros((len(mono_proc), len(mono_proc)), dtype = int)

for k in mono_mask_dic:
adjustment = 2 if k >= 100 else 1 if k >= 10 else 0
for j in range(k + 1, len(mono_mask_dic)):
adjustment = 2 if k >= 100 else 1 if k >= 10 else 0
k_idx, j_idx = glycan.find(str(k), k),glycan.find(str(j), j)
min_idx_k = k+(10*max((k//10)-1,0))
min_idx_j = j+(10*max((j//10)-1,0))
k_idx, j_idx = glycan.find(str(k), min_idx_k),glycan.find(str(j), min_idx_j)
glycan_part = glycan[k_idx+1:j_idx]
if evaluate_adjacency_monos(glycan_part, adjustment):
adj_matrix[k, j] = 1
Expand Down Expand Up @@ -1251,7 +1253,7 @@ def glycopeptide_string_to_input(gpep_string):
input_dict['glycans'] = [chem_string]
return input_dict
else:
input_dict['peptide'] = [chem_string]
input_dict['peptide'] = chem_string
return input_dict
else:
input_dict['peptide'] = ''.join(split_string[::2])
Expand Down
Binary file modified CandyCrunch/candycrunch.pt
Binary file not shown.
Binary file modified CandyCrunch/glycans.pkl
Binary file not shown.
14 changes: 2 additions & 12 deletions CandyCrunch/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@ def __init__(self, x, y, transform_mz = None, transform_prec = None, transform_r
self.x = x
self.y = y
self.transform_mz = transform_mz
self.transform_prec = transform_prec
self.transform_prec_neg = transform_prec_neg
self.transform_prec_pos = transform_prec_pos
self.transform_rt = transform_rt

def __len__(self):
Expand All @@ -73,23 +70,16 @@ def __getitem__(self, index):
mz = self.transform_mz(mz)
mz_r = self.x[index][1]
prec = self.x[index][2]
if self.transform_prec:
prec = self.transform_prec(prec)
glycan_type = self.x[index][3]
RT = self.x[index][4]
if self.transform_rt:
RT = self.transform_rt(RT)
mode = self.x[index][5]
if any([self.transform_prec_neg, self.transform_prec_pos]):
if mode == 0:
prec = self.transform_prec_neg(prec)
else:
prec = self.transform_prec_pos(prec)
lc = self.x[index][6]
modification = self.x[index][7]
trap = self.x[index][8]
out = self.y[index]
return torch.FloatTensor(mz), torch.FloatTensor(mz_r), torch.FloatTensor([prec]), torch.LongTensor([glycan_type]), torch.FloatTensor([RT]), torch.LongTensor([mode]), torch.LongTensor([lc]), torch.LongTensor([modification]), torch.LongTensor([trap]), torch.LongTensor([out])
return torch.FloatTensor(mz), torch.FloatTensor(mz_r), torch.FloatTensor(prec), torch.LongTensor([glycan_type]), torch.FloatTensor([RT]), torch.LongTensor([mode]), torch.LongTensor([lc]), torch.LongTensor([modification]), torch.LongTensor([trap]), torch.LongTensor([out])


class ResUnit(nn.Module):
Expand Down Expand Up @@ -134,7 +124,7 @@ def __init__(self, input_dim, num_classes = 1, hidden_dim = 512):
self.input_dim = input_dim

self.mz_lin1 = torch.nn.Linear(input_dim, 2*hidden_dim)
self.prec_lin1 = torch.nn.Linear(1, 24)
self.prec_lin1 = torch.nn.Linear(12, 24)
self.rt_lin1 = torch.nn.Linear(1, 24)
self.comb_lin1 = torch.nn.Linear(2*hidden_dim+24+24+24+24+24+24+24, 2*512)
self.comb_lin2 = torch.nn.Linear(2*512, 2*256)
Expand Down
127 changes: 120 additions & 7 deletions CandyCrunch/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from CandyCrunch.model import (CandyCrunch_CNN, SimpleDataset, transform_mz,
transform_prec, transform_rt)
from CandyCrunch.analysis import CandyCrumbs

this_dir, this_filename = os.path.split(__file__)
data_path = os.path.join(this_dir, 'glycans.pkl')
Expand All @@ -40,7 +41,7 @@
if torch.cuda.is_available():
device = "cuda:0"

sdict = os.path.join(this_dir, 'candycrunch.pt')
sdict = os.path.join(this_dir, 'sugarbase.pt')
sdict = torch.load(sdict, map_location = device)
sdict = {k.replace('module.', ''): v for k, v in sdict.items()}
candycrunch = CandyCrunch_CNN(2048, num_classes = len(glycans)).to(device)
Expand All @@ -51,6 +52,7 @@
modification_mass_dict = {'reduced': 1.0078, '2AA': 137.14, '2AB': 120.2}
abbrev_dict = {'S': 'Sulphate', 'P': 'Phosphate', 'Ac': 'Acetate'}
temperature = torch.Tensor([1.15]).to(device)
comp_vector_order = ['dHex','Hex','HexA','HexN','HexNAc','Kdn','Me','Neu5Ac','Neu5Gc','P','Pen','S']


def T_scaling(logits, temperature):
Expand Down Expand Up @@ -148,7 +150,7 @@ def process_mzXML_stack(filepath, num_peaks = 1000, ms_level = 2, intensity = Fa
return df_out


def average_dicts(dicts, mode = 'mean'):
def average_dicts(dicts, mode = 'mean',round_dp=False):
"""averages a list of dictionaries containing spectra\n
| Arguments:
| :-
Expand All @@ -161,7 +163,11 @@ def average_dicts(dicts, mode = 'mean'):
result = defaultdict(list)
for d in dicts:
for mass, intensity in d.items():
result[mass].append(intensity)
if round_dp:
key_mass = np.round(mass,round_dp)
result[key_mass].append(intensity)
else:
result[mass].append(intensity)
return {mass: np.mean(intensities) if mode == 'mean' else max(intensities) for mass, intensities in result.items()}


Expand Down Expand Up @@ -225,7 +231,7 @@ def process_for_inference(df, glycan_class, mode = 'negative', modification = 'r
max_rt = max(max(df['RT']), 30)
df['RT2'] = df['RT'] / max_rt
# Dataloader generation
X = list(zip(df.binned_intensities.values.tolist(), df.mz_remainder.values.tolist(), df.reducing_mass.values.tolist(), df.glycan_type.values.tolist(),
X = list(zip(df.binned_intensities.values.tolist(), df.mz_remainder.values.tolist(), df.compositional_vector.values.tolist(), df.glycan_type.values.tolist(),
df.RT2.values.tolist(), df['mode'].values.tolist(), df.lc.values.tolist(), df.modification.values.tolist(), df.trap.values.tolist()))
y = df['glycan']
if device != 'cpu':
Expand Down Expand Up @@ -388,6 +394,7 @@ def condense_dataframe(df, mz_diff = 0.5, rt_diff = 1.0, min_mz = 39.714, max_mz
})

# Create a condensed dataframe
cluster_dicts = []
condensed_data = []
for cluster in clusters:
highest_intensity_index = np.argmax(cluster['intensity'])
Expand All @@ -399,6 +406,7 @@ def condense_dataframe(df, mz_diff = 0.5, rt_diff = 1.0, min_mz = 39.714, max_mz
min_rm = min(cluster['reducing_mass'])
mean_rt = np.mean(cluster['RT'])
sum_intensity = np.nansum(cluster['intensity'])
cluster_dicts.append([c for c in cluster['peak_d']])
binned_intensities, mz_remainder = zip(*[bin_intensities(c, frames) for c in cluster['peak_d']])
binned_intensities = np.mean(np.array(binned_intensities), axis = 0)
mz_remainder = np.mean(np.array(mz_remainder), axis = 0)
Expand All @@ -407,7 +415,105 @@ def condense_dataframe(df, mz_diff = 0.5, rt_diff = 1.0, min_mz = 39.714, max_mz
num_spectra = len(cluster['RT'])
condensed_data.append([min_rm, mean_rt, sum_intensity, binned_intensities, mz_remainder, num_spectra])
condensed_df = pd.DataFrame(condensed_data, columns = ['reducing_mass', 'RT', 'intensity', 'binned_intensities', 'mz_remainder', 'num_spectra'])
return condensed_df
return condensed_df,cluster_dicts


def comp_to_str_comp(comp):
sc = sorted(comp)
sorted_comp = {k:comp[k] for k in sc}
str_comp = "$".join([k+'_'+str(v) for k,v in sorted_comp.items()])
return str_comp


def comp_to_input_vect(glycomp):
comp_vect = np.zeros(12)
for mono,counts in glycomp.items():
comp_vect[comp_vector_order.index(mono)] = counts
return comp_vect


def mz_to_comp_vect(masses_in, comp_masses,comps_in,mass_threshold):
comps_in.append(None)
sm= masses_in.reshape(-1, 1)
gtm = comp_masses.reshape(1, -1)
pair_diffs = np.abs(gtm - sm)
within_thresh = pair_diffs < mass_threshold
spectra_match_idx = np.where(within_thresh.sum(axis=1))[0]
closest_comp_idx = np.where(within_thresh, pair_diffs, np.nan).argsort()[spectra_match_idx,0]
matched_comps = np.ones_like(masses_in)*len(comps_in)-1
matched_comps[spectra_match_idx] = closest_comp_idx
compositions_out = [comps_in[x] for x in matched_comps.astype('int')]
return compositions_out


def calculate_file_comps(masses_to_check,mass_threshold,df_in,mode='negative',
max_charge=2,mass_value='monoisotopic',sample_prep='underivatized',
filter_out = None):
all_comps = [x for x in df_in.groupby('comp_str').first()['Composition']]
comps_in = [x for x in copy.deepcopy(all_comps)]
comp_masses = np.array([composition_to_mass(x, mass_value = mass_value, sample_prep = sample_prep)+1.0078 for x in comps_in])
comps_out = [(None,0) for x in masses_to_check]
comps_in_copy = [x for x in copy.deepcopy(all_comps)]
for charge in range(1,max_charge+1):
chunked_calc_comps =[]
new_masses_to_check = (masses_to_check*charge)-(charge*(1.0078)-1.0078)*-1
for mz_chunk in np.array_split(new_masses_to_check,len(new_masses_to_check)//100):
compositions_out = mz_to_comp_vect(mz_chunk, comp_masses,comps_in_copy,mass_threshold)
chunked_calc_comps.extend(compositions_out)
comps_out = [(y,charge) if not x[0] else x for x,y in zip(comps_out,chunked_calc_comps)]
return comps_out


def create_struct_map(df_glycan,glycan_class,filter_out=None,phylo_level = 'Kingdom',phylo_filter= 'Animalia'):
processed_df_use = df_glycan[df_glycan[f"{phylo_level}"].apply(lambda x: phylo_filter in x)&(df_glycan['glycan_type']== glycan_class)]
if filter_out:
processed_df_use = processed_df_use.iloc[[i for i,x in enumerate(processed_df_use.Composition) if not filter_out.intersection(x)]]
processed_df_use = processed_df_use.assign(comp_str=[comp_to_str_comp(x) for x in processed_df_use.Composition])
processed_df_use = processed_df_use.assign(ref_counts=processed_df_use.loc[:,'ref'].map(len)+processed_df_use.loc[:,'tissue_ref'].map(len)+processed_df_use.loc[:,'disease_ref'].map(len))
processed_df_use = processed_df_use[~(processed_df_use['glycan'].str.contains("}"))]
processed_df_use = processed_df_use.sort_values('ref_counts',ascending=False)
df_use_unq_comps = processed_df_use.groupby('comp_str').first()
common_struct_map = dict(zip(df_use_unq_comps.index,df_use_unq_comps.glycan))
return common_struct_map,processed_df_use


def assign_candidate_structures(df_in,df_glycan_in,comp_struct_map,mass_tolerance):
red_masses = np.array(df_in.reducing_mass)
comps_out = calculate_file_comps(red_masses,mass_tolerance,df_glycan_in)
df_in['Composition'] = [x[0] for x in comps_out]
df_in['compositional_vector'] = [comp_to_input_vect(x) if x else None for x in df_in.Composition]
df_in['Charge'] = [x[1] if x[0] else None for x in comps_out]
matched_str_comps = [comp_to_str_comp(x) if x else None for x in df_in['Composition']]
df_in['candidate_structure'] = [comp_struct_map[x] if x in comp_struct_map else None for x in matched_str_comps]
return df_in


def filter_top_frag_annotations(ccrumbs_out):
filtered_annotations = []
for k,v in ccrumbs_out.items():
if v:
for ant in [a for a in v['Domon-Costello nomenclatures'] if len(a)<3]:
prefs = [a.split('_')[0][-1] for a in ant]
if 'A' in prefs or 'X' in prefs:
if len(prefs)>1:
continue
filtered_annotations.append(ant)
return filtered_annotations


def assign_annotation_scores(df_in,clustered_dicts):
scores = []
for struct,p_dict,charge,comp in zip(df_in.candidate_structure,clustered_dicts,df_in.Charge,df_in.Composition):
if struct:
avg_clust_d = average_dicts(p_dict,round_dp=1)
avg_clust_d = dict(sorted(avg_clust_d.items(), key = lambda x: x[1], reverse = True))
cc_out = CandyCrumbs(struct, [x for x in avg_clust_d][:15],0.6,simplify=False,charge=-int(charge))
tester_mass_scores=filter_top_frag_annotations(cc_out)
scores.append(len(tester_mass_scores)/sum(comp.values()))
else:
scores.append(None)
df_in['annotation_score'] = scores
return df_in


def deduplicate_predictions(df, mz_diff = 0.5, rt_diff = 1.0):
Expand Down Expand Up @@ -817,6 +923,9 @@ def load_spectra_filepath(spectra_filepath):
return process_mzML_stack(spectra_filepath, intensity = True)
if spectra_filepath.endswith(".mzXML"):
return process_mzXML_stack(spectra_filepath, intensity = True)
if spectra_filepath.endswith(".pkl"):
loaded_file = pd.read_pickle(spectra_filepath)
return loaded_file
if spectra_filepath.endswith(".xlsx"):
loaded_file = pd.read_excel(spectra_filepath)
mask = loaded_file['peak_d'].str.endswith('}', na = False)
Expand Down Expand Up @@ -1273,8 +1382,12 @@ def wrap_inference(spectra_filepath, glycan_class, model = candycrunch, glycans
loaded_file['reducing_mass'] += np.random.uniform(0.00001, 10**(-20), size = len(loaded_file))
coded_class = {'O': 0, 'N': 1, 'free': 2, 'lipid': 2}[glycan_class]
spec_dic = {mass: peak for mass, peak in zip(loaded_file['reducing_mass'].values, loaded_file['peak_d'].values)}
# Group spectra by mass/retention isomers and process them for being inputs to CandyCrunch
df_out = condense_dataframe(loaded_file, mz_diff = mass_tolerance, rt_diff = rt_diff, bin_num = bin_num)
# Group specctra by mass/retention isomers and process them for being inputs to CandyCrunch
df_out,cluster_peak_dicts = condense_dataframe(loaded_file, mz_diff = mass_tolerance, rt_diff = rt_diff, bin_num = bin_num)
common_structure_map,df_use = create_struct_map(df_use,glycan_class,filter_out = filter_out,phylo_level = 'Class',phylo_filter= taxonomy_class)
df_out = assign_candidate_structures(df_out,df_use,common_structure_map,mass_tolerance)
df_out = assign_annotation_scores(df_out,cluster_peak_dicts)
df_out = df_out[df_out['annotation_score']>2].drop(columns = ['Composition','Charge','candidate_structure','annotation_score']).reset_index(drop=True)
loader, df_out = process_for_inference(df_out, coded_class, mode = mode, modification = modification, lc = lc, trap = trap)
df_out['peak_d'] = [spec_dic[m] for m in df_out.index]
# Predict glycans from spectra
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="CandyCrunch",
version="0.4.1",
version="0.5.0",
author="Daniel Bojar",
author_email="[email protected]",
description="Package for predicting glycan structure from LC-MS/MS data",
Expand All @@ -21,12 +21,12 @@
"Operating System :: OS Independent",
],
python_requires='>=3.8',
install_requires=["glycowork~=1.3.0", "regex", "networkx",
install_requires=["glycowork[draw] @ git+https://github.com/BojarLab/glycowork.git@dev",
"regex", "networkx",
"scipy", "torch~=2.1", "numpy_indexed",
"seaborn", "pandas", "statsmodels",
"pymzml", "statsmodels", "pyteomics",
"lxml", "torchvision", "openpyxl"],
extras_require={'draw':["glycowork[draw]~=1.3.0"]},
entry_points={
'console_scripts': [
'candycrunch_predict=CandyCrunch.cli:main',
Expand Down

0 comments on commit c2b1e4d

Please sign in to comment.