From 22f17d918db789143d7ae59849ae9a8019d5de7d Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:48:24 +0100 Subject: [PATCH 01/11] access _psm_df rather than psm_df --- alphabase/psm_reader/psm_reader.py | 26 +++++++++++++------------- alphabase/psm_reader/sage_reader.py | 12 ++++++------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index a4593a66..7740828d 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -312,17 +312,17 @@ def _get_table_delimiter(self, _filename): return get_delimiter(_filename) def _normalize_rt(self): - if "rt" in self.psm_df.columns: + if "rt" in self._psm_df.columns: if self._engine_rt_unit == "second": # self.psm_df['rt_sec'] = self.psm_df.rt - self.psm_df["rt"] = self.psm_df.rt / 60 - if "rt_start" in self.psm_df.columns: - self.psm_df["rt_start"] = self.psm_df.rt_start / 60 - self.psm_df["rt_stop"] = self.psm_df.rt_stop / 60 + self._psm_df["rt"] = self._psm_df.rt / 60 + if "rt_start" in self._psm_df.columns: + self._psm_df["rt_start"] = self._psm_df.rt_start / 60 + self._psm_df["rt_stop"] = self._psm_df.rt_stop / 60 # elif self._engine_rt_unit == 'minute': # self.psm_df['rt_sec'] = self.psm_df.rt*60 - min_rt = self.psm_df.rt.min() - max_rt = self.psm_df.rt.max() + min_rt = self._psm_df.rt.min() + max_rt = self._psm_df.rt.max() if min_rt < 0: # iRT if min_rt < self._min_irt_value: min_rt = self._min_irt_value @@ -332,19 +332,19 @@ def _normalize_rt(self): elif not self._min_max_rt_norm: min_rt = 0 - self.psm_df["rt_norm"] = ( - (self.psm_df.rt - min_rt) / (max_rt - min_rt) + self._psm_df["rt_norm"] = ( + (self._psm_df.rt - min_rt) / (max_rt - min_rt) ).clip(0, 1) def normalize_rt_by_raw_name(self): if "rt" not in self.psm_df.columns: return - if "rt_norm" not in self.psm_df.columns: + if "rt_norm" not in self._psm_df.columns: self._normalize_rt() 
- if "raw_name" not in self.psm_df.columns: + if "raw_name" not in self._psm_df.columns: return - for _, df_group in self.psm_df.groupby("raw_name"): - self.psm_df.loc[df_group.index, "rt_norm"] = ( + for _, df_group in self._psm_df.groupby("raw_name"): + self._psm_df.loc[df_group.index, "rt_norm"] = ( df_group.rt_norm / df_group.rt_norm.max() ) diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index 2c1da227..27bc7d5f 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ -600,18 +600,18 @@ def _load_file(self, filename): raise NotImplementedError def _transform_table(self, origin_df): - self.psm_df["spec_idx"] = self.psm_df["scannr"].apply( + self._psm_df["spec_idx"] = self._psm_df["scannr"].apply( _sage_spec_idx_from_scan_nr ) - self.psm_df.drop(columns=["scannr"], inplace=True) + self._psm_df.drop(columns=["scannr"], inplace=True) def _translate_decoy(self, origin_df): if not self._keep_decoy: - self._psm_df = self.psm_df[~self.psm_df["decoy"]] + self._psm_df = self._psm_df[~self._psm_df["decoy"]] - self._psm_df = self.psm_df[self.psm_df["fdr"] <= self._keep_fdr] - self._psm_df = self.psm_df[self.psm_df["peptide_fdr"] <= self._keep_fdr] - self._psm_df = self.psm_df[self.psm_df["protein_fdr"] <= self._keep_fdr] + self._psm_df = self._psm_df[self._psm_df["fdr"] <= self._keep_fdr] + self._psm_df = self._psm_df[self._psm_df["peptide_fdr"] <= self._keep_fdr] + self._psm_df = self._psm_df[self._psm_df["protein_fdr"] <= self._keep_fdr] # drop peptide_fdr, protein_fdr self._psm_df.drop(columns=["peptide_fdr", "protein_fdr"], inplace=True) From a9c2d4684e017a0aab5101251aef286e61aebd41 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:41:47 +0100 Subject: [PATCH 02/11] simplify modification_mapping --- alphabase/psm_reader/maxquant_reader.py | 3 ++- alphabase/psm_reader/msfragger_reader.py | 3 --- alphabase/psm_reader/pfind_reader.py 
| 3 --- alphabase/psm_reader/psm_reader.py | 9 +++++++-- alphabase/psm_reader/sage_reader.py | 3 --- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 96c5a67c..5d92d14f 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -1,4 +1,5 @@ import copy +from typing import Optional import numba import numpy as np @@ -191,7 +192,7 @@ def _init_modification_mapping(self): psm_reader_yaml["maxquant"]["modification_mapping"] ) - def set_modification_mapping(self, modification_mapping: dict): + def set_modification_mapping(self, modification_mapping: Optional[dict] = None): super().set_modification_mapping(modification_mapping) self._add_all_unimod() self._extend_mod_brackets() diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index 6da1b713..e4d8c2f6 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -115,9 +115,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["msfragger_pepxml"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _translate_modifications(self): pass diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index 69e24549..e5398e13 100644 --- a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -104,9 +104,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["pfind"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _translate_modifications(self): pass diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index 7740828d..7c1160ac 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -1,6 +1,7 @@ import copy import os 
import warnings +from typing import Optional import numpy as np import pandas as pd @@ -153,7 +154,10 @@ def __init__( Defaults to False. """ - self.set_modification_mapping(None) + self.modification_mapping = None + self.rev_mod_mapping = None + + self.set_modification_mapping() self.add_modification_mapping(modification_mapping) if column_mapping is not None: @@ -210,7 +214,7 @@ def add_modification_mapping(self, modification_mapping: dict): self.set_modification_mapping(self.modification_mapping) - def set_modification_mapping(self, modification_mapping: dict): + def set_modification_mapping(self, modification_mapping: Optional[dict] = None): if modification_mapping is None: self._init_modification_mapping() elif isinstance(modification_mapping, str): @@ -224,6 +228,7 @@ def set_modification_mapping(self, modification_mapping: dict): ) else: self.modification_mapping = copy.deepcopy(modification_mapping) + self._mods_as_lists() self._reverse_mod_mapping() diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index 27bc7d5f..5c69b8fd 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ -593,9 +593,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["sage"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _load_file(self, filename): raise NotImplementedError From 98a2f834f6b933ef9ff57dcc21714838f74ce245 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:45:55 +0100 Subject: [PATCH 03/11] introduce PsmDfCols --- alphabase/psm_reader/alphapept_reader.py | 17 +++++---- alphabase/psm_reader/keys.py | 22 +++++++++++ alphabase/psm_reader/maxquant_reader.py | 7 ++-- alphabase/psm_reader/msfragger_reader.py | 32 ++++++++-------- alphabase/psm_reader/pfind_reader.py | 11 ++++-- alphabase/psm_reader/psm_reader.py | 47 ++++++++++++++---------- 
alphabase/psm_reader/sage_reader.py | 28 +++++++++----- 7 files changed, 105 insertions(+), 59 deletions(-) create mode 100644 alphabase/psm_reader/keys.py diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py index df59f8fa..a4f4be69 100644 --- a/alphabase/psm_reader/alphapept_reader.py +++ b/alphabase/psm_reader/alphapept_reader.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -90,18 +91,18 @@ def _load_file(self, filename): def _load_modifications(self, df: pd.DataFrame): if len(df) == 0: - self._psm_df["sequence"] = "" - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" - self._psm_df["decoy"] = 0 + self._psm_df[PsmDfCols.SEQUENCE] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" + self._psm_df[PsmDfCols.DECOY] = 0 return ( - self._psm_df["sequence"], - self._psm_df["mods"], - self._psm_df["mod_sites"], + self._psm_df[PsmDfCols.SEQUENCE], + self._psm_df[PsmDfCols.MODS], + self._psm_df[PsmDfCols.MOD_SITES], _charges, - self._psm_df["decoy"], + self._psm_df[PsmDfCols.DECOY], ) = zip(*df["precursor"].apply(parse_ap)) self._psm_df.decoy = self._psm_df.decoy.astype(np.int8) diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py new file mode 100644 index 00000000..65a2063e --- /dev/null +++ b/alphabase/psm_reader/keys.py @@ -0,0 +1,22 @@ +class PsmDfCols: + MOD_SITES = "mod_sites" + MODIFIED_SEQUENCE = "modified_sequence" + SEQUENCE = "sequence" + DECOY = "decoy" + MODS = "mods" + SCORE = "score" + TO_REMOVE = "to_remove" + AA_MASS_DIFFS = "aa_mass_diffs" + AA_MASS_DIFF_SITES = "aa_mass_diff_sites" + RT = "rt" + RT_START = "rt_start" + RT_STOP = "rt_stop" + RT_NORM = "rt_norm" + SPEC_IDX = "spec_idx" + SCANNR = "scannr" + FDR = "fdr" + NAA = "nAA" + CCS = "ccs" + MOBILITY = "mobility" + PEPTIDE_FDR = "peptide_fdr" + 
PROTEIN_FDR = "protein_fdr" diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 5d92d14f..776720a4 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -6,6 +6,7 @@ import pandas as pd from alphabase.constants.modification import MOD_DF +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -261,15 +262,15 @@ def _load_modifications(self, origin_df: pd.DataFrame): else: mod_sep = "()" - (seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip( + (seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip( *origin_df[self.mod_seq_column].apply( parse_mod_seq, mod_sep=mod_sep, fixed_C57=self.fixed_C57, ) ) - if "sequence" not in self._psm_df.columns: - self._psm_df["sequence"] = seqs + if PsmDfCols.SEQUENCE not in self._psm_df.columns: + self._psm_df[PsmDfCols.SEQUENCE] = seqs def register_readers(): diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index e4d8c2f6..274d1f67 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -5,6 +5,7 @@ from alphabase.constants.aa import AA_ASCII_MASS from alphabase.constants.atom import MASS_H, MASS_O from alphabase.constants.modification import MOD_MASS +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -129,31 +130,31 @@ def _load_file(self, filename): return msf_df def _translate_decoy(self, origin_df=None): - self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype( - np.int8 - ) + self._psm_df[PsmDfCols.DECOY] = self._psm_df.proteins.apply( + _is_fragger_decoy + ).astype(np.int8) self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x)) if not self._keep_decoy: - self._psm_df["to_remove"] += self._psm_df.decoy > 0 
+ self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df.decoy > 0 def _translate_score(self, origin_df=None): # evalue score - self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100) + self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100) def _load_modifications(self, msf_df): if len(msf_df) == 0: - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" - self._psm_df["aa_mass_diffs"] = "" - self._psm_df["aa_mass_diff_sites"] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" + self._psm_df[PsmDfCols.AA_MASS_DIFFS] = "" + self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = "" return ( - self._psm_df["mods"], - self._psm_df["mod_sites"], - self._psm_df["aa_mass_diffs"], - self._psm_df["aa_mass_diff_sites"], + self._psm_df[PsmDfCols.MODS], + self._psm_df[PsmDfCols.MOD_SITES], + self._psm_df[PsmDfCols.AA_MASS_DIFFS], + self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES], ) = zip( *msf_df[["peptide", "modifications"]].apply( lambda x: _get_mods_from_masses(*x), axis=1 @@ -161,9 +162,10 @@ def _load_modifications(self, msf_df): ) if not self.keep_unknown_aa_mass_diffs: - self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != "" + self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df.aa_mass_diffs != "" self._psm_df.drop( - columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True + columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES], + inplace=True, ) def _post_process(self, origin_df: pd.DataFrame): diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index e5398e13..94b1a686 100644 --- a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -2,6 +2,7 @@ import pandas as pd import alphabase.constants.modification as ap_mod +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -127,15 +128,17 @@ def _translate_score(self, origin_df=None): def 
_load_modifications(self, pfind_df): if len(pfind_df) == 0: - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" return - (self._psm_df["mods"], self._psm_df["mod_sites"]) = zip( + (self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip( *pfind_df["Modification"].apply(get_pFind_mods) ) - self._psm_df["mods"] = self._psm_df["mods"].apply(translate_pFind_mod) + self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( + translate_pFind_mod + ) def register_readers(): diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index 7c1160ac..e06dda8e 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -9,6 +9,7 @@ import alphabase.peptide.mobility as mobility from alphabase.constants._const import CONST_FILE_FOLDER from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz +from alphabase.psm_reader.keys import PsmDfCols from alphabase.utils import get_delimiter from alphabase.yaml_utils import load_yaml @@ -317,13 +318,13 @@ def _get_table_delimiter(self, _filename): return get_delimiter(_filename) def _normalize_rt(self): - if "rt" in self._psm_df.columns: + if PsmDfCols.RT in self._psm_df.columns: if self._engine_rt_unit == "second": # self.psm_df['rt_sec'] = self.psm_df.rt - self._psm_df["rt"] = self._psm_df.rt / 60 - if "rt_start" in self._psm_df.columns: - self._psm_df["rt_start"] = self._psm_df.rt_start / 60 - self._psm_df["rt_stop"] = self._psm_df.rt_stop / 60 + self._psm_df[PsmDfCols.RT] = self._psm_df.rt / 60 + if PsmDfCols.RT_START in self._psm_df.columns: + self._psm_df[PsmDfCols.RT_START] = self._psm_df.rt_start / 60 + self._psm_df[PsmDfCols.RT_STOP] = self._psm_df.rt_stop / 60 # elif self._engine_rt_unit == 'minute': # self.psm_df['rt_sec'] = self.psm_df.rt*60 min_rt = self._psm_df.rt.min() @@ -337,19 +338,19 @@ def _normalize_rt(self): elif not 
self._min_max_rt_norm: min_rt = 0 - self._psm_df["rt_norm"] = ( + self._psm_df[PsmDfCols.RT_NORM] = ( (self._psm_df.rt - min_rt) / (max_rt - min_rt) ).clip(0, 1) def normalize_rt_by_raw_name(self): if "rt" not in self.psm_df.columns: return - if "rt_norm" not in self._psm_df.columns: + if PsmDfCols.RT_NORM not in self._psm_df.columns: self._normalize_rt() if "raw_name" not in self._psm_df.columns: return for _, df_group in self._psm_df.groupby("raw_name"): - self._psm_df.loc[df_group.index, "rt_norm"] = ( + self._psm_df.loc[df_group.index, PsmDfCols.RT_NORM] = ( df_group.rt_norm / df_group.rt_norm.max() ) @@ -410,9 +411,9 @@ def _translate_columns(self, origin_df: pd.DataFrame): if ( "scan_num" in self._psm_df.columns - and "spec_idx" not in self._psm_df.columns + and PsmDfCols.SPEC_IDX not in self._psm_df.columns ): - self._psm_df["spec_idx"] = self._psm_df.scan_num - 1 + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df.scan_num - 1 def _transform_table(self, origin_df: pd.DataFrame): """ @@ -484,16 +485,16 @@ def _post_process(self, origin_df: pd.DataFrame): origin_df : pd.DataFrame the loaded original df """ - self._psm_df["nAA"] = self._psm_df.sequence.str.len() + self._psm_df[PsmDfCols.NAA] = self._psm_df.sequence.str.len() self.normalize_rt_by_raw_name() - self._psm_df = self._psm_df[~self._psm_df["mods"].isna()] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()] keep_rows = np.ones(len(self._psm_df), dtype=bool) - if "fdr" in self._psm_df.columns: + if PsmDfCols.FDR in self._psm_df.columns: keep_rows &= self._psm_df.fdr <= self._keep_fdr - if "decoy" in self._psm_df.columns and not self._keep_decoy: + if PsmDfCols.DECOY in self._psm_df.columns and not self._keep_decoy: keep_rows &= self._psm_df.decoy == 0 self._psm_df = self._psm_df[keep_rows] @@ -503,13 +504,19 @@ def _post_process(self, origin_df: pd.DataFrame): if "precursor_mz" not in self._psm_df: self._psm_df = update_precursor_mz(self._psm_df) - if "ccs" in self._psm_df.columns and 
"mobility" not in self._psm_df.columns: - self._psm_df["mobility"] = mobility.ccs_to_mobility_for_df( - self._psm_df, "ccs" + if ( + PsmDfCols.CCS in self._psm_df.columns + and PsmDfCols.MOBILITY not in self._psm_df.columns + ): + self._psm_df[PsmDfCols.MOBILITY] = mobility.ccs_to_mobility_for_df( + self._psm_df, PsmDfCols.CCS ) - elif "mobility" in self._psm_df.columns and "ccs" not in self._psm_df.columns: - self._psm_df["ccs"] = mobility.mobility_to_ccs_for_df( - self._psm_df, "mobility" + elif ( + PsmDfCols.MOBILITY in self._psm_df.columns + and PsmDfCols.CCS not in self._psm_df.columns + ): + self._psm_df[PsmDfCols.CCS] = mobility.mobility_to_ccs_for_df( + self._psm_df, PsmDfCols.MOBILITY ) def filter_psm_by_modifications( diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index 5c69b8fd..fe855d9a 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ -9,6 +9,7 @@ from tqdm import tqdm from alphabase.constants.modification import MOD_DF +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -94,7 +95,7 @@ def __call__(self, psm_df: pd.DataFrame) -> pd.DataFrame: translated_psm_df = _apply_translate_modifications_mp(psm_df, translation_df) # 5. 
Drop PSMs with missing modifications - is_null = translated_psm_df["mod_sites"].isnull() + is_null = translated_psm_df[PsmDfCols.MOD_SITES].isnull() translated_psm_df = translated_psm_df[~is_null] if np.sum(is_null) > 0: logging.warning( @@ -217,7 +218,10 @@ def _discover_modifications(psm_df: pd.DataFrame) -> pd.DataFrame: """ modifications = ( - psm_df["modified_sequence"].apply(_match_modified_sequence).explode().unique() + psm_df[PsmDfCols.MODIFIED_SEQUENCE] + .apply(_match_modified_sequence) + .explode() + .unique() ) modifications = modifications[~pd.isnull(modifications)] return pd.DataFrame( @@ -597,21 +601,27 @@ def _load_file(self, filename): raise NotImplementedError def _transform_table(self, origin_df): - self._psm_df["spec_idx"] = self._psm_df["scannr"].apply( + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCANNR].apply( _sage_spec_idx_from_scan_nr ) - self._psm_df.drop(columns=["scannr"], inplace=True) + self._psm_df.drop(columns=[PsmDfCols.SCANNR], inplace=True) def _translate_decoy(self, origin_df): if not self._keep_decoy: - self._psm_df = self._psm_df[~self._psm_df["decoy"]] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.DECOY]] - self._psm_df = self._psm_df[self._psm_df["fdr"] <= self._keep_fdr] - self._psm_df = self._psm_df[self._psm_df["peptide_fdr"] <= self._keep_fdr] - self._psm_df = self._psm_df[self._psm_df["protein_fdr"] <= self._keep_fdr] + self._psm_df = self._psm_df[self._psm_df[PsmDfCols.FDR] <= self._keep_fdr] + self._psm_df = self._psm_df[ + self._psm_df[PsmDfCols.PEPTIDE_FDR] <= self._keep_fdr + ] + self._psm_df = self._psm_df[ + self._psm_df[PsmDfCols.PROTEIN_FDR] <= self._keep_fdr + ] # drop peptide_fdr, protein_fdr - self._psm_df.drop(columns=["peptide_fdr", "protein_fdr"], inplace=True) + self._psm_df.drop( + columns=[PsmDfCols.PEPTIDE_FDR, PsmDfCols.PROTEIN_FDR], inplace=True + ) def _load_modifications(self, origin_df): pass From a4103b0911fea29d77556505a09e384678033c5d Mon Sep 17 00:00:00 2001 From: 
mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:50:34 +0100 Subject: [PATCH 04/11] more use of PsmDfCols --- alphabase/psm_reader/alphapept_reader.py | 2 +- alphabase/psm_reader/maxquant_reader.py | 6 ++++-- alphabase/psm_reader/msfragger_reader.py | 2 +- alphabase/psm_reader/pfind_reader.py | 4 +++- alphabase/psm_reader/psm_reader.py | 4 ++-- alphabase/psm_reader/sage_reader.py | 20 ++++++++++---------- 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py index a4f4be69..b6d0b9da 100644 --- a/alphabase/psm_reader/alphapept_reader.py +++ b/alphabase/psm_reader/alphapept_reader.py @@ -104,7 +104,7 @@ def _load_modifications(self, df: pd.DataFrame): _charges, self._psm_df[PsmDfCols.DECOY], ) = zip(*df["precursor"].apply(parse_ap)) - self._psm_df.decoy = self._psm_df.decoy.astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8) def register_readers(): diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 776720a4..1a62c392 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -235,8 +235,10 @@ def _extend_mod_brackets(self): self.modification_mapping[key] = list(mod_set) def _translate_decoy(self, origin_df=None): - if "decoy" in self._psm_df.columns: - self._psm_df.decoy = (self._psm_df.decoy == "-").astype(np.int8) + if PsmDfCols.DECOY in self._psm_df.columns: + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.DECOY] == "-" + ).astype(np.int8) def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["maxquant"]["column_mapping"] diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index 274d1f67..5fd45be4 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -136,7 +136,7 @@ def 
_translate_decoy(self, origin_df=None): self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x)) if not self._keep_decoy: - self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df.decoy > 0 + self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0 def _translate_score(self, origin_df=None): # evalue score diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index 94b1a686..03bde46a 100644 --- a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -121,7 +121,9 @@ def _load_file(self, filename): return pfind_df def _translate_decoy(self, origin_df=None): - self._psm_df.decoy = (self._psm_df.decoy == "decoy").astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.DECOY] == "decoy" + ).astype(np.int8) def _translate_score(self, origin_df=None): self._psm_df.score = -np.log(self._psm_df.score.astype(float) + 1e-100) diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index e06dda8e..7322f800 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -485,7 +485,7 @@ def _post_process(self, origin_df: pd.DataFrame): origin_df : pd.DataFrame the loaded original df """ - self._psm_df[PsmDfCols.NAA] = self._psm_df.sequence.str.len() + self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len() self.normalize_rt_by_raw_name() @@ -495,7 +495,7 @@ def _post_process(self, origin_df: pd.DataFrame): if PsmDfCols.FDR in self._psm_df.columns: keep_rows &= self._psm_df.fdr <= self._keep_fdr if PsmDfCols.DECOY in self._psm_df.columns and not self._keep_decoy: - keep_rows &= self._psm_df.decoy == 0 + keep_rows &= self._psm_df[PsmDfCols.DECOY] == 0 self._psm_df = self._psm_df[keep_rows] diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index fe855d9a..c4a8cebc 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ 
-418,14 +418,14 @@ def _translate_modifications( def _apply_translate_modifications( - df: pd.DataFrame, mod_translation_df: pd.DataFrame + psm_df: pd.DataFrame, mod_translation_df: pd.DataFrame ) -> pd.DataFrame: """Apply the translation of modifications to the PSMs. Parameters ---------- - df : pd.DataFrame + psm_df : pd.DataFrame The PSM dataframe with column 'modified_sequence'. mod_translation_df : pd.DataFrame @@ -439,12 +439,12 @@ def _apply_translate_modifications( """ - df["mod_sites"], df["mods"] = zip( - *df["modified_sequence"].apply( + psm_df[PsmDfCols.MOD_SITES], psm_df[PsmDfCols.MODS] = zip( + *psm_df[PsmDfCols.MODIFIED_SEQUENCE].apply( lambda x: _translate_modifications(x, mod_translation_df) ) ) - return df + return psm_df def _batchify_df(df: pd.DataFrame, mp_batch_size: int) -> typing.Generator: @@ -470,7 +470,7 @@ def _batchify_df(df: pd.DataFrame, mp_batch_size: int) -> typing.Generator: def _apply_translate_modifications_mp( - df: pd.DataFrame, + psm_df: pd.DataFrame, mod_translation_df: pd.DataFrame, mp_batch_size: int = 50000, mp_process_num: int = 10, @@ -481,7 +481,7 @@ def _apply_translate_modifications_mp( Parameters ---------- - df : pd.DataFrame + psm_df : pd.DataFrame The PSM dataframe. 
mod_translation_df : pd.DataFrame @@ -500,11 +500,11 @@ def _apply_translate_modifications_mp( partial( _apply_translate_modifications, mod_translation_df=mod_translation_df ), - _batchify_df(df, mp_batch_size), + _batchify_df(psm_df, mp_batch_size), ) if progress_bar: df_list = list( - tqdm(processing, total=int(np.ceil(len(df) / mp_batch_size))) + tqdm(processing, total=int(np.ceil(len(psm_df) / mp_batch_size))) ) else: df_list = list(processing) @@ -634,7 +634,7 @@ def _translate_modifications(self): self._psm_df = sage_translation(self._psm_df) # drop modified_sequence - self._psm_df.drop(columns=["modified_sequence"], inplace=True) + self._psm_df.drop(columns=[PsmDfCols.MODIFIED_SEQUENCE], inplace=True) class SageReaderTSV(SageReaderBase): From 82fa443fa9592c1955e84eb79959fafa0f8d2a49 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:56:44 +0100 Subject: [PATCH 05/11] more use of PsmDfCols --- alphabase/psm_reader/dia_psm_reader.py | 5 +++- alphabase/psm_reader/msfragger_reader.py | 12 ++++++---- alphabase/psm_reader/pfind_reader.py | 4 +++- alphabase/psm_reader/psm_reader.py | 30 ++++++++++++++---------- 4 files changed, 31 insertions(+), 20 deletions(-) diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py index 310f4485..b6fb4859 100644 --- a/alphabase/psm_reader/dia_psm_reader.py +++ b/alphabase/psm_reader/dia_psm_reader.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml @@ -127,7 +128,9 @@ def _load_file(self, filename): def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) - self._psm_df.rename(columns={"spec_idx": "diann_spec_idx"}, inplace=True) + self._psm_df.rename( + columns={PsmDfCols.SPEC_IDX: "diann_spec_idx"}, 
inplace=True + ) class SpectronautReportReader(MaxQuantReader): diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index 5fd45be4..715f3c01 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -125,8 +125,8 @@ def _load_file(self, filename): if "ion_mobility" in msf_df.columns: msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float) msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) - msf_df["to_remove"] = 0 - self.column_mapping["to_remove"] = "to_remove" + msf_df["to_remove"] = 0 # TODO revisit + self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove" return msf_df def _translate_decoy(self, origin_df=None): @@ -162,7 +162,9 @@ def _load_modifications(self, msf_df): ) if not self.keep_unknown_aa_mass_diffs: - self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df.aa_mass_diffs != "" + self._psm_df[PsmDfCols.TO_REMOVE] += ( + self._psm_df[PsmDfCols.AA_MASS_DIFFS] != "" + ) self._psm_df.drop( columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES], inplace=True, @@ -171,8 +173,8 @@ def _load_modifications(self, msf_df): def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) self._psm_df = ( - self._psm_df.query("to_remove==0") - .drop(columns="to_remove") + self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0") + .drop(columns=PsmDfCols.TO_REMOVE) .reset_index(drop=True) ) diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index 03bde46a..40f849fa 100644 --- a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -126,7 +126,9 @@ def _translate_decoy(self, origin_df=None): ).astype(np.int8) def _translate_score(self, origin_df=None): - self._psm_df.score = -np.log(self._psm_df.score.astype(float) + 1e-100) + self._psm_df[PsmDfCols.SCORE] = -np.log( + self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100 + ) def _load_modifications(self, pfind_df): if 
len(pfind_df) == 0: diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index 7322f800..56323e8a 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -150,7 +150,7 @@ def __init__( If keep decoy PSMs in self.psm_df. _min_max_rt_norm : bool if True, the 'rt_norm' values in self._psm_df - will be normalized by rt_norm = (self.psm_df.rt-rt_min)/(rt_max-rt_min). + will be normalized by rt_norm = (self.psm_df[PsmDfCols.RT]-rt_min)/(rt_max-rt_min). It is useful to normalize iRT values as they contain negative values. Defaults to False. """ @@ -321,14 +321,18 @@ def _normalize_rt(self): if PsmDfCols.RT in self._psm_df.columns: if self._engine_rt_unit == "second": # self.psm_df['rt_sec'] = self.psm_df.rt - self._psm_df[PsmDfCols.RT] = self._psm_df.rt / 60 + self._psm_df[PsmDfCols.RT] = self._psm_df[PsmDfCols.RT] / 60 if PsmDfCols.RT_START in self._psm_df.columns: - self._psm_df[PsmDfCols.RT_START] = self._psm_df.rt_start / 60 - self._psm_df[PsmDfCols.RT_STOP] = self._psm_df.rt_stop / 60 + self._psm_df[PsmDfCols.RT_START] = ( + self._psm_df[PsmDfCols.RT_START] / 60 + ) + self._psm_df[PsmDfCols.RT_STOP] = ( + self._psm_df[PsmDfCols.RT_STOP] / 60 + ) # elif self._engine_rt_unit == 'minute': # self.psm_df['rt_sec'] = self.psm_df.rt*60 - min_rt = self._psm_df.rt.min() - max_rt = self._psm_df.rt.max() + min_rt = self._psm_df[PsmDfCols.RT].min() + max_rt = self._psm_df[PsmDfCols.RT].max() if min_rt < 0: # iRT if min_rt < self._min_irt_value: min_rt = self._min_irt_value @@ -339,7 +343,7 @@ def _normalize_rt(self): min_rt = 0 self._psm_df[PsmDfCols.RT_NORM] = ( - (self._psm_df.rt - min_rt) / (max_rt - min_rt) + (self._psm_df[PsmDfCols.RT] - min_rt) / (max_rt - min_rt) ).clip(0, 1) def normalize_rt_by_raw_name(self): @@ -351,7 +355,7 @@ def normalize_rt_by_raw_name(self): return for _, df_group in self._psm_df.groupby("raw_name"): self._psm_df.loc[df_group.index, PsmDfCols.RT_NORM] = ( - df_group.rt_norm / 
df_group.rt_norm.max() + df_group[PsmDfCols.RT_NORM] / df_group[PsmDfCols.RT_NORM].max() ) def _load_file(self, filename: str) -> pd.DataFrame: @@ -457,8 +461,8 @@ def _translate_modifications(self): not in `self.modification_mapping` """ - self._psm_df.mods, unknown_mods = zip( - *self._psm_df.mods.apply( + self._psm_df[PsmDfCols.MODS], unknown_mods = zip( + *self._psm_df[PsmDfCols.MODS].apply( translate_other_modification, mod_dict=self.rev_mod_mapping ) ) @@ -493,7 +497,7 @@ def _post_process(self, origin_df: pd.DataFrame): keep_rows = np.ones(len(self._psm_df), dtype=bool) if PsmDfCols.FDR in self._psm_df.columns: - keep_rows &= self._psm_df.fdr <= self._keep_fdr + keep_rows &= self._psm_df[PsmDfCols.FDR] <= self._keep_fdr if PsmDfCols.DECOY in self._psm_df.columns and not self._keep_decoy: keep_rows &= self._psm_df[PsmDfCols.DECOY] == 0 @@ -536,11 +540,11 @@ def filter_psm_by_modifications( "Acetyl@Protein_N-term", ] ) - self._psm_df.mods = self._psm_df.mods.apply( + self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( _keep_modifications, mod_set=include_mod_set ) - self._psm_df.dropna(subset=["mods"], inplace=True) + self._psm_df.dropna(subset=[PsmDfCols.MODS], inplace=True) self._psm_df.reset_index(drop=True, inplace=True) From 297aef0da57a8fc3f7b74e4b80053d3c1732c737 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 13 Nov 2024 08:15:56 +0100 Subject: [PATCH 06/11] more use of PsmDfCols --- alphabase/psm_reader/keys.py | 33 +++++++++++++++++++++++- alphabase/psm_reader/msfragger_reader.py | 14 ++++++---- alphabase/spectral_library/reader.py | 17 +++++++----- tests/integration/test_psm_readers.py | 6 +++++ 4 files changed, 57 insertions(+), 13 deletions(-) diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py index 65a2063e..3cf2efdb 100644 --- a/alphabase/psm_reader/keys.py +++ b/alphabase/psm_reader/keys.py @@ -1,4 +1,19 @@ -class PsmDfCols: +class ConstantsClass(type): + 
"""A metaclass for classes that should only contain string constants.""" + + def __setattr__(self, name, value): + raise TypeError("Constants class cannot be modified") + + def get_values(cls): + """Get all user-defined string values of the class.""" + return [ + value + for key, value in cls.__dict__.items() + if not key.startswith("__") and isinstance(value, str) + ] + + +class PsmDfCols(metaclass=ConstantsClass): MOD_SITES = "mod_sites" MODIFIED_SEQUENCE = "modified_sequence" SEQUENCE = "sequence" @@ -20,3 +35,19 @@ class PsmDfCols: MOBILITY = "mobility" PEPTIDE_FDR = "peptide_fdr" PROTEIN_FDR = "protein_fdr" + + RAW_NAME = "raw_name" + CHARGE = "charge" + PROTEINS = "proteins" + + SCAN_NUM = "scan_num" + PRECURSOR_MZ = "precursor_mz" + DIANN_SPEC_INDEX = "diann_spec_idx" + + FRAG_START_IDX = "frag_start_idx" + FRAG_STOP_IDX = "frag_stop_idx" + + # part of the output, but not directly referenced + _UNIPROT_IDS = "uniprot_ids" + _GENES = "genes" + _QUERY_ID = "query_id" diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index 715f3c01..22301b94 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -124,17 +124,21 @@ def _load_file(self, filename): msf_df.fillna("", inplace=True) if "ion_mobility" in msf_df.columns: msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float) - msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) + msf_df[PsmDfCols.RAW_NAME] = ( + msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) + ) msf_df["to_remove"] = 0 # TODO revisit self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove" return msf_df def _translate_decoy(self, origin_df=None): - self._psm_df[PsmDfCols.DECOY] = self._psm_df.proteins.apply( - _is_fragger_decoy - ).astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.PROTEINS].apply(_is_fragger_decoy).astype(np.int8) + ) - self._psm_df.proteins = self._psm_df.proteins.apply(lambda 
x: ";".join(x)) + self._psm_df[PsmDfCols.PROTEINS] = self._psm_df[PsmDfCols.PROTEINS].apply( + lambda x: ";".join(x) + ) if not self._keep_decoy: self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0 diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index 151f314b..fd3e2b9f 100644 --- a/alphabase/spectral_library/reader.py +++ b/alphabase/spectral_library/reader.py @@ -7,6 +7,7 @@ from alphabase.constants._const import PEAK_INTENSITY_DTYPE from alphabase.peptide.mobility import mobility_to_ccs_for_df from alphabase.psm_reader import psm_reader_provider +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_yaml from alphabase.spectral_library.base import SpecLibBase @@ -175,7 +176,7 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): for keys, df_group in tqdm(lib_df.groupby(non_fragment_columns)): precursor_columns = dict(zip(non_fragment_columns, keys)) - nAA = len(precursor_columns["sequence"]) + nAA = len(precursor_columns[PsmDfCols.SEQUENCE]) intens = np.zeros( (nAA - 1, len(self.charged_frag_types)), @@ -286,7 +287,7 @@ def _post_process( # identify unknown modifications len_before = len(self._psm_df) - self._psm_df = self._psm_df[~self._psm_df["mods"].isna()] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()] len_after = len(self._psm_df) if len_before != len_after: @@ -294,17 +295,19 @@ def _post_process( f"{len_before-len_after} Entries with unknown modifications are removed" ) - if "nAA" not in self._psm_df.columns: - self._psm_df["nAA"] = self._psm_df.sequence.str.len() + if PsmDfCols.NAA not in self._psm_df.columns: + self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len() self._psm_df = self._get_fragment_intensity(self._psm_df) self.normalize_rt_by_raw_name() - if "mobility" in self._psm_df.columns: - self._psm_df["ccs"] = 
mobility_to_ccs_for_df(self._psm_df, "mobility") + if PsmDfCols.MOBILITY in self._psm_df.columns: + self._psm_df[PsmDfCols.CCS] = mobility_to_ccs_for_df( + self._psm_df, PsmDfCols.MOBILITY + ) - self._psm_df.drop("modified_sequence", axis=1, inplace=True) + self._psm_df.drop(PsmDfCols.MODIFIED_SEQUENCE, axis=1, inplace=True) self._precursor_df = self._psm_df self.calc_fragment_mz_df() diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py index 6c46167d..5ed7b593 100644 --- a/tests/integration/test_psm_readers.py +++ b/tests/integration/test_psm_readers.py @@ -20,6 +20,7 @@ SwathReader, pFindReader, ) +from alphabase.psm_reader.keys import PsmDfCols from alphabase.spectral_library.reader import LibraryReaderBase current_file_directory = os.path.dirname(os.path.abspath(__file__)) @@ -34,6 +35,11 @@ def _assert_reference_df_equal(psm_df: pd.DataFrame, test_case_name: str) -> Non If reference is not present, save the output as reference data and raise. """ out_file_path = test_data_path / f"reference_{test_case_name}.parquet" + # psm_df.to_csv(test_data_path / f"reference_{test_case_name}.csv") + + # check that all columns are available in PsmDfCols + assert set(psm_df.columns) - set(PsmDfCols.get_values()) == set() + if out_file_path.exists(): expected_df = pd.read_parquet(out_file_path) From b760d509a0a4ae8751f707ce923957d0dae48e56 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 13 Nov 2024 08:17:02 +0100 Subject: [PATCH 07/11] more use of PsmDfCols --- alphabase/psm_reader/alphapept_reader.py | 4 ++-- alphabase/psm_reader/dia_psm_reader.py | 10 +++++----- alphabase/psm_reader/pfind_reader.py | 2 +- alphabase/psm_reader/psm_reader.py | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py index b6d0b9da..48f5b158 100644 --- a/alphabase/psm_reader/alphapept_reader.py +++ 
b/alphabase/psm_reader/alphapept_reader.py @@ -80,13 +80,13 @@ def _load_file(self, filename): with h5py.File(filename, "r") as _hdf: dataset = _hdf[self.hdf_dataset] df = pd.DataFrame({col: dataset[col] for col in dataset}) - df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")] + df[PsmDfCols.RAW_NAME] = os.path.basename(filename)[: -len(".ms_data.hdf")] df["precursor"] = df["precursor"].str.decode("utf-8") # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8') if "scan_no" in df.columns: df["scan_no"] = df["scan_no"].astype("int") df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx - df["charge"] = df["charge"].astype(int) + df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int) return df def _load_modifications(self, df: pd.DataFrame): diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py index b6fb4859..5db0b97e 100644 --- a/alphabase/psm_reader/dia_psm_reader.py +++ b/alphabase/psm_reader/dia_psm_reader.py @@ -129,7 +129,7 @@ def _load_file(self, filename): def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) self._psm_df.rename( - columns={PsmDfCols.SPEC_IDX: "diann_spec_idx"}, inplace=True + columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True ) @@ -177,10 +177,10 @@ def _load_file(self, filename): self.mod_seq_column = "ModifiedSequence" self.csv_sep = self._get_table_delimiter(filename) df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False) - df[[self.mod_seq_column, "charge"]] = df[self.precursor_column].str.split( - ".", expand=True, n=2 - ) - df["charge"] = df.charge.astype(np.int8) + df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[ + self.precursor_column + ].str.split(".", expand=True, n=2) + df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8) return df diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index 40f849fa..a182a8c9 100644 --- 
a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -114,7 +114,7 @@ def _load_file(self, filename): ) pfind_df.fillna("", inplace=True) pfind_df = pfind_df[pfind_df.Sequence != ""] - pfind_df["raw_name"] = ( + pfind_df[PsmDfCols.RAW_NAME] = ( pfind_df["File_Name"].str.split(".").apply(lambda x: x[0]) ) pfind_df["Proteins"] = pfind_df["Proteins"].apply(parse_pfind_protein) diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index 56323e8a..381b5b81 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -347,13 +347,13 @@ def _normalize_rt(self): ).clip(0, 1) def normalize_rt_by_raw_name(self): - if "rt" not in self.psm_df.columns: + if PsmDfCols.RT not in self._psm_df.columns: return if PsmDfCols.RT_NORM not in self._psm_df.columns: self._normalize_rt() - if "raw_name" not in self._psm_df.columns: + if PsmDfCols.RAW_NAME not in self._psm_df.columns: return - for _, df_group in self._psm_df.groupby("raw_name"): + for _, df_group in self._psm_df.groupby(PsmDfCols.RAW_NAME): self._psm_df.loc[df_group.index, PsmDfCols.RT_NORM] = ( df_group[PsmDfCols.RT_NORM] / df_group[PsmDfCols.RT_NORM].max() ) @@ -414,10 +414,10 @@ def _translate_columns(self, origin_df: pd.DataFrame): self._psm_df[col] = origin_df[map_col] if ( - "scan_num" in self._psm_df.columns + PsmDfCols.SCAN_NUM in self._psm_df.columns and PsmDfCols.SPEC_IDX not in self._psm_df.columns ): - self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df.scan_num - 1 + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCAN_NUM] - 1 def _transform_table(self, origin_df: pd.DataFrame): """ @@ -505,7 +505,7 @@ def _post_process(self, origin_df: pd.DataFrame): reset_precursor_df(self._psm_df) - if "precursor_mz" not in self._psm_df: + if PsmDfCols.PRECURSOR_MZ not in self._psm_df: self._psm_df = update_precursor_mz(self._psm_df) if ( From 41874d96335ac925c77c5e443d08079ea0ba2c6b Mon Sep 17 00:00:00 2001 From: mschwoerer 
<82171591+mschwoer@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:08:40 +0100 Subject: [PATCH 08/11] add LibPsmDfCols --- alphabase/psm_reader/keys.py | 25 ++++++++++++++-- alphabase/spectral_library/reader.py | 42 ++++++++++++++------------- tests/integration/test_psm_readers.py | 13 ++++++++- 3 files changed, 56 insertions(+), 24 deletions(-) diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py index 3cf2efdb..9980abd5 100644 --- a/alphabase/psm_reader/keys.py +++ b/alphabase/psm_reader/keys.py @@ -14,6 +14,9 @@ def get_values(cls): class PsmDfCols(metaclass=ConstantsClass): + """Constants for accessing the columns of a PSM dataframe.""" + + # TODO: these are used only in the psm_reader package and the spectral_library.reader module so far MOD_SITES = "mod_sites" MODIFIED_SEQUENCE = "modified_sequence" SEQUENCE = "sequence" @@ -44,10 +47,26 @@ class PsmDfCols(metaclass=ConstantsClass): PRECURSOR_MZ = "precursor_mz" DIANN_SPEC_INDEX = "diann_spec_idx" - FRAG_START_IDX = "frag_start_idx" - FRAG_STOP_IDX = "frag_stop_idx" - # part of the output, but not directly referenced _UNIPROT_IDS = "uniprot_ids" _GENES = "genes" _QUERY_ID = "query_id" + + # part of psm_reader_yaml, but not directly referenced + _INTENSITY = "intensity" + + +class LibPsmDfCols(metaclass=ConstantsClass): + """Constants for accessing the columns of a Library PSM dataframe.""" + + # only used for reader_type=library_reader_base + FRAG_START_IDX = "frag_start_idx" + FRAG_STOP_IDX = "frag_stop_idx" + + # only used for reader_type=library_reader_base, not referenced in reader classes + FRAGMENT_INTENSITY = "fragment_intensity" + FRAGMENT_MZ = "fragment_mz" + FRAGMENT_TYPE = "fragment_type" + FRAGMENT_CHARGE = "fragment_charge" + FRAGMENT_SERIES = "fragment_series" + FRAGMENT_LOSS_TYPE = "fragment_loss_type" diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index fd3e2b9f..ea648a29 100644 --- a/alphabase/spectral_library/reader.py +++
b/alphabase/spectral_library/reader.py @@ -7,7 +7,7 @@ from alphabase.constants._const import PEAK_INTENSITY_DTYPE from alphabase.peptide.mobility import mobility_to_ccs_for_df from alphabase.psm_reader import psm_reader_provider -from alphabase.psm_reader.keys import PsmDfCols +from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_yaml from alphabase.spectral_library.base import SpecLibBase @@ -116,19 +116,21 @@ def _find_key_columns(self, lib_df: pd.DataFrame): Dataframe containing the spectral library. """ - if "fragment_loss_type" not in lib_df.columns: - lib_df["fragment_loss_type"] = "" + if LibPsmDfCols.FRAGMENT_LOSS_TYPE not in lib_df.columns: + lib_df[LibPsmDfCols.FRAGMENT_LOSS_TYPE] = "" - lib_df.fillna({"fragment_loss_type": ""}, inplace=True) + lib_df.fillna({LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, inplace=True) lib_df.replace( - {"fragment_loss_type": "noloss"}, {"fragment_loss_type": ""}, inplace=True + {LibPsmDfCols.FRAGMENT_LOSS_TYPE: "noloss"}, + {LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, + inplace=True, ) - if "mods" not in lib_df.columns: - lib_df["mods"] = "" + if PsmDfCols.MODS not in lib_df.columns: + lib_df[PsmDfCols.MODS] = "" - if "mod_sites" not in lib_df.columns: - lib_df["mod_sites"] = "" + if PsmDfCols.MOD_SITES not in lib_df.columns: + lib_df[PsmDfCols.MOD_SITES] = "" def _get_fragment_intensity(self, lib_df: pd.DataFrame): """ @@ -162,12 +164,12 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): nAA_list = [] fragment_columns = [ - "fragment_mz", - "fragment_type", - "fragment_charge", - "fragment_series", - "fragment_loss_type", - "fragment_intensity", + LibPsmDfCols.FRAGMENT_MZ, + LibPsmDfCols.FRAGMENT_TYPE, + LibPsmDfCols.FRAGMENT_CHARGE, + LibPsmDfCols.FRAGMENT_SERIES, + LibPsmDfCols.FRAGMENT_LOSS_TYPE, + LibPsmDfCols.FRAGMENT_INTENSITY, ] # by default, all non-fragment columns are used to group the 
library @@ -184,11 +186,11 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): ) for frag_type, frag_num, loss_type, frag_charge, inten in df_group[ [ - "fragment_type", - "fragment_series", - "fragment_loss_type", - "fragment_charge", - "fragment_intensity", + LibPsmDfCols.FRAGMENT_TYPE, + LibPsmDfCols.FRAGMENT_SERIES, + LibPsmDfCols.FRAGMENT_LOSS_TYPE, + LibPsmDfCols.FRAGMENT_CHARGE, + LibPsmDfCols.FRAGMENT_INTENSITY, ] ].values: if frag_type in "abc": diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py index 5ed7b593..62c760cd 100644 --- a/tests/integration/test_psm_readers.py +++ b/tests/integration/test_psm_readers.py @@ -19,8 +19,9 @@ SpectronautReportReader, SwathReader, pFindReader, + psm_reader_yaml, ) -from alphabase.psm_reader.keys import PsmDfCols +from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols from alphabase.spectral_library.reader import LibraryReaderBase current_file_directory = os.path.dirname(os.path.abspath(__file__)) @@ -49,6 +50,16 @@ def _assert_reference_df_equal(psm_df: pd.DataFrame, test_case_name: str) -> Non raise ValueError("No reference data found.") +def test_psm_reader_yaml() -> None: + """Test that all column mappings in the psm_reader.yaml are covered by string constant keys.""" + for reader_config in psm_reader_yaml.values(): + ks = [k for k in reader_config["column_mapping"]] + assert ( + set(ks) - set(PsmDfCols.get_values()) - set(LibPsmDfCols.get_values()) + == set() + ) + + def test_maxquant_reader() -> None: """Test the MaxQuant reader.""" From 70ceaa1a89f45d464e96a61e91d1b88003a9a822 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:15:52 +0100 Subject: [PATCH 09/11] fix tests --- alphabase/psm_reader/keys.py | 3 +-- tests/integration/test_psm_readers.py | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py index 
9980abd5..4b4449e9 100644 --- a/alphabase/psm_reader/keys.py +++ b/alphabase/psm_reader/keys.py @@ -59,11 +59,10 @@ class PsmDfCols(metaclass=ConstantsClass): class LibPsmDfCols(metaclass=ConstantsClass): """Constants for accessing the columns of a Library PSM dataframe.""" - # only used for reader_type=library_reader_base FRAG_START_IDX = "frag_start_idx" FRAG_STOP_IDX = "frag_stop_idx" - # only used for reader_type=library_reader_base, not referenced in reader classes + # not referenced in reader classes FRAGMENT_INTENSITY = "fragment_intensity" FRAGMENT_MZ = "fragment_mz" FRAGMENT_TYPE = "fragment_type" diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py index 62c760cd..2b4dfcd9 100644 --- a/tests/integration/test_psm_readers.py +++ b/tests/integration/test_psm_readers.py @@ -39,7 +39,12 @@ def _assert_reference_df_equal(psm_df: pd.DataFrame, test_case_name: str) -> Non # psm_df.to_csv(test_data_path / f"reference_{test_case_name}.csv") # check that all columns are available in PsmDfCols - assert set(psm_df.columns) - set(PsmDfCols.get_values()) == set() + assert ( + set(psm_df.columns) + - set(PsmDfCols.get_values()) + - set(LibPsmDfCols.get_values()) + == set() + ) if out_file_path.exists(): expected_df = pd.read_parquet(out_file_path) From b1e03c804b28cee890a875e2279ab07b0bd42cef Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Thu, 14 Nov 2024 17:33:15 +0100 Subject: [PATCH 10/11] more use of PsmDfCols --- alphabase/spectral_library/reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index ea648a29..ca5fc761 100644 --- a/alphabase/spectral_library/reader.py +++ b/alphabase/spectral_library/reader.py @@ -236,8 +236,8 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): indices[1:] = np.array(nAA_list) - 1 indices = np.cumsum(indices) - df["frag_start_idx"] = 
indices[:-1] - df["frag_stop_idx"] = indices[1:] + df[LibPsmDfCols.FRAG_START_IDX] = indices[:-1] + df[LibPsmDfCols.FRAG_STOP_IDX] = indices[1:] return df From a711aff65659688e5c3831456c4cd744b41ed806 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Fri, 15 Nov 2024 09:51:23 +0100 Subject: [PATCH 11/11] add again what was lost by the failed merge commit 97613f1336eca33fd74309d2bdbdd60f8b6e926d --- .bumpversion.cfg | 2 +- alphabase/__init__.py | 2 +- alphabase/psm_reader/maxquant_reader.py | 18 ++++++++++++++++++ docs/conf.py | 2 +- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b55048b4..37587c45 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.4.0 +current_version = 1.4.1 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/alphabase/__init__.py b/alphabase/__init__.py index 006941da..86008ce9 100644 --- a/alphabase/__init__.py +++ b/alphabase/__init__.py @@ -2,7 +2,7 @@ __project__ = "alphabase" -__version__ = "1.4.0" +__version__ = "1.4.1" __license__ = "Apache" __description__ = "An infrastructure Python package of the AlphaX ecosystem" __author__ = "Mann Labs" diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 1a62c392..87da6bda 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -1,4 +1,5 @@ import copy +import warnings from typing import Optional import numba @@ -13,6 +14,9 @@ psm_reader_yaml, ) +# make sure all warnings are shown +warnings.filterwarnings("always") + mod_to_unimod_dict = {} for mod_name, unimod_id in MOD_DF[["mod_name", "unimod_id"]].values: unimod_id = int(unimod_id) @@ -249,6 +253,20 @@ def _load_file(self, filename): self._find_mod_seq_column(df) df = df[~pd.isna(df["Retention time"])] df.fillna("", inplace=True) + + # remove MBR PSMs as they 
are currently not supported and will crash import + mapped_columns = self._find_mapped_columns(df) + if "scan_num" in mapped_columns: + scan_num_col = mapped_columns["scan_num"] + no_ms2_mask = df[scan_num_col] == "" + if (num_no_ms2_mask := np.sum(no_ms2_mask)) > 0: + warnings.warn( + f"Maxquant psm file contains {num_no_ms2_mask} MBR PSMs without MS2 scan. This is not yet supported and rows containing MBR PSMs will be removed." + ) + df = df[~no_ms2_mask] + df.reset_index(drop=True, inplace=True) + df[scan_num_col] = df[scan_num_col].astype(int) + # if 'K0' in df.columns: # df['Mobility'] = df['K0'] # Bug in MaxQuant? It should be 1/K0 # min_rt = df['Retention time'].min() diff --git a/docs/conf.py b/docs/conf.py index 8cd3bdfa..687c35f7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ copyright = "2022, Mann Labs, MPIB" author = "Mann Labs, MPIB" -release = "1.4.0" +release = "1.4.1" # -- General configuration ---------------------------------------------------