diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 4864cd98..37587c45 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -14,5 +14,6 @@ serialize = [bumpversion:file:./alphabase/__init__.py] [bumpversion:file:./docs/conf.py] + search = {current_version} replace = {new_version} diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py index df59f8fa..48f5b158 100644 --- a/alphabase/psm_reader/alphapept_reader.py +++ b/alphabase/psm_reader/alphapept_reader.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -79,31 +80,31 @@ def _load_file(self, filename): with h5py.File(filename, "r") as _hdf: dataset = _hdf[self.hdf_dataset] df = pd.DataFrame({col: dataset[col] for col in dataset}) - df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")] + df[PsmDfCols.RAW_NAME] = os.path.basename(filename)[: -len(".ms_data.hdf")] df["precursor"] = df["precursor"].str.decode("utf-8") # df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8') if "scan_no" in df.columns: df["scan_no"] = df["scan_no"].astype("int") df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx - df["charge"] = df["charge"].astype(int) + df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int) return df def _load_modifications(self, df: pd.DataFrame): if len(df) == 0: - self._psm_df["sequence"] = "" - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" - self._psm_df["decoy"] = 0 + self._psm_df[PsmDfCols.SEQUENCE] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" + self._psm_df[PsmDfCols.DECOY] = 0 return ( - self._psm_df["sequence"], - self._psm_df["mods"], - self._psm_df["mod_sites"], + self._psm_df[PsmDfCols.SEQUENCE], + self._psm_df[PsmDfCols.MODS], + self._psm_df[PsmDfCols.MOD_SITES], _charges, - self._psm_df["decoy"], + 
self._psm_df[PsmDfCols.DECOY], ) = zip(*df["precursor"].apply(parse_ap)) - self._psm_df.decoy = self._psm_df.decoy.astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8) def register_readers(): diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py index 310f4485..5db0b97e 100644 --- a/alphabase/psm_reader/dia_psm_reader.py +++ b/alphabase/psm_reader/dia_psm_reader.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml @@ -127,7 +128,9 @@ def _load_file(self, filename): def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) - self._psm_df.rename(columns={"spec_idx": "diann_spec_idx"}, inplace=True) + self._psm_df.rename( + columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True + ) class SpectronautReportReader(MaxQuantReader): @@ -174,10 +177,10 @@ def _load_file(self, filename): self.mod_seq_column = "ModifiedSequence" self.csv_sep = self._get_table_delimiter(filename) df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False) - df[[self.mod_seq_column, "charge"]] = df[self.precursor_column].str.split( - ".", expand=True, n=2 - ) - df["charge"] = df.charge.astype(np.int8) + df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[ + self.precursor_column + ].str.split(".", expand=True, n=2) + df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8) return df diff --git a/alphabase/psm_reader/keys.py b/alphabase/psm_reader/keys.py new file mode 100644 index 00000000..4b4449e9 --- /dev/null +++ b/alphabase/psm_reader/keys.py @@ -0,0 +1,71 @@ +class ConstantsClass(type): + """A metaclass for classes that should only contain string constants.""" + + def __setattr__(self, name, value): + raise TypeError("Constants class cannot be modified") + + def 
get_values(cls): + """Get all user-defined string values of the class.""" + return [ + value + for key, value in cls.__dict__.items() + if not key.startswith("__") and isinstance(value, str) + ] + + +class PsmDfCols(metaclass=ConstantsClass): + """Constants for accessing the columns of a PSM dataframe.""" + + # TODO: these are used only in the psm_reader package and the spectral_library.reader module so far + MOD_SITES = "mod_sites" + MODIFIED_SEQUENCE = "modified_sequence" + SEQUENCE = "sequence" + DECOY = "decoy" + MODS = "mods" + SCORE = "score" + TO_REMOVE = "to_remove" + AA_MASS_DIFFS = "aa_mass_diffs" + AA_MASS_DIFF_SITES = "aa_mass_diff_sites" + RT = "rt" + RT_START = "rt_start" + RT_STOP = "rt_stop" + RT_NORM = "rt_norm" + SPEC_IDX = "spec_idx" + SCANNR = "scannr" + FDR = "fdr" + NAA = "nAA" + CCS = "ccs" + MOBILITY = "mobility" + PEPTIDE_FDR = "peptide_fdr" + PROTEIN_FDR = "protein_fdr" + + RAW_NAME = "raw_name" + CHARGE = "charge" + PROTEINS = "proteins" + + SCAN_NUM = "scan_num" + PRECURSOR_MZ = "precursor_mz" + DIANN_SPEC_INDEX = "diann_spec_idx" + + # part of the output, but not directly referenced + _UNIPROT_IDS = "uniprot_ids" + _GENES = "genes" + _QUERY_ID = "query_id" + + # part of psm_reader_yaml, but not directly referenced + _INTENSITY = "intensity" + + +class LibPsmDfCols(metaclass=ConstantsClass): + """Constants for accessing the columns of a Library PSM dataframe.""" + + FRAG_START_IDX = "frag_start_idx" + FRAG_STOP_IDX = "frag_stop_idx" + + # not referenced in reader classes + FRAGMENT_INTENSITY = "fragment_intensity" + FRAGMENT_MZ = "fragment_mz" + FRAGMENT_TYPE = "fragment_type" + FRAGMENT_CHARGE = "fragment_charge" + FRAGMENT_SERIES = "fragment_series" + FRAGMENT_LOSS_TYPE = "fragment_loss_type" diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 984ef1cc..87da6bda 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -1,11 +1,13 @@ import copy 
import warnings +from typing import Optional import numba import numpy as np import pandas as pd from alphabase.constants.modification import MOD_DF +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -195,7 +197,7 @@ def _init_modification_mapping(self): psm_reader_yaml["maxquant"]["modification_mapping"] ) - def set_modification_mapping(self, modification_mapping: dict): + def set_modification_mapping(self, modification_mapping: Optional[dict] = None): super().set_modification_mapping(modification_mapping) self._add_all_unimod() self._extend_mod_brackets() @@ -237,8 +239,10 @@ def _extend_mod_brackets(self): self.modification_mapping[key] = list(mod_set) def _translate_decoy(self, origin_df=None): - if "decoy" in self._psm_df.columns: - self._psm_df.decoy = (self._psm_df.decoy == "-").astype(np.int8) + if PsmDfCols.DECOY in self._psm_df.columns: + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.DECOY] == "-" + ).astype(np.int8) def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["maxquant"]["column_mapping"] @@ -278,15 +282,15 @@ def _load_modifications(self, origin_df: pd.DataFrame): else: mod_sep = "()" - (seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip( + (seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip( *origin_df[self.mod_seq_column].apply( parse_mod_seq, mod_sep=mod_sep, fixed_C57=self.fixed_C57, ) ) - if "sequence" not in self._psm_df.columns: - self._psm_df["sequence"] = seqs + if PsmDfCols.SEQUENCE not in self._psm_df.columns: + self._psm_df[PsmDfCols.SEQUENCE] = seqs def register_readers(): diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index 6da1b713..22301b94 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -5,6 +5,7 @@ from alphabase.constants.aa import AA_ASCII_MASS from alphabase.constants.atom 
import MASS_H, MASS_O from alphabase.constants.modification import MOD_MASS +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -115,9 +116,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["msfragger_pepxml"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _translate_modifications(self): pass @@ -126,37 +124,41 @@ def _load_file(self, filename): msf_df.fillna("", inplace=True) if "ion_mobility" in msf_df.columns: msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float) - msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) - msf_df["to_remove"] = 0 - self.column_mapping["to_remove"] = "to_remove" + msf_df[PsmDfCols.RAW_NAME] = ( + msf_df["spectrum"].str.split(".").apply(lambda x: x[0]) + ) + msf_df["to_remove"] = 0 # TODO revisit + self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove" return msf_df def _translate_decoy(self, origin_df=None): - self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype( - np.int8 + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.PROTEINS].apply(_is_fragger_decoy).astype(np.int8) ) - self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x)) + self._psm_df[PsmDfCols.PROTEINS] = self._psm_df[PsmDfCols.PROTEINS].apply( + lambda x: ";".join(x) + ) if not self._keep_decoy: - self._psm_df["to_remove"] += self._psm_df.decoy > 0 + self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0 def _translate_score(self, origin_df=None): # evalue score - self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100) + self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100) def _load_modifications(self, msf_df): if len(msf_df) == 0: - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" - self._psm_df["aa_mass_diffs"] = "" - self._psm_df["aa_mass_diff_sites"] = "" + 
self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" + self._psm_df[PsmDfCols.AA_MASS_DIFFS] = "" + self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = "" return ( - self._psm_df["mods"], - self._psm_df["mod_sites"], - self._psm_df["aa_mass_diffs"], - self._psm_df["aa_mass_diff_sites"], + self._psm_df[PsmDfCols.MODS], + self._psm_df[PsmDfCols.MOD_SITES], + self._psm_df[PsmDfCols.AA_MASS_DIFFS], + self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES], ) = zip( *msf_df[["peptide", "modifications"]].apply( lambda x: _get_mods_from_masses(*x), axis=1 @@ -164,16 +166,19 @@ def _load_modifications(self, msf_df): ) if not self.keep_unknown_aa_mass_diffs: - self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != "" + self._psm_df[PsmDfCols.TO_REMOVE] += ( + self._psm_df[PsmDfCols.AA_MASS_DIFFS] != "" + ) self._psm_df.drop( - columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True + columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES], + inplace=True, ) def _post_process(self, origin_df: pd.DataFrame): super()._post_process(origin_df) self._psm_df = ( - self._psm_df.query("to_remove==0") - .drop(columns="to_remove") + self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0") + .drop(columns=PsmDfCols.TO_REMOVE) .reset_index(drop=True) ) diff --git a/alphabase/psm_reader/pfind_reader.py b/alphabase/psm_reader/pfind_reader.py index 69e24549..a182a8c9 100644 --- a/alphabase/psm_reader/pfind_reader.py +++ b/alphabase/psm_reader/pfind_reader.py @@ -2,6 +2,7 @@ import pandas as pd import alphabase.constants.modification as ap_mod +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -104,9 +105,6 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["pfind"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _translate_modifications(self): pass @@ -116,29 +114,35 @@ def _load_file(self, filename): ) 
pfind_df.fillna("", inplace=True) pfind_df = pfind_df[pfind_df.Sequence != ""] - pfind_df["raw_name"] = ( + pfind_df[PsmDfCols.RAW_NAME] = ( pfind_df["File_Name"].str.split(".").apply(lambda x: x[0]) ) pfind_df["Proteins"] = pfind_df["Proteins"].apply(parse_pfind_protein) return pfind_df def _translate_decoy(self, origin_df=None): - self._psm_df.decoy = (self._psm_df.decoy == "decoy").astype(np.int8) + self._psm_df[PsmDfCols.DECOY] = ( + self._psm_df[PsmDfCols.DECOY] == "decoy" + ).astype(np.int8) def _translate_score(self, origin_df=None): - self._psm_df.score = -np.log(self._psm_df.score.astype(float) + 1e-100) + self._psm_df[PsmDfCols.SCORE] = -np.log( + self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100 + ) def _load_modifications(self, pfind_df): if len(pfind_df) == 0: - self._psm_df["mods"] = "" - self._psm_df["mod_sites"] = "" + self._psm_df[PsmDfCols.MODS] = "" + self._psm_df[PsmDfCols.MOD_SITES] = "" return - (self._psm_df["mods"], self._psm_df["mod_sites"]) = zip( + (self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip( *pfind_df["Modification"].apply(get_pFind_mods) ) - self._psm_df["mods"] = self._psm_df["mods"].apply(translate_pFind_mod) + self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( + translate_pFind_mod + ) def register_readers(): diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index a4593a66..381b5b81 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -1,6 +1,7 @@ import copy import os import warnings +from typing import Optional import numpy as np import pandas as pd @@ -8,6 +9,7 @@ import alphabase.peptide.mobility as mobility from alphabase.constants._const import CONST_FILE_FOLDER from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz +from alphabase.psm_reader.keys import PsmDfCols from alphabase.utils import get_delimiter from alphabase.yaml_utils import load_yaml @@ -148,12 +150,15 @@ def __init__( 
If keep decoy PSMs in self.psm_df. _min_max_rt_norm : bool if True, the 'rt_norm' values in self._psm_df - will be normalized by rt_norm = (self.psm_df.rt-rt_min)/(rt_max-rt_min). + will be normalized by rt_norm = (self.psm_df[PsmDfCols.RT]-rt_min)/(rt_max-rt_min). It is useful to normalize iRT values as they contain negative values. Defaults to False. """ - self.set_modification_mapping(None) + self.modification_mapping = None + self.rev_mod_mapping = None + + self.set_modification_mapping() self.add_modification_mapping(modification_mapping) if column_mapping is not None: @@ -210,7 +215,7 @@ def add_modification_mapping(self, modification_mapping: dict): self.set_modification_mapping(self.modification_mapping) - def set_modification_mapping(self, modification_mapping: dict): + def set_modification_mapping(self, modification_mapping: Optional[dict] = None): if modification_mapping is None: self._init_modification_mapping() elif isinstance(modification_mapping, str): @@ -224,6 +229,7 @@ def set_modification_mapping(self, modification_mapping: dict): ) else: self.modification_mapping = copy.deepcopy(modification_mapping) + self._mods_as_lists() self._reverse_mod_mapping() @@ -312,17 +318,21 @@ def _get_table_delimiter(self, _filename): return get_delimiter(_filename) def _normalize_rt(self): - if "rt" in self.psm_df.columns: + if PsmDfCols.RT in self._psm_df.columns: if self._engine_rt_unit == "second": # self.psm_df['rt_sec'] = self.psm_df.rt - self.psm_df["rt"] = self.psm_df.rt / 60 - if "rt_start" in self.psm_df.columns: - self.psm_df["rt_start"] = self.psm_df.rt_start / 60 - self.psm_df["rt_stop"] = self.psm_df.rt_stop / 60 + self._psm_df[PsmDfCols.RT] = self._psm_df[PsmDfCols.RT] / 60 + if PsmDfCols.RT_START in self._psm_df.columns: + self._psm_df[PsmDfCols.RT_START] = ( + self._psm_df[PsmDfCols.RT_START] / 60 + ) + self._psm_df[PsmDfCols.RT_STOP] = ( + self._psm_df[PsmDfCols.RT_STOP] / 60 + ) # elif self._engine_rt_unit == 'minute': # self.psm_df['rt_sec'] = 
self.psm_df.rt*60 - min_rt = self.psm_df.rt.min() - max_rt = self.psm_df.rt.max() + min_rt = self._psm_df[PsmDfCols.RT].min() + max_rt = self._psm_df[PsmDfCols.RT].max() if min_rt < 0: # iRT if min_rt < self._min_irt_value: min_rt = self._min_irt_value @@ -332,20 +342,20 @@ def _normalize_rt(self): elif not self._min_max_rt_norm: min_rt = 0 - self.psm_df["rt_norm"] = ( - (self.psm_df.rt - min_rt) / (max_rt - min_rt) + self._psm_df[PsmDfCols.RT_NORM] = ( + (self._psm_df[PsmDfCols.RT] - min_rt) / (max_rt - min_rt) ).clip(0, 1) def normalize_rt_by_raw_name(self): - if "rt" not in self.psm_df.columns: + if PsmDfCols.RT not in self._psm_df.columns: return - if "rt_norm" not in self.psm_df.columns: + if PsmDfCols.RT_NORM not in self._psm_df.columns: self._normalize_rt() - if "raw_name" not in self.psm_df.columns: + if PsmDfCols.RAW_NAME not in self._psm_df.columns: return - for _, df_group in self.psm_df.groupby("raw_name"): - self.psm_df.loc[df_group.index, "rt_norm"] = ( - df_group.rt_norm / df_group.rt_norm.max() + for _, df_group in self._psm_df.groupby(PsmDfCols.RAW_NAME): + self._psm_df.loc[df_group.index, PsmDfCols.RT_NORM] = ( + df_group[PsmDfCols.RT_NORM] / df_group[PsmDfCols.RT_NORM].max() ) def _load_file(self, filename: str) -> pd.DataFrame: @@ -404,10 +414,10 @@ def _translate_columns(self, origin_df: pd.DataFrame): self._psm_df[col] = origin_df[map_col] if ( - "scan_num" in self._psm_df.columns - and "spec_idx" not in self._psm_df.columns + PsmDfCols.SCAN_NUM in self._psm_df.columns + and PsmDfCols.SPEC_IDX not in self._psm_df.columns ): - self._psm_df["spec_idx"] = self._psm_df.scan_num - 1 + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCAN_NUM] - 1 def _transform_table(self, origin_df: pd.DataFrame): """ @@ -451,8 +461,8 @@ def _translate_modifications(self): not in `self.modification_mapping` """ - self._psm_df.mods, unknown_mods = zip( - *self._psm_df.mods.apply( + self._psm_df[PsmDfCols.MODS], unknown_mods = zip( + 
*self._psm_df[PsmDfCols.MODS].apply( translate_other_modification, mod_dict=self.rev_mod_mapping ) ) @@ -479,32 +489,38 @@ def _post_process(self, origin_df: pd.DataFrame): origin_df : pd.DataFrame the loaded original df """ - self._psm_df["nAA"] = self._psm_df.sequence.str.len() + self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len() self.normalize_rt_by_raw_name() - self._psm_df = self._psm_df[~self._psm_df["mods"].isna()] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()] keep_rows = np.ones(len(self._psm_df), dtype=bool) - if "fdr" in self._psm_df.columns: - keep_rows &= self._psm_df.fdr <= self._keep_fdr - if "decoy" in self._psm_df.columns and not self._keep_decoy: - keep_rows &= self._psm_df.decoy == 0 + if PsmDfCols.FDR in self._psm_df.columns: + keep_rows &= self._psm_df[PsmDfCols.FDR] <= self._keep_fdr + if PsmDfCols.DECOY in self._psm_df.columns and not self._keep_decoy: + keep_rows &= self._psm_df[PsmDfCols.DECOY] == 0 self._psm_df = self._psm_df[keep_rows] reset_precursor_df(self._psm_df) - if "precursor_mz" not in self._psm_df: + if PsmDfCols.PRECURSOR_MZ not in self._psm_df: self._psm_df = update_precursor_mz(self._psm_df) - if "ccs" in self._psm_df.columns and "mobility" not in self._psm_df.columns: - self._psm_df["mobility"] = mobility.ccs_to_mobility_for_df( - self._psm_df, "ccs" + if ( + PsmDfCols.CCS in self._psm_df.columns + and PsmDfCols.MOBILITY not in self._psm_df.columns + ): + self._psm_df[PsmDfCols.MOBILITY] = mobility.ccs_to_mobility_for_df( + self._psm_df, PsmDfCols.CCS ) - elif "mobility" in self._psm_df.columns and "ccs" not in self._psm_df.columns: - self._psm_df["ccs"] = mobility.mobility_to_ccs_for_df( - self._psm_df, "mobility" + elif ( + PsmDfCols.MOBILITY in self._psm_df.columns + and PsmDfCols.CCS not in self._psm_df.columns + ): + self._psm_df[PsmDfCols.CCS] = mobility.mobility_to_ccs_for_df( + self._psm_df, PsmDfCols.MOBILITY ) def filter_psm_by_modifications( @@ -524,11 +540,11 @@ def 
filter_psm_by_modifications( "Acetyl@Protein_N-term", ] ) - self._psm_df.mods = self._psm_df.mods.apply( + self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply( _keep_modifications, mod_set=include_mod_set ) - self._psm_df.dropna(subset=["mods"], inplace=True) + self._psm_df.dropna(subset=[PsmDfCols.MODS], inplace=True) self._psm_df.reset_index(drop=True, inplace=True) diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index 2c1da227..c4a8cebc 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ -9,6 +9,7 @@ from tqdm import tqdm from alphabase.constants.modification import MOD_DF +from alphabase.psm_reader.keys import PsmDfCols from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, @@ -94,7 +95,7 @@ def __call__(self, psm_df: pd.DataFrame) -> pd.DataFrame: translated_psm_df = _apply_translate_modifications_mp(psm_df, translation_df) # 5. Drop PSMs with missing modifications - is_null = translated_psm_df["mod_sites"].isnull() + is_null = translated_psm_df[PsmDfCols.MOD_SITES].isnull() translated_psm_df = translated_psm_df[~is_null] if np.sum(is_null) > 0: logging.warning( @@ -217,7 +218,10 @@ def _discover_modifications(psm_df: pd.DataFrame) -> pd.DataFrame: """ modifications = ( - psm_df["modified_sequence"].apply(_match_modified_sequence).explode().unique() + psm_df[PsmDfCols.MODIFIED_SEQUENCE] + .apply(_match_modified_sequence) + .explode() + .unique() ) modifications = modifications[~pd.isnull(modifications)] return pd.DataFrame( @@ -414,14 +418,14 @@ def _translate_modifications( def _apply_translate_modifications( - df: pd.DataFrame, mod_translation_df: pd.DataFrame + psm_df: pd.DataFrame, mod_translation_df: pd.DataFrame ) -> pd.DataFrame: """Apply the translation of modifications to the PSMs. Parameters ---------- - df : pd.DataFrame + psm_df : pd.DataFrame The PSM dataframe with column 'modified_sequence'. 
mod_translation_df : pd.DataFrame @@ -435,12 +439,12 @@ def _apply_translate_modifications( """ - df["mod_sites"], df["mods"] = zip( - *df["modified_sequence"].apply( + psm_df[PsmDfCols.MOD_SITES], psm_df[PsmDfCols.MODS] = zip( + *psm_df[PsmDfCols.MODIFIED_SEQUENCE].apply( lambda x: _translate_modifications(x, mod_translation_df) ) ) - return df + return psm_df def _batchify_df(df: pd.DataFrame, mp_batch_size: int) -> typing.Generator: @@ -466,7 +470,7 @@ def _batchify_df(df: pd.DataFrame, mp_batch_size: int) -> typing.Generator: def _apply_translate_modifications_mp( - df: pd.DataFrame, + psm_df: pd.DataFrame, mod_translation_df: pd.DataFrame, mp_batch_size: int = 50000, mp_process_num: int = 10, @@ -477,7 +481,7 @@ def _apply_translate_modifications_mp( Parameters ---------- - df : pd.DataFrame + psm_df : pd.DataFrame The PSM dataframe. mod_translation_df : pd.DataFrame @@ -496,11 +500,11 @@ def _apply_translate_modifications_mp( partial( _apply_translate_modifications, mod_translation_df=mod_translation_df ), - _batchify_df(df, mp_batch_size), + _batchify_df(psm_df, mp_batch_size), ) if progress_bar: df_list = list( - tqdm(processing, total=int(np.ceil(len(df) / mp_batch_size))) + tqdm(processing, total=int(np.ceil(len(psm_df) / mp_batch_size))) ) else: df_list = list(processing) @@ -593,28 +597,31 @@ def __init__( def _init_column_mapping(self): self.column_mapping = psm_reader_yaml["sage"]["column_mapping"] - def _init_modification_mapping(self): - self.modification_mapping = {} - def _load_file(self, filename): raise NotImplementedError def _transform_table(self, origin_df): - self.psm_df["spec_idx"] = self.psm_df["scannr"].apply( + self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCANNR].apply( _sage_spec_idx_from_scan_nr ) - self.psm_df.drop(columns=["scannr"], inplace=True) + self._psm_df.drop(columns=[PsmDfCols.SCANNR], inplace=True) def _translate_decoy(self, origin_df): if not self._keep_decoy: - self._psm_df = 
self.psm_df[~self.psm_df["decoy"]] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.DECOY]] - self._psm_df = self.psm_df[self.psm_df["fdr"] <= self._keep_fdr] - self._psm_df = self.psm_df[self.psm_df["peptide_fdr"] <= self._keep_fdr] - self._psm_df = self.psm_df[self.psm_df["protein_fdr"] <= self._keep_fdr] + self._psm_df = self._psm_df[self._psm_df[PsmDfCols.FDR] <= self._keep_fdr] + self._psm_df = self._psm_df[ + self._psm_df[PsmDfCols.PEPTIDE_FDR] <= self._keep_fdr + ] + self._psm_df = self._psm_df[ + self._psm_df[PsmDfCols.PROTEIN_FDR] <= self._keep_fdr + ] # drop peptide_fdr, protein_fdr - self._psm_df.drop(columns=["peptide_fdr", "protein_fdr"], inplace=True) + self._psm_df.drop( + columns=[PsmDfCols.PEPTIDE_FDR, PsmDfCols.PROTEIN_FDR], inplace=True + ) def _load_modifications(self, origin_df): pass @@ -627,7 +634,7 @@ def _translate_modifications(self): self._psm_df = sage_translation(self._psm_df) # drop modified_sequence - self._psm_df.drop(columns=["modified_sequence"], inplace=True) + self._psm_df.drop(columns=[PsmDfCols.MODIFIED_SEQUENCE], inplace=True) class SageReaderTSV(SageReaderBase): diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index 151f314b..ca5fc761 100644 --- a/alphabase/spectral_library/reader.py +++ b/alphabase/spectral_library/reader.py @@ -7,6 +7,7 @@ from alphabase.constants._const import PEAK_INTENSITY_DTYPE from alphabase.peptide.mobility import mobility_to_ccs_for_df from alphabase.psm_reader import psm_reader_provider +from alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols from alphabase.psm_reader.maxquant_reader import MaxQuantReader from alphabase.psm_reader.psm_reader import psm_reader_yaml from alphabase.spectral_library.base import SpecLibBase @@ -115,19 +116,21 @@ def _find_key_columns(self, lib_df: pd.DataFrame): Dataframe containing the spectral library. 
""" - if "fragment_loss_type" not in lib_df.columns: - lib_df["fragment_loss_type"] = "" + if LibPsmDfCols.FRAGMENT_LOSS_TYPE not in lib_df.columns: + lib_df[LibPsmDfCols.FRAGMENT_LOSS_TYPE] = "" - lib_df.fillna({"fragment_loss_type": ""}, inplace=True) + lib_df.fillna({LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, inplace=True) lib_df.replace( - {"fragment_loss_type": "noloss"}, {"fragment_loss_type": ""}, inplace=True + {LibPsmDfCols.FRAGMENT_LOSS_TYPE: "noloss"}, + {LibPsmDfCols.FRAGMENT_LOSS_TYPE: ""}, + inplace=True, ) - if "mods" not in lib_df.columns: - lib_df["mods"] = "" + if PsmDfCols.MODS not in lib_df.columns: + lib_df[PsmDfCols.MODS] = "" - if "mod_sites" not in lib_df.columns: - lib_df["mod_sites"] = "" + if PsmDfCols.MOD_SITES not in lib_df.columns: + lib_df[PsmDfCols.MOD_SITES] = "" def _get_fragment_intensity(self, lib_df: pd.DataFrame): """ @@ -161,12 +164,12 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): nAA_list = [] fragment_columns = [ - "fragment_mz", - "fragment_type", - "fragment_charge", - "fragment_series", - "fragment_loss_type", - "fragment_intensity", + LibPsmDfCols.FRAGMENT_MZ, + LibPsmDfCols.FRAGMENT_TYPE, + LibPsmDfCols.FRAGMENT_CHARGE, + LibPsmDfCols.FRAGMENT_SERIES, + LibPsmDfCols.FRAGMENT_LOSS_TYPE, + LibPsmDfCols.FRAGMENT_INTENSITY, ] # by default, all non-fragment columns are used to group the library @@ -175,7 +178,7 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): for keys, df_group in tqdm(lib_df.groupby(non_fragment_columns)): precursor_columns = dict(zip(non_fragment_columns, keys)) - nAA = len(precursor_columns["sequence"]) + nAA = len(precursor_columns[PsmDfCols.SEQUENCE]) intens = np.zeros( (nAA - 1, len(self.charged_frag_types)), @@ -183,11 +186,11 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): ) for frag_type, frag_num, loss_type, frag_charge, inten in df_group[ [ - "fragment_type", - "fragment_series", - "fragment_loss_type", - "fragment_charge", - "fragment_intensity", + 
LibPsmDfCols.FRAGMENT_TYPE, + LibPsmDfCols.FRAGMENT_SERIES, + LibPsmDfCols.FRAGMENT_LOSS_TYPE, + LibPsmDfCols.FRAGMENT_CHARGE, + LibPsmDfCols.FRAGMENT_INTENSITY, ] ].values: if frag_type in "abc": @@ -233,8 +236,8 @@ def _get_fragment_intensity(self, lib_df: pd.DataFrame): indices[1:] = np.array(nAA_list) - 1 indices = np.cumsum(indices) - df["frag_start_idx"] = indices[:-1] - df["frag_stop_idx"] = indices[1:] + df[LibPsmDfCols.FRAG_START_IDX] = indices[:-1] + df[LibPsmDfCols.FRAG_STOP_IDX] = indices[1:] return df @@ -286,7 +289,7 @@ def _post_process( # identify unknown modifications len_before = len(self._psm_df) - self._psm_df = self._psm_df[~self._psm_df["mods"].isna()] + self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()] len_after = len(self._psm_df) if len_before != len_after: @@ -294,17 +297,19 @@ def _post_process( f"{len_before-len_after} Entries with unknown modifications are removed" ) - if "nAA" not in self._psm_df.columns: - self._psm_df["nAA"] = self._psm_df.sequence.str.len() + if PsmDfCols.NAA not in self._psm_df.columns: + self._psm_df[PsmDfCols.NAA] = self._psm_df[PsmDfCols.SEQUENCE].str.len() self._psm_df = self._get_fragment_intensity(self._psm_df) self.normalize_rt_by_raw_name() - if "mobility" in self._psm_df.columns: - self._psm_df["ccs"] = mobility_to_ccs_for_df(self._psm_df, "mobility") + if PsmDfCols.MOBILITY in self._psm_df.columns: + self._psm_df[PsmDfCols.CCS] = mobility_to_ccs_for_df( + self._psm_df, PsmDfCols.MOBILITY + ) - self._psm_df.drop("modified_sequence", axis=1, inplace=True) + self._psm_df.drop(PsmDfCols.MODIFIED_SEQUENCE, axis=1, inplace=True) self._precursor_df = self._psm_df self.calc_fragment_mz_df() diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py index 6c46167d..2b4dfcd9 100644 --- a/tests/integration/test_psm_readers.py +++ b/tests/integration/test_psm_readers.py @@ -19,7 +19,9 @@ SpectronautReportReader, SwathReader, pFindReader, + psm_reader_yaml, ) +from 
alphabase.psm_reader.keys import LibPsmDfCols, PsmDfCols from alphabase.spectral_library.reader import LibraryReaderBase current_file_directory = os.path.dirname(os.path.abspath(__file__)) @@ -34,6 +36,16 @@ def _assert_reference_df_equal(psm_df: pd.DataFrame, test_case_name: str) -> Non If reference is not present, save the output as reference data and raise. """ out_file_path = test_data_path / f"reference_{test_case_name}.parquet" + # psm_df.to_csv(test_data_path / f"reference_{test_case_name}.csv") + + # check that all columns are available in PsmDfCols + assert ( + set(psm_df.columns) + - set(PsmDfCols.get_values()) + - set(LibPsmDfCols.get_values()) + == set() + ) + if out_file_path.exists(): expected_df = pd.read_parquet(out_file_path) @@ -43,6 +55,16 @@ def _assert_reference_df_equal(psm_df: pd.DataFrame, test_case_name: str) -> Non raise ValueError("No reference data found.") +def test_psm_reader_yaml() -> None: + """Test that all column mappings in the psm_reader.yaml are covered by string constant keys.""" + for reader_config in psm_reader_yaml.values(): + ks = [k for k in reader_config["column_mapping"]] + assert ( + set(ks) - set(PsmDfCols.get_values()) - set(LibPsmDfCols.get_values()) + == set() + ) + + def test_maxquant_reader() -> None: """Test the MaxQuant reader."""