Skip to content

Commit

Permalink
introduce PsmDfCols
Browse files Browse the repository at this point in the history
  • Loading branch information
mschwoer committed Nov 12, 2024
1 parent a9c2d46 commit 98a2f83
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 59 deletions.
17 changes: 9 additions & 8 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -90,18 +91,18 @@ def _load_file(self, filename):

def _load_modifications(self, df: pd.DataFrame):
if len(df) == 0:
self._psm_df["sequence"] = ""
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["decoy"] = 0
self._psm_df[PsmDfCols.SEQUENCE] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.DECOY] = 0
return

(
self._psm_df["sequence"],
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df[PsmDfCols.SEQUENCE],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
_charges,
self._psm_df["decoy"],
self._psm_df[PsmDfCols.DECOY],
) = zip(*df["precursor"].apply(parse_ap))
self._psm_df.decoy = self._psm_df.decoy.astype(np.int8)

Expand Down
22 changes: 22 additions & 0 deletions alphabase/psm_reader/keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
class PsmDfCols:
    """Names of the columns used in the PSM dataframe.

    Each attribute holds the literal column label, so readers can write
    ``df[PsmDfCols.SEQUENCE]`` instead of repeating the raw string
    ``"sequence"``.
    """

    # Peptide sequence and modification columns
    SEQUENCE = "sequence"
    MODIFIED_SEQUENCE = "modified_sequence"
    NAA = "nAA"  # mixed case on purpose: the column label itself is "nAA"
    MODS = "mods"
    MOD_SITES = "mod_sites"
    AA_MASS_DIFFS = "aa_mass_diffs"
    AA_MASS_DIFF_SITES = "aa_mass_diff_sites"

    # Retention time columns
    RT = "rt"
    RT_NORM = "rt_norm"
    RT_START = "rt_start"
    RT_STOP = "rt_stop"

    # CCS / mobility columns
    CCS = "ccs"
    MOBILITY = "mobility"

    # Spectrum / scan index columns
    SPEC_IDX = "spec_idx"
    SCANNR = "scannr"

    # Score, decoy, FDR and row-filter columns
    SCORE = "score"
    DECOY = "decoy"
    FDR = "fdr"
    PEPTIDE_FDR = "peptide_fdr"
    PROTEIN_FDR = "protein_fdr"
    TO_REMOVE = "to_remove"
7 changes: 4 additions & 3 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd

from alphabase.constants.modification import MOD_DF
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -261,15 +262,15 @@ def _load_modifications(self, origin_df: pd.DataFrame):
else:
mod_sep = "()"

(seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*origin_df[self.mod_seq_column].apply(
parse_mod_seq,
mod_sep=mod_sep,
fixed_C57=self.fixed_C57,
)
)
if "sequence" not in self._psm_df.columns:
self._psm_df["sequence"] = seqs
if PsmDfCols.SEQUENCE not in self._psm_df.columns:
self._psm_df[PsmDfCols.SEQUENCE] = seqs


def register_readers():
Expand Down
32 changes: 17 additions & 15 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from alphabase.constants.aa import AA_ASCII_MASS
from alphabase.constants.atom import MASS_H, MASS_O
from alphabase.constants.modification import MOD_MASS
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -129,41 +130,42 @@ def _load_file(self, filename):
return msf_df

def _translate_decoy(self, origin_df=None):
self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype(
np.int8
)
self._psm_df[PsmDfCols.DECOY] = self._psm_df.proteins.apply(
_is_fragger_decoy
).astype(np.int8)

self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x))
if not self._keep_decoy:
self._psm_df["to_remove"] += self._psm_df.decoy > 0
self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df.decoy > 0

def _translate_score(self, origin_df=None):
# evalue score
self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100)
self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100)

def _load_modifications(self, msf_df):
if len(msf_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["aa_mass_diffs"] = ""
self._psm_df["aa_mass_diff_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFFS] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = ""
return

(
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df["aa_mass_diffs"],
self._psm_df["aa_mass_diff_sites"],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
self._psm_df[PsmDfCols.AA_MASS_DIFFS],
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
) = zip(
*msf_df[["peptide", "modifications"]].apply(
lambda x: _get_mods_from_masses(*x), axis=1
)
)

if not self.keep_unknown_aa_mass_diffs:
self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != ""
self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df.aa_mass_diffs != ""
self._psm_df.drop(
columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True
columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES],
inplace=True,
)

def _post_process(self, origin_df: pd.DataFrame):
Expand Down
11 changes: 7 additions & 4 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd

import alphabase.constants.modification as ap_mod
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -127,15 +128,17 @@ def _translate_score(self, origin_df=None):

def _load_modifications(self, pfind_df):
if len(pfind_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
return

(self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*pfind_df["Modification"].apply(get_pFind_mods)
)

self._psm_df["mods"] = self._psm_df["mods"].apply(translate_pFind_mod)
self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply(
translate_pFind_mod
)


def register_readers():
Expand Down
47 changes: 27 additions & 20 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import alphabase.peptide.mobility as mobility
from alphabase.constants._const import CONST_FILE_FOLDER
from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.utils import get_delimiter
from alphabase.yaml_utils import load_yaml

Expand Down Expand Up @@ -317,13 +318,13 @@ def _get_table_delimiter(self, _filename):
return get_delimiter(_filename)

def _normalize_rt(self):
if "rt" in self._psm_df.columns:
if PsmDfCols.RT in self._psm_df.columns:
if self._engine_rt_unit == "second":
# self.psm_df['rt_sec'] = self.psm_df.rt
self._psm_df["rt"] = self._psm_df.rt / 60
if "rt_start" in self._psm_df.columns:
self._psm_df["rt_start"] = self._psm_df.rt_start / 60
self._psm_df["rt_stop"] = self._psm_df.rt_stop / 60
self._psm_df[PsmDfCols.RT] = self._psm_df.rt / 60
if PsmDfCols.RT_START in self._psm_df.columns:
self._psm_df[PsmDfCols.RT_START] = self._psm_df.rt_start / 60
self._psm_df[PsmDfCols.RT_STOP] = self._psm_df.rt_stop / 60
# elif self._engine_rt_unit == 'minute':
# self.psm_df['rt_sec'] = self.psm_df.rt*60
min_rt = self._psm_df.rt.min()
Expand All @@ -337,19 +338,19 @@ def _normalize_rt(self):
elif not self._min_max_rt_norm:
min_rt = 0

self._psm_df["rt_norm"] = (
self._psm_df[PsmDfCols.RT_NORM] = (
(self._psm_df.rt - min_rt) / (max_rt - min_rt)
).clip(0, 1)

def normalize_rt_by_raw_name(self):
if "rt" not in self.psm_df.columns:
return
if "rt_norm" not in self._psm_df.columns:
if PsmDfCols.RT_NORM not in self._psm_df.columns:
self._normalize_rt()
if "raw_name" not in self._psm_df.columns:
return
for _, df_group in self._psm_df.groupby("raw_name"):
self._psm_df.loc[df_group.index, "rt_norm"] = (
self._psm_df.loc[df_group.index, PsmDfCols.RT_NORM] = (
df_group.rt_norm / df_group.rt_norm.max()
)

Expand Down Expand Up @@ -410,9 +411,9 @@ def _translate_columns(self, origin_df: pd.DataFrame):

if (
"scan_num" in self._psm_df.columns
and "spec_idx" not in self._psm_df.columns
and PsmDfCols.SPEC_IDX not in self._psm_df.columns
):
self._psm_df["spec_idx"] = self._psm_df.scan_num - 1
self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df.scan_num - 1

def _transform_table(self, origin_df: pd.DataFrame):
"""
Expand Down Expand Up @@ -484,16 +485,16 @@ def _post_process(self, origin_df: pd.DataFrame):
origin_df : pd.DataFrame
the loaded original df
"""
self._psm_df["nAA"] = self._psm_df.sequence.str.len()
self._psm_df[PsmDfCols.NAA] = self._psm_df.sequence.str.len()

self.normalize_rt_by_raw_name()

self._psm_df = self._psm_df[~self._psm_df["mods"].isna()]
self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.MODS].isna()]

keep_rows = np.ones(len(self._psm_df), dtype=bool)
if "fdr" in self._psm_df.columns:
if PsmDfCols.FDR in self._psm_df.columns:
keep_rows &= self._psm_df.fdr <= self._keep_fdr
if "decoy" in self._psm_df.columns and not self._keep_decoy:
if PsmDfCols.DECOY in self._psm_df.columns and not self._keep_decoy:
keep_rows &= self._psm_df.decoy == 0

self._psm_df = self._psm_df[keep_rows]
Expand All @@ -503,13 +504,19 @@ def _post_process(self, origin_df: pd.DataFrame):
if "precursor_mz" not in self._psm_df:
self._psm_df = update_precursor_mz(self._psm_df)

if "ccs" in self._psm_df.columns and "mobility" not in self._psm_df.columns:
self._psm_df["mobility"] = mobility.ccs_to_mobility_for_df(
self._psm_df, "ccs"
if (
PsmDfCols.CCS in self._psm_df.columns
and PsmDfCols.MOBILITY not in self._psm_df.columns
):
self._psm_df[PsmDfCols.MOBILITY] = mobility.ccs_to_mobility_for_df(
self._psm_df, PsmDfCols.CCS
)
elif "mobility" in self._psm_df.columns and "ccs" not in self._psm_df.columns:
self._psm_df["ccs"] = mobility.mobility_to_ccs_for_df(
self._psm_df, "mobility"
elif (
PsmDfCols.MOBILITY in self._psm_df.columns
and PsmDfCols.CCS not in self._psm_df.columns
):
self._psm_df[PsmDfCols.CCS] = mobility.mobility_to_ccs_for_df(
self._psm_df, PsmDfCols.MOBILITY
)

def filter_psm_by_modifications(
Expand Down
28 changes: 19 additions & 9 deletions alphabase/psm_reader/sage_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tqdm import tqdm

from alphabase.constants.modification import MOD_DF
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -94,7 +95,7 @@ def __call__(self, psm_df: pd.DataFrame) -> pd.DataFrame:
translated_psm_df = _apply_translate_modifications_mp(psm_df, translation_df)

# 5. Drop PSMs with missing modifications
is_null = translated_psm_df["mod_sites"].isnull()
is_null = translated_psm_df[PsmDfCols.MOD_SITES].isnull()
translated_psm_df = translated_psm_df[~is_null]
if np.sum(is_null) > 0:
logging.warning(
Expand Down Expand Up @@ -217,7 +218,10 @@ def _discover_modifications(psm_df: pd.DataFrame) -> pd.DataFrame:
"""

modifications = (
psm_df["modified_sequence"].apply(_match_modified_sequence).explode().unique()
psm_df[PsmDfCols.MODIFIED_SEQUENCE]
.apply(_match_modified_sequence)
.explode()
.unique()
)
modifications = modifications[~pd.isnull(modifications)]
return pd.DataFrame(
Expand Down Expand Up @@ -597,21 +601,27 @@ def _load_file(self, filename):
raise NotImplementedError

def _transform_table(self, origin_df):
self._psm_df["spec_idx"] = self._psm_df["scannr"].apply(
self._psm_df[PsmDfCols.SPEC_IDX] = self._psm_df[PsmDfCols.SCANNR].apply(
_sage_spec_idx_from_scan_nr
)
self._psm_df.drop(columns=["scannr"], inplace=True)
self._psm_df.drop(columns=[PsmDfCols.SCANNR], inplace=True)

def _translate_decoy(self, origin_df):
if not self._keep_decoy:
self._psm_df = self._psm_df[~self._psm_df["decoy"]]
self._psm_df = self._psm_df[~self._psm_df[PsmDfCols.DECOY]]

self._psm_df = self._psm_df[self._psm_df["fdr"] <= self._keep_fdr]
self._psm_df = self._psm_df[self._psm_df["peptide_fdr"] <= self._keep_fdr]
self._psm_df = self._psm_df[self._psm_df["protein_fdr"] <= self._keep_fdr]
self._psm_df = self._psm_df[self._psm_df[PsmDfCols.FDR] <= self._keep_fdr]
self._psm_df = self._psm_df[
self._psm_df[PsmDfCols.PEPTIDE_FDR] <= self._keep_fdr
]
self._psm_df = self._psm_df[
self._psm_df[PsmDfCols.PROTEIN_FDR] <= self._keep_fdr
]

# drop peptide_fdr, protein_fdr
self._psm_df.drop(columns=["peptide_fdr", "protein_fdr"], inplace=True)
self._psm_df.drop(
columns=[PsmDfCols.PEPTIDE_FDR, PsmDfCols.PROTEIN_FDR], inplace=True
)

def _load_modifications(self, origin_df):
pass
Expand Down

1 comment on commit 98a2f83

@lucas-diedrich
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Please sign in to comment.