Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor readers i #232

Merged
merged 12 commits into from
Nov 18, 2024
1 change: 1 addition & 0 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ serialize =
[bumpversion:file:./alphabase/__init__.py]

[bumpversion:file:./docs/conf.py]

search = {current_version}
replace = {new_version}
23 changes: 12 additions & 11 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -79,31 +80,31 @@ def _load_file(self, filename):
with h5py.File(filename, "r") as _hdf:
dataset = _hdf[self.hdf_dataset]
df = pd.DataFrame({col: dataset[col] for col in dataset})
df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")]
df[PsmDfCols.RAW_NAME] = os.path.basename(filename)[: -len(".ms_data.hdf")]
df["precursor"] = df["precursor"].str.decode("utf-8")
# df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
if "scan_no" in df.columns:
df["scan_no"] = df["scan_no"].astype("int")
df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx
df["charge"] = df["charge"].astype(int)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)
return df

def _load_modifications(self, df: pd.DataFrame):
if len(df) == 0:
self._psm_df["sequence"] = ""
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["decoy"] = 0
self._psm_df[PsmDfCols.SEQUENCE] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.DECOY] = 0
return

(
self._psm_df["sequence"],
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df[PsmDfCols.SEQUENCE],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
_charges,
self._psm_df["decoy"],
self._psm_df[PsmDfCols.DECOY],
) = zip(*df["precursor"].apply(parse_ap))
self._psm_df.decoy = self._psm_df.decoy.astype(np.int8)
self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8)


def register_readers():
Expand Down
13 changes: 8 additions & 5 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml

Expand Down Expand Up @@ -127,7 +128,9 @@ def _load_file(self, filename):

def _post_process(self, origin_df: pd.DataFrame):
super()._post_process(origin_df)
self._psm_df.rename(columns={"spec_idx": "diann_spec_idx"}, inplace=True)
self._psm_df.rename(
columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True
)


class SpectronautReportReader(MaxQuantReader):
Expand Down Expand Up @@ -174,10 +177,10 @@ def _load_file(self, filename):
self.mod_seq_column = "ModifiedSequence"
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False)
df[[self.mod_seq_column, "charge"]] = df[self.precursor_column].str.split(
".", expand=True, n=2
)
df["charge"] = df.charge.astype(np.int8)
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8)
return df


Expand Down
71 changes: 71 additions & 0 deletions alphabase/psm_reader/keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
class ConstantsClass(type):
    """A metaclass for classes that should only contain string constants.

    A class created with this metaclass is read-only once its body has been
    executed: rebinding or deleting a class attribute raises ``TypeError``.
    """

    def __setattr__(cls, name, value):
        """Reject every attempt to set or rebind an attribute on the class.

        Raises
        ------
        TypeError
            Always.
        """
        raise TypeError("Constants class cannot be modified")

    def __delattr__(cls, name):
        """Reject attribute deletion.

        Without this hook ``del SomeConstants.NAME`` would succeed and remove
        a constant even though assignment is blocked.

        Raises
        ------
        TypeError
            Always.
        """
        raise TypeError("Constants class cannot be modified")

    def get_values(cls):
        """Get all user-defined string values of the class.

        Only attributes defined directly in the class body are considered
        (``cls.__dict__`` does not contain inherited attributes). Names that
        start with a double underscore (e.g. ``__module__``, ``__doc__``) and
        non-string values are skipped; names with a single leading underscore
        are included.

        Returns
        -------
        list of str
            The constant values, in the order they were defined.
        """
        return [
            value
            for key, value in cls.__dict__.items()
            if not key.startswith("__") and isinstance(value, str)
        ]


class PsmDfCols(metaclass=ConstantsClass):
    """Constants for accessing the columns of a PSM dataframe.

    Each attribute's value is the literal column name, so
    ``df[PsmDfCols.CHARGE]`` is equivalent to ``df["charge"]``.
    Attributes cannot be reassigned: the ``ConstantsClass`` metaclass raises
    ``TypeError`` on assignment.
    """

    # TODO: these are used only in the psm_reader package and the spectral_library.reader module so far
    MOD_SITES = "mod_sites"
    MODIFIED_SEQUENCE = "modified_sequence"
    SEQUENCE = "sequence"
    DECOY = "decoy"
    MODS = "mods"
    SCORE = "score"
    TO_REMOVE = "to_remove"
    AA_MASS_DIFFS = "aa_mass_diffs"
    AA_MASS_DIFF_SITES = "aa_mass_diff_sites"
    RT = "rt"
    RT_START = "rt_start"
    RT_STOP = "rt_stop"
    RT_NORM = "rt_norm"
    SPEC_IDX = "spec_idx"
    SCANNR = "scannr"
    FDR = "fdr"
    NAA = "nAA"
    CCS = "ccs"
    MOBILITY = "mobility"
    PEPTIDE_FDR = "peptide_fdr"
    PROTEIN_FDR = "protein_fdr"

    RAW_NAME = "raw_name"
    CHARGE = "charge"
    PROTEINS = "proteins"

    SCAN_NUM = "scan_num"
    PRECURSOR_MZ = "precursor_mz"
    DIANN_SPEC_INDEX = "diann_spec_idx"

    # part of the output, but not directly referenced
    _UNIPROT_IDS = "uniprot_ids"
    _GENES = "genes"
    _QUERY_ID = "query_id"

    # part of psm_reader_yaml, but not directly referenced
    _INTENSITY = "intensity"


class LibPsmDfCols(metaclass=ConstantsClass):
    """Constants for accessing the columns of a Library PSM dataframe.

    Each attribute's value is the literal column name. Attributes cannot be
    reassigned: the ``ConstantsClass`` metaclass raises ``TypeError`` on
    assignment.
    """

    # start/stop index columns for fragments
    # NOTE(review): presumably row offsets into a separate fragment table - confirm
    FRAG_START_IDX = "frag_start_idx"
    FRAG_STOP_IDX = "frag_stop_idx"

    # not referenced in reader classes
    FRAGMENT_INTENSITY = "fragment_intensity"
    FRAGMENT_MZ = "fragment_mz"
    FRAGMENT_TYPE = "fragment_type"
    FRAGMENT_CHARGE = "fragment_charge"
    FRAGMENT_SERIES = "fragment_series"
    FRAGMENT_LOSS_TYPE = "fragment_loss_type"
16 changes: 10 additions & 6 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import copy
import warnings
from typing import Optional

import numba
import numpy as np
import pandas as pd

from alphabase.constants.modification import MOD_DF
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -195,7 +197,7 @@ def _init_modification_mapping(self):
psm_reader_yaml["maxquant"]["modification_mapping"]
)

def set_modification_mapping(self, modification_mapping: dict):
def set_modification_mapping(self, modification_mapping: Optional[dict] = None):
super().set_modification_mapping(modification_mapping)
self._add_all_unimod()
self._extend_mod_brackets()
Expand Down Expand Up @@ -237,8 +239,10 @@ def _extend_mod_brackets(self):
self.modification_mapping[key] = list(mod_set)

def _translate_decoy(self, origin_df=None):
if "decoy" in self._psm_df.columns:
self._psm_df.decoy = (self._psm_df.decoy == "-").astype(np.int8)
if PsmDfCols.DECOY in self._psm_df.columns:
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.DECOY] == "-"
).astype(np.int8)

def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["maxquant"]["column_mapping"]
Expand Down Expand Up @@ -278,15 +282,15 @@ def _load_modifications(self, origin_df: pd.DataFrame):
else:
mod_sep = "()"

(seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*origin_df[self.mod_seq_column].apply(
parse_mod_seq,
mod_sep=mod_sep,
fixed_C57=self.fixed_C57,
)
)
if "sequence" not in self._psm_df.columns:
self._psm_df["sequence"] = seqs
if PsmDfCols.SEQUENCE not in self._psm_df.columns:
self._psm_df[PsmDfCols.SEQUENCE] = seqs


def register_readers():
Expand Down
51 changes: 28 additions & 23 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from alphabase.constants.aa import AA_ASCII_MASS
from alphabase.constants.atom import MASS_H, MASS_O
from alphabase.constants.modification import MOD_MASS
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -115,9 +116,6 @@ def __init__(
def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["msfragger_pepxml"]["column_mapping"]

def _init_modification_mapping(self):
self.modification_mapping = {}

def _translate_modifications(self):
pass

Expand All @@ -126,54 +124,61 @@ def _load_file(self, filename):
msf_df.fillna("", inplace=True)
if "ion_mobility" in msf_df.columns:
msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float)
msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
msf_df["to_remove"] = 0
self.column_mapping["to_remove"] = "to_remove"
msf_df[PsmDfCols.RAW_NAME] = (
msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
)
msf_df["to_remove"] = 0 # TODO revisit
self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove"
return msf_df

def _translate_decoy(self, origin_df=None):
self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype(
np.int8
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.PROTEINS].apply(_is_fragger_decoy).astype(np.int8)
)

self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x))
self._psm_df[PsmDfCols.PROTEINS] = self._psm_df[PsmDfCols.PROTEINS].apply(
lambda x: ";".join(x)
)
if not self._keep_decoy:
self._psm_df["to_remove"] += self._psm_df.decoy > 0
self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0

def _translate_score(self, origin_df=None):
# evalue score
self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100)
self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100)

def _load_modifications(self, msf_df):
if len(msf_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["aa_mass_diffs"] = ""
self._psm_df["aa_mass_diff_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFFS] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = ""
return

(
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df["aa_mass_diffs"],
self._psm_df["aa_mass_diff_sites"],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
self._psm_df[PsmDfCols.AA_MASS_DIFFS],
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
) = zip(
*msf_df[["peptide", "modifications"]].apply(
lambda x: _get_mods_from_masses(*x), axis=1
)
)

if not self.keep_unknown_aa_mass_diffs:
self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != ""
self._psm_df[PsmDfCols.TO_REMOVE] += (
self._psm_df[PsmDfCols.AA_MASS_DIFFS] != ""
)
self._psm_df.drop(
columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True
columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES],
inplace=True,
)

def _post_process(self, origin_df: pd.DataFrame):
super()._post_process(origin_df)
self._psm_df = (
self._psm_df.query("to_remove==0")
.drop(columns="to_remove")
self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0")
.drop(columns=PsmDfCols.TO_REMOVE)
.reset_index(drop=True)
)

Expand Down
24 changes: 14 additions & 10 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd

import alphabase.constants.modification as ap_mod
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -104,9 +105,6 @@ def __init__(
def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["pfind"]["column_mapping"]

def _init_modification_mapping(self):
self.modification_mapping = {}

def _translate_modifications(self):
pass

Expand All @@ -116,29 +114,35 @@ def _load_file(self, filename):
)
pfind_df.fillna("", inplace=True)
pfind_df = pfind_df[pfind_df.Sequence != ""]
pfind_df["raw_name"] = (
pfind_df[PsmDfCols.RAW_NAME] = (
pfind_df["File_Name"].str.split(".").apply(lambda x: x[0])
)
pfind_df["Proteins"] = pfind_df["Proteins"].apply(parse_pfind_protein)
return pfind_df

def _translate_decoy(self, origin_df=None):
self._psm_df.decoy = (self._psm_df.decoy == "decoy").astype(np.int8)
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.DECOY] == "decoy"
).astype(np.int8)

def _translate_score(self, origin_df=None):
self._psm_df.score = -np.log(self._psm_df.score.astype(float) + 1e-100)
self._psm_df[PsmDfCols.SCORE] = -np.log(
self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100
)

def _load_modifications(self, pfind_df):
if len(pfind_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
return

(self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*pfind_df["Modification"].apply(get_pFind_mods)
)

self._psm_df["mods"] = self._psm_df["mods"].apply(translate_pFind_mod)
self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply(
translate_pFind_mod
)


def register_readers():
Expand Down
Loading
Loading