Skip to content

Commit

Permalink
Merge pull request #252 from MannLabs/refactor_readers_X
Browse files Browse the repository at this point in the history
Refactor readers x
  • Loading branch information
mschwoer authored Jan 9, 2025
2 parents bcfe470 + 8a4f238 commit 09bdfe6
Show file tree
Hide file tree
Showing 15 changed files with 115 additions and 125 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
required: true
type: string
jobs:
pre-commit:
run-tests:
runs-on: ${{ inputs.os }}
steps:
- uses: actions/checkout@v3
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/branch-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,11 @@ jobs:
python-version: ${{ matrix.python-version }}
os: ${{ matrix.os }}
install-script: "loose_pip_install.sh"
get-code-review-input:
runs-on: ubuntu-latest
#if: contains(github.event.pull_request.labels.*.name, 'code-review')
steps:
- uses: MannLabs/alphashared/actions/get-code-review-input@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.number }}
1 change: 1 addition & 0 deletions alphabase/constants/_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from alphabase.yaml_utils import load_yaml

CONST_FILE_FOLDER = os.path.join(os.path.dirname(__file__), "const_files")
PSM_READER_YAML_FILE_NAME = "psm_reader.yaml"

common_const_dict: dict = load_yaml(
os.path.join(CONST_FILE_FOLDER, "common_constants.yaml")
Expand Down
38 changes: 21 additions & 17 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,8 @@ alphapept:
'raw_name': 'raw_name' # parsed from the `ms_data.hdf` file
'fdr': 'q_value'
'decoy': 'decoy'
modification_mapping:
'Carbamidomethyl@C': 'cC'
'Oxidation@M': 'oxM'
'Phospho@S': 'pS'
'Phospho@T': 'pT'
'Phospho@Y': 'pY'
'Acetyl@Protein_N-term': 'a'
modification_mapping_type: 'alphapept'


maxquant:
reader_type: maxquant
Expand Down Expand Up @@ -49,7 +44,10 @@ maxquant:
'genes': ['Gene Names','Gene names']
'decoy': 'Reverse'
'intensity': 'Intensity'
modification_mapping:
modification_mapping_type: 'maxquant'

modification_mappings:
maxquant:
'mTRAQ@K':
- 'K(mTRAQ)'
'mTRAQ@Any_N-term':
Expand Down Expand Up @@ -107,6 +105,13 @@ maxquant:
'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)']
'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']
'hydroxyisobutyryl@K': 'K(2-)'
alphapept:
'Carbamidomethyl@C': 'cC'
'Oxidation@M': 'oxM'
'Phospho@S': 'pS'
'Phospho@T': 'pT'
'Phospho@Y': 'pY'
'Acetyl@Protein_N-term': 'a'

pfind:
reader_type: pfind
Expand All @@ -123,8 +128,7 @@ pfind:
'uniprot_ids': 'Proteins'
'fdr': 'Q-value'
'decoy': ['Target/Decoy', 'Targe/Decoy']
modification_mapping:
'': ''
modification_mapping_type: null # no mapping required

msfragger_pepxml:
reader_type: msfragger_pepxml
Expand All @@ -139,8 +143,6 @@ msfragger_pepxml:
'proteins': 'protein'
'raw_name': 'raw_name'
'mobility': 'ion_mobility'
modification_mapping:
'': ''
mass_mapped_mods:
- 'Oxidation@M' #other Oxidation@X are not needed here
- 'Carbamidomethyl@C'
Expand All @@ -153,6 +155,7 @@ msfragger_pepxml:
- 'Dimethyl@K' # Any_N-term is not needed here as it will be inferred on the fly
- 'Methyl@E' #an example of a PTM that can be C-term
mod_mass_tol: 0.1 # Da
modification_mapping_type: 'maxquant'

diann:
reader_type: diann
Expand All @@ -175,7 +178,7 @@ diann:
'fdr': 'Q.Value'
mod_seq_columns:
- "Modified.Sequence"
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

spectronaut_report:
reader_type: spectronaut_report
Expand All @@ -191,7 +194,7 @@ spectronaut_report:
'charge': 'charge'
mod_seq_columns:
- 'ModifiedSequence'
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

spectronaut:
reader_type: spectronaut
Expand All @@ -215,7 +218,7 @@ spectronaut:
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

library_reader_base:
reader_type: library_reader_base
Expand Down Expand Up @@ -246,11 +249,11 @@ library_reader_base:
- 'FullUniModPeptideName'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

sage:
reader_type: sage
rt_unit: minute
rt_unit: second
column_mapping:
'modified_sequence': 'peptide'
'sequence': 'stripped_peptide'
Expand All @@ -265,3 +268,4 @@ sage:
'peptide_fdr': 'peptide_q'
'protein_fdr': 'protein_q'
'decoy': 'is_decoy'
modification_mapping_type: null # custom mapping in code
1 change: 0 additions & 1 deletion alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ class AlphaPeptReader(PSMReaderBase):
"""Reader for AlphaPept's *.ms_data.hdf files."""

_reader_type = "alphapept"
_modification_type = "alphapept"

def _load_file(self, filename: str) -> pd.DataFrame:
"""Load an AlphaPept output file to a DataFrame."""
Expand Down
1 change: 0 additions & 1 deletion alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class SpectronautReader(MaxQuantReader):
_reader_type = "spectronaut"
_add_unimod_to_mod_mapping = True
_min_max_rt_norm = True
_fixed_c57_default = False

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut-specific preprocessing of output data."""
Expand Down
13 changes: 8 additions & 5 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
psm_reader_yaml,
)
from alphabase.psm_reader.utils import get_column_mapping_for_df

Expand Down Expand Up @@ -127,8 +128,6 @@ class MaxQuantReader(PSMReaderBase):

_reader_type = "maxquant"
_add_unimod_to_mod_mapping = True
_modification_type = "maxquant"
_fixed_c57_default = True

def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
self,
Expand All @@ -138,7 +137,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
mod_seq_columns: Optional[List[str]] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: str = "minute",
rt_unit: Optional[str] = None,
# MaxQuant reader-specific
fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
**kwargs,
Expand All @@ -152,7 +151,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
fixed_C57 : bool, optional
If true, the search engine will not show `Carbamidomethyl`
in the modified sequences.
by default True
By default, the value is read from the `fixed_C57` key of the reader's entry in psm_reader_yaml.
See documentation of `PSMReaderBase` for the rest of parameters.
Expand All @@ -167,7 +166,11 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
**kwargs,
)

self.fixed_C57 = fixed_C57 if fixed_C57 is not None else self._fixed_c57_default
self.fixed_C57 = (
fixed_C57
if fixed_C57 is not None
else psm_reader_yaml[self._reader_type]["fixed_C57"]
)

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
Expand Down
38 changes: 17 additions & 21 deletions alphabase/psm_reader/modification_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(
custom_modification_mapping: Optional[Dict[str, str]],
*,
reader_yaml: Dict,
modification_type: Optional[str],
mapping_type: str,
add_unimod_to_mod_mapping: bool,
):
"""Initialize the ModificationMapper.
Expand All @@ -35,7 +35,7 @@ def __init__(
reader_yaml:
the yaml (read from file) containing the modification mappings
modification_type:
mapping_type:
the type of modification mapping ("maxquant" or "alphapept")
add_unimod_to_mod_mapping:
Expand All @@ -44,7 +44,7 @@ def __init__(
"""
self._psm_reader_yaml = reader_yaml
self._add_unimod_to_mod_mapping = add_unimod_to_mod_mapping
self._modification_type = modification_type
self._mapping_type = mapping_type

self.modification_mapping = None
self.rev_mod_mapping = None
Expand Down Expand Up @@ -95,23 +95,20 @@ def set_modification_mapping(
----------
modification_mapping:
If dictionary: the current modification_mapping will be overwritten by this.
If str: the parameter will be interpreted as a reader type, and the modification_mapping is read from the
"modification_mapping" section of the psm_reader_yaml
If str: the parameter will be interpreted as a modification_mapping_type, and the mapping is read from the
respective key in the "modification_mappings" section of the psm_reader_yaml
"""
if modification_mapping is None:
self._init_modification_mapping()
elif isinstance(
modification_mapping, str
): # TODO: remove this overloading of the parameter by introducing yaml key "modification_mapping_type"
if modification_mapping in self._psm_reader_yaml:
self.modification_mapping = self._psm_reader_yaml[modification_mapping][
"modification_mapping"
]
else:
raise ValueError(
f"Unknown modification mapping: {modification_mapping}"
)
modification_mapping,
str, # interpret as modification_mapping_type
):
self.modification_mapping = self._psm_reader_yaml["modification_mappings"][
modification_mapping
]

else:
self.modification_mapping = copy.deepcopy(modification_mapping)

Expand All @@ -125,12 +122,11 @@ def set_modification_mapping(

def _init_modification_mapping(self) -> None:
"""Initialize the modification mapping from the psm_reader_yaml or as an empty dictionary."""
if self._modification_type is not None:
self.modification_mapping = self._psm_reader_yaml[self._modification_type][
"modification_mapping"
]
else:
self.modification_mapping = {}
self.modification_mapping = (
self._psm_reader_yaml["modification_mappings"][self._mapping_type]
if self._mapping_type is not None
else {}
)

def _add_all_unimod(self) -> None:
"""Add all unimod modifications to the modification mapping."""
Expand Down
25 changes: 16 additions & 9 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,11 @@ def _is_fragger_decoy(proteins: List[str]) -> bool:
return all(prot.lower().startswith("rev_") for prot in proteins)


mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"]


def _get_mods_from_masses( # noqa: PLR0912, C901 too many branches, too complex TODO: refactor
sequence: str, msf_aa_mods: List[str]
sequence: str,
msf_aa_mods: List[str],
mass_mapped_mods: List[str],
mod_mass_tol: float,
) -> Tuple[str, str, str, str]:
mods = []
mod_sites = []
Expand Down Expand Up @@ -106,7 +105,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
# mod_seq_columns: Optional[List[str]] = None,# TODO: not needed here?
fdr: float = 0.001, # refers to E-value in the PepXML
keep_decoy: bool = False,
rt_unit: str = "second",
rt_unit: Optional[str] = None,
# MSFragger reader-specific:
keep_unknown_aa_mass_diffs: bool = False,
**kwargs,
Expand Down Expand Up @@ -134,7 +133,10 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
rt_unit=rt_unit,
**kwargs,
)
self.keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs
self._keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs
# TODO: should those be set via API, too?
self._mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
self._mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"]

def _translate_modifications(self) -> None:
pass
Expand Down Expand Up @@ -183,11 +185,16 @@ def _load_modifications(self, origin_df: pd.DataFrame) -> None:
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
) = zip(
*origin_df[["peptide", "modifications"]].apply(
lambda x: _get_mods_from_masses(*x), axis=1
lambda x: _get_mods_from_masses(
*x,
mass_mapped_mods=self._mass_mapped_mods,
mod_mass_tol=self._mod_mass_tol,
),
axis=1,
)
)

if not self.keep_unknown_aa_mass_diffs:
if not self._keep_unknown_aa_mass_diffs:
self._psm_df[PsmDfCols.TO_REMOVE] += (
self._psm_df[PsmDfCols.AA_MASS_DIFFS] != ""
)
Expand Down
Loading

0 comments on commit 09bdfe6

Please sign in to comment.