Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor readers x #252

Merged
merged 9 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
required: true
type: string
jobs:
pre-commit:
run-tests:
runs-on: ${{ inputs.os }}
steps:
- uses: actions/checkout@v3
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/branch-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,11 @@ jobs:
python-version: ${{ matrix.python-version }}
os: ${{ matrix.os }}
install-script: "loose_pip_install.sh"
get-code-review-input:
runs-on: ubuntu-latest
#if: contains(github.event.pull_request.labels.*.name, 'code-review')
steps:
- uses: MannLabs/alphashared/actions/get-code-review-input@v1
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.number }}
1 change: 1 addition & 0 deletions alphabase/constants/_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from alphabase.yaml_utils import load_yaml

CONST_FILE_FOLDER = os.path.join(os.path.dirname(__file__), "const_files")
PSM_READER_YAML_FILE_NAME = "psm_reader.yaml"

common_const_dict: dict = load_yaml(
os.path.join(CONST_FILE_FOLDER, "common_constants.yaml")
Expand Down
38 changes: 21 additions & 17 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,8 @@ alphapept:
'raw_name': 'raw_name' # parsed from the `ms_data.hdf` file
'fdr': 'q_value'
'decoy': 'decoy'
modification_mapping:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should show here the case where search engines use unimod_id

'Carbamidomethyl@C': 'cC'
'Oxidation@M': 'oxM'
'Phospho@S': 'pS'
'Phospho@T': 'pT'
'Phospho@Y': 'pY'
'Acetyl@Protein_N-term': 'a'
modification_mapping_type: 'alphapept'


maxquant:
reader_type: maxquant
Expand Down Expand Up @@ -49,7 +44,10 @@ maxquant:
'genes': ['Gene Names','Gene names']
'decoy': 'Reverse'
'intensity': 'Intensity'
modification_mapping:
modification_mapping_type: 'maxquant'

modification_mappings:
maxquant:
'mTRAQ@K':
- 'K(mTRAQ)'
'mTRAQ@Any_N-term':
Expand Down Expand Up @@ -107,6 +105,13 @@ maxquant:
'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)']
'GlyGly@K': ['K(GlyGly (K))', 'K(gl)']
'hydroxyisobutyryl@K': 'K(2-)'
alphapept:
'Carbamidomethyl@C': 'cC'
'Oxidation@M': 'oxM'
'Phospho@S': 'pS'
'Phospho@T': 'pT'
'Phospho@Y': 'pY'
'Acetyl@Protein_N-term': 'a'

pfind:
reader_type: pfind
Expand All @@ -123,8 +128,7 @@ pfind:
'uniprot_ids': 'Proteins'
'fdr': 'Q-value'
'decoy': ['Target/Decoy', 'Targe/Decoy']
modification_mapping:
'': ''
modification_mapping_type: null # no mapping required

msfragger_pepxml:
reader_type: msfragger_pepxml
Expand All @@ -139,8 +143,6 @@ msfragger_pepxml:
'proteins': 'protein'
'raw_name': 'raw_name'
'mobility': 'ion_mobility'
modification_mapping:
'': ''
mass_mapped_mods:
- 'Oxidation@M' #other Oxidation@X are not needed here
- 'Carbamidomethyl@C'
Expand All @@ -153,6 +155,7 @@ msfragger_pepxml:
- 'Dimethyl@K' # Any_N-term is not needed here as it will be inferred on-the-fly
- 'Methyl@E' #an example of a PTM that can be C-term
mod_mass_tol: 0.1 # Da
modification_mapping_type: 'maxquant'

diann:
reader_type: diann
Expand All @@ -175,7 +178,7 @@ diann:
'fdr': 'Q.Value'
mod_seq_columns:
- "Modified.Sequence"
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

spectronaut_report:
reader_type: spectronaut_report
Expand All @@ -191,7 +194,7 @@ spectronaut_report:
'charge': 'charge'
mod_seq_columns:
- 'ModifiedSequence'
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

spectronaut:
reader_type: spectronaut
Expand All @@ -215,7 +218,7 @@ spectronaut:
- 'ModifiedPeptideSequence'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

library_reader_base:
reader_type: library_reader_base
Expand Down Expand Up @@ -246,11 +249,11 @@ library_reader_base:
- 'FullUniModPeptideName'
- 'LabeledSequence'
- 'FullUniModPeptideName'
modification_mapping: 'maxquant'
modification_mapping_type: 'maxquant'

sage:
reader_type: sage
rt_unit: minute
rt_unit: second
column_mapping:
'modified_sequence': 'peptide'
'sequence': 'stripped_peptide'
Expand All @@ -265,3 +268,4 @@ sage:
'peptide_fdr': 'peptide_q'
'protein_fdr': 'protein_q'
'decoy': 'is_decoy'
modification_mapping_type: null # custom mapping in code
1 change: 0 additions & 1 deletion alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ class AlphaPeptReader(PSMReaderBase):
"""Reader for AlphaPept's *.ms_data.hdf files."""

_reader_type = "alphapept"
_modification_type = "alphapept"

def _load_file(self, filename: str) -> pd.DataFrame:
"""Load an AlphaPept output file to a DataFrame."""
Expand Down
1 change: 0 additions & 1 deletion alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ class SpectronautReader(MaxQuantReader):
_reader_type = "spectronaut"
_add_unimod_to_mod_mapping = True
_min_max_rt_norm = True
_fixed_c57_default = False

def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame:
"""Spectronaut-specific preprocessing of output data."""
Expand Down
13 changes: 8 additions & 5 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
psm_reader_yaml,
)
from alphabase.psm_reader.utils import get_column_mapping_for_df

Expand Down Expand Up @@ -127,8 +128,6 @@ class MaxQuantReader(PSMReaderBase):

_reader_type = "maxquant"
_add_unimod_to_mod_mapping = True
_modification_type = "maxquant"
_fixed_c57_default = True

def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions
self,
Expand All @@ -138,7 +137,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
mod_seq_columns: Optional[List[str]] = None,
fdr: float = 0.01,
keep_decoy: bool = False,
rt_unit: str = "minute",
jalew188 marked this conversation as resolved.
Show resolved Hide resolved
rt_unit: Optional[str] = None,
# MaxQuant reader-specific
fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking)
**kwargs,
Expand All @@ -152,7 +151,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
fixed_C57 : bool, optional
If true, the search engine will not show `Carbamidomethyl`
in the modified sequences.
by default True
by default read from psm_reader_yaml key `fixed_C57`.

See documentation of `PSMReaderBase` for the rest of parameters.

Expand All @@ -167,7 +166,11 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
**kwargs,
)

self.fixed_C57 = fixed_C57 if fixed_C57 is not None else self._fixed_c57_default
self.fixed_C57 = (
fixed_C57
if fixed_C57 is not None
else psm_reader_yaml[self._reader_type]["fixed_C57"]
)

def _translate_decoy(self) -> None:
if PsmDfCols.DECOY in self._psm_df.columns:
Expand Down
38 changes: 17 additions & 21 deletions alphabase/psm_reader/modification_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(
custom_modification_mapping: Optional[Dict[str, str]],
*,
reader_yaml: Dict,
modification_type: Optional[str],
mapping_type: str,
add_unimod_to_mod_mapping: bool,
):
"""Initialize the ModificationMapper.
Expand All @@ -35,7 +35,7 @@ def __init__(
reader_yaml:
the yaml (read from file) containing the modification mappings

modification_type:
mapping_type:
the type of modification mapping ("maxquant" or "alphapept")

add_unimod_to_mod_mapping:
Expand All @@ -44,7 +44,7 @@ def __init__(
"""
self._psm_reader_yaml = reader_yaml
self._add_unimod_to_mod_mapping = add_unimod_to_mod_mapping
self._modification_type = modification_type
self._mapping_type = mapping_type

self.modification_mapping = None
self.rev_mod_mapping = None
Expand Down Expand Up @@ -95,23 +95,20 @@ def set_modification_mapping(
----------
modification_mapping:
If dictionary: the current modification_mapping will be overwritten by this.
If str: the parameter will be interpreted as a reader type, and the modification_mapping is read from the
"modification_mapping" section of the psm_reader_yaml
If str: the parameter will be interpreted as a modification_mapping_type, and the mapping is read from the
respective key in the "modification_mappings" section of the psm_reader_yaml

"""
if modification_mapping is None:
self._init_modification_mapping()
elif isinstance(
modification_mapping, str
): # TODO: remove this overloading of the parameter by introducing yaml key "modification_mapping_type"
if modification_mapping in self._psm_reader_yaml:
self.modification_mapping = self._psm_reader_yaml[modification_mapping][
"modification_mapping"
]
else:
raise ValueError(
f"Unknown modification mapping: {modification_mapping}"
)
modification_mapping,
str, # interpret as modification_mapping_type
):
self.modification_mapping = self._psm_reader_yaml["modification_mappings"][
modification_mapping
]

else:
self.modification_mapping = copy.deepcopy(modification_mapping)

Expand All @@ -125,12 +122,11 @@ def set_modification_mapping(

def _init_modification_mapping(self) -> None:
"""Initialize the modification mapping from the psm_reader_yaml or as an empty dictionary."""
if self._modification_type is not None:
self.modification_mapping = self._psm_reader_yaml[self._modification_type][
"modification_mapping"
]
else:
self.modification_mapping = {}
self.modification_mapping = (
self._psm_reader_yaml["modification_mappings"][self._mapping_type]
if self._mapping_type is not None
else {}
)

def _add_all_unimod(self) -> None:
"""Add all unimod modifications to the modification mapping."""
Expand Down
25 changes: 16 additions & 9 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,11 @@ def _is_fragger_decoy(proteins: List[str]) -> bool:
return all(prot.lower().startswith("rev_") for prot in proteins)


mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"]


def _get_mods_from_masses( # noqa: PLR0912, C901 too many branches, too complex TODO: refactor
sequence: str, msf_aa_mods: List[str]
sequence: str,
msf_aa_mods: List[str],
mass_mapped_mods: List[str],
mod_mass_tol: float,
) -> Tuple[str, str, str, str]:
mods = []
mod_sites = []
Expand Down Expand Up @@ -106,7 +105,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
# mod_seq_columns: Optional[List[str]] = None,# TODO: not needed here?
fdr: float = 0.001, # refers to E-value in the PepXML
keep_decoy: bool = False,
rt_unit: str = "second",
rt_unit: Optional[str] = None,
# MSFragger reader-specific:
keep_unknown_aa_mass_diffs: bool = False,
**kwargs,
Expand Down Expand Up @@ -134,7 +133,10 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition
rt_unit=rt_unit,
**kwargs,
)
self.keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs
self._keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs
# TODO: should those be set via API, too?
self._mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"]
self._mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"]

def _translate_modifications(self) -> None:
pass
Expand Down Expand Up @@ -183,11 +185,16 @@ def _load_modifications(self, origin_df: pd.DataFrame) -> None:
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
) = zip(
*origin_df[["peptide", "modifications"]].apply(
lambda x: _get_mods_from_masses(*x), axis=1
lambda x: _get_mods_from_masses(
*x,
mass_mapped_mods=self._mass_mapped_mods,
mod_mass_tol=self._mod_mass_tol,
),
axis=1,
)
)

if not self.keep_unknown_aa_mass_diffs:
if not self._keep_unknown_aa_mass_diffs:
self._psm_df[PsmDfCols.TO_REMOVE] += (
self._psm_df[PsmDfCols.AA_MASS_DIFFS] != ""
)
Expand Down
Loading
Loading