From dd6923a14e48b0c5dde8f3a97d84d19aa7e4e29b Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 20 Nov 2024 14:50:05 +0100 Subject: [PATCH 1/8] read defaults for rt_unit & fixed_c57 from yaml --- alphabase/constants/const_files/psm_reader.yaml | 6 +----- alphabase/psm_reader/dia_psm_reader.py | 1 - alphabase/psm_reader/maxquant_reader.py | 10 +++++++--- alphabase/psm_reader/msfragger_reader.py | 2 +- alphabase/psm_reader/psm_reader.py | 12 ++++++++---- alphabase/psm_reader/sage_reader.py | 2 +- alphabase/spectral_library/reader.py | 2 +- 7 files changed, 19 insertions(+), 16 deletions(-) diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml index 5b67be60..bde674ab 100644 --- a/alphabase/constants/const_files/psm_reader.yaml +++ b/alphabase/constants/const_files/psm_reader.yaml @@ -119,8 +119,6 @@ pfind: 'uniprot_ids': 'Proteins' 'fdr': 'Q-value' 'decoy': ['Target/Decoy', 'Targe/Decoy'] - modification_mapping: - '': '' msfragger_pepxml: reader_type: msfragger_pepxml @@ -136,8 +134,6 @@ msfragger_pepxml: 'proteins': 'protein' 'raw_name': 'raw_name' 'mobility': 'ion_mobility' - modification_mapping: - '': '' mass_mapped_mods: - 'Oxidation@M' #other Oxidation@X are not needed here - 'Carbamidomethyl@C' @@ -247,7 +243,7 @@ library_reader_base: sage: reader_type: sage - rt_unit: minute + rt_unit: second column_mapping: 'modified_sequence': 'peptide' 'sequence': 'stripped_peptide' diff --git a/alphabase/psm_reader/dia_psm_reader.py b/alphabase/psm_reader/dia_psm_reader.py index 01d53e9b..2b638591 100644 --- a/alphabase/psm_reader/dia_psm_reader.py +++ b/alphabase/psm_reader/dia_psm_reader.py @@ -18,7 +18,6 @@ class SpectronautReader(MaxQuantReader): _reader_type = "spectronaut" _add_unimod_to_mod_mapping = True _min_max_rt_norm = True - _fixed_c57_default = False def _pre_process(self, df: pd.DataFrame) -> pd.DataFrame: """Spectronaut-specific preprocessing of output data.""" diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 029e73f2..7f8d2382 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -11,6 +11,7 @@ from alphabase.psm_reader.psm_reader import ( PSMReaderBase, psm_reader_provider, + psm_reader_yaml, ) from alphabase.psm_reader.utils import get_column_mapping_for_df @@ -128,7 +129,6 @@ class MaxQuantReader(PSMReaderBase): _reader_type = "maxquant" _add_unimod_to_mod_mapping = True _modification_type = "maxquant" - _fixed_c57_default = True def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions self, @@ -138,7 +138,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition mod_seq_columns: Optional[List[str]] = None, fdr: float = 0.01, keep_decoy: bool = False, - rt_unit: str = "minute", + rt_unit: Optional[str] = None, # MaxQuant reader-specific fixed_C57: Optional[bool] = None, # noqa: N803 TODO: make this *,fixed_c57 (breaking) **kwargs, @@ -167,7 +167,11 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition **kwargs, ) - self.fixed_C57 = fixed_C57 if fixed_C57 is not None else self._fixed_c57_default + self.fixed_C57 = ( + fixed_C57 + if fixed_C57 is not None + else psm_reader_yaml[self._reader_type]["fixed_C57"] + ) def _translate_decoy(self) -> None: if PsmDfCols.DECOY in self._psm_df.columns: diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index 099548e8..c2d05056 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -106,7 +106,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition # mod_seq_columns: Optional[List[str]] = None,# TODO: not needed here? fdr: float = 0.001, # refers to E-value in the PepXML keep_decoy: bool = False, - rt_unit: str = "second", + rt_unit: Optional[str] = None, # MSFragger reader-specific: keep_unknown_aa_mass_diffs: bool = False, **kwargs, diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index c7c3892a..31e58402 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -46,7 +46,7 @@ def __init__( # noqa: PLR0913 # too many arguments modification_mapping: Optional[dict] = None, fdr: float = 0.01, keep_decoy: bool = False, - rt_unit: str = "minute", + rt_unit: Optional[str] = None, mod_seq_columns: Optional[List[str]] = None, **kwargs, ): @@ -98,8 +98,8 @@ def __init__( # noqa: PLR0913 # too many arguments Defaults to False. rt_unit : str, optional - The unit of RT in the search engine result. - Defaults to 'minute'. + The unit of RT in the search engine result, "minute", "second" or "irt". + If None, it is read from psm_reader_yaml key "rt_unit". mod_seq_columns : list, optional The columns to find modified sequences. @@ -145,7 +145,11 @@ def __init__( # noqa: PLR0913 # too many arguments self._keep_fdr = fdr self._keep_decoy = keep_decoy - self._engine_rt_unit = rt_unit + self._engine_rt_unit = ( + rt_unit + if rt_unit is not None + else psm_reader_yaml[self._reader_type]["rt_unit"] + ) self._min_irt_value = -100 self._max_irt_value = 200 diff --git a/alphabase/psm_reader/sage_reader.py b/alphabase/psm_reader/sage_reader.py index 7e4945c5..265846de 100644 --- a/alphabase/psm_reader/sage_reader.py +++ b/alphabase/psm_reader/sage_reader.py @@ -576,7 +576,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition # mod_seq_columns: Optional[List[str]] = None, # TODO: not needed here? fdr: float = 0.01, keep_decoy: bool = False, - rt_unit: str = "second", + rt_unit: Optional[str] = None, # sage reader-specific: custom_translation_df: pd.DataFrame = None, mp_process_num: int = 10, diff --git a/alphabase/spectral_library/reader.py b/alphabase/spectral_library/reader.py index 0ec44146..3bbf8f80 100644 --- a/alphabase/spectral_library/reader.py +++ b/alphabase/spectral_library/reader.py @@ -37,7 +37,7 @@ def __init__( # noqa: PLR0913 many arguments in function definition fdr: float = 0.01, fixed_C57: bool = False, # noqa: FBT001, FBT002, N803 TODO: make this *,fixed_c57 (breaking) mod_seq_columns: Optional[List[str]] = None, - rt_unit: str = "irt", + rt_unit: Optional[str] = None, # library reader-specific: precursor_mz_min: float = 400, precursor_mz_max: float = 2000, From f0e5d7b804a1d4d0d247786bdca80fa99afcfb77 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:03:48 +0100 Subject: [PATCH 2/8] refactorings --- alphabase/constants/_const.py | 1 + alphabase/psm_reader/maxquant_reader.py | 2 +- alphabase/psm_reader/psm_reader.py | 28 ++++++++++++++++--------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/alphabase/constants/_const.py b/alphabase/constants/_const.py index d7d5da91..95f34102 100644 --- a/alphabase/constants/_const.py +++ b/alphabase/constants/_const.py @@ -5,6 +5,7 @@ from alphabase.yaml_utils import load_yaml CONST_FILE_FOLDER = os.path.join(os.path.dirname(__file__), "const_files") +PSM_READER_YAML_FILE_NAME = "psm_reader.yaml" common_const_dict: dict = load_yaml( os.path.join(CONST_FILE_FOLDER, "common_constants.yaml") diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 7f8d2382..798c86c1 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -152,7 +152,7 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition fixed_C57 : bool, optional If true, the search engine will not show `Carbamidomethyl` in the modified sequences. - by default True + by default read from psm_reader_yaml key `fixed_C57`. See documentation of `PSMReaderBase` for the rest of parameters. diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index 31e58402..edb8538d 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from alphabase.constants._const import CONST_FILE_FOLDER +from alphabase.constants._const import CONST_FILE_FOLDER, PSM_READER_YAML_FILE_NAME from alphabase.peptide import mobility from alphabase.peptide.precursor import reset_precursor_df, update_precursor_mz from alphabase.psm_reader.keys import PsmDfCols @@ -23,7 +23,10 @@ from alphabase.yaml_utils import load_yaml #: See `psm_reader.yaml `_ -psm_reader_yaml = load_yaml(Path(CONST_FILE_FOLDER) / "psm_reader.yaml") +psm_reader_yaml = load_yaml(Path(CONST_FILE_FOLDER) / PSM_READER_YAML_FILE_NAME) + +_MIN_IRT_VALUE = -100 +_MAX_IRT_VALUE = 200 class PSMReaderBase(ABC): @@ -124,7 +127,9 @@ def __init__( # noqa: PLR0913 # too many arguments self._modification_mapper = ModificationMapper( modification_mapping, reader_yaml=copy.deepcopy(psm_reader_yaml), - modification_type=self._modification_type, + modification_type=psm_reader_yaml[self._reader_type].get( + "modification_mapping_type", None + ), add_unimod_to_mod_mapping=self._add_unimod_to_mod_mapping, ) @@ -145,13 +150,17 @@ def __init__( # noqa: PLR0913 # too many arguments self._keep_fdr = fdr self._keep_decoy = keep_decoy - self._engine_rt_unit = ( + + self._rt_unit = ( rt_unit if rt_unit is not None else psm_reader_yaml[self._reader_type]["rt_unit"] ) - self._min_irt_value = -100 - self._max_irt_value = 200 + if self._rt_unit not in ["minute", "second", "irt"]: + raise ValueError( + f"Invalid rt_unit: {self._rt_unit}. " + f"rt_unit should be one of ['minute', 'second', 'irt']." + ) for key, value in kwargs.items(): # TODO: remove and remove kwargs warnings.warn( @@ -365,7 +374,7 @@ def _normalize_rt(self) -> None: if PsmDfCols.RT not in self._psm_df.columns: return - if self._engine_rt_unit == "second": + if self._rt_unit == "second": self._psm_df[PsmDfCols.RT] = self._psm_df[PsmDfCols.RT] / 60 if PsmDfCols.RT_START in self._psm_df.columns: self._psm_df[PsmDfCols.RT_START] = self._psm_df[PsmDfCols.RT_START] / 60 @@ -374,9 +383,8 @@ def _normalize_rt(self) -> None: min_rt = self._psm_df[PsmDfCols.RT].min() max_rt = self._psm_df[PsmDfCols.RT].max() if min_rt < 0: # iRT - min_rt = max(min_rt, self._min_irt_value) - max_rt = min(max_rt, self._max_irt_value) - + min_rt = max(min_rt, _MIN_IRT_VALUE) + max_rt = min(max_rt, _MAX_IRT_VALUE) elif not self._min_max_rt_norm: min_rt = 0 From c3ed0dfafee161e0c25324bd68809016da1edae5 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:08:58 +0100 Subject: [PATCH 3/8] introduce modification_mapping_type --- .../constants/const_files/psm_reader.yaml | 32 ++++++++++++------- alphabase/psm_reader/alphapept_reader.py | 1 - alphabase/psm_reader/maxquant_reader.py | 1 - alphabase/psm_reader/modification_mapper.py | 32 ++++++++----------- alphabase/psm_reader/psm_reader.py | 8 ++--- tests/integration/test_psm_readers.py | 2 ++ 6 files changed, 38 insertions(+), 38 deletions(-) diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml index bde674ab..6c8443b2 100644 --- a/alphabase/constants/const_files/psm_reader.yaml +++ b/alphabase/constants/const_files/psm_reader.yaml @@ -13,13 +13,8 @@ alphapept: 'raw_name': 'raw_name' #parse from `ms_data.hdf`` file 'fdr': 'q_value' 'decoy': 'decoy' - modification_mapping: - 'Carbamidomethyl@C': 'cC' - 'Oxidation@M': 'oxM' - 'Phospho@S': 'pS' - 'Phospho@T': 'pT' - 'Phospho@Y': 'pY' - 'Acetyl@Protein_N-term': 'a' + modification_mapping_type: 'alphapept' + maxquant: reader_type: maxquant @@ -49,7 +44,10 @@ maxquant: 'genes': ['Gene Names','Gene names'] 'decoy': 'Reverse' 'intensity': 'Intensity' - modification_mapping: + modification_mapping_type: 'maxquant' + +modification_mappings: + maxquant: 'Dimethyl@K': - 'K(Dimethyl)' 'Dimethyl@R': @@ -103,6 +101,13 @@ maxquant: 'Deamidated@Q': ['Q(Deamidation (NQ))','Q(de)'] 'GlyGly@K': ['K(GlyGly (K))', 'K(gl)'] 'hydroxyisobutyryl@K': 'K(2-)' + alphapept: + 'Carbamidomethyl@C': 'cC' + 'Oxidation@M': 'oxM' + 'Phospho@S': 'pS' + 'Phospho@T': 'pT' + 'Phospho@Y': 'pY' + 'Acetyl@Protein_N-term': 'a' pfind: reader_type: pfind @@ -119,6 +124,7 @@ pfind: 'uniprot_ids': 'Proteins' 'fdr': 'Q-value' 'decoy': ['Target/Decoy', 'Targe/Decoy'] + modification_mapping_type: 'maxquant' msfragger_pepxml: reader_type: msfragger_pepxml @@ -146,6 +152,7 @@ msfragger_pepxml: - 'Dimethyl@K' # Any_N-term is not needed here as it will be infered in-the-fly - 'Methyl@E' #an example of a PTM that can be C-term mod_mass_tol: 0.1 # Da + modification_mapping_type: 'maxquant' diann: reader_type: diann @@ -168,7 +175,7 @@ diann: 'fdr': 'Q.Value' mod_seq_columns: - "Modified.Sequence" - modification_mapping: 'maxquant' + modification_mapping_type: 'maxquant' spectronaut_report: reader_type: spectronaut_report @@ -184,7 +191,7 @@ spectronaut_report: 'charge': 'charge' mod_seq_columns: - 'ModifiedSequence' - modification_mapping: 'maxquant' + modification_mapping_type: 'maxquant' spectronaut: reader_type: spectronaut @@ -208,7 +215,7 @@ spectronaut: - 'ModifiedPeptideSequence' - 'LabeledSequence' - 'FullUniModPeptideName' - modification_mapping: 'maxquant' + modification_mapping_type: 'maxquant' library_reader_base: reader_type: library_reader_base @@ -239,7 +246,7 @@ library_reader_base: - 'FullUniModPeptideName' - 'LabeledSequence' - 'FullUniModPeptideName' - modification_mapping: 'maxquant' + modification_mapping_type: 'maxquant' sage: reader_type: sage @@ -258,3 +265,4 @@ sage: 'peptide_fdr': 'peptide_q' 'protein_fdr': 'protein_q' 'decoy': 'is_decoy' + modification_mapping_type: 'maxquant' diff --git a/alphabase/psm_reader/alphapept_reader.py b/alphabase/psm_reader/alphapept_reader.py index 5c1a332d..b3efba1d 100644 --- a/alphabase/psm_reader/alphapept_reader.py +++ b/alphabase/psm_reader/alphapept_reader.py @@ -52,7 +52,6 @@ class AlphaPeptReader(PSMReaderBase): """Reader for AlphaPept's *.ms_data.hdf files.""" _reader_type = "alphapept" - _modification_type = "alphapept" def _load_file(self, filename: str) -> pd.DataFrame: """Load an AlphaPept output file to a DataFrame.""" diff --git a/alphabase/psm_reader/maxquant_reader.py b/alphabase/psm_reader/maxquant_reader.py index 798c86c1..101efcd8 100644 --- a/alphabase/psm_reader/maxquant_reader.py +++ b/alphabase/psm_reader/maxquant_reader.py @@ -128,7 +128,6 @@ class MaxQuantReader(PSMReaderBase): _reader_type = "maxquant" _add_unimod_to_mod_mapping = True - _modification_type = "maxquant" def __init__( # noqa: PLR0913, D417 # too many arguments in function definition, missing argument descriptions self, diff --git a/alphabase/psm_reader/modification_mapper.py b/alphabase/psm_reader/modification_mapper.py index 355ee08f..534e2610 100644 --- a/alphabase/psm_reader/modification_mapper.py +++ b/alphabase/psm_reader/modification_mapper.py @@ -15,7 +15,7 @@ def __init__( custom_modification_mapping: Optional[Dict[str, str]], *, reader_yaml: Dict, - modification_type: Optional[str], + mapping_type: str, add_unimod_to_mod_mapping: bool, ): """Initialize the ModificationMapper. @@ -35,7 +35,7 @@ def __init__( reader_yaml: the yaml (read from file) containing the modification mappings - modification_type: + mapping_type: the type of modification mapping ("maxquant" or "alphapept") add_unimod_to_mod_mapping: @@ -44,7 +44,7 @@ def __init__( """ self._psm_reader_yaml = reader_yaml self._add_unimod_to_mod_mapping = add_unimod_to_mod_mapping - self._modification_type = modification_type + self._mapping_type = mapping_type self.modification_mapping = None self.rev_mod_mapping = None @@ -102,16 +102,13 @@ def set_modification_mapping( if modification_mapping is None: self._init_modification_mapping() elif isinstance( - modification_mapping, str - ): # TODO: remove this overloading of the parameter by introducing yaml key "modification_mapping_type" - if modification_mapping in self._psm_reader_yaml: - self.modification_mapping = self._psm_reader_yaml[modification_mapping][ - "modification_mapping" - ] - else: - raise ValueError( - f"Unknown modification mapping: {modification_mapping}" - ) + modification_mapping, + str, # interprete as modification_mapping_type + ): + self.modification_mapping = self._psm_reader_yaml["modification_mappings"][ + modification_mapping + ] + else: self.modification_mapping = copy.deepcopy(modification_mapping) @@ -125,12 +122,9 @@ def set_modification_mapping( def _init_modification_mapping(self) -> None: """Initialize the modification mapping from the psm_reader_yaml or as an empty dictionary.""" - if self._modification_type is not None: - self.modification_mapping = self._psm_reader_yaml[self._modification_type][ - "modification_mapping" - ] - else: - self.modification_mapping = {} + self.modification_mapping = self._psm_reader_yaml["modification_mappings"][ + self._mapping_type + ] def _add_all_unimod(self) -> None: """Add all unimod modifications to the modification mapping.""" diff --git a/alphabase/psm_reader/psm_reader.py b/alphabase/psm_reader/psm_reader.py index edb8538d..4c3e9aec 100644 --- a/alphabase/psm_reader/psm_reader.py +++ b/alphabase/psm_reader/psm_reader.py @@ -36,8 +36,6 @@ class PSMReaderBase(ABC): _reader_type: str # whether to add the unimod mappings to the modification mapping _add_unimod_to_mod_mapping: bool = False - # the typ of modification mapping to be used - _modification_type: Optional[str] = None # whether 'rt_norm' values in self._psm_dd will be normalized using min/max values # Useful to normalize iRT values as they contain negative values. _min_max_rt_norm = False @@ -127,9 +125,9 @@ def __init__( # noqa: PLR0913 # too many arguments self._modification_mapper = ModificationMapper( modification_mapping, reader_yaml=copy.deepcopy(psm_reader_yaml), - modification_type=psm_reader_yaml[self._reader_type].get( - "modification_mapping_type", None - ), + mapping_type=psm_reader_yaml[self._reader_type][ + "modification_mapping_type" + ], add_unimod_to_mod_mapping=self._add_unimod_to_mod_mapping, ) diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py index f7c28f8f..e33ef8f8 100644 --- a/tests/integration/test_psm_readers.py +++ b/tests/integration/test_psm_readers.py @@ -229,6 +229,8 @@ def _assert_reference_df_equal( def test_psm_reader_yaml() -> None: """Test that all column mappings in the psm_reader.yaml are covered by string constant keys.""" for reader_config in psm_reader_yaml.values(): + if reader_config == "modification_mappings": + continue ks = [k for k in reader_config["column_mapping"]] assert ( set(ks) - set(PsmDfCols.get_values()) - set(LibPsmDfCols.get_values()) From c6189bdcf61bcb1a52a9d362ddb4cc15568ea858 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:17:45 +0100 Subject: [PATCH 4/8] fix tests --- tests/integration/test_psm_readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_psm_readers.py b/tests/integration/test_psm_readers.py index e33ef8f8..65a15075 100644 --- a/tests/integration/test_psm_readers.py +++ b/tests/integration/test_psm_readers.py @@ -231,6 +231,8 @@ def test_psm_reader_yaml() -> None: for reader_config in psm_reader_yaml.values(): if reader_config == "modification_mappings": continue + if "column_mapping" not in reader_config: + continue ks = [k for k in reader_config["column_mapping"]] assert ( set(ks) - set(PsmDfCols.get_values()) - set(LibPsmDfCols.get_values()) From 1e975e541f62a565084fc36463fd5d4b7e63e353 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 20 Nov 2024 15:21:35 +0100 Subject: [PATCH 5/8] add code review action --- .github/workflows/_run_tests.yml | 2 +- .github/workflows/branch-checks.yaml | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_run_tests.yml b/.github/workflows/_run_tests.yml index 1243b0a8..e9e13eee 100644 --- a/.github/workflows/_run_tests.yml +++ b/.github/workflows/_run_tests.yml @@ -14,7 +14,7 @@ on: required: true type: string jobs: - pre-commit: + run-tests: runs-on: ${{ inputs.os }} steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/branch-checks.yaml b/.github/workflows/branch-checks.yaml index 033909a7..2f6a8155 100644 --- a/.github/workflows/branch-checks.yaml +++ b/.github/workflows/branch-checks.yaml @@ -24,3 +24,11 @@ jobs: python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} install-script: "loose_pip_install.sh" + get-code-review-input: + runs-on: ubuntu-latest + #if: contains(github.event.pull_request.labels.*.name, 'code-review') + steps: + - uses: MannLabs/alphashared/actions/get-code-review-input@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.number }} From b8e091b189fce081b00990e428618890ac8212c1 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:21:20 +0100 Subject: [PATCH 6/8] fix tests --- nbs_tests/psm_reader/alphapept_reader.ipynb | 63 ++++----------------- nbs_tests/psm_reader/maxquant_reader.ipynb | 2 +- 2 files changed, 12 insertions(+), 53 deletions(-) diff --git a/nbs_tests/psm_reader/alphapept_reader.ipynb b/nbs_tests/psm_reader/alphapept_reader.ipynb index d2722e21..4948ae69 100644 --- a/nbs_tests/psm_reader/alphapept_reader.ipynb +++ b/nbs_tests/psm_reader/alphapept_reader.ipynb @@ -24,10 +24,10 @@ ] }, { - "cell_type": "code", - "execution_count": null, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "import os\n", "from alphabase.psm_reader.alphapept_reader import register_readers\n", @@ -37,70 +37,29 @@ ] }, { + "metadata": {}, "cell_type": "code", + "outputs": [], "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'rt': 'rt',\n", - " 'scan_num': 'scan_no',\n", - " 'spec_idx': 'raw_idx',\n", - " 'query_id': 'query_idx',\n", - " 'mobility': 'mobility',\n", - " 'score': 'score',\n", - " 'precursor_mz': 'mz',\n", - " 'charge': 'charge',\n", - " 'raw_name': 'raw_name',\n", - " 'fdr': 'q_value',\n", - " 'decoy': 'decoy'}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": "psm_reader_yaml['alphapept']['column_mapping']" }, { + "metadata": {}, "cell_type": "code", + "outputs": [], "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Carbamidomethyl@C': 'cC',\n", - " 'Oxidation@M': 'oxM',\n", - " 'Phospho@S': 'pS',\n", - " 'Phospho@T': 'pT',\n", - " 'Phospho@Y': 'pY',\n", - " 'Acetyl@Protein_N-term': 'a'}" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "psm_reader_yaml['alphapept']['modification_mapping']" - ] + "source": "psm_reader_yaml['modification_mappings']['alphapept']" }, { - "cell_type": "markdown", "metadata": {}, - "source": [ - "The modified sequence column is `precursor` column" - ] + "cell_type": "markdown", + "source": "The modified sequence column is `precursor` column" }, { - "cell_type": "code", - "execution_count": null, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "#| hide\n", "ap_reader = psm_reader_provider.get_reader('alphapept')\n", diff --git a/nbs_tests/psm_reader/maxquant_reader.ipynb b/nbs_tests/psm_reader/maxquant_reader.ipynb index 63c5f8a7..5789796e 100644 --- a/nbs_tests/psm_reader/maxquant_reader.ipynb +++ b/nbs_tests/psm_reader/maxquant_reader.ipynb @@ -103,7 +103,7 @@ } ], "source": [ - "psm_reader_yaml['maxquant']['modification_mapping']" + "psm_reader_yaml['modification_mappings']['maxquant']" ] }, { From 36ccc7d6c23e38424bb9b9ec1cb74eaf59e0a935 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:53:40 +0100 Subject: [PATCH 7/8] refactor MSFraggerReader --- alphabase/psm_reader/modification_mapper.py | 4 ++-- alphabase/psm_reader/msfragger_reader.py | 23 ++++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/alphabase/psm_reader/modification_mapper.py b/alphabase/psm_reader/modification_mapper.py index 534e2610..b8a3ce52 100644 --- a/alphabase/psm_reader/modification_mapper.py +++ b/alphabase/psm_reader/modification_mapper.py @@ -95,8 +95,8 @@ def set_modification_mapping( ---------- modification_mapping: If dictionary: the current modification_mapping will be overwritten by this. - If str: the parameter will be interpreted as a reader type, and the modification_mapping is read from the - "modification_mapping" section of the psm_reader_yaml + If str: the parameter will be interpreted as a modification_mapping_type, and the mapping is read from the + respective key in the "modification_mappings" section of the psm_reader_yaml """ if modification_mapping is None: diff --git a/alphabase/psm_reader/msfragger_reader.py b/alphabase/psm_reader/msfragger_reader.py index c2d05056..cc9eb0da 100644 --- a/alphabase/psm_reader/msfragger_reader.py +++ b/alphabase/psm_reader/msfragger_reader.py @@ -21,12 +21,11 @@ def _is_fragger_decoy(proteins: List[str]) -> bool: return all(prot.lower().startswith("rev_") for prot in proteins) -mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"] -mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"] - - def _get_mods_from_masses( # noqa: PLR0912, C901 too many branches, too complex TODO: refactor - sequence: str, msf_aa_mods: List[str] + sequence: str, + msf_aa_mods: List[str], + mass_mapped_mods: List[str], + mod_mass_tol: float, ) -> Tuple[str, str, str, str]: mods = [] mod_sites = [] @@ -134,7 +133,10 @@ def __init__( # noqa: PLR0913, D417 # too many arguments in function definition rt_unit=rt_unit, **kwargs, ) - self.keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs + self._keep_unknown_aa_mass_diffs = keep_unknown_aa_mass_diffs + # TODO: should those be set via API, too? + self._mass_mapped_mods = psm_reader_yaml["msfragger_pepxml"]["mass_mapped_mods"] + self._mod_mass_tol = psm_reader_yaml["msfragger_pepxml"]["mod_mass_tol"] def _translate_modifications(self) -> None: pass @@ -183,11 +185,16 @@ def _load_modifications(self, origin_df: pd.DataFrame) -> None: self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES], ) = zip( *origin_df[["peptide", "modifications"]].apply( - lambda x: _get_mods_from_masses(*x), axis=1 + lambda x: _get_mods_from_masses( + *x, + mass_mapped_mods=self._mass_mapped_mods, + mod_mass_tol=self._mod_mass_tol, + ), + axis=1, ) ) - if not self.keep_unknown_aa_mass_diffs: + if not self._keep_unknown_aa_mass_diffs: self._psm_df[PsmDfCols.TO_REMOVE] += ( self._psm_df[PsmDfCols.AA_MASS_DIFFS] != "" ) From 2f7f75ae601333c6a604e8146f14f6c6ba3c0ba3 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:21:19 +0100 Subject: [PATCH 8/8] make pfind and sage special mappings more explicit in yaml --- alphabase/constants/const_files/psm_reader.yaml | 4 ++-- alphabase/psm_reader/modification_mapper.py | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/alphabase/constants/const_files/psm_reader.yaml b/alphabase/constants/const_files/psm_reader.yaml index 6c8443b2..acc881a0 100644 --- a/alphabase/constants/const_files/psm_reader.yaml +++ b/alphabase/constants/const_files/psm_reader.yaml @@ -124,7 +124,7 @@ pfind: 'uniprot_ids': 'Proteins' 'fdr': 'Q-value' 'decoy': ['Target/Decoy', 'Targe/Decoy'] - modification_mapping_type: 'maxquant' + modification_mapping_type: null # no mapping required msfragger_pepxml: reader_type: msfragger_pepxml @@ -265,4 +265,4 @@ sage: 'peptide_fdr': 'peptide_q' 'protein_fdr': 'protein_q' 'decoy': 'is_decoy' - modification_mapping_type: 'maxquant' + modification_mapping_type: null # custom mapping in code diff --git a/alphabase/psm_reader/modification_mapper.py b/alphabase/psm_reader/modification_mapper.py index b8a3ce52..2bc322a2 100644 --- a/alphabase/psm_reader/modification_mapper.py +++ b/alphabase/psm_reader/modification_mapper.py @@ -103,7 +103,7 @@ def set_modification_mapping( self._init_modification_mapping() elif isinstance( modification_mapping, - str, # interprete as modification_mapping_type + str, # interpret as modification_mapping_type ): self.modification_mapping = self._psm_reader_yaml["modification_mappings"][ modification_mapping @@ -122,9 +122,11 @@ def set_modification_mapping( def _init_modification_mapping(self) -> None: """Initialize the modification mapping from the psm_reader_yaml or as an empty dictionary.""" - self.modification_mapping = self._psm_reader_yaml["modification_mappings"][ - self._mapping_type - ] + self.modification_mapping = ( + self._psm_reader_yaml["modification_mappings"][self._mapping_type] + if self._mapping_type is not None + else {} + ) def _add_all_unimod(self) -> None: """Add all unimod modifications to the modification mapping."""