From f72335654781a7de59544f76995397975db1be5c Mon Sep 17 00:00:00 2001 From: kierandidi Date: Sat, 23 Mar 2024 19:01:53 +0000 Subject: [PATCH 1/7] changed time_cutoff option --- proteinworkshop/config/dataset/pdb.yaml | 5 +++-- proteinworkshop/datasets/pdb_dataset.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml index 991917d7..426707a3 100644 --- a/proteinworkshop/config/dataset/pdb.yaml +++ b/proteinworkshop/config/dataset/pdb.yaml @@ -24,6 +24,7 @@ datamodule: remove_non_standard_residues: True # Include only proteins containing standard amino acid residues remove_pdb_unavailable: True # Include only proteins that are available to download train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits - split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other option is "random" - split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type="random") + split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff" + split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity") + split_time_frames: ["2020-01-01", "2021-01-01", "2023-03-01"] # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py index 1de4abb7..93086ed4 100644 --- a/proteinworkshop/datasets/pdb_dataset.py +++ b/proteinworkshop/datasets/pdb_dataset.py @@ -2,6 +2,7 @@ import hydra import omegaconf +import numpy as np import os import pandas as pd import pathlib @@ -30,9 +31,10 @@ def 
__init__( remove_non_standard_residues: bool, remove_pdb_unavailable: bool, train_val_test: List[float], - split_type: Literal["sequence_similarity", "random"], + split_type: Literal["sequence_similarity", "time_cutoff", "random"], split_sequence_similiarity: int, - overwrite_sequence_clusters: bool + overwrite_sequence_clusters: bool, + split_time_frames: List[str] ): self.fraction = fraction self.molecule_type = molecule_type @@ -52,6 +54,7 @@ def __init__( self.split_type = split_type self.split_sequence_similarity = split_sequence_similiarity self.overwrite_sequence_clusters = overwrite_sequence_clusters + self.split_time_frames = [np.datetime64(date) for date in split_time_frames] self.splits = ["train", "val", "test"] def create_dataset(self): @@ -128,9 +131,15 @@ def create_dataset(self): elif self.split_type == "sequence_similarity": log.info(f"Splitting dataset via sequence-similarity split into {self.train_val_test}...") log.info(f"Using {self.split_sequence_similarity} sequence similarity for split") - pdb_manager.cluster(min_seq_id=self.split_sequence_similarity, update=True) - splits = pdb_manager.split_clusters( - pdb_manager.df, update=True, overwrite = self.overwrite_sequence_clusters) + pdb_manager.cluster(min_seq_id=self.split_sequence_similarity, update=True, + overwrite = self.overwrite_sequence_clusters) + splits = pdb_manager.split_clusters(pdb_manager.df, update=True) + + elif self.split_type == "time_cutoff": + log.info(f"Splitting dataset via time_cutoff split into {self.train_val_test}...") + log.info(f"Using {self.split_time_frames} dates for split") + pdb_manager.split_time_frames = self.split_time_frames + splits = pdb_manager.split_by_deposition_date(df=pdb_manager.df, update=True) log.info(splits["train"]) return splits From 707b8981268fe277957c61e288c65dff9e0b311f Mon Sep 17 00:00:00 2001 From: kierandidi Date: Sat, 23 Mar 2024 19:06:53 +0000 Subject: [PATCH 2/7] added to CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1feddd64..ebe1a315 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ * Improves support for datamodules with multiple test sets. Generalises this to support GO and FOLD. Also adds multiple seq ID.-based splits for GO. [#72](https://github.com/a-r-j/ProteinWorkshop/pull/72) * Add redownload checks for already downloaded datasets and harmonise pdb download interface [#86](https://github.com/a-r-j/ProteinWorkshop/pull/86) * Remove remaining errors from PDB dataset change -* Add option to create pdb datasets with sequence-based splits [#88](https://github.com/a-r-j/ProteinWorkshop/pull/88) +* Add option to create pdb datasets with sequence-based splits [#88](https://github.com/a-r-j/ProteinWorkshop/pull/88) as well as time-based splits [#89](https://github.com/a-r-j/ProteinWorkshop/pull/89) ### Models From cb32b8114f8c9f56ebf73a7dfaa2c1b6dd768a4d Mon Sep 17 00:00:00 2001 From: kierandidi Date: Sun, 24 Mar 2024 16:48:17 +0000 Subject: [PATCH 3/7] changed PDBDataset arguments --- proteinworkshop/config/dataset/pdb.yaml | 2 +- proteinworkshop/datasets/pdb_dataset.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml index 426707a3..9f2b8a4e 100644 --- a/proteinworkshop/config/dataset/pdb.yaml +++ b/proteinworkshop/config/dataset/pdb.yaml @@ -26,5 +26,5 @@ datamodule: train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff" split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity") - split_time_frames: ["2020-01-01", "2021-01-01", "2023-03-01"] # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") 
overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten + split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"] diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py index 93086ed4..39a470fe 100644 --- a/proteinworkshop/datasets/pdb_dataset.py +++ b/proteinworkshop/datasets/pdb_dataset.py @@ -17,6 +17,11 @@ class PDBData: def __init__( self, + split_type: Literal["sequence_similarity", "time_cutoff", "random"] = "random", + split_sequence_similiarity: Optional[int] = None, + overwrite_sequence_clusters: Optional[bool] = False, + split_time_frames: Optional[List[str]] = None, + train_val_test: List[float], fraction: float, min_length: int, max_length: int, @@ -30,11 +35,6 @@ def __init__( remove_ligands: List[str], remove_non_standard_residues: bool, remove_pdb_unavailable: bool, - train_val_test: List[float], - split_type: Literal["sequence_similarity", "time_cutoff", "random"], - split_sequence_similiarity: int, - overwrite_sequence_clusters: bool, - split_time_frames: List[str] ): self.fraction = fraction self.molecule_type = molecule_type From 48c64e83a966ca4d7bf5c5bc72a5c7a22efbca06 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Sun, 24 Mar 2024 16:52:52 +0000 Subject: [PATCH 4/7] Changed yaml file arg order --- proteinworkshop/config/dataset/pdb.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml index 9f2b8a4e..3954e199 100644 --- a/proteinworkshop/config/dataset/pdb.yaml +++ b/proteinworkshop/config/dataset/pdb.yaml @@ -10,6 +10,11 @@ datamodule: pdb_dataset: _target_: "proteinworkshop.datasets.pdb_dataset.PDBData" + split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and 
"time_cutoff" + split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity") + overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten + split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"] + train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits fraction: 1.0 # Fraction of dataset to use molecule_type: "protein" # Type of molecule for which to select experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types @@ -23,8 +28,4 @@ datamodule: remove_ligands: [] # Exclude specific ligands from any available protein-ligand complexes remove_non_standard_residues: True # Include only proteins containing standard amino acid residues remove_pdb_unavailable: True # Include only proteins that are available to download - train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits - split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff" - split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity") - overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten - split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"] + From 6b1912c839aa29b702a2b830f64fbb724fb27d95 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Mon, 25 Mar 2024 09:53:20 +0000 Subject: [PATCH 5/7] reorder arguments --- proteinworkshop/config/dataset/pdb.yaml | 11 ++++++----- proteinworkshop/datasets/pdb_dataset.py | 11 ++++++----- 2 files changed, 12 
insertions(+), 10 deletions(-) diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml index 3954e199..da481a4b 100644 --- a/proteinworkshop/config/dataset/pdb.yaml +++ b/proteinworkshop/config/dataset/pdb.yaml @@ -10,11 +10,6 @@ datamodule: pdb_dataset: _target_: "proteinworkshop.datasets.pdb_dataset.PDBData" - split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff" - split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity") - overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten - split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"] - train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits fraction: 1.0 # Fraction of dataset to use molecule_type: "protein" # Type of molecule for which to select experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types @@ -28,4 +23,10 @@ datamodule: remove_ligands: [] # Exclude specific ligands from any available protein-ligand complexes remove_non_standard_residues: True # Include only proteins containing standard amino acid residues remove_pdb_unavailable: True # Include only proteins that are available to download + train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits + split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff" + split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity") + overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten + 
split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"] + diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py index 39a470fe..af8d7118 100644 --- a/proteinworkshop/datasets/pdb_dataset.py +++ b/proteinworkshop/datasets/pdb_dataset.py @@ -17,11 +17,6 @@ class PDBData: def __init__( self, - split_type: Literal["sequence_similarity", "time_cutoff", "random"] = "random", - split_sequence_similiarity: Optional[int] = None, - overwrite_sequence_clusters: Optional[bool] = False, - split_time_frames: Optional[List[str]] = None, - train_val_test: List[float], fraction: float, min_length: int, max_length: int, @@ -35,6 +30,12 @@ def __init__( remove_ligands: List[str], remove_non_standard_residues: bool, remove_pdb_unavailable: bool, + train_val_test: List[float], + split_type: Literal["sequence_similarity", "time_cutoff", "random"] = "random", + split_sequence_similiarity: Optional[int] = None, + overwrite_sequence_clusters: Optional[bool] = False, + split_time_frames: Optional[List[str]] = None, + ): self.fraction = fraction self.molecule_type = molecule_type From 878fa9639d30ee2b34dc5c5ae2f2ef473eddc13a Mon Sep 17 00:00:00 2001 From: kierandidi Date: Tue, 26 Mar 2024 22:44:14 +0000 Subject: [PATCH 6/7] add check for time interval --- proteinworkshop/config/dataset/pdb.yaml | 4 ++-- proteinworkshop/datasets/pdb_dataset.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml index da481a4b..56540cc3 100644 --- a/proteinworkshop/config/dataset/pdb.yaml +++ b/proteinworkshop/config/dataset/pdb.yaml @@ -10,10 +10,10 @@ datamodule: pdb_dataset: _target_: "proteinworkshop.datasets.pdb_dataset.PDBData" - fraction: 1.0 # Fraction of dataset to use + fraction: 0.01 # Fraction of dataset to use molecule_type: "protein" # Type 
of molecule for which to select experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types - max_length: 1000 # Exclude polypeptides greater than length 1000 + max_length: 150 # Exclude polypeptides greater than length 150 min_length: 10 # Exclude peptides of length 10 oligomeric_min: 1 # Include only monomeric proteins oligomeric_max: 5 # Include up to 5-meric proteins diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py index af8d7118..d569624d 100644 --- a/proteinworkshop/datasets/pdb_dataset.py +++ b/proteinworkshop/datasets/pdb_dataset.py @@ -55,7 +55,13 @@ def __init__( self.split_type = split_type self.split_sequence_similarity = split_sequence_similiarity self.overwrite_sequence_clusters = overwrite_sequence_clusters - self.split_time_frames = [np.datetime64(date) for date in split_time_frames] + if split_time_frames is None: + self.split_time_frames = split_time_frames + else: + try: + self.split_time_frames = [np.datetime64(date) for date in split_time_frames] + except: + raise TypeError(f"{split_time_frames} does not contain valid dates for np.datetime64 format") self.splits = ["train", "val", "test"] def create_dataset(self): From 7a3875d0698200aaa071b42d5624457fd96b20f0 Mon Sep 17 00:00:00 2001 From: kierandidi Date: Tue, 26 Mar 2024 23:00:23 +0000 Subject: [PATCH 7/7] changed time intervall check --- proteinworkshop/datasets/pdb_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py index d569624d..0daa9d95 100644 --- a/proteinworkshop/datasets/pdb_dataset.py +++ b/proteinworkshop/datasets/pdb_dataset.py @@ -55,9 +55,7 @@ def __init__( self.split_type = split_type self.split_sequence_similarity = split_sequence_similiarity self.overwrite_sequence_clusters = overwrite_sequence_clusters - if split_time_frames is None: - self.split_time_frames = split_time_frames - else: + if
self.split_type == "time_cutoff": try: self.split_time_frames = [np.datetime64(date) for date in split_time_frames] except: