From f72335654781a7de59544f76995397975db1be5c Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Sat, 23 Mar 2024 19:01:53 +0000
Subject: [PATCH 1/7] changed time_cutoff option

---
 proteinworkshop/config/dataset/pdb.yaml |  5 +++--
 proteinworkshop/datasets/pdb_dataset.py | 19 ++++++++++++++-----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml
index 991917d7..426707a3 100644
--- a/proteinworkshop/config/dataset/pdb.yaml
+++ b/proteinworkshop/config/dataset/pdb.yaml
@@ -24,6 +24,7 @@ datamodule:
     remove_non_standard_residues: True # Include only proteins containing standard amino acid residues
     remove_pdb_unavailable: True # Include only proteins that are available to download
     train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
-    split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other option is "random"
-    split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type="random")
+    split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
+    split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
+    split_time_frames: ["2020-01-01", "2021-01-01", "2023-03-01"] # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff")
     overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py
index 1de4abb7..93086ed4 100644
--- a/proteinworkshop/datasets/pdb_dataset.py
+++ b/proteinworkshop/datasets/pdb_dataset.py
@@ -2,6 +2,7 @@
 
 import hydra
 import omegaconf
+import numpy as np
 import os
 import pandas as pd
 import pathlib
@@ -30,9 +31,10 @@ def __init__(
         remove_non_standard_residues: bool,
         remove_pdb_unavailable: bool,
         train_val_test: List[float],
-        split_type: Literal["sequence_similarity", "random"],
+        split_type: Literal["sequence_similarity", "time_cutoff", "random"],
         split_sequence_similiarity: int,
-        overwrite_sequence_clusters: bool
+        overwrite_sequence_clusters: bool,
+        split_time_frames: List[str]
     ):
         self.fraction = fraction
         self.molecule_type = molecule_type
@@ -52,6 +54,7 @@ def __init__(
         self.split_type = split_type
         self.split_sequence_similarity = split_sequence_similiarity
         self.overwrite_sequence_clusters = overwrite_sequence_clusters
+        self.split_time_frames = [np.datetime64(date) for date in split_time_frames]
         self.splits = ["train", "val", "test"]
 
     def create_dataset(self):
@@ -128,9 +131,15 @@ def create_dataset(self):
         elif self.split_type == "sequence_similarity":
             log.info(f"Splitting dataset via sequence-similarity split into {self.train_val_test}...")
             log.info(f"Using {self.split_sequence_similarity} sequence similarity for split")
-            pdb_manager.cluster(min_seq_id=self.split_sequence_similarity, update=True)
-            splits = pdb_manager.split_clusters(
-                pdb_manager.df, update=True, overwrite = self.overwrite_sequence_clusters)
+            pdb_manager.cluster(min_seq_id=self.split_sequence_similarity, update=True,
+                                 overwrite = self.overwrite_sequence_clusters)
+            splits = pdb_manager.split_clusters(pdb_manager.df, update=True)
+        
+        elif self.split_type == "time_cutoff":
+            log.info(f"Splitting dataset via time_cutoff split into {self.train_val_test}...")
+            log.info(f"Using {self.split_time_frames} dates for split")
+            pdb_manager.split_time_frames = self.split_time_frames
+            splits = pdb_manager.split_by_deposition_date(df=pdb_manager.df, update=True)
 
         log.info(splits["train"])
         return splits

From 707b8981268fe277957c61e288c65dff9e0b311f Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Sat, 23 Mar 2024 19:06:53 +0000
Subject: [PATCH 2/7] added to CHANGELOG

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1feddd64..ebe1a315 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@
 * Improves support for datamodules with multiple test sets. Generalises this to support GO and FOLD. Also adds multiple seq ID.-based splits for GO. [#72](https://github.com/a-r-j/ProteinWorkshop/pull/72)
 * Add redownload checks for already downloaded datasets and harmonise pdb download interface [#86](https://github.com/a-r-j/ProteinWorkshop/pull/86)
 * Remove remaining errors from PDB dataset change
-* Add option to create pdb datasets with sequence-based splits [#88](https://github.com/a-r-j/ProteinWorkshop/pull/88)
+* Add option to create pdb datasets with sequence-based splits [#88](https://github.com/a-r-j/ProteinWorkshop/pull/88) as well as time-based splits [#89](https://github.com/a-r-j/ProteinWorkshop/pull/89)
 
 ### Models
 

From cb32b8114f8c9f56ebf73a7dfaa2c1b6dd768a4d Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Sun, 24 Mar 2024 16:48:17 +0000
Subject: [PATCH 3/7] changed PDBDataset arguments

---
 proteinworkshop/config/dataset/pdb.yaml |  2 +-
 proteinworkshop/datasets/pdb_dataset.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml
index 426707a3..9f2b8a4e 100644
--- a/proteinworkshop/config/dataset/pdb.yaml
+++ b/proteinworkshop/config/dataset/pdb.yaml
@@ -26,5 +26,5 @@ datamodule:
     train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
     split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
     split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
-    split_time_frames: ["2020-01-01", "2021-01-01", "2023-03-01"] # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff")
     overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
+    split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"]
diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py
index 93086ed4..39a470fe 100644
--- a/proteinworkshop/datasets/pdb_dataset.py
+++ b/proteinworkshop/datasets/pdb_dataset.py
@@ -17,6 +17,11 @@
 class PDBData:
     def __init__(
         self,
+        split_type: Literal["sequence_similarity", "time_cutoff", "random"] = "random",
+        split_sequence_similiarity: Optional[int] = None,
+        overwrite_sequence_clusters: Optional[bool] = False,
+        split_time_frames: Optional[List[str]] = None,
+        train_val_test: List[float],
         fraction: float,
         min_length: int,
         max_length: int,
@@ -30,11 +35,6 @@ def __init__(
         remove_ligands: List[str],
         remove_non_standard_residues: bool,
         remove_pdb_unavailable: bool,
-        train_val_test: List[float],
-        split_type: Literal["sequence_similarity", "time_cutoff", "random"],
-        split_sequence_similiarity: int,
-        overwrite_sequence_clusters: bool,
-        split_time_frames: List[str]
     ):
         self.fraction = fraction
         self.molecule_type = molecule_type

From 48c64e83a966ca4d7bf5c5bc72a5c7a22efbca06 Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Sun, 24 Mar 2024 16:52:52 +0000
Subject: [PATCH 4/7] Changed yaml file arg order

---
 proteinworkshop/config/dataset/pdb.yaml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml
index 9f2b8a4e..3954e199 100644
--- a/proteinworkshop/config/dataset/pdb.yaml
+++ b/proteinworkshop/config/dataset/pdb.yaml
@@ -10,6 +10,11 @@ datamodule:
 
   pdb_dataset:
     _target_: "proteinworkshop.datasets.pdb_dataset.PDBData"
+    split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
+    split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
+    overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
+    split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"]
+    train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
     fraction: 1.0 # Fraction of dataset to use
     molecule_type: "protein" # Type of molecule for which to select
     experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types
@@ -23,8 +28,4 @@ datamodule:
     remove_ligands: [] # Exclude specific ligands from any available protein-ligand complexes
     remove_non_standard_residues: True # Include only proteins containing standard amino acid residues
     remove_pdb_unavailable: True # Include only proteins that are available to download
-    train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
-    split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
-    split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
-    overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
-    split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"]
+

From 6b1912c839aa29b702a2b830f64fbb724fb27d95 Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Mon, 25 Mar 2024 09:53:20 +0000
Subject: [PATCH 5/7] reorder arguments

---
 proteinworkshop/config/dataset/pdb.yaml | 11 ++++++-----
 proteinworkshop/datasets/pdb_dataset.py | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml
index 3954e199..da481a4b 100644
--- a/proteinworkshop/config/dataset/pdb.yaml
+++ b/proteinworkshop/config/dataset/pdb.yaml
@@ -10,11 +10,6 @@ datamodule:
 
   pdb_dataset:
     _target_: "proteinworkshop.datasets.pdb_dataset.PDBData"
-    split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
-    split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
-    overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
-    split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"]
-    train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
     fraction: 1.0 # Fraction of dataset to use
     molecule_type: "protein" # Type of molecule for which to select
     experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types
@@ -28,4 +23,10 @@ datamodule:
     remove_ligands: [] # Exclude specific ligands from any available protein-ligand complexes
     remove_non_standard_residues: True # Include only proteins containing standard amino acid residues
     remove_pdb_unavailable: True # Include only proteins that are available to download
+    train_val_test: [0.8, 0.1, 0.1] # Cross-validation ratios to use for train, val, and test splits
+    split_type: "sequence_similarity" # Split sequences by sequence similarity clustering, other options are "random" and "time_cutoff"
+    split_sequence_similiarity: 0.3 # Clustering at 30% sequence similarity (argument is ignored if split_type!="sequence_similarity")
+    overwrite_sequence_clusters: False # Previous clusterings at same sequence similarity are reused and not overwritten
+    split_time_frames: null # Time-cutoffs for train, val and test set (argument is ignored if split_type!="time_cutoff") - e.g., ["2020-01-01", "2021-01-01", "2023-03-01"]
+
 
diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py
index 39a470fe..af8d7118 100644
--- a/proteinworkshop/datasets/pdb_dataset.py
+++ b/proteinworkshop/datasets/pdb_dataset.py
@@ -17,11 +17,6 @@
 class PDBData:
     def __init__(
         self,
-        split_type: Literal["sequence_similarity", "time_cutoff", "random"] = "random",
-        split_sequence_similiarity: Optional[int] = None,
-        overwrite_sequence_clusters: Optional[bool] = False,
-        split_time_frames: Optional[List[str]] = None,
-        train_val_test: List[float],
         fraction: float,
         min_length: int,
         max_length: int,
@@ -35,6 +30,12 @@ def __init__(
         remove_ligands: List[str],
         remove_non_standard_residues: bool,
         remove_pdb_unavailable: bool,
+        train_val_test: List[float],
+        split_type: Literal["sequence_similarity", "time_cutoff", "random"] = "random",
+        split_sequence_similiarity: Optional[int] = None,
+        overwrite_sequence_clusters: Optional[bool] = False,
+        split_time_frames: Optional[List[str]] = None,
+
     ):
         self.fraction = fraction
         self.molecule_type = molecule_type

From 878fa9639d30ee2b34dc5c5ae2f2ef473eddc13a Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Tue, 26 Mar 2024 22:44:14 +0000
Subject: [PATCH 6/7] add check for time interval

---
 proteinworkshop/config/dataset/pdb.yaml | 4 ++--
 proteinworkshop/datasets/pdb_dataset.py | 8 +++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/proteinworkshop/config/dataset/pdb.yaml b/proteinworkshop/config/dataset/pdb.yaml
index da481a4b..56540cc3 100644
--- a/proteinworkshop/config/dataset/pdb.yaml
+++ b/proteinworkshop/config/dataset/pdb.yaml
@@ -10,10 +10,10 @@ datamodule:
 
   pdb_dataset:
     _target_: "proteinworkshop.datasets.pdb_dataset.PDBData"
-    fraction: 1.0 # Fraction of dataset to use
+    fraction: 0.01 # Fraction of dataset to use
     molecule_type: "protein" # Type of molecule for which to select
     experiment_types: ["diffraction", "NMR", "EM", "other"] # All experiment types
-    max_length: 1000 # Exclude polypeptides greater than length 1000
+    max_length: 150 # Exclude polypeptides greater than length 150
     min_length: 10 # Exclude peptides of length 10
     oligomeric_min: 1 # Include only monomeric proteins
     oligomeric_max: 5 # Include up to 5-meric proteins 
diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py
index af8d7118..d569624d 100644
--- a/proteinworkshop/datasets/pdb_dataset.py
+++ b/proteinworkshop/datasets/pdb_dataset.py
@@ -55,7 +55,13 @@ def __init__(
         self.split_type = split_type
         self.split_sequence_similarity = split_sequence_similiarity
         self.overwrite_sequence_clusters = overwrite_sequence_clusters
-        self.split_time_frames = [np.datetime64(date) for date in split_time_frames]
+        if split_time_frames is None:
+            self.split_time_frames = split_time_frames
+        else:
+            try:
+                self.split_time_frames = [np.datetime64(date) for date in split_time_frames]
+            except:
+                raise TypeError(f"{split_time_frames} does not contain valid dates for np.datetime64 format")
         self.splits = ["train", "val", "test"]
 
     def create_dataset(self):

From 7a3875d0698200aaa071b42d5624457fd96b20f0 Mon Sep 17 00:00:00 2001
From: kierandidi <kieran.didi@gmail.com>
Date: Tue, 26 Mar 2024 23:00:23 +0000
Subject: [PATCH 7/7] changed time interval check

---
 proteinworkshop/datasets/pdb_dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/proteinworkshop/datasets/pdb_dataset.py b/proteinworkshop/datasets/pdb_dataset.py
index d569624d..0daa9d95 100644
--- a/proteinworkshop/datasets/pdb_dataset.py
+++ b/proteinworkshop/datasets/pdb_dataset.py
@@ -55,9 +55,7 @@ def __init__(
         self.split_type = split_type
         self.split_sequence_similarity = split_sequence_similiarity
         self.overwrite_sequence_clusters = overwrite_sequence_clusters
-        if split_time_frames is None:
-            self.split_time_frames = split_time_frames
-        else:
+        if self.split_type == "time_cutoff":
             try:
                 self.split_time_frames = [np.datetime64(date) for date in split_time_frames]
             except: