minor linting

a-r-j · Oct 23, 2023 · f184773 · f184773
1 parent 005db26
commit f184773
Show file tree

Hide file tree

Showing 19 changed files with 185 additions and 195 deletions.
diff --git a/proteinworkshop/datasets/atom3d_datamodule.py b/proteinworkshop/datasets/atom3d_datamodule.py
@@ -71,7 +71,7 @@ def get_test_data_path(
         # default to testing PPI methods with DB5
         "PPI": f"PPI/splits/{ppi_split}/data/{test_phase}"
         if use_dips_for_testing
-        else f"PPI/raw/DB5/data/",
+        else "PPI/raw/DB5/data/",
         "RES": f"RES/splits/split-by-cath-topology/data/{test_phase}",
         "MSP": f"MSP/splits/split-by-sequence-identity-30/data/{test_phase}",
     }

diff --git a/proteinworkshop/datasets/base.py b/proteinworkshop/datasets/base.py
@@ -210,6 +210,60 @@ def get_class_weights(self) -> torch.Tensor:
 
 
 class ProteinDataset(Dataset):
+    """Dataset for loading protein structures.
+
+    :param pdb_codes: List of PDB codes to load. This can also be a list
+        of identifiers specific to your filenames if you have
+        pre-downloaded structures.
+    :type pdb_codes: List[str]
+    :param root: Path to root directory, defaults to ``None``.
+    :type root: Optional[str], optional
+    :param pdb_dir: Path to directory containing raw PDB files,
+        defaults to ``None``.
+    :type pdb_dir: Optional[str], optional
+    :param processed_dir: Directory to store processed data, defaults to
+        ``None``.
+    :type processed_dir: Optional[str], optional
+    :param pdb_paths: If specified, the dataset will load structures from
+        these paths instead of downloading them from the RCSB PDB or using
+        the identifiers in ``pdb_codes``. This is useful if you have already
+        downloaded structures and want to use them. Defaults to ``None``.
+    :type pdb_paths: Optional[List[str]], optional
+    :param chains: List of chains to load for each PDB code,
+        defaults to ``None``.
+    :type chains: Optional[List[str]], optional
+    :param graph_labels: List of tensors to set as graph labels for each
+        example. If not specified, no graph labels will be set.
+        Defaults to ``None``.
+    :type graph_labels: Optional[List[torch.Tensor]], optional
+    :param node_labels: List of tensors to set as node labels for each
+        example. If not specified, no node labels will be set.
+        Defaults to ``None``.
+    :type node_labels: Optional[List[torch.Tensor]], optional
+    :param transform: List of transforms to apply to each example,
+        defaults to ``None``.
+    :type transform: Optional[List[Callable]], optional
+    :param pre_transform: Transform to apply to each example before
+        processing, defaults to ``None``.
+    :type pre_transform: Optional[Callable], optional
+    :param pre_filter: Filter to apply to each example before processing,
+        defaults to ``None``.
+    :type pre_filter: Optional[Callable], optional
+    :param log: Whether to log. If ``True``, logs will be printed to
+        stdout, defaults to ``True``.
+    :type log: bool, optional
+    :param overwrite: Whether to overwrite existing files, defaults to
+        ``False``.
+    :type overwrite: bool, optional
+    :param format: Format to save structures in, defaults to "pdb".
+    :type format: Literal["mmtf", "pdb"], optional
+    :param in_memory: Whether to load data into memory, defaults to False.
+    :type in_memory: bool, optional
+    :param store_het: Whether to store heteroatoms in the graph,
+        defaults to ``False``.
+    :type store_het: bool, optional
+    """
+
     def __init__(
         self,
         pdb_codes: List[str],
@@ -230,59 +284,6 @@ def __init__(
         store_het: bool = False,
         out_names: Optional[List[str]] = None,
     ):
-        """Dataset for loading protein structures.
-
-        :param pdb_codes: List of PDB codes to load. This can also be a list
-            of identifiers to specific to your filenames if you have
-            pre-downloaded structures.
-        :type pdb_codes: List[str]
-        :param root: Path to root directory, defaults to ``None``.
-        :type root: Optional[str], optional
-        :param pdb_dir: Path to directory containing raw PDB files,
-            defaults to ``None``.
-        :type pdb_dir: Optional[str], optional
-        :param processed_dir: Directory to store processed data, defaults to
-            ``None``.
-        :type processed_dir: Optional[str], optional
-        :param pdb_paths: If specified, the dataset will load structures from
-            these paths instead of downloading them from the RCSB PDB or using
-            the identifies in ``pdb_codes``. This is useful if you have already
-            downloaded structures and want to use them. defaults to ``None``
-        :type pdb_paths: Optional[List[str]], optional
-        :param chains: List of chains to load for each PDB code,
-            defaults to ``None``.
-        :type chains: Optional[List[str]], optional
-        :param graph_labels: List of tensors to set as graph labels for each
-            examples. If not specified, no graph labels will be set.
-            defaults to ``None``.
-        :type graph_labels: Optional[List[torch.Tensor]], optional
-        :param node_labels: List of tensors to set as node labels for each
-            examples. If not specified, no node labels will be set.
-            defaults to ``None``.
-        :type node_labels: Optional[List[torch.Tensor]], optional
-        :param transform: List of transforms to apply to each example,
-            defaults to ``None``.
-        :type transform: Optional[List[Callable]], optional
-        :param pre_transform: Transform to apply to each example before
-            processing, defaults to ``None``.
-        :type pre_transform: Optional[Callable], optional
-        :param pre_filter: Filter to apply to each example before processing,
-            defaults to ``None``.
-        :type pre_filter: Optional[Callable], optional
-        :param log: Whether to log. If ``True``, logs will be printed to
-            stdout, defaults to ``True``.
-        :type log: bool, optional
-        :param overwrite: Whether to overwrite existing files, defaults to
-            ``False``.
-        :type overwrite: bool, optional
-        :param format: Format to save structures in, defaults to "pdb".
-        :type format: Literal[mmtf, pdb, optional
-        :param in_memory: Whether to load data into memory, defaults to False.
-        :type in_memory: bool, optional
-        :param store_het: Whether to store heteroatoms in the graph,
-            defaults to ``False``.
-        :type store_het: bool, optional
-        """
         self.pdb_codes = [pdb.lower() for pdb in pdb_codes]
         self.pdb_dir = pdb_dir
         self.pdb_paths = pdb_paths
@@ -302,7 +303,7 @@ def __init__(
             for p in self.processed_file_names
         ):
             logger.info(
-                f"All structures already processed and overwrite=False. Skipping download."
+                "All structures already processed and overwrite=False. Skipping download."
             )
             self._skip_download = True
         else:

diff --git a/proteinworkshop/datasets/cath.py b/proteinworkshop/datasets/cath.py
@@ -14,6 +14,30 @@
 
 
 class CATHDataModule(ProteinDataModule):
+    """Data module for CATH dataset.
+
+    :param path: Path to store data.
+    :type path: str
+    :param batch_size: Batch size for dataloaders.
+    :type batch_size: int
+    :param format: Format to load PDB files in.
+    :type format: Literal["mmtf", "pdb"]
+    :param pdb_dir: Path to directory containing PDB files.
+    :type pdb_dir: str
+    :param pin_memory: Whether to pin memory for dataloaders.
+    :type pin_memory: bool
+    :param in_memory: Whether to load the entire dataset into memory.
+    :type in_memory: bool
+    :param num_workers: Number of workers for dataloaders.
+    :type num_workers: int
+    :param dataset_fraction: Fraction of dataset to use.
+    :type dataset_fraction: float
+    :param transforms: List of transforms to apply to dataset.
+    :type transforms: Optional[List[Callable]]
+    :param overwrite: Whether to overwrite existing data.
+        Defaults to ``False``.
+    :type overwrite: bool
+    """
     def __init__(
         self,
         path: str,
@@ -27,30 +51,6 @@ def __init__(
         transforms: Optional[Iterable[Callable]] = None,
         overwrite: bool = False,
     ) -> None:
-        """Data module for CATH dataset.
-
-        :param path: Path to store data.
-        :type path: str
-        :param batch_size: Batch size for dataloaders.
-        :type batch_size: int
-        :param format: Format to load PDB files in.
-        :type format: Literal["mmtf", "pdb"]
-        :param pdb_dir: Path to directory containing PDB files.
-        :type pdb_dir: str
-        :param pin_memory: Whether to pin memory for dataloaders.
-        :type pin_memory: bool
-        :param in_memory: Whether to load the entire dataset into memory.
-        :type in_memory: bool
-        :param num_workers: Number of workers for dataloaders.
-        :type num_workers: int
-        :param dataset_fraction: Fraction of dataset to use.
-        :type dataset_fraction: float
-        :param transforms: List of transforms to apply to dataset.
-        :type transforms: Optional[List[Callable]]
-        :param overwrite: Whether to overwrite existing data.
-            Defaults to ``False``.
-        :type overwrite: bool
-        """
         super().__init__()
 
         self.data_dir = Path(path)
@@ -270,7 +270,6 @@ def test_dataloader(self) -> ProteinDataLoader:
     import pathlib
 
     import hydra
-    import omegaconf
 
     from proteinworkshop import constants
 

diff --git a/proteinworkshop/datasets/cc_pdb.py b/proteinworkshop/datasets/cc_pdb.py
@@ -17,6 +17,46 @@
 
 
 class CCPDBDataModule(ProteinDataModule):
+    """Data module for CCPDB datasets.
+
+    :param path: Path to store data.
+    :type path: str
+    :param pdb_dir: Path to directory containing structure files.
+    :type pdb_dir: str
+    :param name: Name of dataset to use.
+    :type name: CCPDB_DATASET_NAMES
+    :param batch_size: Batch size for dataloaders.
+    :type batch_size: int
+    :param num_workers: Number of workers for dataloaders.
+    :type num_workers: int
+    :param pin_memory: Whether to pin memory for dataloaders.
+    :type pin_memory: bool
+    :param in_memory: Whether to load dataset into memory, defaults to
+        ``False``
+    :type in_memory: bool, optional
+    :param format: Format of the structure files, defaults to ``"mmtf"``.
+    :type format: Literal[mmtf, pdb], optional
+    :param obsolete_strategy: How to deal with obsolete PDBs,
+        defaults to "drop"
+    :type obsolete_strategy: str, optional
+    :param split_strategy: How to split the data,
+        defaults to ``"random"``
+    :type split_strategy: Literal["random", "stratified"], optional
+    :param val_fraction: Fraction of the dataset to use for validation,
+        defaults to ``0.1``
+    :type val_fraction: float, optional
+    :param test_fraction: Fraction of the dataset to use for testing,
+        defaults to ``0.1``.
+    :type test_fraction: float, optional
+    :param transforms: List of transforms to apply to each example,
+        defaults to ``None``.
+    :type transforms: Optional[List[Callable]], optional
+    :param overwrite: Whether to overwrite existing data, defaults to
+        ``False``
+    :type overwrite: bool, optional
+    :raises ValueError: If train, val, and test fractions do not sum to 1.
+    """
+
     def __init__(
         self,
         path: str,
@@ -35,45 +75,6 @@ def __init__(
         transforms: Optional[List[Callable]] = None,
         overwrite: bool = False,
     ):
-        """Data module for CCPDB datasets.
-
-        :param path: Path to store data.
-        :type path: str
-        :param pdb_dir: Path to directory containing structure files.
-        :type pdb_dir: str
-        :param name: Name of dataset to use.
-        :type name: CCPDB_DATASET_NAMES
-        :param batch_size: Batch size for dataloaders.
-        :type batch_size: int
-        :param num_workers: Number of workers for dataloaders.
-        :type num_workers: int
-        :param pin_memory: Whether to pin memory for dataloaders.
-        :type pin_memory: bool
-        :param in_memory: Whether to load dataset into memory, defaults to
-            ``False``
-        :type in_memory: bool, optional
-        :param format: Format of the structure files, defaults to ``"mmtf"``.
-        :type format: Literal[mmtf, pdb], optional
-        :param obsolete_strategy: How to deal with obsolete PDBs,
-            defaults to "drop"
-        :type obsolete_strategy: str, optional
-        :param split_strategy: How to split the data,
-            defaults to ``"random"``
-        :type split_strategy: Literal["random", 'stratified"], optional
-        :param val_fraction: Fraction of the dataset to use for validation,
-            defaults to ``0.1``
-        :type val_fraction: float, optional
-        :param test_fraction: Fraction of the dataset to use for testing,
-            defaults to ``0.1``.
-        :type test_fraction: float, optional
-        :param transforms: List of transforms to apply to each example,
-            defaults to ``None``.
-        :type transforms: Optional[List[Callable]], optional
-        :param overwrite: Whether to overwrite existing data, defaults to
-            ``False``
-        :type overwrite: bool, optional
-        :raises ValueError: If train, val, and test fractions do not sum to 1.
-        """
         super().__init__()
         self.root = pathlib.Path(path)
         if not os.path.exists(self.root):
@@ -264,7 +265,7 @@ def test_dataloader(self) -> ProteinDataLoader:
     num_workers = 4
     pin_memory = True
 
-    dataset = CCPDBDataset(
+    dataset = CCPDBDataModule(
         path, pdb_dir, name, batch_size, num_workers, pin_memory
     )
     dataset.download()

diff --git a/proteinworkshop/datasets/components/res_dataset.py b/proteinworkshop/datasets/components/res_dataset.py
@@ -8,7 +8,7 @@
 
 from proteinworkshop.datasets.components.atom3d_dataset import BaseTransform
 
-_amino_acids = lambda x: {
+_amino_acids = lambda x: {  # noqa: E731
     "ALA": 0,
     "ARG": 1,
     "ASN": 2,

diff --git a/proteinworkshop/datasets/deep_sea_proteins.py b/proteinworkshop/datasets/deep_sea_proteins.py
@@ -17,6 +17,34 @@
 
 
 class DeepSeaProteinsDataModule(ProteinDataModule):
+    """Data module for Deep Sea Proteins dataset.
+
+    :param path: Path to store data.
+    :type path: os.PathLike
+    :param pdb_dir: Path to directory containing PDB files.
+    :type pdb_dir: os.PathLike
+    :param validation_fold: Name of validation fold to use.
+    :type validation_fold: int
+    :param batch_size: Batch size for dataloaders.
+    :type batch_size: int
+    :param in_memory: Whether to load the entire dataset into memory, defaults to False
+    :type in_memory: bool, optional
+    :param pin_memory: Whether to pin dataloader memory, defaults to True
+    :type pin_memory: bool, optional
+    :param num_workers: Number of dataloader workers, defaults to 16
+    :type num_workers: int, optional
+    :param obsolete_strategy: Strategy to deal with obsolete PDBs,
+        defaults to "drop"
+    :type obsolete_strategy: str, optional
+    :param format: Format of the structure files, defaults to "mmtf"
+    :type format: Literal[mmtf, pdb], optional
+    :param transforms: Transforms to apply, defaults to None
+    :type transforms: Optional[Iterable[Callable]], optional
+    :param overwrite: Whether to overwrite existing data, defaults to
+        ``False``
+    :type overwrite: bool, optional
+    """
+
     def __init__(
         self,
         path: os.PathLike,
@@ -31,33 +59,6 @@ def __init__(
         transforms: Optional[Iterable[Callable]] = None,
         overwrite: bool = False,
     ):
-        """Data module for Deep Sea Proteins dataset.
-
-        :param path: Path to store data.
-        :type path: os.PathLike
-        :param pdb_dir: Path to directory containing PDB files.
-        :type pdb_dir: os.PathLike
-        :param validation_fold: Name of validation fold to use.
-        :type validation_fold: int
-        :param batch_size: Batch size for dataloaders.
-        :type batch_size: int
-        :param in_memory: Whether to load the entire dataset into memory, defaults to False
-        :type in_memory: bool, optional
-        :param pin_memory: Whether to pin dataloader memory, defaults to True
-        :type pin_memory: bool, optional
-        :param num_workers: Number of dataloader workers, defaults to 16
-        :type num_workers: int, optional
-        :param obsolete_strategy: Strategy to deal with obsolete PDbs,
-            defaults to "drop"
-        :type obsolete_strategy: str, optional
-        :param format: Format of the structure files, defaults to "mmtf"
-        :type format: Literal[mmtf, pdb], optional
-        :param transforms: Transforms to apply, defaults to None
-        :type transforms: Optional[Iterable[Callable]], optional
-        :param overwrite: Whether to overwrite existing data, defaults to
-            ``False``
-        :type overwrite: bool, optional
-        """
         super().__init__()
         self.data_dir = pathlib.Path(path)
         if not os.path.exists(self.data_dir):
@@ -256,7 +257,6 @@ def test_dataloader(self) -> ProteinDataLoader:
 
 if __name__ == "__main__":
     import hydra
-    import omegaconf
 
     from proteinworkshop import constants