Skip to content

Commit

Permalink
minor linting
Browse files Browse the repository at this point in the history
  • Loading branch information
Jamasb committed Oct 23, 2023
1 parent 005db26 commit f184773
Show file tree
Hide file tree
Showing 19 changed files with 185 additions and 195 deletions.
2 changes: 1 addition & 1 deletion proteinworkshop/datasets/atom3d_datamodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def get_test_data_path(
# default to testing PPI methods with DB5
"PPI": f"PPI/splits/{ppi_split}/data/{test_phase}"
if use_dips_for_testing
else f"PPI/raw/DB5/data/",
else "PPI/raw/DB5/data/",
"RES": f"RES/splits/split-by-cath-topology/data/{test_phase}",
"MSP": f"MSP/splits/split-by-sequence-identity-30/data/{test_phase}",
}
Expand Down
109 changes: 55 additions & 54 deletions proteinworkshop/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,60 @@ def get_class_weights(self) -> torch.Tensor:


class ProteinDataset(Dataset):
"""Dataset for loading protein structures.
:param pdb_codes: List of PDB codes to load. This can also be a list
of identifiers specific to your filenames if you have
pre-downloaded structures.
:type pdb_codes: List[str]
:param root: Path to root directory, defaults to ``None``.
:type root: Optional[str], optional
:param pdb_dir: Path to directory containing raw PDB files,
defaults to ``None``.
:type pdb_dir: Optional[str], optional
:param processed_dir: Directory to store processed data, defaults to
``None``.
:type processed_dir: Optional[str], optional
:param pdb_paths: If specified, the dataset will load structures from
these paths instead of downloading them from the RCSB PDB or using
the identifiers in ``pdb_codes``. This is useful if you have already
downloaded structures and want to use them. Defaults to ``None``.
:type pdb_paths: Optional[List[str]], optional
:param chains: List of chains to load for each PDB code,
defaults to ``None``.
:type chains: Optional[List[str]], optional
:param graph_labels: List of tensors to set as graph labels for each
example. If not specified, no graph labels will be set.
Defaults to ``None``.
:type graph_labels: Optional[List[torch.Tensor]], optional
:param node_labels: List of tensors to set as node labels for each
example. If not specified, no node labels will be set.
Defaults to ``None``.
:type node_labels: Optional[List[torch.Tensor]], optional
:param transform: List of transforms to apply to each example,
defaults to ``None``.
:type transform: Optional[List[Callable]], optional
:param pre_transform: Transform to apply to each example before
processing, defaults to ``None``.
:type pre_transform: Optional[Callable], optional
:param pre_filter: Filter to apply to each example before processing,
defaults to ``None``.
:type pre_filter: Optional[Callable], optional
:param log: Whether to log. If ``True``, logs will be printed to
stdout, defaults to ``True``.
:type log: bool, optional
:param overwrite: Whether to overwrite existing files, defaults to
``False``.
:type overwrite: bool, optional
:param format: Format to save structures in, defaults to "pdb".
:type format: Literal[mmtf, pdb], optional
:param in_memory: Whether to load data into memory, defaults to False.
:type in_memory: bool, optional
:param store_het: Whether to store heteroatoms in the graph,
defaults to ``False``.
:type store_het: bool, optional
"""

def __init__(
self,
pdb_codes: List[str],
Expand All @@ -230,59 +284,6 @@ def __init__(
store_het: bool = False,
out_names: Optional[List[str]] = None,
):
"""Dataset for loading protein structures.
:param pdb_codes: List of PDB codes to load. This can also be a list
of identifiers specific to your filenames if you have
pre-downloaded structures.
:type pdb_codes: List[str]
:param root: Path to root directory, defaults to ``None``.
:type root: Optional[str], optional
:param pdb_dir: Path to directory containing raw PDB files,
defaults to ``None``.
:type pdb_dir: Optional[str], optional
:param processed_dir: Directory to store processed data, defaults to
``None``.
:type processed_dir: Optional[str], optional
:param pdb_paths: If specified, the dataset will load structures from
these paths instead of downloading them from the RCSB PDB or using
the identifiers in ``pdb_codes``. This is useful if you have already
downloaded structures and want to use them. Defaults to ``None``.
:type pdb_paths: Optional[List[str]], optional
:param chains: List of chains to load for each PDB code,
defaults to ``None``.
:type chains: Optional[List[str]], optional
:param graph_labels: List of tensors to set as graph labels for each
example. If not specified, no graph labels will be set.
Defaults to ``None``.
:type graph_labels: Optional[List[torch.Tensor]], optional
:param node_labels: List of tensors to set as node labels for each
example. If not specified, no node labels will be set.
Defaults to ``None``.
:type node_labels: Optional[List[torch.Tensor]], optional
:param transform: List of transforms to apply to each example,
defaults to ``None``.
:type transform: Optional[List[Callable]], optional
:param pre_transform: Transform to apply to each example before
processing, defaults to ``None``.
:type pre_transform: Optional[Callable], optional
:param pre_filter: Filter to apply to each example before processing,
defaults to ``None``.
:type pre_filter: Optional[Callable], optional
:param log: Whether to log. If ``True``, logs will be printed to
stdout, defaults to ``True``.
:type log: bool, optional
:param overwrite: Whether to overwrite existing files, defaults to
``False``.
:type overwrite: bool, optional
:param format: Format to save structures in, defaults to "pdb".
:type format: Literal[mmtf, pdb], optional
:param in_memory: Whether to load data into memory, defaults to False.
:type in_memory: bool, optional
:param store_het: Whether to store heteroatoms in the graph,
defaults to ``False``.
:type store_het: bool, optional
"""
self.pdb_codes = [pdb.lower() for pdb in pdb_codes]
self.pdb_dir = pdb_dir
self.pdb_paths = pdb_paths
Expand All @@ -302,7 +303,7 @@ def __init__(
for p in self.processed_file_names
):
logger.info(
f"All structures already processed and overwrite=False. Skipping download."
"All structures already processed and overwrite=False. Skipping download."
)
self._skip_download = True
else:
Expand Down
49 changes: 24 additions & 25 deletions proteinworkshop/datasets/cath.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,30 @@


class CATHDataModule(ProteinDataModule):
"""Data module for CATH dataset.
:param path: Path to store data.
:type path: str
:param batch_size: Batch size for dataloaders.
:type batch_size: int
:param format: Format to load PDB files in.
:type format: Literal["mmtf", "pdb"]
:param pdb_dir: Path to directory containing PDB files.
:type pdb_dir: str
:param pin_memory: Whether to pin memory for dataloaders.
:type pin_memory: bool
:param in_memory: Whether to load the entire dataset into memory.
:type in_memory: bool
:param num_workers: Number of workers for dataloaders.
:type num_workers: int
:param dataset_fraction: Fraction of dataset to use.
:type dataset_fraction: float
:param transforms: List of transforms to apply to dataset.
:type transforms: Optional[Iterable[Callable]]
:param overwrite: Whether to overwrite existing data.
Defaults to ``False``.
:type overwrite: bool
"""
def __init__(
self,
path: str,
Expand All @@ -27,30 +51,6 @@ def __init__(
transforms: Optional[Iterable[Callable]] = None,
overwrite: bool = False,
) -> None:
"""Data module for CATH dataset.
:param path: Path to store data.
:type path: str
:param batch_size: Batch size for dataloaders.
:type batch_size: int
:param format: Format to load PDB files in.
:type format: Literal["mmtf", "pdb"]
:param pdb_dir: Path to directory containing PDB files.
:type pdb_dir: str
:param pin_memory: Whether to pin memory for dataloaders.
:type pin_memory: bool
:param in_memory: Whether to load the entire dataset into memory.
:type in_memory: bool
:param num_workers: Number of workers for dataloaders.
:type num_workers: int
:param dataset_fraction: Fraction of dataset to use.
:type dataset_fraction: float
:param transforms: List of transforms to apply to dataset.
:type transforms: Optional[Iterable[Callable]]
:param overwrite: Whether to overwrite existing data.
Defaults to ``False``.
:type overwrite: bool
"""
super().__init__()

self.data_dir = Path(path)
Expand Down Expand Up @@ -270,7 +270,6 @@ def test_dataloader(self) -> ProteinDataLoader:
import pathlib

import hydra
import omegaconf

from proteinworkshop import constants

Expand Down
81 changes: 41 additions & 40 deletions proteinworkshop/datasets/cc_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,46 @@


class CCPDBDataModule(ProteinDataModule):
"""Data module for CCPDB datasets.
:param path: Path to store data.
:type path: str
:param pdb_dir: Path to directory containing structure files.
:type pdb_dir: str
:param name: Name of dataset to use.
:type name: CCPDB_DATASET_NAMES
:param batch_size: Batch size for dataloaders.
:type batch_size: int
:param num_workers: Number of workers for dataloaders.
:type num_workers: int
:param pin_memory: Whether to pin memory for dataloaders.
:type pin_memory: bool
:param in_memory: Whether to load dataset into memory, defaults to
``False``
:type in_memory: bool, optional
:param format: Format of the structure files, defaults to ``"mmtf"``.
:type format: Literal[mmtf, pdb], optional
:param obsolete_strategy: How to deal with obsolete PDBs,
defaults to "drop"
:type obsolete_strategy: str, optional
:param split_strategy: How to split the data,
defaults to ``"random"``
:type split_strategy: Literal["random", "stratified"], optional
:param val_fraction: Fraction of the dataset to use for validation,
defaults to ``0.1``
:type val_fraction: float, optional
:param test_fraction: Fraction of the dataset to use for testing,
defaults to ``0.1``.
:type test_fraction: float, optional
:param transforms: List of transforms to apply to each example,
defaults to ``None``.
:type transforms: Optional[List[Callable]], optional
:param overwrite: Whether to overwrite existing data, defaults to
``False``
:type overwrite: bool, optional
:raises ValueError: If train, val, and test fractions do not sum to 1.
"""

def __init__(
self,
path: str,
Expand All @@ -35,45 +75,6 @@ def __init__(
transforms: Optional[List[Callable]] = None,
overwrite: bool = False,
):
"""Data module for CCPDB datasets.
:param path: Path to store data.
:type path: str
:param pdb_dir: Path to directory containing structure files.
:type pdb_dir: str
:param name: Name of dataset to use.
:type name: CCPDB_DATASET_NAMES
:param batch_size: Batch size for dataloaders.
:type batch_size: int
:param num_workers: Number of workers for dataloaders.
:type num_workers: int
:param pin_memory: Whether to pin memory for dataloaders.
:type pin_memory: bool
:param in_memory: Whether to load dataset into memory, defaults to
``False``
:type in_memory: bool, optional
:param format: Format of the structure files, defaults to ``"mmtf"``.
:type format: Literal[mmtf, pdb], optional
:param obsolete_strategy: How to deal with obsolete PDBs,
defaults to "drop"
:type obsolete_strategy: str, optional
:param split_strategy: How to split the data,
defaults to ``"random"``
:type split_strategy: Literal["random", "stratified"], optional
:param val_fraction: Fraction of the dataset to use for validation,
defaults to ``0.1``
:type val_fraction: float, optional
:param test_fraction: Fraction of the dataset to use for testing,
defaults to ``0.1``.
:type test_fraction: float, optional
:param transforms: List of transforms to apply to each example,
defaults to ``None``.
:type transforms: Optional[List[Callable]], optional
:param overwrite: Whether to overwrite existing data, defaults to
``False``
:type overwrite: bool, optional
:raises ValueError: If train, val, and test fractions do not sum to 1.
"""
super().__init__()
self.root = pathlib.Path(path)
if not os.path.exists(self.root):
Expand Down Expand Up @@ -264,7 +265,7 @@ def test_dataloader(self) -> ProteinDataLoader:
num_workers = 4
pin_memory = True

dataset = CCPDBDataset(
dataset = CCPDBDataModule(
path, pdb_dir, name, batch_size, num_workers, pin_memory
)
dataset.download()
Expand Down
2 changes: 1 addition & 1 deletion proteinworkshop/datasets/components/res_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from proteinworkshop.datasets.components.atom3d_dataset import BaseTransform

_amino_acids = lambda x: {
_amino_acids = lambda x: { # noqa: E731
"ALA": 0,
"ARG": 1,
"ASN": 2,
Expand Down
56 changes: 28 additions & 28 deletions proteinworkshop/datasets/deep_sea_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,34 @@


class DeepSeaProteinsDataModule(ProteinDataModule):
"""Data module for Deep Sea Proteins dataset.
:param path: Path to store data.
:type path: os.PathLike
:param pdb_dir: Path to directory containing PDB files.
:type pdb_dir: os.PathLike
:param validation_fold: Name of validation fold to use.
:type validation_fold: int
:param batch_size: Batch size for dataloaders.
:type batch_size: int
:param in_memory: Whether to load the entire dataset into memory, defaults to False
:type in_memory: bool, optional
:param pin_memory: Whether to pin dataloader memory, defaults to True
:type pin_memory: bool, optional
:param num_workers: Number of dataloader workers, defaults to 16
:type num_workers: int, optional
:param obsolete_strategy: Strategy to deal with obsolete PDBs,
defaults to "drop"
:type obsolete_strategy: str, optional
:param format: Format of the structure files, defaults to "mmtf"
:type format: Literal[mmtf, pdb], optional
:param transforms: Transforms to apply, defaults to None
:type transforms: Optional[Iterable[Callable]], optional
:param overwrite: Whether to overwrite existing data, defaults to
``False``
:type overwrite: bool, optional
"""

def __init__(
self,
path: os.PathLike,
Expand All @@ -31,33 +59,6 @@ def __init__(
transforms: Optional[Iterable[Callable]] = None,
overwrite: bool = False,
):
"""Data module for Deep Sea Proteins dataset.
:param path: Path to store data.
:type path: os.PathLike
:param pdb_dir: Path to directory containing PDB files.
:type pdb_dir: os.PathLike
:param validation_fold: Name of validation fold to use.
:type validation_fold: int
:param batch_size: Batch size for dataloaders.
:type batch_size: int
:param in_memory: Whether to load the entire dataset into memory, defaults to False
:type in_memory: bool, optional
:param pin_memory: Whether to pin dataloader memory, defaults to True
:type pin_memory: bool, optional
:param num_workers: Number of dataloader workers, defaults to 16
:type num_workers: int, optional
:param obsolete_strategy: Strategy to deal with obsolete PDBs,
defaults to "drop"
:type obsolete_strategy: str, optional
:param format: Format of the structure files, defaults to "mmtf"
:type format: Literal[mmtf, pdb], optional
:param transforms: Transforms to apply, defaults to None
:type transforms: Optional[Iterable[Callable]], optional
:param overwrite: Whether to overwrite existing data, defaults to
``False``
:type overwrite: bool, optional
"""
super().__init__()
self.data_dir = pathlib.Path(path)
if not os.path.exists(self.data_dir):
Expand Down Expand Up @@ -256,7 +257,6 @@ def test_dataloader(self) -> ProteinDataLoader:

if __name__ == "__main__":
import hydra
import omegaconf

from proteinworkshop import constants

Expand Down
Loading

0 comments on commit f184773

Please sign in to comment.