Enable molformer training (#206)
* Add molformer's trainer skeleton

* Clean up parameters

* Enable GPU inference

* Add molformer to training pipelines

* Proper handling of list parameters

* Inherit from pl training pipeline

* Rename arguments

* Copy measure_name from dataset to model args

* Fix alignments

* Add missing parameters

* Add aug in dataset args

* Add pretrained path for regression

* Fix parameters

* Apply style

* Fix parameter types

* Update parameter's metadata

* Add missing parameters

* Add examples for molformer

* Update parameter's metadata
christofid authored Mar 17, 2023
1 parent 95cee8d commit be5642f
Showing 6 changed files with 606 additions and 3 deletions.
117 changes: 117 additions & 0 deletions examples/molformer/README.md
@@ -0,0 +1,117 @@
# Molformer

A simple example of how to train or finetune the Molformer model.

Make sure to activate the conda environment:

```console
conda activate gt4sd
```

## Pretraining

An example of Molformer pretraining. The `data_path` parameter points to the directory that contains one or both of the `pubchem` and `ZINC` dataset directories. A link to the dataset and further details about it can be found in the [original molformer repo](https://github.com/IBM/molformer).

```console
gt4sd-trainer --training_pipeline_name molformer \
    --type pretraining \
    --batch_size 1200 \
    --n_head 12 \
    --n_layer 12 \
    --n_embd 768 \
    --d_dropout 0.1 \
    --lr_start 3e-5 \
    --num_workers 8 \
    --max_epochs 4 \
    --num_feats 32 \
    --grad_acc 1 \
    --data_path molformer/data/pretrained \
    --model_arch BERT_both_rotate
```
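The expected layout under `data_path` would then look like the following (the directory names come from the parameter description above; the files inside follow the original molformer repo):

```console
molformer/data/pretrained/
├── pubchem/   # PubChem dataset files
└── ZINC/      # ZINC dataset files
```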

## Finetuning

### Classification

An example of classification finetuning using the HIV dataset. A link to the dataset can be found in the [original molformer repo](https://github.com/IBM/molformer).

```console
gt4sd-trainer --training_pipeline_name molformer \
    --type classification \
    --batch_size 128 \
    --n_head 12 \
    --n_layer 12 \
    --n_embd 768 \
    --d_dropout 0.1 \
    --dropout 0.1 \
    --lr_start 3e-5 \
    --num_workers 8 \
    --max_epochs 500 \
    --num_feats 32 \
    --every_n_epochs 10 \
    --data_root molformer/data/hiv \
    --pretrained_path pretrained_molformer/checkpoints/N-Step-Checkpoint_3_30000.ckpt \
    --dataset_name hiv \
    --measure_name HIV_active \
    --dims 768 768 768 1 \
    --num_classes 2 \
    --save_dir test_hiv
```

### Multiclass classification

An example of multiclass classification finetuning using the ClinTox dataset. A link to the dataset can be found in the [original molformer repo](https://github.com/IBM/molformer).

```console
gt4sd-trainer --training_pipeline_name molformer \
    --type multitask_classification \
    --batch_size 128 \
    --n_head 12 \
    --n_layer 12 \
    --n_embd 768 \
    --d_dropout 0.1 \
    --dropout 0.1 \
    --lr_start 3e-5 \
    --num_workers 8 \
    --max_epochs 500 \
    --num_feats 32 \
    --every_n_epochs 10 \
    --data_root molformer/data/clintox \
    --pretrained_path pretrained_molformer/checkpoints/N-Step-Checkpoint_3_30000.ckpt \
    --dataset_name clintox \
    --dims 768 768 768 1 \
    --measure_names FDA_APPROVED CT_TOX \
    --save_dir test_clintox
```

### Regression

An example of regression finetuning using the QM9 dataset. A link to the dataset can be found in the [original molformer repo](https://github.com/IBM/molformer).

```console
gt4sd-trainer --training_pipeline_name molformer \
    --type regression \
    --batch_size 128 \
    --n_head 12 \
    --n_layer 12 \
    --n_embd 768 \
    --d_dropout 0.1 \
    --dropout 0.1 \
    --lr_start 3e-5 \
    --num_workers 8 \
    --max_epochs 500 \
    --num_feats 32 \
    --every_n_epochs 10 \
    --data_root molformer/data/qm9 \
    --pretrained_path pretrained_molformer/checkpoints/N-Step-Checkpoint_3_30000.ckpt \
    --dataset_name qm9 \
    --measure_name alpha \
    --dims 768 768 768 1 \
    --save_dir test_alpha
```


2 changes: 1 addition & 1 deletion src/gt4sd/cli/argument_parser.py
@@ -180,7 +180,7 @@ def _add_dataclass_arguments(self, dtype: DataClassType) -> None:
             kwargs["nargs"] = "+"
             kwargs["type"] = partial(none_checker, dtype=field.type.__args__[0])
             assert all(
-                x == kwargs["type"] for x in field.type.__args__
+                x == kwargs["type"].keywords["dtype"] for x in field.type.__args__
             ), f"{field.name} cannot be a List of mixed types: {field.type.__args__}"
             if field.default_factory is not dataclasses.MISSING:  # type: ignore
                 kwargs["default"] = field.default_factory()  # type: ignore
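The fix compares each element type against the `dtype` captured inside the `functools.partial`, because the partial object itself never compares equal to a raw type. A minimal sketch of the corrected check (the body of `none_checker` here is an assumption; only its role as the argparse `type` callable is taken from the diff):

```python
from functools import partial

def none_checker(value, dtype):
    # Illustrative stand-in for gt4sd's none_checker: map "none"/"null"
    # strings to None, otherwise cast the CLI string to the element type.
    if isinstance(value, str) and value.lower() in {"none", "null"}:
        return None
    return dtype(value)

# For a List[int] dataclass field, field.type.__args__ is (int,) and the
# argparse "type" callable is a partial, not the raw element type.
element_types = (int,)
type_callable = partial(none_checker, dtype=element_types[0])

# Comparing the partial itself to int always fails...
assert not all(x == type_callable for x in element_types)
# ...while the fixed check compares the dtype bound inside the partial.
assert all(x == type_callable.keywords["dtype"] for x in element_types)
```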
5 changes: 3 additions & 2 deletions src/gt4sd/properties/molecules/core.py
@@ -358,6 +358,7 @@ def informative_model(samples: Union[str, List[str]]) -> List[float]:
             preds = []
             for batch in datamodule.test_dataloader():
                 with torch.no_grad():
+                    batch = [x.to(self.device) for x in batch]
                     batch_output = model.testing_step(batch, 0, 0)

                 preds_cpu = batch_output["pred"][:, 1]
@@ -415,10 +416,9 @@ def informative_model(samples: Union[str, List[str]]) -> List[str]:
             for batch in datamodule.test_dataloader():

                 with torch.no_grad():
+                    batch = [x.to(self.device) for x in batch]
                     batch_output = model.testing_step(batch, 0, 0)

-                print(batch_output["pred"])
-
                 batch_preds_idx = torch.argmax(batch_output["pred"], dim=1)
                 batch_preds = [config["measure_names"][i] for i in batch_preds_idx]
                 preds += batch_preds
@@ -473,6 +473,7 @@ def informative_model(samples: Union[str, List[str]]) -> List[float]:
             preds = []
             for batch in datamodule.test_dataloader():
                 with torch.no_grad():
+                    batch = [x.to(self.device) for x in batch]
                     batch_output = model.testing_step(batch, 0, 0)

                 preds += batch_output["pred"].view(-1).tolist()
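Each of the three inference loops gains the same one-liner: every tensor in the batch is moved onto the model's device before `testing_step`, so GPU inference works when inputs arrive on CPU. A torch-free sketch of the pattern (the `FakeTensor` class is illustrative, not part of gt4sd or torch):

```python
class FakeTensor:
    """Minimal stand-in for torch.Tensor that just tracks a device tag."""
    def __init__(self, data, device="cpu"):
        self.data = data
        self.device = device

    def to(self, device):
        # Like torch's Tensor.to, return a tensor on the target device.
        return FakeTensor(self.data, device)

device = "cuda:0"  # the device the model was moved to
batch = [FakeTensor([1.0, 2.0]), FakeTensor([3.0, 4.0])]

# The line added in each loop: align the inputs with the model's device.
batch = [x.to(device) for x in batch]

assert all(x.device == "cuda:0" for x in batch)
```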
14 changes: 14 additions & 0 deletions src/gt4sd/training_pipelines/__init__.py
@@ -93,6 +93,13 @@
     LanguageModelingSavingArguments,
     LanguageModelingTrainingPipeline,
 )
+from .pytorch_lightning.molformer.core import (
+    MolformerDataArguments,
+    MolformerModelArguments,
+    MolformerSavingArguments,
+    MolformerTrainingArguments,
+    MolformerTrainingPipeline,
+)
 from .regression_transformer.core import (
     RegressionTransformerDataArguments,
     RegressionTransformerSavingArguments,
@@ -190,6 +197,11 @@
         CrystalsRFCModelArguments,
         CrystalsRFCTrainingArguments,
     ),
+    "molformer": (
+        MolformerDataArguments,
+        MolformerModelArguments,
+        MolformerTrainingArguments,
+    ),
 }

 TRAINING_PIPELINE_MAPPING = {
@@ -206,6 +218,7 @@
     "gflownet-trainer": GFlowNetTrainingPipeline,
     "cgcnn": CGCNNTrainingPipeline,
     "crystals-rfc": CrystalsRFCTrainingPipeline,
+    "molformer": MolformerTrainingPipeline,
 }

TRAINING_PIPELINE_ARGUMENTS_FOR_MODEL_SAVING = {
@@ -222,6 +235,7 @@
     "gflownet-trainer": GFlowNetSavingArguments,
     "cgcnn": CGCNNSavingArguments,
     "crystals-rfc": CrystalsRFCSavingArguments,
+    "molformer": MolformerSavingArguments,
 }
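These entries plug Molformer into the same name-keyed registries the CLI already uses, so `--training_pipeline_name molformer` resolves to the new pipeline. A sketch of how such a registry dispatches by name (the `get_trainer` helper and the pipeline body are hypothetical; only the mapping pattern is taken from the diff):

```python
class MolformerTrainingPipeline:
    """Illustrative stand-in for the real pipeline class."""
    def train(self, **kwargs):
        return f"training molformer with {sorted(kwargs)}"

TRAINING_PIPELINE_MAPPING = {
    "molformer": MolformerTrainingPipeline,
}

def get_trainer(name):
    # Hypothetical dispatch helper: resolve the pipeline class by name.
    if name not in TRAINING_PIPELINE_MAPPING:
        raise ValueError(f"unknown training pipeline: {name}")
    return TRAINING_PIPELINE_MAPPING[name]()

trainer = get_trainer("molformer")
trainer.train(batch_size=1200, n_layer=12)
```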


@@ -0,0 +1,24 @@
#
# MIT License
#
# Copyright (c) 2023 GT4SD team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
"""Molformer training pipeline initialization."""
