diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index a53b6f3..d6d42d2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -1,4 +1,4 @@ -name: build +name: Build GAUCHE and run unit tests. on: push: @@ -15,31 +15,30 @@ on: - "CONTRIBUTING.md" - "imgs/" +permissions: + contents: read + jobs: - build_cpu: - runs-on: ubuntu-18.04 + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: true matrix: - python-version: [3.7, 3.8, 3.9, 3.11] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - # See: https://github.com/marketplace/actions/setup-conda - - name: Setup anaconda - uses: s-weigand/setup-conda@v1 - with: - python-version: ${{ matrix.python-version }} - conda-channels: "conda-forge" + python-version: ["3.9", "3.10", "3.11"] - - name: Install GAUCHE - run: pip install -e . - - name: Install rnxfp - run: pip install --no-deps rxnfp - - name: Install drfp - run: pip install --no-deps drfp - - name: Install transformers - run: pip install transformers - - name: Run unit tests and generate coverage report - run: pytest . - - name: Test notebook execution - run: pytest --nbval-lax notebooks/ --current-env \ No newline at end of file + steps: + - uses: actions/checkout@v4 + - name: Set up Python. + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Set up GAUCHE and install all dependencies. + run: | + python -m pip install --upgrade pip + pip install pytest + pip install -e .[all] + - name: Run unit tests with pytest. + run: | + pytest tests/ diff --git a/.github/workflows/code-style.yaml b/.github/workflows/code-style.yaml index 4893d97..790eb2f 100644 --- a/.github/workflows/code-style.yaml +++ b/.github/workflows/code-style.yaml @@ -5,16 +5,16 @@ on: [push] jobs: black: name: "Ensure black compliance" - runs-on: "ubuntu-18.04" + runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: "3.10" - name: Install code style checking tools run: python -m pip install black @@ -22,36 +22,17 @@ jobs: - name: Run black run: black --check . - interrogate: - name: "Ensure docstring coverage" - runs-on: "ubuntu-18.04" - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Install Python - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Install code style checking tools - run: python -m pip install interrogate - - - name: Run interrogate - run: interrogate . - - isort: name: "Ensure imports are correctly sorted" - runs-on: "ubuntu-18.04" + runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: "3.10" - name: Install code style checking tools run: python -m pip install isort diff --git a/benchmarks/benchmark_models.py b/benchmarks/benchmark_models.py index 927f50c..80caea2 100644 --- a/benchmarks/benchmark_models.py +++ b/benchmarks/benchmark_models.py @@ -2,14 +2,13 @@ GP Model definitions to be used in the benchmarks """ -from gauche.kernels.fingerprint_kernels.tanimoto_kernel import ( - TanimotoKernel, -) from gpytorch.distributions import MultivariateNormal from gpytorch.kernels import LinearKernel, ScaleKernel from gpytorch.means import ConstantMean from gpytorch.models import ExactGP +from gauche.kernels.fingerprint_kernels.tanimoto_kernel import TanimotoKernel + class TanimotoGP(ExactGP): def __init__(self, train_x, train_y, likelihood): diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py index 5835a07..da67318 100644 --- a/benchmarks/run_benchmark.py +++ b/benchmarks/run_benchmark.py @@ -11,8 +11,6 @@ import torch from benchmark_models import ScalarProductGP, TanimotoGP from botorch import fit_gpytorch_model -from gauche.dataloader import MolPropLoader -from gauche.dataloader.data_utils import transform_data from gpytorch.likelihoods import GaussianLikelihood from gpytorch.mlls import ExactMarginalLogLikelihood from gpytorch_metrics import ( @@ -23,6 +21,9 @@ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from sklearn.model_selection import train_test_split +from gauche.dataloader import MolPropLoader +from gauche.dataloader.data_utils import transform_data + # Remove Graphein warnings logging.getLogger("graphein").setLevel("ERROR") diff --git a/notebooks/pretrained_kernel.py b/notebooks/pretrained_kernel.py index 1a39b05..cb0a85f 100644 --- a/notebooks/pretrained_kernel.py +++ b/notebooks/pretrained_kernel.py @@ -1,35 +1,41 @@ -import sys, time, warnings, numpy as np, torch, random -sys.path.append('..') -from matplotlib import pyplot as plt -from rdkit.Chem import MolFromSmiles -from sklearn.model_selection import train_test_split +import random +import sys +import time +import warnings +import numpy as np +import torch from botorch import fit_gpytorch_model from botorch.acquisition import ExpectedImprovement from botorch.exceptions import BadInitialCandidatesWarning from botorch.models.gp_regression import SingleTaskGP - from gpytorch.distributions import MultivariateNormal -from gpytorch.kernels import ScaleKernel, RBFKernel +from gpytorch.kernels import RBFKernel, ScaleKernel from gpytorch.likelihoods import GaussianLikelihood from gpytorch.means import ConstantMean from gpytorch.mlls import ExactMarginalLogLikelihood +from matplotlib import pyplot as plt +from rdkit.Chem import MolFromSmiles +from sklearn.model_selection import train_test_split from gauche.dataloader import MolPropLoader from gauche.kernels.gnn_kernels.pretrained_kernel import GNN, mol_to_pyg +sys.path.append("..") + + def set_seed(seed): np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) random.seed(seed) - torch.backends.cudnn.deterministic=True + torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False torch.cuda.manual_seed_all(seed) + # We define our custom GP surrogate model using the RBF kernel class GP_PretrainedKernel(SingleTaskGP): - def __init__(self, train_X, train_Y): super().__init__(train_X, train_Y, GaussianLikelihood()) self.mean_module = ConstantMean() @@ -41,8 +47,10 @@ def forward(self, x): covar_x = self.covar_module(x) return MultivariateNormal(mean_x, covar_x) + """We define helper functions for the Bayesian optimisation loop. In particular the acquisition function optimisation procedure is framed so as to take the maximum over a discrete set of heldout molecules.""" + def initialize_model(train_x, train_obj, state_dict=None): """ Initialise model and loss function. @@ -65,7 +73,9 @@ def initialize_model(train_x, train_obj, state_dict=None): return mll, model -def optimize_acqf_and_get_observation(acq_func, heldout_inputs, heldout_outputs): +def optimize_acqf_and_get_observation( + acq_func, heldout_inputs, heldout_outputs +): """ Optimizes the acquisition function, and returns a new candidate and an observation. @@ -79,7 +89,9 @@ def optimize_acqf_and_get_observation(acq_func, heldout_inputs, heldout_outputs) # Loop over the discrete set of points to evaluate the acquisition function at. acq_vals = [] for i in range(len(heldout_outputs)): - acq_vals.append(acq_func(heldout_inputs[i].unsqueeze(-2))) # use unsqueeze to append batch dimension + acq_vals.append( + acq_func(heldout_inputs[i].unsqueeze(-2)) + ) # use unsqueeze to append batch dimension # observe new values acq_vals = torch.tensor(acq_vals) @@ -88,8 +100,12 @@ def optimize_acqf_and_get_observation(acq_func, heldout_inputs, heldout_outputs) new_obj = heldout_outputs[best_idx].unsqueeze(-1) # add output dimension # Delete the selected input and value from the heldout set. - heldout_inputs = torch.cat((heldout_inputs[:best_idx], heldout_inputs[best_idx+1:]), axis=0) - heldout_outputs = torch.cat((heldout_outputs[:best_idx], heldout_outputs[best_idx+1:]), axis=0) + heldout_inputs = torch.cat( + (heldout_inputs[:best_idx], heldout_inputs[best_idx + 1 :]), axis=0 + ) + heldout_outputs = torch.cat( + (heldout_outputs[:best_idx], heldout_outputs[best_idx + 1 :]), axis=0 + ) return new_x, new_obj, heldout_inputs, heldout_outputs @@ -113,11 +129,16 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs): best_random.append(max(best_random[-1], next_random_best)) # Delete the selected input and value from the heldout set. - heldout_inputs = torch.cat((heldout_inputs[:index], heldout_inputs[index+1:]), axis=0) - heldout_outputs = torch.cat((heldout_outputs[:index], heldout_outputs[index+1:]), axis=0) + heldout_inputs = torch.cat( + (heldout_inputs[:index], heldout_inputs[index + 1 :]), axis=0 + ) + heldout_outputs = torch.cat( + (heldout_outputs[:index], heldout_outputs[index + 1 :]), axis=0 + ) return best_random, heldout_inputs, heldout_outputs + """Run the Bayesian optimisation loop, comparing the analytic (sequential) expected improvement acquisition funciton with a random policy.""" # Bayesian optimisation experiment parameters, number of random trials, split size, batch size @@ -129,27 +150,27 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs): verbose = False set_seed(12) -warnings.filterwarnings('ignore', category=BadInitialCandidatesWarning) -warnings.filterwarnings('ignore', category=RuntimeWarning) +warnings.filterwarnings("ignore", category=BadInitialCandidatesWarning) +warnings.filterwarnings("ignore", category=RuntimeWarning) best_observed_all_ei, best_random_all = [], [] # Load the Photoswitch dataset loader = MolPropLoader() -loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv") +loader.load_benchmark( + "Photoswitch", "../data/property_prediction/Photoswitch.csv" +) # We use the fragprints representations (a concatenation of Morgan fingerprints and RDKit fragment features) y = loader.labels # get PyTorch Geometric featurisation of molecules -graphs = [ - mol_to_pyg(MolFromSmiles(smiles)) for smiles in loader.features -] +graphs = [mol_to_pyg(MolFromSmiles(smiles)) for smiles in loader.features] # load pretrained model with torch.no_grad(): - model = GNN(gnn_type='gin') - model.load_pretrained('contextpred', device=torch.device("cpu")) + model = GNN(gnn_type="gin") + model.load_pretrained("contextpred", device=torch.device("cpu")) X = torch.stack( [ @@ -157,18 +178,20 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs): x=graphs[i].x, edge_index=graphs[i].edge_index, edge_attr=graphs[i].edge_attr, - ).mean(0) for i in range(len(graphs)) + ).mean(0) + for i in range(len(graphs)) ] ) # average over multiple random trials (each trial splits the initial training set for the GP in a random manner) for trial in range(1, N_TRIALS + 1): - print(f"\nTrial {trial:>2} of {N_TRIALS} ", end="") best_observed_ei, best_random = [], [] # Generate initial training data and initialize model - train_x_ei, heldout_x_ei, train_y_ei, heldout_y_ei = train_test_split(X, y, test_size=holdout_set_size, random_state=trial) + train_x_ei, heldout_x_ei, train_y_ei, heldout_y_ei = train_test_split( + X, y, test_size=holdout_set_size, random_state=trial + ) best_observed_value_ei = torch.tensor(np.max(train_y_ei)) # Convert numpy arrays to PyTorch tensors and flatten the label vectors @@ -186,27 +209,37 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs): # run N_ITERS rounds of BayesOpt after the initial random batch for iteration in range(1, N_ITERS + 1): - t0 = time.time() # fit the model fit_gpytorch_model(mll_ei) # Use analytic acquisition function for batch size of 1. - EI = ExpectedImprovement(model=model_ei, best_f=(train_y_ei.to(train_y_ei)).max()) + EI = ExpectedImprovement( + model=model_ei, best_f=(train_y_ei.to(train_y_ei)).max() + ) - new_x_ei, new_obj_ei, heldout_x_ei, heldout_y_ei = optimize_acqf_and_get_observation(EI, - heldout_x_ei, - heldout_y_ei) + ( + new_x_ei, + new_obj_ei, + heldout_x_ei, + heldout_y_ei, + ) = optimize_acqf_and_get_observation(EI, heldout_x_ei, heldout_y_ei) # update training points train_x_ei = torch.cat([train_x_ei, new_x_ei]) train_y_ei = torch.cat([train_y_ei, new_obj_ei]) # update random search progress - best_random, heldout_x_random, heldout_y_random = update_random_observations(best_random, - heldout_inputs=heldout_x_random, - heldout_outputs=heldout_y_random) + ( + best_random, + heldout_x_random, + heldout_y_random, + ) = update_random_observations( + best_random, + heldout_inputs=heldout_x_random, + heldout_outputs=heldout_y_random, + ) best_value_ei = torch.max(new_obj_ei, best_observed_ei[-1]) best_observed_ei.append(best_value_ei) @@ -224,7 +257,8 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs): print( f"\nBatch {iteration:>2}: best_value (random, qEI) = " f"({max(best_random):>4.2f}, {best_value_ei:>4.2f}), " - f"time = {t1 - t0:>4.2f}.", end="" + f"time = {t1 - t0:>4.2f}.", + end="", ) else: print(".", end="") @@ -232,10 +266,12 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs): best_observed_all_ei.append(best_observed_ei) best_random_all.append(best_random) + # Define a confience interval function for plotting. def ci(y): return 1.96 * y.std(axis=0) / np.sqrt(N_TRIALS) + iters = np.arange(N_ITERS + 1) y_ei = [] @@ -264,14 +300,14 @@ def ci(y): lower_ei = y_ei_mean - y_ei_std upper_ei = y_ei_mean + y_ei_std -plt.plot(iters, y_rnd_mean, label='Random') +plt.plot(iters, y_rnd_mean, label="Random") plt.fill_between(iters, lower_rnd, upper_rnd, alpha=0.2) -plt.plot(iters, y_ei_mean, label='EI') +plt.plot(iters, y_ei_mean, label="EI") plt.fill_between(iters, lower_ei, upper_ei, alpha=0.2) -plt.xlabel('Number of Iterations') -plt.ylabel('Best Objective Value') +plt.xlabel("Number of Iterations") +plt.ylabel("Best Objective Value") plt.legend(loc="lower right") plt.xticks(list(np.arange(1, 21))) plt.show() -"""EI outperforms random search in terms of selecting molecules with high E isomer pi-pi* transition wavelength! It should be noted that the true objective for photoswitch optimisation would consider all transition wavelengths as well as the thermal half-life and this will hopefully be included in a future notebook!""" \ No newline at end of file +"""EI outperforms random search in terms of selecting molecules with high E isomer pi-pi* transition wavelength! It should be noted that the true objective for photoswitch optimisation would consider all transition wavelengths as well as the thermal half-life and this will hopefully be included in a future notebook!""" diff --git a/setup.py b/setup.py index 05d1bca..c12d661 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,12 @@ package setup """ +import codecs import io import os import re -import codecs from typing import Dict, List + from setuptools import find_packages, setup HERE = os.path.abspath(os.path.dirname(__file__))