diff --git a/.github/workflows/build_documentation.yaml b/.github/workflows/build_documentation.yaml index ccd6ae8..13cf6bb 100644 --- a/.github/workflows/build_documentation.yaml +++ b/.github/workflows/build_documentation.yaml @@ -11,12 +11,20 @@ jobs: steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v3 + with: + python-version: "3.11" + + - name: Install Pandoc + run: | + sudo apt-get update + sudo apt-get install -y pandoc + - name: Install dependencies run: | - pip install sphinx furo myst_parser + pip install .[all,docs] - name: Sphinx build run: | - sphinx-build docs _build + sphinx-build docs/source _build - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} diff --git a/.requirements/docs.in b/.requirements/docs.in index d6bd3be..7368fe6 100644 --- a/.requirements/docs.in +++ b/.requirements/docs.in @@ -1,2 +1,15 @@ +furo +nbsphinx +nbsphinx-link +sphinx-copybutton +m2r2 +nbstripout +pandoc +pydocstyle sphinx -furo \ No newline at end of file +sphinx-inline-tabs +sphinxext-opengraph +sphinxcontrib-gtagjs +ipython +watermark +sphinx_codeautolink \ No newline at end of file diff --git a/README.md b/README.md index 2053b4d..bcc66f0 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ The easiest way to get started with GAUCHE is to check out our tutorial notebook | [GP Regression on Molecules](https://leojklarner.github.io/gauche/notebooks/gp_regression_on_molecules.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/GP%20Regression%20on%20Molecules.ipynb) | | [Bayesian Optimisation Over Molecules](https://leojklarner.github.io/gauche/notebooks/bayesian_optimisation_over_molecules.html) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Bayesian%20Optimisation%20Over%20Molecules.ipynb) | | [Multioutput Gaussian Processes for Multitask Learning](https://leojklarner.github.io/gauche/notebooks/multitask_gp_regression_on_molecules.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Multitask%20GP%20Regression%20on%20Molecules.ipynb) | -| [Training GPs on Graphs](https://leojklarner.github.io/gauche/notebooks/Training%20GPs%20on%20Graphs.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Training%20GPs%20on%20Graphs.ipynb) | +| [Training GPs on Graphs](https://leojklarner.github.io/gauche/notebooks/training_gps_on_graphs.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Training%20GPs%20on%20Graphs.ipynb) | | [Sparse GP Regression for Big Molecular Data](https://leojklarner.github.io/gauche/notebooks/sparse_gp_regression_for_big_molecular_data.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Sparse%20GP%20Regression%20for%20Big%20Molecular%20Data.ipynb) | |[Molecular Preference Learning](https://github.com/leojklarner/gauche/blob/main/notebooks/Molecular%20Preference%20Learning.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Molecular%20Preference%20Learning.ipynb) | |[Preferential Bayesian 
Optimisation](https://github.com/leojklarner/gauche/blob/main/notebooks/Preferential%20Bayesian%20Optimisation.ipynb)|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/leojklarner/gauche/blob/main/notebooks/Preferential%20Bayesian%20Optimisation.ipynb) | diff --git a/docs/source/conf.py b/docs/source/conf.py index 67ee1c9..8127d37 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,10 +10,12 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath("../../../gauche")) +import os +import sys + +print(sys.executable) +sys.path.insert(0, os.path.abspath(".")) # -- Project information ----------------------------------------------------- @@ -23,7 +25,7 @@ author = "Ryan Rhys-Griffiths" # The full version, including alpha/beta/rc tags -release = "0.1.0" +# release = "1.0.0" # -- General configuration --------------------------------------------------- @@ -33,15 +35,19 @@ # ones. extensions = [ "sphinx.ext.autodoc", + "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.viewcode", - # "sphinx_copybutton", - # "sphinx_inline_tabs", - # "sphinxcontrib.gtagjs", - # "sphinxext.opengraph", - # "m2r2", - # "nbsphinx", - # "nbsphinx_link", + "sphinx_copybutton", + "sphinx_inline_tabs", + "sphinxcontrib.gtagjs", + "sphinxext.opengraph", + "m2r2", + "nbsphinx", + "nbsphinx_link", + "sphinx.ext.napoleon", + "sphinx_codeautolink", + # "sphinx_autorun", ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/index.rst b/docs/source/index.rst index 973e1c6..e256e54 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,16 +3,63 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. 
-Welcome to GAUCHE's documentation! -================================== +Documentation +================== + +.. image:: ../../imgs/gauche_banner_1.png + :width: 100% + :alt: GAUCHE Logo + :align: left + +**GAUCHE** is a collaborative, open-source software library that aims to make state-of-the-art probabilistic modelling and black-box optimisation techniques more easily accessible to scientific experts in chemistry, materials science and beyond. We provide 30+ bespoke kernels for molecules, chemical reactions and proteins and illustrate how they can be used for Gaussian processes and Bayesian optimisation in 10+ easy-to-adapt tutorial notebooks. + +`Paper (NeurIPS 2023) `_ + +Overview +========== + +General-purpose Gaussian process (GP) and Bayesian optimisation (BO) libraries do not cater for molecular representations. Likewise, general-purpose molecular machine learning libraries do not consider GPs and BO. To bridge this gap, GAUCHE provides a modular, robust and easy-to-use framework of 30+ parallelisable and batch-GP-compatible implementations of string, fingerprint and graph kernels that operate on a range of widely-used molecular representations. + +.. image:: ../../imgs/gauche_overview.png + :width: 100 % + :alt: GAUCHE Overview + :align: left + +Kernels +--------- -.. include:: readme.rst +Standard GP packages typically assume continuous input spaces of low and fixed dimensionality. This makes it difficult to apply them to common molecular representations: molecular graphs are discrete objects, SMILES strings vary in length and topological fingerprints tend to be high-dimensional and sparse. To bridge this gap, GAUCHE provides: + +* **Fingerprint Kernels** that measure the similarity between bit/count vectors of descriptors by examining the degree to which their elements overlap. +* **String Kernels** that measure the similarity between strings by examining the degree to which their sub-strings overlap. 
+* **Graph Kernels** that measure the similarity between graphs by examining the degree to which certain substructural motifs overlap. + +Representations +----------------- + +GAUCHE supports any representation that is based on bit/count vectors, strings or graphs. For rapid prototyping and benchmarking, we also provide a range of standard featurisation techniques for molecules, chemical reactions and proteins: + +.. list-table:: + :header-rows: 1 + + * - Domain + - Representation + * - Molecules + - ECFP Fingerprints [1], rdkit Fragments, Fragprints, Graphs [2], SMILES [3], SELFIES [4] + * - Chemical Reactions + - One-Hot Encoding, Data-Driven Reaction Fingerprints [6], Differential Reaction Fingerprints [5], Reaction SMARTS + * - Proteins + - Sequences, Graphs [2] + +Getting Started +----------------- + +The easiest way to get started with GAUCHE is to check out our tutorial notebooks: .. toctree:: - :maxdepth: 2 - :caption: Tutorials + :maxdepth: 1 notebooks/gp_regression_on_molecules.nblink notebooks/bayesian_optimisation_over_molecules.nblink @@ -26,6 +73,16 @@ Welcome to GAUCHE's documentation! notebooks/external_graph_kernels.nblink + +Extensions +----------------- + +If there are any specific kernels or representations that you would like to see included in GAUCHE, please reach out or submit an issue/pull request. + + +Gauche's API +================ + .. toctree:: :maxdepth: 3 :caption: API Reference @@ -34,11 +91,26 @@ Welcome to GAUCHE's documentation! modules/representations modules/dataloader - - Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` + +References +================== + +.. _bibliography: + +[1] Rogers, D. and Hahn, M., 2010. `Extended-connectivity fingerprints. `_ Journal of Chemical Information and Modeling, 50(5), pp.742-754. + +[2] Jamasb, A., Viñas Torné, R., Ma, E., Du, Y., Harris, C., Huang, K., Hall, D., Lió, P. and Blundell, T., 2022. 
`Graphein-a Python library for geometric deep learning and network analysis on biomolecular structures and interaction networks `_. Advances in Neural Information Processing Systems, 35, pp.27153-27167. + +[3] Weininger, D., 1988. `SMILES, a chemical language and information system. 1. Introduction to methodology and encoding rules. `_ Journal of Chemical Information and Computer Sciences, 28(1), pp.31-36. + +[4] Krenn, M., Häse, F., Nigam, A., Friederich, P. and Aspuru-Guzik, A., 2020. `Self-referencing embedded strings (SELFIES): A 100% robust molecular string representation `_. Machine Learning: Science and Technology, 1(4), p.045024. + +[5] Probst, D., Schwaller, P. and Reymond, J.L., 2022. `Reaction classification and yield prediction using the differential reaction fingerprint DRFP `_. Digital Discovery, 1(2), pp.91-97. + +[6] Schwaller, P., Probst, D., Vaucher, A.C., Nair, V.H., Kreutter, D., Laino, T. and Reymond, J.L., 2021. `Mapping the space of chemical reactions using attention-based neural networks `_. Nature Machine Intelligence, 3(2), pp.144-152. \ No newline at end of file diff --git a/docs/source/modules/dataloader.rst b/docs/source/modules/dataloader.rst index 57d6452..0c6cfa1 100644 --- a/docs/source/modules/dataloader.rst +++ b/docs/source/modules/dataloader.rst @@ -8,7 +8,7 @@ Dataloader Molecular Properties ---------------------- -.. automodule:: gauche.dataloader.mol_prop_loader +.. automodule:: gauche.dataloader.molprop_loader :members: Reaction Loader diff --git a/docs/source/readme.rst b/docs/source/readme.rst index 57de865..e69de29 100644 --- a/docs/source/readme.rst +++ b/docs/source/readme.rst @@ -1 +0,0 @@ -.. 
mdinclude:: ../../README.md \ No newline at end of file diff --git a/gauche/dataloader/reaction_loader.py b/gauche/dataloader/reaction_loader.py index 82b75e3..40ea64d 100644 --- a/gauche/dataloader/reaction_loader.py +++ b/gauche/dataloader/reaction_loader.py @@ -19,6 +19,16 @@ class ReactionLoader(DataLoader): + """ + Data loader class for reaction yield prediction + datasets with a single regression target. + Expects input to be a csv file with either multiple SMILES + columns or a single reaction SMARTS column. + Contains methods to validate the dataset and to + transform the SMILES/SMARTS strings into different + molecular representations. + """ + def __init__(self): super(ReactionLoader, self).__init__() self.task = "reaction_yield_prediction" diff --git a/gauche/kernels/fingerprint_kernels/braun_blanquet_kernel.py b/gauche/kernels/fingerprint_kernels/braun_blanquet_kernel.py index d4a3f87..959a8f8 100644 --- a/gauche/kernels/fingerprint_kernels/braun_blanquet_kernel.py +++ b/gauche/kernels/fingerprint_kernels/braun_blanquet_kernel.py @@ -16,7 +16,7 @@ def batch_braun_blanquet_sim( Braun-Blanquet similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - <x1, x2> / max(|x1|, |x2|) + :math:`<x1, x2> / max(|x1|, |x2|)` Where || is the L1 norm and <.> is the inner product diff --git a/gauche/kernels/fingerprint_kernels/dice_kernel.py b/gauche/kernels/fingerprint_kernels/dice_kernel.py index abbd9d2..6a56f87 100644 --- a/gauche/kernels/fingerprint_kernels/dice_kernel.py +++ b/gauche/kernels/fingerprint_kernels/dice_kernel.py @@ -14,7 +14,7 @@ def batch_dice_sim( Dice similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - (2 * <x1, x2>) / (|x1| + |x2|) + :math:`(2 * <x1, x2>) / (|x1| + |x2|)` Where || is the L1 norm and <.> is the inner product @@ -50,10 +50,10 @@ class DiceKernel(Kernel): .. 
math:: - \begin{equation*} - k_{\text{Dice}}(\mathbf{x}, \mathbf{x'}) = \frac{2\langle\mathbf{x}, - \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert + \left\lVert\mathbf{x'}\right\rVert} - \end{equation*} + \begin{equation*} + k_{\text{Dice}}(\mathbf{x}, \mathbf{x'}) = \frac{2\langle\mathbf{x}, + \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert + \left\lVert\mathbf{x'}\right\rVert} + \end{equation*} .. note:: diff --git a/gauche/kernels/fingerprint_kernels/faith_kernel.py b/gauche/kernels/fingerprint_kernels/faith_kernel.py index 343bc4e..ee3ae99 100644 --- a/gauche/kernels/fingerprint_kernels/faith_kernel.py +++ b/gauche/kernels/fingerprint_kernels/faith_kernel.py @@ -16,7 +16,7 @@ def batch_faith_sim( Faith similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - (2 * <x1, x2>) + d / 2n + :math:`(2 * <x1, x2>) + d / 2n` Where <.> is the inner product, d is the number of common zeros and n is the dimension of the input vectors diff --git a/gauche/kernels/fingerprint_kernels/forbes_kernel.py b/gauche/kernels/fingerprint_kernels/forbes_kernel.py index 431171a..c4b43d2 100644 --- a/gauche/kernels/fingerprint_kernels/forbes_kernel.py +++ b/gauche/kernels/fingerprint_kernels/forbes_kernel.py @@ -16,9 +16,9 @@ def batch_forbes_sim( Forbes similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. 
- n * <x1, x2> / (|x1| + |x2|) + :math:`n * <x1, x2> / (|x1| + |x2|)` - Where <.> is the inner product, || is the L1 norm, and n is the dimension of the input vectors + Where <.> is the inner product, :math:`||` is the L1 norm, and n is the dimension of the input vectors Args: x1: `[b x n x d]` Tensor where b is the batch dimension diff --git a/gauche/kernels/fingerprint_kernels/inner_product_kernel.py b/gauche/kernels/fingerprint_kernels/inner_product_kernel.py index db784df..8c71814 100644 --- a/gauche/kernels/fingerprint_kernels/inner_product_kernel.py +++ b/gauche/kernels/fingerprint_kernels/inner_product_kernel.py @@ -16,7 +16,7 @@ def batch_inner_product_sim( Inner product similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - <x1, x2> + :math:`<x1, x2>` Where <.> is the inner product @@ -47,10 +47,10 @@ class InnerProductKernel(Kernel): .. math:: - \begin{equation*} - k_{\text{Inner Product}}(\mathbf{x}, \mathbf{x'}) = \langle\mathbf{x}, - \mathbf{x'}\rangle - \end{equation*} + \begin{equation*} + k_{\text{Inner Product}}(\mathbf{x}, \mathbf{x'}) = \langle\mathbf{x}, + \mathbf{x'}\rangle + \end{equation*} .. note:: diff --git a/gauche/kernels/fingerprint_kernels/intersection_kernel.py b/gauche/kernels/fingerprint_kernels/intersection_kernel.py index b4ea165..dc0a612 100644 --- a/gauche/kernels/fingerprint_kernels/intersection_kernel.py +++ b/gauche/kernels/fingerprint_kernels/intersection_kernel.py @@ -17,7 +17,7 @@ def batch_intersection_sim( eps argument ensures numerical stability if all zero tensors are added. 
Must be used with binary-valued vectors only - <x1, x2> + <x1', x2'> + :math:`<x1, x2> + <x1', x2'>` Where <.> is the inner product and x1' and x2' denote the bit flipped vectors such that ones and zeros are interchanged diff --git a/gauche/kernels/fingerprint_kernels/minmax_kernel.py b/gauche/kernels/fingerprint_kernels/minmax_kernel.py index 290647b..8dd532b 100644 --- a/gauche/kernels/fingerprint_kernels/minmax_kernel.py +++ b/gauche/kernels/fingerprint_kernels/minmax_kernel.py @@ -14,9 +14,9 @@ def batch_minmax_sim( MinMax similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - (|x1| + |x2| - |x1 - x2|) / (|x1| + |x2| + |x1 - x2|) + :math:`(|x1| + |x2| - |x1 - x2|) / (|x1| + |x2| + |x1 - x2|)` - Where || is the L1 norm + Where :math:`||` is the L1 norm Args: x1: `[b x n x d]` Tensor where b is the batch dimension @@ -51,9 +51,9 @@ class MinMaxKernel(Kernel): .. math:: - \begin{equation*} - k_{\text{MinMax}}(\mathbf{x}, \mathbf{x'}) = \frac{\sum_i \min(x_i, x'_i)} - \end{equation*} + \begin{equation*} + k_{\text{MinMax}}(\mathbf{x}, \mathbf{x'}) = \frac{\sum_i \min(x_i, x'_i)}{\sum_i \max(x_i, x'_i)} + \end{equation*} .. note:: diff --git a/gauche/kernels/fingerprint_kernels/otsuka_kernel.py b/gauche/kernels/fingerprint_kernels/otsuka_kernel.py index 2c82d07..c4a868a 100644 --- a/gauche/kernels/fingerprint_kernels/otsuka_kernel.py +++ b/gauche/kernels/fingerprint_kernels/otsuka_kernel.py @@ -16,7 +16,7 @@ def batch_otsuka_sim( Otsuka similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. 
- <x1, x2> / sqrt(|x1| + |x2|) + :math:`<x1, x2> / sqrt(|x1| + |x2|)` Where || is the L1 norm and <.> is the inner product diff --git a/gauche/kernels/fingerprint_kernels/rand_kernel.py b/gauche/kernels/fingerprint_kernels/rand_kernel.py index ad5cd12..6561296 100644 --- a/gauche/kernels/fingerprint_kernels/rand_kernel.py +++ b/gauche/kernels/fingerprint_kernels/rand_kernel.py @@ -16,7 +16,7 @@ def batch_rand_sim( Rand similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - <x1, x2> + d / n + :math:`<x1, x2> + d / n` Where <.> is the inner product, d is the number of common zeros and n is the dimensionality diff --git a/gauche/kernels/fingerprint_kernels/rogers_tanimoto_kernel.py b/gauche/kernels/fingerprint_kernels/rogers_tanimoto_kernel.py index 8707b3d..6c78e17 100644 --- a/gauche/kernels/fingerprint_kernels/rogers_tanimoto_kernel.py +++ b/gauche/kernels/fingerprint_kernels/rogers_tanimoto_kernel.py @@ -16,7 +16,7 @@ def batch_rogers_tanimoto_sim( Rogers-Tanimoto similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - <x1, x2> + d / 2|x1| + 2|x2| - 3*<x1, x2> + d + :math:`<x1, x2> + d / 2|x1| + 2|x2| - 3*<x1, x2> + d` Where || is the L1 norm and <.> is the inner product and d is the number of common zeros diff --git a/gauche/kernels/fingerprint_kernels/russell_rao_kernel.py b/gauche/kernels/fingerprint_kernels/russell_rao_kernel.py index 0c9459e..820b4a3 100644 --- a/gauche/kernels/fingerprint_kernels/russell_rao_kernel.py +++ b/gauche/kernels/fingerprint_kernels/russell_rao_kernel.py @@ -16,7 +16,7 @@ def batch_russell_rao_sim( Russell-Rao similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. 
- <x1, x2> / n + :math:`<x1, x2> / n` Where <.> is the inner product and n is the dimension of the vectors x1/x2 diff --git a/gauche/kernels/fingerprint_kernels/sogenfrei_kernel.py b/gauche/kernels/fingerprint_kernels/sogenfrei_kernel.py index 02b506b..b6c6ef3 100644 --- a/gauche/kernels/fingerprint_kernels/sogenfrei_kernel.py +++ b/gauche/kernels/fingerprint_kernels/sogenfrei_kernel.py @@ -16,7 +16,7 @@ def batch_sogenfrei_sim( Sogenfrei similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - <x1, x2>**2 / (|x1| + |x2|) + :math:`<x1, x2>**2 / (|x1| + |x2|)` Where <.> is the inner product and || is the L1 norm diff --git a/gauche/kernels/fingerprint_kernels/sokal_sneath_kernel.py b/gauche/kernels/fingerprint_kernels/sokal_sneath_kernel.py index da91e23..cb4bc57 100644 --- a/gauche/kernels/fingerprint_kernels/sokal_sneath_kernel.py +++ b/gauche/kernels/fingerprint_kernels/sokal_sneath_kernel.py @@ -16,7 +16,7 @@ def batch_sokal_sneath_sim( Sokal-Sneath similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. - <x1, x2> / 2|x1| + 2|x2| - 3*<x1, x2> + :math:`<x1, x2> / 2|x1| + 2|x2| - 3*<x1, x2>` Where <.> is the inner product and || is the L1 norm diff --git a/gauche/kernels/fingerprint_kernels/tanimoto_kernel.py b/gauche/kernels/fingerprint_kernels/tanimoto_kernel.py index 1dddf98..c289016 100644 --- a/gauche/kernels/fingerprint_kernels/tanimoto_kernel.py +++ b/gauche/kernels/fingerprint_kernels/tanimoto_kernel.py @@ -15,16 +15,17 @@ def batch_tanimoto_sim( Tanimoto similarity between two batched tensors, across last 2 dimensions. eps argument ensures numerical stability if all zero tensors are added. 
Tanimoto similarity is proportional to: - (<x, y>) / (||x||^2 + ||y||^2 - <x, y>) + :math:`(<x, y>) / (||x||^2 + ||y||^2 - <x, y>)` where x and y may be bit or count vectors or in set notation: - |A \cap B | / |A| + |B| - |A \cap B | + :math:`|A \\cap B| / |A| + |B| - |A \\cap B|` Args: x1: `[b x n x d]` Tensor where b is the batch dimension x2: `[b x m x d]` Tensor eps: Float for numerical stability. Default value is 1e-6 + Returns: Tensor denoting the Tanimoto similarity. """ @@ -52,11 +53,11 @@ class TanimotoKernel(Kernel): .. math:: - \begin{equation*} - k_{\text{Tanimoto}}(\mathbf{x}, \mathbf{x'}) = \frac{\langle\mathbf{x}, - \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert^2 + \left\lVert\mathbf{x'}\right\rVert^2 - - \langle\mathbf{x}, \mathbf{x'}\rangle} - \end{equation*} + \begin{equation*} + k_{\text{Tanimoto}}(\mathbf{x}, \mathbf{x'}) = \frac{\langle\mathbf{x}, + \mathbf{x'}\rangle}{\left\lVert\mathbf{x}\right\rVert^2 + \left\lVert\mathbf{x'}\right\rVert^2 - + \langle\mathbf{x}, \mathbf{x'}\rangle} + \end{equation*} .. note:: diff --git a/gauche/representations/fingerprints.py b/gauche/representations/fingerprints.py index cb49443..7f5a142 100644 --- a/gauche/representations/fingerprints.py +++ b/gauche/representations/fingerprints.py @@ -17,11 +17,9 @@ def one_hot(df: pd.DataFrame) -> np.ndarray: Builds reaction representation as a bit vector which indicates whether a certain condition, reagent, reactant etc. is present in the reaction. - :param df: pandas DataFrame with columns representing different - parameters of the reaction (e.g. reactants, reagents, conditions). + :param df: pandas DataFrame with columns representing different parameters of the reaction (e.g. reactants, reagents, conditions). 
:type df: pandas DataFrame - :return: array of shape [len(reaction_smiles), sum(unique values for different columns in df)] - with one-hot encoding of reactions + :return: array of shape [len(reaction_smiles), sum(unique values for different columns in df)] with one-hot encoding of reactions """ df_ohe = pd.get_dummies(df) return df_ohe.to_numpy(dtype=np.float64)