diff --git a/CHANGELOG.md b/CHANGELOG.md index 31b735b..89a00a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ All notable changes to this project will be documented in this file. +## [0.0.7] - 18.09.2020 +### Added +* new extension for `embedding` module with ESM transformer-based protein embedding model; + +### Changed +* `requirements.txt` updated; +* Updated `README.md`; + + ## [0.0.6] - 01.09.2020 ### Added * [K-mer transformer](https://en.wikipedia.org/wiki/K-mer) with parallelization support; diff --git a/README.md b/README.md index 6ab1ee7..fb0a25d 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ If you want to use Bacteriophage proteins vectorizers then remember to install e ``` pip install -U "bio-embeddings[all] @ git+https://github.com/sacdallago/bio_embeddings.git" +pip install git+https://github.com/facebookresearch/esm.git ``` ## Community and Contributions @@ -111,20 +112,18 @@ Feel free to add a new issue with a respective title and description on the [the ## Team -#### Data Science crew +Core Developers, Domain Experts, Community Managers and Educators who contribute to PHAGES2050: + * Piotr Tynecki * Yana Minina * Iwona Świętochowska * Przemysław Mitura - -#### Phage domain expert crew * Joanna Kazimierczak * Arkadiusz Guziński * Bogusław Zimnoch - -#### Community Managers and Educators crew * Jessica Sacher, PhD * Shawna McCallin, PhD +* Marie-Agnes Petit, PhD * Jan Zheng ## Change log diff --git a/examples/embeddings/Bacteriophage-proteins-embedding.ipynb b/examples/embeddings/Bacteriophage-proteins-embedding.ipynb index ce6d897..d5fb17e 100644 --- a/examples/embeddings/Bacteriophage-proteins-embedding.ipynb +++ b/examples/embeddings/Bacteriophage-proteins-embedding.ipynb @@ -5,22 +5,25 @@ "metadata": {}, "source": [ "## PHAGES2050\n", - "**Date:** 24.08.2020 \n", + "**Date:** 19.09.2020 \n", "**Author:** Pior Tynecki\n", "\n", - "That notebook is a part of examples in the 
[PHAGES2050](https://github.com/ptynecki/PHAGES2050) repository. The content of it is valid and was tested with 0.0.4 framework version.\n", + "That notebook is a part of examples in the [PHAGES2050](https://github.com/ptynecki/PHAGES2050) repository. The content of it is valid and was tested with 0.0.7 framework version.\n", "\n", - "Two belows examples are presenting the embedding model execution used for protein vectorization. The vectorization is producing 1024 feature space for each protein or for the set of proteins representing single bacteriophage.\n", + "The two examples below present the embedding model execution used for protein vectorization. The vectorization produces a 1024- or 1280-dimensional feature space for each protein or for the set of proteins representing a single bacteriophage.\n", "Depends on your needs you can use fixed-length numeric vectors for:\n", "* single protein function prediction,\n", "* feature space creation for further Machine Learning classifications,\n", - "* feature space creation for bacteriophages deeper exploration in 3D;" + "* feature space creation for bacteriophages deeper exploration in 3D;\n", + "\n", + "This notebook presents **ProtBert** and **ESM** embeddings execution." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "## ProtBert\n", "### Import necessary modules" ] }, @@ -249,7 +252,10 @@ "metadata": {}, "outputs": [], "source": [ - "embbeding_result = bert_embedding.transform(df=df_phage, bacteriophage_level=True)" + "embbeding_result = bert_embedding.transform(\n", + " df=df_phage,\n", + " bacteriophage_level=True\n", + ")" ] }, { @@ -260,6 +266,119 @@ "source": [ "embbeding_result.head()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ESM\n", + "### Import necessary modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from phages2050.embeddings.proteins.esm import ESMEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download pre-trained proteins model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# It can take a while and depends on the Internet speed\n", + "# Be aware that model size is 6.7 GB\n", + "esm_embedding = ESMEmbedding(\n", + " # Uniref100 or Uniref50\n", + " uniref='Uniref100',\n", + " # CUDA device ID or None (CPU)\n", + " cuda_device=0\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bacteriophage averaged vectorization - example 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phage_embbeding_result = esm_embedding.transform(\n", + " fasta_path='example_with_proteins.fasta',\n", + " bacteriophage_level=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phage_embbeding_result.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phage_embbeding_result.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
class ESMEmbedding:
    """
    Load a pre-trained ESM transformer model for proteins and vectorize
    the sequences stored in a FASTA file.

    Each protein is represented by the mean of its per-residue
    representations taken from one transformer layer. When the proteins
    represent a single bacteriophage, the per-protein vectors can be
    averaged further into one vector for the whole organism.
    """

    CPU = "cpu"
    # Length of the vector produced for a single protein
    FEATURE_SPACE = 1280
    UNIREF50 = "Uniref50"
    UNIREF100 = "Uniref100"

    def __init__(
        self,
        uniref: str = UNIREF50,
        toks_per_batch: int = 4096,
        extra_toks_per_seq: int = 1,
        repr_layers: int = 34,
        cuda_device: int = None,
    ):
        """
        Store the settings, select the torch device and load the model.

        :param uniref: pre-trained weights to use, ``"Uniref50"`` or
            ``"Uniref100"``
        :param toks_per_batch: forwarded to
            ``FastaBatchedDataset.get_batch_indices``
        :param extra_toks_per_seq: forwarded to
            ``FastaBatchedDataset.get_batch_indices``
        :param repr_layers: index of the layer whose output is used as
            the embedding; negative values count from the last layer
        :param cuda_device: CUDA device ID, or ``None`` to run on CPU.
            If CUDA is unavailable or the ID is unknown, CPU is used.
        :raises ValueError: if ``uniref`` is not a supported value
        """

        self.uniref = uniref
        self.toks_per_batch = toks_per_batch
        self.extra_toks_per_seq = extra_toks_per_seq
        self.repr_layers = repr_layers

        # Select GPU card (if you have more than one); anything that
        # cannot be resolved to an existing CUDA device falls back to CPU
        device_name = self.CPU
        if cuda_device is not None and torch.cuda.is_available():
            if cuda_device in self._get_cuda_devices():
                device_name = f"cuda:{cuda_device}"
        self.device = torch.device(device_name)

        # Load ESM model once
        self._load_model()

    @staticmethod
    def _get_cuda_devices() -> Dict:
        """
        Return dict with the visible CUDA devices (id: name);
        the dict is empty when torch reports no devices
        """

        return {
            gpu_id: torch.cuda.get_device_name(gpu_id)
            for gpu_id in range(torch.cuda.device_count())
        }

    def _load_model(self) -> None:
        """
        Download and load selected ESM model (Uniref50 Sparse or Uniref100)

        Sets ``self.model``, ``self.alphabet`` and ``self.layers``. It is
        executed once, from ``__init__``.

        :raises ValueError: if ``self.uniref`` is not a supported value
        """

        if self.uniref == self.UNIREF50:
            # 34 layer transformer model with 670M params, trained on Uniref50 Sparse.
            self.model, self.alphabet = esm.pretrained.esm1_t34_670M_UR50S()
        elif self.uniref == self.UNIREF100:
            # 34 layer transformer model with 670M params, trained on Uniref100.
            self.model, self.alphabet = esm.pretrained.esm1_t34_670M_UR100()
        else:
            # `NotImplemented` is a constant, not an exception class;
            # calling it raised an unrelated TypeError
            raise ValueError(
                f"Invalid uniref argument value: {self.uniref!r} "
                f"(expected {self.UNIREF50!r} or {self.UNIREF100!r})"
            )

        # `.to()` works for both CPU and CUDA devices, whereas `.cuda()`
        # broke the CPU fallback selected in `__init__`
        self.model = self.model.to(self.device)
        # Inference only: switch off training-time behaviour such as dropout
        self.model.eval()

        # Normalise the requested layer index so that negative values
        # count from the end (e.g. -1 is the last layer)
        layers_count = self.model.num_layers + 1
        self.layers = [(self.repr_layers + layers_count) % layers_count]

    def _get_data(self, fasta_path):
        """
        Load protein sequences from the FASTA or multi-FASTA file and
        wrap them in a ``torch.utils.data.DataLoader``.

        Batches are built by ``FastaBatchedDataset.get_batch_indices`` and
        collated with the batch converter of the model alphabet.

        NOTE(review): sample labels are presumably required to be unique
        by ``FastaBatchedDataset.from_file`` - confirm against the esm
        package.
        """

        dataset = FastaBatchedDataset.from_file(fasta_path)
        batch_converter = self.alphabet.get_batch_converter()
        batches = dataset.get_batch_indices(
            self.toks_per_batch, self.extra_toks_per_seq
        )

        return torch.utils.data.DataLoader(
            dataset, collate_fn=batch_converter, batch_sampler=batches
        )

    def _set_column_names(self) -> None:
        """
        Set a list with embedding column names (``ESM_0`` ... ``ESM_1279``)
        """

        self.columns = [f"ESM_{index}" for index in range(self.FEATURE_SPACE)]

    def _get_vectors(self, batched_data, bacteriophage_level: bool = False) -> List:
        """
        Run the model over every batch and return the embedding vectors.

        :param batched_data: iterable of ``(labels, strs, toks)`` batches
        :param bacteriophage_level: when True, average all protein
            vectors into a single row
        :return: 2-D NumPy array with one row per protein, or exactly one
            row when ``bacteriophage_level`` is True (the ``List``
            annotation is kept for backward compatibility)
        :raises ValueError: if ``batched_data`` yields no proteins
        """

        # The representations dict is keyed by the normalised layer index,
        # which differs from `self.repr_layers` when that one is negative
        target_layer = self.layers[0]

        protein_vectors = []

        with torch.no_grad():
            for _labels, strs, toks in batched_data:
                toks = toks.to(device=self.device, non_blocking=True)

                out = self.model(toks, repr_layers=self.layers)
                layer_output = out["representations"][target_layer].to(
                    device=self.device
                )

                for i, sequence in enumerate(strs):
                    # Average over the residue positions only; position 0
                    # is skipped (assumes a single token is prepended to
                    # each sequence - TODO confirm against the alphabet)
                    protein_vectors.append(
                        layer_output[i, 1 : len(sequence) + 1].mean(0)
                    )

        if not protein_vectors:
            raise ValueError("No protein sequences found in the input data")

        vectors = torch.stack(protein_vectors, dim=0)

        # Organism level
        if bacteriophage_level:
            vectors = vectors.mean(dim=0).reshape(1, -1)

        return vectors.cpu().numpy()

    def transform(
        self, fasta_path: str, bacteriophage_level: bool = False
    ) -> pd.DataFrame:
        """
        Execute transformer embedding directly based on FASTA input file.

        :param fasta_path: path to the FASTA or multi-FASTA file
        :param bacteriophage_level: when False, one row per protein is
            returned and rows are named ``protein_<index>``; when True, a
            single averaged row is returned, named after the FASTA file
            (base name without extension)
        :return: DataFrame with a ``name`` column followed by the
            ``ESM_*`` feature columns

        NOTE(review): per-protein rows follow the order in which the
        DataLoader yields the sequences, which may differ from the order
        in the FASTA file - confirm before mapping rows back to records.
        """

        fname, _ = os.path.splitext(os.path.basename(fasta_path))

        batched_data = self._get_data(fasta_path)
        data = self._get_vectors(batched_data, bacteriophage_level)
        self._set_column_names()

        result_df = pd.DataFrame(data, columns=self.columns)

        if bacteriophage_level:
            names = [fname]
        else:
            names = [f"protein_{index}" for index in range(result_df.shape[0])]
        result_df.insert(0, "name", names)

        return result_df
scikit-learn==0.22.2.post1 gensim==3.8.3 -numpy==1.19.1 +numpy==1.19.2 +pytest==6.0.2 +coverage==5.3 diff --git a/setup.py b/setup.py index aef0cd9..b8b3353 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ dependencies = list(map(str.strip, filter(None, dependencies.split("\n")))) -version = "0.0.6.2" +version = "0.0.7" setup( name="phages2050",