Merge branch 'develop'

ptynecki · Sep 1, 2020 · 9e0e5dd · 9e0e5dd
2 parents 2fe4d49 + 8f2a6e2
commit 9e0e5dd
Show file tree

Hide file tree

Showing 13 changed files with 437 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -130,4 +130,5 @@ dmypy.json
 
 bert_model/
 bsp_model/
+word2vec_model/
 .pypirc
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,15 @@
 All notable changes to this project will be documented in this file.
 
 
+## [0.0.6] - 01.09.2020
+### Added
+* [K-mer transformer](https://en.wikipedia.org/wiki/K-mer) with parallelization support;
+* Embedding for bacteriophage nucleotides with a new example in Jupyter Notebook format;
+
+### Changed
+* `requirements.txt` with pandarallel, scikit-learn, gensim and numpy;
+
+
 ## [0.0.5] - 27.08.2020
 ### Added
 * Initial Travis CI integration;

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -4,4 +4,4 @@ include CHANGELOG.md
 include CODE_OF_CONDUCT.md
 include README.md
 include setup.py
-recursive-include examples .ipynb
+recursive-include examples *.ipynb
diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ The repository includes numerous examples of using the framework in Jupyter Note
 
 ##### Embeddings
 * [Bacteriophage proteins embedding](https://github.com/ptynecki/PHAGES2050/blob/master/examples/embeddings/Bacteriophage-proteins-embedding.ipynb)
-* Bacteriophage DNA embedding (in progress)
+* [Bacteriophage DNA embedding](https://github.com/ptynecki/PHAGES2050/blob/master/examples/embeddings/Bacteriophage-nucleotides-embedding.ipynb)
 * Bacteriophage sequence-based biological and biochemical features extraction (planned)
 
 ##### Classifiers
@@ -123,8 +123,9 @@ Feel free to add a new issue with a respective title and description on the [the
 * Bogusław Zimnoch
 
 #### Community Managers and Educators crew
-* Jessica Sacher
-* Shawna McCallin
+* Jessica Sacher, PhD
+* Shawna McCallin, PhD
+* Jan Zheng
 
 ## Change log
 

diff --git a/examples/embeddings/Bacteriophage-nucleotides-embedding.ipynb b/examples/embeddings/Bacteriophage-nucleotides-embedding.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from phages2050.features.io.fasta import FastaReader\n",
+    "from phages2050.features.transformers.kmers import KMersTransformer, GenomeAvgTransformer\n",
+    "from phages2050.embeddings.nucleotides.word2vec import Word2VecModelManager, Word2VecEmbedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download the newest Word2Vec embedding model for nucleotides\n",
+    "# word2vec_model directory will be created\n",
+    "model_dir = Word2VecModelManager().download_model()\n",
+    "model_pkl_file = model_dir / 'word2vec-embedding-21.07.2020.pkl'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example sample in FASTA format (or multi-FASTA)\n",
+    "fasta_file = 'NC_001604.fasta'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read FASTA file and return as pandas DataFrame\n",
+    "fasta_reader = FastaReader(fasta_file)\n",
+    "sample = fasta_reader.to_df()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Transform genome sequence into k-mers sequence\n",
+    "kmt = KMersTransformer()\n",
+    "X_kmt = kmt.transform(sample)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load Word2Vec embedding model\n",
+    "w2v = Word2VecEmbedding(\n",
+    "    model_pkl_file=model_pkl_file\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Transform Bacteriophage genome into averaged vector space using Word2Vec model\n",
+    "genome_avg = GenomeAvgTransformer(gensim_model=w2v.model)\n",
+    "\n",
+    "# Return pandas DataFrame with fixed-length numeric vector\n",
+    "# ready for 3D plot exploration or Machine Learning classification\n",
+    "genome_avg.transform(X_kmt)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/phages2050/embeddings/nucleotides/__init__.py b/phages2050/embeddings/nucleotides/__init__.py
diff --git a/phages2050/embeddings/nucleotides/word2vec.py b/phages2050/embeddings/nucleotides/word2vec.py
@@ -0,0 +1,94 @@
+import os
+import base64
+from io import BytesIO
+from zipfile import ZipFile
+from typing import Dict
+from pathlib import Path
+
+import requests
+
+from gensim.models.word2vec import Word2Vec
+
+from fake_useragent import UserAgent
+
+
+class Word2VecModelManager:
+    """
+    Manager class is responsible to download and unzip
+    Word2Vec pre-trained model for nucleotides embedding
+    """
+
+    WORD2VEC_URL = base64.b64decode(
+        "aHR0cHM6Ly9kZWVwcGV0cmkuYWkvc3RhdGljL3BoYWdlczIwNTAv"
+        "d29yZDJ2ZWMtZW1iZWRkaW5nLTIxLjA3LjIwMjAuemlw"
+    )
+    STATUS_CODE_200 = 200
+
+    def __init__(self, model_dir: str = "word2vec_model"):
+        self.model_dir = model_dir
+
+        if not os.path.exists(model_dir):
+            os.mkdir(self.model_dir)
+
+    @staticmethod
+    def _get_headers() -> Dict:
+        """
+        Return header dict with random User-Agent to support request
+        and to avoid being blocked by the server
+        """
+
+        ua = UserAgent()
+        ua.update()
+
+        return {"User-Agent": ua.random}
+
+    def download_model(self) -> Path:
+        """
+        Download Word2Vec pre-trained model and unzip it into directory
+
+        This procedure should be executed once and the result
+        loaded by Word2VecEmbedding class instance
+        """
+
+        path = Path(self.model_dir)
+        # If model directory exists then return it immediately
+        if os.path.exists(path):
+            print("[DEBUG] Word2Vec model exists")
+            return path
+        else:
+            print("[DEBUG] Word2Vec model is downloading now")
+
+        headers = self._get_headers()
+
+        with requests.get(self.WORD2VEC_URL, headers=headers) as response:
+            assert response.status_code == self.STATUS_CODE_200
+
+            with ZipFile(BytesIO(response.content)) as zip_file:
+                zip_file.extractall(self.model_dir)
+
+        return path
+
+
+class Word2VecEmbedding:
+    """
+    Word2Vec instance loader class
+    """
+
+    def __init__(self, model_pkl_file: str):
+        """
+        Pickle file need to be serialized by Word2Vec.save method
+        before it will be loader with this class
+        """
+
+        self.model_pkl_file = model_pkl_file
+        if not os.path.exists(self.model_pkl_file):
+            raise Exception("Word2Vec model wasn't downloaded yet")
+
+        self.model = Word2Vec.load(self.model_pkl_file)
+        self.feature_space = self.model.vector_size
+
+    def get_train_params(self) -> Exception:
+        """
+        TODO: return dict with model train parameters
+        """
+        raise NotImplemented
diff --git a/phages2050/features/io/__init__.py b/phages2050/features/io/__init__.py
diff --git a/phages2050/features/io/fasta.py b/phages2050/features/io/fasta.py
@@ -0,0 +1,65 @@
+import os
+
+import pandas as pd
+
+from Bio.SeqIO.FastaIO import FastaIterator
+from Bio.SeqRecord import SeqRecord
+
+
+class FastaReader:
+    """
+    Universal class for reading FASTA files with genome or protein
+    sequence or multi-FASTA with chunks of sequences
+
+    Example:
+
+        fname = 'NC_001604.fasta'
+        fr = FastaReader(fname)
+
+        kmers_sequence = fr.get_sequence()
+
+        ks_df = fr.to_df()
+    """
+
+    def __init__(self, fasta_file_path: str):
+        self.fasta_file_path = fasta_file_path
+        self.fasta_name = os.path.basename(self.fasta_file_path)
+
+    @staticmethod
+    def _fasta_reader(filename: str) -> SeqRecord:
+        """
+        FASTA file reader as iterator
+        """
+
+        with open(filename) as handle:
+            for record in FastaIterator(handle):
+                yield record
+
+    @staticmethod
+    def _normalize(entry: SeqRecord) -> str:
+        """
+        Each of the sequence is normalized into uppercase
+        format without blank chars at the end
+        """
+
+        return str(entry.seq).upper().strip()
+
+    def get_sequence(self) -> str:
+        """
+        Final genome or protein sequence string after normalization
+        """
+
+        sequence: str = ""
+
+        for entry in self._fasta_reader(self.fasta_file_path):
+            sequence += f"{self._normalize(entry)} "
+
+        return sequence.strip()
+
+    def to_df(self) -> pd.DataFrame:
+        """
+        Return pandas DataFrame with k-mers sequence
+        format what is expected by KMersTransformer
+        """
+
+        return pd.DataFrame(data=[self.get_sequence()], columns=["sequence"])
diff --git a/phages2050/features/transformers/__init__.py b/phages2050/features/transformers/__init__.py