Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
Piotr Tynecki committed Sep 1, 2020
2 parents 2fe4d49 + 8f2a6e2 commit 9e0e5dd
Show file tree
Hide file tree
Showing 13 changed files with 437 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,5 @@ dmypy.json

bert_model/
bsp_model/
word2vec_model/
.pypirc
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@
All notable changes to this project will be documented in this file.


## [0.0.6] - 01.09.2020
### Added
* [K-mer transformer](https://en.wikipedia.org/wiki/K-mer) with parallelization support;
* Embedding for bacteriophage nucleotides with a new example in Jupyter Notebook format;

### Changed
* `requirements.txt` with pandarallel, scikit-learn, gensim and numpy;


## [0.0.5] - 27.08.2020
### Added
* Initial Travis CI integration;
Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ include CHANGELOG.md
include CODE_OF_CONDUCT.md
include README.md
include setup.py
recursive-include examples .ipynb
recursive-include examples *.ipynb
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ The repository includes numerous examples of using the framework in Jupyter Note

##### Embeddings
* [Bacteriophage proteins embedding](https://github.com/ptynecki/PHAGES2050/blob/master/examples/embeddings/Bacteriophage-proteins-embedding.ipynb)
* Bacteriophage DNA embedding (in progress)
* [Bacteriophage DNA embedding](https://github.com/ptynecki/PHAGES2050/blob/master/examples/embeddings/Bacteriophage-nucleotides-embedding.ipynb)
* Bacteriophage sequence-based biological and biochemical features extraction (planned)

##### Classifiers
Expand Down Expand Up @@ -123,8 +123,9 @@ Feel free to add a new issue with a respective title and description on the [the
* Bogusław Zimnoch

#### Community Managers and Educators crew
* Jessica Sacher
* Shawna McCallin
* Jessica Sacher, PhD
* Shawna McCallin, PhD
* Jan Zheng

## Change log

Expand Down
106 changes: 106 additions & 0 deletions examples/embeddings/Bacteriophage-nucleotides-embedding.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from phages2050.features.io.fasta import FastaReader\n",
"from phages2050.features.transformers.kmers import KMersTransformer, GenomeAvgTransformer\n",
"from phages2050.embeddings.nucleotides.word2vec import Word2VecModelManager, Word2VecEmbedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download the newest Word2Vec embedding model for nucleotides\n",
"# word2vec_model directory will be created\n",
"model_dir = Word2VecModelManager().download_model()\n",
"model_pkl_file = model_dir / 'word2vec-embedding-21.07.2020.pkl'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example sample in FASTA format (or multi-FASTA)\n",
"fasta_file = 'NC_001604.fasta'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read FASTA file and return as pandas DataFrame\n",
"fasta_reader = FastaReader(fasta_file)\n",
"sample = fasta_reader.to_df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Transform genome sequence into k-mers sequence\n",
"kmt = KMersTransformer()\n",
"X_kmt = kmt.transform(sample)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load Word2Vec embedding model\n",
"w2v = Word2VecEmbedding(\n",
" model_pkl_file=model_pkl_file\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Transform Bacteriophage genome into averaged vector space using Word2Vec model\n",
"genome_avg = GenomeAvgTransformer(gensim_model=w2v.model)\n",
"\n",
"# Return pandas DataFrame with fixed-length numeric vector\n",
"# ready for 3D plot exploration or Machine Learning classification\n",
"genome_avg.transform(X_kmt)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Empty file.
94 changes: 94 additions & 0 deletions phages2050/embeddings/nucleotides/word2vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import base64
from io import BytesIO
from zipfile import ZipFile
from typing import Dict
from pathlib import Path

import requests

from gensim.models.word2vec import Word2Vec

from fake_useragent import UserAgent


class Word2VecModelManager:
    """
    Manager class responsible for downloading and unzipping the
    pre-trained Word2Vec model for nucleotides embedding.

    The model is fetched as a ZIP archive from ``WORD2VEC_URL`` and
    extracted into ``model_dir``.
    """

    # The archive URL is kept base64-encoded in the source and decoded
    # (to ``bytes``) when the class body is executed.
    WORD2VEC_URL = base64.b64decode(
        "aHR0cHM6Ly9kZWVwcGV0cmkuYWkvc3RhdGljL3BoYWdlczIwNTAv"
        "d29yZDJ2ZWMtZW1iZWRkaW5nLTIxLjA3LjIwMjAuemlw"
    )
    # The only HTTP status accepted as a successful download
    STATUS_CODE_200 = 200

    def __init__(self, model_dir: str = "word2vec_model"):
        """
        :param model_dir: directory the model archive is extracted into;
            created (together with missing parent directories) when it
            does not exist yet
        """

        self.model_dir = model_dir

        if not os.path.exists(self.model_dir):
            # makedirs instead of mkdir so nested paths such as
            # "models/word2vec" work as well
            os.makedirs(self.model_dir, exist_ok=True)

    @staticmethod
    def _get_headers() -> Dict:
        """
        Return header dict with random User-Agent to support request
        and to avoid being blocked by the server
        """

        ua = UserAgent()
        ua.update()

        return {"User-Agent": ua.random}

    def download_model(self) -> Path:
        """
        Download Word2Vec pre-trained model and unzip it into directory.

        This procedure should be executed once and the result
        loaded by Word2VecEmbedding class instance.

        :return: path of the model directory
        :raises AssertionError: when the server does not answer with
            HTTP 200
        """

        path = Path(self.model_dir)

        # __init__ always creates the directory, so its mere existence says
        # nothing about the model. Only a directory that already has
        # content is treated as "downloaded".
        if path.is_dir() and any(path.iterdir()):
            print("[DEBUG] Word2Vec model exists")
            return path

        print("[DEBUG] Word2Vec model is downloading now")

        headers = self._get_headers()

        with requests.get(self.WORD2VEC_URL, headers=headers) as response:
            # Explicit raise instead of an `assert` statement so the check
            # is not stripped under `python -O`; AssertionError is kept so
            # the exception type seen by callers does not change.
            if response.status_code != self.STATUS_CODE_200:
                raise AssertionError(
                    "Word2Vec model download failed with HTTP status "
                    f"{response.status_code}"
                )

            with ZipFile(BytesIO(response.content)) as zip_file:
                zip_file.extractall(self.model_dir)

        return path


class Word2VecEmbedding:
    """
    Word2Vec instance loader class.

    After construction the loaded model is available as ``model`` and
    its ``vector_size`` as ``feature_space``.
    """

    def __init__(self, model_pkl_file: str):
        """
        Pickle file needs to be serialized by the Word2Vec.save method
        before it can be loaded with this class.

        :param model_pkl_file: path to the serialized Word2Vec model
        :raises FileNotFoundError: when ``model_pkl_file`` does not exist
        """

        self.model_pkl_file = model_pkl_file
        if not os.path.exists(self.model_pkl_file):
            # FileNotFoundError is a subclass of Exception, so callers
            # that catch Exception keep working
            raise FileNotFoundError("Word2Vec model wasn't downloaded yet")

        self.model = Word2Vec.load(self.model_pkl_file)
        self.feature_space = self.model.vector_size

    def get_train_params(self) -> Dict:
        """
        TODO: return dict with model train parameters

        :raises NotImplementedError: always, until this is implemented
        """

        # NotImplemented is a comparison sentinel, not an exception;
        # raising it produces a TypeError instead of the intended signal
        raise NotImplementedError
Empty file.
65 changes: 65 additions & 0 deletions phages2050/features/io/fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
from typing import Iterator

import pandas as pd

from Bio.SeqIO.FastaIO import FastaIterator
from Bio.SeqRecord import SeqRecord


class FastaReader:
    """
    Universal class for reading FASTA files with genome or protein
    sequence or multi-FASTA with chunks of sequences

    Example:
        fname = 'NC_001604.fasta'
        fr = FastaReader(fname)
        sequence = fr.get_sequence()
        sequence_df = fr.to_df()
    """

    def __init__(self, fasta_file_path: str):
        """
        :param fasta_file_path: path to the FASTA (or multi-FASTA) file
        """

        self.fasta_file_path = fasta_file_path
        # File name without the directory part
        self.fasta_name = os.path.basename(self.fasta_file_path)

    @staticmethod
    def _fasta_reader(filename: str) -> Iterator[SeqRecord]:
        """
        FASTA file reader as iterator: yields the records of the file
        one by one; the file stays open only while it is being iterated
        """

        with open(filename) as handle:
            yield from FastaIterator(handle)

    @staticmethod
    def _normalize(entry: SeqRecord) -> str:
        """
        Each of the sequence is normalized into uppercase
        format without blank chars at the end
        """

        return str(entry.seq).upper().strip()

    def get_sequence(self) -> str:
        """
        Final genome or protein sequence string after normalization.

        The normalized sequences of all records in the file are joined
        with a single space; leading and trailing whitespace is removed
        from the result.
        """

        normalized_entries = (
            self._normalize(entry)
            for entry in self._fasta_reader(self.fasta_file_path)
        )

        # str.join instead of repeated `+=` avoids quadratic string
        # building on multi-FASTA files with many records
        return " ".join(normalized_entries).strip()

    def to_df(self) -> pd.DataFrame:
        """
        Return a one-row pandas DataFrame with a single "sequence"
        column holding the result of get_sequence(), the format
        expected by KMersTransformer
        """

        return pd.DataFrame(data=[self.get_sequence()], columns=["sequence"])
Empty file.
Loading

0 comments on commit 9e0e5dd

Please sign in to comment.