Showing 13 changed files with 437 additions and 11 deletions.
@@ -130,4 +130,5 @@ dmypy.json

bert_model/
bsp_model/
word2vec_model/
.pypirc
examples/embeddings/Bacteriophage-nucleotides-embedding.ipynb (106 additions, 0 deletions)
@@ -0,0 +1,106 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from phages2050.features.io.fasta import FastaReader\n",
    "from phages2050.features.transformers.kmers import KMersTransformer, GenomeAvgTransformer\n",
    "from phages2050.embeddings.nucleotides.word2vec import Word2VecModelManager, Word2VecEmbedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the newest Word2Vec embedding model for nucleotides;\n",
    "# the word2vec_model directory will be created\n",
    "model_dir = Word2VecModelManager().download_model()\n",
    "model_pkl_file = model_dir / 'word2vec-embedding-21.07.2020.pkl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example sample in FASTA format (or multi-FASTA)\n",
    "fasta_file = 'NC_001604.fasta'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the FASTA file and return it as a pandas DataFrame\n",
    "fasta_reader = FastaReader(fasta_file)\n",
    "sample = fasta_reader.to_df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform the genome sequence into a k-mers sequence\n",
    "kmt = KMersTransformer()\n",
    "X_kmt = kmt.transform(sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Word2Vec embedding model\n",
    "w2v = Word2VecEmbedding(\n",
    "    model_pkl_file=model_pkl_file\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform the bacteriophage genome into an averaged vector space using the Word2Vec model\n",
    "genome_avg = GenomeAvgTransformer(gensim_model=w2v.model)\n",
    "\n",
    "# Return a pandas DataFrame with a fixed-length numeric vector,\n",
    "# ready for 3D plot exploration or Machine Learning classification\n",
    "genome_avg.transform(X_kmt)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
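The final cell's comment mentions 3D plot exploration. Below is a minimal sketch of what that could look like, assuming the resulting DataFrame (here called embedding_df, a name introduced only for this sketch) was built from a multi-FASTA with at least a handful of genomes, and that scikit-learn and matplotlib are installed; neither is part of this commit.

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the 3d projection on older matplotlib)
from sklearn.decomposition import PCA

# embedding_df is assumed to be the DataFrame returned by genome_avg.transform(X_kmt)
coords = PCA(n_components=3).fit_transform(embedding_df.values)

fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(coords[:, 0], coords[:, 1], coords[:, 2])
ax.set_title("Bacteriophage genome embeddings (PCA)")
plt.show()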
Empty file.
@@ -0,0 +1,94 @@
import os
import base64
from io import BytesIO
from zipfile import ZipFile
from typing import Dict
from pathlib import Path

import requests

from gensim.models.word2vec import Word2Vec

from fake_useragent import UserAgent


class Word2VecModelManager:
    """
    Manager class responsible for downloading and unzipping
    the pre-trained Word2Vec model for nucleotide embedding
    """

    WORD2VEC_URL = base64.b64decode(
        "aHR0cHM6Ly9kZWVwcGV0cmkuYWkvc3RhdGljL3BoYWdlczIwNTAv"
        "d29yZDJ2ZWMtZW1iZWRkaW5nLTIxLjA3LjIwMjAuemlw"
    )
    STATUS_CODE_200 = 200

    def __init__(self, model_dir: str = "word2vec_model"):
        self.model_dir = model_dir

        if not os.path.exists(model_dir):
            os.mkdir(self.model_dir)

    @staticmethod
    def _get_headers() -> Dict:
        """
        Return a headers dict with a random User-Agent to support the request
        and to avoid being blocked by the server
        """

        ua = UserAgent()
        ua.update()

        return {"User-Agent": ua.random}

    def download_model(self) -> Path:
        """
        Download the pre-trained Word2Vec model and unzip it into the model directory.
        This procedure should be executed once; the result is then
        loaded by a Word2VecEmbedding class instance
        """

        path = Path(self.model_dir)
        # If the model directory already contains files then return it immediately
        # (the constructor creates an empty directory, so check contents, not existence)
        if os.path.exists(path) and os.listdir(path):
            print("[DEBUG] Word2Vec model exists")
            return path
        else:
            print("[DEBUG] Word2Vec model is downloading now")

        headers = self._get_headers()

        with requests.get(self.WORD2VEC_URL, headers=headers) as response:
            assert response.status_code == self.STATUS_CODE_200

            with ZipFile(BytesIO(response.content)) as zip_file:
                zip_file.extractall(self.model_dir)

        return path


class Word2VecEmbedding:
    """
    Word2Vec instance loader class
    """

    def __init__(self, model_pkl_file: str):
        """
        The pickle file needs to be serialized with the Word2Vec.save method
        before it can be loaded with this class
        """

        self.model_pkl_file = model_pkl_file
        if not os.path.exists(self.model_pkl_file):
            raise Exception("Word2Vec model wasn't downloaded yet")

        self.model = Word2Vec.load(self.model_pkl_file)
        self.feature_space = self.model.vector_size

    def get_train_params(self) -> Dict:
        """
        TODO: return a dict with the model training parameters
        """
        raise NotImplementedError
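For reference, a minimal usage sketch of the two classes above, mirroring the example notebook; the pickle filename is the one used in that notebook and may change with newer model releases.

from pathlib import Path

from phages2050.embeddings.nucleotides.word2vec import (
    Word2VecModelManager,
    Word2VecEmbedding,
)

# Download (or reuse) the pre-trained model directory
model_dir: Path = Word2VecModelManager(model_dir="word2vec_model").download_model()

# Load the serialized gensim Word2Vec model
embedding = Word2VecEmbedding(
    model_pkl_file=str(model_dir / "word2vec-embedding-21.07.2020.pkl")
)
print(embedding.feature_space)  # dimensionality of the nucleotide vector space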
Empty file.
@@ -0,0 +1,65 @@
import os
from typing import Iterator

import pandas as pd

from Bio.SeqIO.FastaIO import FastaIterator
from Bio.SeqRecord import SeqRecord


class FastaReader:
    """
    Universal class for reading FASTA files with a genome or protein
    sequence, or multi-FASTA files with chunks of sequences

    Example:
        fname = 'NC_001604.fasta'
        fr = FastaReader(fname)
        sequence = fr.get_sequence()
        sequence_df = fr.to_df()
    """

    def __init__(self, fasta_file_path: str):
        self.fasta_file_path = fasta_file_path
        self.fasta_name = os.path.basename(self.fasta_file_path)

    @staticmethod
    def _fasta_reader(filename: str) -> Iterator[SeqRecord]:
        """
        FASTA file reader as an iterator
        """

        with open(filename) as handle:
            for record in FastaIterator(handle):
                yield record

    @staticmethod
    def _normalize(entry: SeqRecord) -> str:
        """
        Each sequence is normalized to uppercase
        with surrounding whitespace stripped
        """

        return str(entry.seq).upper().strip()

    def get_sequence(self) -> str:
        """
        Final genome or protein sequence string after normalization
        """

        sequence: str = ""

        for entry in self._fasta_reader(self.fasta_file_path):
            sequence += f"{self._normalize(entry)} "

        return sequence.strip()

    def to_df(self) -> pd.DataFrame:
        """
        Return a pandas DataFrame with the sequence in the
        format expected by KMersTransformer
        """

        return pd.DataFrame(data=[self.get_sequence()], columns=["sequence"])
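For context, a minimal sketch of how FastaReader feeds KMersTransformer, following the example notebook.

from phages2050.features.io.fasta import FastaReader
from phages2050.features.transformers.kmers import KMersTransformer

# Read a FASTA (or multi-FASTA) file into the one-column DataFrame
# expected by KMersTransformer
sample = FastaReader("NC_001604.fasta").to_df()

# Split each genome sequence into a k-mers sentence for the Word2Vec model
X_kmt = KMersTransformer().transform(sample)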
Empty file.