From 18fd3fc4a7142281cf87d2b15ee86f19d9a9e3eb Mon Sep 17 00:00:00 2001 From: Piotr Tynecki Date: Fri, 23 Oct 2020 07:18:40 +0200 Subject: [PATCH] Extended KMersTransformer to support editable sliding window --- CHANGELOG.md | 5 +++++ docs/conf.py | 2 +- phages2050/features/transformers/kmers.py | 16 +++++++++------- setup.py | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca65929..0e6a459 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to this project will be documented in this file. +## [0.0.9] - 23.10.2020 +### Changed +* Extended `KMersTransformer` to support editable sliding window; + + ## [0.0.8] - 11.10.2020 ### Added * Initial online documentation; diff --git a/docs/conf.py b/docs/conf.py index 82bd87d..b8ce597 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ author = 'Piotr Tynecki, Iwona Świętochowska, Yana Minina, Przemysław Mitura, Wojciech Łaguna' # The full version, including alpha/beta/rc tags -release = version = '0.0.8' +release = version = '0.0.9' # -- General configuration --------------------------------------------------- diff --git a/phages2050/features/transformers/kmers.py b/phages2050/features/transformers/kmers.py index 1106a58..995dc84 100644 --- a/phages2050/features/transformers/kmers.py +++ b/phages2050/features/transformers/kmers.py @@ -20,12 +20,13 @@ class KMersTransformer(BaseEstimator, TransformerMixin): """ - K-mer transformer is responsible to extract set of - words which are subsequences of length (6 by default) - contained within a biological sequence + K-mer transformer is responsible for extracting a set of words - + using a configurable sliding window - which are subsequences + of length (6 by default) contained within a biological sequence - Each of the word is called k-mer and are composed of - nucleotides (i.e. A, T, G, and C) + Each of the words is called a k-mer and is composed of nucleotides + (i.e. A, T, G, and C).
Each word which includes other characters + is removed from the output Example: fname = 'NC_001604.fasta' @@ -37,9 +38,10 @@ class KMersTransformer(BaseEstimator, TransformerMixin): kmt.transform(sample) """ - def __init__(self, size: int = 6): + def __init__(self, size: int = 6, sliding_window: int = 1): self.accepted_chars: Set[str] = {"A", "C", "T", "G"} self.size: int = size + self.sliding_window: int = sliding_window def _extract_kmers_from_sequence(self, sequence: str) -> str: """ @@ -57,7 +59,7 @@ def _extract_kmers_from_sequence(self, sequence: str) -> str: return " ".join( [ sequence[x : x + self.size] - for x in range(len(sequence) - self.size + 1) + for x in range(0, len(sequence) - self.size + 1, self.sliding_window) if not set(sequence[x : x + self.size]) - self.accepted_chars ] ) diff --git a/setup.py b/setup.py index d81983f..88964e7 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ dependencies = list(map(str.strip, filter(None, dependencies.split("\n")))) -version = "0.0.8" +version = "0.0.9" setup( name="phages2050",