Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Uppercase #164

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added deblur/test/__init__.py
Empty file.
90 changes: 88 additions & 2 deletions deblur/test/test_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from unittest import TestCase, main
from shutil import rmtree
from tempfile import mkdtemp
from tempfile import mkdtemp, NamedTemporaryFile
from os import listdir, remove
from types import GeneratorType
from os.path import join, isfile, abspath, dirname, splitext
Expand All @@ -34,7 +34,8 @@
remove_artifacts_from_biom_table,
_get_fastq_variant,
filter_minreads_samples_from_table,
fasta_from_biom)
fasta_from_biom,
upper_fasta)
from deblur.deblurring import get_default_error_profile


Expand Down Expand Up @@ -213,6 +214,28 @@ def test_dereplicate_seqs_remove_singletons(self):

self.assertEqual(act, exp)

def test_upper_fasta(self):
    """ Test upper_fasta() method functionality,
    sequence lines are rewritten in upper case in place while
    the '>' header lines are left as they are
    """
    mixed_case = (">Seq1\nAAGtttcA\n"
                  ">Seq2\nAAAAGCcA\n"
                  ">Seq3\nAAGTGCAA\n")
    expected = (">Seq1\nAAGTTTCA\n"
                ">Seq2\nAAAAGCCA\n"
                ">Seq3\nAAGTGCAA\n")

    fasta_fp = join(self.working_dir, "seqs_lower.fasta")
    with open(fasta_fp, 'w') as handle:
        handle.write(mixed_case)

    # upper_fasta() rewrites the file it is given, so read the same path
    upper_fasta(fasta_fp)
    with open(fasta_fp) as handle:
        observed = handle.read()

    remove(fasta_fp)

    self.assertEqual(expected, observed)

def test_dereplicate_seqs(self):
""" Test dereplicate_seqs() method functionality,
keep singletons
Expand Down Expand Up @@ -530,6 +553,69 @@ def test_remove_chimeras_denovo_from_seqs(self):
seqs_obs.append(label)
self.assertEqual(seqs_non_chimera, seqs_obs)

def test_remove_chimeras_denovo_from_seqs_lower(self):
    """ Test remove_chimeras_denovo_from_seqs() method functionality.
    The non-chimeric sequences written to the output FASTA file must
    match the input sequences exactly, i.e. in upper case
    (remove_chimeras_denovo_from_seqs() passes its output through
    upper_fasta()).
    NOTE(review): presumably the chimera filter itself may emit
    lower-case bases for some of these reads -- confirm.
    """
    seqs = [("s1_104;size=2;", "GTGCCAGCCGCCGCGGTAATACCCGCAGCTCAAGTGGTG"
                               "GTCGCTATTATTGAGCCTAAAACGTCCGTAGTCGGCTTT"
                               "GTAAATCCCTGGGTAAATCGGGAAGCTTAACTTTCCGAC"
                               "TTCCGAGGAGACTGTCAAACTTGGGACCGGGAG"),
            ("s1_106;size=2;", "GTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTG"
                               "TGGATGTTTATTGGGCCTAAAGCGTCCGTAGCCGGCTGC"
                               "GCAAGTCTGTCGGGAAATCCGCACGCCTAACGTGCGGGC"
                               "GTCCGGCGGAAACTGCGTGGCTTGGGACCGGAA"),
            ("s1_1;size=9;", "TACCCGCAGCTCAAGTGGTGGTCGCTATTATTGAGCCTAAA"
                             "ACGTCCGTAGTCGGCTTTGTAAATCCCTGGGTAAATCGGGT"
                             "CGCTTAACGATCCGATTCTGGGGAGACTGCAAAGCTTGGGA"
                             "CCGGGCGAGGTTAGAGGTACTCTCGGG"),
            ("s1_20;size=9;", "TACCTGCAGCCCAAGTGGTGGTCGATTTTATTGAGTCTAA"
                              "AACGTTCGTAGCCGGTTTGATAAATCCTTGGGTAAATCGG"
                              "GAAGCTTAACTTTCCGATTCCGAGGAGACTGTCAAACTTG"
                              "GGACCGGGAGAGGCTAGAGGTACTTCTGGG"),
            ("s1_40;size=8;", "TACCAGCTCTCCGAGTGGTGTGGATGTTTATTGGGCCTAA"
                              "AGCATCCGTAGCTGGCTAGGTTAGTCCCCTGTTAAATCCA"
                              "CCGAATTAATCGTTGGATGCGGGGGATACTGCTTGGCTAG"
                              "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
            ("s1_60;size=8;", "TACCGGCAGCTCAAGTGATGACCGCTATTATTGGGCCTAA"
                              "AGCGTCCGTAGCCGGCTGCGCAAGTCTGTCGGGAAATCCG"
                              "CACGCCTAACGTGCGGGTCCGGCGGAAACTGCGTGGCTTG"
                              "GGACCGGAAGACTCGAGGGGTACGTCAGGG")]
    names_non_chimera = ["s1_1;size=9;", "s1_20;size=9;",
                         "s1_40;size=8;", "s1_60;size=8;"]
    # the expected sequences are exactly the input sequences of the
    # non-chimeric reads, in the same order
    seq_by_name = dict(seqs)
    seqs_non_chimera = [seq_by_name[name] for name in names_non_chimera]

    seqs_fp = join(self.working_dir, "seqs.fasta")
    with open(seqs_fp, 'w') as seqs_f:
        seqs_f.writelines(">%s\n%s\n" % record for record in seqs)

    output_fp = remove_chimeras_denovo_from_seqs(
        seqs_fp=seqs_fp,
        working_dir=self.working_dir)

    # keep only the first whitespace-delimited token of each label
    records_obs = [(label.split()[0], seq)
                   for label, seq in sequence_generator(output_fp)]
    names_obs = [name for name, _ in records_obs]
    seqs_obs = [seq for _, seq in records_obs]

    self.assertEqual(names_non_chimera, names_obs)
    self.assertEqual(seqs_non_chimera, seqs_obs)

def test_multiple_sequence_alignment(self):
"""Test multiple sequence alignment.
"""
Expand Down
18 changes: 18 additions & 0 deletions deblur/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import warnings
import io
import os
import tempfile
import shutil

import skbio
from biom.table import Table
Expand Down Expand Up @@ -567,9 +569,25 @@ def remove_chimeras_denovo_from_seqs(seqs_fp, working_dir, threads=1):
logger.error('problem with chimera removal for file %s' % seqs_fp)
logger.debug('stdout : %s' % sout)
logger.debug('stderr : %s' % serr)

upper_fasta(output_fp)

return output_fp


def upper_fasta(fp):
    """Convert the sequences of a FASTA file to upper case, in place.

    Header lines (those starting with '>') are written back unchanged;
    every other line is upper-cased. The result is staged in a temporary
    file and then copied over the original file.

    Parameters
    ----------
    fp : str
        Path of the FASTA file to rewrite.
    """
    tmp_fp = None
    try:
        # a single context-managed handle: the temporary file is closed
        # (and flushed) before it is copied, and no descriptor is leaked
        with open(fp) as in_f, \
                tempfile.NamedTemporaryFile(mode='w',
                                            delete=False) as out_f:
            tmp_fp = out_f.name
            for line in in_f:
                if line.startswith('>'):
                    out_f.write(line)
                else:
                    out_f.write(line.upper())
        # copy the contents only: fp keeps its own permission bits and
        # does not inherit the metadata of the temporary file
        shutil.copyfile(tmp_fp, fp)
    finally:
        # remove the staging file even when reading or copying failed
        if tmp_fp is not None:
            os.remove(tmp_fp)


def sample_id_from_read_id(readid):
"""Get SampleID from the split_libraries_fastq.py output
fasta file read header
Expand Down