From 08b380dea299b1fd471fcdbe3de26a7a863097e0 Mon Sep 17 00:00:00 2001
From: mictadlo
Date: Mon, 21 May 2018 06:53:52 +1000
Subject: [PATCH] Racon, Miniasm and Minimap added

Add "racon" as an assembler option. It makes a draft assembly with
minimap2 (all-vs-all read overlaps) and miniasm (layout), then polishes
the draft with two rounds of minimap2 mapping followed by racon.

---
Review notes (this section is ignored by git am):
 * The .idea/ IDE project files and the duplicate "import os" added to
   circlator/merge.py were dropped from the patch.
 * The minimum SPAdes version is left at 3.6.2; raising it is unrelated
   to this change and contradicted the comment on that line.
 * "index" lines are carried over from the original patch.

 circlator/assemble.py       | 83 ++++++++++++++++++++++++++++++++++++++++-
 circlator/assembly.py       |  6 +++
 circlator/common.py         |  2 +-
 circlator/external_progs.py | 14 ++++++-
 circlator/tasks/assemble.py |  4 +-
 circlator/tasks/merge.py    |  2 +-
 install_dependencies.sh     | 44 +++++++++++++++++++++-
 setup.py                    |  2 +-
 8 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/circlator/assemble.py b/circlator/assemble.py
index 7396f43..8bb784a 100644
--- a/circlator/assemble.py
+++ b/circlator/assemble.py
@@ -19,7 +19,7 @@ def __init__(self,
       spades_use_first_success=False,
       assembler='spades',
       genomeSize=100000, # only matters for Canu if correcting reads (which we're not)
-      data_type='pacbio-corrected',
+      data_type='pacbio-raw',
     ):
         self.outdir = os.path.abspath(outdir)
         self.reads = os.path.abspath(reads)
@@ -41,6 +41,12 @@ def __init__(self,
             self.canu = external_progs.make_and_check_prog('canu', verbose=self.verbose, required=True)
             self.genomeSize=genomeSize
             self.data_type = data_type
+        elif self.assembler == 'racon':
+            # run_racon() calls minimap2 and miniasm as well as racon,
+            # so all three programs are needed
+            self.minimap2 = external_progs.make_and_check_prog('minimap2', verbose=self.verbose, required=True)
+            self.miniasm = external_progs.make_and_check_prog('miniasm', verbose=self.verbose, required=True)
+            self.racon = external_progs.make_and_check_prog('racon', verbose=self.verbose, required=True)
+            self.data_type = data_type
         else:
             raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')
 
@@ -170,11 +176,86 @@ def run_canu(self):
         renamed_gfa = os.path.join(self.outdir, 'contigs.gfa')
         os.rename(original_gfa, renamed_gfa)
 
+    def _syscall_to_file(self, cmd, outfile, description):
+        '''Runs cmd (a list of command line arguments) in the shell, with its
+        stdout redirected to outfile. Each argument is converted to a string
+        first, so non-string values (eg a number of threads) can be used.
+        Raises Error, naming description, if the command fails'''
+        cmd_string = ' '.join([str(x) for x in cmd] + ['>', outfile])
+        ok, errs = common.syscall(cmd_string, verbose=self.verbose, allow_fail=False)
+        if not ok:
+            raise Error('Error running ' + description + '.')
+
+
+    @staticmethod
+    def _gfa_to_fasta(gfa_file, fasta_file, line_width=80):
+        '''Writes the sequence of each segment (S) line of a GFA file to a
+        FASTA file, with sequence lines wrapped at line_width characters.
+        Returns the number of sequences written'''
+        total = 0
+        with open(gfa_file) as f_in, open(fasta_file, 'w') as f_out:
+            for line in f_in:
+                fields = line.rstrip().split('\t')
+                # segment lines are: S <name> <sequence> [tags]. Sequence can be "*"
+                if len(fields) < 3 or fields[0] != 'S' or fields[2] == '*':
+                    continue
+                name, seq = fields[1], fields[2]
+                print('>' + name, file=f_out)
+                for i in range(0, len(seq), line_width):
+                    print(seq[i:i + line_width], file=f_out)
+                total += 1
+        return total
+
+
+    def run_racon(self):
+        '''Runs minimap2, miniasm and racon instead of spades.
+        minimap2 finds all-vs-all read overlaps and miniasm makes a draft
+        assembly from them. The draft is polished twice, each time by mapping
+        the reads to it with minimap2 and running racon. Writes contigs.gfa
+        (the miniasm graph) and contigs.fasta (the polished contigs) in the
+        output directory. Raises Error if any step fails'''
+        # The start of the data type (eg "pacbio-raw") decides the minimap2 presets
+        if self.data_type.split('-')[0] == 'pacbio':
+            overlap_preset, map_preset = 'ava-pb', 'map-pb'
+        else:
+            overlap_preset, map_preset = 'ava-ont', 'map-ont'
+
+        paf = os.path.join(self.outdir, 'output.paf')
+        gfa = os.path.join(self.outdir, 'output.gfa')
+        draft = os.path.join(self.outdir, 'output.gfa.fasta')
+
+        self._syscall_to_file(
+            [self.minimap2.exe(), '-t', self.threads, '-x', overlap_preset, self.reads, self.reads],
+            paf, 'minimap2')
+        self._syscall_to_file(
+            [self.miniasm.exe(), '-Rc2', '-f', self.reads, paf],
+            gfa, 'miniasm')
+
+        if self._gfa_to_fasta(gfa, draft) == 0:
+            raise Error('Error running miniasm. No contigs found in ' + gfa)
+
+        # Each round maps the reads to the current draft and gives that
+        # mapping to racon. Round 2 polishes the output of round 1.
+        for i in ('1', '2'):
+            sam = os.path.join(self.outdir, 'output.gfa' + i + '.sam')
+            polished = os.path.join(self.outdir, 'output.racon' + i + '.fasta')
+            self._syscall_to_file(
+                [self.minimap2.exe(), '-t', self.threads, '-ax', map_preset, draft, self.reads],
+                sam, 'minimap2 correction step #' + i)
+            self._syscall_to_file(
+                [self.racon.exe(), '-t', self.threads, self.reads, sam, draft],
+                polished, 'racon correction step #' + i)
+            draft = polished
+
+        os.rename(gfa, os.path.join(self.outdir, 'contigs.gfa'))
+        os.rename(draft, os.path.join(self.outdir, 'contigs.fasta'))
 
     def run(self):
         if self.assembler == 'spades':
             self.run_spades(stop_at_first_success=self.spades_use_first_success)
         elif self.assembler == 'canu':
             self.run_canu()
+        elif self.assembler == 'racon':
+            self.run_racon()
         else:
             raise Error('Unknown assembler: "' + self.assembler + '". cannot continue')
diff --git a/circlator/assembly.py b/circlator/assembly.py
index 70a3bf6..5e6a8b6 100644
--- a/circlator/assembly.py
+++ b/circlator/assembly.py
@@ -64,6 +64,9 @@ def _set_filenames(self):
         elif self.assembler == 'canu':
             if self.contigs_fasta is None or self.contigs_gfa is None:
                 raise Error('Error finding canu contigs fasta and/or gfa file')
+        elif self.assembler == 'racon':
+            if self.contigs_fasta is None or self.contigs_gfa is None:
+                raise Error('Error finding racon contigs fasta and/or gfa file')
         else:
             raise Error('Assembler "' + self.assembler + '" not recognised. Cannot continue')
 
@@ -185,5 +188,8 @@ def circular_contigs(self):
                 return set()
         elif self.assembler == 'canu':
             return self._circular_contigs_from_canu_gfa(self.contigs_gfa)
+        elif self.assembler == 'racon':
+            # NOTE(review): this reuses the canu GFA parser on the miniasm GFA - confirm it finds miniasm's circular unitigs
+            return self._circular_contigs_from_canu_gfa(self.contigs_gfa)
         else:
             return set()
diff --git a/circlator/common.py b/circlator/common.py
index afce3e3..dc8fa69 100644
--- a/circlator/common.py
+++ b/circlator/common.py
@@ -4,7 +4,7 @@ class Error (Exception): pass
 
 
 
-allowed_assemblers = ['canu', 'spades']
+allowed_assemblers = ['canu', 'spades', 'racon']
 allowed_data_types = ['pacbio-raw', 'pacbio-corrected', 'nanopore-raw', 'nanopore-corrected']
 
 def syscall(cmd, allow_fail=False, verbose=False):
diff --git a/circlator/external_progs.py b/circlator/external_progs.py
index f6b9f92..30f6846 100644
--- a/circlator/external_progs.py
+++ b/circlator/external_progs.py
@@ -11,6 +11,9 @@ class Error (Exception): pass
     'samtools': 'CIRCLATOR_SAMTOOLS',
     'spades': 'CIRCLATOR_SPADES',
     'canu': 'CIRCLATOR_CANU',
+    'minimap2': 'CIRCLATOR_MINIMAP2',
+    'miniasm': 'CIRCLATOR_MINIASM',
+    'racon': 'CIRCLATOR_RACON',
 }
 
 prog_to_version_cmd = {
@@ -20,6 +23,9 @@ class Error (Exception): pass
     'samtools': ('', re.compile(r'Version: (\d+\.\d+[\.\d]*)')),
     'spades': ('-v', re.compile(r'v.?([0-9][0-9\.]+)')),
     'canu': ('-version', re.compile(r'^Canu \D*([\d][\d\.]+)')),
+    'minimap2': ('-V', re.compile(r'([0-9\.]+)')),
+    'miniasm': ('-V', re.compile(r'([0-9\.]+)')),
+    'racon': ('--version', re.compile(r'v.?([0-9][0-9\.]+)')),
 }
 
 min_versions = {
@@ -27,8 +33,11 @@ class Error (Exception): pass
     'nucmer': '3.1',
     'prodigal': '2.6',
     'samtools': '0.1.19',
     'spades': '3.6.2', # this is the first version to support python3
     'canu': '0.0',
+    'minimap2': '2.10',
+    'miniasm': '0.2',
+    'racon': '1.2.1',
 }
 
 
@@ -44,9 +53,12 @@ class Error (Exception): pass
     'spades': 'spades.py',
     'samtools': 'samtools',
     'canu': 'canu',
+    'minimap2': 'minimap2',
+    'miniasm': 'miniasm',
+    'racon': 'racon',
 }
 
-not_required = {'spades', 'canu'}
+not_required = {'spades', 'canu', 'minimap2', 'miniasm', 'racon'}
 
 def handle_error(message, raise_error=True):
     if raise_error:
diff --git a/circlator/tasks/assemble.py b/circlator/tasks/assemble.py
index e9f9a2c..233591b 100644
--- a/circlator/tasks/assemble.py
+++ b/circlator/tasks/assemble.py
@@ -5,7 +5,7 @@
 
 def run():
     parser = argparse.ArgumentParser(
-        description = 'Assemble reads using SPAdes/Canu',
+        description = 'Assemble reads using SPAdes/Canu/Racon',
         usage = 'circlator assemble [options] ')
     parser.add_argument('--not_careful', action='store_true', help='Do not use the --careful option with SPAdes (used by default)')
     parser.add_argument('--not_only_assembler', action='store_true', help='Do not use the --assemble-only option with SPAdes (used by default)')
@@ -14,7 +14,7 @@ def run():
     parser.add_argument('--spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
     parser.add_argument('--spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
     parser.add_argument('--assembler', choices=circlator.common.allowed_assemblers, help='Assembler to use for reassemblies [%(default)s]', default='spades')
-    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu) [%(default)s]', default='pacbio-corrected')
+    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu and Racon) [%(default)s]', default='pacbio-raw')
     parser.add_argument('reads', help='Name of input reads FASTA file', metavar='in.reads.fasta')
     parser.add_argument('out_dir', help='Output directory (must not already exist)')
     options = parser.parse_args()
diff --git a/circlator/tasks/merge.py b/circlator/tasks/merge.py
index 94ccd95..b8cfefd 100644
--- a/circlator/tasks/merge.py
+++ b/circlator/tasks/merge.py
@@ -18,7 +18,7 @@ def run():
     parser.add_argument('--spades_k', help='Comma separated list of kmers to use when running SPAdes. Max kmer is 127 and each kmer should be an odd integer [%(default)s]', default='127,117,107,97,87,77', metavar='k1,k2,k3,...')
     parser.add_argument('--spades_use_first', action='store_true', help='Use the first successful SPAdes assembly. Default is to try all kmers and use the assembly with the largest N50')
     parser.add_argument('--assembler', choices=circlator.common.allowed_assemblers, help='Assembler to use for reassemblies [%(default)s]', default='spades')
-    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu) [%(default)s]', default='pacbio-corrected')
+    parser.add_argument('--data_type', choices=circlator.common.allowed_data_types, help='String representing one of the 4 type of data analysed (only used for Canu and Racon) [%(default)s]', default='pacbio-raw')
     parser.add_argument('--b2r_length_cutoff', type=int, help='All reads mapped to contigs shorter than this will be kept [%(default)s]', default=100000, metavar='INT')
     parser.add_argument('--b2r_split_all_reads', action='store_true', help='By default, reads mapped to shorter contigs are left unchanged. This option splits them into two, broken at the middle of the contig to try to force circularization. May help if the assembler does not detect circular contigs (eg canu)')
     parser.add_argument('--ref_end', type=int, help='max distance allowed between nucmer hit and end of input assembly contig [%(default)s]', metavar='INT', default=15000)
diff --git a/install_dependencies.sh b/install_dependencies.sh
index 020871c..00a04cb 100755
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -9,7 +9,11 @@ CANU_VERSION=1.4
 PRODIGAL_VERSION=2.6.2
 SAMTOOLS_VERSION=1.3
 MUMMER_VERSION=3.23
-SPADES_VERSION=3.7.1
+SPADES_VERSION=3.11.1
+
+MINIMAP2_VERSION=2.10
+MINIASM_VERSION=55cf0189e2f7d5bda5868396cebe066eec0a9547
+RACON_VERSION=1.3.0
 
 BWA_DOWNLOAD_URL="http://downloads.sourceforge.net/project/bio-bwa/bwa-${BWA_VERSION}.tar.bz2"
 CANU_DOWNLOAD_URL="https://github.com/marbl/canu/releases/download/v${CANU_VERSION}/canu-${CANU_VERSION}.Linux-amd64.tar.xz"
@@ -17,7 +21,9 @@ PRODIGAL_DOWNLOAD_URL="https://github.com/hyattpd/Prodigal/releases/download/v${
 SAMTOOLS_DOWNLOAD_URL="https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2"
 MUMMER_DOWNLOAD_URL="http://downloads.sourceforge.net/project/mummer/mummer/${MUMMER_VERSION}/MUMmer${MUMMER_VERSION}.tar.gz"
 SPADES_DOWNLOAD_URL="http://spades.bioinf.spbau.ru/release${SPADES_VERSION}/SPAdes-${SPADES_VERSION}-Linux.tar.gz"
-
+MINIMAP2_DOWNLOAD_URL="https://github.com/lh3/minimap2/archive/v${MINIMAP2_VERSION}.tar.gz"
+MINIASM_DOWNLOAD_URL="https://github.com/lh3/miniasm/archive/${MINIASM_VERSION}.tar.gz"
+RACON_DOWNLOAD_URL="https://github.com/isovic/racon/releases/download/${RACON_VERSION}/racon-v${RACON_VERSION}.tar.gz"
 
 # Make an install location
 if [ ! -d 'build' ]; then
@@ -91,6 +97,36 @@ spades_dir="$build_dir/SPAdes-${SPADES_VERSION}-Linux/bin"
 tar -zxf SPAdes-${SPADES_VERSION}-Linux.tar.gz
 
 
+# --------------- minimap2 -----------------
+cd $build_dir
+download $MINIMAP2_DOWNLOAD_URL "minimap2-${MINIMAP2_VERSION}.tar.gz"
+minimap2_dir="$build_dir/minimap2-${MINIMAP2_VERSION}"
+tar -zxf minimap2-${MINIMAP2_VERSION}.tar.gz
+cd $minimap2_dir
+make
+
+
+# --------------- miniasm -----------------
+cd $build_dir
+download $MINIASM_DOWNLOAD_URL "miniasm-${MINIASM_VERSION}.tar.gz"
+miniasm_dir="$build_dir/miniasm-${MINIASM_VERSION}"
+tar -zxf miniasm-${MINIASM_VERSION}.tar.gz
+cd $miniasm_dir
+make
+
+# --------------- racon -----------------
+cd $build_dir
+download $RACON_DOWNLOAD_URL "racon-v${RACON_VERSION}.tar.gz"
+racon_dir="$build_dir/racon-v${RACON_VERSION}"
+tar -zxf racon-v${RACON_VERSION}.tar.gz
+cd $racon_dir
+mkdir -p build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release ..
+make
+racon_dir="$build_dir/racon-v${RACON_VERSION}/build/bin"
+
+
 cd $start_dir
 
 update_path () {
@@ -106,4 +142,8 @@ update_path ${prodigal_dir}
 update_path ${mummer_dir}
 update_path ${samtools_dir}
 update_path ${spades_dir}
+update_path ${minimap2_dir}
+update_path ${miniasm_dir}
+update_path ${racon_dir}
+
 
diff --git a/setup.py b/setup.py
index a074aed..c0f7b2f 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name='circlator',
-    version='1.5.5',
+    version='1.5.6',
     description='circlator: a tool to circularise genome assemblies',
     packages = find_packages(),
     package_data={'circlator': ['data/*']},