From fa4f08f57e1f9a6b108d62ec7329a6ba937b8579 Mon Sep 17 00:00:00 2001
From: RVanDamme <38455046+RVanDamme@users.noreply.github.com>
Date: Thu, 28 Jun 2018 14:33:17 +0200
Subject: [PATCH 1/5] Update README.md

---
 README.md | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 621e65b..1aa830b 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 `capture` is an assembler developed to recover complete genome from ultra-high coverage samples
 
-The repository is a work in progress and the assembler is not functional yet.
+The repository is a work in progress and the assembler is not yet fully functional.
 Thanks for your interest and come back soon!
 
 ## Installation
@@ -15,8 +15,11 @@ Thanks for your interest and come back soon!
 
 * python >= 3.6
 * SPAdes
+* Canu
 
-*TODO*
+To install capture-assembler, simply type the following in your terminal:
+    pip install capture-assembler
+
 
 ## Quickstart
@@ -30,7 +33,7 @@ capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
 # single end reads, Ion Torrent
 capture assemble -u reads.fastq.gz \
 --genome_size 35000 --mean 240 -o output_dir
-# reads in bam format
+# reads in bam format (Ion Torrent)
 capture assemble --bam reads.bam \
 --genome_size 35000 --mean 240 -o output_dir
 # full list of subcommands and options
@@ -38,11 +41,34 @@ capture -h
 # full list of options for a subcommand
 capture assemble -h
 ```
 
+## Arguments
+
+```
+  -h, --help         show the help message and exit
+  -v, --version      print version and exit
+  -u, --uniq         Input reads file in fastq or fastq.gz format
+  -f, --forward      Input forward reads file in fastq or fastq.gz format
+  -r, --reverse      Input reverse reads file in fastq or fastq.gz format
+  -b, --bam          Input reads file in bam format. It will be treated as Ion Torrent data by SPAdes
+  -g, --genome_size  The expected size of the genome, as a numeric value
+  -m, --mean         The mean length of the reads in the input file
+  -o, --output       The output directory
+  -s, --subsample    The number of subsamples to produce. Default: the maximum possible
+  -t, --thread       The number of threads available. Default: 4
+  -M, --memory       The memory available, in gigabytes. Default: 16G
+  -c, --clean        Clean the temporary files. Default: True
+
+```
+
 ## License
 
 Code is under the [MIT](LICENSE) license
 
+## Issues
+
+Found a bug or have a question? Please open an [issue](https://github.com/SGBC/capture/issues).
+
 ## Contributing
 
 We welcome contributions from the community! See our [Contributing](CONTRIBUTING.md) guide.
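The `--genome_size`, `--mean`, and `--subsample` options introduced in this README drive the subsampling step: each subsample holds just enough reads to cover the genome at a target depth (`wanted_cov=100` is the default visible in `capture/split.py` in patch 2 below). A minimal sketch of that arithmetic, assuming the default coverage; the function names `reads_per_subsample` and `max_subsamples` are hypothetical, for illustration only:

```python
import math

def reads_per_subsample(genome_size, mean_read_length, wanted_cov=100):
    """Reads whose combined length gives roughly wanted_cov x the genome size."""
    # wanted_cov=100 mirrors the default of capture/split.py (patch 2 below)
    return math.ceil(genome_size * wanted_cov / mean_read_length)

def max_subsamples(total_records, genome_size, mean_read_length, wanted_cov=100):
    """How many full subsamples an input of total_records reads can yield."""
    return total_records // reads_per_subsample(genome_size, mean_read_length, wanted_cov)

if __name__ == "__main__":
    # a 35 kb genome and 300 bp MiSeq reads, as in the Quickstart above
    print(reads_per_subsample(35000, 300))        # 11667 reads for ~100x coverage
    print(max_subsamples(1_000_000, 35000, 300))  # 85 subsamples from 1M reads
```

With `--subsample` left at its default, the maximum value computed this way is used and every read lands in some subsample; a smaller value switches the parsers to random selection, as the patches below describe.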
From 51722580c0ab85ce755518c8e5d703e1ab7c1bb0 Mon Sep 17 00:00:00 2001
From: RVanDamme
Date: Thu, 28 Jun 2018 13:38:42 +0200
Subject: [PATCH 2/5] adding files for packaging and starting documentation

---
 .gitignore       |  6 +++++
 MANIFEST.in      |  1 +
 capture/app.py   |  2 +-
 capture/bam.py   | 21 ++++++++++--------
 capture/clean.py |  5 ++++-
 capture/jobs.py  | 29 +++++++++++++++++++++++-
 capture/join.py  |  6 +++++
 capture/parse.py | 57 +++++++++++++++++++++++++++++++++++++++++++-----
 capture/run.py   | 17 ++++++++++++++-
 capture/split.py | 13 ++++++++++-
 setup.py         | 37 +++++++++++++++++++++++++++++++
 11 files changed, 175 insertions(+), 19 deletions(-)
 create mode 100644 MANIFEST.in
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
index 7866689..33bb37b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,9 @@ __pycache__
 
 # PyDoit
 .doit.db
+
+# Pip Package
+capture_assembler.egg-info
+Capture.egg-info
+dist
+build
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..bb3ec5f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include README.md
diff --git a/capture/app.py b/capture/app.py
index 5c13077..f850320 100644
--- a/capture/app.py
+++ b/capture/app.py
@@ -17,7 +17,7 @@
 
 def assemble(args):
     """
-    main function for the capture software
+    main function for the capture-assembler
 
     Arguments:
         args (object): the argument dictionary from argparse
diff --git a/capture/bam.py b/capture/bam.py
index 2dab64a..5fae562 100644
--- a/capture/bam.py
+++ b/capture/bam.py
@@ -6,8 +6,11 @@
 def count_bam(file):
     """ count the number of reads present in the file
-    bam_file = pysam.AlignmentFile(file, "rb")
-    Needs the bam file and bam index file to work
+
+    Arguments:
+        file: the path to the bam file to read
+    return:
+        int: the number of reads in the file
     """
     # map_seq = 0
     # unmap_seq = 0
@@ -22,13 +25,13 @@
 
 
 def write(reads, output, c_sub, file_record):
-    """ Will be add if need of only paired reads in bam
-        if args.paired:
-            for read in reads:
-                if read.is_paired:
-                    pairedreads.write(read)
-            pairedreads.close()
-        else:
+    """
+    Write the selected reads to a subsample
+    Arguments:
+        reads = the reads to write to the subsample
+        output = the path to the output directory
+        c_sub = the subsample number
+        file_record = the template used to format the subsample file
     """
     allreads = pysam.AlignmentFile(
         f"{output}/subsample_{c_sub}.bam",
diff --git a/capture/clean.py b/capture/clean.py
index ad7cc45..003152e 100644
--- a/capture/clean.py
+++ b/capture/clean.py
@@ -6,7 +6,10 @@
 def clean_spades(output, num_sub):
     """ save the contigs from each spades run and
-    remove all the tmp files
+    remove all the temporary files
+    Arguments:
+        output = the path to the output directory
+        num_sub = the number of subsamples
     """
     output_temp = output + "/temp"
     output_spades_contig = output + "/spades_contigs"
diff --git a/capture/jobs.py b/capture/jobs.py
index 97bb5c8..8c4bd0a 100644
--- a/capture/jobs.py
+++ b/capture/jobs.py
@@ -17,7 +17,13 @@
 def run_tasks(tasks, args, config={'verbosity': 0}):
     '''Given a list of `Task` objects, a list of arguments,
     and a config dictionary, execute the tasks.
 
-    Those task will be SPAdes and overlap layout
+    Those tasks will be SPAdes and the overlap layout step (Canu)
+    Arguments:
+        tasks = list of 'Task' objects
+        args = the list of arguments linked to those tasks
+        config = configuration dictionary
+    Return:
+        the result of the task execution
     '''
 
     if type(tasks) is not list:
@@ -44,6 +50,18 @@ def d_to_t(*args, **kwargs):
 
 @make_task
 def task_spades(num, type_r, output, mem, thread):
+    """ Execute SPAdes according to a preset configuration
+    The preset is chosen according to type_r
+    Arguments:
+        num = the index of the spades run (first, second, ...)
+        type_r = the type of spades run to execute (paired-end, bam, ...)
+        output = the path to the output directory
+        mem = the memory SPAdes can use
+        thread = the number of threads SPAdes can use
+    Return:
+        a task dictionary
+
+    """
     if type_r == "pe":
         cmd = f"""spades.py -1 {output}/subsample_forward{num}.fastq \
@@ -98,6 +116,15 @@ def task_spades(num, type_r, output, mem, thread):
 
 @make_task
 def task_canu(output, mem, thread, genome_size):
+    """Execute Canu to overlap all the contigs obtained from the SPAdes runs
+    Arguments:
+        output = the path to the output directory
+        mem = the memory Canu can use
+        thread = the number of threads Canu can use
+        genome_size = the expected size of the genome
+        Return:
+            a task dictionary
+    """
     contig_dir = f"{output}/temp"
     output_dir = f"{output}/temp/canu_out"
     contig = f"{contig_dir}/all_contigs.fasta"
diff --git a/capture/join.py b/capture/join.py
index df66113..77d24bd 100644
--- a/capture/join.py
+++ b/capture/join.py
@@ -5,6 +5,12 @@
 
 def contig(num_sub, output):
+    """Join all the SPAdes contigs produced by each subsample run
+    into one file. Contigs must be at least 1000 bp long
+    Arguments:
+        num_sub = the number of subsamples analyzed by SPAdes
+        output = the path to the output directory
+    """
     location_contigs = output + "/spades_contigs"
     output_temp = output + "/temp"
     list_contigs = []
diff --git a/capture/parse.py b/capture/parse.py
index efb039c..d5d6569 100644
--- a/capture/parse.py
+++ b/capture/parse.py
@@ -13,7 +13,11 @@
 def is_gzip(file):
     """ test if the file is gzip using the 4 first byte of the file
-    who are characteristic of the type of file
+    who are characteristic of the type of file.
+    Arguments:
+        file = the path to the file to test
+    return:
+        boolean answer
     """
     logger = logging.getLogger(__name__)
     magic_number = b"\x1f\x8b\x08"
@@ -29,7 +33,9 @@
 
 def count_record(file):
-    """count the number of reads present in the file
+    """count the number of reads present in the fastq file
+    Arguments:
+        file = the path to the file in which to count
     """
     if is_gzip(file):
         with gzip.open(file, "rt") as handle:
@@ -47,7 +53,16 @@ def parse_fq(output, file, type_f, num_sub, number_records, handle):
     """ we read the file, we make a sum of the reads
     each time we get the number of reads for the wanted coverage
     we save them in a subfile, and keep reading the infile to
-    get the next subfile
+    get the next subfile. All the read will be selected because
+    the number of subsample is the highest possible.
+    For fastq files
+    Arguments:
+        output = the path to the output directory
+        file = the fastq file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples
+        number_records = the number of records needed in each subsample
+        handle = the open file handle to read from
     """
     c = 1
     c_sub = 1
@@ -84,6 +99,19 @@ def parse_fq_rnd(
         num_sub, number_records, handle, tot_records
 ):
+    """ we read the file, we make a sum of the reads
+    each time we get the number of reads for the wanted coverage
+    we save them in a subfile, and keep reading the infile to
+    get the next subfile. Here we choose randomly the reads because
+    the number of subsample is limited
+    Arguments:
+        output = the path to the output directory
+        file = the fastq file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples
+        number_records = the number of records needed in each subsample
+        handle = the open file handle to read from
+    """
     total_record = tot_records
     wanted_record = num_sub * number_records
     file_record = SeqIO.parse(handle, "fastq")
@@ -120,7 +148,18 @@ def reservoir(total_record, wanted_record, file_record):
 
 def parse_bam(output, file, type_f, num_sub, number_records):
-    """ same as parse_fq but for bam format
+    """ we read the file, we make a sum of the reads
+    each time we get the number of reads for the wanted coverage
+    we save them in a subfile, and keep reading the infile to
+    get the next subfile. Here we choose randomly the reads because
+    the number of subsample is limited.
+    For bam files
+    Arguments:
+        output = the path to the output directory
+        file = the bam file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples
+        number_records = the number of records needed in each subsample
     """
     c = 1
     c_sub = 1
@@ -148,7 +187,15 @@ def parse_bam(output, file, type_f, num_sub, number_records):
 
 def parse_bam_rnd(output, file, type_f, num_sub, fraction):
     """ parse the bam file and create only a chosen number
-    of subsample"""
+    of subsamples. Each subsample contains randomly chosen reads
+    Arguments:
+        output = the path to the output directory
+        file = the bam file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples to produce
+        fraction = the fraction of the bam file needed
+            to make one subsample
+    """
     c_sub = 1
     for sub in range(num_sub):
         pysam.view(
diff --git a/capture/run.py b/capture/run.py
index 9b85aa6..9e89d2c 100644
--- a/capture/run.py
+++ b/capture/run.py
@@ -5,7 +5,15 @@
 from capture.jobs import *
 
 
-def spades(num_sub, output, type_r, mem, thread):  # more parameter later
+def spades(num_sub, output, type_r, mem, thread):
+    """ Call the execution of all the SPAdes tasks
+    Arguments:
+        num_sub = the total number of spades runs (one per subsample)
+        type_r = the type of spades run to execute (paired-end, bam, ...)
+        output = the path to the output directory
+        mem = the memory SPAdes can use
+        thread = the number of threads SPAdes can use
+    """
     num = 1
     tasks = []
     while num <= num_sub:
@@ -17,6 +25,13 @@ def spades(num_sub, output, type_r, mem, thread):  # more parameter later
 
 
 def canu(output, mem, thread, genome_size):
+    """ Call the execution of Canu
+    Arguments:
+        output = the path to the output directory
+        mem = the memory Canu can use
+        thread = the number of threads Canu can use
+        genome_size = the expected size of the genome
+    """
     os.makedirs(f"{output}/temp/canu")
     tasks = []
     tasks.append(task_canu(output, mem, thread, genome_size))
diff --git a/capture/split.py b/capture/split.py
index 8a0b424..9d4ea63 100644
--- a/capture/split.py
+++ b/capture/split.py
@@ -15,7 +15,18 @@ def split(
         wanted_cov=100
 ):
     """ calcul the number of reads needed in each subsample then
-    parse the file and make the subsample
+    parse the file and make the subsamples
+    Arguments:
+        genome_size = the expected size of the genome
+        mean_size = the mean length of the reads
+        output = the path to the output directory
+        file = the path to the input file
+        type_f = the type of file
+        subsample = the number of subsamples wanted
+            (or all the subsamples possible)
+        wanted_cov = the coverage wanted in each subsample
+    Return:
+        int
     """
     logger = logging.getLogger(__name__)
     # caclul the number of reads in each subsample for a coverage
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..8f0afce
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from capture.version import __version__
+
+from setuptools import setup, find_packages
+
+
+url = 'https://github.com/SGBC/capture'
+
+with open('README.md') as f:
+    long_description = f.read()
+
+setup(
+    name='capture-assembler',
+    version=__version__,
+
+    description='The Capture seq assembler',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+
+    url=url,
+    download_url=url + '/tarball/' + __version__,
+    author='Hadrien Gourlé, Renaud Van Damme',
+    author_email='hadrien.gourle@slu.se',
+
+    license='MIT',
+    packages=find_packages(),
+
+    tests_require=['nose', 'codecov'],
+    install_requires=['doit', 'biopython', 'pysam'],
+    include_package_data=True,
+
+    entry_points={
+        'console_scripts': ['capture = capture.app:main'],
+    }
+)

From 65ea7f00274aab768497ee10799075e0e7c79da8 Mon Sep 17 00:00:00 2001
From: RVanDamme
Date: Fri, 29 Jun 2018 09:09:30 +0200
Subject: [PATCH 3/5] minor corrections in function descriptions

---
 capture/jobs.py  |  4 ++--
 capture/parse.py | 40 +++++++++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/capture/jobs.py b/capture/jobs.py
index 8c4bd0a..065285c 100644
--- a/capture/jobs.py
+++ b/capture/jobs.py
@@ -122,8 +122,8 @@ def task_canu(output, mem, thread, genome_size):
         mem = the memory Canu can use
         thread = the number of threads Canu can use
         genome_size = the expected size of the genome
-        Return:
-            a task dictionary
+    Return:
+        a task dictionary
     """
     contig_dir = f"{output}/temp"
     output_dir = f"{output}/temp/canu_out"
diff --git a/capture/parse.py b/capture/parse.py
index d5d6569..cba15e6 100644
--- a/capture/parse.py
+++ b/capture/parse.py
@@ -12,8 +12,7 @@
 def is_gzip(file):
-    """ test if the file is gzip using the 4 first byte of the file
-    who are characteristic of the type of file.
+ """ test if the file is gzip Arguments: file = the path to the file to test return: @@ -50,12 +49,12 @@ def count_record(file): def parse_fq(output, file, type_f, num_sub, number_records, handle): - """ we read the file, we make a sum of the reads - each time we get the number of reads for the wanted coverage - we save them in a subfile, and keep reading the infile to - get the next subfile. All the read will be selected because - the number of subsample is the highest possible. - For fastq files + """ read the file, make a sum of the reads + for each occurence, the number of reads for the wanted coverage + is saved in a subfile. The software keep reading the infile to + get the next subfile. All the read will be selected, because + the number of subsample required is the highest possible. + Only for fastq files Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence @@ -99,11 +98,12 @@ def parse_fq_rnd( num_sub, number_records, handle, tot_records ): - """ we read the file, we make a sum of the reads - each time we get the number of reads for the wanted coverage - we save them in a subfile, and keep reading the infile to - get the next subfile. Here we choose randomly the reads because - the number of subsample is limited + """ Read the file, make a sum of the reads + for each occurence, the number of reads for the wanted coverage + is saved in a subfile. the software keep reading the infile to + get the next subfile. Here the reads are chosen randomly because + the number of subsample is limited. + Only for Fastq Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence @@ -148,12 +148,13 @@ def reservoir(total_record, wanted_record, file_record): def parse_bam(output, file, type_f, num_sub, number_records): - """ we read the file, we make a sum of the reads - each time we get the number of reads for the wanted coverage - we save them in a subfile, and keep reading the infile to - get the next subfile. Here we choose randomly the reads because - the number of subsample is limited. - For bam files + """ Read the file, make a sum of the reads + for each occurence, the number of reads for the wanted coverage + is saved in a subfile.the software keep reading the infile to + get the next subfile. All the reads will be parsed + and write in a subsample since the highest number of subsample + possible is required + Only for bam files Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence @@ -188,6 +189,7 @@ def parse_bam(output, file, type_f, num_sub, number_records): def parse_bam_rnd(output, file, type_f, num_sub, fraction): """ parse the bam file and create only a chosen number of subsample. Each subsample contains randomly chosen reads + Only for bam files Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence From 4416fb1ae875808c934a1af0716e4352e69913f8 Mon Sep 17 00:00:00 2001 From: RVanDamme <38455046+RVanDamme@users.noreply.github.com> Date: Fri, 29 Jun 2018 10:25:56 +0200 Subject: [PATCH 4/5] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 1aa830b..49f0503 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Thanks for your interest and come back soon! 
 * Canu
 
 To install capture-assembler, simply type the following in your terminal:
+
     pip install capture-assembler
 
@@ -61,6 +62,13 @@ capture assemble -h
 ```
 
+## External Links
+
+Official links to:
+
+* [SPAdes](http://cab.spbu.ru/software/spades/)
+* [Canu](http://canu.readthedocs.io/en/latest/)
+
 ## License
 
 Code is under the [MIT](LICENSE) license

From ed895a065e3abfa59f18b8af77d6d78ffd167289 Mon Sep 17 00:00:00 2001
From: RVanDamme <38455046+RVanDamme@users.noreply.github.com>
Date: Mon, 2 Jul 2018 13:52:02 +0200
Subject: [PATCH 5/5] adding conda installation

---
 README.md | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 49f0503..1fd9135 100644
--- a/README.md
+++ b/README.md
@@ -20,27 +20,39 @@ Thanks for your interest and come back soon!
 To install capture-assembler, simply type the following in your terminal:
 
     pip install capture-assembler
-
+It requires Python 3.6, with Canu and SPAdes already installed.
+
+If you use conda, you can install it with:
+
+    conda install -c rvandamme capture-assembler
+It requires a conda environment with Python 3.6, Canu, and SPAdes already installed.
+
+To install capture-assembler, SPAdes, and Canu in a conda environment (creating a new one is recommended), type the following:
+```bash
+conda create -n env_name python=3.6 # create a new env with python 3.6 (optional)
+conda install -n env_name -c rvandamme capture-assembler canu=1.7 spades=3.12.0
+conda activate env_name # start using the environment and the capture-assembler
+```
 
 ## Quickstart
 
 ```bash
 # paired-end reads, Illumina MiSeq
-capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
+capture -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
 --genome_size 35000 --mean 300 -o output_dir
 # compressed paired-end reads, Illumina HiSeq
-capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
+capture -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
 --genome_size 35000 --mean 125 -o output_dir
 # single end reads, Ion Torrent
-capture assemble -u reads.fastq.gz \
+capture -u reads.fastq.gz \
 --genome_size 35000 --mean 240 -o output_dir
 # reads in bam format (Ion Torrent)
-capture assemble --bam reads.bam \
+capture --bam reads.bam \
 --genome_size 35000 --mean 240 -o output_dir
 # full list of subcommands and options
 capture -h
 # full list of options for a subcommand
-capture assemble -h
+capture -h
 ```
 
 ## Arguments
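The limited-subsample paths above (`parse_fq_rnd` in patches 2 and 3) draw their random reads through the `reservoir()` helper, whose body is never shown in this series. A minimal sketch of such a helper, assuming the classic single-pass Algorithm R; treat it as illustrative, not as the project's actual implementation:

```python
import random

def reservoir(total_record, wanted_record, file_record):
    """Pick wanted_record records uniformly at random from the iterator
    file_record in a single pass (Algorithm R). total_record is kept only
    for signature parity with capture/parse.py; Algorithm R does not need it."""
    sample = []
    for i, record in enumerate(file_record):
        if i < wanted_record:
            sample.append(record)      # fill the reservoir first
        else:
            j = random.randint(0, i)   # keep with probability wanted_record/(i+1)
            if j < wanted_record:
                sample[j] = record
    return sample
```

A single pass matters here because the inputs are ultra-high coverage files that may not fit in memory; only the `wanted_record` selected reads are ever held at once.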