Update README.md #9

Open · wants to merge 5 commits into master
6 changes: 6 additions & 0 deletions .gitignore
@@ -6,3 +6,9 @@ __pycache__

# PyDoit
.doit.db

# Pip Package
capture_assembler.egg-info
Capture.egg-info
dist
build
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include README.md
62 changes: 54 additions & 8 deletions README.md
@@ -6,7 +6,7 @@

`capture` is an assembler developed to recover complete genomes from ultra-high coverage samples

The repository is a work in progress and the assembler is not functional yet.
The repository is a work in progress and the assembler is not completely functional yet.
Thanks for your interest and come back soon!

## Installation
@@ -15,34 +15,80 @@ Thanks for your interest and come back soon!

* python >= 3.6
* SPAdes
* Canu

*TODO*
To install capture-assembler, simply type the following in your terminal:

pip install capture-assembler
It requires Python 3.6, with Canu and SPAdes already installed.

If you use conda you can install it using:

conda install -c rvandamme capture-assembler
It requires a conda environment with Python 3.6, Canu, and SPAdes already installed.

To install capture-assembler, SPAdes, and Canu in a conda environment (creating a new one is recommended), type the following:
```bash
conda create -n env_name python=3.6 # create a new env with Python 3.6 (optional)
conda install -n env_name -c rvandamme capture-assembler canu=1.7 spades=3.12.0
conda activate env_name # start using the environment and the capture-assembler
```

## Quickstart

```bash
# paired-end reads, Illumina MiSeq
capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
capture -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
--genome_size 35000 --mean 300 -o output_dir
# compressed paired-end reads, Illumina HiSeq
capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
capture -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
--genome_size 35000 --mean 125 -o output_dir
# single end reads, Ion Torrent
capture assemble -u reads.fastq.gz \
capture -u reads.fastq.gz \
--genome_size 35000 --mean 240 -o output_dir
# reads in bam format
capture assemble --bam reads.bam \
# reads in bam format (Ion Torrent)
capture --bam reads.bam \
--genome_size 35000 --mean 240 -o output_dir
# full list of subcommands and options
capture -h
# full list of options for a subcommand
capture assemble -h
capture -h
```
## Arguments

```
-h, --help show the help message and exit
-v, --version print version and exit
-u, --uniq Input reads file in fastq or fastq.gz format
-f, --forward Input forward file in fastq or fastq.gz format
-r, --reverse Input reverse file in fastq or fastq.gz format
-b, --bam Input reads file in bam format. It will be treated as Ion Torrent data in SPAdes
-g, --genome_size The size of the target genome, as a numeric value
-m, --mean The mean size of the reads present in the input file
-o, --output The output directory
-s, --subsample The number of subsamples to produce. Default: the maximum
-t, --thread The number of threads available. Default: 4
-M, --memory The memory available, in gigabytes. Default: 16G
-c, --clean Clean the temporary files. Default: True

```


## External Links

Official links to:

* [SPAdes](http://cab.spbu.ru/software/spades/)
* [Canu](http://canu.readthedocs.io/en/latest/)

## License

Code is under the [MIT](LICENSE) license

## Issues

Found a bug or have a question? Please open an [issue](https://github.com/SGBC/capture/issues)

## Contributing

We welcome contributions from the community! See our [Contributing](CONTRIBUTING.md) guide.
2 changes: 1 addition & 1 deletion capture/app.py
@@ -17,7 +17,7 @@

def assemble(args):
"""
main function for the capture software
main function for the capture-assembler

Arguments:
args (object): the argument dictionary from argparse
21 changes: 12 additions & 9 deletions capture/bam.py
@@ -6,8 +6,11 @@

def count_bam(file):
""" count the number of reads present in the file
bam_file = pysam.AlignmentFile(file, "rb")
Needs both the bam file and its bam index file to work

Arguments:
file = the path to the bam file to read
Return:
int, the number of reads
"""
# map_seq = 0
# unmap_seq = 0
@@ -22,13 +25,13 @@ def count_bam(file):


def write(reads, output, c_sub, file_record):
""" Will be add if need of only paired reads in bam
if args.paired:
for read in reads:
if read.is_paired:
pairedreads.write(read)
pairedreads.close()
else:
"""
Write the selected reads to a subsample
Arguments:
reads = the reads to write to the subsample
output = the path to the output directory
c_sub = the subsample number
file_record = the template format to write the subsample
"""
allreads = pysam.AlignmentFile(
f"{output}/subsample_{c_sub}.bam",
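The new docstring notes that `count_bam` needs both the BAM and its index. A minimal sketch of that counting step, assuming the function relies on the index-backed statistics that pysam exposes:

```python
import pysam

def count_bam_sketch(path):
    # .mapped and .unmapped are read from the BAM index (.bai),
    # which is why the index file must sit next to the BAM
    with pysam.AlignmentFile(path, "rb") as bam:
        return bam.mapped + bam.unmapped
```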
5 changes: 4 additions & 1 deletion capture/clean.py
@@ -6,7 +6,10 @@

def clean_spades(output, num_sub):
""" save the contigs from each spades run and
remove all the tmp files
remove all the temporary files
Arguments:
output = the path to the output directory
num_sub = the number of subsamples
"""
output_temp = output + "/temp"
output_spades_contig = output + "/spades_contigs"
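As a rough illustration of what `clean_spades` does with those two directories, here is a sketch; the `spades{num}` subdirectory layout is an assumption for illustration, not taken from the diff:

```python
import os
import shutil

def clean_spades_sketch(output, num_sub):
    # keep each run's contigs, then drop the temporary SPAdes output
    os.makedirs(f"{output}/spades_contigs", exist_ok=True)
    for num in range(1, num_sub + 1):
        src = f"{output}/temp/spades{num}/contigs.fasta"  # assumed layout
        dst = f"{output}/spades_contigs/contigs_{num}.fasta"
        if os.path.exists(src):
            shutil.move(src, dst)
    shutil.rmtree(f"{output}/temp", ignore_errors=True)
```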
29 changes: 28 additions & 1 deletion capture/jobs.py
@@ -17,7 +17,13 @@
def run_tasks(tasks, args, config={'verbosity': 0}):
'''Given a list of `Task` objects, a list of arguments,
and a config dictionary, execute the tasks.
Those task will be SPAdes and overlap layout
Those tasks will be SPAdes and the overlap-layout step (Canu)
Arguments:
tasks = list of 'Task' objects
args = the list of arguments linked to those tasks
config = configuration dictionary
Return:
the result of executing the tasks
'''

if type(tasks) is not list:
@@ -44,6 +50,18 @@ def d_to_t(*args, **kwargs):

@make_task
def task_spades(num, type_r, output, mem, thread):
""" Execute SPAdes according to a certain presetting
The presetting is chosen according to type_r
Arguments:
num = the number of spades run (first, second,...)
type_r = the type of spades run to execute (paired-end, bam,...)
output = the path to the output directory
mem = the memory available spades can use
thread = the number of threads available spades can use
Return:
dictionary

"""

if type_r == "pe":
cmd = f"""spades.py -1 {output}/subsample_forward{num}.fastq \
@@ -98,6 +116,15 @@ def task_spades(num, type_r, output, mem, thread):

@make_task
def task_canu(output, mem, thread, genome_size):
"""Execute Canu to overlap all the contigs obtained by SPAdes run
Arguments:
output = the path to the output directory
mem = the memory available spades can use
thread = the number of threads available spades can use
genome_size = the size of the wanted genome
Return:
dictionary
"""
contig_dir = f"{output}/temp"
output_dir = f"{output}/temp/canu_out"
contig = f"{contig_dir}/all_contigs.fasta"
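Both task functions document "Return: dictionary". A sketch of the doit-style task dictionary implied by the `@make_task` decorator; the field values and the `spades{num}` output path are illustrative assumptions:

```python
def task_spades_sketch(num, output, mem, thread):
    # a doit task is a plain dict: a unique name plus the
    # shell actions to execute
    cmd = (
        f"spades.py -1 {output}/subsample_forward{num}.fastq "
        f"-o {output}/temp/spades{num} -m {mem} -t {thread}"  # assumed paths
    )
    return {
        "name": f"spades_{num}",
        "actions": [cmd],
    }
```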
6 changes: 6 additions & 0 deletions capture/join.py
@@ -5,6 +5,12 @@


def contig(num_sub, output):
"""Join all the SPAdes contig produce in each subsample run
in one file. The contig must be at least 1000bp long
Arguments:
num_sub = the number of subsample analyzed by SPAdes
output = the path to the output directory
"""
location_contigs = output + "/spades_contigs"
output_temp = output + "/temp"
list_contigs = []
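A minimal sketch of the joining step described in the new docstring, assuming plain FASTA contig files and the 1000 bp cutoff; the names here are hypothetical:

```python
from Bio import SeqIO

def join_contigs_sketch(contig_files, out_path, min_len=1000):
    # gather contigs from every subsample run, keeping only
    # those at least min_len bp long
    kept = [
        rec
        for path in contig_files
        for rec in SeqIO.parse(path, "fasta")
        if len(rec) >= min_len
    ]
    SeqIO.write(kept, out_path, "fasta")
```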
67 changes: 58 additions & 9 deletions capture/parse.py
@@ -12,8 +12,11 @@


def is_gzip(file):
""" test if the file is gzip using the 4 first byte of the file
who are characteristic of the type of file
""" test if the file is gzip
Arguments:
file = the path to the file to test
return:
boolean answer
"""
logger = logging.getLogger(__name__)
magic_number = b"\x1f\x8b\x08"
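The check itself is a three-byte magic-number comparison against the constant shown above; a standalone sketch of the same idea:

```python
def is_gzip_sketch(path):
    # gzip streams begin with 0x1f 0x8b, followed by 0x08
    # (the deflate compression method)
    with open(path, "rb") as handle:
        return handle.read(3) == b"\x1f\x8b\x08"
```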
@@ -29,7 +32,9 @@ def is_gzip(file):


def count_record(file):
"""count the number of reads present in the file
"""count the number of reads present in the fastq file
Arguments:
file = the path to the file whose reads are counted
"""
if is_gzip(file):
with gzip.open(file, "rt") as handle:
Expand All @@ -44,10 +49,19 @@ def count_record(file):


def parse_fq(output, file, type_f, num_sub, number_records, handle):
""" we read the file, we make a sum of the reads
each time we get the number of reads for the wanted coverage
we save them in a subfile, and keep reading the infile to
get the next subfile
""" read the file, make a sum of the reads
for each occurence, the number of reads for the wanted coverage
is saved in a subfile. The software keep reading the infile to
get the next subfile. All the read will be selected, because
the number of subsample required is the highest possible.
Only for fastq files
Arguments:
output = the path to the output directory
file = the fastq file where we retrieve the sequence
type_f = the type of file (bam/fastq)
num_sub = the number of subsample (first, second,...)
number_records = number of record needed in each subsample
handle = the file open to be read
"""
c = 1
c_sub = 1
@@ -84,6 +98,20 @@ def parse_fq_rnd(
num_sub, number_records,
handle, tot_records
):
""" Read the file, make a sum of the reads
for each occurence, the number of reads for the wanted coverage
is saved in a subfile. the software keep reading the infile to
get the next subfile. Here the reads are chosen randomly because
the number of subsample is limited.
Only for Fastq
Arguments:
output = the path to the output directory
file = the fastq file where we retrieve the sequence
type_f = the type of file (bam/fastq)
num_sub = the number of subsample (first, second,...)
number_records = number of record needed in each subsample
handle = the file open to be read
"""
total_record = tot_records
wanted_record = num_sub * number_records
file_record = SeqIO.parse(handle, "fastq")
@@ -120,7 +148,19 @@ def reservoir(total_record, wanted_record, file_record):


def parse_bam(output, file, type_f, num_sub, number_records):
""" same as parse_fq but for bam format
""" Read the file, make a sum of the reads
for each occurence, the number of reads for the wanted coverage
is saved in a subfile.the software keep reading the infile to
get the next subfile. All the reads will be parsed
and write in a subsample since the highest number of subsample
possible is required
Only for bam files
Arguments:
output = the path to the output directory
file = the fastq file where we retrieve the sequence
type_f = the type of file (bam/fastq)
num_sub = the number of subsample (first, second,...)
number_records = number of record needed in each subsample
"""
c = 1
c_sub = 1
@@ -148,7 +188,16 @@ def parse_bam(output, file, type_f, num_sub, number_records):

def parse_bam_rnd(output, file, type_f, num_sub, fraction):
""" parse the bam file and create only a chosen number
of subsample"""
of subsamples. Each subsample contains randomly chosen reads
Only for bam files
Arguments:
output = the path to the output directory
file = the bam file from which the sequences are retrieved
type_f = the type of file (bam/fastq)
num_sub = the number of subsamples to create
fraction = the fraction of the bam file that goes into each subsample
"""
c_sub = 1
for sub in range(num_sub):
pysam.view(
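The random selection in the `_rnd` variants goes through the `reservoir` helper seen above. A generic sketch of reservoir sampling, the standard single-pass way to draw a fixed-size uniform sample from a stream:

```python
import random

def reservoir_sketch(wanted_record, records):
    # keep a uniform random sample of wanted_record items from a
    # stream, in one pass and with memory bounded by the sample size
    sample = []
    for i, rec in enumerate(records):
        if i < wanted_record:
            sample.append(rec)
        else:
            j = random.randint(0, i)
            if j < wanted_record:
                sample[j] = rec
    return sample
```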
17 changes: 16 additions & 1 deletion capture/run.py
@@ -5,7 +5,15 @@
from capture.jobs import *


def spades(num_sub, output, type_r, mem, thread): # more parameter later
def spades(num_sub, output, type_r, mem, thread):
""" Call the execution of all the SPAdes tasks
Arguments:
num_sub = the total number of SPAdes runs
type_r = the type of SPAdes run to execute (paired-end, bam, ...)
output = the path to the output directory
mem = the memory SPAdes can use
thread = the number of threads SPAdes can use
"""
num = 1
tasks = []
while num <= num_sub:
@@ -17,6 +25,13 @@ def spades(num_sub, output, type_r, mem, thread):


def canu(output, mem, thread, genome_size):
""" Call the execution of Canu
Arguments:
output = the path to the output directory
mem = the memory Canu can use
thread = the number of threads Canu can use
genome_size = the expected genome size
"""
os.makedirs(f"{output}/temp/canu")
tasks = []
tasks.append(task_canu(output, mem, thread, genome_size))
13 changes: 12 additions & 1 deletion capture/split.py
@@ -15,7 +15,18 @@ def split(
wanted_cov=100
):
""" calcul the number of reads needed in each subsample then
parse the file and make the subsample
parse the file and make the subsamples
Arguments:
genome_size = the expected genome size
mean_size = the mean size of the reads
output = the path to the output directory
file = the path to the input file
type_f = the type of file
subsample = the number of subsamples wanted
(or all the subsamples possible)
wanted_cov = the coverage wanted in each subsample
Return:
int
"""
logger = logging.getLogger(__name__)
# caclul the number of reads in each subsample for a coverage
Expand Down