From fa4f08f57e1f9a6b108d62ec7329a6ba937b8579 Mon Sep 17 00:00:00 2001
From: RVanDamme <38455046+RVanDamme@users.noreply.github.com>
Date: Thu, 28 Jun 2018 14:33:17 +0200
Subject: [PATCH 1/5] Update README.md

---
 README.md | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 621e65b..1aa830b 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 `capture` is an assembler developed to recover complete genome from ultra-high coverage samples
 
-The repository is a work in progress and the assembler is not functional yet.
+The repository is a work in progress and the assembler is not yet fully functional.
 Thanks for your interest and come back soon!
 
 ## Installation
@@ -15,8 +15,11 @@ Thanks for your interest and come back soon!
 
 * python >= 3.6
 * SPAdes
+* Canu
 
-*TODO*
+To install capture-assembler, simply type the following in your terminal:
+    pip install capture-assembler
+
 
 ## Quickstart
@@ -30,7 +33,7 @@ capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
 # single end reads, Ion Torrent
 capture assemble -u reads.fastq.gz \
 --genome_size 35000 --mean 240 -o output_dir
-# reads in bam format
+# reads in bam format (Ion Torrent)
 capture assemble --bam reads.bam \
 --genome_size 35000 --mean 240 -o output_dir
 # full list of subcommands and options
@@ -38,11 +41,34 @@ capture -h
 # full list of options for a subcommand
 capture assemble -h
 ```
 
+## Arguments
+
+```
+  -h, --help         show the help message and exit
+  -v, --version      print version and exit
+  -u, --uniq         Input reads file in fastq or fastq.gz format
+  -f, --forward      Input forward reads file in fastq or fastq.gz format
+  -r, --reverse      Input reverse reads file in fastq or fastq.gz format
+  -b, --bam          Input reads file in bam format. It will be treated as Ion Torrent data by SPAdes
+  -g, --genome_size  The expected size of the genome, as a numeric value
+  -m, --mean         The mean length of the reads in the input file
+  -o, --output       The output directory
+  -s, --subsample    The number of subsamples to produce. Default: the maximum possible
+  -t, --thread       The number of threads available. Default: 4
+  -M, --memory       The memory available, in gigabytes. Default: 16G
+  -c, --clean        Clean the temporary files. Default: True
+
+```
+
 ## License
 
 Code is under the [MIT](LICENSE) license
 
+## Issues
+
+Found a bug or have a question? Please open an [issue](https://github.com/SGBC/capture/issues).
+
 ## Contributing
 
 We welcome contributions from the community! See our [Contributing](CONTRIBUTING.md) guide.
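The `--genome_size`, `--mean`, and `--subsample` options introduced in this README drive the subsampling step: each subsample holds just enough reads to cover the genome at a target depth (`wanted_cov=100` is the default visible in `capture/split.py` in patch 2 below). A minimal sketch of that arithmetic, assuming the default coverage; the function names `reads_per_subsample` and `max_subsamples` are hypothetical, for illustration only:

```python
import math

def reads_per_subsample(genome_size, mean_read_length, wanted_cov=100):
    """Reads whose combined length gives roughly wanted_cov x the genome size."""
    # wanted_cov=100 mirrors the default of capture/split.py (patch 2 below)
    return math.ceil(genome_size * wanted_cov / mean_read_length)

def max_subsamples(total_records, genome_size, mean_read_length, wanted_cov=100):
    """How many full subsamples an input of total_records reads can yield."""
    return total_records // reads_per_subsample(genome_size, mean_read_length, wanted_cov)

if __name__ == "__main__":
    # a 35 kb genome and 300 bp MiSeq reads, as in the Quickstart above
    print(reads_per_subsample(35000, 300))        # 11667 reads for ~100x coverage
    print(max_subsamples(1_000_000, 35000, 300))  # 85 subsamples from 1M reads
```

With `--subsample` left at its default, the maximum value computed this way is used and every read lands in some subsample; a smaller value switches the parsers to random selection, as the patches below describe.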
From 51722580c0ab85ce755518c8e5d703e1ab7c1bb0 Mon Sep 17 00:00:00 2001
From: RVanDamme
Date: Thu, 28 Jun 2018 13:38:42 +0200
Subject: [PATCH 2/5] adding files for packaging and starting documentation

---
 .gitignore       |  6 +++++
 MANIFEST.in      |  1 +
 capture/app.py   |  2 +-
 capture/bam.py   | 21 ++++++++++--------
 capture/clean.py |  5 ++++-
 capture/jobs.py  | 29 +++++++++++++++++++++++-
 capture/join.py  |  6 +++++
 capture/parse.py | 57 +++++++++++++++++++++++++++++++++++++++++++-----
 capture/run.py   | 17 ++++++++++++++-
 capture/split.py | 13 ++++++++++-
 setup.py         | 37 +++++++++++++++++++++++++++++++
 11 files changed, 175 insertions(+), 19 deletions(-)
 create mode 100644 MANIFEST.in
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
index 7866689..33bb37b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,9 @@ __pycache__
 
 # PyDoit
 .doit.db
+
+# Pip Package
+capture_assembler.egg-info
+Capture.egg-info
+dist
+build
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..bb3ec5f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include README.md
diff --git a/capture/app.py b/capture/app.py
index 5c13077..f850320 100644
--- a/capture/app.py
+++ b/capture/app.py
@@ -17,7 +17,7 @@
 
 def assemble(args):
     """
-    main function for the capture software
+    main function for the capture-assembler
 
     Arguments:
         args (object): the argument dictionary from argparse
diff --git a/capture/bam.py b/capture/bam.py
index 2dab64a..5fae562 100644
--- a/capture/bam.py
+++ b/capture/bam.py
@@ -6,8 +6,11 @@
 def count_bam(file):
     """ count the number of reads present in the file
-    bam_file = pysam.AlignmentFile(file, "rb")
-    Needs the bam file and bam index file to work
+
+    Arguments:
+        file: the path to the bam file to read
+    return:
+        int: the number of reads in the file
     """
     # map_seq = 0
     # unmap_seq = 0
@@ -22,13 +25,13 @@
 
 
 def write(reads, output, c_sub, file_record):
-    """ Will be add if need of only paired reads in bam
-        if args.paired:
-            for read in reads:
-                if read.is_paired:
-                    pairedreads.write(read)
-            pairedreads.close()
-        else:
+    """
+    Write the selected reads to a subsample
+    Arguments:
+        reads = the reads to write to the subsample
+        output = the path to the output directory
+        c_sub = the subsample number
+        file_record = the template used to format the subsample file
     """
     allreads = pysam.AlignmentFile(
         f"{output}/subsample_{c_sub}.bam",
diff --git a/capture/clean.py b/capture/clean.py
index ad7cc45..003152e 100644
--- a/capture/clean.py
+++ b/capture/clean.py
@@ -6,7 +6,10 @@
 def clean_spades(output, num_sub):
     """ save the contigs from each spades run and
-    remove all the tmp files
+    remove all the temporary files
+    Arguments:
+        output = the path to the output directory
+        num_sub = the number of subsamples
     """
     output_temp = output + "/temp"
     output_spades_contig = output + "/spades_contigs"
diff --git a/capture/jobs.py b/capture/jobs.py
index 97bb5c8..8c4bd0a 100644
--- a/capture/jobs.py
+++ b/capture/jobs.py
@@ -17,7 +17,13 @@
 def run_tasks(tasks, args, config={'verbosity': 0}):
     '''Given a list of `Task` objects, a list of arguments,
     and a config dictionary, execute the tasks.
 
-    Those task will be SPAdes and overlap layout
+    Those tasks will be SPAdes and the overlap layout step (Canu)
+    Arguments:
+        tasks = list of 'Task' objects
+        args = the list of arguments linked to those tasks
+        config = configuration dictionary
+    Return:
+        the result of the task execution
     '''
 
     if type(tasks) is not list:
@@ -44,6 +50,18 @@ def d_to_t(*args, **kwargs):
 
 @make_task
 def task_spades(num, type_r, output, mem, thread):
+    """ Execute SPAdes according to a preset configuration
+    The preset is chosen according to type_r
+    Arguments:
+        num = the index of the spades run (first, second, ...)
+        type_r = the type of spades run to execute (paired-end, bam, ...)
+        output = the path to the output directory
+        mem = the memory SPAdes can use
+        thread = the number of threads SPAdes can use
+    Return:
+        a task dictionary
+
+    """
     if type_r == "pe":
         cmd = f"""spades.py -1 {output}/subsample_forward{num}.fastq \
@@ -98,6 +116,15 @@ def task_spades(num, type_r, output, mem, thread):
 
 @make_task
 def task_canu(output, mem, thread, genome_size):
+    """Execute Canu to overlap all the contigs obtained from the SPAdes runs
+    Arguments:
+        output = the path to the output directory
+        mem = the memory Canu can use
+        thread = the number of threads Canu can use
+        genome_size = the expected size of the genome
+        Return:
+            a task dictionary
+    """
     contig_dir = f"{output}/temp"
     output_dir = f"{output}/temp/canu_out"
     contig = f"{contig_dir}/all_contigs.fasta"
diff --git a/capture/join.py b/capture/join.py
index df66113..77d24bd 100644
--- a/capture/join.py
+++ b/capture/join.py
@@ -5,6 +5,12 @@
 
 def contig(num_sub, output):
+    """Join all the SPAdes contigs produced by each subsample run
+    into one file. Contigs must be at least 1000 bp long
+    Arguments:
+        num_sub = the number of subsamples analyzed by SPAdes
+        output = the path to the output directory
+    """
     location_contigs = output + "/spades_contigs"
     output_temp = output + "/temp"
     list_contigs = []
diff --git a/capture/parse.py b/capture/parse.py
index efb039c..d5d6569 100644
--- a/capture/parse.py
+++ b/capture/parse.py
@@ -13,7 +13,11 @@
 def is_gzip(file):
     """ test if the file is gzip using the 4 first byte of the file
-    who are characteristic of the type of file
+    who are characteristic of the type of file.
+    Arguments:
+        file = the path to the file to test
+    return:
+        boolean answer
     """
     logger = logging.getLogger(__name__)
     magic_number = b"\x1f\x8b\x08"
@@ -29,7 +33,9 @@
 
 def count_record(file):
-    """count the number of reads present in the file
+    """count the number of reads present in the fastq file
+    Arguments:
+        file = the path to the file in which to count
     """
     if is_gzip(file):
         with gzip.open(file, "rt") as handle:
@@ -47,7 +53,16 @@ def parse_fq(output, file, type_f, num_sub, number_records, handle):
     """ we read the file, we make a sum of the reads
     each time we get the number of reads for the wanted coverage
     we save them in a subfile, and keep reading the infile to
-    get the next subfile
+    get the next subfile. All the read will be selected because
+    the number of subsample is the highest possible.
+    For fastq files
+    Arguments:
+        output = the path to the output directory
+        file = the fastq file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples
+        number_records = the number of records needed in each subsample
+        handle = the open file handle to read from
     """
     c = 1
     c_sub = 1
@@ -84,6 +99,19 @@ def parse_fq_rnd(
         num_sub, number_records, handle, tot_records
 ):
+    """ we read the file, we make a sum of the reads
+    each time we get the number of reads for the wanted coverage
+    we save them in a subfile, and keep reading the infile to
+    get the next subfile. Here we choose randomly the reads because
+    the number of subsample is limited
+    Arguments:
+        output = the path to the output directory
+        file = the fastq file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples
+        number_records = the number of records needed in each subsample
+        handle = the open file handle to read from
+    """
     total_record = tot_records
     wanted_record = num_sub * number_records
     file_record = SeqIO.parse(handle, "fastq")
@@ -120,7 +148,18 @@ def reservoir(total_record, wanted_record, file_record):
 
 def parse_bam(output, file, type_f, num_sub, number_records):
-    """ same as parse_fq but for bam format
+    """ we read the file, we make a sum of the reads
+    each time we get the number of reads for the wanted coverage
+    we save them in a subfile, and keep reading the infile to
+    get the next subfile. Here we choose randomly the reads because
+    the number of subsample is limited.
+    For bam files
+    Arguments:
+        output = the path to the output directory
+        file = the bam file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples
+        number_records = the number of records needed in each subsample
     """
     c = 1
     c_sub = 1
@@ -148,7 +187,15 @@ def parse_bam(output, file, type_f, num_sub, number_records):
 
 def parse_bam_rnd(output, file, type_f, num_sub, fraction):
     """ parse the bam file and create only a chosen number
-    of subsample"""
+    of subsamples. Each subsample contains randomly chosen reads
+    Arguments:
+        output = the path to the output directory
+        file = the bam file from which the reads are retrieved
+        type_f = the type of file (bam/fastq)
+        num_sub = the number of subsamples to produce
+        fraction = the fraction of the bam file needed
+            to make one subsample
+    """
     c_sub = 1
     for sub in range(num_sub):
         pysam.view(
diff --git a/capture/run.py b/capture/run.py
index 9b85aa6..9e89d2c 100644
--- a/capture/run.py
+++ b/capture/run.py
@@ -5,7 +5,15 @@
 from capture.jobs import *
 
 
-def spades(num_sub, output, type_r, mem, thread):  # more parameter later
+def spades(num_sub, output, type_r, mem, thread):
+    """ Call the execution of all the SPAdes tasks
+    Arguments:
+        num_sub = the total number of spades runs (one per subsample)
+        type_r = the type of spades run to execute (paired-end, bam, ...)
+        output = the path to the output directory
+        mem = the memory SPAdes can use
+        thread = the number of threads SPAdes can use
+    """
     num = 1
     tasks = []
     while num <= num_sub:
@@ -17,6 +25,13 @@ def spades(num_sub, output, type_r, mem, thread):  # more parameter later
 
 
 def canu(output, mem, thread, genome_size):
+    """ Call the execution of Canu
+    Arguments:
+        output = the path to the output directory
+        mem = the memory Canu can use
+        thread = the number of threads Canu can use
+        genome_size = the expected size of the genome
+    """
     os.makedirs(f"{output}/temp/canu")
     tasks = []
     tasks.append(task_canu(output, mem, thread, genome_size))
diff --git a/capture/split.py b/capture/split.py
index 8a0b424..9d4ea63 100644
--- a/capture/split.py
+++ b/capture/split.py
@@ -15,7 +15,18 @@ def split(
         wanted_cov=100
 ):
     """ calcul the number of reads needed in each subsample then
-    parse the file and make the subsample
+    parse the file and make the subsamples
+    Arguments:
+        genome_size = the expected size of the genome
+        mean_size = the mean length of the reads
+        output = the path to the output directory
+        file = the path to the input file
+        type_f = the type of file
+        subsample = the number of subsamples wanted
+            (or all the subsamples possible)
+        wanted_cov = the coverage wanted in each subsample
+    Return:
+        int
     """
     logger = logging.getLogger(__name__)
     # caclul the number of reads in each subsample for a coverage
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..8f0afce
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from capture.version import __version__
+
+from setuptools import setup, find_packages
+
+
+url = 'https://github.com/SGBC/capture'
+
+with open('README.md') as f:
+    long_description = f.read()
+
+setup(
+    name='capture-assembler',
+    version=__version__,
+
+    description='The Capture seq assembler',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+
+    url=url,
+    download_url=url + '/tarball/' + __version__,
+    author='Hadrien Gourlé, Renaud Van Damme',
+    author_email='hadrien.gourle@slu.se',
+
+    license='MIT',
+    packages=find_packages(),
+
+    tests_require=['nose', 'codecov'],
+    install_requires=['doit', 'biopython', 'pysam'],
+    include_package_data=True,
+
+    entry_points={
+        'console_scripts': ['capture = capture.app:main'],
+    }
+)

From 65ea7f00274aab768497ee10799075e0e7c79da8 Mon Sep 17 00:00:00 2001
From: RVanDamme
Date: Fri, 29 Jun 2018 09:09:30 +0200
Subject: [PATCH 3/5] minor corrections in function descriptions

---
 capture/jobs.py  |  4 ++--
 capture/parse.py | 40 +++++++++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/capture/jobs.py b/capture/jobs.py
index 8c4bd0a..065285c 100644
--- a/capture/jobs.py
+++ b/capture/jobs.py
@@ -122,8 +122,8 @@ def task_canu(output, mem, thread, genome_size):
         mem = the memory Canu can use
         thread = the number of threads Canu can use
         genome_size = the expected size of the genome
-        Return:
-            a task dictionary
+    Return:
+        a task dictionary
     """
     contig_dir = f"{output}/temp"
     output_dir = f"{output}/temp/canu_out"
diff --git a/capture/parse.py b/capture/parse.py
index d5d6569..cba15e6 100644
--- a/capture/parse.py
+++ b/capture/parse.py
@@ -12,8 +12,7 @@
 def is_gzip(file):
-    """ test if the file is gzip using the 4 first byte of the file
-    who are characteristic of the type of file.
+ """ test if the file is gzip Arguments: file = the path to the file to test return: @@ -50,12 +49,12 @@ def count_record(file): def parse_fq(output, file, type_f, num_sub, number_records, handle): - """ we read the file, we make a sum of the reads - each time we get the number of reads for the wanted coverage - we save them in a subfile, and keep reading the infile to - get the next subfile. All the read will be selected because - the number of subsample is the highest possible. - For fastq files + """ read the file, make a sum of the reads + for each occurence, the number of reads for the wanted coverage + is saved in a subfile. The software keep reading the infile to + get the next subfile. All the read will be selected, because + the number of subsample required is the highest possible. + Only for fastq files Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence @@ -99,11 +98,12 @@ def parse_fq_rnd( num_sub, number_records, handle, tot_records ): - """ we read the file, we make a sum of the reads - each time we get the number of reads for the wanted coverage - we save them in a subfile, and keep reading the infile to - get the next subfile. Here we choose randomly the reads because - the number of subsample is limited + """ Read the file, make a sum of the reads + for each occurence, the number of reads for the wanted coverage + is saved in a subfile. the software keep reading the infile to + get the next subfile. Here the reads are chosen randomly because + the number of subsample is limited. + Only for Fastq Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence @@ -148,12 +148,13 @@ def reservoir(total_record, wanted_record, file_record): def parse_bam(output, file, type_f, num_sub, number_records): - """ we read the file, we make a sum of the reads - each time we get the number of reads for the wanted coverage - we save them in a subfile, and keep reading the infile to - get the next subfile. Here we choose randomly the reads because - the number of subsample is limited. - For bam files + """ Read the file, make a sum of the reads + for each occurence, the number of reads for the wanted coverage + is saved in a subfile.the software keep reading the infile to + get the next subfile. All the reads will be parsed + and write in a subsample since the highest number of subsample + possible is required + Only for bam files Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence @@ -188,6 +189,7 @@ def parse_bam(output, file, type_f, num_sub, number_records): def parse_bam_rnd(output, file, type_f, num_sub, fraction): """ parse the bam file and create only a chosen number of subsample. Each subsample contains randomly chosen reads + Only for bam files Arguments: output = the path to the output directory file = the fastq file where we retrieve the sequence From 4416fb1ae875808c934a1af0716e4352e69913f8 Mon Sep 17 00:00:00 2001 From: RVanDamme <38455046+RVanDamme@users.noreply.github.com> Date: Fri, 29 Jun 2018 10:25:56 +0200 Subject: [PATCH 4/5] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 1aa830b..49f0503 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Thanks for your interest and come back soon! 
 * Canu
 
 To install capture-assembler, simply type the following in your terminal:
+
     pip install capture-assembler
 
@@ -61,6 +62,13 @@ capture assemble -h
 ```
 
+## External Links
+
+Official links to:
+
+* [SPAdes](http://cab.spbu.ru/software/spades/)
+* [Canu](http://canu.readthedocs.io/en/latest/)
+
 ## License
 
 Code is under the [MIT](LICENSE) license

From ed895a065e3abfa59f18b8af77d6d78ffd167289 Mon Sep 17 00:00:00 2001
From: RVanDamme <38455046+RVanDamme@users.noreply.github.com>
Date: Mon, 2 Jul 2018 13:52:02 +0200
Subject: [PATCH 5/5] adding conda installation

---
 README.md | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 49f0503..1fd9135 100644
--- a/README.md
+++ b/README.md
@@ -20,27 +20,39 @@ Thanks for your interest and come back soon!
 To install capture-assembler, simply type the following in your terminal:
 
     pip install capture-assembler
-
+It requires Python 3.6, with Canu and SPAdes already installed.
+
+If you use conda, you can install it with:
+
+    conda install -c rvandamme capture-assembler
+It requires a conda environment with Python 3.6, Canu, and SPAdes already installed.
+
+To install capture-assembler, SPAdes, and Canu in a conda environment (creating a new one is recommended), type the following:
+```bash
+conda create -n env_name python=3.6 # create a new env with python 3.6 (optional)
+conda install -n env_name -c rvandamme capture-assembler canu=1.7 spades=3.12.0
+conda activate env_name # start using the environment and the capture-assembler
+```
 
 ## Quickstart
 
 ```bash
 # paired-end reads, Illumina MiSeq
-capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
+capture -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
 --genome_size 35000 --mean 300 -o output_dir
 # compressed paired-end reads, Illumina HiSeq
-capture assemble -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
+capture -f reads_R1.fastq.gz -r reads_R2.fastq.gz \
 --genome_size 35000 --mean 125 -o output_dir
 # single end reads, Ion Torrent
-capture assemble -u reads.fastq.gz \
+capture -u reads.fastq.gz \
 --genome_size 35000 --mean 240 -o output_dir
 # reads in bam format (Ion Torrent)
-capture assemble --bam reads.bam \
+capture --bam reads.bam \
 --genome_size 35000 --mean 240 -o output_dir
 # full list of subcommands and options
 capture -h
 # full list of options for a subcommand
-capture assemble -h
+capture -h
 ```
 
 ## Arguments
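The limited-subsample paths above (`parse_fq_rnd` in patches 2 and 3) draw their random reads through the `reservoir()` helper, whose body is never shown in this series. A minimal sketch of such a helper, assuming the classic single-pass Algorithm R; treat it as illustrative, not as the project's actual implementation:

```python
import random

def reservoir(total_record, wanted_record, file_record):
    """Pick wanted_record records uniformly at random from the iterator
    file_record in a single pass (Algorithm R). total_record is kept only
    for signature parity with capture/parse.py; Algorithm R does not need it."""
    sample = []
    for i, record in enumerate(file_record):
        if i < wanted_record:
            sample.append(record)      # fill the reservoir first
        else:
            j = random.randint(0, i)   # keep with probability wanted_record/(i+1)
            if j < wanted_record:
                sample[j] = record
    return sample
```

A single pass matters here because the inputs are ultra-high coverage files that may not fit in memory; only the `wanted_record` selected reads are ever held at once.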