diff --git a/README.md b/README.md index ceabc0b..60ea6a5 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,8 @@ configuration file is in json format, it has all the information required by run "queue": "normal" }, "gs": { - "mem": 10000 + "mem": 10000, + "oe": 1 }, "kcp": { "core": 12, @@ -153,12 +154,13 @@ This file use several key words to define resource allocation, input files or ou - **ref**: Assembly file path - **out_dir**: Working directory - **ispb**: Bool value set for pacbio data, 0 for Illumina data +- **oe**: Bool value set in the "gs" dictionary, 1 to only remove the haplotypic duplications occurring at the ends of the contigs **Notice**: **isdip** is deprecated. -The dictionary "kcp" keeps paramaters for run_kcm script. -The dictionary "gs" sets parameters for get\_seqs (purge\_dups executable file), designed to produce primary contigs and haplotigs. -The dictionary "pd" sets parameters for purge\_dups (purge\_dups executable file), designed to purge haplotigs and overlaps in an assembly. +The dictionary **"kcp"** keeps parameters for run_kcm script. +The dictionary **"gs"** sets parameters for get\_seqs (purge\_dups executable file), designed to produce primary contigs and haplotigs. +The dictionary **"pd"** sets parameters for purge\_dups (purge\_dups executable file), designed to purge haplotigs and overlaps in an assembly. The dictionary **"cc"** sets parameters for **minimap2/bwa**. The dictionary **"sa"** sets parameters for minimap2. The dictionary "busco" sets parameters for run\_busco. @@ -237,9 +239,9 @@ bin/purge_dups -2 -T cutoffs -c PB.base.cov $pri_asm.split.self.paf.gz > dups.be ### Step 3. Get purged primary and haplotig sequences from draft assembly. ``` -bin/get_seqs dups.bed $pri_asm +bin/get_seqs -e dups.bed $pri_asm ``` -**Notice** this command will remove haplotypic duplications at the ends and in the middle of the contigs. If you just want to remove the duplications at the ends, please use `-e` option. For more options, please refer to `get_seqs -h`. 
+**Notice** this command will only remove haplotypic duplications at the ends of the contigs. If you also want to remove the duplications in the middle, please remove the `-e` option at your own risk, as it may delete false positive duplications. For more options, please refer to `get_seqs -h`. ### Step 4. Merge hap.fa and $hap_asm and redo the above steps to get a decent haplotig set. diff --git a/scripts/pd_config.py b/scripts/pd_config.py index dbe8b8b..537b4a3 100755 --- a/scripts/pd_config.py +++ b/scripts/pd_config.py @@ -34,7 +34,7 @@ def gen_config(r, d, fn, skipB): "sa":{"core":12, "mem":10000, "queue":"normal"}, "busco":{"core":12, "mem":20000, "queue":"long", "skip":0, "lineage":used_lineage, "prefix":ref_fn+"_purged", "tmpdir":"busco_tmp"}, "pd":{"mem": 20000, "queue": "normal"}, - "gs": {"mem": 10000}, + "gs": {"mem": 10000, "oe": 1}, "kcp": {"core":12, "mem":30000, "fofn":"", "prefix":ref_fn+"_purged_kcm", "tmpdir":"kcp_tmp", "skip": 0}, "ref":r, "out_dir":ref_fn } diff --git a/scripts/run_purge_dups.py b/scripts/run_purge_dups.py index 5bd032b..b6233b2 100755 --- a/scripts/run_purge_dups.py +++ b/scripts/run_purge_dups.py @@ -70,12 +70,14 @@ def purge_dups(man, pltfm, paf_fn, base_cov_fn, cutoff_fn, core_lim, mem_lim, qu rtn = man.start([j], True) return rtn #INPUT: dups assembly -def get_seqs(man, pltfm, ref, dups_fn, core_lim, mem_lim, out_dir, bin_dir, spid): +def get_seqs(man, pltfm, ref, dups_fn, core_lim, mem_lim, onlyend, out_dir, bin_dir, spid): mkdir(out_dir) out_fn = "{0}/{1}.purged.fa".format(out_dir, get_rm_prefix(ref)) out_red_fn = "{0}/{1}.red.fa".format(out_dir, get_rm_prefix(ref)) out_prefx="{0}/{1}".format(out_dir, get_rm_prefix(ref)) jcmd = "{0}/get_seqs -p {5} {1} {2}".format(bin_dir, dups_fn, ref, out_fn, out_red_fn, out_prefx) + if onlyend: + jcmd = "{0}/get_seqs -e -p {5} {1} {2}".format(bin_dir, dups_fn, ref, out_fn, out_red_fn, out_prefx) jjn = "get_seqs_{}".format(spid) jout = "{0}/{1}_%J.o".format(out_dir, jjn) jerr = 
"{0}/{1}_%J.e".format(out_dir, jjn) @@ -264,11 +266,14 @@ def cont(config_fn, bin_dir, spid, pltfm, _wait, _retries): rtn = purge_dups(man, pltfm, in_paf_fn, "", "", 1, pd_mem, pd_queue, out_pd_dir, bin_dir, spid) if not rtn: gs_mem = 10000 + gs_onlyend = 1 if "gs" in config_dict: gs_mem = config_dict["gs"]["mem"] + if "oe" in config_dict.get("gs", {}): + gs_onlyend=config_dict["gs"]["oe"] in_dups_fn = "{0}/dups.bed".format(out_pd_dir) out_dir = "{}/seqs".format(out_dir) - rtn = get_seqs(man, pltfm, ref, in_dups_fn, 1, gs_mem, out_dir, bin_dir, spid) + rtn = get_seqs(man, pltfm, ref, in_dups_fn, 1, gs_mem, gs_onlyend, out_dir, bin_dir, spid) procs = [] workdir = out_dir diff --git a/src/version.h b/src/version.h index 980e4ef..86b10f6 100644 --- a/src/version.h +++ b/src/version.h @@ -17,4 +17,4 @@ */ #define MAJOR 1 #define MINOR 2 -#define PATCH 3 +#define PATCH 5