Skip to content

Commit

Permalink
v125-master: open -e switch for get_seqs
Browse files Browse the repository at this point in the history
  • Loading branch information
dfguan committed Oct 29, 2020
1 parent fe8dce2 commit 60e4a6e
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 10 deletions.
14 changes: 8 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ configuration file is in json format, it has all the information required by run
"queue": "normal"
},
"gs": {
"mem": 10000
"mem": 10000,
"oe": 1
},
"kcp": {
"core": 12,
Expand Down Expand Up @@ -153,12 +154,13 @@ This file use several key words to define resource allocation, input files or ou
- **ref**: Assembly file path
- **out_dir**: Working directory
- **ispb**: Bool value set for pacbio data, 0 for Illumina data
- **oe**: only remove the haplotypic duplications occurring at the ends of the contigs

**Notice**: **isdip** is deprecated.

The dictionary "kcp" keeps parameters for run_kcm script.
The dictionary "gs" sets parameters for get\_seqs (purge\_dups executable file), designed to produce primary contigs and haplotigs.
The dictionary "pd" sets parameters for purge\_dups (purge\_dups executable file), designed to purge haplotigs and overlaps in an assembly.
The dictionary **"kcp"** keeps parameters for run_kcm script.
The dictionary **"gs"** sets parameters for get\_seqs (purge\_dups executable file), designed to produce primary contigs and haplotigs.
The dictionary **"pd"** sets parameters for purge\_dups (purge\_dups executable file), designed to purge haplotigs and overlaps in an assembly.
The dictionary **"cc"** sets parameters for **minimap2/bwa**.
The dictionary **"sa"** sets parameters for minimap2.
The dictionary "busco" sets parameters for run\_busco.
Expand Down Expand Up @@ -237,9 +239,9 @@ bin/purge_dups -2 -T cutoffs -c PB.base.cov $pri_asm.split.self.paf.gz > dups.be
### Step 3. Get purged primary and haplotig sequences from draft assembly.

```
bin/get_seqs dups.bed $pri_asm
bin/get_seqs -e dups.bed $pri_asm
```
**Notice** this command will remove haplotypic duplications at the ends and in the middle of the contigs. If you just want to remove the duplications at the ends, please use `-e` option. For more options, please refer to `get_seqs -h`.
**Notice**: this command will only remove haplotypic duplications at the ends of the contigs. If you also want to remove the duplications in the middle, omit the `-e` option at your own risk: doing so may delete false-positive duplications. For more options, please refer to `get_seqs -h`.

### Step 4. Merge hap.fa and $hap_asm and redo the above steps to get a decent haplotig set.

Expand Down
2 changes: 1 addition & 1 deletion scripts/pd_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def gen_config(r, d, fn, skipB):
"sa":{"core":12, "mem":10000, "queue":"normal"},
"busco":{"core":12, "mem":20000, "queue":"long", "skip":0, "lineage":used_lineage, "prefix":ref_fn+"_purged", "tmpdir":"busco_tmp"},
"pd":{"mem": 20000, "queue": "normal"},
"gs": {"mem": 10000},
"gs": {"mem": 10000, "oe": 1},
"kcp": {"core":12, "mem":30000, "fofn":"", "prefix":ref_fn+"_purged_kcm", "tmpdir":"kcp_tmp", "skip": 0},
"ref":r, "out_dir":ref_fn
}
Expand Down
9 changes: 7 additions & 2 deletions scripts/run_purge_dups.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,14 @@ def purge_dups(man, pltfm, paf_fn, base_cov_fn, cutoff_fn, core_lim, mem_lim, qu
rtn = man.start([j], True)
return rtn
#INPUT: dups assembly
def get_seqs(man, pltfm, ref, dups_fn, core_lim, mem_lim, out_dir, bin_dir, spid):
def get_seqs(man, pltfm, ref, dups_fn, core_lim, mem_lim, onlyend, out_dir, bin_dir, spid):
mkdir(out_dir)
out_fn = "{0}/{1}.purged.fa".format(out_dir, get_rm_prefix(ref))
out_red_fn = "{0}/{1}.red.fa".format(out_dir, get_rm_prefix(ref))
out_prefx="{0}/{1}".format(out_dir, get_rm_prefix(ref))
jcmd = "{0}/get_seqs -p {5} {1} {2}".format(bin_dir, dups_fn, ref, out_fn, out_red_fn, out_prefx)
if onlyend:
jcmd = "{0}/get_seqs -e -p {5} {1} {2}".format(bin_dir, dups_fn, ref, out_fn, out_red_fn, out_prefx)
jjn = "get_seqs_{}".format(spid)
jout = "{0}/{1}_%J.o".format(out_dir, jjn)
jerr = "{0}/{1}_%J.e".format(out_dir, jjn)
Expand Down Expand Up @@ -264,11 +266,14 @@ def cont(config_fn, bin_dir, spid, pltfm, _wait, _retries):
rtn = purge_dups(man, pltfm, in_paf_fn, "", "", 1, pd_mem, pd_queue, out_pd_dir, bin_dir, spid)
if not rtn:
gs_mem = 10000
gs_onlyend = 1
if "gs" in config_dict:
gs_mem = config_dict["gs"]["mem"]
if "oe" in config_dict:
gs_onlyend=config_dict["gs"]["oe"]
in_dups_fn = "{0}/dups.bed".format(out_pd_dir)
out_dir = "{}/seqs".format(out_dir)
rtn = get_seqs(man, pltfm, ref, in_dups_fn, 1, gs_mem, out_dir, bin_dir, spid)
rtn = get_seqs(man, pltfm, ref, in_dups_fn, 1, gs_mem, gs_onlyend, out_dir, bin_dir, spid)

procs = []
workdir = out_dir
Expand Down
2 changes: 1 addition & 1 deletion src/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@
*/
#define MAJOR 1
#define MINOR 2
#define PATCH 3
#define PATCH 5

0 comments on commit 60e4a6e

Please sign in to comment.