diff --git a/annot.nf b/annot.nf index a643529..d2c80e2 100644 --- a/annot.nf +++ b/annot.nf @@ -107,7 +107,7 @@ if (params.do_contiguation) { file 'pseudo.contigs.fasta' into contigs_seq // TODO If upgrading to DSL2, can just use json output and splitJson operator before circos_run_chrs process. file 'ref_target_mapping.txt' into ref_target_mapping_circos - file 'ref_target_mapping.json' into ref_target_mapping_integrate + file 'ref_target_mapping.json' into ref_target_mapping """ abacas2.nonparallel.sh \ @@ -129,7 +129,7 @@ if (params.do_contiguation) { file 'pseudo.scafs.fasta' into scaffolds_seq file 'pseudo.scafs.agp' into scaffolds_agp file 'pseudo.contigs.fasta' into contigs_seq - file 'ref_target_mapping.json' into ref_target_mapping_integrate + file 'ref_target_mapping.json' into ref_target_mapping """ no_abacas_prepare.lua ${sanitized_genome_file} pseudo @@ -170,6 +170,9 @@ pseudochr_agp.into{ pseudochr_agp_augustus pseudochr_agp_make_gaps pseudochr_agp_make_dist } +ref_target_mapping.into{ ref_target_mapping_integrate + ref_target_mapping_pseudo } + // TRNA PREDICTION // =============== @@ -886,6 +889,7 @@ if (params.do_pseudo) { file 'pseudochr.fasta' from pseudochr_seq_pseudogene_calling file 'genes.gff3' from integrated_gff3_clean file 'last.out' from pseudochr_last_out.collectFile() + file 'ref_target_mapping.json' from ref_target_mapping_pseudo output: file 'genes_and_pseudo.gff3' into gff3_with_pseudogenes @@ -899,7 +903,8 @@ if (params.do_pseudo) { if [ ! -s last_and_genes.gff3 ]; then echo '##gff-version 3' > last_and_genes.gff3 fi - pseudo_merge_with_genes.lua last_and_genes.gff3 pseudochr.fasta > out_tmp.gff3 + pseudo_merge_with_genes.lua last_and_genes.gff3 pseudochr.fasta \ + -m ref_target_mapping.json -o ${params.MAX_OVERLAP} > out_tmp.gff3 gt gff3 -sort -retainids -tidy out_tmp.gff3 > genes_and_pseudo.gff3 """ } diff --git a/bin/integrate_gene_calls.lua b/bin/integrate_gene_calls.lua index b34e97f..1490224 100755 --- a/bin/integrate_gene_calls.lua +++ b/bin/integrate_gene_calls.lua @@ -97,26 +97,6 @@ function stream:process_current_cluster() end end -function stream:conditional_overlap(new_rng) - local ext_rng = nil - - if ref_target_mapping then - -- allow short overlaps for apicoplasts and mitochondria - if (ref_target_mapping.API and self.last_seqid == ref_target_mapping.API[2]) - or (ref_target_mapping.MIT and self.last_seqid == ref_target_mapping.MIT[2]) then - local ext = options.overlap - -- ensure consistent start and end bases for new range - if self.curr_rng:get_start() < new_rng:get_start() then - ext_rng = gt.range_new(math.min(new_rng:get_start() + ext, new_rng:get_end()), new_rng:get_end()) - else - ext_rng = gt.range_new(new_rng:get_start(), math.max(new_rng:get_start(), new_rng:get_end() - ext)) - end - return self.curr_rng:overlap(ext_rng) - end - end - return self.curr_rng:overlap(new_rng) -end - function stream:next_tree() local complete_cluster = false local mygn = nil @@ -141,7 +121,7 @@ function stream:next_tree() else if self.last_seqid == fn:get_seqid() and self.last_strand == fn:get_strand() - and self:conditional_overlap(new_rng) then + and conditional_overlap(ref_target_mapping, self.curr_rng, new_rng, self.last_seqid, options.overlap) then table.insert(self.curr_gene_set, fn) self.curr_rng = self.curr_rng:join(new_rng) else diff --git a/bin/lib.lua b/bin/lib.lua index 1f6f2b1..b88b4e4 100644 --- a/bin/lib.lua +++ b/bin/lib.lua @@ -593,4 +593,24 @@ function infernal_in_stream_new(myio) end end return s -end \ No newline at end of file +end + +function conditional_overlap(ref_target_mapping, curr_rng, new_rng, last_seqid, overlap) + local ext_rng = nil + + if ref_target_mapping then + -- allow short overlaps for apicoplasts and mitochondria + if (ref_target_mapping.API and last_seqid == ref_target_mapping.API[2]) + or (ref_target_mapping.MIT and last_seqid == ref_target_mapping.MIT[2]) then + local ext = overlap + -- ensure consistent start and end bases for new range + if curr_rng:get_start() < new_rng:get_start() then + ext_rng = gt.range_new(math.min(new_rng:get_start() + ext, new_rng:get_end()), new_rng:get_end()) + else + ext_rng = gt.range_new(new_rng:get_start(), math.max(new_rng:get_start(), new_rng:get_end() - ext)) + end + return curr_rng:overlap(ext_rng) + end + end + return curr_rng:overlap(new_rng) +end diff --git a/bin/pseudo_merge_with_genes.lua b/bin/pseudo_merge_with_genes.lua index 3dc8d81..7a03337 100755 --- a/bin/pseudo_merge_with_genes.lua +++ b/bin/pseudo_merge_with_genes.lua @@ -23,6 +23,7 @@ to_gene = {pseudogene = 'gene', pseudogenic_transcript = 'mRNA', package.path = gt.script_dir .. "/?.lua;" .. package.path require("lib") require("optparse") +local json = require ("dkjson") op = OptionParser:new({usage="%prog merged_gff_file.gff3 sequence.fas", oneliner="Adds pseudogene annotations to existing set of gene models.", @@ -30,7 +31,7 @@ op = OptionParser:new({usage="%prog merged_gff_file.gff3 sequence.fas" op:option{"-t", action='store', dest='threshold', help="minimum gene coverage ratio threshold for pseudogene " .. "required to replace a gene (e.g. 0.6 for 60%)"} -op:option{"-m", action='store', dest='minratio', +op:option{"-n", action='store', dest='minratio', help="minimal 'stitching' gene coverage ratio " .. "required to replace multiple genes (e.g. 0.8 for 80%)"} op:option{"-x", action='store', dest='maxratio', @@ -44,10 +45,14 @@ op:option{"-r", action='store', dest='rejectfile', op:option{"-l", action='store', dest='longest_until', help="pick candidate with median length if at least " .. " candidates per locus"} +op:option{"-m", action='store', dest='mapping', + help="reference target chromosome mapping"} +op:option{"-o", action='store', dest='overlap', default=0, + help="number of bases to allow two adjacent genes to overlap by"} op:option{"-d", action='store_true', dest='debug', help="output debug information on stderr"} options,args = op:parse({threshold=0.6, minratio=0.8, maxratio=1.2, - debug=false, rejectfile=nil, + debug=false, rejectfile=nil, mapping=nil, ignore_frame_agreement=false, longest_until=4}) function usage() @@ -59,6 +64,18 @@ if #args < 2 then usage() end +if options.mapping then + local mapfile = io.open(options.mapping, "rb") + if not mapfile then + error("could not open reference target mapping file") + end + local mapcontent = mapfile:read("*all") + mapfile:close() + ref_target_mapping = json.decode(mapcontent) +else + ref_target_mapping = nil +end + local rejected_pseudogenes = {} local rm = gt.region_mapping_new_seqfile_matchdescstart(args[2]) @@ -381,7 +398,7 @@ function stream:next_tree() else if self.last_seqid == fn:get_seqid() and self.last_strand == fn:get_strand() - and self.curr_rng:overlap(new_rng) then + and conditional_overlap(ref_target_mapping, self.curr_rng, new_rng, self.last_seqid, options.overlap) then table.insert(self.curr_gene_set, fn) self.curr_rng = self.curr_rng:join(new_rng) else