Skip to content

Commit

Permalink
add conditional overlap logic to pseudogene_calling process
Browse files Browse the repository at this point in the history
  • Loading branch information
haessar committed Oct 23, 2023
1 parent cc4197d commit 9d95fca
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 28 deletions.
11 changes: 8 additions & 3 deletions annot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ if (params.do_contiguation) {
file 'pseudo.contigs.fasta' into contigs_seq
// TODO If upgrading to DSL2, can just use json output and splitJson operator before circos_run_chrs process.
file 'ref_target_mapping.txt' into ref_target_mapping_circos
file 'ref_target_mapping.json' into ref_target_mapping_integrate
file 'ref_target_mapping.json' into ref_target_mapping

"""
abacas2.nonparallel.sh \
Expand All @@ -129,7 +129,7 @@ if (params.do_contiguation) {
file 'pseudo.scafs.fasta' into scaffolds_seq
file 'pseudo.scafs.agp' into scaffolds_agp
file 'pseudo.contigs.fasta' into contigs_seq
file 'ref_target_mapping.json' into ref_target_mapping_integrate
file 'ref_target_mapping.json' into ref_target_mapping

"""
no_abacas_prepare.lua ${sanitized_genome_file} pseudo
Expand Down Expand Up @@ -170,6 +170,9 @@ pseudochr_agp.into{ pseudochr_agp_augustus
pseudochr_agp_make_gaps
pseudochr_agp_make_dist }

ref_target_mapping.into{ ref_target_mapping_integrate
ref_target_mapping_pseudo }

// TRNA PREDICTION
// ===============

Expand Down Expand Up @@ -886,6 +889,7 @@ if (params.do_pseudo) {
file 'pseudochr.fasta' from pseudochr_seq_pseudogene_calling
file 'genes.gff3' from integrated_gff3_clean
file 'last.out' from pseudochr_last_out.collectFile()
file 'ref_target_mapping.json' from ref_target_mapping_pseudo

output:
file 'genes_and_pseudo.gff3' into gff3_with_pseudogenes
Expand All @@ -899,7 +903,8 @@ if (params.do_pseudo) {
if [ ! -s last_and_genes.gff3 ]; then
echo '##gff-version 3' > last_and_genes.gff3
fi
pseudo_merge_with_genes.lua last_and_genes.gff3 pseudochr.fasta > out_tmp.gff3
pseudo_merge_with_genes.lua last_and_genes.gff3 pseudochr.fasta \
-m ref_target_mapping.json -o ${params.MAX_OVERLAP} > out_tmp.gff3
gt gff3 -sort -retainids -tidy out_tmp.gff3 > genes_and_pseudo.gff3
"""
}
Expand Down
22 changes: 1 addition & 21 deletions bin/integrate_gene_calls.lua
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,6 @@ function stream:process_current_cluster()
end
end

function stream:conditional_overlap(new_rng)
local ext_rng = nil

if ref_target_mapping then
-- allow short overlaps for apicoplasts and mitochondria
if (ref_target_mapping.API and self.last_seqid == ref_target_mapping.API[2])
or (ref_target_mapping.MIT and self.last_seqid == ref_target_mapping.MIT[2]) then
local ext = options.overlap
-- ensure consistent start and end bases for new range
if self.curr_rng:get_start() < new_rng:get_start() then
ext_rng = gt.range_new(math.min(new_rng:get_start() + ext, new_rng:get_end()), new_rng:get_end())
else
ext_rng = gt.range_new(new_rng:get_start(), math.max(new_rng:get_start(), new_rng:get_end() - ext))
end
return self.curr_rng:overlap(ext_rng)
end
end
return self.curr_rng:overlap(new_rng)
end

function stream:next_tree()
local complete_cluster = false
local mygn = nil
Expand All @@ -141,7 +121,7 @@ function stream:next_tree()
else
if self.last_seqid == fn:get_seqid()
and self.last_strand == fn:get_strand()
and self:conditional_overlap(new_rng) then
and conditional_overlap(ref_target_mapping, self.curr_rng, new_rng, self.last_seqid, options.overlap) then
table.insert(self.curr_gene_set, fn)
self.curr_rng = self.curr_rng:join(new_rng)
else
Expand Down
22 changes: 21 additions & 1 deletion bin/lib.lua
Original file line number Diff line number Diff line change
Expand Up @@ -593,4 +593,24 @@ function infernal_in_stream_new(myio)
end
end
return s
end
end

function conditional_overlap(ref_target_mapping, curr_rng, new_rng, last_seqid, overlap)
local ext_rng = nil

if ref_target_mapping then
-- allow short overlaps for apicoplasts and mitochondria
if (ref_target_mapping.API and last_seqid == ref_target_mapping.API[2])
or (ref_target_mapping.MIT and last_seqid == ref_target_mapping.MIT[2]) then
local ext = overlap
-- ensure consistent start and end bases for new range
if curr_rng:get_start() < new_rng:get_start() then
ext_rng = gt.range_new(math.min(new_rng:get_start() + ext, new_rng:get_end()), new_rng:get_end())
else
ext_rng = gt.range_new(new_rng:get_start(), math.max(new_rng:get_start(), new_rng:get_end() - ext))
end
return curr_rng:overlap(ext_rng)
end
end
return curr_rng:overlap(new_rng)
end
23 changes: 20 additions & 3 deletions bin/pseudo_merge_with_genes.lua
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ to_gene = {pseudogene = 'gene', pseudogenic_transcript = 'mRNA',
package.path = gt.script_dir .. "/?.lua;" .. package.path
require("lib")
require("optparse")
local json = require ("dkjson")

op = OptionParser:new({usage="%prog <options> merged_gff_file.gff3 sequence.fas",
oneliner="Adds pseudogene annotations to existing set of gene models.",
version="0.1"})
op:option{"-t", action='store', dest='threshold',
help="minimum gene coverage ratio threshold for pseudogene "
.. "required to replace a gene (e.g. 0.6 for 60%)"}
op:option{"-m", action='store', dest='minratio',
op:option{"-n", action='store', dest='minratio',
help="minimal 'stitching' gene coverage ratio "
.. "required to replace multiple genes (e.g. 0.8 for 80%)"}
op:option{"-x", action='store', dest='maxratio',
Expand All @@ -44,10 +45,14 @@ op:option{"-r", action='store', dest='rejectfile',
op:option{"-l", action='store', dest='longest_until',
help="pick candidate with median length if at least "
.. "<LONGEST_UNTIL> candidates per locus"}
op:option{"-m", action='store', dest='mapping',
help="reference target chromosome mapping"}
op:option{"-o", action='store', dest='overlap', default=0,
help="number of bases to allow two adjacent genes to overlap by"}
op:option{"-d", action='store_true', dest='debug',
help="output debug information on stderr"}
options,args = op:parse({threshold=0.6, minratio=0.8, maxratio=1.2,
debug=false, rejectfile=nil,
debug=false, rejectfile=nil, mapping=nil,
ignore_frame_agreement=false, longest_until=4})

function usage()
Expand All @@ -59,6 +64,18 @@ if #args < 2 then
usage()
end

if options.mapping then
local mapfile = io.open(options.mapping, "rb")
if not mapfile then
error("could not open reference target mapping file")
end
local mapcontent = mapfile:read("*all")
mapfile:close()
ref_target_mapping = json.decode(mapcontent)
else
ref_target_mapping = nil
end

local rejected_pseudogenes = {}
local rm = gt.region_mapping_new_seqfile_matchdescstart(args[2])

Expand Down Expand Up @@ -381,7 +398,7 @@ function stream:next_tree()
else
if self.last_seqid == fn:get_seqid()
and self.last_strand == fn:get_strand()
and self.curr_rng:overlap(new_rng) then
and conditional_overlap(ref_target_mapping, self.curr_rng, new_rng, self.last_seqid, options.overlap) then
table.insert(self.curr_gene_set, fn)
self.curr_rng = self.curr_rng:join(new_rng)
else
Expand Down

0 comments on commit 9d95fca

Please sign in to comment.