Skip to content

Commit

Permalink
Merge pull request #194 from CDCgov/test_data_variola_update_kao
Browse files Browse the repository at this point in the history
Test data variola update kao
  • Loading branch information
jessicarowell authored Apr 9, 2024
2 parents 9ae71eb + da9e742 commit 90d79ec
Show file tree
Hide file tree
Showing 16 changed files with 6,245 additions and 21 deletions.
3,094 changes: 3,094 additions & 0 deletions assets/sample_fastas/variola/NC_001611.fasta

Large diffs are not rendered by default.

3,106 changes: 3,106 additions & 0 deletions assets/sample_fastas/variola/PP405578.fasta

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions assets/sample_fastas/variola/VARV_RZ10_3587.fasta

This file was deleted.

2 changes: 0 additions & 2 deletions assets/sample_fastas/variola/VARV_RZ10_3587_2.fasta

This file was deleted.

Binary file modified assets/sample_metadata/VARV_metadata_Sample_Run_1.xlsx
Binary file not shown.
Binary file not shown.
Binary file not shown.
26 changes: 19 additions & 7 deletions bin/repeatmasker_liftoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,22 @@ def get_args():
parser.add_argument("--refgff", type=str, help="Reference GFF to gather the ITR attributes and sample ID \n", required=True)
parser.add_argument("--fasta", type=str, help="FASTA file for sample \n", required=True)
parser.add_argument("--outdir", type=str, default=".", help="Output directory, defualt is current directory")
parser.add_argument("--sample_name", type=str, default=".", help="Sample name")

args = parser.parse_args()

return args

def count_rows_starting_with_comment(file_path):
    """Return how many consecutive lines at the top of a file begin with '#'.

    Reading stops at the first line that does not start with '#'
    (or at end of file), so '#' lines appearing later in the file
    are not included in the total.

    Args:
        file_path: Path of the text file to inspect.

    Returns:
        The number of leading lines whose first character is '#'.
    """
    leading_comments = 0
    with open(file_path, 'r') as handle:
        current = handle.readline()
        # readline() yields '' at end of file, which fails the prefix
        # test and ends the loop exactly like the first non-'#' line.
        while current.startswith('#'):
            leading_comments += 1
            current = handle.readline()
    return leading_comments

def annotation_main():
""" Main function for calling the annotation transfer pipeline
"""
Expand All @@ -44,9 +55,10 @@ def annotation_main():
headerList = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

#####GATHER REF INFO#####

#load in reference gff; skip commented lines that don't belong in the dataframe
ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
#ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, skiprows=count_rows_starting_with_comment(args.refgff))

#gather ref sample id
ref_id=ref_gff['seq_id'][0]
#gather index of attributes for first and second ITRs; needed for repeatmasker ITR attributes
Expand All @@ -63,18 +75,18 @@ def annotation_main():
#samp_name=repMannotation_prep.sample_info()[0]
#repMannotation_prep.repM_prep_main()

LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, samp_name, args.outdir)
LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, args.sample_name, args.outdir)
#LOannotation_prep.LO_prep_main()
#repMannotation_prep.sample_info()
new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, samp_name, args.outdir)
new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, args.sample_name, args.outdir)

new_gff.concat_LO_RM()

#####CREATE TBL FILE#####
main_util=MainUtility()
main_util.gff2tbl(
samp_name=samp_name,
gff_loc=f"{args.outdir}/{samp_name}_reformatted.gff",
gff_loc=f"{args.outdir}/{args.sample_name}_reformatted.gff",
tbl_output=f"{args.outdir}/"
)

Expand Down Expand Up @@ -121,7 +133,7 @@ def sample_info(self):

def cleanup_repeat_masker_gff(self):
#load in repeatmasker gff; skip the leading comment lines that don't belong in the dataframe
rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.repeatMGFF))
#correct repeat region labels; repeatmasker labels repeat regions as dispersed_repeat
rem_gff['type'] = rem_gff['type'].replace({'dispersed_repeat': 'repeat_region'}, regex=True)

Expand Down Expand Up @@ -213,7 +225,7 @@ def LO_prep_main(self):
fields_to_drop = ['coverage', 'sequence_ID', 'matches_ref_protein', 'valid_ORF', 'valid_ORFs', 'extra_copy_number',
'copy_num_ID', 'pseudogene', 'partial_mapping', 'low_identity']
#load in liftoff gff with the same headers as Repeatmasker and skip commented lines that don't belong in the dataframe
lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.liftoffGFF))

#run function to find and drop fields in attributes
lo_gff['attributes']=lo_gff['attributes'].apply(lambda row : self.fix_attributes(fields_to_drop, row))
Expand Down
8 changes: 5 additions & 3 deletions modules/local/concat_gffs/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@ process CONCAT_GFFS {

input:
path ref_gff_path
path repeatmasker_gff
path liftoff_gff
//path repeatmasker_gff
//path liftoff_gff
tuple val(meta), path(repeatmasker_gff), path(liftoff_gff)
tuple val(meta), path(fasta_path), path(fastq_1), path(fastq_2)

script:
"""
repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path
echo "repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path --sample_name $meta.id"
repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path --sample_name $meta.id
"""

output:
Expand Down
4 changes: 2 additions & 2 deletions modules/local/liftoff_cli_annotation/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process LIFTOFF_CLI {
'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0' :
'quay.io/biocontainers/liftoff:1.6.3--pyhdfd78af_0'}"

publishDir "$params.output_dir/repeatmasker_liftoff_outputs", mode: "copy", overwrite: params.overwrite_output,
publishDir "$params.output_dir/liftoff", mode: "copy", overwrite: params.overwrite_output,
saveAs: { filename ->
filename.indexOf('.fasta') > 0 ? "fasta/${filename}":
filename.indexOf('.txt') > 0 ? "errors/${filename}":
Expand All @@ -26,7 +26,7 @@ process LIFTOFF_CLI {

script:
"""
liftoff -g $ref_gff_path -o ${fasta.baseName}_liftoff-orig.gff \
liftoff -g $ref_gff_path -o ${fasta.baseName}.liftoff-orig.gff \
-u $params.lift_unmapped_features_file_name \
-a $params.lift_coverage_threshold -s $params.lift_child_feature_align_threshold \
-d $params.lift_distance_scaling_factor -flank $params.lift_flank -p $params.lift_parallel_processes \
Expand Down
22 changes: 19 additions & 3 deletions subworkflows/local/repeatmasker_liftoff.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,31 @@ workflow REPEATMASKER_LIFTOFF {
params.ref_fasta_path,
params.ref_gff_path
)

repeatmasker_gff_ch = REPEATMASKER.out.gff.collect().flatten()
.map {
meta = [:]
meta['id'] = [id:it.getSimpleName()]
[ meta, it ]
}

liftoff_gff_ch = LIFTOFF_CLI.out.gff.collect().flatten()
.map {
meta = [:]
meta['id'] = [id:it.getSimpleName()]
[ meta, it ]
}

concat_gffs_ch = repeatmasker_gff_ch.join(liftoff_gff_ch) // meta.id, fasta, repeatmasker_gff, liftoff_gff

// concat gffs
CONCAT_GFFS (
params.ref_gff_path,
REPEATMASKER.out.gff,
LIFTOFF_CLI.out.gff,
concat_gffs_ch,
fasta
)

emit:
fasta = LIFTOFF_CLI.out.fasta
gff = CONCAT_GFFS.out.gff
}
}
2 changes: 0 additions & 2 deletions subworkflows/local/submission.nf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ workflow INITIAL_SUBMISSION {
// submit the files to database of choice (after fixing config and getting wait time)
if ( params.genbank && params.sra ){ // genbank and sra
// submit the files to database of choice (after fixing config and getting wait time)
submission_ch.view()
SUBMISSION_FULL ( submission_ch, submission_config )

// actual process to initiate wait
Expand All @@ -39,7 +38,6 @@ workflow INITIAL_SUBMISSION {
.map {
it -> [it[0], it[1], it[3], it[4]]
}
submission_ch.view()
SUBMISSION_SRA ( submission_ch, submission_config )

// actual process to initiate wait
Expand Down

0 comments on commit 90d79ec

Please sign in to comment.