Merge pull request #194 from CDCgov/test_data_variola_update_kao

Test data variola update kao
CDCgov · Apr 9, 2024 · 90d79ec · 90d79ec
2 parents 9ae71eb + da9e742
commit 90d79ec
Show file tree

Hide file tree

Showing 16 changed files with 6,245 additions and 21 deletions.
diff --git a/assets/sample_fastas/variola/NC_001611.fasta b/assets/sample_fastas/variola/NC_001611.fasta
diff --git a/assets/sample_fastas/variola/PP405578.fasta b/assets/sample_fastas/variola/PP405578.fasta
diff --git a/assets/sample_fastas/variola/VARV_RZ10_3587.fasta b/assets/sample_fastas/variola/VARV_RZ10_3587.fasta
diff --git a/assets/sample_fastas/variola/VARV_RZ10_3587_2.fasta b/assets/sample_fastas/variola/VARV_RZ10_3587_2.fasta
diff --git a/...stqs/variola/VARV_RZ10_3587_2_R1.fastq.gz → ...mple_fastqs/variola/NC_001611_R1.fastq.gz b/...stqs/variola/VARV_RZ10_3587_2_R1.fastq.gz → ...mple_fastqs/variola/NC_001611_R1.fastq.gz
diff --git a/...stqs/variola/VARV_RZ10_3587_2_R2.fastq.gz → ...mple_fastqs/variola/NC_001611_R2.fastq.gz b/...stqs/variola/VARV_RZ10_3587_2_R2.fastq.gz → ...mple_fastqs/variola/NC_001611_R2.fastq.gz
diff --git a/...fastqs/variola/VARV_RZ10_3587_R1.fastq.gz → ...ample_fastqs/variola/PP405578_R1.fastq.gz b/...fastqs/variola/VARV_RZ10_3587_R1.fastq.gz → ...ample_fastqs/variola/PP405578_R1.fastq.gz
diff --git a/...fastqs/variola/VARV_RZ10_3587_R2.fastq.gz → ...ample_fastqs/variola/PP405578_R2.fastq.gz b/...fastqs/variola/VARV_RZ10_3587_R2.fastq.gz → ...ample_fastqs/variola/PP405578_R2.fastq.gz
diff --git a/assets/sample_metadata/VARV_metadata_Sample_Run_1.xlsx b/assets/sample_metadata/VARV_metadata_Sample_Run_1.xlsx
diff --git a/assets/sample_metadata/~$VARV_metadata_Sample_Run_1.xlsx b/assets/sample_metadata/~$VARV_metadata_Sample_Run_1.xlsx
diff --git a/assets/sample_metadata/~$custom_fields_MPXV_metadata_Sample_Run_1.xlsx b/assets/sample_metadata/~$custom_fields_MPXV_metadata_Sample_Run_1.xlsx
diff --git a/bin/repeatmasker_liftoff.py b/bin/repeatmasker_liftoff.py
@@ -23,11 +23,22 @@ def get_args():
     parser.add_argument("--refgff", type=str, help="Reference GFF to gather the ITR attributes and sample ID \n", required=True)
     parser.add_argument("--fasta", type=str, help="FASTA file for sample \n", required=True)
     parser.add_argument("--outdir", type=str, default=".", help="Output directory, defualt is current directory")
+    parser.add_argument("--sample_name", type=str, default=".", help="Sample name")
 
     args = parser.parse_args()
 
     return args
 
+def count_rows_starting_with_comment(file_path):  # Count the unbroken run of '#'-prefixed lines at the top of the text file at file_path.
+    count = 0  # running total of leading '#' lines seen so far
+    with open(file_path, 'r') as file:  # text mode; the handle is closed when the with-block exits
+        for line in file:
+            if line.startswith('#'):  # only a '#' in the first column counts; an indented '#' does not
+                count += 1
+            else:
+                break  # Stop at the first line that doesn't start with '#'; any later '#' lines are not counted
+    return count  # 0 for an empty file or one whose first line is not '#'-prefixed
+
 def annotation_main():
     """ Main function for calling the annotation transfer pipeline
     """
@@ -44,9 +55,10 @@ def annotation_main():
     headerList = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
 
     #####GATHER REF INFO#####
-
     #load in repeatmasker gff skip commented lines that dont belong in dataframe
-    ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
+    #ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
+    ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, skiprows=count_rows_starting_with_comment(args.refgff))
+
     #gather ref sample id
     ref_id=ref_gff['seq_id'][0]
     #gather index of attributes for first and second ITRs; needed for repeatmasker ITR attributes
@@ -63,18 +75,18 @@ def annotation_main():
     #samp_name=repMannotation_prep.sample_info()[0]
     #repMannotation_prep.repM_prep_main()
 
-    LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, samp_name, args.outdir)
+    LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, args.sample_name, args.outdir)
     #LOannotation_prep.LO_prep_main()
     #repMannotation_prep.sample_info()
-    new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, samp_name, args.outdir)
+    new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, args.sample_name, args.outdir)
 
     new_gff.concat_LO_RM()
 
     #####CREATE TBL FILE#####
     main_util=MainUtility()
     main_util.gff2tbl(
         samp_name=samp_name,
-        gff_loc=f"{args.outdir}/{samp_name}_reformatted.gff",
+        gff_loc=f"{args.outdir}/{args.sample_name}_reformatted.gff",
         tbl_output=f"{args.outdir}/"
     )
 
@@ -121,7 +133,7 @@ def sample_info(self):
 
     def cleanup_repeat_masker_gff(self):
         #load in repeatmasker gff skip the first two lines that dont belong in dataframe
-        rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
+        rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.repeatMGFF))
         #correct repeat region labels; repeatmasker labels repeat regions as dispersed_repeat 
         rem_gff['type'] = rem_gff['type'].replace({'dispersed_repeat': 'repeat_region'}, regex=True)
 
@@ -213,7 +225,7 @@ def LO_prep_main(self):
         fields_to_drop = ['coverage', 'sequence_ID', 'matches_ref_protein', 'valid_ORF', 'valid_ORFs', 'extra_copy_number',
                               'copy_num_ID', 'pseudogene', 'partial_mapping', 'low_identity']
         #load in liftoff gff with same headers as Repeatmasker and skip commented lines at dont belong to dataframe
-        lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
+        lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.liftoffGFF))
 
         #run function to find and drop fields in attributes
         lo_gff['attributes']=lo_gff['attributes'].apply(lambda row : self.fix_attributes(fields_to_drop, row))

diff --git a/modules/local/concat_gffs/main.nf b/modules/local/concat_gffs/main.nf
@@ -21,13 +21,15 @@ process CONCAT_GFFS {
 
 	input:
 	path ref_gff_path
-	path repeatmasker_gff
-    path liftoff_gff
+	//path repeatmasker_gff
+    //path liftoff_gff
+    tuple val(meta), path(repeatmasker_gff), path(liftoff_gff)
 	tuple val(meta), path(fasta_path), path(fastq_1), path(fastq_2)
 
 	script:
 	"""
-	repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path   
+    echo "repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path  --sample_name $meta.id"
+	repeatmasker_liftoff.py --repeatm_gff $repeatmasker_gff --liftoff_gff $liftoff_gff --refgff $ref_gff_path --fasta $fasta_path  --sample_name $meta.id
 	"""
 
 	output:

diff --git a/modules/local/liftoff_cli_annotation/main.nf b/modules/local/liftoff_cli_annotation/main.nf
@@ -12,7 +12,7 @@ process LIFTOFF_CLI {
         'https://depot.galaxyproject.org/singularity/liftoff:1.6.3--pyhdfd78af_0' :
         'quay.io/biocontainers/liftoff:1.6.3--pyhdfd78af_0'}"
 
-    publishDir "$params.output_dir/repeatmasker_liftoff_outputs", mode: "copy", overwrite: params.overwrite_output,
+    publishDir "$params.output_dir/liftoff", mode: "copy", overwrite: params.overwrite_output,
         saveAs: { filename ->
                       filename.indexOf('.fasta') > 0 ? "fasta/${filename}":
                       filename.indexOf('.txt') > 0 ? "errors/${filename}":
@@ -26,7 +26,7 @@ process LIFTOFF_CLI {
 
 	script:
     """
-    liftoff -g $ref_gff_path -o ${fasta.baseName}_liftoff-orig.gff \
+    liftoff -g $ref_gff_path -o ${fasta.baseName}.liftoff-orig.gff \
     -u $params.lift_unmapped_features_file_name \
     -a $params.lift_coverage_threshold -s $params.lift_child_feature_align_threshold \
     -d $params.lift_distance_scaling_factor -flank $params.lift_flank -p $params.lift_parallel_processes \

diff --git a/subworkflows/local/repeatmasker_liftoff.nf b/subworkflows/local/repeatmasker_liftoff.nf
@@ -27,15 +27,31 @@ workflow REPEATMASKER_LIFTOFF {
             params.ref_fasta_path, 
             params.ref_gff_path 
         )
+
+        repeatmasker_gff_ch = REPEATMASKER.out.gff.collect().flatten()
+                .map { 
+                    meta = [:] 
+                    meta['id'] = [id:it.getSimpleName()] 
+                    [ meta, it ] 
+                }
+
+        liftoff_gff_ch = LIFTOFF_CLI.out.gff.collect().flatten()
+                .map { 
+                    meta = [:] 
+                    meta['id'] = [id:it.getSimpleName()] 
+                    [ meta, it ] 
+                }
+
+        concat_gffs_ch = repeatmasker_gff_ch.join(liftoff_gff_ch) // meta.id, fasta, repeatmasker_gff, liftoff_gff
+
         // concat gffs
         CONCAT_GFFS (
            params.ref_gff_path,
-           REPEATMASKER.out.gff,
-           LIFTOFF_CLI.out.gff,
+           concat_gffs_ch,
            fasta
         )
 
     emit:
         fasta = LIFTOFF_CLI.out.fasta
         gff = CONCAT_GFFS.out.gff
-}
+}
diff --git a/subworkflows/local/submission.nf b/subworkflows/local/submission.nf
@@ -23,7 +23,6 @@ workflow INITIAL_SUBMISSION {
         // submit the files to database of choice (after fixing config and getting wait time)
         if ( params.genbank && params.sra ){ // genbank and sra
             // submit the files to database of choice (after fixing config and getting wait time)
-            submission_ch.view()
             SUBMISSION_FULL ( submission_ch, submission_config )
 
             // actual process to initiate wait 
@@ -39,7 +38,6 @@ workflow INITIAL_SUBMISSION {
                 .map { 
                     it -> [it[0], it[1], it[3], it[4]] 
                 }
-            submission_ch.view()
             SUBMISSION_SRA ( submission_ch, submission_config )
 
             // actual process to initiate wait