fix issue in concat_gffs script that was truncating IDs with # in them
Jessica Rowell authored and Jessica Rowell committed Apr 8, 2024
1 parent 3aed3b8 commit da9e742
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions bin/repeatmasker_liftoff.py
@@ -23,11 +23,22 @@ def get_args():
    parser.add_argument("--refgff", type=str, help="Reference GFF to gather the ITR attributes and sample ID \n", required=True)
    parser.add_argument("--fasta", type=str, help="FASTA file for sample \n", required=True)
    parser.add_argument("--outdir", type=str, default=".", help="Output directory, default is current directory")
    parser.add_argument("--sample_name", type=str, default=".", help="Sample name")

    args = parser.parse_args()

    return args

def count_rows_starting_with_comment(file_path):
    count = 0
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                count += 1
            else:
                break  # Stop counting once a line is encountered that doesn't start with '#'
    return count

def annotation_main():
    """ Main function for calling the annotation transfer pipeline
    """
@@ -44,9 +55,10 @@ def annotation_main():
    headerList = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

    #####GATHER REF INFO#####

    #load in repeatmasker gff skip commented lines that don't belong in dataframe
    ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
    #ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
    ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, skiprows=count_rows_starting_with_comment(args.refgff))

    #gather ref sample id
    ref_id=ref_gff['seq_id'][0]
    #gather index of attributes for first and second ITRs; needed for repeatmasker ITR attributes
@@ -63,18 +75,18 @@
    #samp_name=repMannotation_prep.sample_info()[0]
    #repMannotation_prep.repM_prep_main()

    LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, samp_name, args.outdir)
    LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, args.sample_name, args.outdir)
    #LOannotation_prep.LO_prep_main()
    #repMannotation_prep.sample_info()
    new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, samp_name, args.outdir)
    new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, args.sample_name, args.outdir)

    new_gff.concat_LO_RM()

    #####CREATE TBL FILE#####
    main_util=MainUtility()
    main_util.gff2tbl(
        samp_name=samp_name,
        gff_loc=f"{args.outdir}/{samp_name}_reformatted.gff",
        gff_loc=f"{args.outdir}/{args.sample_name}_reformatted.gff",
        tbl_output=f"{args.outdir}/"
    )

@@ -121,7 +133,7 @@ def sample_info(self):

    def cleanup_repeat_masker_gff(self):
        #load in repeatmasker gff skip the first two lines that don't belong in dataframe
        rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
        rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.repeatMGFF))
        #correct repeat region labels; repeatmasker labels repeat regions as dispersed_repeat
        rem_gff['type'] = rem_gff['type'].replace({'dispersed_repeat': 'repeat_region'}, regex=True)

@@ -213,7 +225,7 @@ def LO_prep_main(self):
        fields_to_drop = ['coverage', 'sequence_ID', 'matches_ref_protein', 'valid_ORF', 'valid_ORFs', 'extra_copy_number',
                          'copy_num_ID', 'pseudogene', 'partial_mapping', 'low_identity']
        #load in liftoff gff with same headers as Repeatmasker and skip commented lines that don't belong in the dataframe
        lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
        lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.liftoffGFF))

        #run function to find and drop fields in attributes
        lo_gff['attributes']=lo_gff['attributes'].apply(lambda row : self.fix_attributes(fields_to_drop, row))
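Why the fix works: pandas treats comment='#' as "ignore the rest of any line after a '#'", so a GFF record whose ID attribute contains '#' (as in the IDs this commit is about) was being cut off mid-record, not just in the header. Counting the leading comment lines and passing that count to skiprows removes only the GFF header. Below is a minimal, self-contained sketch of the before/after behaviour; the GFF record is invented for illustration, and the helper is adapted to read from an in-memory buffer instead of a file path.

import io

import pandas as pd

header_list = ['seq_id', 'source', 'type', 'start', 'end',
               'score', 'strand', 'phase', 'attributes']

# One header line plus one data row whose ID attribute contains '#'.
gff_text = (
    "##gff-version 3\n"
    "chr1\tLiftoff\tgene\t1\t100\t.\t+\t.\tID=gene#1;Name=demo\n"
)

def count_rows_starting_with_comment(handle):
    # Count only the leading '#' header lines; stop at the first data row.
    count = 0
    for line in handle:
        if line.startswith('#'):
            count += 1
        else:
            break
    return count

# Old behaviour: comment='#' also truncates the data row at the '#' in the ID.
old = pd.read_csv(io.StringIO(gff_text), delimiter='\t',
                  names=header_list, comment='#')
print(old['attributes'][0])   # ID=gene

# New behaviour: skip only the leading header lines, keep in-record '#'.
n_header = count_rows_starting_with_comment(io.StringIO(gff_text))
new = pd.read_csv(io.StringIO(gff_text), delimiter='\t',
                  names=header_list, skiprows=n_header)
print(new['attributes'][0])   # ID=gene#1;Name=demo

The same comment='#' to skiprows swap is applied to all three read_csv calls in the script (reference, RepeatMasker, and Liftoff GFFs).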
