fix issue in concat_gffs script that was truncating IDs with # in them
Jessica Rowell authored and Jessica Rowell committed Apr 8, 2024
1 parent 3aed3b8 commit da9e742
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions bin/repeatmasker_liftoff.py
@@ -23,11 +23,22 @@ def get_args():
    parser.add_argument("--refgff", type=str, help="Reference GFF to gather the ITR attributes and sample ID \n", required=True)
    parser.add_argument("--fasta", type=str, help="FASTA file for sample \n", required=True)
    parser.add_argument("--outdir", type=str, default=".", help="Output directory, default is current directory")
    parser.add_argument("--sample_name", type=str, default=".", help="Sample name")

    args = parser.parse_args()

    return args

def count_rows_starting_with_comment(file_path):
    count = 0
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('#'):
                count += 1
            else:
                break  # Stop counting once a line is encountered that doesn't start with '#'
    return count

def annotation_main():
    """ Main function for calling the annotation transfer pipeline
    """
@@ -44,9 +55,10 @@ def annotation_main():
    headerList = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

    #####GATHER REF INFO#####

    #load in repeatmasker gff skip commented lines that don't belong in dataframe
    ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
    #ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, comment='#')
    ref_gff = pd.read_csv(args.refgff, delimiter='\t', skip_blank_lines=True, names=headerList, skiprows=count_rows_starting_with_comment(args.refgff))

    #gather ref sample id
    ref_id=ref_gff['seq_id'][0]
    #gather index of attributes for first and second ITRs; needed for repeatmasker ITR attributes
@@ -63,18 +75,18 @@
    #samp_name=repMannotation_prep.sample_info()[0]
    #repMannotation_prep.repM_prep_main()

    LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, samp_name, args.outdir)
    LOannotation_prep=Liftoff_Annotations(args.liftoff_gff, headerList, args.sample_name, args.outdir)
    #LOannotation_prep.LO_prep_main()
    #repMannotation_prep.sample_info()
    new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, samp_name, args.outdir)
    new_gff=concat_gffs(args.liftoff_gff, repMannotation_prep.repM_prep_main(), LOannotation_prep.LO_prep_main(), ref_id, args.sample_name, args.outdir)

    new_gff.concat_LO_RM()

    #####CREATE TBL FILE#####
    main_util=MainUtility()
    main_util.gff2tbl(
        samp_name=samp_name,
        gff_loc=f"{args.outdir}/{samp_name}_reformatted.gff",
        gff_loc=f"{args.outdir}/{args.sample_name}_reformatted.gff",
        tbl_output=f"{args.outdir}/"
    )

@@ -121,7 +133,7 @@ def sample_info(self):

    def cleanup_repeat_masker_gff(self):
        #load in repeatmasker gff skip the first two lines that don't belong in dataframe
        rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
        rem_gff = pd.read_csv(self.repeatMGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.repeatMGFF))
        #correct repeat region labels; repeatmasker labels repeat regions as dispersed_repeat
        rem_gff['type'] = rem_gff['type'].replace({'dispersed_repeat': 'repeat_region'}, regex=True)

@@ -213,7 +225,7 @@ def LO_prep_main(self):
        fields_to_drop = ['coverage', 'sequence_ID', 'matches_ref_protein', 'valid_ORF', 'valid_ORFs', 'extra_copy_number',
                          'copy_num_ID', 'pseudogene', 'partial_mapping', 'low_identity']
        #load in liftoff gff with same headers as Repeatmasker and skip commented lines that don't belong in the dataframe
        lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, comment='#')
        lo_gff = pd.read_csv(self.liftoffGFF, delimiter='\t', skip_blank_lines=True, names=self.headerList, skiprows=count_rows_starting_with_comment(self.liftoffGFF))

        #run function to find and drop fields in attributes
        lo_gff['attributes']=lo_gff['attributes'].apply(lambda row : self.fix_attributes(fields_to_drop, row))
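Why the fix works: pandas treats comment='#' as "ignore the rest of any line after a '#'", so a GFF record whose ID attribute contains '#' (as in the IDs this commit is about) was being cut off mid-record, not just in the header. Counting the leading comment lines and passing that count to skiprows removes only the GFF header. Below is a minimal, self-contained sketch of the before/after behaviour; the GFF record is invented for illustration, and the helper is adapted to read from an in-memory buffer instead of a file path.

import io

import pandas as pd

header_list = ['seq_id', 'source', 'type', 'start', 'end',
               'score', 'strand', 'phase', 'attributes']

# One header line plus one data row whose ID attribute contains '#'.
gff_text = (
    "##gff-version 3\n"
    "chr1\tLiftoff\tgene\t1\t100\t.\t+\t.\tID=gene#1;Name=demo\n"
)

def count_rows_starting_with_comment(handle):
    # Count only the leading '#' header lines; stop at the first data row.
    count = 0
    for line in handle:
        if line.startswith('#'):
            count += 1
        else:
            break
    return count

# Old behaviour: comment='#' also truncates the data row at the '#' in the ID.
old = pd.read_csv(io.StringIO(gff_text), delimiter='\t',
                  names=header_list, comment='#')
print(old['attributes'][0])   # ID=gene

# New behaviour: skip only the leading header lines, keep in-record '#'.
n_header = count_rows_starting_with_comment(io.StringIO(gff_text))
new = pd.read_csv(io.StringIO(gff_text), delimiter='\t',
                  names=header_list, skiprows=n_header)
print(new['attributes'][0])   # ID=gene#1;Name=demo

The same comment='#' to skiprows swap is applied to all three read_csv calls in the script (reference, RepeatMasker, and Liftoff GFFs).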
