From 03701e4345f620326657cc01395d177c0dcb2c7e Mon Sep 17 00:00:00 2001
From: Modupe <madetunj@stjude.org>
Date: Tue, 16 Apr 2024 16:48:52 -0500
Subject: [PATCH 1/5] syntax updates

---
 PEAKSANNO.py | 549 ---------------------------------------------------
 peaksanno.py | 373 ++++++++++++++++++++++++++++++++++
 2 files changed, 373 insertions(+), 549 deletions(-)
 delete mode 100755 PEAKSANNO.py
 create mode 100755 peaksanno.py
diff --git a/PEAKSANNO.py b/PEAKSANNO.py
deleted file mode 100755
index ef2f4fc..0000000
--- a/PEAKSANNO.py
+++ /dev/null
@@ -1,549 +0,0 @@
-#!/usr/bin/env python3
-'''
-========================  PEAKS ANNOTATION  ======================
-
-USAGE
-All inclusive script for Annotation of ChIP-Seq peaks.
-This is included in the SEASEQ pipeline.
-    ```
-    python PEAKSANNO.py -g <GTF/GFF/GFF3> -c <CHROM SIZES> -p <MACS PEAKS bed> -s <MACS SUMMITS bed>
-	```
-
-The script input requirements are :
-- GTF : GTF/GFF/GFF3 having either gene or transcript (prefered) annotation
-- CHROM SIZES : chrom.sizes file
-- MACS PEAKS bed : MACS peaks bed file
-- MACS SUMMITS bed : MACS peaks summit file
-
-Output files provided are :
-	1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__
-	2) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
-	3) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__
-	4) peaks overlapping windows in __*peaks_within_window.regions.txt*__
-	5) peaks identified in previous overlapping regions and comparison of
-           all regions in __*peaks_compared_regions.peaks.txt*__
-	6) genes identified in previous overlapping regions and comparison of
-           all regions in __*peaks_compared_regions.genes.txt*__
-	7) distribution graphs in __*peaks_compared_regions.distribution.pdf*__
-
-Definitions of annotation regions:
-	- TSS: TSS (transcription start site)
-	- promoters: 1kb +/- TSS
-	- window: 10kb upstream --> 3kb downstream of genebody
-	- genebody: 1kb upstream of TSS --> TES (transcription end site)
-
-Dependencies of script:
-	* bedtools
-	* python3
-
-==================================================================
-'''
-
-import os
-import sys
-import re
-import argparse
-import subprocess
-import matplotlib.pyplot as plt
-from pathlib import Path
-import collections
-from collections import defaultdict
-
-if not os.path.exists('annotation'):
-    os.makedirs('annotation')
-
-def parse_genelocations(chromz, results, up_dist, down_dist, tss=True):
-    """ Parse genomic regions
-    Args:
-        chromz (dict) : Chromosomal sizes
-        results (string) : Individual gene coordinates
-        up_dist (int) : genomic distance upstream / +
-        down_dist (int) : genomic distance downstream / -
-        tss (boolean) : TSS or entire gene (true if tss)
-    """
-
-    lines = results.split("\t")
-    lines[3] = int(lines[3])
-    lines[4] = int(lines[4])
-
-    if lines[6] == "+":
-        if tss:
-            end_region = 4
-        else:
-            end_region = 3
-
-        end_position = lines[end_region] + down_dist
-        start_position = lines[3] - up_dist
-
-    elif lines[6] == "-":
-        if tss:
-            end_region = 3
-        else:
-            end_region = 4
-
-        start_position = lines[end_region] - down_dist
-        end_position = lines[4] + up_dist
-
-    if start_position < 1:
-        start_position = 1
-
-    if end_position > int(chromz[lines[0]]):
-        end_position = chromz[lines[0]]
-
-    output_gff = ("{0}\t{1}\t{2}\t{3}\n".format("\t".join(lines[0:3]),
-                                                start_position, end_position, "\t".join(lines[5:])))
-    return output_gff
-
-
-def gtf_to_genes(chrom_sizes_file, gtf_file):
-    """ GTF to GENES
-    Args:
-        chrom_sizes_file (file) : Chromosomal sizes file
-        gtf_file (file) : GTF file
-    """
-    #initialize outputfiles
-    original_output = open('annotation/original_genes.gff', 'w')
-    promoters_output = open('annotation/promoter-region_genes.gff', 'w')
-    window_output = open('annotation/window-region_genes.gff', 'w')
-    tss_output = open('annotation/TSS-region_genes.gff', 'w')
-    genebody_output = open('annotation/genebody-region_genes.gff', 'w')
-
-    haschr = False #check 'chr' prefix in gtf file
-    
-    #read ucsc chrom sizes, gtf
-    chrom_sizes_input = open(chrom_sizes_file, 'r')
-    chrom_sizes_dict = {}
-    for line in chrom_sizes_input:
-        lines = line.split('\t')
-        chrom_sizes_dict[lines[0]] = lines[1].rstrip("\n")
-        if lines[0].startswith('chr'):
-            haschr = True
-
-    #feature = options.feature
-    feature_dict = {}
-
-    #reading the feature type to be used
-    gtf_input = open(gtf_file, 'r')
-    for line in gtf_input:
-        if not line.startswith('#'):
-            lines = line.split("\t")
-            feature_dict[lines[2]] = lines[2]
-
-    if 'transcript' in feature_dict:
-        feature = "transcript"
-    elif 'gene' in feature_dict:
-        feature = "gene"
-    else:
-        sys.exit("ERROR :\tGTF/GFF with either transcript/gene annotation is needed")
-    print("NOTE :\tFeature type used is '%s'" %(feature))
-
-    #organize gtf file
-    gtf_input = open(gtf_file, 'r')
-    for line in gtf_input:
-        if not line.startswith('#'):
-            lines = line.rstrip("\n").split("\t")
-            if haschr and not lines[0].startswith('chr'):
-                lines[0] = "chr"+lines[0]
-            elif not haschr and lines[0].startswith('chr'):
-                lines[0] = lines[0][3:]
-            if lines[2] == feature:
-                newline = lines[8].split(';')
-                
-                if gtf_file.split('.')[-1] == 'gff' or gtf_file.split('.')[-1] == 'gff3':
-                    if re.search(";transcript_id", lines[8]):
-                        transcript_id = [s for s in newline if "transcript_id=" in s][0]
-                    else:
-                        transcript_id = newline[0]
-                    if re.search(";gene_name", lines[8]):
-                        gene_name = [s for s in newline if "gene_name=" in s][0]
-                    else:
-                        gene_name = newline[0]
-                    transcript_id = re.sub('[\"\;]', '', transcript_id.split('=')[-1]) #clean transcript_id
-                    gene_name = re.sub('[\"\;]', '', gene_name.split('=')[-1]) #clean gene_name
-                elif gtf_file.split('.')[-1] == 'gtf':
-                    if re.search("; transcript_id", lines[8]):
-                        transcript_id = [s for s in newline if "transcript_id " in s][0]
-                    else:
-                        transcript_id = newline[0]
-                    if re.search("; gene_name", lines[8]):
-                        gene_name = [s for s in newline if "gene_name " in s][0]
-                    else:
-                        gene_name = newline[0]
-                    transcript_id = re.sub('[\"\;]', '', transcript_id.split(' ')[-1]) #clean transcript_id
-                    gene_name = re.sub('[\"\;]', '', gene_name.split(' ')[-1]) #clean gene_name
-                    
-                results = ("{0}\t{1}\t{2}\t{3}".format(lines[0],
-                                                       "\t".join(lines[1:8]),
-                                                       gene_name, transcript_id))
-
-                try:
-                    if chrom_sizes_dict[lines[0]]:
-                        original_output.write(results + "\n")
-                        final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 1000, False)
-                        promoters_output.write(final_output)
-                        final_output = parse_genelocations(chrom_sizes_dict, results, 10000, 3000, True)
-                        window_output.write(final_output)
-                        final_output = parse_genelocations(chrom_sizes_dict, results, 0, 0, False)
-                        tss_output.write(final_output)
-                        final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 0, True)
-                        genebody_output.write(final_output)
-                except KeyError:
-                    continue
-
-
-def annotate_regions(region_file, output_file, macs_summits_file=None):
-    """ Annotate Peaks based on genomic features
-    Args:
-        region_file (file) : Region file
-        output_file (string) : Output file name
-        macs_summits_file (file) : MACS summits file
-    """
-
-    #initialize chromosomal positions based on gff provided
-    opt_chrom = 0
-    opt_start = 1
-    opt_stop = 2
-    opt_peakid = 3
-    opt_score = 4
-    opt_none = 6
-    opt_gene = 8
-    opt_transcript = 9
-
-    summits_dict = {}
-    if macs_summits_file:
-        #read summits file
-        summits_input = open(macs_summits_file, 'r')
-        for line in summits_input:
-            line = line.strip('\n').split('\t')
-            string_identifier = line[opt_start], line[opt_stop]
-            summits_dict[line[opt_peakid]] = string_identifier
-
-    #read intersectbed file
-    intersect_input = open(region_file, 'r')
-    peaks_dict = {}
-    peaks_score = {}
-    gene_dict = {}
-    transcript_dict = {}
-    peak_region = []
-    for line in intersect_input:
-        line = line.strip('\n').split('\t')
-        #print(line[opt_none],"\n")
-        if int(line[opt_none]) != -1:
-            string_identifier = line[opt_chrom], line[opt_start], line[opt_stop]
-            peaks_dict[string_identifier] = line[opt_peakid]
-            peaks_score[line[opt_peakid]] = line[opt_score]
-            peak_region.append(string_identifier)
-
-            if string_identifier in gene_dict:
-                gene_dict[string_identifier] = ("{0},{1}".format(gene_dict[string_identifier],
-                                                                 line[opt_gene]))
-                transcript_dict[string_identifier] = ("{0},{1}".
-                                                      format(transcript_dict[string_identifier],
-                                                             line[opt_transcript]))
-            else:
-                gene_dict[string_identifier] = line[opt_gene]
-                transcript_dict[string_identifier] = line[opt_transcript]
-
-    #each peak id is unique
-    unique_peak_region = []
-    for  eachpeak_region in peak_region:
-        if eachpeak_region not in unique_peak_region:
-            unique_peak_region.append(eachpeak_region)
-
-    #write to output file
-    final_output = open(output_file, 'w')
-
-    if macs_summits_file:
-        final_output.write("peak_id\tchrom\tpeak_start\tpeak_end\t"
-                           "summit_start\tsummit_end\tgene_name\t"
-                           "transcript_id\tpeak_score\n")
-
-        #print regions identified
-        for chromregion in unique_peak_region:
-            peakid = peaks_dict[chromregion]
-            final_output.write("%s\t%s\t%s\t%s\t%s\t%s\n"
-                               %(peakid, '\t'.join(chromregion), '\t'.join(summits_dict[peakid]),
-                                 ','.join(unique(gene_dict[chromregion])),
-                                 ','.join(unique(transcript_dict[chromregion])),
-                                 peaks_score[peakid]))
-    else:
-        final_output.write("peak_id\tchrom\tpeak_start\tpeak_end\t"
-                           "gene_name\ttranscript_id\tpeak_score\n")
-
-        #print regions identified
-        for chromregion in unique_peak_region:
-            peakid = peaks_dict[chromregion]
-            final_output.write("%s\t%s\t%s\t%s\t%s\n"
-                               %(peakid, '\t'.join(chromregion),
-                                 ','.join(unique(gene_dict[chromregion])),
-                                 ','.join(unique(transcript_dict[chromregion])),
-                                 peaks_score[peakid]))
-
-def unique(list1):
-    """ Get unique list
-    Args:
-        list1 (list) : list of variable separated by comma ','
-
-    """
-
-    # intilize a null list
-    unique_list = []
-
-    # traverse for all elements
-    for variable in list1.split(","):
-        if variable not in unique_list:
-            unique_list.append(variable)
-    return unique_list
-
-
-def include_genes(macs_peaks_file, file_name):
-    """ Annotate Genes in peak regions
-    Args:
-        macs_peaks_file (file) : MACS peaks file
-        output_file (string) : Output file name
-    """
-    peaks_input = open(macs_peaks_file, 'r')
-    peaks_chr_dict = {}
-    peaks_count = 0
-    for line in peaks_input:
-        line = line.strip('\n').split('\t')
-        peaks_count += 1
-        peaks_chr_dict[line[3]] = (line[0:3])
-
-    #reading the gtf genes file
-    gene_names_dict = {}
-    gene_names_input = open("tempPrefix.genes.names", 'r')
-    for line in gene_names_input:
-        line = line.strip('\n').split('\t')
-        gene_names_dict[(line[0], line[1])] = line[0]
-
-    #reading the genomic regions files
-    region_names_list = []
-    region_dict = defaultdict(dict)
-    peaks_id_dict = defaultdict(dict)
-    for region_prefix in ("promoter", "genebody", "window", "closest"):
-        region = region_prefix + ".tempPrefix"
-        region_name = Path(region).stem
-        region_names_list.append(region_name.upper())
-        region_input = open(region, 'r')
-        for line in region_input:
-            line = line.strip('\n').split('\t')
-            region_dict[region_name.upper()][(line[8], line[9])] = (line[8], line[9])
-            peaks_id_dict[region_name.upper()][line[3]] = (line[0:3])
-    peaks_id_list = peaks_chr_dict.keys()
-    peaks_id_list = sorted(peaks_id_list, key=lambda item:
-                           (int(item[10:]) if item[10:].isdigit()
-                            else print(item[10:], type(item[10:])), item))
-
-    #order regions and gene names
-    region_names_list.sort()
-    orderedgene_names_dict = collections.OrderedDict(sorted(gene_names_dict.items()))
-
-    #barcoding the identified regions
-    barcoderegions_dict = {}
-    master_generegion_dict = defaultdict(dict)
-    for eachgene in orderedgene_names_dict:
-        for eachkey in region_names_list:
-            found = 0
-            if eachgene in region_dict[eachkey]:
-                found = 1
-            master_generegion_dict[eachkey][eachgene] = found
-            if eachgene in barcoderegions_dict:
-                barcoderegions_dict[eachgene] += str(found)
-            else:
-                barcoderegions_dict[eachgene] = str(found)
-
-    barcodepeaks_id_dict = {}
-    master_peaksid_dict = defaultdict(dict)
-    for eachpeaks_id in peaks_id_list:
-        for eachkey in region_names_list:
-            found = 0
-            #print(eachpeaks_id)
-            if eachpeaks_id in peaks_id_dict[eachkey]:
-                found = 1
-                #print("yes")
-            master_peaksid_dict[eachkey][eachpeaks_id] = found
-            if eachpeaks_id in barcodepeaks_id_dict:
-                barcodepeaks_id_dict[eachpeaks_id] += str(found)
-            else:
-                barcodepeaks_id_dict[eachpeaks_id] = str(found)
-
-    #write to output file
-    gene_file_name = file_name.replace('.txt', '.genes.txt')
-
-    final_output = open(gene_file_name, 'w')
-    final_output.write("GeneName\tTranscript_ID\tBARCODE\t%s\tEVER\n"
-                       %('\t'.join(region_names_list)))
-
-    for eachgene in orderedgene_names_dict:
-        string_identifier = [barcoderegions_dict[eachgene]]
-        found_sum = 0
-        for eachkey in region_names_list:
-            string_identifier.append(str(master_generegion_dict[eachkey][eachgene]))
-            found_sum += master_generegion_dict[eachkey][eachgene]
-        if found_sum >= 1:
-            string_identifier.append("1")
-        else:
-            string_identifier.append("0")
-
-        final_output.write("%s\t%s\n" %('\t'.join(eachgene), '\t'.join(string_identifier)))
-
-    peaks_file_name = file_name.replace('.txt', '.peaks.txt')
-    pdf_file_name = file_name.replace('.txt', '.distribution.pdf')
-    final_output_peaks = open(peaks_file_name, 'w')
-    final_output_peaks.write("chrom\tstart\tstop\tpeak_id\tBARCODE\t%s\n"
-                             %('\t'.join(region_names_list)))
-    for eachpeaks_id in peaks_id_list:
-        string_identifier = [barcodepeaks_id_dict[eachpeaks_id]]
-        for eachkey in region_names_list:
-            string_identifier.append(str(master_peaksid_dict[eachkey][eachpeaks_id]))
-
-        final_output_peaks.write("%s\t%s\t%s\n"
-                                 %('\t'.join(peaks_chr_dict[eachpeaks_id]),
-                                   eachpeaks_id, '\t'.join(string_identifier)))
-
-    #the bar plot
-    region_count_dict = {}
-    #for eachregion in region_count_dict:
-    region_count_dict["Promoter\n(1kb up/down TSS)"] = round(
-        len(peaks_id_dict["PROMOTER"])*100/peaks_count)
-    region_count_dict["Genebody\n(1kb up TSS/TES)"] = round(
-        len(peaks_id_dict["GENEBODY"])*100/peaks_count)
-    region_count_dict["Window\n(10kb up TSS/3kb down TES)"] = round(
-        len(peaks_id_dict["WINDOW"])*100/peaks_count)
-
-    rects = plt.bar(region_count_dict.keys(), region_count_dict.values(), width=0.4)
-    axes = plt.gca()
-    axes.set_ylim([0, 100])
-    peaks_title = Path(macs_peaks_file).stem
-    plt.title(peaks_title.split('.sorted')[0])
-    plt.ylabel("% of peaks")
-
-    #label the bars
-    xpos = 'center'
-    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
-    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off
-
-    for rect in rects:
-        height = rect.get_height()
-        plt.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
-                 '{}%'.format(height), ha=ha[xpos], va='bottom')
-
-    #save figure
-    plt.savefig(pdf_file_name)
-
-    for eachregion in ("PROMOTER", "GENEBODY", "WINDOW"):
-        print("Total number of %s = %d" %(eachregion, len(peaks_id_dict[eachregion])))
-        print("Percentage of %s = %.3f" %(eachregion,
-                                          len(peaks_id_dict[eachregion])*100/peaks_count))
-
-
-def main():
-    '''
-    Generate genomic coordinates of all promoters, window, genebody, TSS
-
-    Definitions: promoters: +/- 1kb of TSS (transcription start site)
-                 window: 10kb upstream --> 3kb downstream of genebody
-                 genebody: 1kb upstream of TSS --> TTS/TES (transcription termination/end site)
-                 TSS: TSS (transcription start site)
-
-    Annotate Peaks based on genomic features listed above
-
-    Comparison master files for all genes/transcripts
-        and their genomic regions within peaks called.
-    Genomic regions are specified based on regions provided,
-        default expectation are promoter, genebody, window, closest
-
-    '''
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-g", "--gtf", dest="gtf", required=True,
-                        help="Enter .gtf/gff file to be processed.")
-    parser.add_argument("-c", "--chrom", dest="chrom_sizes", required=True,
-                        help="Enter ucsc chrom sizes file to be processed.")
-    parser.add_argument("-p", "--peaks", dest="macs_peaks", required=True,
-                        help="Enter MACS peaks bed file.")
-    parser.add_argument("-s", "--summit", dest="macs_summits", required=False,
-                        help="Enter MACS summit bed file.")
-
-    options = parser.parse_args()
-    #print(options)
-
-    # Stage 1
-    # Extract genomic regions of interest
-    gtf_to_genes(options.chrom_sizes, options.gtf)
-
-    # Turn 4 column bed to 5 (for SICER peaks: include unique id)
-    command = "wc -l " + options.macs_peaks + " | awk -F' '  '{print $1}'"
-    checkcolumns = int(subprocess.check_output(command,shell=True).strip())
-    if checkcolumns > 0:
-        command = "awk -F'\\t' '{print NF; exit}' " + options.macs_peaks
-        numberofcolumns = int(subprocess.check_output(command,shell=True).strip())
-    else:
-        sys.exit("ERROR :\tNo peaks were identified\n")
-
-    command = "head -n 1 " + options.macs_peaks + ' | cut -f5 '
-    fifthcol_value = (str(subprocess.check_output(command,shell=True).strip()).split("'"))[1] #FDRisland from SICER produce a fifth empty column
-    
-    new_macs_peaks = options.macs_peaks
-
-    if numberofcolumns == 4 or len(fifthcol_value) < 1:
-        new_macs_peaks = options.macs_peaks + ".tempPrefix"
-        # command = "cat " + options.macs_peaks + ' | awk -F\\\\t ' + "'" + \
-        #           '{print $1 "\\t" $2 "\\t" $3 "\\t" $1":"$2"-"$3 "\\t" $4}' + \
-        #           "' > tempPrefix-new.macs_peaks.bed"
-        # print(command)
-        # os.system(command)
-        macs_peaks_output = open(new_macs_peaks, 'w')
-        macs_peaks_input = open(options.macs_peaks,'r')
-        macs_peaks_count = 1
-        for line in macs_peaks_input:
-            line = line.strip('\n').split('\t')
-            new_peaks_id = "SICERpeak_" + str(macs_peaks_count)
-            results = ("{0}\t{1}\t{2}\n".format("\t".join(line[0:3]), new_peaks_id, line[3]))
-            macs_peaks_output.write(results)
-            macs_peaks_count += 1
-        macs_peaks_output.close()
-        
-    # Convert TSS to compatible bedtools (closestBed) format and obtain center of peaks coordinate
-    command = "cut -f1,4,5,9,10 annotation/TSS-region_genes.gff | sort -k1,1 -k2,2n " + \
-              "> tempPrefix-sorted.TSS; " + \
-              "cat " + new_macs_peaks + ' | awk -F\\\\t ' + "'" + \
-              '{print $1 "\\t" int(($2+$3)/2) "\\t" int(($2+$3)/2) "\\t" $4 "\\t" $5}' + \
-              "' | sort -k1,1 -k2,2n > tempPrefix-centerofpeaks.bed;" + \
-              "closestBed -t all -a tempPrefix-centerofpeaks.bed -b tempPrefix-sorted.TSS" + \
-              "> closest.tempPrefix"
-    os.system(command)
-
-    # Stage 2
-    # Annotate peaks based on previously extracted genomic coordinates
-    for region in ("promoter", "genebody", "window"):
-        command = 'intersectBed -b ' + new_macs_peaks + ' -a annotation/' + region + \
-                  '-region_genes.gff -wb | cut -f1,4,5,9,10,11,12,13,14,15 |' + "awk -F\\\\t '" + \
-                  '{print $6 "\\t" $7 "\\t" $8 "\\t" $9 "\\t" $10 "\\t" $1 "\\t" $2 "\\t" $3 ' + \
-                  '"\\t" $4 "\\t" $5}' + \
-                  "' > " + region + ".tempPrefix"
-        os.system(command)
-
-    # Convert annotated regions to tab delimited file and include summit location
-    for region in ("promoter", "genebody", "window", "closest"):
-        region_file = region + ".tempPrefix"
-        output_file = "peaks_within_" + region + ".regions.txt"
-        if options.macs_summits:
-            annotate_regions(region_file, output_file, options.macs_summits)
-        else:
-            annotate_regions(region_file, output_file)
-
-    command = "mv peaks_within_closest.regions.txt centerofpeaks_closest.regions.txt; \
-              cut -f 9,10 annotation/original_genes.gff > tempPrefix.genes.names"
-    os.system(command)
-
-    # Stage 3
-    # Binary matrix of all genes
-    include_genes(new_macs_peaks, "peaks_compared_regions.txt")
-
-    command = "rm -rf *tempPrefix*"
-    #os.system(command)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/peaksanno.py b/peaksanno.py
new file mode 100755
index 0000000..7c0505e
--- /dev/null
+++ b/peaksanno.py
@@ -0,0 +1,373 @@
+#!/usr/bin/env python3
+'''
+========================  PEAKS ANNOTATION  ======================
+
+USAGE
+All inclusive script for Annotation of ChIP-Seq peaks.
+This is included in the SEASEQ pipeline.
+    ```
+    python PEAKSANNO.py -g <GTF/GFF/GFF3> -c <CHROM SIZES> -p <MACS PEAKS bed> -s <MACS SUMMITS bed>
+	```
+
+The script input requirements are :
+- GTF : GTF/GFF/GFF3 having either gene or transcript (prefered) annotation
+- CHROM SIZES : chrom.sizes file
+- MACS PEAKS bed : MACS peaks bed file
+- MACS SUMMITS bed : MACS peaks summit file
+
+Output files provided are :
+	1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__
+	2) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
+	3) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__
+	4) peaks overlapping windows in __*peaks_within_window.regions.txt*__
+	5) peaks identified in previous overlapping regions and comparison of
+           all regions in __*peaks_compared_regions.peaks.txt*__
+	6) genes identified in previous overlapping regions and comparison of
+           all regions in __*peaks_compared_regions.genes.txt*__
+	7) distribution graphs in __*peaks_compared_regions.distribution.pdf*__
+
+Definitions of annotation regions:
+	- TSS: TSS (transcription start site)
+	- promoters: 1kb +/- TSS
+	- window: 10kb upstream --> 3kb downstream of genebody
+	- genebody: 1kb upstream of TSS --> TES (transcription end site)
+
+Dependencies of script:
+	* bedtools
+	* python3
+
+==================================================================
+'''
+
+import os
+import re
+import argparse
+import subprocess
+import matplotlib.pyplot as plt
+from pathlib import Path
+import collections
+from collections import defaultdict
+
+if not os.path.exists('annotation'):
+    os.makedirs('annotation')
+
+def parse_genelocations(chromz, results, up_dist, down_dist, tss=True):
+    """ Parse genomic regions
+    Args:
+        chromz (dict) : Chromosomal sizes
+        results (string) : Individual gene coordinates
+        up_dist (int) : genomic distance upstream / +
+        down_dist (int) : genomic distance downstream / -
+        tss (boolean) : TSS or entire gene (true if tss)
+    """
+    lines = results.split("\t")
+    lines[3], lines[4] = int(lines[3]), int(lines[4])
+
+    if lines[6] == "+":
+        end_region = 4 if tss else 3
+        end_position = lines[end_region] + down_dist
+        start_position = lines[3] - up_dist
+    elif lines[6] == "-":
+        end_region = 3 if tss else 4
+        start_position = lines[end_region] - down_dist
+        end_position = lines[4] + up_dist
+
+    start_position = max(start_position, 1)
+    end_position = min(end_position, int(chromz[lines[0]]))
+
+    return "{0}\t{1}\t{2}\t{3}\n".format("\t".join(lines[0:3]), start_position, end_position, "\t".join(lines[5:]))
+
+
+def gtf_to_genes(chrom_sizes_file, gtf_file):
+    """ GTF to GENES
+    Args:
+        chrom_sizes_file (file) : Chromosomal sizes file
+        gtf_file (file) : GTF file
+    """
+    #read ucsc chrom sizes, gtf
+    chrom_sizes_dict = {line.split('\t')[0]: line.split('\t')[1].rstrip("\n") for line in open(chrom_sizes_file)}
+    haschr = any(chrom.startswith('chr') for chrom in chrom_sizes_dict)  #check 'chr' prefix in gtf file
+
+    feature_dict = {line.split("\t")[2]: line.split("\t")[2] for line in open(gtf_file) if not line.startswith('#')}
+
+    feature = "transcript" if 'transcript' in feature_dict else "gene"
+    print("NOTE :\tFeature type used is '%s'" % feature)
+
+    #parsing gene annotation file, GFF, GFF3, GTF
+    with open(gtf_file, 'r') as gtf_input:
+        for gtf_col in gtf_input:
+            if not gtf_col.startswith('#'):
+                gtf_col.split("\t")[0] = "chr" + gtf_col.split("\t")[0] if haschr and not gtf_col.split("\t")[0].startswith('chr') else gtf_col.split("\t")[0][3:] if not haschr and gtf_col.split("\t")[0].startswith('chr') else gtf_col.split("\t")[0]
+                if gtf_col.split("\t")[2] == feature:
+                    record = gtf_col.split("\t")[8].split(';')
+                    if gtf_file.split('.')[-1] in ['gff', 'gff3']:
+                        transcript_id = [s for s in record if "transcript_id" in s][0] if re.search(";transcript_id", gtf_col.split("\t")[8]) else record[0]
+                        gene_name = [s for s in record if "gene_name" in s][0] if re.search(";gene_name", gtf_col.split("\t")[8]) else record[0]
+                        transcript_id = re.sub('[\"\;]', '', transcript_id.split('=')[-1]) #clean transcript_id
+                        gene_name = re.sub('[\"\;]', '', gene_name.split('=')[-1]) #clean gene_name
+                    elif gtf_file.split('.')[-1] == 'gtf':
+                        transcript_id = [s for s in record if "transcript_id " in s][0] if re.search("; transcript_id", gtf_col.split("\t")[8]) else record[0]
+                        gene_name = [s for s in record if "gene_name " in s][0] if re.search("; gene_name", gtf_col.split("\t")[8]) else record[0]
+                        transcript_id = re.sub('[\"\;]', '', transcript_id.split(' ')[-1]) #clean transcript_id
+                        gene_name = re.sub('[\"\;]', '', gene_name.split(' ')[-1]) #clean gene_name
+
+                    results = "{0}\t{1}\t{2}\t{3}".format(gtf_col.split("\t")[0], "\t".join(gtf_col.split("\t")[1:8]), gene_name, transcript_id)
+
+                    try:
+                        if chrom_sizes_dict[gtf_col.split("\t")[0]]:
+                            with open('annotation/original_genes.gff', 'w') as original_output:
+                                original_output.write(results + "\n")
+                                final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 1000, False)
+                                for file_name, up, down, tss in [('promoter-region_genes.gff', 10000, 3000, True), ('window-region_genes.gff', 1000, 1000, False), ('TSS-region_genes.gff', 0, 0, False), ('genebody-region_genes.gff', 1000, 0, True)]:
+                                    with open(f'annotation/{file_name}', 'w') as file:
+                                        file.write(final_output)
+                    except KeyError:
+                        continue
+
+
+def annotate_regions(region_file, output_file, macs_summits_file=None):
+    """ Annotate Peaks based on genomic features
+    Args:
+        region_file (file) : Region file
+        output_file (string) : Output file name
+        macs_summits_file (file) : MACS summits file
+    """
+    #initialize chromosomal positions based on gff provided
+    opt_chrom, opt_start, opt_stop, opt_peakid, opt_score, opt_none, opt_gene, opt_transcript = 0, 1, 2, 3, 4, 6, 8, 9
+    summits_dict = {}
+    if macs_summits_file: #read summits file
+        with open(macs_summits_file, 'r') as summits_input:
+            summits_dict = {line.strip().split('\t')[opt_peakid]: (line.strip().split('\t')[opt_start], line.strip().split('\t')[opt_stop]) for line in summits_input}
+
+    #read intersectbed file
+    with open(region_file, 'r') as intersect_input, open(output_file, 'w') as final_output:
+        peaks_dict, peaks_score, gene_dict, transcript_dict, peak_region = {}, {}, {}, {}, []
+        for line in intersect_input:
+            line = line.strip().split('\t')
+            if int(line[opt_none]) != -1:
+                string_identifier = tuple(line[i] for i in [opt_chrom, opt_start, opt_stop])
+                peaks_dict[string_identifier] = line[opt_peakid]
+                peaks_score[line[opt_peakid]] = line[opt_score]
+                peak_region.append(string_identifier)
+                gene_dict[string_identifier] = gene_dict.get(string_identifier, "") + line[opt_gene]
+                transcript_dict[string_identifier] = transcript_dict.get(string_identifier, "") + line[opt_transcript]
+
+        unique_peak_region = list(set(peak_region))
+
+        header = "peak_id\tchrom\tpeak_start\tpeak_end\tsummit_start\tsummit_end\tgene_name\ttranscript_id\tpeak_score\n" if macs_summits_file else "peak_id\tchrom\tpeak_start\tpeak_end\tgene_name\ttranscript_id\tpeak_score\n"
+
+        final_output.write(header)
+        for chromregion in unique_peak_region:
+            peakid = peaks_dict[chromregion]
+            if macs_summits_file:
+                final_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" %(peakid, '\t'.join(chromregion), '\t'.join(summits_dict[peakid]),','.join(unique(gene_dict[chromregion])),','.join(unique(transcript_dict[chromregion])),peaks_score[peakid]))
+            else:
+                final_output.write("%s\t%s\t%s\t%s\t%s\n" %(peakid, '\t'.join(chromregion), ','.join(unique(gene_dict[chromregion])), ','.join(unique(transcript_dict[chromregion])), peaks_score[peakid]))
+
+
+def unique(list1):
+    """ Get unique list
+    Args:
+        list1 (list) : list of variable separated by comma ','
+
+    """
+    return list(set(list1.split(",")))
+
+
+def include_genes(macs_peaks_file, file_name):
+    """ Annotate Genes in peak regions
+    Args:
+        macs_peaks_file (file) : MACS peaks file
+        output_file (string) : Output file name
+    """
+    peaks_chr_dict, peaks_count, gene_names_dict, region_dict, peaks_id_dict, region_names_list, master_generegion_dict, barcoderegions_dict, barcodepeaks_id_dict, master_peaksid_dict = {}, 0, {}, defaultdict(dict), defaultdict(dict), [], defaultdict(dict), {}, {}, defaultdict(dict)
+
+    #reading peaks
+    peaks_chr_dict = {line.split('\t')[3]: line.split('\t')[0:3] for line in open(macs_peaks_file, 'r')}
+    peaks_count = sum(1 for _ in open(macs_peaks_file, 'r'))
+
+    #reading gtf gene names
+    gene_names_dict = {(line.split('\t')[0],line.split('\t')[1]): line.split('\t')[0] for line in open("tempPrefix.genes.names", 'r')}
+
+    #reading the genomic regions files
+    for region_prefix in ("promoter", "genebody", "window", "closest"):
+        region = region_prefix + ".tempPrefix"
+        region_name = Path(region).stem.upper()
+        region_names_list.append(region_name)
+        with open(region, 'r') as region_input:
+            for line in region_input:
+                line = line.strip().split('\t')
+                region_dict[region_name][(line[8], line[9])] = line[8:10]
+                peaks_id_dict[region_name][line[3]] = line[0:3]
+    peaks_id_list = peaks_chr_dict.keys()
+    peaks_id_list = sorted(peaks_id_list, key=lambda item:
+                           (int(item[10:]) if item[10:].isdigit()
+                            else print(item[10:], type(item[10:])), item))
+
+    #order regions and gene names
+    region_names_list.sort()
+    orderedgene_names_dict = collections.OrderedDict(sorted(gene_names_dict.items()))
+
+    #barcoding the identified regions
+    for eachgene in orderedgene_names_dict:
+        for eachkey in region_names_list:
+            found = 1 if eachgene in region_dict[eachkey] else 0
+            master_generegion_dict[eachkey][eachgene] = found
+            barcoderegions_dict[eachgene] = barcoderegions_dict.get(eachgene, "") + str(found)
+
+    for eachpeaks_id in peaks_id_list:
+        for eachkey in region_names_list:
+            found = 1 if eachpeaks_id in peaks_id_dict[eachkey] else 0
+            master_peaksid_dict[eachkey][eachpeaks_id] = found
+            barcodepeaks_id_dict[eachpeaks_id] = barcodepeaks_id_dict.get(eachpeaks_id, "") + str(found)
+
+    
+    #write to output file
+    gene_file_name = file_name.replace('.txt', '.genes.txt')
+
+    with open(gene_file_name, 'w') as final_output:
+        final_output.write("GeneName\tTranscript_ID\tBARCODE\t%s\tEVER\n" % ('\t'.join(region_names_list)))
+        for eachgene in orderedgene_names_dict:
+            string_identifier = [barcoderegions_dict[eachgene]]
+            found_sum = sum(master_generegion_dict[region][eachgene] for region in region_names_list)
+            string_identifier.extend(str(master_generegion_dict[region][eachgene]) for region in region_names_list)
+            string_identifier.append("1" if found_sum >= 1 else "0")
+
+            final_output.write("%s\t%s\n" % ('\t'.join(eachgene), '\t'.join(string_identifier)))
+
+    peaks_file_name = file_name.replace('.txt', '.peaks.txt')
+    pdf_file_name = file_name.replace('.txt', '.distribution.pdf')
+    with open(peaks_file_name, 'w') as final_output_peaks:
+        final_output_peaks.write("chrom\tstart\tstop\tpeak_id\tBARCODE\t%s\n" %('\t'.join(region_names_list)))
+        for eachpeaks_id in peaks_id_list:
+            string_identifier = [barcodepeaks_id_dict[eachpeaks_id]]
+            string_identifier.extend(str(master_peaksid_dict[region][eachpeaks_id]) for region in region_names_list)
+            final_output_peaks.write("%s\t%s\t%s\n" %('\t'.join(peaks_chr_dict[eachpeaks_id]), eachpeaks_id, '\t'.join(string_identifier)))
+
+    region_count_dict = {region: round(len(peaks_id_dict[region]) * 100 / peaks_count) for region in ("PROMOTER", "GENEBODY", "WINDOW")}
+    rects = plt.bar(region_count_dict.keys(), region_count_dict.values(), width=0.4)
+    axes = plt.gca()
+    axes.set_ylim([0, 100])
+    peaks_title = Path(macs_peaks_file).stem
+    plt.title(peaks_title.split('.sorted')[0])
+    plt.ylabel("% of peaks")
+
+    #label the bars
+    xpos = 'center'
+    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
+    offset = {'center': 0.5, 'right': 0.57, 'left': 0.43}  # x_txt = x + w*off
+
+    for rect in rects:
+        height = rect.get_height()
+        plt.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height,
+                 '{}%'.format(height), ha=ha[xpos], va='bottom')
+
+    #save figure
+    plt.savefig(pdf_file_name)
+
+    for eachregion in ("PROMOTER", "GENEBODY", "WINDOW"):
+        print("Total number of %s = %d" %(eachregion, len(peaks_id_dict[eachregion])))
+        print("Percentage of %s = %.3f" %(eachregion, len(peaks_id_dict[eachregion])*100/peaks_count))
+
+
+def main():
+    '''
+    Generate genomic coordinates of all promoters, window, genebody, TSS
+
+    Definitions: promoters: +/- 1kb of TSS (transcription start site)
+                 window: 10kb upstream --> 3kb downstream of genebody
+                 genebody: 1kb upstream of TSS --> TTS/TES (transcription termination/end site)
+                 TSS: TSS (transcription start site)
+
+    Annotate Peaks based on genomic features listed above
+
+    Comparison master files for all genes/transcripts
+        and their genomic regions within peaks called.
+    Genomic regions are specified based on regions provided,
+        default expectation are promoter, genebody, window, closest
+
+    '''
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-g", "--gtf", dest="gtf", required=True, help="Enter .gtf/gff file to be processed.")
+    parser.add_argument("-c", "--chrom", dest="chrom_sizes", required=True, help="Enter ucsc chrom sizes file to be processed.")
+    parser.add_argument("-p", "--peaks", dest="macs_peaks", required=True, help="Enter MACS peaks bed file.")
+    parser.add_argument("-s", "--summit", dest="macs_summits", required=False, help="Enter MACS summit bed file.")
+
+    options = parser.parse_args()
+
+    # Stage 1
+    # Extract genomic regions of interest
+    gtf_to_genes(options.chrom_sizes, options.gtf)
+
+    #Peaks filecheck
+    checkcolumns = sum(1 for _ in open(options.macs_peaks, 'r'))
+    if checkcolumns <= 0: sys.exit("ERROR :\tNo peaks were identified\n") 
+
+    # Turn 4 column bed to 5 (for SICER peaks: include unique id)
+    with open(options.macs_peaks, 'r') as peaks_file:
+        numberofcolumns = len(peaks_file.readline().strip().split('\t'))
+        fifthcol_value = next(peaks_file).split('\t')[4].strip() #FDRisland from SICER produce a fifth empty column
+        
+    new_macs_peaks = options.macs_peaks
+
+    if numberofcolumns == 4 or len(fifthcol_value) < 1:
+        new_macs_peaks = options.macs_peaks + ".tempPrefix"
+        # command = "cat " + options.macs_peaks + ' | awk -F\\\\t ' + "'" + \
+        #           '{print $1 "\\t" $2 "\\t" $3 "\\t" $1":"$2"-"$3 "\\t" $4}' + \
+        #           "' > tempPrefix-new.macs_peaks.bed"
+        # print(command)
+        # os.system(command)
+        macs_peaks_output = open(new_macs_peaks, 'w')
+        macs_peaks_input = open(options.macs_peaks,'r')
+        macs_peaks_count = 1
+        for line in macs_peaks_input:
+            line = line.strip().split('\t')
+            new_peaks_id = "SICERpeak_" + str(macs_peaks_count)
+            results = ("{0}\t{1}\t{2}\n".format("\t".join(line[0:3]), new_peaks_id, line[3]))
+            macs_peaks_output.write(results)
+            macs_peaks_count += 1
+        macs_peaks_output.close()
+        
+    # Convert TSS to compatible bedtools (closestBed) format and obtain center of peaks coordinate
+    command = "cut -f1,4,5,9,10 annotation/TSS-region_genes.gff | sort -k1,1 -k2,2n " + \
+              "> tempPrefix-sorted.TSS; " + \
+              "cat " + new_macs_peaks + ' | awk -F\\\\t ' + "'" + \
+              '{print $1 "\\t" int(($2+$3)/2) "\\t" int(($2+$3)/2) "\\t" $4 "\\t" $5}' + \
+              "' | sort -k1,1 -k2,2n > tempPrefix-centerofpeaks.bed;" + \
+              "closestBed -t all -a tempPrefix-centerofpeaks.bed -b tempPrefix-sorted.TSS" + \
+              "> closest.tempPrefix"
+    os.system(command)
+
+    # Stage 2
+    # Annotate peaks based on previously extracted genomic coordinates
+    for region in ("promoter", "genebody", "window"):
+        command = 'intersectBed -b ' + new_macs_peaks + ' -a annotation/' + region + \
+                  '-region_genes.gff -wb | cut -f1,4,5,9,10,11,12,13,14,15 |' + "awk -F\\\\t '" + \
+                  '{print $6 "\\t" $7 "\\t" $8 "\\t" $9 "\\t" $10 "\\t" $1 "\\t" $2 "\\t" $3 ' + \
+                  '"\\t" $4 "\\t" $5}' + \
+                  "' > " + region + ".tempPrefix"
+        os.system(command)
+
+    # Convert annotated regions to tab delimited file and include summit location
+    for region in ("promoter", "genebody", "window", "closest"):
+        region_file = region + ".tempPrefix"
+        output_file = "peaks_within_" + region + ".regions.txt"
+        if options.macs_summits:
+            annotate_regions(region_file, output_file, options.macs_summits)
+        else:
+            annotate_regions(region_file, output_file)
+
+    command = "mv peaks_within_closest.regions.txt centerofpeaks_closest.regions.txt; \
+              cut -f 9,10 annotation/original_genes.gff > tempPrefix.genes.names"
+    os.system(command)
+
+    # Stage 3
+    # Binary matrix of all genes
+    include_genes(new_macs_peaks, "peaks_compared_regions.txt")
+
+    command = "rm -rf *tempPrefix*"
+    os.system(command)
+
+
+if __name__ == "__main__":
+    main()

From 6b3967b1e05cb6ba577ca8478c04af778395d9de Mon Sep 17 00:00:00 2001
From: Modupe <madetunj@stjude.org>
Date: Tue, 16 Apr 2024 17:00:13 -0500
Subject: [PATCH 2/5] update README.md

---
 README.md | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 2f19a5c..7d0c8f4 100755
--- a/README.md
+++ b/README.md
@@ -1,33 +1,41 @@
 ========================  PEAKS ANNOTATION  ======================
 
-1) USAGE
-Executable anywhere as long as the PATHTO is correctly specified.
+1) DESCRIPTION
+
+Genic annotation of peaks including promoters, gene bodies, gene-centric windows, and proximal genes.
+
+PEAKSANNO generates the genic annotation of peaks at promoters, gene bodies and gene-centric windows.
+Annotated regions are collated to provide a binary overview of proximal genes, and the peaks occupancy 
+percentages are graphically presented in a bar plot.
+This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/)
+
+
+1) USAGE.
 	```
-	PEAKSANNO-main.sh ["MACS PEAK bed"] ["MACS Summit bed"] ["GTF"] ["CHROM SIZES"]
+	peaksanno.py -p ["MACS PEAK bed"] -s ["MACS Summit bed"] -g ["GTF"] -c ["CHROM SIZES"]
 	```
-
 1) INPUT
 The package requires the following files
-	- MACS peaks bed file
-	- MACS peaks summit file
-	- GTF having either gene or transcript (prefered) annotation
+	- Peaks bed file
+	- GTF/GFF/GFF3 having either gene or transcript (prefered) annotation
 	- UCSC provided chrom.sizes file
+Optional summit file from MACS can be included
 
 1) OUTPUT
 
 	1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__
-	1) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
-	1) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__
-	1) peaks overlapping windows in __*peaks_within_window.regions.txt*__
-	1) peaks identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.peaks.txt*__
-	1) genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__
-	1) distribution graphs in __*peaks_compared_regions.distribution.pdf*__
+	1) Peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
+	1) Peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__
+	1) Peaks overlapping windows in __*peaks_within_window.regions.txt*__
+	1) Peaks identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.peaks.txt*__
+	1) Genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__
+	1) Distribution graphs in __*peaks_compared_regions.distribution.pdf*__
 
 1) DEFINITIONS
 	- TSS: TSS (transcription start site)
 	- promoters: 1kb +/- TSS
-	- window: 10kb upstream --> 3kb downstream of genebody
-	- genebody: 1kb upstream of TSS --> TES (transcription end site)
+	- window: 10kb upstream to 3kb downstream of genebody
+	- genebody: 1kb upstream of TSS to TES (transcription end site)
 
 1) DEPENDENCIES
 	* bedtools

From f94e07ae0c9da3ff56f9893a0817a6dd69ef98bb Mon Sep 17 00:00:00 2001
From: Modupe <madetunj@stjude.org>
Date: Tue, 16 Apr 2024 17:06:50 -0500
Subject: [PATCH 3/5] update README.md

---
 README.md | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 7d0c8f4..2a2888e 100755
--- a/README.md
+++ b/README.md
@@ -1,27 +1,26 @@
-========================  PEAKS ANNOTATION  ======================
+#PEAKS ANNOTATION
 
-1) DESCRIPTION
+## DESCRIPTION
 
 Genic annotation of peaks including promoters, gene bodies, gene-centric windows, and proximal genes.
 
 PEAKSANNO generates the genic annotation of peaks at promoters, gene bodies and gene-centric windows.
 Annotated regions are collated to provide a binary overview of proximal genes, and the peaks occupancy 
 percentages are graphically presented in a bar plot.
-This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/)
+This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/).
 
-
-1) USAGE.
+## USAGE.
 	```
 	peaksanno.py -p ["MACS PEAK bed"] -s ["MACS Summit bed"] -g ["GTF"] -c ["CHROM SIZES"]
 	```
-1) INPUT
+## INPUT
 The package requires the following files
 	- Peaks bed file
 	- GTF/GFF/GFF3 having either gene or transcript (prefered) annotation
 	- UCSC provided chrom.sizes file
 Optional summit file from MACS can be included
 
-1) OUTPUT
+## OUTPUT
 
 	1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__
 	1) Peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
@@ -31,14 +30,20 @@ Optional summit file from MACS can be included
 	1) Genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__
 	1) Distribution graphs in __*peaks_compared_regions.distribution.pdf*__
 
-1) DEFINITIONS
+## DEFINITIONS
 	- TSS: TSS (transcription start site)
 	- promoters: 1kb +/- TSS
 	- window: 10kb upstream to 3kb downstream of genebody
 	- genebody: 1kb upstream of TSS to TES (transcription end site)
 
-1) DEPENDENCIES
+## DEPENDENCIES
 	* bedtools
 	* python3
 
-=================================================================
+
+##Contact
+
+Please use the GitHub issues page for reporting any issues / suggestions (recommended). 
+
+Alternatively, you can e-mail Modupe Adetunji <modupeore.adetunji@stjude.org>
+

From 5d5ae84f98b7a731f7a37b947014a4cd49241cc0 Mon Sep 17 00:00:00 2001
From: Modupe <madetunj@stjude.org>
Date: Tue, 16 Apr 2024 17:11:14 -0500
Subject: [PATCH 4/5] update README.md

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2a2888e..c6e3e60 100755
--- a/README.md
+++ b/README.md
@@ -40,8 +40,12 @@ Optional summit file from MACS can be included
 	* bedtools
 	* python3
 
+## CITATION
 
-##Contact
+If you are using *PEAKSANNO*, please cite it's parent paper : 
+Adetunji, M.O., Abraham, B.J. SEAseq: a portable and cloud-based chromatin occupancy analysis suite. BMC Bioinformatics 23, 77 (2022). https://doi.org/10.1186/s12859-022-04588-z
+
+## CONTACT
 
 Please use the GitHub issues page for reporting any issues / suggestions (recommended). 
 

From 35ad944c19473d765d049454e263d629990fcf02 Mon Sep 17 00:00:00 2001
From: Modupeore Adetunji <53270089+madetunj@users.noreply.github.com>
Date: Tue, 16 Apr 2024 18:36:48 -0500
Subject: [PATCH 5/5] Update README.md

---
 README.md | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index c6e3e60..4d4d90e 100755
--- a/README.md
+++ b/README.md
@@ -1,53 +1,53 @@
-#PEAKS ANNOTATION
+# PEAKS ANNOTATION
 
 ## DESCRIPTION
 
 Genic annotation of peaks including promoters, gene bodies, gene-centric windows, and proximal genes.
 
-PEAKSANNO generates the genic annotation of peaks at promoters, gene bodies and gene-centric windows.
-Annotated regions are collated to provide a binary overview of proximal genes, and the peaks occupancy 
+**PeaksAnoo** provides the genic annotation of peaks at promoters, gene bodies, and gene-centric windows.
+Annotated regions are collated to give a binary overview of proximal genes, and the peak occupancy 
 percentages are graphically presented in a bar plot.
 This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/).
 
-## USAGE.
-	```
-	peaksanno.py -p ["MACS PEAK bed"] -s ["MACS Summit bed"] -g ["GTF"] -c ["CHROM SIZES"]
-	```
+## USAGE
+``` bash
+peaksanno.py -p ["PEAK bed"] [-s ["MACS Summit bed"]] -g ["GTF"] -c ["CHROM SIZES"]
+```
 ## INPUT
-The package requires the following files
-	- Peaks bed file
-	- GTF/GFF/GFF3 having either gene or transcript (prefered) annotation
-	- UCSC provided chrom.sizes file
-Optional summit file from MACS can be included
+The script requires the following files:
+1. Peaks bed file
+2. GTF/GFF/GFF3 having either gene or transcript (preferred) annotation
+3. UCSC provided chromosomal sizes file
 
-## OUTPUT
+An optional summit file from MACS can be included
 
-	1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__
-	1) Peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__
-	1) Peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__
-	1) Peaks overlapping windows in __*peaks_within_window.regions.txt*__
-	1) Peaks identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.peaks.txt*__
-	1) Genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__
-	1) Distribution graphs in __*peaks_compared_regions.distribution.pdf*__
+## OUTPUT
+PeaksAnno genes the following files:
+1. TSS nearest the center of peaks in **centerofpeaks_closest.regions.txt**.
+2. Peaks overlapping genes regions in **peaks_within_genebody.regions.txt**.
+3. Peaks overlapping promoters in **peaks_within_promoter.regions.txt**.
+4. Peaks overlapping windows in **peaks_within_window.regions.txt**.
+5. Peaks identified in previous overlapping regions and comparison of all regions in **peaks_compared_regions.peaks.txt**.
+6. Genes identified in previous overlapping regions and comparison of all regions in **peaks_compared_regions.genes.txt**.
+7. Distribution graphs in **peaks_compared_regions.distribution.pdf**.
 
 ## DEFINITIONS
 	- TSS: TSS (transcription start site)
 	- promoters: 1kb +/- TSS
-	- window: 10kb upstream to 3kb downstream of genebody
+	- window: 10kb upstream to 3kb downstream of the gene locus
 	- genebody: 1kb upstream of TSS to TES (transcription end site)
 
 ## DEPENDENCIES
-	* bedtools
-	* python3
+1. bedtools
+2. python3
 
 ## CITATION
-
-If you are using *PEAKSANNO*, please cite it's parent paper : 
+If you are using ***PEAKSANNO***, please cite its parent paper : 
 Adetunji, M.O., Abraham, B.J. SEAseq: a portable and cloud-based chromatin occupancy analysis suite. BMC Bioinformatics 23, 77 (2022). https://doi.org/10.1186/s12859-022-04588-z
 
 ## CONTACT
 
-Please use the GitHub issues page for reporting any issues / suggestions (recommended). 
+Please use the GitHub issues page for reporting any issues/suggestions (recommended). 
 
 Alternatively, you can e-mail Modupe Adetunji <modupeore.adetunji@stjude.org>