From 03701e4345f620326657cc01395d177c0dcb2c7e Mon Sep 17 00:00:00 2001 From: Modupe Date: Tue, 16 Apr 2024 16:48:52 -0500 Subject: [PATCH 1/5] syntax updates --- PEAKSANNO.py | 549 --------------------------------------------------- peaksanno.py | 373 ++++++++++++++++++++++++++++++++++ 2 files changed, 373 insertions(+), 549 deletions(-) delete mode 100755 PEAKSANNO.py create mode 100755 peaksanno.py diff --git a/PEAKSANNO.py b/PEAKSANNO.py deleted file mode 100755 index ef2f4fc..0000000 --- a/PEAKSANNO.py +++ /dev/null @@ -1,549 +0,0 @@ -#!/usr/bin/env python3 -''' -======================== PEAKS ANNOTATION ====================== - -USAGE -All inclusive script for Annotation of ChIP-Seq peaks. -This is included in the SEASEQ pipeline. - ``` - python PEAKSANNO.py -g -c -p -s - ``` - -The script input requirements are : -- GTF : GTF/GFF/GFF3 having either gene or transcript (prefered) annotation -- CHROM SIZES : chrom.sizes file -- MACS PEAKS bed : MACS peaks bed file -- MACS SUMMITS bed : MACS peaks summit file - -Output files provided are : - 1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__ - 2) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__ - 3) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__ - 4) peaks overlapping windows in __*peaks_within_window.regions.txt*__ - 5) peaks identified in previous overlapping regions and comparison of - all regions in __*peaks_compared_regions.peaks.txt*__ - 6) genes identified in previous overlapping regions and comparison of - all regions in __*peaks_compared_regions.genes.txt*__ - 7) distribution graphs in __*peaks_compared_regions.distribution.pdf*__ - -Definitions of annotation regions: - - TSS: TSS (transcription start site) - - promoters: 1kb +/- TSS - - window: 10kb upstream --> 3kb downstream of genebody - - genebody: 1kb upstream of TSS --> TES (transcription end site) - -Dependencies of script: - * bedtools - * python3 - -================================================================== -''' - -import os -import sys -import re -import argparse -import subprocess -import matplotlib.pyplot as plt -from pathlib import Path -import collections -from collections import defaultdict - -if not os.path.exists('annotation'): - os.makedirs('annotation') - -def parse_genelocations(chromz, results, up_dist, down_dist, tss=True): - """ Parse genomic regions - Args: - chromz (dict) : Chromosomal sizes - results (string) : Individual gene coordinates - up_dist (int) : genomic distance upstream / + - down_dist (int) : genomic distance downstream / - - tss (boolean) : TSS or entire gene (true if tss) - """ - - lines = results.split("\t") - lines[3] = int(lines[3]) - lines[4] = int(lines[4]) - - if lines[6] == "+": - if tss: - end_region = 4 - else: - end_region = 3 - - end_position = lines[end_region] + down_dist - start_position = lines[3] - up_dist - - elif lines[6] == "-": - if tss: - end_region = 3 - else: - end_region = 4 - - start_position = lines[end_region] - down_dist - end_position = lines[4] + up_dist - - if start_position < 1: - start_position = 1 - - if end_position > int(chromz[lines[0]]): - end_position = chromz[lines[0]] - - output_gff = ("{0}\t{1}\t{2}\t{3}\n".format("\t".join(lines[0:3]), - start_position, end_position, "\t".join(lines[5:]))) - return output_gff - - -def gtf_to_genes(chrom_sizes_file, gtf_file): - """ GTF to GENES - Args: - chrom_sizes_file (file) : Chromosomal sizes file - gtf_file (file) : GTF file - """ - #initialize outputfiles - original_output = open('annotation/original_genes.gff', 'w') - promoters_output = open('annotation/promoter-region_genes.gff', 'w') - window_output = open('annotation/window-region_genes.gff', 'w') - tss_output = open('annotation/TSS-region_genes.gff', 'w') - genebody_output = open('annotation/genebody-region_genes.gff', 'w') - - haschr = False #check 'chr' prefix in gtf file - - #read ucsc chrom sizes, gtf - chrom_sizes_input = open(chrom_sizes_file, 'r') - chrom_sizes_dict = {} - for line in chrom_sizes_input: - lines = line.split('\t') - chrom_sizes_dict[lines[0]] = lines[1].rstrip("\n") - if lines[0].startswith('chr'): - haschr = True - - #feature = options.feature - feature_dict = {} - - #reading the feature type to be used - gtf_input = open(gtf_file, 'r') - for line in gtf_input: - if not line.startswith('#'): - lines = line.split("\t") - feature_dict[lines[2]] = lines[2] - - if 'transcript' in feature_dict: - feature = "transcript" - elif 'gene' in feature_dict: - feature = "gene" - else: - sys.exit("ERROR :\tGTF/GFF with either transcript/gene annotation is needed") - print("NOTE :\tFeature type used is '%s'" %(feature)) - - #organize gtf file - gtf_input = open(gtf_file, 'r') - for line in gtf_input: - if not line.startswith('#'): - lines = line.rstrip("\n").split("\t") - if haschr and not lines[0].startswith('chr'): - lines[0] = "chr"+lines[0] - elif not haschr and lines[0].startswith('chr'): - lines[0] = lines[0][3:] - if lines[2] == feature: - newline = lines[8].split(';') - - if gtf_file.split('.')[-1] == 'gff' or gtf_file.split('.')[-1] == 'gff3': - if re.search(";transcript_id", lines[8]): - transcript_id = [s for s in newline if "transcript_id=" in s][0] - else: - transcript_id = newline[0] - if re.search(";gene_name", lines[8]): - gene_name = [s for s in newline if "gene_name=" in s][0] - else: - gene_name = newline[0] - transcript_id = re.sub('[\"\;]', '', transcript_id.split('=')[-1]) #clean transcript_id - gene_name = re.sub('[\"\;]', '', gene_name.split('=')[-1]) #clean gene_name - elif gtf_file.split('.')[-1] == 'gtf': - if re.search("; transcript_id", lines[8]): - transcript_id = [s for s in newline if "transcript_id " in s][0] - else: - transcript_id = newline[0] - if re.search("; gene_name", lines[8]): - gene_name = [s for s in newline if "gene_name " in s][0] - else: - gene_name = newline[0] - transcript_id = re.sub('[\"\;]', '', transcript_id.split(' ')[-1]) #clean transcript_id - gene_name = re.sub('[\"\;]', '', gene_name.split(' ')[-1]) #clean gene_name - - results = ("{0}\t{1}\t{2}\t{3}".format(lines[0], - "\t".join(lines[1:8]), - gene_name, transcript_id)) - - try: - if chrom_sizes_dict[lines[0]]: - original_output.write(results + "\n") - final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 1000, False) - promoters_output.write(final_output) - final_output = parse_genelocations(chrom_sizes_dict, results, 10000, 3000, True) - window_output.write(final_output) - final_output = parse_genelocations(chrom_sizes_dict, results, 0, 0, False) - tss_output.write(final_output) - final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 0, True) - genebody_output.write(final_output) - except KeyError: - continue - - -def annotate_regions(region_file, output_file, macs_summits_file=None): - """ Annotate Peaks based on genomic features - Args: - region_file (file) : Region file - output_file (string) : Output file name - macs_summits_file (file) : MACS summits file - """ - - #initialize chromosomal positions based on gff provided - opt_chrom = 0 - opt_start = 1 - opt_stop = 2 - opt_peakid = 3 - opt_score = 4 - opt_none = 6 - opt_gene = 8 - opt_transcript = 9 - - summits_dict = {} - if macs_summits_file: - #read summits file - summits_input = open(macs_summits_file, 'r') - for line in summits_input: - line = line.strip('\n').split('\t') - string_identifier = line[opt_start], line[opt_stop] - summits_dict[line[opt_peakid]] = string_identifier - - #read intersectbed file - intersect_input = open(region_file, 'r') - peaks_dict = {} - peaks_score = {} - gene_dict = {} - transcript_dict = {} - peak_region = [] - for line in intersect_input: - line = line.strip('\n').split('\t') - #print(line[opt_none],"\n") - if int(line[opt_none]) != -1: - string_identifier = line[opt_chrom], line[opt_start], line[opt_stop] - peaks_dict[string_identifier] = line[opt_peakid] - peaks_score[line[opt_peakid]] = line[opt_score] - peak_region.append(string_identifier) - - if string_identifier in gene_dict: - gene_dict[string_identifier] = ("{0},{1}".format(gene_dict[string_identifier], - line[opt_gene])) - transcript_dict[string_identifier] = ("{0},{1}". - format(transcript_dict[string_identifier], - line[opt_transcript])) - else: - gene_dict[string_identifier] = line[opt_gene] - transcript_dict[string_identifier] = line[opt_transcript] - - #each peak id is unique - unique_peak_region = [] - for eachpeak_region in peak_region: - if eachpeak_region not in unique_peak_region: - unique_peak_region.append(eachpeak_region) - - #write to output file - final_output = open(output_file, 'w') - - if macs_summits_file: - final_output.write("peak_id\tchrom\tpeak_start\tpeak_end\t" - "summit_start\tsummit_end\tgene_name\t" - "transcript_id\tpeak_score\n") - - #print regions identified - for chromregion in unique_peak_region: - peakid = peaks_dict[chromregion] - final_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" - %(peakid, '\t'.join(chromregion), '\t'.join(summits_dict[peakid]), - ','.join(unique(gene_dict[chromregion])), - ','.join(unique(transcript_dict[chromregion])), - peaks_score[peakid])) - else: - final_output.write("peak_id\tchrom\tpeak_start\tpeak_end\t" - "gene_name\ttranscript_id\tpeak_score\n") - - #print regions identified - for chromregion in unique_peak_region: - peakid = peaks_dict[chromregion] - final_output.write("%s\t%s\t%s\t%s\t%s\n" - %(peakid, '\t'.join(chromregion), - ','.join(unique(gene_dict[chromregion])), - ','.join(unique(transcript_dict[chromregion])), - peaks_score[peakid])) - -def unique(list1): - """ Get unique list - Args: - list1 (list) : list of variable separated by comma ',' - - """ - - # intilize a null list - unique_list = [] - - # traverse for all elements - for variable in list1.split(","): - if variable not in unique_list: - unique_list.append(variable) - return unique_list - - -def include_genes(macs_peaks_file, file_name): - """ Annotate Genes in peak regions - Args: - macs_peaks_file (file) : MACS peaks file - output_file (string) : Output file name - """ - peaks_input = open(macs_peaks_file, 'r') - peaks_chr_dict = {} - peaks_count = 0 - for line in peaks_input: - line = line.strip('\n').split('\t') - peaks_count += 1 - peaks_chr_dict[line[3]] = (line[0:3]) - - #reading the gtf genes file - gene_names_dict = {} - gene_names_input = open("tempPrefix.genes.names", 'r') - for line in gene_names_input: - line = line.strip('\n').split('\t') - gene_names_dict[(line[0], line[1])] = line[0] - - #reading the genomic regions files - region_names_list = [] - region_dict = defaultdict(dict) - peaks_id_dict = defaultdict(dict) - for region_prefix in ("promoter", "genebody", "window", "closest"): - region = region_prefix + ".tempPrefix" - region_name = Path(region).stem - region_names_list.append(region_name.upper()) - region_input = open(region, 'r') - for line in region_input: - line = line.strip('\n').split('\t') - region_dict[region_name.upper()][(line[8], line[9])] = (line[8], line[9]) - peaks_id_dict[region_name.upper()][line[3]] = (line[0:3]) - peaks_id_list = peaks_chr_dict.keys() - peaks_id_list = sorted(peaks_id_list, key=lambda item: - (int(item[10:]) if item[10:].isdigit() - else print(item[10:], type(item[10:])), item)) - - #order regions and gene names - region_names_list.sort() - orderedgene_names_dict = collections.OrderedDict(sorted(gene_names_dict.items())) - - #barcoding the identified regions - barcoderegions_dict = {} - master_generegion_dict = defaultdict(dict) - for eachgene in orderedgene_names_dict: - for eachkey in region_names_list: - found = 0 - if eachgene in region_dict[eachkey]: - found = 1 - master_generegion_dict[eachkey][eachgene] = found - if eachgene in barcoderegions_dict: - barcoderegions_dict[eachgene] += str(found) - else: - barcoderegions_dict[eachgene] = str(found) - - barcodepeaks_id_dict = {} - master_peaksid_dict = defaultdict(dict) - for eachpeaks_id in peaks_id_list: - for eachkey in region_names_list: - found = 0 - #print(eachpeaks_id) - if eachpeaks_id in peaks_id_dict[eachkey]: - found = 1 - #print("yes") - master_peaksid_dict[eachkey][eachpeaks_id] = found - if eachpeaks_id in barcodepeaks_id_dict: - barcodepeaks_id_dict[eachpeaks_id] += str(found) - else: - barcodepeaks_id_dict[eachpeaks_id] = str(found) - - #write to output file - gene_file_name = file_name.replace('.txt', '.genes.txt') - - final_output = open(gene_file_name, 'w') - final_output.write("GeneName\tTranscript_ID\tBARCODE\t%s\tEVER\n" - %('\t'.join(region_names_list))) - - for eachgene in orderedgene_names_dict: - string_identifier = [barcoderegions_dict[eachgene]] - found_sum = 0 - for eachkey in region_names_list: - string_identifier.append(str(master_generegion_dict[eachkey][eachgene])) - found_sum += master_generegion_dict[eachkey][eachgene] - if found_sum >= 1: - string_identifier.append("1") - else: - string_identifier.append("0") - - final_output.write("%s\t%s\n" %('\t'.join(eachgene), '\t'.join(string_identifier))) - - peaks_file_name = file_name.replace('.txt', '.peaks.txt') - pdf_file_name = file_name.replace('.txt', '.distribution.pdf') - final_output_peaks = open(peaks_file_name, 'w') - final_output_peaks.write("chrom\tstart\tstop\tpeak_id\tBARCODE\t%s\n" - %('\t'.join(region_names_list))) - for eachpeaks_id in peaks_id_list: - string_identifier = [barcodepeaks_id_dict[eachpeaks_id]] - for eachkey in region_names_list: - string_identifier.append(str(master_peaksid_dict[eachkey][eachpeaks_id])) - - final_output_peaks.write("%s\t%s\t%s\n" - %('\t'.join(peaks_chr_dict[eachpeaks_id]), - eachpeaks_id, '\t'.join(string_identifier))) - - #the bar plot - region_count_dict = {} - #for eachregion in region_count_dict: - region_count_dict["Promoter\n(1kb up/down TSS)"] = round( - len(peaks_id_dict["PROMOTER"])*100/peaks_count) - region_count_dict["Genebody\n(1kb up TSS/TES)"] = round( - len(peaks_id_dict["GENEBODY"])*100/peaks_count) - region_count_dict["Window\n(10kb up TSS/3kb down TES)"] = round( - len(peaks_id_dict["WINDOW"])*100/peaks_count) - - rects = plt.bar(region_count_dict.keys(), region_count_dict.values(), width=0.4) - axes = plt.gca() - axes.set_ylim([0, 100]) - peaks_title = Path(macs_peaks_file).stem - plt.title(peaks_title.split('.sorted')[0]) - plt.ylabel("% of peaks") - - #label the bars - xpos = 'center' - ha = {'center': 'center', 'right': 'left', 'left': 'right'} - offset = {'center': 0.5, 'right': 0.57, 'left': 0.43} # x_txt = x + w*off - - for rect in rects: - height = rect.get_height() - plt.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height, - '{}%'.format(height), ha=ha[xpos], va='bottom') - - #save figure - plt.savefig(pdf_file_name) - - for eachregion in ("PROMOTER", "GENEBODY", "WINDOW"): - print("Total number of %s = %d" %(eachregion, len(peaks_id_dict[eachregion]))) - print("Percentage of %s = %.3f" %(eachregion, - len(peaks_id_dict[eachregion])*100/peaks_count)) - - -def main(): - ''' - Generate genomic coordinates of all promoters, window, genebody, TSS - - Definitions: promoters: +/- 1kb of TSS (transcription start site) - window: 10kb upstream --> 3kb downstream of genebody - genebody: 1kb upstream of TSS --> TTS/TES (transcription termination/end site) - TSS: TSS (transcription start site) - - Annotate Peaks based on genomic features listed above - - Comparison master files for all genes/transcripts - and their genomic regions within peaks called. - Genomic regions are specified based on regions provided, - default expectation are promoter, genebody, window, closest - - ''' - parser = argparse.ArgumentParser() - parser.add_argument("-g", "--gtf", dest="gtf", required=True, - help="Enter .gtf/gff file to be processed.") - parser.add_argument("-c", "--chrom", dest="chrom_sizes", required=True, - help="Enter ucsc chrom sizes file to be processed.") - parser.add_argument("-p", "--peaks", dest="macs_peaks", required=True, - help="Enter MACS peaks bed file.") - parser.add_argument("-s", "--summit", dest="macs_summits", required=False, - help="Enter MACS summit bed file.") - - options = parser.parse_args() - #print(options) - - # Stage 1 - # Extract genomic regions of interest - gtf_to_genes(options.chrom_sizes, options.gtf) - - # Turn 4 column bed to 5 (for SICER peaks: include unique id) - command = "wc -l " + options.macs_peaks + " | awk -F' ' '{print $1}'" - checkcolumns = int(subprocess.check_output(command,shell=True).strip()) - if checkcolumns > 0: - command = "awk -F'\\t' '{print NF; exit}' " + options.macs_peaks - numberofcolumns = int(subprocess.check_output(command,shell=True).strip()) - else: - sys.exit("ERROR :\tNo peaks were identified\n") - - command = "head -n 1 " + options.macs_peaks + ' | cut -f5 ' - fifthcol_value = (str(subprocess.check_output(command,shell=True).strip()).split("'"))[1] #FDRisland from SICER produce a fifth empty column - - new_macs_peaks = options.macs_peaks - - if numberofcolumns == 4 or len(fifthcol_value) < 1: - new_macs_peaks = options.macs_peaks + ".tempPrefix" - # command = "cat " + options.macs_peaks + ' | awk -F\\\\t ' + "'" + \ - # '{print $1 "\\t" $2 "\\t" $3 "\\t" $1":"$2"-"$3 "\\t" $4}' + \ - # "' > tempPrefix-new.macs_peaks.bed" - # print(command) - # os.system(command) - macs_peaks_output = open(new_macs_peaks, 'w') - macs_peaks_input = open(options.macs_peaks,'r') - macs_peaks_count = 1 - for line in macs_peaks_input: - line = line.strip('\n').split('\t') - new_peaks_id = "SICERpeak_" + str(macs_peaks_count) - results = ("{0}\t{1}\t{2}\n".format("\t".join(line[0:3]), new_peaks_id, line[3])) - macs_peaks_output.write(results) - macs_peaks_count += 1 - macs_peaks_output.close() - - # Convert TSS to compatible bedtools (closestBed) format and obtain center of peaks coordinate - command = "cut -f1,4,5,9,10 annotation/TSS-region_genes.gff | sort -k1,1 -k2,2n " + \ - "> tempPrefix-sorted.TSS; " + \ - "cat " + new_macs_peaks + ' | awk -F\\\\t ' + "'" + \ - '{print $1 "\\t" int(($2+$3)/2) "\\t" int(($2+$3)/2) "\\t" $4 "\\t" $5}' + \ - "' | sort -k1,1 -k2,2n > tempPrefix-centerofpeaks.bed;" + \ - "closestBed -t all -a tempPrefix-centerofpeaks.bed -b tempPrefix-sorted.TSS" + \ - "> closest.tempPrefix" - os.system(command) - - # Stage 2 - # Annotate peaks based on previously extracted genomic coordinates - for region in ("promoter", "genebody", "window"): - command = 'intersectBed -b ' + new_macs_peaks + ' -a annotation/' + region + \ - '-region_genes.gff -wb | cut -f1,4,5,9,10,11,12,13,14,15 |' + "awk -F\\\\t '" + \ - '{print $6 "\\t" $7 "\\t" $8 "\\t" $9 "\\t" $10 "\\t" $1 "\\t" $2 "\\t" $3 ' + \ - '"\\t" $4 "\\t" $5}' + \ - "' > " + region + ".tempPrefix" - os.system(command) - - # Convert annotated regions to tab delimited file and include summit location - for region in ("promoter", "genebody", "window", "closest"): - region_file = region + ".tempPrefix" - output_file = "peaks_within_" + region + ".regions.txt" - if options.macs_summits: - annotate_regions(region_file, output_file, options.macs_summits) - else: - annotate_regions(region_file, output_file) - - command = "mv peaks_within_closest.regions.txt centerofpeaks_closest.regions.txt; \ - cut -f 9,10 annotation/original_genes.gff > tempPrefix.genes.names" - os.system(command) - - # Stage 3 - # Binary matrix of all genes - include_genes(new_macs_peaks, "peaks_compared_regions.txt") - - command = "rm -rf *tempPrefix*" - #os.system(command) - - -if __name__ == "__main__": - main() diff --git a/peaksanno.py b/peaksanno.py new file mode 100755 index 0000000..7c0505e --- /dev/null +++ b/peaksanno.py @@ -0,0 +1,373 @@ +#!/usr/bin/env python3 +''' +======================== PEAKS ANNOTATION ====================== + +USAGE +All inclusive script for Annotation of ChIP-Seq peaks. +This is included in the SEASEQ pipeline. + ``` + python PEAKSANNO.py -g -c -p -s + ``` + +The script input requirements are : +- GTF : GTF/GFF/GFF3 having either gene or transcript (prefered) annotation +- CHROM SIZES : chrom.sizes file +- MACS PEAKS bed : MACS peaks bed file +- MACS SUMMITS bed : MACS peaks summit file + +Output files provided are : + 1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__ + 2) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__ + 3) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__ + 4) peaks overlapping windows in __*peaks_within_window.regions.txt*__ + 5) peaks identified in previous overlapping regions and comparison of + all regions in __*peaks_compared_regions.peaks.txt*__ + 6) genes identified in previous overlapping regions and comparison of + all regions in __*peaks_compared_regions.genes.txt*__ + 7) distribution graphs in __*peaks_compared_regions.distribution.pdf*__ + +Definitions of annotation regions: + - TSS: TSS (transcription start site) + - promoters: 1kb +/- TSS + - window: 10kb upstream --> 3kb downstream of genebody + - genebody: 1kb upstream of TSS --> TES (transcription end site) + +Dependencies of script: + * bedtools + * python3 + +================================================================== +''' + +import os +import re +import argparse +import subprocess +import matplotlib.pyplot as plt +from pathlib import Path +import collections +from collections import defaultdict + +if not os.path.exists('annotation'): + os.makedirs('annotation') + +def parse_genelocations(chromz, results, up_dist, down_dist, tss=True): + """ Parse genomic regions + Args: + chromz (dict) : Chromosomal sizes + results (string) : Individual gene coordinates + up_dist (int) : genomic distance upstream / + + down_dist (int) : genomic distance downstream / - + tss (boolean) : TSS or entire gene (true if tss) + """ + lines = results.split("\t") + lines[3], lines[4] = int(lines[3]), int(lines[4]) + + if lines[6] == "+": + end_region = 4 if tss else 3 + end_position = lines[end_region] + down_dist + start_position = lines[3] - up_dist + elif lines[6] == "-": + end_region = 3 if tss else 4 + start_position = lines[end_region] - down_dist + end_position = lines[4] + up_dist + + start_position = max(start_position, 1) + end_position = min(end_position, int(chromz[lines[0]])) + + return "{0}\t{1}\t{2}\t{3}\n".format("\t".join(lines[0:3]), start_position, end_position, "\t".join(lines[5:])) + + +def gtf_to_genes(chrom_sizes_file, gtf_file): + """ GTF to GENES + Args: + chrom_sizes_file (file) : Chromosomal sizes file + gtf_file (file) : GTF file + """ + #read ucsc chrom sizes, gtf + chrom_sizes_dict = {line.split('\t')[0]: line.split('\t')[1].rstrip("\n") for line in open(chrom_sizes_file)} + haschr = any(chrom.startswith('chr') for chrom in chrom_sizes_dict) #check 'chr' prefix in gtf file + + feature_dict = {line.split("\t")[2]: line.split("\t")[2] for line in open(gtf_file) if not line.startswith('#')} + + feature = "transcript" if 'transcript' in feature_dict else "gene" + print("NOTE :\tFeature type used is '%s'" % feature) + + #parsing gene annotation file, GFF, GFF3, GTF + with open(gtf_file, 'r') as gtf_input: + for gtf_col in gtf_input: + if not gtf_col.startswith('#'): + gtf_col.split("\t")[0] = "chr" + gtf_col.split("\t")[0] if haschr and not gtf_col.split("\t")[0].startswith('chr') else gtf_col.split("\t")[0][3:] if not haschr and gtf_col.split("\t")[0].startswith('chr') else gtf_col.split("\t")[0] + if gtf_col.split("\t")[2] == feature: + record = gtf_col.split("\t")[8].split(';') + if gtf_file.split('.')[-1] in ['gff', 'gff3']: + transcript_id = [s for s in record if "transcript_id" in s][0] if re.search(";transcript_id", gtf_col.split("\t")[8]) else record[0] + gene_name = [s for s in record if "gene_name" in s][0] if re.search(";gene_name", gtf_col.split("\t")[8]) else record[0] + transcript_id = re.sub('[\"\;]', '', transcript_id.split('=')[-1]) #clean transcript_id + gene_name = re.sub('[\"\;]', '', gene_name.split('=')[-1]) #clean gene_name + elif gtf_file.split('.')[-1] == 'gtf': + transcript_id = [s for s in record if "transcript_id " in s][0] if re.search("; transcript_id", gtf_col.split("\t")[8]) else record[0] + gene_name = [s for s in record if "gene_name " in s][0] if re.search("; gene_name", gtf_col.split("\t")[8]) else record[0] + transcript_id = re.sub('[\"\;]', '', transcript_id.split(' ')[-1]) #clean transcript_id + gene_name = re.sub('[\"\;]', '', gene_name.split(' ')[-1]) #clean gene_name + + results = "{0}\t{1}\t{2}\t{3}".format(gtf_col.split("\t")[0], "\t".join(gtf_col.split("\t")[1:8]), gene_name, transcript_id) + + try: + if chrom_sizes_dict[gtf_col.split("\t")[0]]: + with open('annotation/original_genes.gff', 'w') as original_output: + original_output.write(results + "\n") + final_output = parse_genelocations(chrom_sizes_dict, results, 1000, 1000, False) + for file_name, up, down, tss in [('promoter-region_genes.gff', 10000, 3000, True), ('window-region_genes.gff', 1000, 1000, False), ('TSS-region_genes.gff', 0, 0, False), ('genebody-region_genes.gff', 1000, 0, True)]: + with open(f'annotation/{file_name}', 'w') as file: + file.write(final_output) + except KeyError: + continue + + +def annotate_regions(region_file, output_file, macs_summits_file=None): + """ Annotate Peaks based on genomic features + Args: + region_file (file) : Region file + output_file (string) : Output file name + macs_summits_file (file) : MACS summits file + """ + #initialize chromosomal positions based on gff provided + opt_chrom, opt_start, opt_stop, opt_peakid, opt_score, opt_none, opt_gene, opt_transcript = 0, 1, 2, 3, 4, 6, 8, 9 + summits_dict = {} + if macs_summits_file: #read summits file + with open(macs_summits_file, 'r') as summits_input: + summits_dict = {line.strip().split('\t')[opt_peakid]: (line.strip().split('\t')[opt_start], line.strip().split('\t')[opt_stop]) for line in summits_input} + + #read intersectbed file + with open(region_file, 'r') as intersect_input, open(output_file, 'w') as final_output: + peaks_dict, peaks_score, gene_dict, transcript_dict, peak_region = {}, {}, {}, {}, [] + for line in intersect_input: + line = line.strip().split('\t') + if int(line[opt_none]) != -1: + string_identifier = tuple(line[i] for i in [opt_chrom, opt_start, opt_stop]) + peaks_dict[string_identifier] = line[opt_peakid] + peaks_score[line[opt_peakid]] = line[opt_score] + peak_region.append(string_identifier) + gene_dict[string_identifier] = gene_dict.get(string_identifier, "") + line[opt_gene] + transcript_dict[string_identifier] = transcript_dict.get(string_identifier, "") + line[opt_transcript] + + unique_peak_region = list(set(peak_region)) + + header = "peak_id\tchrom\tpeak_start\tpeak_end\tsummit_start\tsummit_end\tgene_name\ttranscript_id\tpeak_score\n" if macs_summits_file else "peak_id\tchrom\tpeak_start\tpeak_end\tgene_name\ttranscript_id\tpeak_score\n" + + final_output.write(header) + for chromregion in unique_peak_region: + peakid = peaks_dict[chromregion] + if macs_summits_file: + final_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" %(peakid, '\t'.join(chromregion), '\t'.join(summits_dict[peakid]),','.join(unique(gene_dict[chromregion])),','.join(unique(transcript_dict[chromregion])),peaks_score[peakid])) + else: + final_output.write("%s\t%s\t%s\t%s\t%s\n" %(peakid, '\t'.join(chromregion), ','.join(unique(gene_dict[chromregion])), ','.join(unique(transcript_dict[chromregion])), peaks_score[peakid])) + + +def unique(list1): + """ Get unique list + Args: + list1 (list) : list of variable separated by comma ',' + + """ + return list(set(list1.split(","))) + + +def include_genes(macs_peaks_file, file_name): + """ Annotate Genes in peak regions + Args: + macs_peaks_file (file) : MACS peaks file + output_file (string) : Output file name + """ + peaks_chr_dict, peaks_count, gene_names_dict, region_dict, peaks_id_dict, region_names_list, master_generegion_dict, barcoderegions_dict, barcodepeaks_id_dict, master_peaksid_dict = {}, 0, {}, defaultdict(dict), defaultdict(dict), [], defaultdict(dict), {}, {}, defaultdict(dict) + + #reading peaks + peaks_chr_dict = {line.split('\t')[3]: line.split('\t')[0:3] for line in open(macs_peaks_file, 'r')} + peaks_count = sum(1 for _ in open(macs_peaks_file, 'r')) + + #reading gtf gene names + gene_names_dict = {(line.split('\t')[0],line.split('\t')[1]): line.split('\t')[0] for line in open("tempPrefix.genes.names", 'r')} + + #reading the genomic regions files + for region_prefix in ("promoter", "genebody", "window", "closest"): + region = region_prefix + ".tempPrefix" + region_name = Path(region).stem.upper() + region_names_list.append(region_name) + with open(region, 'r') as region_input: + for line in region_input: + line = line.strip().split('\t') + region_dict[region_name][(line[8], line[9])] = line[8:10] + peaks_id_dict[region_name][line[3]] = line[0:3] + peaks_id_list = peaks_chr_dict.keys() + peaks_id_list = sorted(peaks_id_list, key=lambda item: + (int(item[10:]) if item[10:].isdigit() + else print(item[10:], type(item[10:])), item)) + + #order regions and gene names + region_names_list.sort() + orderedgene_names_dict = collections.OrderedDict(sorted(gene_names_dict.items())) + + #barcoding the identified regions + for eachgene in orderedgene_names_dict: + for eachkey in region_names_list: + found = 1 if eachgene in region_dict[eachkey] else 0 + master_generegion_dict[eachkey][eachgene] = found + barcoderegions_dict[eachgene] = barcoderegions_dict.get(eachgene, "") + str(found) + + for eachpeaks_id in peaks_id_list: + for eachkey in region_names_list: + found = 1 if eachpeaks_id in peaks_id_dict[eachkey] else 0 + master_peaksid_dict[eachkey][eachpeaks_id] = found + barcodepeaks_id_dict[eachpeaks_id] = barcodepeaks_id_dict.get(eachpeaks_id, "") + str(found) + + + #write to output file + gene_file_name = file_name.replace('.txt', '.genes.txt') + + with open(gene_file_name, 'w') as final_output: + final_output.write("GeneName\tTranscript_ID\tBARCODE\t%s\tEVER\n" % ('\t'.join(region_names_list))) + for eachgene in orderedgene_names_dict: + string_identifier = [barcoderegions_dict[eachgene]] + found_sum = sum(master_generegion_dict[region][eachgene] for region in region_names_list) + string_identifier.extend(str(master_generegion_dict[region][eachgene]) for region in region_names_list) + string_identifier.append("1" if found_sum >= 1 else "0") + + final_output.write("%s\t%s\n" % ('\t'.join(eachgene), '\t'.join(string_identifier))) + + peaks_file_name = file_name.replace('.txt', '.peaks.txt') + pdf_file_name = file_name.replace('.txt', '.distribution.pdf') + with open(peaks_file_name, 'w') as final_output_peaks: + final_output_peaks.write("chrom\tstart\tstop\tpeak_id\tBARCODE\t%s\n" %('\t'.join(region_names_list))) + for eachpeaks_id in peaks_id_list: + string_identifier = [barcodepeaks_id_dict[eachpeaks_id]] + string_identifier.extend(str(master_peaksid_dict[region][eachpeaks_id]) for region in region_names_list) + final_output_peaks.write("%s\t%s\t%s\n" %('\t'.join(peaks_chr_dict[eachpeaks_id]), eachpeaks_id, '\t'.join(string_identifier))) + + region_count_dict = {region: round(len(peaks_id_dict[region]) * 100 / peaks_count) for region in ("PROMOTER", "GENEBODY", "WINDOW")} + rects = plt.bar(region_count_dict.keys(), region_count_dict.values(), width=0.4) + axes = plt.gca() + axes.set_ylim([0, 100]) + peaks_title = Path(macs_peaks_file).stem + plt.title(peaks_title.split('.sorted')[0]) + plt.ylabel("% of peaks") + + #label the bars + xpos = 'center' + ha = {'center': 'center', 'right': 'left', 'left': 'right'} + offset = {'center': 0.5, 'right': 0.57, 'left': 0.43} # x_txt = x + w*off + + for rect in rects: + height = rect.get_height() + plt.text(rect.get_x() + rect.get_width()*offset[xpos], 1.01*height, + '{}%'.format(height), ha=ha[xpos], va='bottom') + + #save figure + plt.savefig(pdf_file_name) + + for eachregion in ("PROMOTER", "GENEBODY", "WINDOW"): + print("Total number of %s = %d" %(eachregion, len(peaks_id_dict[eachregion]))) + print("Percentage of %s = %.3f" %(eachregion, len(peaks_id_dict[eachregion])*100/peaks_count)) + + +def main(): + ''' + Generate genomic coordinates of all promoters, window, genebody, TSS + + Definitions: promoters: +/- 1kb of TSS (transcription start site) + window: 10kb upstream --> 3kb downstream of genebody + genebody: 1kb upstream of TSS --> TTS/TES (transcription termination/end site) + TSS: TSS (transcription start site) + + Annotate Peaks based on genomic features listed above + + Comparison master files for all genes/transcripts + and their genomic regions within peaks called. + Genomic regions are specified based on regions provided, + default expectation are promoter, genebody, window, closest + + ''' + parser = argparse.ArgumentParser() + parser.add_argument("-g", "--gtf", dest="gtf", required=True, help="Enter .gtf/gff file to be processed.") + parser.add_argument("-c", "--chrom", dest="chrom_sizes", required=True, help="Enter ucsc chrom sizes file to be processed.") + parser.add_argument("-p", "--peaks", dest="macs_peaks", required=True, help="Enter MACS peaks bed file.") + parser.add_argument("-s", "--summit", dest="macs_summits", required=False, help="Enter MACS summit bed file.") + + options = parser.parse_args() + + # Stage 1 + # Extract genomic regions of interest + gtf_to_genes(options.chrom_sizes, options.gtf) + + #Peaks filecheck + checkcolumns = sum(1 for _ in open(options.macs_peaks, 'r')) + if checkcolumns <= 0: sys.exit("ERROR :\tNo peaks were identified\n") + + # Turn 4 column bed to 5 (for SICER peaks: include unique id) + with open(options.macs_peaks, 'r') as peaks_file: + numberofcolumns = len(peaks_file.readline().strip().split('\t')) + fifthcol_value = next(peaks_file).split('\t')[4].strip() #FDRisland from SICER produce a fifth empty column + + new_macs_peaks = options.macs_peaks + + if numberofcolumns == 4 or len(fifthcol_value) < 1: + new_macs_peaks = options.macs_peaks + ".tempPrefix" + # command = "cat " + options.macs_peaks + ' | awk -F\\\\t ' + "'" + \ + # '{print $1 "\\t" $2 "\\t" $3 "\\t" $1":"$2"-"$3 "\\t" $4}' + \ + # "' > tempPrefix-new.macs_peaks.bed" + # print(command) + # os.system(command) + macs_peaks_output = open(new_macs_peaks, 'w') + macs_peaks_input = open(options.macs_peaks,'r') + macs_peaks_count = 1 + for line in macs_peaks_input: + line = line.strip().split('\t') + new_peaks_id = "SICERpeak_" + str(macs_peaks_count) + results = ("{0}\t{1}\t{2}\n".format("\t".join(line[0:3]), new_peaks_id, line[3])) + macs_peaks_output.write(results) + macs_peaks_count += 1 + macs_peaks_output.close() + + # Convert TSS to compatible bedtools (closestBed) format and obtain center of peaks coordinate + command = "cut -f1,4,5,9,10 annotation/TSS-region_genes.gff | sort -k1,1 -k2,2n " + \ + "> tempPrefix-sorted.TSS; " + \ + "cat " + new_macs_peaks + ' | awk -F\\\\t ' + "'" + \ + '{print $1 "\\t" int(($2+$3)/2) "\\t" int(($2+$3)/2) "\\t" $4 "\\t" $5}' + \ + "' | sort -k1,1 -k2,2n > tempPrefix-centerofpeaks.bed;" + \ + "closestBed -t all -a tempPrefix-centerofpeaks.bed -b tempPrefix-sorted.TSS" + \ + "> closest.tempPrefix" + os.system(command) + + # Stage 2 + # Annotate peaks based on previously extracted genomic coordinates + for region in ("promoter", "genebody", "window"): + command = 'intersectBed -b ' + new_macs_peaks + ' -a annotation/' + region + \ + '-region_genes.gff -wb | cut -f1,4,5,9,10,11,12,13,14,15 |' + "awk -F\\\\t '" + \ + '{print $6 "\\t" $7 "\\t" $8 "\\t" $9 "\\t" $10 "\\t" $1 "\\t" $2 "\\t" $3 ' + \ + '"\\t" $4 "\\t" $5}' + \ + "' > " + region + ".tempPrefix" + os.system(command) + + # Convert annotated regions to tab delimited file and include summit location + for region in ("promoter", "genebody", "window", "closest"): + region_file = region + ".tempPrefix" + output_file = "peaks_within_" + region + ".regions.txt" + if options.macs_summits: + annotate_regions(region_file, output_file, options.macs_summits) + else: + annotate_regions(region_file, output_file) + + command = "mv peaks_within_closest.regions.txt centerofpeaks_closest.regions.txt; \ + cut -f 9,10 annotation/original_genes.gff > tempPrefix.genes.names" + os.system(command) + + # Stage 3 + # Binary matrix of all genes + include_genes(new_macs_peaks, "peaks_compared_regions.txt") + + command = "rm -rf *tempPrefix*" + os.system(command) + + +if __name__ == "__main__": + main() From 6b3967b1e05cb6ba577ca8478c04af778395d9de Mon Sep 17 00:00:00 2001 From: Modupe Date: Tue, 16 Apr 2024 17:00:13 -0500 Subject: [PATCH 2/5] update README.md --- README.md | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2f19a5c..7d0c8f4 100755 --- a/README.md +++ b/README.md @@ -1,33 +1,41 @@ ======================== PEAKS ANNOTATION ====================== -1) USAGE -Executable anywhere as long as the PATHTO is correctly specified. +1) DESCRIPTION + +Genic annotation of peaks including promoters, gene bodies, gene-centric windows, and proximal genes. + +PEAKSANNO generates the genic annotation of peaks at promoters, gene bodies and gene-centric windows. +Annotated regions are collated to provide a binary overview of proximal genes, and the peaks occupancy +percentages are graphically presented in a bar plot. +This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/) + + +1) USAGE. ``` - PEAKSANNO-main.sh ["MACS PEAK bed"] ["MACS Summit bed"] ["GTF"] ["CHROM SIZES"] + peaksanno.py -p ["MACS PEAK bed"] -s ["MACS Summit bed"] -g ["GTF"] -c ["CHROM SIZES"] ``` - 1) INPUT The package requires the following files - - MACS peaks bed file - - MACS peaks summit file - - GTF having either gene or transcript (prefered) annotation + - Peaks bed file + - GTF/GFF/GFF3 having either gene or transcript (prefered) annotation - UCSC provided chrom.sizes file +Optional summit file from MACS can be included 1) OUTPUT 1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__ - 1) peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__ - 1) peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__ - 1) peaks overlapping windows in __*peaks_within_window.regions.txt*__ - 1) peaks identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.peaks.txt*__ - 1) genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__ - 1) distribution graphs in __*peaks_compared_regions.distribution.pdf*__ + 1) Peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__ + 1) Peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__ + 1) Peaks overlapping windows in __*peaks_within_window.regions.txt*__ + 1) Peaks identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.peaks.txt*__ + 1) Genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__ + 1) Distribution graphs in __*peaks_compared_regions.distribution.pdf*__ 1) DEFINITIONS - TSS: TSS (transcription start site) - promoters: 1kb +/- TSS - - window: 10kb upstream --> 3kb downstream of genebody - - genebody: 1kb upstream of TSS --> TES (transcription end site) + - window: 10kb upstream to 3kb downstream of genebody + - genebody: 1kb upstream of TSS to TES (transcription end site) 1) DEPENDENCIES * bedtools From f94e07ae0c9da3ff56f9893a0817a6dd69ef98bb Mon Sep 17 00:00:00 2001 From: Modupe Date: Tue, 16 Apr 2024 17:06:50 -0500 Subject: [PATCH 3/5] update README.md --- README.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7d0c8f4..2a2888e 100755 --- a/README.md +++ b/README.md @@ -1,27 +1,26 @@ -======================== PEAKS ANNOTATION ====================== +#PEAKS ANNOTATION -1) DESCRIPTION +## DESCRIPTION Genic annotation of peaks including promoters, gene bodies, gene-centric windows, and proximal genes. PEAKSANNO generates the genic annotation of peaks at promoters, gene bodies and gene-centric windows. Annotated regions are collated to provide a binary overview of proximal genes, and the peaks occupancy percentages are graphically presented in a bar plot. -This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/) +This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/). - -1) USAGE. +## USAGE. ``` peaksanno.py -p ["MACS PEAK bed"] -s ["MACS Summit bed"] -g ["GTF"] -c ["CHROM SIZES"] ``` -1) INPUT +## INPUT The package requires the following files - Peaks bed file - GTF/GFF/GFF3 having either gene or transcript (prefered) annotation - UCSC provided chrom.sizes file Optional summit file from MACS can be included -1) OUTPUT +## OUTPUT 1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__ 1) Peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__ @@ -31,14 +30,20 @@ Optional summit file from MACS can be included 1) Genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__ 1) Distribution graphs in __*peaks_compared_regions.distribution.pdf*__ -1) DEFINITIONS +## DEFINITIONS - TSS: TSS (transcription start site) - promoters: 1kb +/- TSS - window: 10kb upstream to 3kb downstream of genebody - genebody: 1kb upstream of TSS to TES (transcription end site) -1) DEPENDENCIES +## DEPENDENCIES * bedtools * python3 -================================================================= + +##Contact + +Please use the GitHub issues page for reporting any issues / suggestions (recommended). + +Alternatively, you can e-mail Modupe Adetunji + From 5d5ae84f98b7a731f7a37b947014a4cd49241cc0 Mon Sep 17 00:00:00 2001 From: Modupe Date: Tue, 16 Apr 2024 17:11:14 -0500 Subject: [PATCH 4/5] update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a2888e..c6e3e60 100755 --- a/README.md +++ b/README.md @@ -40,8 +40,12 @@ Optional summit file from MACS can be included * bedtools * python3 +## CITATION -##Contact +If you are using *PEAKSANNO*, please cite it's parent paper : +Adetunji, M.O., Abraham, B.J. SEAseq: a portable and cloud-based chromatin occupancy analysis suite. BMC Bioinformatics 23, 77 (2022). https://doi.org/10.1186/s12859-022-04588-z + +## CONTACT Please use the GitHub issues page for reporting any issues / suggestions (recommended). From 35ad944c19473d765d049454e263d629990fcf02 Mon Sep 17 00:00:00 2001 From: Modupeore Adetunji <53270089+madetunj@users.noreply.github.com> Date: Tue, 16 Apr 2024 18:36:48 -0500 Subject: [PATCH 5/5] Update README.md --- README.md | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index c6e3e60..4d4d90e 100755 --- a/README.md +++ b/README.md @@ -1,53 +1,53 @@ -#PEAKS ANNOTATION +# PEAKS ANNOTATION ## DESCRIPTION Genic annotation of peaks including promoters, gene bodies, gene-centric windows, and proximal genes. -PEAKSANNO generates the genic annotation of peaks at promoters, gene bodies and gene-centric windows. -Annotated regions are collated to provide a binary overview of proximal genes, and the peaks occupancy +**PeaksAnoo** provides the genic annotation of peaks at promoters, gene bodies, and gene-centric windows. +Annotated regions are collated to give a binary overview of proximal genes, and the peak occupancy percentages are graphically presented in a bar plot. This is a part of the major analysis for ChIPSeq or Cut&Run-Seq using our comprehensive pipeline [SEASEQ](https://github.com/stjude/seaseq/). -## USAGE. - ``` - peaksanno.py -p ["MACS PEAK bed"] -s ["MACS Summit bed"] -g ["GTF"] -c ["CHROM SIZES"] - ``` +## USAGE +``` bash +peaksanno.py -p ["PEAK bed"] [-s ["MACS Summit bed"]] -g ["GTF"] -c ["CHROM SIZES"] +``` ## INPUT -The package requires the following files - - Peaks bed file - - GTF/GFF/GFF3 having either gene or transcript (prefered) annotation - - UCSC provided chrom.sizes file -Optional summit file from MACS can be included +The script requires the following files: +1. Peaks bed file +2. GTF/GFF/GFF3 having either gene or transcript (preferred) annotation +3. UCSC provided chromosomal sizes file -## OUTPUT +An optional summit file from MACS can be included - 1) TSS nearest the center of peaks in __*centerofpeaks_closest.regions.txt*__ - 1) Peaks overlapping genes regions in __*peaks_within_genebody.regions.txt*__ - 1) Peaks overlapping promoters in __*peaks_within_promoter.regions.txt*__ - 1) Peaks overlapping windows in __*peaks_within_window.regions.txt*__ - 1) Peaks identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.peaks.txt*__ - 1) Genes identified in previous overlapping regions and comparison of all regions in __*peaks_compared_regions.genes.txt*__ - 1) Distribution graphs in __*peaks_compared_regions.distribution.pdf*__ +## OUTPUT +PeaksAnno genes the following files: +1. TSS nearest the center of peaks in **centerofpeaks_closest.regions.txt**. +2. Peaks overlapping genes regions in **peaks_within_genebody.regions.txt**. +3. Peaks overlapping promoters in **peaks_within_promoter.regions.txt**. +4. Peaks overlapping windows in **peaks_within_window.regions.txt**. +5. Peaks identified in previous overlapping regions and comparison of all regions in **peaks_compared_regions.peaks.txt**. +6. Genes identified in previous overlapping regions and comparison of all regions in **peaks_compared_regions.genes.txt**. +7. Distribution graphs in **peaks_compared_regions.distribution.pdf**. ## DEFINITIONS - TSS: TSS (transcription start site) - promoters: 1kb +/- TSS - - window: 10kb upstream to 3kb downstream of genebody + - window: 10kb upstream to 3kb downstream of the gene locus - genebody: 1kb upstream of TSS to TES (transcription end site) ## DEPENDENCIES - * bedtools - * python3 +1. bedtools +2. python3 ## CITATION - -If you are using *PEAKSANNO*, please cite it's parent paper : +If you are using ***PEAKSANNO***, please cite its parent paper : Adetunji, M.O., Abraham, B.J. SEAseq: a portable and cloud-based chromatin occupancy analysis suite. BMC Bioinformatics 23, 77 (2022). https://doi.org/10.1186/s12859-022-04588-z ## CONTACT -Please use the GitHub issues page for reporting any issues / suggestions (recommended). +Please use the GitHub issues page for reporting any issues/suggestions (recommended). Alternatively, you can e-mail Modupe Adetunji