Merge pull request #158 from CostaLab/develop
RGT-0.13.1
lzj1769 authored Aug 20, 2020
2 parents c40ae5e + 4da7433 commit 2c43b84
Showing 12 changed files with 366 additions and 198 deletions.
6 changes: 4 additions & 2 deletions rgt/GenomicRegionSet.py
@@ -1135,12 +1135,13 @@ def window(self, y, adding_length=1000):
return extended_self.intersect(y)

def subtract(self, y, whole_region=False, merge=True, exact=False):
"""Return a GenomicRegionSet excluded the overlapping regions with y.
"""Return a copy of this GenomicRegionSet minus all the regions overlapping with y.
*Keyword arguments:*
- y -- the GenomicRegionSet which to subtract by
- whole_region -- subtract the whole region, not partially
- merge -- before subtracting, it merges any overlapping sequence within self or y
- exact -- only regions which match exactly with a region within y are subtracted
if set, whole_region and merge are completely ignored
if set, the returned GRS is sorted and does not contain duplicates
@@ -1273,10 +1274,11 @@ def subtract(self, y, whole_region=False, merge=True, exact=False):
if len(self) == 0 or len(y) == 0:
return self

- # If there is overlap within self or y, they should be merged first.
if not self.sorted:
self.sort()

+ # if there is overlap within self or y, and the `merge` option is set,
+ # we merge any overlapping sequence and create two different GenomicRegionSets
if merge:
a = self.merge(w_return=True)
b = y.merge(w_return=True)
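For context, a minimal usage sketch of the revised subtract() (not part of the commit; the BED file names and region sets are hypothetical):

```python
# Minimal sketch of GenomicRegionSet.subtract(); file names are hypothetical.
from rgt.GenomicRegionSet import GenomicRegionSet

peaks = GenomicRegionSet("peaks")
peaks.read("peaks.bed")

blacklist = GenomicRegionSet("blacklist")
blacklist.read("blacklist.bed")

# Default behaviour: remove only the overlapping portions of each region.
trimmed = peaks.subtract(blacklist)

# whole_region=True: drop any region that overlaps a blacklisted region at all.
filtered = peaks.subtract(blacklist, whole_region=True)

# exact=True (per the new docstring): whole_region and merge are ignored, only
# regions identical to a blacklist entry are removed, and the result is sorted
# and duplicate-free.
deduped = peaks.subtract(blacklist, exact=True)
```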
50 changes: 38 additions & 12 deletions rgt/HINT/DifferentialAnalysis.py
@@ -10,6 +10,7 @@

from scipy.stats import zscore
from scipy.stats import norm
from scipy import stats
from argparse import SUPPRESS

from multiprocessing import Pool, cpu_count
@@ -64,6 +65,8 @@ def diff_analysis_args(parser):
help="The prefix for results files. DEFAULT: differential")
parser.add_argument("--standardize", action="store_true", default=False,
help="If set, the signal will be rescaled to (0, 1) for plotting.")
parser.add_argument("--no-lineplots", default=False, action='store_true',
help="If set, the footprint line plots will not be generated. DEFAULT: False")
parser.add_argument("--output-profiles", default=False, action='store_true',
help="If set, the footprint profiles will be writen into a text, in which each row is a "
"specific instance of the given motif. DEFAULT: False")
@@ -205,7 +208,7 @@ def diff_analysis_run(args):

print("signal generation is done!\n")

- # compute normalization facotr for each condition
+ # compute normalization factor for each condition
factors = compute_factors(signals)
output_factor(args, factors, conditions)

@@ -217,18 +220,19 @@ def diff_analysis_run(args):
if args.output_profiles:
output_profiles(mpbs_name_list, signals, conditions, args.output_location)

print("generating line plot for each motif...\n")
if args.nc == 1:
for i, mpbs_name in enumerate(mpbs_name_list):
output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
else:
with Pool(processes=args.nc) as pool:
arguments_list = list()
if not args.no_lineplots:
print("generating line plot for each motif...\n")
if args.nc == 1:
for i, mpbs_name in enumerate(mpbs_name_list):
arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
pool.map(output_line_plot, arguments_list)
output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
else:
with Pool(processes=args.nc) as pool:
arguments_list = list()
for i, mpbs_name in enumerate(mpbs_name_list):
arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
pool.map(output_line_plot, arguments_list)

ps_tc_results = list()
for i, mpbs_name in enumerate(mpbs_name_list):
@@ -269,6 +273,8 @@ def get_raw_signal(arguments):
if p1 <= cut_site < p2:
signal[cut_site - p1] += 1.0

signal = smooth(signal)

return signal


@@ -296,6 +302,8 @@ def get_bc_signal(arguments):
# smooth the signal
signal = np.add(signal, np.array(_signal))

signal = smooth(signal)

return signal


@@ -630,3 +638,21 @@ def output_profiles(mpbs_name_list, signals, conditions, output_location):
output_filename = os.path.join(output_location, "{}_{}.txt".format(condition, mpbs_name))
with open(output_filename, "w") as f:
f.write("\t".join(map(str, signals[i][j])) + "\n")


def smooth(signal, window_size=5, rank=5):
k = len(signal)
zscore = stats.zscore(signal)
order = zscore.argsort()
ranks = order.argsort()

signal[ranks > (k - rank)] = np.nan

smooth_signal = np.zeros(k)

for i in range(k):
a = max(0, i - round(window_size / 2))
b = min(k, ceil(i + window_size / 2))
smooth_signal[i] = np.nanmean(signal[a:b])

return smooth_signal
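In short, the new smooth() helper masks the positions with the highest z-scores (treated as outliers) as NaN and then takes a NaN-aware moving average of width window_size; it assumes ceil is available in the module namespace (e.g. from math). A hedged usage sketch with a made-up profile:

```python
# Hedged sketch of calling the new helper on a made-up cut-site profile.
import numpy as np

profile = np.array([2.0, 3.0, 2.0, 2.0, 80.0, 3.0, 2.0, 3.0, 2.0, 2.0])

# smooth() writes NaN into its input in place before averaging, so pass a copy
# if the raw profile is still needed. The spike at index 4 is masked and the
# remaining values are averaged over a 5-position window.
smoothed = smooth(profile.copy())
print(smoothed.round(2))
```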
3 changes: 1 addition & 2 deletions rgt/HINT/Footprinting.py
@@ -853,8 +853,7 @@ def post_processing(footprints, original_regions, fp_min_size, fp_ext, genome_da
footprints_overlap.write(output_file_name)

# the number of reads
- lines = pysam.idxstats(reads_file.file_name).splitlines()
- num_reads = sum(map(int, [x.split("\t")[2] for x in lines if not x.startswith("#")]))
+ num_reads = pysam.AlignmentFile(reads_file.file_name).count(until_eof=True)

# the number of peaks and tag count within peaks
num_peaks = 0
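The practical difference between the two counting approaches, as a sketch (not from the commit; the BAM path is hypothetical):

```python
# Sketch: two ways of counting reads in a BAM with pysam; "reads.bam" is hypothetical.
import pysam

bam_path = "reads.bam"

# Old approach: sum the mapped-read column of the idxstats output.
# Requires an index and counts mapped reads only.
lines = pysam.idxstats(bam_path).splitlines()
mapped_reads = sum(int(x.split("\t")[2]) for x in lines if not x.startswith("#"))

# New approach: iterate the whole file; until_eof=True also includes unmapped
# reads and needs no index, at the cost of a full scan.
total_reads = pysam.AlignmentFile(bam_path).count(until_eof=True)
```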
13 changes: 2 additions & 11 deletions rgt/Util.py
@@ -90,8 +90,7 @@ def __init__(self, organism):
self.organism = organism
self.genome = os.path.join(self.data_dir, self.config.get(organism, 'genome'))
self.chromosome_sizes = os.path.join(self.data_dir, self.config.get(organism, 'chromosome_sizes'))
- self.genes_gencode = os.path.join(self.data_dir, self.config.get(organism, 'genes_Gencode'))
- self.genes_refseq = os.path.join(self.data_dir, self.config.get(organism, 'genes_RefSeq'))
+ self.gene_regions = os.path.join(self.data_dir, self.config.get(organism, 'gene_regions'))
self.annotation = os.path.join(self.data_dir, self.config.get(organism, 'annotation'))
self.annotation_dump_dir = os.path.dirname(os.path.join(self.data_dir, self.annotation))
self.gene_alias = os.path.join(self.data_dir, self.config.get(organism, 'gene_alias'))
@@ -114,15 +113,7 @@ def get_chromosome_sizes(self):

def get_gene_regions(self):
"""Returns the current path to the gene_regions BED file."""
- return self.genes_gencode
-
- def get_genes_gencode(self):
- """Returns the current path to the gene_regions BED file."""
- return self.genes_gencode
-
- def get_genes_refseq(self):
- """Returns the current path to the gene_regions BED file."""
- return self.genes_refseq
+ return self.gene_regions

def get_annotation(self):
"""
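The net effect on callers, as a sketch (not from the commit; the organism name is illustrative and assumes the RGT data folder is configured):

```python
# Sketch: GenomeData now exposes a single configured gene-regions file.
from rgt.Util import GenomeData

genome_data = GenomeData("hg38")             # organism name is illustrative
bed_path = genome_data.get_gene_regions()    # path from the 'gene_regions' config key

# get_genes_gencode() and get_genes_refseq() no longer exist; __init__ reads the
# 'gene_regions' key instead of 'genes_Gencode' / 'genes_RefSeq'.
```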
2 changes: 1 addition & 1 deletion rgt/__version__.py
@@ -1 +1 @@
__version__ = "0.13.0"
__version__ = "0.13.1"
40 changes: 26 additions & 14 deletions rgt/motifanalysis/Enrichment.py
@@ -135,18 +135,27 @@ def main(args):
output_location = args.output_location
else:
output_location = os.path.join(os.getcwd(), enrichment_folder_name)
print(">> output location:", output_location)

# Matching folder
if args.matching_location:
match_location = args.matching_location
else:
match_location = os.path.join(os.getcwd(), matching_folder_name)
print(">> match location:", match_location)
print()

# Default genomic data
genome_data = GenomeData(args.organism)
print(">> genome:", genome_data.organism)
print()

# the matching directory must exist, for obvious reasons
if not os.path.isdir(match_location):
err.throw_error("ME_MATCH_NOTFOUND")

# Background file must exist
print(">> loading background file..")
background_original_filename = background_filename
if not os.path.isfile(background_filename):
err.throw_error("DEFAULT_ERROR", add_msg="Background file does not exist or is not readable.")
@@ -159,9 +168,10 @@ def main(args):

# Background MPBS file must exist
path, ext = os.path.splitext(background_filename)
background_name = os.path.basename(path)

# first we check at matching folder location
- background_mpbs_filename = os.path.join(match_location, os.path.basename(path) + "_mpbs" + ext)
+ background_mpbs_filename = os.path.join(match_location, background_name + "_mpbs" + ext)

if not os.path.isfile(background_mpbs_filename):
# if not found, we search at background file location
@@ -184,34 +194,39 @@ def main(args):
else:
err.throw_error("DEFAULT_ERROR", add_msg="Background MPBS file must be in either BED or BigBed format.")

- # Default genomic data
- genome_data = GenomeData(args.organism)

- print(">> genome:", genome_data.organism)
+ background = GenomicRegionSet("background")
+ background.read(background_filename)
+ print(">>> ", background_name, ", ", len(background), " regions", sep="")
+ print()

# Load motif_set (includes MotifData object), is either customized or default
if args.motif_dbs:
print(">> loading external motif databases..")
# args.motif_dbs is a list of paths to pwm files
motif_set = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)

# filter for dbs only if --motif_dbs is not set
if 'database' in filter_values:
del filter_values['database']
else:
print(">> loading motif databases..")
if 'database' in filter_values:
motif_set = MotifSet(preload_motifs=filter_values['database'])
else:
motif_set = MotifSet(preload_motifs="default")

print(">> used database(s):", ",".join([str(db) for db in motif_set.motif_data.repositories_list]))
for db in motif_set.motif_data.get_repositories_list():
print(">>>", db)
print()

# applying filtering pattern, taking a subset of the motif set
if args.filter:
print(">> applying motif filter..")
motif_set = motif_set.filter(filter_values, search=args.filter_type)

motif_names = list(motif_set.motifs_map.keys())

print(">> motifs loaded:", len(motif_names))
print()

# Default image data
image_data = ImageData()
@@ -245,10 +260,12 @@ def main(args):
del exp_matrix

print(">> experimental matrix loaded")
print()

except Exception:
err.throw_error("MM_WRONG_EXPMAT")
elif input_files:
print(">> loading input files..")
# get input files, if available
for input_filename in input_files:
name, _ = os.path.splitext(os.path.basename(input_filename))
Expand All @@ -258,7 +275,8 @@ def main(args):

genomic_regions_dict[name] = regions

print(">> input regions BED files loaded")
print(">>> ", name, ", ", len(regions), " regions", sep="")
print()

if not genomic_regions_dict:
err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, "
@@ -347,13 +365,9 @@ def main(args):
except Exception:
err.throw_error("ME_OUT_FOLDER_CREATION")

- print()

start = time.time()
print(">> collecting background statistics...", sep="", end="")
sys.stdout.flush()
- background = GenomicRegionSet("background")
- background.read(background_filename)
background_mpbs = GenomicRegionSet("background_mpbs")
background_mpbs.read(background_mpbs_filename)

@@ -395,8 +409,6 @@ def main(args):
link_name = grs.name
sitetest_link_dict[link_name] = os.path.join(link_location, link_name, output_stat_fulltest + ".html")

- print()

# Iterating on each input object
for curr_input in input_list:

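For reference, the background MPBS lookup added earlier in this file resolves the file name roughly like this (a sketch; the paths are hypothetical):

```python
# Sketch: deriving the background MPBS file name; paths are hypothetical.
import os

background_filename = "/project/background.bed"
match_location = "/project/match"

path, ext = os.path.splitext(background_filename)
background_name = os.path.basename(path)     # "background"

background_mpbs_filename = os.path.join(match_location, background_name + "_mpbs" + ext)
# -> "/project/match/background_mpbs.bed"; per the comments in the diff, if this
# file is missing the same name is then searched at the background file's location.
```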