Merge pull request #158 from CostaLab/develop
RGT-0.13.1
lzj1769 authored Aug 20, 2020
2 parents c40ae5e + 4da7433 commit 2c43b84
Showing 12 changed files with 366 additions and 198 deletions.
6 changes: 4 additions & 2 deletions rgt/GenomicRegionSet.py
@@ -1135,12 +1135,13 @@ def window(self, y, adding_length=1000):
return extended_self.intersect(y)

def subtract(self, y, whole_region=False, merge=True, exact=False):
"""Return a GenomicRegionSet excluded the overlapping regions with y.
"""Return a copy of this GenomicRegionSet minus all the regions overlapping with y.
*Keyword arguments:*
- y -- the GenomicRegionSet which to subtract by
- whole_region -- subtract the whole region, not partially
- merge -- before subtracting, it merges any overlapping sequence within self or y
- exact -- only regions which match exactly with a region within y are subtracted
if set, whole_region and merge are completely ignored
if set, the returned GRS is sorted and does not contain duplicates
@@ -1273,10 +1274,11 @@ def subtract(self, y, whole_region=False, merge=True, exact=False):
if len(self) == 0 or len(y) == 0:
return self

- # If there is overlap within self or y, they should be merged first.
if not self.sorted:
self.sort()

+ # if there is overlap within self or y, and the `merge` option is set,
+ # we merge any overlapping sequence and create two different GenomicRegionSets
if merge:
a = self.merge(w_return=True)
b = y.merge(w_return=True)
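For context, a minimal usage sketch of the revised subtract() (not part of the commit; the BED file names and region sets are hypothetical):

```python
# Minimal sketch of GenomicRegionSet.subtract(); file names are hypothetical.
from rgt.GenomicRegionSet import GenomicRegionSet

peaks = GenomicRegionSet("peaks")
peaks.read("peaks.bed")

blacklist = GenomicRegionSet("blacklist")
blacklist.read("blacklist.bed")

# Default behaviour: remove only the overlapping portions of each region.
trimmed = peaks.subtract(blacklist)

# whole_region=True: drop any region that overlaps a blacklisted region at all.
filtered = peaks.subtract(blacklist, whole_region=True)

# exact=True (per the new docstring): whole_region and merge are ignored, only
# regions identical to a blacklist entry are removed, and the result is sorted
# and duplicate-free.
deduped = peaks.subtract(blacklist, exact=True)
```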
50 changes: 38 additions & 12 deletions rgt/HINT/DifferentialAnalysis.py
@@ -10,6 +10,7 @@

from scipy.stats import zscore
from scipy.stats import norm
from scipy import stats
from argparse import SUPPRESS

from multiprocessing import Pool, cpu_count
@@ -64,6 +65,8 @@ def diff_analysis_args(parser):
help="The prefix for results files. DEFAULT: differential")
parser.add_argument("--standardize", action="store_true", default=False,
help="If set, the signal will be rescaled to (0, 1) for plotting.")
parser.add_argument("--no-lineplots", default=False, action='store_true',
help="If set, the footprint line plots will not be generated. DEFAULT: False")
parser.add_argument("--output-profiles", default=False, action='store_true',
help="If set, the footprint profiles will be writen into a text, in which each row is a "
"specific instance of the given motif. DEFAULT: False")
@@ -205,7 +208,7 @@ def diff_analysis_run(args):

print("signal generation is done!\n")

- # compute normalization facotr for each condition
+ # compute normalization factor for each condition
factors = compute_factors(signals)
output_factor(args, factors, conditions)

@@ -217,18 +220,19 @@ def diff_analysis_run(args):
if args.output_profiles:
output_profiles(mpbs_name_list, signals, conditions, args.output_location)

print("generating line plot for each motif...\n")
if args.nc == 1:
for i, mpbs_name in enumerate(mpbs_name_list):
output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
else:
with Pool(processes=args.nc) as pool:
arguments_list = list()
if not args.no_lineplots:
print("generating line plot for each motif...\n")
if args.nc == 1:
for i, mpbs_name in enumerate(mpbs_name_list):
arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
pool.map(output_line_plot, arguments_list)
output_line_plot((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
else:
with Pool(processes=args.nc) as pool:
arguments_list = list()
for i, mpbs_name in enumerate(mpbs_name_list):
arguments_list.append((mpbs_name, motif_num[i], signals[:, i, :], conditions, motif_pwm[i], output_location,
args.window_size, colors))
pool.map(output_line_plot, arguments_list)

ps_tc_results = list()
for i, mpbs_name in enumerate(mpbs_name_list):
@@ -269,6 +273,8 @@ def get_raw_signal(arguments):
if p1 <= cut_site < p2:
signal[cut_site - p1] += 1.0

signal = smooth(signal)

return signal


@@ -296,6 +302,8 @@ def get_bc_signal(arguments):
# smooth the signal
signal = np.add(signal, np.array(_signal))

signal = smooth(signal)

return signal


@@ -630,3 +638,21 @@ def output_profiles(mpbs_name_list, signals, conditions, output_location):
output_filename = os.path.join(output_location, "{}_{}.txt".format(condition, mpbs_name))
with open(output_filename, "w") as f:
f.write("\t".join(map(str, signals[i][j])) + "\n")


def smooth(signal, window_size=5, rank=5):
k = len(signal)
zscore = stats.zscore(signal)
order = zscore.argsort()
ranks = order.argsort()

signal[ranks > (k - rank)] = np.nan

smooth_signal = np.zeros(k)

for i in range(k):
a = max(0, i - round(window_size / 2))
b = min(k, ceil(i + window_size / 2))
smooth_signal[i] = np.nanmean(signal[a:b])

return smooth_signal
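In short, the new smooth() helper masks the positions with the highest z-scores (treated as outliers) as NaN and then takes a NaN-aware moving average of width window_size; it assumes ceil is available in the module namespace (e.g. from math). A hedged usage sketch with a made-up profile:

```python
# Hedged sketch of calling the new helper on a made-up cut-site profile.
import numpy as np

profile = np.array([2.0, 3.0, 2.0, 2.0, 80.0, 3.0, 2.0, 3.0, 2.0, 2.0])

# smooth() writes NaN into its input in place before averaging, so pass a copy
# if the raw profile is still needed. The spike at index 4 is masked and the
# remaining values are averaged over a 5-position window.
smoothed = smooth(profile.copy())
print(smoothed.round(2))
```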
3 changes: 1 addition & 2 deletions rgt/HINT/Footprinting.py
@@ -853,8 +853,7 @@ def post_processing(footprints, original_regions, fp_min_size, fp_ext, genome_da
footprints_overlap.write(output_file_name)

# the number of reads
- lines = pysam.idxstats(reads_file.file_name).splitlines()
- num_reads = sum(map(int, [x.split("\t")[2] for x in lines if not x.startswith("#")]))
+ num_reads = pysam.AlignmentFile(reads_file.file_name).count(until_eof=True)

# the number of peaks and tag count within peaks
num_peaks = 0
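The practical difference between the two counting approaches, as a sketch (not from the commit; the BAM path is hypothetical):

```python
# Sketch: two ways of counting reads in a BAM with pysam; "reads.bam" is hypothetical.
import pysam

bam_path = "reads.bam"

# Old approach: sum the mapped-read column of the idxstats output.
# Requires an index and counts mapped reads only.
lines = pysam.idxstats(bam_path).splitlines()
mapped_reads = sum(int(x.split("\t")[2]) for x in lines if not x.startswith("#"))

# New approach: iterate the whole file; until_eof=True also includes unmapped
# reads and needs no index, at the cost of a full scan.
total_reads = pysam.AlignmentFile(bam_path).count(until_eof=True)
```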
13 changes: 2 additions & 11 deletions rgt/Util.py
@@ -90,8 +90,7 @@ def __init__(self, organism):
self.organism = organism
self.genome = os.path.join(self.data_dir, self.config.get(organism, 'genome'))
self.chromosome_sizes = os.path.join(self.data_dir, self.config.get(organism, 'chromosome_sizes'))
- self.genes_gencode = os.path.join(self.data_dir, self.config.get(organism, 'genes_Gencode'))
- self.genes_refseq = os.path.join(self.data_dir, self.config.get(organism, 'genes_RefSeq'))
+ self.gene_regions = os.path.join(self.data_dir, self.config.get(organism, 'gene_regions'))
self.annotation = os.path.join(self.data_dir, self.config.get(organism, 'annotation'))
self.annotation_dump_dir = os.path.dirname(os.path.join(self.data_dir, self.annotation))
self.gene_alias = os.path.join(self.data_dir, self.config.get(organism, 'gene_alias'))
@@ -114,15 +113,7 @@ def get_chromosome_sizes(self):

def get_gene_regions(self):
"""Returns the current path to the gene_regions BED file."""
- return self.genes_gencode
-
- def get_genes_gencode(self):
- """Returns the current path to the gene_regions BED file."""
- return self.genes_gencode
-
- def get_genes_refseq(self):
- """Returns the current path to the gene_regions BED file."""
- return self.genes_refseq
+ return self.gene_regions

def get_annotation(self):
"""
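The net effect on callers, as a sketch (not from the commit; the organism name is illustrative and assumes the RGT data folder is configured):

```python
# Sketch: GenomeData now exposes a single configured gene-regions file.
from rgt.Util import GenomeData

genome_data = GenomeData("hg38")             # organism name is illustrative
bed_path = genome_data.get_gene_regions()    # path from the 'gene_regions' config key

# get_genes_gencode() and get_genes_refseq() no longer exist; __init__ reads the
# 'gene_regions' key instead of 'genes_Gencode' / 'genes_RefSeq'.
```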
2 changes: 1 addition & 1 deletion rgt/__version__.py
@@ -1 +1 @@
__version__ = "0.13.0"
__version__ = "0.13.1"
40 changes: 26 additions & 14 deletions rgt/motifanalysis/Enrichment.py
@@ -135,18 +135,27 @@ def main(args):
output_location = args.output_location
else:
output_location = os.path.join(os.getcwd(), enrichment_folder_name)
print(">> output location:", output_location)

# Matching folder
if args.matching_location:
match_location = args.matching_location
else:
match_location = os.path.join(os.getcwd(), matching_folder_name)
print(">> match location:", match_location)
print()

# Default genomic data
genome_data = GenomeData(args.organism)
print(">> genome:", genome_data.organism)
print()

# the matching directory must exist, for obvious reasons
if not os.path.isdir(match_location):
err.throw_error("ME_MATCH_NOTFOUND")

# Background file must exist
print(">> loading background file..")
background_original_filename = background_filename
if not os.path.isfile(background_filename):
err.throw_error("DEFAULT_ERROR", add_msg="Background file does not exist or is not readable.")
@@ -159,9 +168,10 @@ def main(args):

# Background MPBS file must exist
path, ext = os.path.splitext(background_filename)
background_name = os.path.basename(path)

# first we check at matching folder location
- background_mpbs_filename = os.path.join(match_location, os.path.basename(path) + "_mpbs" + ext)
+ background_mpbs_filename = os.path.join(match_location, background_name + "_mpbs" + ext)

if not os.path.isfile(background_mpbs_filename):
# if not found, we search at background file location
@@ -184,34 +194,39 @@ def main(args):
else:
err.throw_error("DEFAULT_ERROR", add_msg="Background MPBS file must be in either BED or BigBed format.")

- # Default genomic data
- genome_data = GenomeData(args.organism)

- print(">> genome:", genome_data.organism)
+ background = GenomicRegionSet("background")
+ background.read(background_filename)
+ print(">>> ", background_name, ", ", len(background), " regions", sep="")
+ print()

# Load motif_set (includes MotifData object), is either customized or default
if args.motif_dbs:
print(">> loading external motif databases..")
# args.motif_dbs is a list of paths to pwm files
motif_set = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)

# filter for dbs only if --motif_dbs is not set
if 'database' in filter_values:
del filter_values['database']
else:
print(">> loading motif databases..")
if 'database' in filter_values:
motif_set = MotifSet(preload_motifs=filter_values['database'])
else:
motif_set = MotifSet(preload_motifs="default")

print(">> used database(s):", ",".join([str(db) for db in motif_set.motif_data.repositories_list]))
for db in motif_set.motif_data.get_repositories_list():
print(">>>", db)
print()

# applying filtering pattern, taking a subset of the motif set
if args.filter:
print(">> applying motif filter..")
motif_set = motif_set.filter(filter_values, search=args.filter_type)

motif_names = list(motif_set.motifs_map.keys())

print(">> motifs loaded:", len(motif_names))
print()

# Default image data
image_data = ImageData()
@@ -245,10 +260,12 @@ def main(args):
del exp_matrix

print(">> experimental matrix loaded")
print()

except Exception:
err.throw_error("MM_WRONG_EXPMAT")
elif input_files:
print(">> loading input files..")
# get input files, if available
for input_filename in input_files:
name, _ = os.path.splitext(os.path.basename(input_filename))
Expand All @@ -258,7 +275,8 @@ def main(args):

genomic_regions_dict[name] = regions

print(">> input regions BED files loaded")
print(">>> ", name, ", ", len(regions), " regions", sep="")
print()

if not genomic_regions_dict:
err.throw_error("DEFAULT_ERROR", add_msg="You must either specify an experimental matrix, "
@@ -347,13 +365,9 @@ def main(args):
except Exception:
err.throw_error("ME_OUT_FOLDER_CREATION")

- print()

start = time.time()
print(">> collecting background statistics...", sep="", end="")
sys.stdout.flush()
- background = GenomicRegionSet("background")
- background.read(background_filename)
background_mpbs = GenomicRegionSet("background_mpbs")
background_mpbs.read(background_mpbs_filename)

@@ -395,8 +409,6 @@ def main(args):
link_name = grs.name
sitetest_link_dict[link_name] = os.path.join(link_location, link_name, output_stat_fulltest + ".html")

- print()

# Iterating on each input object
for curr_input in input_list:

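For reference, the background MPBS lookup added earlier in this file resolves the file name roughly like this (a sketch; the paths are hypothetical):

```python
# Sketch: deriving the background MPBS file name; paths are hypothetical.
import os

background_filename = "/project/background.bed"
match_location = "/project/match"

path, ext = os.path.splitext(background_filename)
background_name = os.path.basename(path)     # "background"

background_mpbs_filename = os.path.join(match_location, background_name + "_mpbs" + ext)
# -> "/project/match/background_mpbs.bed"; per the comments in the diff, if this
# file is missing the same name is then searched at the background file's location.
```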