Merge pull request #2201 from merenlab/issue-2200

Filter HMM hits based on length
merenlab · Jan 18, 2024 · 7228a15 · 7228a15
2 parents c40c447 + 2182b67
commit 7228a15
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 4 deletions.
diff --git a/anvio/__init__.py b/anvio/__init__.py
@@ -1397,6 +1397,20 @@ def TABULATE(table, header, numalign="right", max_width=0):
              'type': str,
              'help': "Characters to separate things (the default is whatever is most suitable)."}
                 ),
+    'ignore-genes-longer-than': (
+            ['--ignore-genes-longer-than'],
+            {'metavar': 'MAX_LENGTH',
+             'default': 0,
+             'type': int,
+             'help': "In some cases the gene calling step can identify open reading frames that span across extremely long "
+                     "stretches of genomes. Such mistakes can lead to downstream issues, especially when concatenate flag "
+                     "is used, including failure to align genes. This flag allows anvi'o to ignore extremely long gene calls to "
+                     "avoid unintended issues (i.e., during phylogenomic analyses). If you use this flag, please carefully "
+                     "examine the output messages from the program to see which genes are removed from the analysis. "
+                     "Please note that the length parameter considers the nucleotide lenght of the open reading frame, "
+                     "even if you asked for amino acid sequences to be returned. Setting this parameter to small values, "
+                     "such as less than 10000 nucleotides may lead to the removal of genuine genes, so please use it carefully."}
+                ),
     'align-with': (
             ['--align-with'],
             {'metavar': 'ALIGNER',

diff --git a/anvio/hmmops.py b/anvio/hmmops.py
@@ -530,6 +530,61 @@ def get_num_genes_missing_per_bin_dict(self, hmm_sequences_dict_for_splits, gene
         return num_genes_missing_per_bin
 
 
+    def filter_hmm_sequences_dict_for_genes_that_are_too_long(self, hmm_sequences_dict_for_splits, ignore_genes_longer_than=0):
+        """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that are too long"""
+
+        if not isinstance(ignore_genes_longer_than, int):
+            raise ConfigError("The `--ignore-genes-longer-than` expects an integer argument :/")
+
+        # we will keep track of these bad bois
+        gene_calls_removed = set([])
+
+        # we identify entry ids that describe genes that are too long for removal
+        entry_ids_to_remove = set([])
+        for entry_id in hmm_sequences_dict_for_splits:
+            if hmm_sequences_dict_for_splits[entry_id]['length'] > ignore_genes_longer_than:
+                entry_ids_to_remove.add(entry_id)
+                gene_calls_removed.add(hmm_sequences_dict_for_splits[entry_id]['gene_callers_id'])
+
+        # we return early if there is nothing to be done
+        if not len(entry_ids_to_remove):
+            self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your "
+                             f"HMM hits. But none of the gene calls were longer than that value, so you get to keep everything.",
+                             header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏")
+            return (hmm_sequences_dict_for_splits, set([]), set([]))
+
+        # if we are here, it means there are things to be gotten rid of. we will remove things
+        # while keeping the user informed.
+        self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your "
+                         f"HMM hits. There were a total of {len(entry_ids_to_remove)} HMM hits that matched to gene calls "
+                         f"that were longer than {ignore_genes_longer_than}, and they are now removed from your analysis. "
+                         f"The following lines list all these gene calls, their length, and which model they belonged.",
+                         header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏", lc='yellow')
+
+        # before we actually start removing stuff, we first learn all bin names that are in the master dict
+        bin_names_in_original_dict = set([])
+        for entry in hmm_sequences_dict_for_splits.values():
+            bin_names_in_original_dict.add(entry['bin_id'])
+
+        # puts in business socks
+        for entry_id in entry_ids_to_remove:
+            e = hmm_sequences_dict_for_splits[entry_id]
+            self.run.info_single(f"Source: {e['source']} / Model: {e['gene_name']} /  Bin: {e['bin_id']} / "
+                                 f"Gene call: {e['gene_callers_id']} / Length: {e['length']}",
+                                 cut_after=None, level=2)
+            hmm_sequences_dict_for_splits.pop(entry_id)
+
+        # now we're done, and we will take another look at the dict to figure out remaining bins
+        bin_names_in_filtered_dict = set([])
+        for entry in hmm_sequences_dict_for_splits.values():
+            bin_names_in_filtered_dict.add(entry['bin_id'])
+
+        # bins we lost
+        bins_removed = bin_names_in_original_dict.difference(bin_names_in_filtered_dict)
+
+        return (hmm_sequences_dict_for_splits, gene_calls_removed, bins_removed)
+
+
     def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None):
         """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occurs across bins.
 

diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits
@@ -184,6 +184,16 @@ def main(args):
 
         run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))
 
+    if args.ignore_genes_longer_than > 0:
+        hmm_sequences_dict, gene_calls_removed, bins_removed = s.filter_hmm_sequences_dict_for_genes_that_are_too_long(hmm_sequences_dict, int(args.ignore_genes_longer_than))
+        CHK()
+
+        if len(bins_removed):
+            bins_removed_for_any_reason.update(bins_removed)
+
+        if len(gene_calls_removed):
+            run.info('Filtered hits', '%d hits remain after filtering for %d genes longer than %d' % (len(hmm_sequences_dict), len(gene_calls_removed), args.ignore_genes_longer_than), nl_before=1)
+
     if args.max_num_genes_missing_from_bin is not None:
         hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(hmm_sequences_dict, gene_names, int(args.max_num_genes_missing_from_bin))
         CHK()
@@ -321,10 +331,11 @@ if __name__ == '__main__':
                                   program with `--concatenate-genes` flag). The default is "XXX" for amino\
                                   acid sequences, and "NNN" for DNA sequences'}))
 
-    groupG = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
-    groupG.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
-    groupG.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
-    groupG.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
+    groupH = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
+    groupH.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
+    groupH.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than'))
+    groupH.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
+    groupH.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
 
     args = parser.get_args(parser)