From 2d3ae1b404bf0e17d7cbd8c608e7ab3a1a766ab2 Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Thu, 18 Jan 2024 17:10:29 +0300
Subject: [PATCH 1/6] community params += --ignore-genes-longer-than

---
 anvio/__init__.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/anvio/__init__.py b/anvio/__init__.py
index 709ebeaaff..4de8656665 100644
--- a/anvio/__init__.py
+++ b/anvio/__init__.py
@@ -1397,6 +1397,20 @@ def TABULATE(table, header, numalign="right", max_width=0):
              'type': str,
              'help': "Characters to separate things (the default is whatever is most suitable)."}
                 ),
+    'ignore-genes-longer-than': (
+            ['--ignore-genes-longer-than'],
+            {'metavar': 'MAX_LENGTH',
+             'default': 0,
+             'type': int,
+             'help': "In some cases the gene calling step can identify open reading frames that span across extremely long "
+                     "stretches of genomes. Such mistakes can lead to downstream issues, especially when concatenate flag "
+                     "is used, including faliure to align genes. This flag allows anvi'o to ignore extremely long gene calls to "
+                     "avoid unintended issues (i.e., during phylogenomic analyses). If you use this flag, please carefully "
+                     "examine the output messages from the program to see which genes are removed from the analysis. "
+                     "Please note that the length parameter considers the nucleotide lenght of the open reading frame, "
+                     "even if you asked for amino acid sequences to be returned. Setting this parameter to small values, "
+                     "such as less than 10000 nucleotides may lead to the removal of geniune genes, so please use it carefully."}
+                ),
     'align-with': (
             ['--align-with'],
             {'metavar': 'ALIGNER',

From 09e49dd3838465d6c97ef7faee1ae8e95b399605 Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Thu, 18 Jan 2024 17:11:13 +0300
Subject: [PATCH 2/6] new param for `anvi-get-sequences-for-hmm-hits`

---
 bin/anvi-get-sequences-for-hmm-hits | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits
index 57e621ed0e..2e36f8cbb8 100755
--- a/bin/anvi-get-sequences-for-hmm-hits
+++ b/bin/anvi-get-sequences-for-hmm-hits
@@ -323,6 +323,7 @@ if __name__ == '__main__':
 
     groupG = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
     groupG.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
+    groupG.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than'))
     groupG.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
     groupG.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
 

From 510e4cdd92ef33bddafea5cb16d1b4408ac93bbb Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Thu, 18 Jan 2024 17:11:52 +0300
Subject: [PATCH 3/6] fixy

---
 bin/anvi-get-sequences-for-hmm-hits | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits
index 2e36f8cbb8..2600882554 100755
--- a/bin/anvi-get-sequences-for-hmm-hits
+++ b/bin/anvi-get-sequences-for-hmm-hits
@@ -321,11 +321,11 @@ if __name__ == '__main__':
                                   program with `--concatenate-genes` flag). The default is "XXX" for amino\
                                   acid sequences, and "NNN" for DNA sequences'}))
 
-    groupG = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
-    groupG.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
-    groupG.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than'))
-    groupG.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
-    groupG.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
+    groupH = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
+    groupH.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
+    groupH.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than'))
+    groupH.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
+    groupH.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
 
     args = parser.get_args(parser)
 

From 32fb0a53fe33183721910a6f3a907071981b1f0a Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Thu, 18 Jan 2024 17:14:14 +0300
Subject: [PATCH 4/6] a new function to filter genes based on max length

---
 anvio/hmmops.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/anvio/hmmops.py b/anvio/hmmops.py
index 7d0fb41057..02280e7c15 100644
--- a/anvio/hmmops.py
+++ b/anvio/hmmops.py
@@ -530,6 +530,61 @@ def get_num_genes_missing_per_bin_dict(self, hmm_sequences_dict_for_splits, gene
         return num_genes_missing_per_bin
 
 
+    def filter_hmm_sequences_dict_for_genes_that_are_too_long(self, hmm_sequences_dict_for_splits, ignore_genes_longer_than=0):
+        """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that are too long"""
+
+        if not isinstance(ignore_genes_longer_than, int):
+            raise ConfigError("The `--ignore-genes-longer-than` expects an integer argument :/")
+
+        # we will keep track of these bad bois
+        gene_calls_removed = set([])
+
+        # we identify entry ids that describe genes that are too long for removal
+        entry_ids_to_remove = set([])
+        for entry_id in hmm_sequences_dict_for_splits:
+            if hmm_sequences_dict_for_splits[entry_id]['length'] > ignore_genes_longer_than:
+                entry_ids_to_remove.add(entry_id)
+                gene_calls_removed.add(hmm_sequences_dict_for_splits[entry_id]['gene_callers_id'])
+
+        # we return early if there is nothing to be done
+        if not len(entry_ids_to_remove):
+            self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your "
+                             f"HMM hits. But none of the gene calls were longer than that value, so you get to keep everything.",
+                             header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏")
+            return (hmm_sequences_dict_for_splits, set([]), set([]))
+
+        # if we are here, it means there are things to be gotten rid of. we will remove things
+        # while keeping the user informed.
+        self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your "
+                         f"HMM hits. There were a total of {len(entry_ids_to_remove)} HMM hits that matched to gene calls "
+                         f"that were longer than {ignore_genes_longer_than}, and they are now removed from your analysis. "
+                         f"The following lines list all these gene calls, their length, and which model they belonged.",
+                         header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏", lc='yellow')
+
+        # before we actually start removing stuff, we first learn all bin names that are in the master dict
+        bin_names_in_original_dict = set([])
+        for entry in hmm_sequences_dict_for_splits.values():
+            bin_names_in_original_dict.add(entry['bin_id'])
+
+        # puts in business socks
+        for entry_id in entry_ids_to_remove:
+            e = hmm_sequences_dict_for_splits[entry_id]
+            self.run.info_single(f"Source: {e['source']} / Model: {e['gene_name']} /  Bin: {e['bin_id']} / "
+                                 f"Gene call: {e['gene_callers_id']} / Length: {e['length']}",
+                                 cut_after=None, level=2)
+            hmm_sequences_dict_for_splits.pop(entry_id)
+
+        # now we're done, and we will take another look at the dict to figure out remaining bins
+        bin_names_in_filtered_dict = set([])
+        for entry in hmm_sequences_dict_for_splits.values():
+            bin_names_in_filtered_dict.add(entry['bin_id'])
+
+        # bins we lost
+        bins_removed = bin_names_in_original_dict.difference(bin_names_in_filtered_dict)
+
+        return (hmm_sequences_dict_for_splits, gene_calls_removed, bins_removed)
+
+
     def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None):
         """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occurs across bins.
 

From 438b3cea51c9db171fae9ed4175159b58f7a937c Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Thu, 18 Jan 2024 17:14:29 +0300
Subject: [PATCH 5/6] connect everything together

---
 bin/anvi-get-sequences-for-hmm-hits | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits
index 2600882554..9b858e2b99 100755
--- a/bin/anvi-get-sequences-for-hmm-hits
+++ b/bin/anvi-get-sequences-for-hmm-hits
@@ -184,6 +184,16 @@ def main(args):
 
         run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))
 
+    if args.ignore_genes_longer_than > 0:
+        hmm_sequences_dict, gene_calls_removed, bins_removed = s.filter_hmm_sequences_dict_for_genes_that_are_too_long(hmm_sequences_dict, int(args.ignore_genes_longer_than))
+        CHK()
+
+        if len(bins_removed):
+            bins_removed_for_any_reason.update(bins_removed)
+
+        if len(gene_calls_removed):
+            run.info('Filtered hits', '%d hits remain after filtering for %d genes longer than %d' % (len(hmm_sequences_dict), len(gene_calls_removed), args.ignore_genes_longer_than), nl_before=1)
+
     if args.max_num_genes_missing_from_bin is not None:
         hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(hmm_sequences_dict, gene_names, int(args.max_num_genes_missing_from_bin))
         CHK()

From 2182b67ba4d9d6265cf4be9f61ed3e28af174cf1 Mon Sep 17 00:00:00 2001
From: "A. Murat Eren" <a.murat.eren@gmail.com>
Date: Thu, 18 Jan 2024 17:18:08 +0300
Subject: [PATCH 6/6] fixy

---
 anvio/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/anvio/__init__.py b/anvio/__init__.py
index 4de8656665..d7fa4f4c4f 100644
--- a/anvio/__init__.py
+++ b/anvio/__init__.py
@@ -1404,12 +1404,12 @@ def TABULATE(table, header, numalign="right", max_width=0):
              'type': int,
              'help': "In some cases the gene calling step can identify open reading frames that span across extremely long "
                      "stretches of genomes. Such mistakes can lead to downstream issues, especially when concatenate flag "
-                     "is used, including faliure to align genes. This flag allows anvi'o to ignore extremely long gene calls to "
+                     "is used, including failure to align genes. This flag allows anvi'o to ignore extremely long gene calls to "
                      "avoid unintended issues (i.e., during phylogenomic analyses). If you use this flag, please carefully "
                      "examine the output messages from the program to see which genes are removed from the analysis. "
                      "Please note that the length parameter considers the nucleotide lenght of the open reading frame, "
                      "even if you asked for amino acid sequences to be returned. Setting this parameter to small values, "
-                     "such as less than 10000 nucleotides may lead to the removal of geniune genes, so please use it carefully."}
+                     "such as less than 10000 nucleotides may lead to the removal of genuine genes, so please use it carefully."}
                 ),
     'align-with': (
             ['--align-with'],