From 2d3ae1b404bf0e17d7cbd8c608e7ab3a1a766ab2 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 18 Jan 2024 17:10:29 +0300 Subject: [PATCH 1/6] community params += --ignore-genes-longer-than --- anvio/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/anvio/__init__.py b/anvio/__init__.py index 709ebeaaff..4de8656665 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -1397,6 +1397,20 @@ def TABULATE(table, header, numalign="right", max_width=0): 'type': str, 'help': "Characters to separate things (the default is whatever is most suitable)."} ), + 'ignore-genes-longer-than': ( + ['--ignore-genes-longer-than'], + {'metavar': 'MAX_LENGTH', + 'default': 0, + 'type': int, + 'help': "In some cases the gene calling step can identify open reading frames that span across extremely long " + "stretches of genomes. Such mistakes can lead to downstream issues, especially when concatenate flag " + "is used, including faliure to align genes. This flag allows anvi'o to ignore extremely long gene calls to " + "avoid unintended issues (i.e., during phylogenomic analyses). If you use this flag, please carefully " + "examine the output messages from the program to see which genes are removed from the analysis. " + "Please note that the length parameter considers the nucleotide lenght of the open reading frame, " + "even if you asked for amino acid sequences to be returned. Setting this parameter to small values, " + "such as less than 10000 nucleotides may lead to the removal of geniune genes, so please use it carefully."} + ), 'align-with': ( ['--align-with'], {'metavar': 'ALIGNER', From 09e49dd3838465d6c97ef7faee1ae8e95b399605 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 18 Jan 2024 17:11:13 +0300 Subject: [PATCH 2/6] new param for `anvi-get-sequences-for-hmm-hits` --- bin/anvi-get-sequences-for-hmm-hits | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits index 57e621ed0e..2e36f8cbb8 100755 --- a/bin/anvi-get-sequences-for-hmm-hits +++ b/bin/anvi-get-sequences-for-hmm-hits @@ -323,6 +323,7 @@ if __name__ == '__main__': groupG = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.") groupG.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit')) + groupG.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than')) groupG.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes')) groupG.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) From 510e4cdd92ef33bddafea5cb16d1b4408ac93bbb Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 18 Jan 2024 17:11:52 +0300 Subject: [PATCH 3/6] fixy --- bin/anvi-get-sequences-for-hmm-hits | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits index 2e36f8cbb8..2600882554 100755 --- a/bin/anvi-get-sequences-for-hmm-hits +++ b/bin/anvi-get-sequences-for-hmm-hits @@ -321,11 +321,11 @@ if __name__ == '__main__': program with `--concatenate-genes` flag). The default is "XXX" for amino\ acid sequences, and "NNN" for DNA sequences'})) - groupG = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.") - groupG.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit')) - groupG.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than')) - groupG.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes')) - groupG.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) + groupH = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.") + groupH.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit')) + groupH.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than')) + groupH.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes')) + groupH.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) args = parser.get_args(parser) From 32fb0a53fe33183721910a6f3a907071981b1f0a Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 18 Jan 2024 17:14:14 +0300 Subject: [PATCH 4/6] a new function to filter genes based on max length --- anvio/hmmops.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/anvio/hmmops.py b/anvio/hmmops.py index 7d0fb41057..02280e7c15 100644 --- a/anvio/hmmops.py +++ b/anvio/hmmops.py @@ -530,6 +530,61 @@ def get_num_genes_missing_per_bin_dict(self, hmm_sequences_dict_for_splits, gene return num_genes_missing_per_bin + def filter_hmm_sequences_dict_for_genes_that_are_too_long(self, hmm_sequences_dict_for_splits, ignore_genes_longer_than=0): + """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that are too long""" + + if not isinstance(ignore_genes_longer_than, int): + raise ConfigError("The `--ignore-genes-longer-than` expects an integer argument :/") + + # we will keep track of these bad bois + gene_calls_removed = set([]) + + # we identify entry ids that describe genes that are too long for removal + entry_ids_to_remove = set([]) + for entry_id in hmm_sequences_dict_for_splits: + if hmm_sequences_dict_for_splits[entry_id]['length'] > ignore_genes_longer_than: + entry_ids_to_remove.add(entry_id) + gene_calls_removed.add(hmm_sequences_dict_for_splits[entry_id]['gene_callers_id']) + + # we return early if there is nothing to be done + if not len(entry_ids_to_remove): + self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your " + f"HMM hits. But none of the gene calls were longer than that value, so you get to keep everything.", + header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏") + return (hmm_sequences_dict_for_splits, set([]), set([])) + + # if we are here, it means there are things to be gotten rid of. we will remove things + # while keeping the user informed. + self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your " + f"HMM hits. There were a total of {len(entry_ids_to_remove)} HMM hits that matched to gene calls " + f"that were longer than {ignore_genes_longer_than}, and they are now removed from your analysis. " + f"The following lines list all these gene calls, their length, and which model they belonged.", + header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏", lc='yellow') + + # before we actually start removing stuff, we first learn all bin names that are in the master dict + bin_names_in_original_dict = set([]) + for entry in hmm_sequences_dict_for_splits.values(): + bin_names_in_original_dict.add(entry['bin_id']) + + # puts in business socks + for entry_id in entry_ids_to_remove: + e = hmm_sequences_dict_for_splits[entry_id] + self.run.info_single(f"Source: {e['source']} / Model: {e['gene_name']} / Bin: {e['bin_id']} / " + f"Gene call: {e['gene_callers_id']} / Length: {e['length']}", + cut_after=None, level=2) + hmm_sequences_dict_for_splits.pop(entry_id) + + # now we're done, and we will take another look at the dict to figure out remaining bins + bin_names_in_filtered_dict = set([]) + for entry in hmm_sequences_dict_for_splits.values(): + bin_names_in_filtered_dict.add(entry['bin_id']) + + # bins we lost + bins_removed = bin_names_in_original_dict.difference(bin_names_in_filtered_dict) + + return (hmm_sequences_dict_for_splits, gene_calls_removed, bins_removed) + + def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None): """This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occurs across bins. From 438b3cea51c9db171fae9ed4175159b58f7a937c Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 18 Jan 2024 17:14:29 +0300 Subject: [PATCH 5/6] connect everything together --- bin/anvi-get-sequences-for-hmm-hits | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits index 2600882554..9b858e2b99 100755 --- a/bin/anvi-get-sequences-for-hmm-hits +++ b/bin/anvi-get-sequences-for-hmm-hits @@ -184,6 +184,16 @@ def main(args): run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names))) + if args.ignore_genes_longer_than > 0: + hmm_sequences_dict, gene_calls_removed, bins_removed = s.filter_hmm_sequences_dict_for_genes_that_are_too_long(hmm_sequences_dict, int(args.ignore_genes_longer_than)) + CHK() + + if len(bins_removed): + bins_removed_for_any_reason.update(bins_removed) + + if len(gene_calls_removed): + run.info('Filtered hits', '%d hits remain after filtering for %d genes longer than %d' % (len(hmm_sequences_dict), len(gene_calls_removed), args.ignore_genes_longer_than), nl_before=1) + if args.max_num_genes_missing_from_bin is not None: hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(hmm_sequences_dict, gene_names, int(args.max_num_genes_missing_from_bin)) CHK() From 2182b67ba4d9d6265cf4be9f61ed3e28af174cf1 Mon Sep 17 00:00:00 2001 From: "A. Murat Eren" Date: Thu, 18 Jan 2024 17:18:08 +0300 Subject: [PATCH 6/6] fixy --- anvio/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anvio/__init__.py b/anvio/__init__.py index 4de8656665..d7fa4f4c4f 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -1404,12 +1404,12 @@ def TABULATE(table, header, numalign="right", max_width=0): 'type': int, 'help': "In some cases the gene calling step can identify open reading frames that span across extremely long " "stretches of genomes. Such mistakes can lead to downstream issues, especially when concatenate flag " - "is used, including faliure to align genes. This flag allows anvi'o to ignore extremely long gene calls to " + "is used, including failure to align genes. This flag allows anvi'o to ignore extremely long gene calls to " "avoid unintended issues (i.e., during phylogenomic analyses). If you use this flag, please carefully " "examine the output messages from the program to see which genes are removed from the analysis. " "Please note that the length parameter considers the nucleotide lenght of the open reading frame, " "even if you asked for amino acid sequences to be returned. Setting this parameter to small values, " - "such as less than 10000 nucleotides may lead to the removal of geniune genes, so please use it carefully."} + "such as less than 10000 nucleotides may lead to the removal of genuine genes, so please use it carefully."} ), 'align-with': ( ['--align-with'],