Skip to content

Commit

Permalink
Merge pull request #2201 from merenlab/issue-2200
Browse files Browse the repository at this point in the history
Filter HMM hits based on length
  • Loading branch information
meren authored Jan 18, 2024
2 parents c40c447 + 2182b67 commit 7228a15
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 4 deletions.
14 changes: 14 additions & 0 deletions anvio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,6 +1397,20 @@ def TABULATE(table, header, numalign="right", max_width=0):
'type': str,
'help': "Characters to separate things (the default is whatever is most suitable)."}
),
'ignore-genes-longer-than': (
['--ignore-genes-longer-than'],
{'metavar': 'MAX_LENGTH',
'default': 0,
'type': int,
'help': "In some cases the gene calling step can identify open reading frames that span across extremely long "
"stretches of genomes. Such mistakes can lead to downstream issues, especially when concatenate flag "
"is used, including failure to align genes. This flag allows anvi'o to ignore extremely long gene calls to "
"avoid unintended issues (i.e., during phylogenomic analyses). If you use this flag, please carefully "
"examine the output messages from the program to see which genes are removed from the analysis. "
"Please note that the length parameter considers the nucleotide lenght of the open reading frame, "
"even if you asked for amino acid sequences to be returned. Setting this parameter to small values, "
"such as less than 10000 nucleotides may lead to the removal of genuine genes, so please use it carefully."}
),
'align-with': (
['--align-with'],
{'metavar': 'ALIGNER',
Expand Down
55 changes: 55 additions & 0 deletions anvio/hmmops.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,61 @@ def get_num_genes_missing_per_bin_dict(self, hmm_sequences_dict_for_splits, gene
return num_genes_missing_per_bin


def filter_hmm_sequences_dict_for_genes_that_are_too_long(self, hmm_sequences_dict_for_splits, ignore_genes_longer_than=0):
"""This takes in your `hmm_sequences_dict_for_splits`, and removes genes that are too long"""

if not isinstance(ignore_genes_longer_than, int):
raise ConfigError("The `--ignore-genes-longer-than` expects an integer argument :/")

# we will keep track of these bad bois
gene_calls_removed = set([])

# we identify entry ids that describe genes that are too long for removal
entry_ids_to_remove = set([])
for entry_id in hmm_sequences_dict_for_splits:
if hmm_sequences_dict_for_splits[entry_id]['length'] > ignore_genes_longer_than:
entry_ids_to_remove.add(entry_id)
gene_calls_removed.add(hmm_sequences_dict_for_splits[entry_id]['gene_callers_id'])

# we return early if there is nothing to be done
if not len(entry_ids_to_remove):
self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your "
f"HMM hits. But none of the gene calls were longer than that value, so you get to keep everything.",
header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏")
return (hmm_sequences_dict_for_splits, set([]), set([]))

# if we are here, it means there are things to be gotten rid of. we will remove things
# while keeping the user informed.
self.run.warning(f"You asked anvi'o to remove genes that are longer than {ignore_genes_longer_than} nts from your "
f"HMM hits. There were a total of {len(entry_ids_to_remove)} HMM hits that matched to gene calls "
f"that were longer than {ignore_genes_longer_than}, and they are now removed from your analysis. "
f"The following lines list all these gene calls, their length, and which model they belonged.",
header="A MESSAGE FROM YOUR GENE LENGTH FILTER 📏", lc='yellow')

# before we actually start removing stuff, we first learn all bin names that are in the master dict
bin_names_in_original_dict = set([])
for entry in hmm_sequences_dict_for_splits.values():
bin_names_in_original_dict.add(entry['bin_id'])

# puts in business socks
for entry_id in entry_ids_to_remove:
e = hmm_sequences_dict_for_splits[entry_id]
self.run.info_single(f"Source: {e['source']} / Model: {e['gene_name']} / Bin: {e['bin_id']} / "
f"Gene call: {e['gene_callers_id']} / Length: {e['length']}",
cut_after=None, level=2)
hmm_sequences_dict_for_splits.pop(entry_id)

# now we're done, and we will take another look at the dict to figure out remaining bins
bin_names_in_filtered_dict = set([])
for entry in hmm_sequences_dict_for_splits.values():
bin_names_in_filtered_dict.add(entry['bin_id'])

# bins we lost
bins_removed = bin_names_in_original_dict.difference(bin_names_in_filtered_dict)

return (hmm_sequences_dict_for_splits, gene_calls_removed, bins_removed)


def filter_hmm_sequences_dict_from_genes_that_occur_in_less_than_N_bins(self, hmm_sequences_dict_for_splits, min_num_bins_gene_occurs=None):
"""This takes in your `hmm_sequences_dict_for_splits`, and removes genes that rarely occurs across bins.
Expand Down
19 changes: 15 additions & 4 deletions bin/anvi-get-sequences-for-hmm-hits
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,16 @@ def main(args):

run.info('Filtered hits', '%d hits remain after filtering for %d gene(s)' % (len(hmm_sequences_dict), len(gene_names)))

if args.ignore_genes_longer_than > 0:
hmm_sequences_dict, gene_calls_removed, bins_removed = s.filter_hmm_sequences_dict_for_genes_that_are_too_long(hmm_sequences_dict, int(args.ignore_genes_longer_than))
CHK()

if len(bins_removed):
bins_removed_for_any_reason.update(bins_removed)

if len(gene_calls_removed):
run.info('Filtered hits', '%d hits remain after filtering for %d genes longer than %d' % (len(hmm_sequences_dict), len(gene_calls_removed), args.ignore_genes_longer_than), nl_before=1)

if args.max_num_genes_missing_from_bin is not None:
hmm_sequences_dict, bins_removed = s.filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(hmm_sequences_dict, gene_names, int(args.max_num_genes_missing_from_bin))
CHK()
Expand Down Expand Up @@ -321,10 +331,11 @@ if __name__ == '__main__':
program with `--concatenate-genes` flag). The default is "XXX" for amino\
acid sequences, and "NNN" for DNA sequences'}))

groupG = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
groupG.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
groupG.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
groupG.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
groupH = parser.add_argument_group('OPTIONAL', "Everything is optional, but some options are more optional than others.")
groupH.add_argument(*anvio.A('return-best-hit'), **anvio.K('return-best-hit'))
groupH.add_argument(*anvio.A('ignore-genes-longer-than'), **anvio.K('ignore-genes-longer-than'))
groupH.add_argument(*anvio.A('unique-genes'), **anvio.K('unique-genes'))
groupH.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))

args = parser.get_args(parser)

Expand Down

0 comments on commit 7228a15

Please sign in to comment.