diff --git a/anvio/biochemistry/reactionnetwork.py b/anvio/biochemistry/reactionnetwork.py index 91d2081241..e1fcfce464 100644 --- a/anvio/biochemistry/reactionnetwork.py +++ b/anvio/biochemistry/reactionnetwork.py @@ -21,7 +21,7 @@ import multiprocessing as mp from argparse import Namespace -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Set, Tuple, Union import anvio.utils as utils import anvio.dbinfo as dbinfo @@ -96,6 +96,7 @@ class GeneCluster: """Representation of a gene cluster in the metabolic network.""" def __init__(self) -> None: self.gene_cluster_id: int = None + self.genomes: List[str] = [] # Consensus KO among the genes in the cluster. The KO annotations of genes in genomes that # underlie each consensus annotation are not tracked. This would require storing more data # on the consensus annotations in `dbops.PanSuperclass.get_gene_cluster_function_summary`. @@ -136,6 +137,36 @@ def __init__(self) -> None: self.modelseed_kegg_aliases: Dict[str, List[str]] = {} self.modelseed_ec_number_aliases: Dict[str, List[str]] = {} + def remove_missing_objective_metabolites(self, objective_dict: Dict) -> None: + """ + Remove metabolites from the biomass objective dictionary that are not produced or consumed + in the reaction network. + + Parameters + ========== + objective_dict : dict + Biomass objective in COBRApy JSON format, like that returned by the method, + 'JSONStructure.get_e_coli_core_objective'. + + Returns + ======= + None + """ + objective_metabolites: Dict = objective_dict['metabolites'] + objective_original_metabolites: Dict = objective_dict['notes']['original_metabolite_ids'] + missing_metabolite_ids = [] + # The original objective had metabolite BiGG IDs, which were replaced with KEGG COMPOUND IDs. + missing_original_metabolite_ids = [] + for metabolite_id, original_metabolite_id in zip(objective_metabolites, objective_original_metabolites): + if metabolite_id[:-2] not in self.metabolites: + # The metabolite (removing localization substring) is not in the network. + missing_metabolite_ids.append(metabolite_id) + missing_original_metabolite_ids.append(original_metabolite_id) + for metabolite_id in missing_metabolite_ids: + objective_metabolites.pop(metabolite_id) + for original_metabolite_id in missing_original_metabolite_ids: + objective_original_metabolites.pop(original_metabolite_id) + class GenomicNetwork(ReactionNetwork): """ A reaction network predicted from KEGG KO and ModelSEED annotations of genes. @@ -697,7 +728,7 @@ def export_json( overwrite: bool = False, objective: str = None, remove_missing_objective_metabolites: bool = False, - # record_bins: tuple = ('gene', ), + # record_bins: Tuple[str] = ('gene', ), indent: int = 2, progress: terminal.Progress = terminal.Progress() ) -> None: @@ -719,7 +750,8 @@ def export_json( An objective to use in the model, stored as the first entry in the JSON 'reactions' array. Currently, the only valid options are None and 'e_coli_core'. - None does not add an objective to the JSON, meaning that FBA cannot be performed on the model. + None means that no objective is added to the JSON, meaning that FBA cannot be performed + on the model. 'e_coli_core' is the biomass objective from the COBRApy example JSON file of E. coli "core" metabolism, 'e_coli_core.json'. @@ -733,8 +765,8 @@ def export_json( reaction network. By default, bin membership is only recorded for genes with the argument, ('gene', ). 'reaction' and 'metabolite' can also be provided in a tuple argument (e.g., ('reaction', ) or ('metabolite', 'reaction', 'gene')) to likewise record - in which bins the reaction and metabolite entries occur. Pass an argument of None to not - record bin membership in the JSON at all. + in which bins the reaction and metabolite entries occur. To not record bins at all, pass + either an empty tuple or None. indent : int, 2 spaces of indentation per nesting level in JSON file @@ -751,20 +783,7 @@ def export_json( if objective == 'e_coli_core': objective_dict = JSONStructure.get_e_coli_core_objective() if remove_missing_objective_metabolites: - objective_metabolites: Dict = objective_dict['metabolites'] - objective_original_metabolites: Dict = objective_dict['notes']['original_metabolite_ids'] - missing_metabolite_ids = [] - # The original objective had metabolite BiGG IDs, which were replaced with KEGG COMPOUND IDs. - missing_original_metabolite_ids = [] - for metabolite_id, original_metabolite_id in zip(objective_metabolites, objective_original_metabolites): - if metabolite_id[:-2] not in self.metabolites: - # The metabolite (removing localization substring) is not in the network. - missing_metabolite_ids.append(metabolite_id) - missing_original_metabolite_ids.append(original_metabolite_id) - for metabolite_id in missing_metabolite_ids: - objective_metabolites.pop(metabolite_id) - for original_metabolite_id in missing_original_metabolite_ids: - objective_original_metabolites.pop(original_metabolite_id) + self.remove_missing_objective_metabolites(objective_dict) json_reactions.append(objective_dict) elif objective != None: raise ConfigError(f"Anvi'o does not recognize an objective with the name, '{objective}'.") @@ -774,13 +793,14 @@ def export_json( reaction_kos: Dict[str, List[KO]] = {} for gcid, gene in self.genes.items(): gene_entry = JSONStructure.get_gene_entry() + json_genes.append(gene_entry) gcid_str = str(gcid) gene_entry['id'] = gcid_str - notes = gene_entry['notes'] - # Record KO IDs and annotation e-values in the notes section of the gene entry. - notes['ko'] = notes_kos = {} + # Record KO IDs and annotation e-values in the annotation section of the gene entry. + annotation = gene_entry['annotation'] + annotation['ko'] = annotation_kos = {} for ko, e_value in zip(gene.kos, gene.e_values): - notes_kos[ko.id] = str(e_value) + annotation_kos[ko.id] = str(e_value) for modelseed_reaction_id in ko.reactions: try: reaction_genes[modelseed_reaction_id].append(gcid_str) @@ -790,12 +810,12 @@ def export_json( reaction_kos[modelseed_reaction_id].append(ko) except KeyError: reaction_kos[modelseed_reaction_id] = [ko] - json_genes.append(gene_entry) progress.update("Reactions") compound_compartments: Dict[str, Set[str]] = {} for modelseed_reaction_id, reaction in self.reactions.items(): reaction_entry = JSONStructure.get_reaction_entry() + json_reactions.append(reaction_entry) reaction_entry['id'] = modelseed_reaction_id reaction_entry['name'] = reaction.modelseed_name metabolites = reaction_entry['metabolites'] @@ -835,7 +855,6 @@ def export_json( 'kegg.reaction': list(set(reaction.kegg_aliases).difference(ko_kegg_aliases)), 'ec-code': list(set(reaction.ec_number_aliases).difference(ko_ec_number_aliases)) } - json_reactions.append(reaction_entry) progress.update("Metabolites") for modelseed_compound_id, metabolite in self.metabolites.items(): @@ -845,6 +864,7 @@ def export_json( kegg_compound_aliases = list(metabolite.kegg_aliases) for compartment in compound_compartments[modelseed_compound_id]: metabolite_entry = JSONStructure.get_metabolite_entry() + json_metabolites.append(metabolite_entry) metabolite_entry['id'] = f"{modelseed_compound_id}_{compartment}" metabolite_entry['name'] = modelseed_compound_name metabolite_entry['compartment'] = compartment @@ -853,7 +873,6 @@ def export_json( metabolite_entry['charge'] = charge if charge is not None else 0 metabolite_entry['formula'] = formula if formula is not None else "" metabolite_entry['annotation']['kegg.compound'] = kegg_compound_aliases - json_metabolites.append(metabolite_entry) progress.update("Saving") with open(path, 'w') as f: @@ -872,75 +891,208 @@ def __init__(self) -> None: def export_json( self, path: str, - annotate_genes: tuple = ('genome', 'bin', ), - annotate_reactions: tuple = ('genome', 'bin', 'kegg_reaction', 'ec_number'), - annotate_metabolites: tuple = ('genome', 'bin', 'kegg_compound'), - run: terminal.Run = terminal.Run(), + overwrite: bool = False, + objective: str = None, + remove_missing_objective_metabolites: bool = False, + record_genomes: Tuple[str] = ('gene', 'reaction'), + # record_bins: Tuple[str] = ('gene', 'reaction'), + indent: int = 2, progress: terminal.Progress = terminal.Progress() ) -> None: """ - Export the network to a metabolic model file in JSON format. *Gene entries in this file - represent gene clusters.* Optionally, gene, reaction, and metabolite entries in this file - are annotated with the names of genomes and names of gene cluster bins in which they occur. + Export the network to a metabolic model file in JSON format. Entries in the "gene" section + of this file represent gene clusters. + + All information from the network is included in the JSON so that the file can by imported by + anvi'o as a PangenomicNetwork object containing the same information. Parameters ========== path : str output JSON file path - annotate_genes : tuple, ('genome', 'bin', ) - Annotate gene (cluster) entries in the JSON file with additional data, selecting - from the following: - - 'genome' : genomes in which the genes of the cluster occur - - 'bin' : bins in which the gene cluster occurs - - 'all_ko' : all KOs associated with genes in the cluster, sorted in descending order of - the number of genes in the cluster that were associated with each KO and then mean - e-value of gene-KO assignments - - 'ko' : KOs associated with the gene cluster that yielded reactions in the network, - sorted in descending order of the number of genes in the cluster that were - associated with each KO and then mean e-value of gene-KO assignments - - 'ko_count' : number of genes in the cluster that were associated with each KO; if - 'all_ko' is provided, then each value corresponds to a KO in 'all_ko', whereas if - only 'ko' is provided, then each value corresponds to a KO in 'ko' - - 'e_value' : mean scores of KO associations with genes in the cluster; if 'all_ko' is - provided, then each value corresponds to a KO in 'all_ko', whereas if only 'ko' is - provided, then each value corresponds to a KO in 'ko' - - annotate_reactions : tuple, ('genome', 'bin', 'kegg_reaction', 'ec_number') - Annotate reaction entries in the JSON file with additional data, selecting from the following: + overwrite : bool, False + Overwrite the JSON file if it already exists. - 'genome' : genomes in which the reaction occurs + objective : str, None + An objective to use in the model, stored as the first entry in the JSON 'reactions' + array. Currently, the only valid options are None and 'e_coli_core'. - 'bin' : bins in which the reaction occurs + None means that no objective is added to the JSON, meaning that FBA cannot be performed + on the model. - 'kegg_reaction' : KO-associated KEGG reaction IDs yielding the ModelSEED reaction + 'e_coli_core' is the biomass objective from the COBRApy example JSON file of E. coli + "core" metabolism, 'e_coli_core.json'. - 'ec_number' : KO-associated EC numbers yielding the ModelSEED reaction + remove_missing_objective_metabolites : bool, False + If True, remove metabolites from the JSON objective that are not produced or consumed in + the reaction network. FBA fails with metabolites outside the network. - 'ko' : KOs yielding the ModelSEED reaction + record_genomes : tuple, ('gene cluster', 'reaction') + Record the genome membership of gene clusters in JSON entries. By default, genome names + are recorded for gene clusters and reactions with the argument, ('gene cluster', + 'reaction'). To not record genomes at all, pass either an empty tuple or None. The + following valid strings can be provided in a tuple in any combination: 'gene cluster', + 'reaction', and 'metabolite'. 'reaction' and 'metabolite' record the genomes predicted + to encode enzymes associated with reactions and metabolites, respectively. - annotate_metabolites : tuple, ('genome', 'bin', 'kegg_compound') - Annotate metabolite entries in the JSON file with additional data, selecting from the following: + indent : int, 2 + spaces of indentation per nesting level in JSON file - 'genome' : genomes in which the metabolite occurs + progress : terminal.Progress, terminal.Progress() + """ + if record_genomes is None: + record_genomes = () + valid_items = ('gene cluster', 'reaction', 'metabolite') + invalid_items = [] + for item in record_genomes: + if item not in valid_items: + invalid_items.append(item) + if invalid_items: + raise ConfigError( + f"The following items in the 'record_genomes' argument are invalid: {', '.join(invalid_items)}" + ) - 'bin' : bins in which the metabolite occurs + progress.new("Constructing JSON") + progress.update("Setting up") + filesnpaths.is_output_file_writable(path, ok_if_exists=overwrite) + json_dict = JSONStructure.get() + json_gene_clusters: List[Dict] = json_dict['genes'] + json_reactions: List[Dict] = json_dict['reactions'] + json_metabolites: List[Dict] = json_dict['metabolites'] + if objective == 'e_coli_core': + objective_dict = JSONStructure.get_e_coli_core_objective() + if remove_missing_objective_metabolites: + self.remove_missing_objective_metabolites(objective_dict) + json_reactions.append(objective_dict) + elif objective != None: + raise ConfigError(f"Anvi'o does not recognize an objective with the name, '{objective}'.") - 'kegg_compound' : KEGG compound aliases of the ModelSEED compound + progress.update("Gene clusters") + reaction_gene_clusters: Dict[str, List[str]] = {} + reaction_kos: Dict[str, List[KO]] = {} + # The following two dictionaries are only needed for recording the occurrence of reactions + # and metabolites in genomes. + reaction_genomes: Dict[str, List[str]] = {} + metabolite_genomes: Dict[str, List[str]] = {} + for gene_cluster_id, gene_cluster in self.gene_clusters.items(): + gene_cluster_entry = JSONStructure.get_gene_entry() + json_gene_clusters.append(gene_cluster_entry) + gene_cluster_id_str = str(gene_cluster_id) + gene_cluster_entry['id'] = gene_cluster_id_str + # Record KO IDs in the annotation section of the gene cluster entry. In a JSON file produced + # from a 'GenomicNetwork', KO IDs are paired with their gene annotation e-values, which + # can't be done with consensus KOs for gene clusters. Therefore, where the e-value would + # be, put an empty string. + annotation = gene_cluster_entry['annotation'] + annotation['ko'] = annotation_kos = {} + ko = gene_cluster.ko + annotation_kos[ko.id] = "" + for modelseed_reaction_id in ko.reactions: + try: + reaction_gene_clusters[modelseed_reaction_id].append(gene_cluster_id_str) + except KeyError: + reaction_gene_clusters[modelseed_reaction_id] = [gene_cluster_id_str] + try: + reaction_kos[modelseed_reaction_id].append(ko) + except KeyError: + reaction_kos[modelseed_reaction_id] = [ko] + if not record_genomes: + continue + genome_names = gene_cluster.genomes + if 'gene cluster' in record_genomes: + # Record the names of the genomes contributing to the gene cluster in the notes section + # of the gene cluster entry. + gene_cluster_entry['notes']['genomes'] = genome_names + if 'reaction' in record_genomes: + for modelseed_reaction_id in ko.reactions: + try: + reaction_genomes[modelseed_reaction_id] += genome_names + except KeyError: + reaction_genomes[modelseed_reaction_id] = genome_names + if 'metabolite' in record_genomes: + for reaction in ko.reactions.values(): + for compartment, metabolite in zip(reaction.compartments, reaction.compounds): + entry_id = f"{metabolite.modelseed_id}_{compartment}" + try: + metabolite_genomes[entry_id] += genome_names + except KeyError: + metabolite_genomes[entry_id] = genome_names - 'ko' : KOs yielding the ModelSEED compound + progress.update("Reactions") + compound_compartments: Dict[str, Set[str]] = {} + for modelseed_reaction_id, reaction in self.reactions.items(): + reaction_entry = JSONStructure.get_reaction_entry() + json_reactions.append(reaction_entry) + reaction_entry['id'] = modelseed_reaction_id + reaction_entry['name'] = reaction.modelseed_name + metabolites = reaction_entry['metabolites'] + for compound, compartment, coefficient in zip(reaction.compounds, reaction.compartments, reaction.coefficients): + modelseed_compound_id = compound.modelseed_id + metabolites[f"{modelseed_compound_id}_{compartment}"] = coefficient + try: + compound_compartments[modelseed_compound_id].add(compartment) + except KeyError: + compound_compartments[modelseed_compound_id] = set(compartment) + if not reaction.reversibility: + # By default, the reaction entry was set up to be reversible; here make it irreversible. + reaction_entry['lower_bound'] = 0.0 + reaction_entry['gene_reaction_rule'] = " or ".join([gcid for gcid in reaction_gene_clusters[modelseed_reaction_id]]) + notes = reaction_entry['notes'] + # Record gene KO annotations which aliased the reaction via KEGG REACTION or EC number. + notes['ko'] = ko_notes = {} + ko_kegg_aliases = [] + ko_ec_number_aliases = [] + for ko in reaction_kos[modelseed_reaction_id]: + try: + kegg_aliases = ko.kegg_reaction_aliases[modelseed_reaction_id] + except KeyError: + kegg_aliases = [] + try: + ec_number_aliases = ko.ec_number_aliases[modelseed_reaction_id] + except KeyError: + ec_number_aliases = [] + ko_notes[ko.id] = {'kegg.reaction': kegg_aliases, 'ec-code': ec_number_aliases} + ko_kegg_aliases += kegg_aliases + ko_ec_number_aliases += ec_number_aliases + ko_kegg_aliases = set(ko_kegg_aliases) + ko_ec_number_aliases = set(ko_ec_number_aliases) + # Record other KEGG REACTION or EC number aliases of the reaction in the ModelSEED + # database that did not happen to be associated with KO annotations. + notes['other_aliases'] = { + 'kegg.reaction': list(set(reaction.kegg_aliases).difference(ko_kegg_aliases)), + 'ec-code': list(set(reaction.ec_number_aliases).difference(ko_ec_number_aliases)) + } + if 'reaction' not in record_genomes: + continue + notes['genomes'] = sorted(set(reaction_genomes[modelseed_reaction_id])) - run : terminal.Run, terminal.Run() + progress.update("Metabolites") + for modelseed_compound_id, metabolite in self.metabolites.items(): + modelseed_compound_name = metabolite.modelseed_name + charge = metabolite.charge + formula = metabolite.formula + kegg_compound_aliases = list(metabolite.kegg_aliases) + for compartment in compound_compartments[modelseed_compound_id]: + metabolite_entry = JSONStructure.get_metabolite_entry() + json_metabolites.append(metabolite_entry) + entry_id = f"{modelseed_compound_id}_{compartment}" + metabolite_entry['id'] = entry_id + metabolite_entry['name'] = modelseed_compound_name + metabolite_entry['compartment'] = compartment + # Compounds without a formula have a nominal charge of 10000000 in the ModelSEED + # compounds database, which is replaced by None in the reaction network and 0 in the JSON. + metabolite_entry['charge'] = charge if charge is not None else 0 + metabolite_entry['formula'] = formula if formula is not None else "" + metabolite_entry['annotation']['kegg.compound'] = kegg_compound_aliases + if 'metabolite' not in record_genomes: + continue + notes['genomes'] = sorted(set(metabolite_genomes[entry_id])) - progress : terminal.Progress, terminal.Progress() - """ - pass + progress.update("Saving") + with open(path, 'w') as f: + json.dump(json_dict, f, indent=indent) + progress.end() class JSONStructure: """JSON structure of metabolic model file.""" @@ -1570,15 +1722,12 @@ def __init__( self.run = run self.progress = progress - # def import_network(self, json: str) -> ReactionNetwork: - # """Import a metabolic model JSON file as a reaction network object.""" - # pass - def load_network( self, contigs_db: str = None, + pan_db: str = None, genomes_storage_db: str = None, - pan_db: str = None + check_gene_annotations: bool = True ) -> ReactionNetwork: """ Load a reaction network stored in a database as a reaction network object. @@ -1588,13 +1737,20 @@ def load_network( contigs_db : str, None Path to a contigs database in which a reaction network is stored. - genomes_storage_db: str, None + pan_db : str, None + Path to a pan database in which a reaction network is stored. 'genomes_storage_db' is + also required. + + genomes_storage_db : str, None Path to a genomes storage database in which KO annotations are stored. 'pan_db' is also required. - pan_db: str, None - Path to a pan database in which a reaction network is stored. 'genomes_storage_db' is - also required. + check_gene_annotations : bool, True + If True, as by default, check that the stored reaction network was made from the set of + gene KO annotations that is currently stored. An exception is raised if this is not the + case. If False, allow the stored reaction network to have been made from a different set + of gene KO annotations than is currently stored. This can result in different KOs in the + returned ReactionNetwork than in the original network that was stored. Returns ======= @@ -1604,9 +1760,16 @@ def load_network( # Check that the reaction network stored in a database is derived from the current gene KO # annotations in the database. if contigs_db: - network = self.load_contigs_database_network(contigs_db) + network = self.load_contigs_database_network( + contigs_db, + check_gene_annotations=check_gene_annotations + ) elif genomes_storage_db or pan_db: - network = self.load_pangenomic_network(genomes_storage_db=genomes_storage_db, pan_db=pan_db) + network = self.load_pan_database_network( + genomes_storage_db=genomes_storage_db, + pan_db=pan_db, + check_gene_annotations=check_gene_annotations + ) else: raise ConfigError( "A reaction network must be loaded from a database source. " @@ -1628,11 +1791,11 @@ def load_contigs_database_network( Path to a contigs database in which a reaction network is stored. check_gene_annotations : bool, True - If True, check that the reaction network stored in the contigs database was made from - the same set of KEGG KO gene annotations as currently in the database, and throw an - error if this is not the case. If False, allow the stored reaction network to have been - made from a different set of KO gene annotations than is currently stored in the - database. This can result in different genes being associated with KOs in the returned + If True, as by default, check that the reaction network stored in the contigs database + was made from the same set of gene KO annotations as currently in the database, and + throw an error if this is not the case. If False, allow the stored reaction network to + have been made from a different set of gene KO annotations than is currently stored in + the database. This can result in different KO assignments to genes in the returned GenomicNetwork than in the original network that was stored. Returns @@ -1647,13 +1810,13 @@ def load_contigs_database_network( contigs_super = ContigsSuperclass(args, r=run_quiet) contigs_super.init_functions(requested_sources=['KOfam']) - # Check that the network stored in the contigs database was made from the same set of KEGG - # KO gene annotations as currently in the database. + # Check that the network stored in the contigs database was made from the same set of KO + # gene annotations as currently in the database. stored_hash = contigs_super.a_meta['reaction_network_ko_annotations_hash'] current_hash = self.hash_contigs_db_ko_annotations(contigs_super.gene_function_calls_dict) - if check_gene_annotations: - if stored_hash != current_hash: - ConfigError( + if stored_hash != current_hash: + if check_gene_annotations: + raise ConfigError( "The reaction network stored in the contigs database was made from a different set " "of KEGG KO gene annotations than is currently in the database. There are two " "solutions to this problem. First, 'anvi-reaction-network' can be run again to " @@ -1662,31 +1825,25 @@ def load_contigs_database_network( "than True, allowing the stored network to have been made from a different set of KO " "gene annotations than is currently stored in the database. This can result in " "different genes being associated with KOs in the returned GenomicNetwork than in " - "the original network that was stored. The available version of the KO database is " - "used to fill in data for KOs in the network that are not current gene annotations." - ) - # The KO database is needed if KOs in the stored network are among the current gene annotations. - ko_db = KODatabase(ko_dir=self.ko_dir) - else: - if stored_hash != current_hash: - self.run.warning( - "The reaction network stored in the contigs database was made from a different set " - "of KEGG KO gene annotations than is currently in the database. This will be " - "ignored since 'check_gene_annotations' is False. This can result in different genes " - "being associated with KOs in the returned GenomicNetwork than in the original " - "network that was stored." + "the original network that was stored. The available version of the KO database that " + "has been set up by anvi'o is used to fill in data for KOs in the network that are not " + "current gene annotations." ) + self.run.warning( + "The reaction network stored in the contigs database was made from a different set " + "of KEGG KO gene annotations than is currently in the database. This will be " + "ignored since 'check_gene_annotations' is False. This can result in different genes " + "being associated with KOs in the returned GenomicNetwork than in the original " + "network that was stored." + ) network = GenomicNetwork() cdb = ContigsDatabase(contigs_db) - functions_table = cdb.db.get_table_as_dataframe('gene_functions', where_clause='source = "KOfam"') - reactions_table = cdb.db.get_table_as_dataframe('gene_function_reactions') - metabolites_table = cdb.db.get_table_as_dataframe('gene_function_metabolites') - cdb.disconnect() - # Make gene objects for all genes with KO annotations in the contigs database, including + # Make objects representing all genes with KO annotations in the contigs database, including # genes that are not in the network, which are later removed from the network. + functions_table = cdb.db.get_table_as_dataframe('gene_functions', where_clause='source = "KOfam"') for gcid, ko_id, ko_name, e_value in zip(functions_table['gene_callers_id'], functions_table['accession'], functions_table['function'], functions_table['e_value']): try: # This is not the first annotation involving the gene, so an object for it already exists. @@ -1706,7 +1863,273 @@ def load_contigs_database_network( gene.kos.append(ko) gene.e_values.append(e_value) - # Make ModelSEED reaction objects. Relate them to KOs through KEGG REACTION and EC number aliases. + self._load_modelseed_reactions(cdb, network) + self._load_modelseed_compounds(cdb, network) + cdb.disconnect() + + # Remove any trace of genes that do not contribute to the reaction network. Also remove + # unnetworked KO links to genes. + unnetworked_gcids = [] + for gcid, gene in network.genes.items(): + gene_in_network = False + unnetworked_ko_indices: List[int] = [] + idx = 0 + for ko, e_value in zip(gene.kos, gene.e_values): + if ko.reactions: + gene_in_network = True + else: + unnetworked_ko_indices.append(idx) + idx += 1 + if gene_in_network: + for unnetworked_ko_index in reversed(unnetworked_ko_indices): + gene.kos.pop(unnetworked_ko_index) + gene.e_values.pop(unnetworked_ko_index) + else: + unnetworked_gcids.append(gcid) + for gcid in unnetworked_gcids: + network.genes.pop(gcid) + + # Remove any trace of KOs that do not contribute to the reaction network. + unnetworked_ko_ids = [] + for ko_id, ko in network.kos.items(): + if not ko.reactions: + unnetworked_ko_ids.append(ko_id) + for ko_id in unnetworked_ko_ids: + network.kos.pop(ko_id) + + # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO KEGG + # REACTION ID aliases if no such aliases were found to exist. + modelseed_reaction_ids = [] + for modelseed_reaction_id, kegg_reaction_ids in network.modelseed_kegg_aliases.items(): + if not kegg_reaction_ids: + modelseed_reaction_ids.append(modelseed_reaction_id) + for modelseed_reaction_id in modelseed_reaction_ids: + network.modelseed_kegg_aliases.pop(modelseed_reaction_id) + + # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO EC number + # aliases if no such aliases were found to exist. + modelseed_reaction_ids = [] + for modelseed_reaction_id, ec_numbers in network.modelseed_ec_number_aliases.items(): + if not ec_numbers: + modelseed_reaction_ids.append(modelseed_reaction_id) + for modelseed_reaction_id in modelseed_reaction_ids: + network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) + + return network + + def load_pan_database_network( + self, + pan_db: str, + genomes_storage_db: str, + check_gene_annotations: bool = True + ) -> PangenomicNetwork: + """ + Load reaction network data stored in a pan database as a reaction network object. + + Parameters + ========== + pan_db : str + Path to a pan database in which a reaction network is stored. + + genomes_storage_db : str + Path to the genomes storage database associated with the pan database. + + check_annotations : bool, True + If True, as by default, check that the reaction network stored in the pan database was + made from the set of gene KO annotations currently stored in the associated genomes + storage database. An exception is raised if this is not the case. If False, allow the + stored reaction network to have been made from a different set of gene KO annotations + than is currently stored in the genomes storage database. This can result in different + consensus KOs assigned to gene clusters in the returned PangenomicNetwork than in the + original network that was stored. + """ + # Load the pan database. + pan_db_info = dbinfo.PanDBInfo(pan_db) + self_table = pan_db_info.get_self_table() + # No consensus threshold may have been used in network construction, in which case the value + # of the parameter is None. + consensus_threshold = self_table['reaction_network_consensus_threshold'] + if consensus_threshold is not None: + consensus_threshold = float(consensus_threshold) + discard_ties = bool(int(self_table['reaction_network_discard_ties'])) + args = argparse.Namespace() + args.pan_db = pan_db + args.genomes_storage = genomes_storage_db + args.consensus_threshold = consensus_threshold + args.discard_ties = discard_ties + pan_super = PanSuperclass(args, r=run_quiet) + pan_super.init_gene_clusters() + pan_super.init_gene_clusters_functions() + pan_super.init_gene_clusters_functions_summary_dict() + gene_clusters_functions_summary_dict: Dict = pan_super.gene_clusters_functions_summary_dict + + # Check that the network stored in the pan database was made from the same set of KO gene + # annotations currently in the associated genomes storage database. + stored_hash = self_table['reaction_network_ko_annotations_hash'] + current_hash = self.hash_pan_db_ko_annotations( + genomes_storage_db, + gene_clusters_functions_summary_dict, + consensus_threshold, + discard_ties + ) + if stored_hash != current_hash: + if check_gene_annotations: + # Note that another unstated possible cause of the error could be due to manual + # meddling with the metavariables, 'consensus_threshold' and 'discard_ties', in the + # database. Assume that the user was not engaged in mischief. + raise ConfigError( + "The reaction network stored in the pan database was made from a different set " + "of KO gene annotations than is currently in the associated genomes storage " + "database. There are two solutions to this problem. First, the program, " + "'anvi-reaction-network', can be run again to overwrite the existing network " + "stored in the pan database with a new network from the new KO gene " + "annotations. Second, 'check_gene_annotations' can be given an argument of " + "False instead of True, preventing this exception from being raised if the " + "stored network was made from a different set of KO gene annotations than is " + "currently in the genomes storage database. This can result in different " + "consensus KOs assigned to gene clusters in the returned PangenomicNetwork " + "than in the original network that was stored. The available version of the KO " + "database that has been set up by anvi'o is used to fill in data for any KOs " + "in the network that are not current gene annotations in the genomes storage " + "database." + ) + self.run.warning( + "The reaction network stored in the pan database was made from a different set of " + "KO gene annotations than is currently in the genomes storage database. This will " + "be ignored since 'check_gene_annotations' is False. This can result in different " + "consensus KO assignments to gene clusters in the returned PangenomicNetwork than " + "in the original network that was stored." + ) + + network = PangenomicNetwork() + + # Make objects representing all gene clusters with consensus KO annotations. + for gene_cluster_id, gene_cluster_functions_data in gene_clusters_functions_summary_dict.items(): + # Retrieve the consensus KO across genes in the cluster. Parameterization of the method + # used to select consensus KOs occurred in pan super initialization. Parameter values + # were loaded from pan database metavariables. + gene_cluster_ko_data = gene_cluster_functions_data['KOfam'] + if gene_cluster_ko_data == {'function': None, 'accession': None}: + # No KO was assigned to the cluster. + continue + ko_id = gene_cluster_ko_data['accession'] + + gene_cluster = GeneCluster() + gene_cluster.gene_cluster_id = gene_cluster_id + gene_cluster.genomes = list(pan_super.gene_clusters[gene_cluster_id]) + # Add the gene cluster to the network, regardless of whether it yields reactions. Gene + # clusters not contributing to the reaction network are removed later. + network.gene_clusters[gene_cluster_id] = gene_cluster + + try: + # This is not the first gene cluster that has been encountered with the KO assigned + # to it, so an object for the KO already exists. + ko = network.kos[ko_id] + except KeyError: + ko = KO() + ko.id = ko_id + ko.name = gene_cluster_ko_data['function'] + network.kos[ko_id] = ko + gene_cluster.ko = ko + + pdb = PanDatabase(pan_db) + self._load_modelseed_reactions(pdb, network) + self._load_modelseed_compounds(pdb, network) + pdb.disconnect() + + # Remove any trace of gene clusters that do not contribute to the reaction network. + unnetworked_gene_cluster_ids = [] + for gene_cluster_id, gene_cluster in network.gene_clusters.items(): + if gene_cluster.ko.reactions: + continue + unnetworked_gene_cluster_ids.append(gene_cluster_id) + for gene_cluster_id in unnetworked_gene_cluster_ids: + network.gene_clusters.pop(gene_cluster_id) + + # Remove any trace of KOs that do not contribute to the reaction network. + unnetworked_ko_ids = [] + for ko_id, ko in network.kos.items(): + if not ko.reactions: + unnetworked_ko_ids.append(ko_id) + for ko_id in unnetworked_ko_ids: + network.kos.pop(ko_id) + + # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO KEGG + # REACTION ID aliases if no such aliases were found to exist. + modelseed_reaction_ids = [] + for modelseed_reaction_id, kegg_reaction_ids in network.modelseed_kegg_aliases.items(): + if not kegg_reaction_ids: + modelseed_reaction_ids.append(modelseed_reaction_id) + for modelseed_reaction_id in modelseed_reaction_ids: + network.modelseed_kegg_aliases.pop(modelseed_reaction_id) + + # Remove entries in the network attribute mapping ModelSEED reaction IDs to KO EC number + # aliases if no such aliases were found to exist. + modelseed_reaction_ids = [] + for modelseed_reaction_id, ec_numbers in network.modelseed_ec_number_aliases.items(): + if not ec_numbers: + modelseed_reaction_ids.append(modelseed_reaction_id) + for modelseed_reaction_id in modelseed_reaction_ids: + network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) + + return network + + def _load_modelseed_reactions( + self, + database: Union[ContigsDatabase, PanDatabase], + network: Union[GenomicNetwork, PangenomicNetwork] + ) -> None: + """ + Add ModelSEED reactions to the network being loaded from the database. + + ModelSEED reaction objects are related to KOs through KEGG REACTION and EC number aliases. + + Parameters + ========== + database : ContigsDatabase or PanDatabase + The database storing a reaction network. In loading a genomic network, provide a contigs + database; in loading a pangenomic network, provide a pan database. + + network : GenomicNetwork or PangenomicNetwork + The reaction network under construction. + + Returns + ======= + None + """ + # Load the table of reactions data. + if type(database) is ContigsDatabase: + reactions_table = database.db.get_table_as_dataframe('gene_function_reactions') + if type(network) is not GenomicNetwork: + raise ConfigError( + "The provided 'database' was of type 'ContigsDatabase', so the provided " + "'network' must be of type 'GenomicNetwork'. Instead, the reaction network " + f"argument was of type '{type(network)}'." + ) + elif type(database) is PanDatabase: + reactions_table = database.db.get_table_as_dataframe('gene_cluster_function_reactions') + if type(network) is not PangenomicNetwork: + raise ConfigError( + "The provided 'database' was of type 'PanDatabase', so the provided 'network' " + "must be of type 'PangenomicNetwork'. Instead, the reaction network argument " + f"was of type '{type(network)}'." + ) + else: + raise ConfigError( + "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. " + f"Instead, the argument was of type '{type(database)}'." + ) + + # The KO database is needed if KOs in the stored network aren't among the current gene + # annotations. + try: + ko_db = KODatabase(ko_dir=self.ko_dir) + except ConfigError as e: + raise ConfigError( + f"{e} Please set up the KO database in the default directory with the program, " + "'anvi-reaction-network'." + ) + for row in reactions_table.itertuples(): # Each row of the table contains information on a different ModelSEED reaction. reaction = ModelSEEDReaction() @@ -1721,7 +2144,8 @@ def load_contigs_database_network( try: compound = network.metabolites[modelseed_compound_id] except KeyError: - # This is not the first reaction involving the compound, so an object for it already exists. + # This is not the first reaction involving the compound, so an object for it + # already exists. compound = ModelSEEDCompound() compound.modelseed_id = modelseed_compound_id reaction.compounds.append(compound) @@ -1735,7 +2159,8 @@ def load_contigs_database_network( reversibility: int = row.reversibility reaction.reversibility = bool(reversibility) - # Map KEGG reaction aliases of the ModelSEED reaction to all KOs that were associated with the KEGG reaction. + # Map KEGG reaction aliases of the ModelSEED reaction to all KOs that were associated + # with the KEGG reaction. kegg_reaction_ko_ids: Dict[str, List[str]] = {} kegg_reaction_sources: str = row.ko_kegg_reaction_source for kegg_reaction_item in kegg_reaction_sources.split('; '): @@ -1745,7 +2170,8 @@ def load_contigs_database_network( kegg_reaction_id, ko_ids = kegg_reaction_item.split(': (') ko_ids = ko_ids[:-1].split(', ') kegg_reaction_ko_ids[kegg_reaction_id] = ko_ids - # Record *all* KEGG reaction aliases of the ModelSEED reaction, including those not associated with KO annotations. + # Record *all* KEGG reaction aliases of the ModelSEED reaction, including those not + # associated with KO annotations. other_kegg_reaction_ids: str = row.other_kegg_reaction_ids reaction.kegg_aliases = list(kegg_reaction_ko_ids) if other_kegg_reaction_ids: @@ -1756,7 +2182,8 @@ def load_contigs_database_network( orphan_ko_ids = [] reaction_added_to_ko = False for kegg_reaction_id, ko_ids in kegg_reaction_ko_ids.items(): - # Record the ModelSEED reaction as one of the aliases of the KEGG reaction in the network. + # Record the ModelSEED reaction as one of the aliases of the KEGG reaction in the + # network. try: network.kegg_modelseed_aliases[kegg_reaction_id].append(modelseed_reaction_id) except KeyError: @@ -1766,12 +2193,20 @@ def load_contigs_database_network( try: ko = network.kos[ko_id] except KeyError: - # This only happens when the current set of gene KO annotations in the contigs - # database does not match the set from which the reaction network was originally - # made, and the KO in the network is no longer a gene annotation in the database. + # In the case of a genomic network, this error arises when the current set + # of gene KO annotations in the contigs database does not match the set from + # which the reaction network was originally made, and the KO under + # consideration in the network is no longer a gene annotation in the + # database. In the case of a pangenomic network, this error arises when the + # current set of gene cluster consensus KO annotations does not match the + # set from which the reaction network was originally made and the consensus + # KO under consideration in the network no longer annotates a gene cluster + # in the pan database. (The current set of gene cluster consensus KO + # annotations is derived from the pan and genomes storage databases using + # the parameters, 'consensus_threshold' and 'discard_ties'.) ko = KO() ko.ko_id = ko_id - # The KO name is unknown from the contigs database, so take it from the KO database. + # The KO name is unknown from the database, so take it from the KO database. ko.ko_name = ko_db.ko_table.loc[ko_id, 'name'] network.kos[ko_id] = ko orphan_ko_ids.append(ko_id) @@ -1784,7 +2219,8 @@ def load_contigs_database_network( except KeyError: ko.kegg_reaction_aliases[modelseed_reaction_id] = [kegg_reaction_id] - # Map EC number aliases of the ModelSEED reaction to all KOs that were associated with the EC number. + # Map EC number aliases of the ModelSEED reaction to all KOs that were associated with + # the EC number. ec_number_ko_ids: Dict[str, List[str]] = {} ec_number_sources: str = row.ko_ec_number_source for ec_number_item in ec_number_sources.split('; '): @@ -1794,7 +2230,8 @@ def load_contigs_database_network( ec_number, ko_ids = ec_number_item.split(': (') ko_ids = ko_ids[:-1].split(', ') ec_number_ko_ids[ec_number] = ko_ids - # Record *all* EC number aliases of the ModelSEED reaction, including those not associated with KO annotations. + # Record *all* EC number aliases of the ModelSEED reaction, including those not + # associated with KO annotations. other_ec_numbers: str = row.other_ec_numbers reaction.ec_number_aliases = list(ec_number_ko_ids) if other_ec_numbers: @@ -1803,7 +2240,8 @@ def load_contigs_database_network( network.modelseed_ec_number_aliases[modelseed_reaction_id] = modelseed_ec_number_aliases = [] for ec_number, ko_ids in ec_number_ko_ids.items(): - # Record the ModelSEED reaction as one of the aliases of the EC number in the network. + # Record the ModelSEED reaction as one of the aliases of the EC number in the + # network. try: network.ec_number_modelseed_aliases[ec_number].append(modelseed_reaction_id) except KeyError: @@ -1813,12 +2251,10 @@ def load_contigs_database_network( try: ko = network.kos[ko_id] except KeyError: - # This only happens when the current set of gene KO annotations in the contigs - # database does not match the set from which the reaction network was originally - # made, and the KO in the network is no longer a gene annotation in the database. + # This error arises for the same reason as before (processing KEGG reactions). ko = KO() ko.ko_id = ko_id - # The KO name is unknown from the contigs database, so take it from the KO database. + # The KO name is unknown from the database, so take it from the KO database. ko.ko_name = ko_db.ko_table.loc[ko_id, 'name'] network.kos[ko_id] = ko orphan_ko_ids.append(ko_id) @@ -1832,14 +2268,71 @@ def load_contigs_database_network( ko.ec_number_aliases[modelseed_reaction_id] = [ec_number] if DEBUG: - self.run.info_single( - "The following KOs are in the stored reaction network in the contigs database " - "but are not among the gene KO annotations; the available version of the KO database " - f"was used to fill in the function names of these KOs: {', '.join(orphan_ko_ids)}" + # "Orphan" KOs can only arise when 'check_gene_annotations' is False in the calling + # method, 'load_contigs_database_network' or 'load_pan_database_network'. + if type(network) is GenomicNetwork: + self.run.info_single( + "The following KOs are found in the stored reaction network in the contigs " + "database, but they are not found among the current gene KO annotations in " + "the contigs database. The available version of the KO database set up by " + "anvi'o was used to retrieve the function 'names' of these KOs: " + f"{', '.join(orphan_ko_ids)}" + ) + elif type(network) is PangenomicNetwork: + self.run.info_single( + "The following KOs are found in the stored reaction network in the pan " + "database, but they are not found among the current gene KO annotations in " + "the genomes storage database. The available version of the KO database " + "set up by anvi'o was used to retrieve the function 'names' of these KOs: " + f"{', '.join(orphan_ko_ids)}" + ) + + def _load_modelseed_compounds( + self, + database: Union[ContigsDatabase, PanDatabase], + network: Union[GenomicNetwork, PangenomicNetwork] + ) -> None: + """ + Add ModelSEED compounds to the network being loaded from the database. + + Parameters + ========== + database : ContigsDatabase or PanDatabase + The database storign a reaction network. In loading a genomic network, provide a contigs + database; in loading a pangenomic network, provide a pan database. + + network : GenomicNetwork or PangenomicNetwork + The reaction network under construction. + + Returns + ======= + None + """ + # Load the table of compounds data. + if type(database) is ContigsDatabase: + metabolites_table = database.db.get_table_as_dataframe('gene_function_metabolites') + if type(network) is not GenomicNetwork: + raise ConfigError( + "The provided 'database' was of type 'ContigsDatabase', so the provided " + "'network' must be of type 'GenomicNetwork'. Instead, the reaction network " + f"argument was of type '{type(network)}'." + ) + elif type(database) is PanDatabase: + metabolites_table = database.db.get_table_as_dataframe('gene_cluster_function_metabolites') + if type(network) is not PangenomicNetwork: + raise ConfigError( + "The provided 'database' was of type 'PanDatabase', so the provided 'network' " + "must be of type 'PangenomicNetwork'. Instead, the reaction network argument " + f"was of type '{type(database)}'." ) + else: + raise ConfigError( + "The provided 'database' must be of type 'ContigsDatabase' or 'PanDatabase'. " + f"Instead, the argument was of type '{type(database)}'." + ) - # Assign ModelSEED compound object attributes. for row in metabolites_table.itertuples(): + # Each row of the table contains information on a different ModelSEED compound. modelseed_compound_id = row.modelseed_compound_id compound = network.metabolites[modelseed_compound_id] modelseed_compound_name: str = row.modelseed_compound_name @@ -1854,54 +2347,6 @@ def load_contigs_database_network( charge: int = row.charge compound.charge = charge if not np.isnan(charge) else None - # Remove any trace of genes that do not contribute to the reaction network. Also remove - # unnetworked KO links to genes. - unnetworked_gcids = [] - for gcid, gene in network.genes.items(): - gene_in_network = False - unnetworked_ko_indices: List[int] = [] - idx = 0 - for ko, e_value in zip(gene.kos, gene.e_values): - if ko.reactions: - gene_in_network = True - else: - unnetworked_ko_indices.append(idx) - idx += 1 - if gene_in_network: - for unnetworked_ko_index in reversed(unnetworked_ko_indices): - gene.kos.pop(unnetworked_ko_index) - gene.e_values.pop(unnetworked_ko_index) - else: - unnetworked_gcids.append(gcid) - for gcid in unnetworked_gcids: - network.genes.pop(gcid) - - # Remove any trace of KOs that do not contribute to the reaction network. - unnetworked_ko_ids = [] - for ko_id, ko in network.kos.items(): - if not ko.reactions: - unnetworked_ko_ids.append(ko_id) - for ko_id in unnetworked_ko_ids: - network.kos.pop(ko_id) - - # Remove entries showing KO KEGG reaction aliases of ModelSEED reactions where there are no aliases. - modelseed_reaction_ids = [] - for modelseed_reaction_id, kegg_reaction_ids in network.modelseed_kegg_aliases.items(): - if not kegg_reaction_ids: - modelseed_reaction_ids.append(modelseed_reaction_id) - for modelseed_reaction_id in modelseed_reaction_ids: - network.modelseed_kegg_aliases.pop(modelseed_reaction_id) - - # Remove entries showing EC number aliases of ModelSEED reactions where there are no aliases. - modelseed_reaction_ids = [] - for modelseed_reaction_id, ec_numbers in network.modelseed_ec_number_aliases.items(): - if not ec_numbers: - modelseed_reaction_ids.append(modelseed_reaction_id) - for modelseed_reaction_id in modelseed_reaction_ids: - network.modelseed_ec_number_aliases.pop(modelseed_reaction_id) - - return network - def make_network( self, contigs_db: str = None, @@ -1927,11 +2372,12 @@ def make_network( pan_db : str, None Path to a pan database. The pangenomic network is determined for gene clusters stored in the database. If 'store' is True, the network is saved in the database. - 'genomes_storage_db' is also required. + An argument for the paired 'genomes_storage_db' is also required. genomes_storage_db : str, None Path to a genomes storage database. The pangenomic network is derived from gene KO - annotations stored in the database. 'pan_db' is also required. + annotations stored in the database. An argument for the paired 'pan_db' is also + required. store : bool, True Save the network. A network constructed from a contigs database is stored in that @@ -2523,6 +2969,7 @@ def make_pangenomic_network( gene_cluster = GeneCluster() gene_cluster.gene_cluster_id = gene_cluster_id + gene_cluster.genomes = list(pan_super.gene_clusters[gene_cluster_id]) # Add the gene cluster to the network, regardless of whether it yields reactions. Gene # clusters not contributing to the reaction network are removed later. network.gene_clusters[gene_cluster_id] = gene_cluster diff --git a/anvio/docs/artifacts/reaction-network-json.md b/anvio/docs/artifacts/reaction-network-json.md index 2b3e5bf2da..bf1a15aeb8 100644 --- a/anvio/docs/artifacts/reaction-network-json.md +++ b/anvio/docs/artifacts/reaction-network-json.md @@ -1,5 +1,5 @@ This artifact represents **a JSON-formatted file derived from a %(reaction-network)s**. -The program, %(anvi-get-metabolic-model-file)s, produces this file from the %(reaction-network)s stored in a %(contigs-db)s. The genes, reactions, and metabolites predicted to be involved in metabolism can be inspected in this file, which is formatted for compatability with software used for flux balance analysis, such as [COBRApy](https://opencobra.github.io/cobrapy/). +The program, %(anvi-get-metabolic-model-file)s, produces this file from the %(reaction-network)s stored in a %(contigs-db)s or %(pan-db)s. The genes, reactions, and metabolites predicted to be involved in metabolism can be inspected in this file, which is formatted for compatability with software used for flux balance analysis, such as [COBRApy](https://opencobra.github.io/cobrapy/). %(anvi-get-metabolic-model-file)s includes an "objective function" as the first entry of the "reactions" section of the file, a prerequisite for flux balance analysis. The objective function represents the biomass composition of metabolites in the ["core metabolism" of *E. coli*](http://bigg.ucsd.edu/models/e_coli_core). diff --git a/anvio/docs/artifacts/reaction-network.md b/anvio/docs/artifacts/reaction-network.md index 331af5ba57..bb9df9371a 100644 --- a/anvio/docs/artifacts/reaction-network.md +++ b/anvio/docs/artifacts/reaction-network.md @@ -1,5 +1,5 @@ -This artifact represents **the metabolic reaction network stored in a %(contigs-db)s by %(anvi-reaction-network)s.** +This artifact represents **the metabolic reaction network stored in a %(contigs-db)s or a %(pan-db)s by %(anvi-reaction-network)s.** -The program, %(anvi-reaction-network)s, generates a reaction network from genes encoding enzymes in the %(contigs-db)s. The reaction network represents biochemical reactions and the constituent metabolites predicted from the genome. The program relies upon [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) annotations of protein-coding genes and reference data in the [ModelSEED Biochemistry database](https://github.com/ModelSEED/ModelSEEDDatabase), and is therefore subject to all the limitations thereof, including incomplete annotation of genes with protein orthologs and imprecise knowledge of the reactions catalyzed by enzymes. +The program, %(anvi-reaction-network)s, generates a reaction network from genes encoding enzymes in the %(contigs-db)s or from gene clusters with consensus enzyme annotations in the %(pan-db)s. The reaction network represents biochemical reactions and the constituent metabolites predicted from the genome or pangenome. The program relies upon [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) annotations of protein-coding genes and reference data in the [ModelSEED Biochemistry database](https://github.com/ModelSEED/ModelSEEDDatabase), and is therefore subject to all the limitations thereof, including incomplete annotation of genes with protein orthologs and imprecise knowledge of the reactions catalyzed by enzymes. -The representation of the reaction network in two tables of the %(contigs-db)s, `gene_function_reactions` and `gene_function_metabolites`, is generalizable to other sources of metabolic data, linking genes to predicted functional orthologs and the associated reactions and metabolites. This data can be exported to a JSON-formatted file by %(anvi-get-metabolic-model-file)s for inspection and metabolic model analyses. +The representation of the reaction network in two tables of the %(contigs-db)s, `gene_function_reactions` and `gene_function_metabolites`, is generalizable to other sources of metabolic data, linking genes to predicted functional orthologs and the associated reactions and metabolites. Reaction and metabolite data are likewise stored in the identically formatted tables, `gene_cluster_function_reactions` and `gene_cluster_function_metabolites`, in the %(pan-db)s. This data can be exported to a JSON-formatted file by %(anvi-get-metabolic-model-file)s for inspection and metabolic model analyses. diff --git a/anvio/docs/programs/anvi-get-metabolic-model-file.md b/anvio/docs/programs/anvi-get-metabolic-model-file.md index 16fcb89930..361fd98b3e 100644 --- a/anvio/docs/programs/anvi-get-metabolic-model-file.md +++ b/anvio/docs/programs/anvi-get-metabolic-model-file.md @@ -1,30 +1,60 @@ -This program **exports a metabolic %(reaction-network)s from a %(contigs-db)s to a %(reaction-network-json)s file** suitable for inspection and flux balance analysis. +This program **exports a metabolic %(reaction-network)s from a %(contigs-db)s OR a %(pan-db)s and %(genomes-storage-db)s to a %(reaction-network-json)s file** formatted for flux balance analysis. -The required input to this program is a %(contigs-db)s in which a %(reaction-network)s has been stored by %(anvi-reaction-network)s. +The required input to this program is a %(contigs-db)s OR a %(pan-db)s in which a %(reaction-network)s has been stored by %(anvi-reaction-network)s. The %(pan-db)s must be accompanied by a %(genomes-storage-db)s input. -The %(reaction-network-json)s file output contains sections on the metabolites, reactions, and genes constituting the %(reaction-network)s that had been predicted from the genome. An "objective function" representing the biomass composition of metabolites in the ["core metabolism" of *E. coli*](http://bigg.ucsd.edu/models/e_coli_core) is automatically added as the first entry in the "reactions" section of the file and can be deleted as needed. An objective function is needed for flux balance analysis. +The %(reaction-network-json)s file output contains sections on the metabolites, reactions, and genes (or gene clusters) constituting the %(reaction-network)s that had been predicted from the genome (or pangenome). An "objective function" representing the biomass composition of metabolites in the ["core metabolism" of *E. coli*](http://bigg.ucsd.edu/models/e_coli_core) is automatically added as the first entry in the "reactions" section of the file and can be deleted as needed. An objective function is needed for flux balance analysis. ## Usage -%(anvi-get-metabolic-model-file)s requires a %(contigs-db)s as input and the path to an output %(reaction-network-json)s file. +%(anvi-get-metabolic-model-file)s requires a %(contigs-db)s OR a %(pan-db)s and %(genomes-storage-db)s as input, plus the path to an output %(reaction-network-json)s file. {{ codestart }} -anvi-get-metabolic-model-file -c %(contigs-db)s \ +anvi-get-metabolic-model-file -c /path/to/contigs-db \ -o /path/to/ouput.json {{ codestop }} -An existing file at the target output location must be explicitly overwritten with the `-W` flag. +{{ codestart }} +anvi-get-metabolic-model-file -p /path/to/pan-db \ + -g /path/to/genomes-storage-db \ + -o /path/to/output.json +{{ codestop }} + +An existing file at the target output location must be explicitly overwritten with the flag, `--overwrite-output-destinations`. {{ codestart }} -anvi-get-metabolic-model-file -c %(contigs-db)s \ +anvi-get-metabolic-model-file -c /path/to/contigs-db \ -o /path/to/output.json \ - -W + --overwrite-output-destinations {{ codestop }} -The flag, `--remove-missing-objective-metabolites` must be used to remove metabolites in the *E. coli* core biomass objective function from the output file if the metabolites are not produced or consumed by the predicted %(reaction-network)s. [COBRApy](https://opencobra.github.io/cobrapy/), for instance, cannot load the JSON file if metabolites in the objective function are missing from the genomic model. +The flag, `--remove-missing-objective-metabolites` must be used to remove metabolites in the *E. coli* core biomass objective function from the %(reaction-network-json)s file if the metabolites are not produced or consumed by the predicted %(reaction-network)s. [COBRApy](https://opencobra.github.io/cobrapy/), for instance, cannot load the JSON file if metabolites in the objective function are missing from the model. {{ codestart }} -anvi-get-metabolic-model-file -c %(contigs-db)s \ +anvi-get-metabolic-model-file -c /path/to/contigs-db \ -o /path/to/output.json \ --remove-missing-objective-metabolites {{ codestop }} + +It is possible that the gene KO annotations used to construct the stored reaction network have since been changed in the %(contigs-db)s or the %(genomes-storage-db)s. By default, without using the flag, `--ignore-changed-gene-annotations`, this program checks that the set of gene KO annotations that is currently stored was also that used in construction of the %(reaction-network)s, and raises an error if this is not the case. Use of this flag ignores that check, permitting the set of gene annotations to have changed since creation of the network. + +{{ codestart }} +anvi-get-metabolic-model-file -p /path/to/contigs-db \ + -o /path/to/output.json \ + --ignore-changed-gene-annotations +{{ codestop }} + +For a pangenomic network, the option `--record-genomes` determines which additional information is added to the output %(reaction-network-json)s file regarding genome membership. By default, genome names are recorded for gene clusters and reactions, which is equivalent to `--record-genomes cluster reaction`. 'cluster' records in the 'notes' section of each 'gene' (cluster) entry in the JSON file which genomes are part of the cluster. 'reaction' and 'metabolite', respectively, record the genomes predicted to encode enzymes associated with reaction and metabolite entries. The arguments, 'cluster', 'reaction', and 'metabolite', are valid, and are all used in the following example. + +{{ codestart }} +anvi-get-metabolic-model-file -p /path/to/pan-db \ + -g /path/to/genomes-storage-db \ + --record-genomes cluster reaction metabolite +{{ codestop }} + +The use of `--record-genomes` as a flag without any arguments prevents genome membership from being recorded at all in the %(reaction-network-json)s file. + +{{ codestart }} +anvi-get-metabolic-model-file -p /path/to/pan-db \ + -g /path/to/genomes-storage-db \ + --record-genomes +{{ codestop }} diff --git a/anvio/docs/programs/anvi-reaction-network.md b/anvio/docs/programs/anvi-reaction-network.md index ede8fae597..9f36880fb5 100644 --- a/anvio/docs/programs/anvi-reaction-network.md +++ b/anvio/docs/programs/anvi-reaction-network.md @@ -1,12 +1,12 @@ -This program **stores a metabolic %(reaction-network)s in a %(contigs-db)s.** +This program **stores a metabolic %(reaction-network)s in a %(contigs-db)s or %(pan-db)s.** -The network consists of data on biochemical reactions predicted to be encoded by the genome, referencing the [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) databases. +The network consists of data on biochemical reactions predicted to be encoded by the genome or pangenome, referencing the [KEGG Orthology (KO)](https://www.genome.jp/kegg/ko.html) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) databases. -Information on the predicted reactions and the involved metabolites are stored in two tables of the %(contigs-db)s. The program, %(anvi-get-metabolic-model-file)s, can be used to export the %(reaction-network)s from the database to a %(reaction-network-json)s file suitable for inspection and flux balance analysis. +Information on the predicted reactions and the involved metabolites are stored in two tables of the %(contigs-db)s or %(pan-db)s. The program, %(anvi-get-metabolic-model-file)s, can be used to export the %(reaction-network)s from the database to a %(reaction-network-json)s file formatted for flux balance analysis. ## Usage -%(anvi-reaction-network)s takes a %(contigs-db)s as required input. Genes stored within the database must have KO protein annotations, which can be assigned by %(anvi-run-kegg-kofams)s. +%(anvi-reaction-network)s takes a either a %(contigs-db)s OR a %(pan-db)s and %(genomes-storage-db)s as required input. Genes stored within the %(contigs-db)s or %(genomes-storage-db)s must have KO protein annotations, which can be assigned by %(anvi-run-kegg-kofams)s. The KO and ModelSEED Biochemistry databases must be set up and available to the program. By default, these are expected to be set up in default anvi'o data directories. %(anvi-setup-kegg-data)s and %(anvi-setup-modelseed-database)s must be run to set up these databases. @@ -17,11 +17,21 @@ anvi-reaction-network -c /path/to/contigs-db Custom locations for the reference databases can be provided with the flags, `--ko-dir` and `--modelseed-dir`. {{ codestart }} -anvi-reaction-network -c /path/to/contigs-db --ko-dir /path/to/set-up/ko-dir --modelseed-dir /path/to/set-up/modelseed-dir +anvi-reaction-network -c /path/to/contigs-db \ + --ko-dir /path/to/set-up/ko-dir \ + --modelseed-dir /path/to/set-up/modelseed-dir {{ codestop }} If a %(contigs-db)s already contains a %(reaction-network)s from a previous run of this program, the flag `--overwrite-existing-network` can overwrite the existing network with a new one. For example, if %(anvi-run-kegg-kofams)s is run again on a database using a newer version of KEGG, then %(anvi-reaction-network)s should be rerun to update the %(reaction-network)s derived from the KO annotations. {{ codestart }} -anvi-reaction-network -c /path/to/contigs-db --overwrite-existing-network +anvi-reaction-network -c /path/to/contigs-db \ + --overwrite-existing-network +{{ codestop }} + +A %(reaction-network)s can also be generated from consensus KO annotations of gene clusters. This can be used to understand the conservation or divergence of parts of the metabolic network between organisms in the pangenome. + +{{ codestart }} +anvi-reaction-network -p /path/to/pan-db \ + -g /path/to/genomes-storage-db {{ codestop }} diff --git a/bin/anvi-get-metabolic-model-file b/bin/anvi-get-metabolic-model-file index 2830ec5cf0..d2444378d1 100755 --- a/bin/anvi-get-metabolic-model-file +++ b/bin/anvi-get-metabolic-model-file @@ -1,16 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -DESCRIPTION = """This program writes a metabolic reaction network to a file suitable for flux balance analysis.""" +DESCRIPTION = """This program exports a metabolic reaction network to a file suitable for flux balance analysis.""" from sys import exit from argparse import Namespace -# import anvio.biochemistry.refdbs as refdbs -# import anvio.biochemistry.metabolicmodel as metabolicmodel import anvio.biochemistry.reactionnetwork as reactionnetwork from anvio import A, K -# from anvio.terminal import Run from anvio.errors import ConfigError from anvio import __version__ as VERSION from anvio.argparse import ArgumentParser @@ -29,40 +26,82 @@ __description__ = DESCRIPTION def main() -> None: args = get_args() + + record_genomes = [] + for arg in args.record_genomes: + if arg == 'cluster': + record_genomes.append('gene cluster') + else: + record_genomes.append(arg) + record_genomes = tuple(record_genomes) + constructor = reactionnetwork.Constructor() - network = constructor.load_contigs_database_network(args.contigs_db) - network.export_json( - args.output_file, - overwrite=args.overwrite_output_destinations, - objective='e_coli_core', - remove_missing_objective_metabolites=args.remove_missing_objective_metabolites - ) - # if args.contigs_db: - # external_genome = metabolicmodel.ExternalGenome( - # args.contigs_db, - # db_superdir=args.db_superdir, - # remove_missing_objective_metabolites=args.remove_missing_objective_metabolites - # ) - # external_genome.write_cobrapy_json(args.output_file) - # elif args.genomes_storage and args.pan_db: - # pangenome = metabolicmodel.Pangenome( - # args.genomes_storage, - # args.pan_db, - # collection_name=args.collection_name, - # bin_id=args.bin_id, - # db_superdir=args.db_superdir, - # discard_ties=args.discard_ties, - # consensus_threshold=args.consensus_threshold, - # remove_missing_objective_metabolites=args.remove_missing_objective_metabolites - # ) - # pangenome.write_cobrapy_json(args.output_file) + if args.contigs_db: + network: reactionnetwork.GenomicNetwork = constructor.load_network( + contigs_db=args.contigs_db, + check_gene_annotations=not args.ignore_changed_gene_annotations + ) + network.export_json( + args.output_file, + overwrite=args.overwrite_output_destinations, + objective='e_coli_core', + remove_missing_objective_metabolites=args.remove_missing_objective_metabolites + ) + elif args.pan_db or args.genomes_storage_db: + network: reactionnetwork.PangenomicNetwork = constructor.load_network( + pan_db=args.pan_db, + genomes_storage_db=args.genomes_storage, + check_gene_annotations=not args.ignore_changed_gene_annotations + ) + network.export_json( + args.output_file, + overwrite=args.overwrite_output_destinations, + objective='e_coli_core', + remove_missing_objective_metabolites=args.remove_missing_objective_metabolites, + record_genomes=record_genomes + ) + else: + raise ConfigError( + "Either a contigs database (`--contigs-db`) OR a pan database (`--pan-db`) and genomes " + "storage database (`--genomes-storage`) must be provided. These contain the reaction " + "network to be loaded and exported." + ) def get_args() -> Namespace: parser = ArgumentParser(description=DESCRIPTION) - parser.add_argument(*A('contigs-db'), **K('contigs-db')) - parser.add_argument(*A('output-file'), **K('output-file')) - parser.add_argument(*A('overwrite-output-destinations'), **K('overwrite-output-destinations')) - parser.add_argument( + + groupA = parser.add_argument_group( + "SINGLE GENOME OR METAGENOME", + "Export a reaction network stored in a contigs database." + ) + groupA.add_argument(*A('contigs-db'), **K('contigs-db', {'required': False})) + + groupB = parser.add_argument_group( + "PANGENOME", + "Export a reaction network stored in a pan database, with the associated genomes storage " + "database also required." + ) + groupB.add_argument(*A('pan-db'), **K('pan-db', {'required': False})) + groupB.add_argument(*A('genomes-storage'), **K('genomes-storage', {'required': False})) + groupB.add_argument( + '--record-genomes', nargs='*', default=['cluster', 'reaction'], + help=( + "This option records the genome membership of gene clusters in JSON entries. By " + "default, genome names are recorded for gene clusters and reactions: this is " + "equivalent to the argument 'cluster reaction'. To not record genome membership at " + "all, use this as a flag without any arguments, i.e., '--record-genomes'. The " + "following arguments can be provided in any combination: 'cluster', 'reaction', and " + "'metabolite'. 'reaction' and 'metabolite' record the genomes predicted to encode " + "enzymes associated with reactions and metabolites, respectively." + ) + ) + + groupC = parser.add_argument_group("OUTPUT") + groupC.add_argument(*A('output-file'), **K('output-file')) + groupC.add_argument(*A('overwrite-output-destinations'), **K('overwrite-output-destinations')) + + groupD = parser.add_argument_group("OTHER OPTIONS") + groupD.add_argument( '--remove-missing-objective-metabolites', default=False, action='store_true', help=( "The metabolic model file includes the core E. coli biomass objective function (BOF), " @@ -72,66 +111,17 @@ def get_args() -> Namespace: "metabolites. Removed BOF metabolite data are reported to the terminal." ) ) - - # groupA = parser.add_argument_group( - # "SINGLE GENOME INPUT", "Generate a metabolic model file from an \"external\" genome." - # ) - # groupA.add_argument(*A('contigs-db'), **K('contigs-db', {'required': False})) - - # groupB = parser.add_argument_group( - # "PANGENOME INPUT", - # "Generate a metabolic model file from a pangenome. 'genomes-storage' and 'pan-db' are " - # "required. Gene clusters can be selected from the pangenome with both 'collection-name' " - # "and 'bin-id'." - # ) - # groupB.add_argument(*A('genomes-storage'), **K('genomes-storage')) - # groupB.add_argument(*A('pan-db'), **K('pan-db', {'required': False})) - # groupB.add_argument(*A('collection-name'), **K('collection-name')) - # groupB.add_argument(*A('bin-id'), **K('bin-id')) - # groupB.add_argument( - # '--consensus-threshold', default=None, type=float, - # help=( - # "If this argument is provided, then a protein annotation must be assigned to this " - # "minimum proportion of genes in a cluster to be imparted to the cluster itself. " - # "By default, without this argument, the annotation assigned to the most genes becomes " - # "that of the cluster (also see '--discard-ties'). The consensus threshold must be a " - # "number from 0 to 1." - # ) - # ) - # groupB.add_argument( - # '--discard-ties', default=False, action='store_true', - # help=( - # "By default, a gene cluster is assigned a protein ortholog annotation by finding the " - # "ortholog that occurs in the greatest number of genes in the cluster (see " - # "--consensus-threshold) and arbitrarily choosing one ortholog in case of a tie. With " - # "this flag, the cluster is not assigned an ortholog annotation in case of a tie." - # ) - # ) - - # groupC = parser.add_argument_group("OUTPUT", "Output path of COBRApy JSON file") - # groupC.add_argument(*A('output-file'), **K('output-file')) - - # groupD = parser.add_argument_group( - # "MODEL", "Parameterization of the COBRApy JSON file unrelated to genomic inputs" - # ) - # groupD.add_argument( - # '--remove-missing-objective-metabolites', default=False, action='store_true', - # help=( - # "The metabolic model file includes the core E. coli biomass objective function (BOF), " - # "as in the COBRApy example file, 'e_coli_core.json'. With this flag, metabolites in " - # "the BOF are removed if they are not produced or consumed by enzymes in the model. " - # "Without this flag, COBRApy cannot load the output JSON file if there are missing BOF " - # "metabolites. Removed BOF metabolite data are reported to the terminal." - # ) - # ) - - # groupE = parser.add_argument_group("DATABASE", "Protein reference database information") - # groupE.add_argument( - # '--db-superdir', default=refdbs.ProteinReferenceDatabase.default_superdir, type=str, - # help=( - # "Directory in which database subdirectories (e.g., 'ModelSEED', 'KEGG') are located." - # ) - # ) + groupD.add_argument( + '--ignore-changed-gene-annotations', default=False, action='store_true', + help=( + "It is possible that the gene KO annotations used to construct the stored reaction " + "network have since been changed in the database. By default, without using this flag, " + "this program checks that the set of gene KO annotations that is currently stored was " + "also that used in construction of the reaction network, and raises an error if this " + "is not the case. Use of this flag ignores that check, permitting the set of gene " + "annotations to have changed since creation of the reaction network." + ) + ) args = parser.get_args(parser) return args diff --git a/bin/anvi-reaction-network b/bin/anvi-reaction-network index 099c63b753..b48f0d4fd3 100755 --- a/bin/anvi-reaction-network +++ b/bin/anvi-reaction-network @@ -1,16 +1,17 @@ #!/usr/bin/env python # -*- coding: utf-8 +DESCRIPTION = """This program generates a metabolic reaction network in an anvi'o contigs or pan database.""" import sys from argparse import Namespace -from anvio import A as A, K as K, __version__ as VERSION - from anvio.argparse import ArgumentParser from anvio.errors import ConfigError, FilesNPathsError +from anvio import A as A, K as K, __version__ as VERSION from anvio.biochemistry.reactionnetwork import Constructor + __author__ = "Developers of anvi'o (see AUTHORS.txt)" __copyright__ = "Copyleft 2015-2023, the Meren Lab (http://merenlab.org/)" __license__ = "GPL 3.0" @@ -18,7 +19,7 @@ __version__ = VERSION __authors__ = ["semiller10"] __requires__ = ["contigs-db", "kegg-functions", "reaction-ref-data", "kegg-data"] __provides__ = ["reaction-network"] -__description__ = "Generate a metabolic reaction network in an anvi'o contigs database" +__description__ = DESCRIPTION def main() -> None: