Merge pull request #34 from monarch-initiative/33-use-genes_to_diseas…

…e-from-hpo-in-place-of-disease-pg 33 use genes to disease from hpo in place of disease pg
monarch-initiative · Nov 8, 2024 · 79c718a · 79c718a
2 parents df9a34b + 19d579d
commit 79c718a
Show file tree

Hide file tree

Showing 7 changed files with 57 additions and 96 deletions.
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ p2p create --phenotype-annotation /path/to/phenotype.hpoa --output-dir /path/to/
 To add known gene-to-phenotype relationships to phenopackets:
 
 ```shell
-p2p add-genes --phenopacket-dir /path/to/synthetic-phenopackets --disease-pg /path/to/disease.pg --hgnc-data /path/to/hgnc_complete_set.txt --output-dir /path/to/output-dir
+p2p add-genes --phenopacket-dir /path/to/synthetic-phenopackets --genes-to-disease /path/to/genes_to_disease.txt --hgnc-data /path/to/hgnc_complete_set.txt --output-dir /path/to/output-dir
 ```
 
-> **_NOTE:_** To add known gene-to-phenotype the Exomiser disease.pg file is expected
+> **_NOTE:_** To add known gene-to-phenotype relationships, the HPO `genes_to_disease.txt` file is expected. It can be downloaded [here](https://hpo.jax.org/data/annotations).
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "phenotype2phenopacket"
-version = "0.5.1"
+version = "0.6.0"
 description = ""
 authors = ["Yasemin Bridges <[email protected]>"]
 readme = "README.md"

diff --git a/src/phenotype2phenopacket/add/add_genes.py b/src/phenotype2phenopacket/add/add_genes.py
@@ -18,24 +18,24 @@
 
 
 def get_phenotype_to_disease_entries(
-    omim_disease_pg: pl.DataFrame, disease: Disease
+    genes_to_disease: pl.DataFrame, disease: Disease
 ) -> pl.DataFrame:
     """
-    Return disease.pg entries that match the provided OMIM disease ID.
+    Return genes_to_disease.txt entries whose disease_id matches the provided disease.
 
     Args:
-        omim_disease_pg (pl.DataFrame): DataFrame containing disease.pg entries.
+        genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries.
         disease (Disease): Disease object containing the OMIM disease ID.
 
     Returns:
         pl.DataFrame: Filtered DataFrame containing entries matching the OMIM disease ID.
     """
-    return omim_disease_pg.filter(pl.col("database_id") == disease.term.id)
+    return genes_to_disease.filter(pl.col("disease_id") == disease.term.id)
 
 
 def add_genes(
     phenopacket_path: Path,
-    disease_pg: pl.DataFrame,
+    genes_to_disease: pl.DataFrame,
     gene_identifier_updater: GeneIdentifierUpdater,
     output_dir: Path,
 ):
@@ -44,21 +44,21 @@ def add_genes(
 
     Args:
         phenopacket_path (Path): Path to the phenopacket file.
-        disease_pg (pl.DataFrame): DataFrame containing disease.pg entries.
+        genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries.
         gene_identifier_updater (GeneIdentifierUpdater): Object for updating gene identifiers.
         output_dir (Path): Directory to write the updated phenopacket.
 
     """
     phenopacket = phenopacket_reader(phenopacket_path)
     disease = PhenopacketUtil(phenopacket).return_phenopacket_disease()
-    filtered_disease_pg = get_phenotype_to_disease_entries(disease_pg, disease)
-    if len(filtered_disease_pg) == 0:
+    filtered_genes_to_disease = get_phenotype_to_disease_entries(genes_to_disease, disease)
+    if len(filtered_genes_to_disease) == 0:
         print(f"No gene-to-phenotype matches: {disease.term.id}, {disease.term.label}")
     else:
         phenopacket_with_genes = PhenopacketInterpretationExtender(
             phenopacket
         ).add_gene_interpretation_to_phenopacket(
-            omim_disease_phenotype_gene_map=filtered_disease_pg,
+            genes_to_disease_map=filtered_genes_to_disease,
             gene_identifier_updater=gene_identifier_updater,
         )
         (
@@ -68,13 +68,13 @@ def add_genes(
         )
 
 
-def add_genes_to_directory(phenopacket_dir: Path, disease_pg: pl.DataFrame, output_dir: Path):
+def add_genes_to_directory(phenopacket_dir: Path, genes_to_disease: pl.DataFrame, output_dir: Path):
     """
     Add known gene-to-phenotype relationships to the interpretations of a directory of phenopackets.
 
     Args:
         phenopacket_dir (Path): Directory containing the phenopacket files.
-        disease_pg (pl.DataFrame): DataFrame containing disease.pg entries.
+        genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries.
         output_dir (Path): Directory to store the updated phenopackets.
     """
     hgnc_dict = create_hgnc_dict()
@@ -83,4 +83,4 @@ def add_genes_to_directory(phenopacket_dir: Path, disease_pg: pl.DataFrame, outp
         gene_identifier="ensembl_id", hgnc_data=hgnc_dict, identifier_map=identifier_map
     )
     for phenopacket_path in all_files(phenopacket_dir):
-        add_genes(phenopacket_path, disease_pg, gene_identifier_updater, output_dir)
+        add_genes(phenopacket_path, genes_to_disease, gene_identifier_updater, output_dir)
diff --git a/src/phenotype2phenopacket/cli_add.py b/src/phenotype2phenopacket/cli_add.py
@@ -3,7 +3,7 @@
 import click
 
 from phenotype2phenopacket.add.add_genes import add_genes_to_directory
-from phenotype2phenopacket.utils.utils import read_disease_pg
+from phenotype2phenopacket.utils.utils import read_genes_to_disease
 
 
 @click.command("add-genes")
@@ -15,10 +15,10 @@
     type=Path,
 )
 @click.option(
-    "--disease-pg",
-    "-d",
+    "--genes-to-disease",
+    "-g",
     required=True,
-    help="Path to disease.pg data file.",
+    help="Path to genes_to_disease.txt data file.",
     type=Path,
 )
 @click.option(
@@ -30,21 +30,21 @@
 )
 def add_genes_command(
     phenopacket_dir: Path,
-    disease_pg: Path,
+    genes_to_disease: Path,
     output_dir: Path,
 ):
     """
     Add known gene-to-phenotype relationships to a set of phenopackets in a directory.
 
     Args:
         phenopacket_dir (Path): Directory containing the phenopacket files.
-        disease_pg (Path): Path to the disease.pg file.
+        genes_to_disease (Path): Path to the genes_to_disease.txt file.
         output_dir (Path): Directory to store the updated phenopackets.
     """
     output_dir.mkdir(exist_ok=True)
-    disease_pg_df = read_disease_pg(disease_pg)
+    genes_to_disease_df = read_genes_to_disease(genes_to_disease)
     add_genes_to_directory(
         phenopacket_dir,
-        disease_pg_df,
+        genes_to_disease_df,
         output_dir,
     )
diff --git a/src/phenotype2phenopacket/utils/phenopacket_utils.py b/src/phenotype2phenopacket/utils/phenopacket_utils.py
@@ -861,45 +861,41 @@ def __init__(self, phenopacket: Phenopacket):
 
     @staticmethod
     def create_gene_genomic_interpretation(
-        gene_to_phenotype_entry: dict, gene_identifier_updater: GeneIdentifierUpdater
+        gene_to_disease_entry: dict, gene_identifier_updater: GeneIdentifierUpdater
     ) -> GenomicInterpretation:
         """
         Create genomic interpretation for a gene-to-phenotype relationship.
 
         This method generates a GenomicInterpretation object based on a gene-to-phenotype relationship entry.
 
         Args:
-            gene_to_phenotype_entry (dict): A dictionary representing a gene-to-phenotype relationship.
+            gene_to_disease_entry (dict): A dictionary representing a gene-to-disease relationship.
             gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
 
         Returns:
             GenomicInterpretation or None: An instance of GenomicInterpretation representing the interpretation
                                            of the gene-to-phenotype relationship or None if unsuccessful.
         """
         try:
-            gene_symbol = gene_identifier_updater.obtain_gene_symbol_from_identifier(
-                str(gene_to_phenotype_entry["entrez_id"])
-            )
+            gene_symbol = gene_to_disease_entry["gene_symbol"]
             return GenomicInterpretation(
                 subject_or_biosample_id="patient1",
-                interpretation_status=(
-                    4 if gene_to_phenotype_entry["disease_name"].startswith("?") is False else 0
-                ),
+                interpretation_status=4,
                 gene=GeneDescriptor(
                     value_id=gene_identifier_updater.find_identifier(gene_symbol),
                     symbol=gene_symbol,
                 ),
             )
         except KeyError:
-            print(f"Unable to find gene_symbol for {gene_to_phenotype_entry['entrez_id']}")
+            print(f"Unable to find gene identifier for {gene_to_disease_entry.get('gene_symbol')}")
             return None
         except TypeError:
-            print("N/A value", gene_to_phenotype_entry)
+            print("N/A value", gene_to_disease_entry)
             return None
 
     def create_gene_genomic_interpretations(
         self,
-        omim_disease_phenotype_gene_map: pl.DataFrame,
+        genes_to_disease_map: pl.DataFrame,
         gene_identifier_updater: GeneIdentifierUpdater,
     ) -> List[GenomicInterpretation]:
         """
@@ -909,25 +905,25 @@ def create_gene_genomic_interpretations(
         containing known gene-to-phenotype relationships.
 
         Args:
-            omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
+            genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
             gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
 
         Returns:
             List[GenomicInterpretation]: A list of GenomicInterpretation objects representing the interpretations
                   of gene-to-phenotype relationships.
         """
         genomic_interpretations = []
-        for phenotype_entry in omim_disease_phenotype_gene_map.rows(named=True):
+        for disease_entry in genes_to_disease_map.rows(named=True):
             genomic_interpretation = self.create_gene_genomic_interpretation(
-                phenotype_entry, gene_identifier_updater
+                disease_entry, gene_identifier_updater
             )
             if genomic_interpretation is not None:
                 genomic_interpretations.append(genomic_interpretation)
         return genomic_interpretations
 
     def create_gene_diagnosis(
         self,
-        omim_disease_phenotype_gene_map: pl.DataFrame,
+        genes_to_disease_map: pl.DataFrame,
         gene_identifier_updater: GeneIdentifierUpdater,
         disease: Disease,
     ) -> Diagnosis:
@@ -938,7 +934,7 @@ def create_gene_diagnosis(
         provided in a DataFrame and a Disease object.
 
         Args:
-            omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
+            genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
             gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
             disease (Disease): An instance of Disease representing the disease information.
 
@@ -947,7 +943,7 @@ def create_gene_diagnosis(
                                or None if no genomic interpretations were found.
         """
         genomic_interpretations = self.create_gene_genomic_interpretations(
-            omim_disease_phenotype_gene_map, gene_identifier_updater
+            genes_to_disease_map, gene_identifier_updater
         )
         return (
             Diagnosis(
@@ -963,7 +959,7 @@ def create_gene_diagnosis(
 
     def create_gene_interpretation(
         self,
-        omim_disease_phenotype_gene_map: pl.DataFrame,
+        genes_to_disease_map: pl.DataFrame,
         gene_identifier_updater: GeneIdentifierUpdater,
     ) -> Interpretation:
         """
@@ -973,7 +969,7 @@ def create_gene_interpretation(
         provided in a DataFrame and a GeneIdentifierUpdater instance.
 
         Args:
-            omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
+            genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
             gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
 
         Returns:
@@ -983,7 +979,7 @@ def create_gene_interpretation(
         phenopacket_util = PhenopacketUtil(self.phenopacket)
         disease = phenopacket_util.return_phenopacket_disease()
         diagnosis = self.create_gene_diagnosis(
-            omim_disease_phenotype_gene_map, gene_identifier_updater, disease
+            genes_to_disease_map, gene_identifier_updater, disease
         )
         return (
             Interpretation(
@@ -997,7 +993,7 @@ def create_gene_interpretation(
 
     def add_gene_interpretation_to_phenopacket(
         self,
-        omim_disease_phenotype_gene_map: pl.DataFrame,
+        genes_to_disease_map: pl.DataFrame,
         gene_identifier_updater: GeneIdentifierUpdater,
     ) -> Phenopacket:
         """
@@ -1006,7 +1002,7 @@ def add_gene_interpretation_to_phenopacket(
         This method adds gene-based interpretations to a copy of the Phenopacket.
 
         Args:
-            omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
+            genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
             gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
 
         Returns:
@@ -1015,9 +1011,7 @@ def add_gene_interpretation_to_phenopacket(
         """
         phenopacket_copy = copy(self.phenopacket)
         interpretations = [
-            self.create_gene_interpretation(
-                omim_disease_phenotype_gene_map, gene_identifier_updater
-            )
+            self.create_gene_interpretation(genes_to_disease_map, gene_identifier_updater)
         ]
         if interpretations is not None:
             phenopacket_copy.interpretations.extend(interpretations)

diff --git a/src/phenotype2phenopacket/utils/utils.py b/src/phenotype2phenopacket/utils/utils.py
@@ -28,35 +28,15 @@ def is_float(element: any) -> bool:
         return False
 
 
-def read_disease_pg(disease_pg: Path) -> pl.DataFrame:
+def read_genes_to_disease(genes_to_disease: Path) -> pl.DataFrame:
     """
-    Read a disease.pg file and return a filtered Polars DataFrame.
-
-    This function reads the contents of a 'disease.pg' file using Polars read_csv method
-    and constructs a DataFrame. It filters the DataFrame to include only rows where the 'database_id'
-    column starts with 'OMIM'.
-
+    Read the genes_to_disease.txt file and return a Polars DataFrame.
     Args:
-        disease_pg (Path): The path to the 'disease.pg' file.
-
+        genes_to_disease (Path): Path to the genes_to_disease.txt file.
     Returns:
-        pl.DataFrame: A filtered Polars DataFrame containing specific columns and rows
-                      where 'database_id' starts with 'OMIM'.
-    """
-    disease = pl.read_csv(
-        disease_pg,
-        separator="|",
-        new_columns=[
-            "database_id",
-            "gene_mim_number",
-            "disease_name",
-            "entrez_id",
-            "diagnosis_status",
-            "inheritance",
-        ],
-        has_header=False,
-    )
-    return disease.filter(pl.col("database_id").str.starts_with("OMIM"))
+        pl.DataFrame: A Polars DataFrame containing the contents of the genes_to_disease.txt file.
+    """
+    return pl.read_csv(genes_to_disease, separator="\t")
 
 
 def load_ontology(local_cached_ontology: Path = None):