Skip to content

Commit

Permalink
Merge pull request #34 from monarch-initiative/33-use-genes_to_diseas…
Browse files Browse the repository at this point in the history
…e-from-hpo-in-place-of-disease-pg

33 use genes to disease from hpo in place of disease pg
  • Loading branch information
yaseminbridges authored Nov 8, 2024
2 parents df9a34b + 19d579d commit 79c718a
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 96 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ p2p create --phenotype-annotation /path/to/phenotype.hpoa --output-dir /path/to/
To add known gene-to-phenotype relationships to phenopackets:

```shell
p2p add-genes --phenopacket-dir /path/to/synthetic-phenopackets --disease-pg /path/to/disease.pg --hgnc-data /path/to/hgnc_complete_set.txt --output-dir /path/to/output-dir
p2p add-genes --phenopacket-dir /path/to/synthetic-phenopackets --genes-to-disease /path/to/genes_to_disease.txt --hgnc-data /path/to/hgnc_complete_set.txt --output-dir /path/to/output-dir
```

> **_NOTE:_** To add known gene-to-phenotype relationships, the Exomiser disease.pg file is expected.
> **_NOTE:_** To add known gene-to-phenotype relationships, the HPO `genes_to_disease.txt` file is expected. It can be downloaded [here](https://hpo.jax.org/data/annotations).
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "phenotype2phenopacket"
version = "0.5.1"
version = "0.6.0"
description = ""
authors = ["Yasemin Bridges <[email protected]>"]
readme = "README.md"
Expand Down
22 changes: 11 additions & 11 deletions src/phenotype2phenopacket/add/add_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,24 @@


def get_phenotype_to_disease_entries(
omim_disease_pg: pl.DataFrame, disease: Disease
genes_to_disease: pl.DataFrame, disease: Disease
) -> pl.DataFrame:
"""
Return disease.pg entries that match the provided OMIM disease ID.
Args:
omim_disease_pg (pl.DataFrame): DataFrame containing disease.pg entries.
genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries.
disease (Disease): Disease object containing the OMIM disease ID.
Returns:
pl.DataFrame: Filtered DataFrame containing entries matching the OMIM disease ID.
"""
return omim_disease_pg.filter(pl.col("database_id") == disease.term.id)
return genes_to_disease.filter(pl.col("disease_id") == disease.term.id)


def add_genes(
phenopacket_path: Path,
disease_pg: pl.DataFrame,
genes_to_disease: pl.DataFrame,
gene_identifier_updater: GeneIdentifierUpdater,
output_dir: Path,
):
Expand All @@ -44,21 +44,21 @@ def add_genes(
Args:
phenopacket_path (Path): Path to the phenopacket file.
disease_pg (pl.DataFrame): DataFrame containing disease.pg entries.
genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries.
gene_identifier_updater (GeneIdentifierUpdater): Object for updating gene identifiers.
output_dir (Path): Directory to write the updated phenopacket.
"""
phenopacket = phenopacket_reader(phenopacket_path)
disease = PhenopacketUtil(phenopacket).return_phenopacket_disease()
filtered_disease_pg = get_phenotype_to_disease_entries(disease_pg, disease)
if len(filtered_disease_pg) == 0:
filtered_genes_to_disease = get_phenotype_to_disease_entries(genes_to_disease, disease)
if len(filtered_genes_to_disease) == 0:
print(f"No gene-to-phenotype matches: {disease.term.id}, {disease.term.label}")
else:
phenopacket_with_genes = PhenopacketInterpretationExtender(
phenopacket
).add_gene_interpretation_to_phenopacket(
omim_disease_phenotype_gene_map=filtered_disease_pg,
omim_disease_phenotype_gene_map=filtered_genes_to_disease,
gene_identifier_updater=gene_identifier_updater,
)
(
Expand All @@ -68,13 +68,13 @@ def add_genes(
)


def add_genes_to_directory(phenopacket_dir: Path, disease_pg: pl.DataFrame, output_dir: Path):
def add_genes_to_directory(phenopacket_dir: Path, genes_to_disease: pl.DataFrame, output_dir: Path):
"""
Add known gene-to-phenotype relationships to the interpretations of a directory of phenopackets.
Args:
phenopacket_dir (Path): Directory containing the phenopacket files.
disease_pg (pl.DataFrame): DataFrame containing disease.pg entries.
genes_to_disease (pl.DataFrame): DataFrame containing genes_to_disease.txt entries.
output_dir (Path): Directory to store the updated phenopackets.
"""
hgnc_dict = create_hgnc_dict()
Expand All @@ -83,4 +83,4 @@ def add_genes_to_directory(phenopacket_dir: Path, disease_pg: pl.DataFrame, outp
gene_identifier="ensembl_id", hgnc_data=hgnc_dict, identifier_map=identifier_map
)
for phenopacket_path in all_files(phenopacket_dir):
add_genes(phenopacket_path, disease_pg, gene_identifier_updater, output_dir)
add_genes(phenopacket_path, genes_to_disease, gene_identifier_updater, output_dir)
16 changes: 8 additions & 8 deletions src/phenotype2phenopacket/cli_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import click

from phenotype2phenopacket.add.add_genes import add_genes_to_directory
from phenotype2phenopacket.utils.utils import read_disease_pg
from phenotype2phenopacket.utils.utils import read_genes_to_disease


@click.command("add-genes")
Expand All @@ -15,10 +15,10 @@
type=Path,
)
@click.option(
"--disease-pg",
"-d",
"--genes-to-disease",
"-g",
required=True,
help="Path to disease.pg data file.",
help="Path to genes_to_disease.txt data file.",
type=Path,
)
@click.option(
Expand All @@ -30,21 +30,21 @@
)
def add_genes_command(
phenopacket_dir: Path,
disease_pg: Path,
genes_to_disease: Path,
output_dir: Path,
):
"""
Add known gene-to-phenotype relationships to a set of phenopackets in a directory.
Args:
phenopacket_dir (Path): Directory containing the phenopacket files.
disease_pg (Path): Path to the disease.pg file.
genes_to_disease (Path): Path to the genes_to_disease.txt file.
output_dir (Path): Directory to store the updated phenopackets.
"""
output_dir.mkdir(exist_ok=True)
disease_pg_df = read_disease_pg(disease_pg)
genes_to_disease_df = read_genes_to_disease(genes_to_disease)
add_genes_to_directory(
phenopacket_dir,
disease_pg_df,
genes_to_disease_df,
output_dir,
)
44 changes: 19 additions & 25 deletions src/phenotype2phenopacket/utils/phenopacket_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,45 +861,41 @@ def __init__(self, phenopacket: Phenopacket):

@staticmethod
def create_gene_genomic_interpretation(
gene_to_phenotype_entry: dict, gene_identifier_updater: GeneIdentifierUpdater
gene_to_disease_entry: dict, gene_identifier_updater: GeneIdentifierUpdater
) -> GenomicInterpretation:
"""
Create genomic interpretation for a gene-to-phenotype relationship.
This method generates a GenomicInterpretation object based on a gene-to-phenotype relationship entry.
Args:
gene_to_phenotype_entry (dict): A dictionary representing a gene-to-phenotype relationship.
gene_to_disease_entry (dict): A dictionary representing a gene-to-disease relationship.
gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
Returns:
GenomicInterpretation or None: An instance of GenomicInterpretation representing the interpretation
of the gene-to-phenotype relationship or None if unsuccessful.
"""
try:
gene_symbol = gene_identifier_updater.obtain_gene_symbol_from_identifier(
str(gene_to_phenotype_entry["entrez_id"])
)
gene_symbol = gene_to_disease_entry["gene_symbol"]
return GenomicInterpretation(
subject_or_biosample_id="patient1",
interpretation_status=(
4 if gene_to_phenotype_entry["disease_name"].startswith("?") is False else 0
),
interpretation_status=4,
gene=GeneDescriptor(
value_id=gene_identifier_updater.find_identifier(gene_symbol),
symbol=gene_symbol,
),
)
except KeyError:
print(f"Unable to find gene_symbol for {gene_to_phenotype_entry['entrez_id']}")
print(f"Unable to find gene_symbol for {gene_to_disease_entry['entrez_id']}")
return None
except TypeError:
print("N/A value", gene_to_phenotype_entry)
print("N/A value", gene_to_disease_entry)
return None

def create_gene_genomic_interpretations(
self,
omim_disease_phenotype_gene_map: pl.DataFrame,
genes_to_disease_map: pl.DataFrame,
gene_identifier_updater: GeneIdentifierUpdater,
) -> List[GenomicInterpretation]:
"""
Expand All @@ -909,25 +905,25 @@ def create_gene_genomic_interpretations(
containing known gene-to-phenotype relationships.
Args:
omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
Returns:
List[GenomicInterpretation]: A list of GenomicInterpretation objects representing the interpretations
of gene-to-phenotype relationships.
"""
genomic_interpretations = []
for phenotype_entry in omim_disease_phenotype_gene_map.rows(named=True):
for disease_entry in genes_to_disease_map.rows(named=True):
genomic_interpretation = self.create_gene_genomic_interpretation(
phenotype_entry, gene_identifier_updater
disease_entry, gene_identifier_updater
)
if genomic_interpretation is not None:
genomic_interpretations.append(genomic_interpretation)
return genomic_interpretations

def create_gene_diagnosis(
self,
omim_disease_phenotype_gene_map: pl.DataFrame,
genes_to_disease_map: pl.DataFrame,
gene_identifier_updater: GeneIdentifierUpdater,
disease: Disease,
) -> Diagnosis:
Expand All @@ -938,7 +934,7 @@ def create_gene_diagnosis(
provided in a DataFrame and a Disease object.
Args:
omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
disease (Disease): An instance of Disease representing the disease information.
Expand All @@ -947,7 +943,7 @@ def create_gene_diagnosis(
or None if no genomic interpretations were found.
"""
genomic_interpretations = self.create_gene_genomic_interpretations(
omim_disease_phenotype_gene_map, gene_identifier_updater
genes_to_disease_map, gene_identifier_updater
)
return (
Diagnosis(
Expand All @@ -963,7 +959,7 @@ def create_gene_diagnosis(

def create_gene_interpretation(
self,
omim_disease_phenotype_gene_map: pl.DataFrame,
genes_to_disease_map: pl.DataFrame,
gene_identifier_updater: GeneIdentifierUpdater,
) -> Interpretation:
"""
Expand All @@ -973,7 +969,7 @@ def create_gene_interpretation(
provided in a DataFrame and a GeneIdentifierUpdater instance.
Args:
omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
Returns:
Expand All @@ -983,7 +979,7 @@ def create_gene_interpretation(
phenopacket_util = PhenopacketUtil(self.phenopacket)
disease = phenopacket_util.return_phenopacket_disease()
diagnosis = self.create_gene_diagnosis(
omim_disease_phenotype_gene_map, gene_identifier_updater, disease
genes_to_disease_map, gene_identifier_updater, disease
)
return (
Interpretation(
Expand All @@ -997,7 +993,7 @@ def create_gene_interpretation(

def add_gene_interpretation_to_phenopacket(
self,
omim_disease_phenotype_gene_map: pl.DataFrame,
genes_to_disease_map: pl.DataFrame,
gene_identifier_updater: GeneIdentifierUpdater,
) -> Phenopacket:
"""
Expand All @@ -1006,7 +1002,7 @@ def add_gene_interpretation_to_phenopacket(
This method adds gene-based interpretations to a copy of the Phenopacket.
Args:
omim_disease_phenotype_gene_map (pl.DataFrame): DataFrame containing gene-to-phenotype mappings.
genes_to_disease_map (pl.DataFrame): DataFrame containing gene-to-disease mappings.
gene_identifier_updater (GeneIdentifierUpdater): An instance of GeneIdentifierUpdater.
Returns:
Expand All @@ -1015,9 +1011,7 @@ def add_gene_interpretation_to_phenopacket(
"""
phenopacket_copy = copy(self.phenopacket)
interpretations = [
self.create_gene_interpretation(
omim_disease_phenotype_gene_map, gene_identifier_updater
)
self.create_gene_interpretation(genes_to_disease_map, gene_identifier_updater)
]
if interpretations is not None:
phenopacket_copy.interpretations.extend(interpretations)
Expand Down
32 changes: 6 additions & 26 deletions src/phenotype2phenopacket/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,35 +28,15 @@ def is_float(element: any) -> bool:
return False


def read_disease_pg(disease_pg: Path) -> pl.DataFrame:
def read_genes_to_disease(genes_to_disease: Path) -> pl.DataFrame:
"""
Read a disease.pg file and return a filtered Polars DataFrame.
This function reads the contents of a 'disease.pg' file using Polars read_csv method
and constructs a DataFrame. It filters the DataFrame to include only rows where the 'database_id'
column starts with 'OMIM'.
Read the genes_to_disease.txt file and return a Polars DataFrame.
Args:
disease_pg (Path): The path to the 'disease.pg' file.
genes_to_disease (Path): Path to the genes_to_disease.txt file.
Returns:
pl.DataFrame: A filtered Polars DataFrame containing specific columns and rows
where 'database_id' starts with 'OMIM'.
"""
disease = pl.read_csv(
disease_pg,
separator="|",
new_columns=[
"database_id",
"gene_mim_number",
"disease_name",
"entrez_id",
"diagnosis_status",
"inheritance",
],
has_header=False,
)
return disease.filter(pl.col("database_id").str.starts_with("OMIM"))
pl.DataFrame: A Polars DataFrame containing the contents of the genes_to_disease.txt.
"""
return pl.read_csv(genes_to_disease, sep="\t")


def load_ontology(local_cached_ontology: Path = None):
Expand Down
Loading

0 comments on commit 79c718a

Please sign in to comment.