Skip to content

Commit

Permalink
Added sequence ontology terms as the 'type' for each variant node tha…
Browse files Browse the repository at this point in the history
…t we add into the ingest. Added a test for this as well
  • Loading branch information
AO33 committed Jul 2, 2024
1 parent c320955 commit 0237a59
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 3 deletions.
10 changes: 7 additions & 3 deletions src/clinvar_ingest/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ def map_mondo_to_hp(group_info, disease_ids):
"flagged_submission":0, # ??? Conflicting information is what this really means...
".":0} #Means that there is no data submitted for germline classification"

var2disease_star_min = 3 ### 3 is reviewed by expert panel and above (not sure about practice guidline yet...)
var2disease_star_min = 3 ### 3 is reviewed by expert panel and above

# Manually curated to help determine predicate
# For using submission_summary file (pulling terms from the clinical significance column)
Expand Down Expand Up @@ -471,7 +471,10 @@ def map_mondo_to_hp(group_info, disease_ids):
crev = row["CLNREVSTAT"]
ginfo = row["GENEINFO"]
raw_diss_info = row["CLNDISDB"]

so_info = [v.split("|")[0] for v in row["MC"].split(",") if "SO:" in v] # Pull out sequence ontology term(s) ### Example MC column SO:0001575|splice_donor_variant,SO:0001587|nonsense
### Note that the Sequence Ontology term could also be derived from the "CLNVCSO" VCF column; however, the terms listed in that column are much less specific and are not particularly useful (too broad).
### The terms listed within the MC column are far more specific to the effect(s) a variant will have on any given gene it overlaps, thus making it the preferred choice.

# No record info means we don't want to include
if varid not in var_records:
no_record += 1
Expand Down Expand Up @@ -511,7 +514,8 @@ def map_mondo_to_hp(group_info, disease_ids):
xref=["DBSNP:{}".format(row["RS"])],
has_gene=gene_ids,
in_taxon=["NCBITaxon:9606"],
in_taxon_label="Homo sapiens")
in_taxon_label="Homo sapiens",
type=so_info)

entities.append(seq_var)
vars_added += 1
Expand Down
1 change: 1 addition & 0 deletions src/clinvar_ingest/transform.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ node_properties:
- id
- name
- category
- type


# category
Expand Down
59 changes: 59 additions & 0 deletions tests/test_clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,55 @@ def test_case6_row():
return row
#{'MONDO:0019118': ['HP:0000556', 'HP:0007736', 'HP:0007910', 'HP:0007974', 'HP:0007982']}

# Multiple records, Single HPO, Single gene, Multiple MC info
@pytest.fixture
def test_case7_row():
    """Return a single ClinVar VCF record (ID 654211) as a column-name -> value dict.

    The CLNDISDB value holds two "|"-separated disease records, and the MC value
    holds two comma-separated sequence ontology terms (missense + intron).
    """
    return {
        # Core VCF columns
        "CHROM": "1",
        "POS": "173911974",
        "ID": "654211",
        "REF": "T",
        "ALT": "G",
        "QUAL": ".",
        "FILTER": ".",
        # INFO fields
        "AF_ESP": ".",
        "AF_EXAC": "1e-05",
        "AF_TGP": ".",
        "ALLELEID": "627126",
        "CLNDN": "not_provided|Hereditary_antithrombin_deficiency",
        "CLNDNINCL": ".",
        "CLNDISDB": "MedGen:C3661900|Human_Phenotype_Ontology:HP:0001976,MONDO:MONDO:0013144,MedGen:C0272375,OMIM:613118,Orphanet:82",
        "CLNDISDBINCL": ".",
        "CLNHGVS": "NC_000001.11:g.173911974T>G",
        "CLNREVSTAT": "reviewed_by_expert_panel",
        "CLNSIG": "Likely_pathogenic",
        "CLNSIGCONF": ".",
        "CLNSIGINCL": ".",
        "CLNVC": "single_nucleotide_variant",
        "CLNVCSO": "SO:0001483",
        "CLNVI": ".",
        "DBVARID": ".",
        "GENEINFO": "SERPINC1:462",
        "MC": "SO:0001583|missense_variant,SO:0001627|intron_variant",
        "ONCDN": ".",
        "ONCDNINCL": ".",
        "ONCDISDB": ".",
        "ONCDISDBINCL": ".",
        "ONC": ".",
        "ONCINCL": ".",
        "ONCREVSTAT": ".",
        "ONCCONF": ".",
        "ORIGIN": "1",
        "RS": "765445413",
        "SCIDN": ".",
        "SCIDNINCL": ".",
        "SCIDISDB": ".",
        "SCIDISDBINCL": ".",
        "SCIREVSTAT": ".",
        "SCI": ".",
        "SCIINCL": ".",
    }


####################################################################
Expand Down Expand Up @@ -367,6 +416,11 @@ def test_case6_entities(test_case6_row, mock_koza):
test_case6_row,
TRANSFORM_SCRIPT)

@pytest.fixture
def test_case7_entities(test_case7_row, mock_koza):
    """Feed the case-7 row to the transform script via ``mock_koza`` and return the result."""
    entities = mock_koza(INGEST_NAME, test_case7_row, TRANSFORM_SCRIPT)
    return entities


########################
Expand Down Expand Up @@ -398,6 +452,11 @@ def test_case6(test_case6_entities):
assert len([association for association in test_case6_entities if isinstance(association, VariantToGeneAssociation)]) == 2
assert len([association for association in test_case6_entities if isinstance(association, VariantToPhenotypicFeatureAssociation)]) == 5

def test_case7(test_case7_entities):
assert len(test_case7_entities) == 4 # SequenceVariant, VariantToGene, VariantToDisease, VariantToPhenotype
assert test_case7_entities[2].object == "MONDO:0013144" # Multiple records are available and we want to make sure we choose the proper on
assert test_case7_entities[0].type == ["SO:0001583", "SO:0001627"]


# TO DO: Tests for proper predicates? Test rows for variants that are not of review status 3 stars or above?
# (Tricky because parameters / decisions about what information actually gets pulled, and when, can alter the results obtained for variants with a review status of less than 3 stars.)

0 comments on commit 0237a59

Please sign in to comment.