Added sequence ontology terms as the 'type' for each variant node that we add into the ingest. Added a test for this as well
monarch-initiative · Jul 2, 2024 · 0237a59 · 0237a59
1 parent c320955
commit 0237a59
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 3 deletions.
diff --git a/src/clinvar_ingest/transform.py b/src/clinvar_ingest/transform.py
@@ -386,7 +386,7 @@ def map_mondo_to_hp(group_info, disease_ids):
                    "flagged_submission":0, # ??? Conflicting information is what the really means...
                    ".":0} #Means that there is no data submitted for germline classification"
 
-var2disease_star_min = 3 ### 3 is reviewed by expert panel and above (not sure about practice guidline yet...)
+var2disease_star_min = 3 ### 3 is reviewed by expert panel and above
 
 # Manually curated to help determine predicate
 # For using submission_summary file (pulling terms from the clinical significance column)
@@ -471,7 +471,10 @@ def map_mondo_to_hp(group_info, disease_ids):
     crev = row["CLNREVSTAT"]
     ginfo = row["GENEINFO"]
     raw_diss_info = row["CLNDISDB"]
-
+    so_info = [v.split("|")[0] for v in row["MC"].split(",") if "SO:" in v] # Pull out sequence ontology term(s) ### Example MC column SO:0001575|splice_donor_variant,SO:0001587|nonsense
+    ### Note that the Sequence Ontology term could also be derived from the "CLNVCSO" VCF column; however, the terms listed in that column are much less specific and are not particularly useful (too broad).
+    ### The terms listed within the MC column are far more specific to the effect(s) a variant will have on any given gene it overlaps, making MC the preferred choice.
+
     # No record info means we don't want to include
     if varid not in var_records:
         no_record += 1
@@ -511,7 +514,8 @@ def map_mondo_to_hp(group_info, disease_ids):
                     xref=["DBSNP:{}".format(row["RS"])],
                     has_gene=gene_ids,
                     in_taxon=["NCBITaxon:9606"],
-                    in_taxon_label="Homo sapiens")
+                    in_taxon_label="Homo sapiens",
+                    type=so_info)
 
     entities.append(seq_var)
     vars_added += 1

diff --git a/src/clinvar_ingest/transform.yaml b/src/clinvar_ingest/transform.yaml
@@ -76,6 +76,7 @@ node_properties:
   - id
   - name
   - category
+  - type
 
 
   # category

diff --git a/tests/test_clinvar.py b/tests/test_clinvar.py
@@ -326,6 +326,55 @@ def test_case6_row():
     return row
     #{'MONDO:0019118': ['HP:0000556', 'HP:0007736', 'HP:0007910', 'HP:0007974', 'HP:0007982']}
 
+# Multiple records, Single HPO, Single gene, Multiple MC info
+@pytest.fixture
+def test_case7_row():
+    row = {
+    'CHROM':'1',
+    'POS':'173911974',
+    'ID':'654211',
+    'REF':'T',
+    'ALT':'G',
+    'QUAL':'.',
+    'FILTER':'.',
+    'AF_ESP':'.',
+    'AF_EXAC':'1e-05',
+    'AF_TGP':'.',
+    'ALLELEID':'627126',
+    'CLNDN':'not_provided|Hereditary_antithrombin_deficiency',
+    'CLNDNINCL':'.',
+    'CLNDISDB':'MedGen:C3661900|Human_Phenotype_Ontology:HP:0001976,MONDO:MONDO:0013144,MedGen:C0272375,OMIM:613118,Orphanet:82',
+    'CLNDISDBINCL':'.',
+    'CLNHGVS':'NC_000001.11:g.173911974T>G',
+    'CLNREVSTAT':'reviewed_by_expert_panel',
+    'CLNSIG':'Likely_pathogenic',
+    'CLNSIGCONF':'.',
+    'CLNSIGINCL':'.',
+    'CLNVC':'single_nucleotide_variant',
+    'CLNVCSO':'SO:0001483',
+    'CLNVI':'.',
+    'DBVARID':'.',
+    'GENEINFO':'SERPINC1:462',
+    'MC':'SO:0001583|missense_variant,SO:0001627|intron_variant',
+    'ONCDN':'.',
+    'ONCDNINCL':'.',
+    'ONCDISDB':'.',
+    'ONCDISDBINCL':'.',
+    'ONC':'.',
+    'ONCINCL':'.',
+    'ONCREVSTAT':'.',
+    'ONCCONF':'.',
+    'ORIGIN':'1',
+    'RS':'765445413',
+    'SCIDN':'.',
+    'SCIDNINCL':'.',
+    'SCIDISDB':'.',
+    'SCIDISDBINCL':'.',
+    'SCIREVSTAT':'.',
+    'SCI':'.',
+    'SCIINCL':'.',
+    }
+    return row
 
 
 ####################################################################
@@ -367,6 +416,11 @@ def test_case6_entities(test_case6_row, mock_koza):
                      test_case6_row,
                      TRANSFORM_SCRIPT)
 
+@pytest.fixture
+def test_case7_entities(test_case7_row, mock_koza):
+    return mock_koza(INGEST_NAME,
+                     test_case7_row,
+                     TRANSFORM_SCRIPT)
 
 
 ########################
@@ -398,6 +452,11 @@ def test_case6(test_case6_entities):
     assert len([association for association in test_case6_entities if isinstance(association, VariantToGeneAssociation)]) == 2
     assert len([association for association in test_case6_entities if isinstance(association, VariantToPhenotypicFeatureAssociation)]) == 5
 
+def test_case7(test_case7_entities):
+    assert len(test_case7_entities) == 4 # SequenceVariant, VariantToGene, VariantToDisease, VariantToPhenotype 
+    assert test_case7_entities[2].object == "MONDO:0013144" # Multiple records are available and we want to make sure we choose the proper one
+    assert test_case7_entities[0].type == ["SO:0001583", "SO:0001627"]
+
 
 # TODO: Tests for proper predicates? Test rows for variants that are not of review status 3 stars or above?
 # (Tricky because parameters / decisions about what information actually gets pulled, and when, can alter the results obtained for variants with a review status of less than 3 stars.)
-Original file line number
+Diff line change
@@ Expand Up / @@ -76,6 +76,7 @@ node_properties: @@
       - id
       - name
       - category
+      - type
       # category
@@ Expand Down @@