Differential category (#43)

* Added support for multiple models, along the lines of multiple languages. Several functions are now called twice, and their output goes into the multilingual and multimodel subdirectories of the output and raw_results dirs. Two hardcoded model names in ontogpt have to be edited in order for this to work. * Deleted comments. * Polished and finished the multimodel plots etc. * Started working on the evaluation of results and on modularization. * Finished the contingency table.
monarch-initiative · Aug 5, 2024 · 7c03bd2 · 7c03bd2
1 parent d90eec6
commit 7c03bd2
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 6 deletions.
diff --git a/src/malco/analysis/eval_diagnose_category.py b/src/malco/analysis/eval_diagnose_category.py
@@ -0,0 +1,81 @@
+import pandas as pd
+import numpy as np
+
+from oaklib.datamodels.vocabulary import IS_A, PART_OF
+from oaklib.interfaces import MappingProviderInterface
+from oaklib.interfaces import OboGraphInterface
+from oaklib.interfaces.obograph_interface import GraphTraversalMethod
+
+from oaklib import get_adapter
+
+
+def mondo_adapter() -> OboGraphInterface:
+    """
+    Build the OAK adapter used to query the MONDO ontology.
+
+    Returns:
+        OboGraphInterface: Whatever ``get_adapter`` returns for the
+        ``sqlite:obo:mondo`` selector.
+    """
+    mondo_selector = "sqlite:obo:mondo"
+    return get_adapter(mondo_selector)
+
+def mondo_mapping(term, adapter):
+    """
+    Collect the exact-match subjects that the adapter maps to a term.
+
+    Args:
+        term: Identifier to look up. It is printed, then passed to
+            ``adapter.sssom_mappings`` with ``source="OMIM"``.
+        adapter: Object exposing ``sssom_mappings``.
+
+    Returns:
+        list: The ``subject_id`` of every mapping whose ``predicate_id`` is
+        ``skos:exactMatch`` (expected to be MONDO identifiers).
+    """
+    print(term)
+    omim_mappings = adapter.sssom_mappings([term], source="OMIM")
+    return [
+        mapping.subject_id
+        for mapping in omim_mappings
+        if mapping.predicate_id == "skos:exactMatch"
+    ]
+
+def find_category(omim_term, disease_categories, mondo):
+    """
+    Find the disease category that an OMIM term falls under.
+
+    The term is first mapped to its exact-match terms with ``mondo_mapping``;
+    the IS_A / PART_OF ancestors of those terms are then scanned and the first
+    one contained in ``disease_categories`` is returned.
+
+    Args:
+        omim_term: Identifier of the disease to categorise, e.g. ``OMIM:620545``.
+        disease_categories: Collection of category identifiers to match against.
+        mondo: Adapter used for both the mapping lookup and the ancestor traversal.
+
+    Returns:
+        The first ancestor found in ``disease_categories`` (something like
+        ``MONDO:0045024``, cancer or benign tumor), or ``None`` when the term
+        has no exact-match mapping or none of its ancestors is a category.
+
+    Raises:
+        ValueError: If ``mondo`` is not a ``MappingProviderInterface``.
+    """
+    if not isinstance(mondo, MappingProviderInterface):
+        raise ValueError("Adapter is not an MappingProviderInterface")
+
+    # NOTE: every call traverses the graph again; open question from the author
+    # is how to avoid traversing the mondo graph repeatedly.
+    mondo_terms = mondo_mapping(omim_term, mondo)
+    if not mondo_terms:
+        # Nothing to start the traversal from, so no category can be found.
+        return None
+
+    for mondo_ancestor in mondo.ancestors(mondo_terms, predicates=[IS_A, PART_OF]):
+        if mondo_ancestor in disease_categories:
+            return mondo_ancestor
+    return None
+
+
+# Find the disease categories (42 at the time of writing): the IS_A children
+# of MONDO:0700096.
+mondo = mondo_adapter()
+disease_categories = mondo.relationships(objects=["MONDO:0700096"], predicates=[IS_A])
+# Each relationship is indexed as a tuple; element 0 is taken as the category id.
+dc_list = [relationship[0] for relationship in disease_categories]
+
+# Contingency table indexed by category id. The counters start at 0 and the
+# label column is filled with strings from the start, so no string is ever
+# written into an integer column.
+header = ["label", "correct", "incorrect"]
+contingency_table = pd.DataFrame(0, index=dc_list, columns=header[1:])
+contingency_table.insert(0, "label", [mondo.label(category) for category in dc_list])
+
+
+# example path of full results
+filename = "testout_multmodel_b4run/raw_results/multimodel/gpt-4/full_df_results.tsv"
+
+# Columns of the results file, followed by an example row:
+# label   term    score   rank    correct_term    is_correct      reciprocal_rank
+# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt        MONDO:0008675   1.0     1.0     OMIM:620545     False        0.0
+
+df = pd.read_csv(filename, sep="\t")
+
+ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
+
+# Iterating a groupby yields (label, dataframe) pairs, one per phenopacket.
+for ppkt_label, ppkt_df in ppkts:
+    # Find this phenopacket's category from the OMIM id of its correct diagnosis.
+    correct_term = ppkt_df.iloc[0]["correct_term"]
+    category_index = find_category(correct_term, dc_list, mondo)
+    if category_index is None:
+        # Without a category there is no row to update; indexing the table
+        # with None would abort the whole run.
+        print(f"No disease category found for {correct_term} ({ppkt_label}), skipping")
+        continue
+    # A phenopacket counts as correct when any of its candidate terms is correct.
+    outcome = "correct" if any(ppkt_df["is_correct"]) else "incorrect"
+    contingency_table.loc[category_index, outcome] += 1
+
+print(contingency_table)
+
diff --git a/src/malco/post_process/post_process.py b/src/malco/post_process/post_process.py
@@ -12,7 +12,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
         raw_results_dir (Path): Path to the raw results directory.
         output_dir (Path): Path to the output directory.
     """
-
+    
     for lang in langs:
         raw_results_lang = raw_results_dir / "multilingual" / lang
         output_lang = output_dir / "multilingual" / lang
@@ -21,7 +21,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
 
         create_standardised_results(raw_results_dir=raw_results_lang,
                                     output_dir=output_lang, output_file_name="results.tsv")
-        
+
     for model in models:
         raw_results_model = raw_results_dir / "multimodel" / model
         output_model = output_dir / "multimodel" / model

diff --git a/src/malco/runner.py b/src/malco/runner.py
@@ -18,9 +18,14 @@ class MalcoRunner(PhEvalRunner):
     output_dir: Path
     config_file: Path
     version: str
-    # Declare a tuple (immutable!) of languages
+    # Declare a tuple of languages and models
+    #TODO move next 4 lines to input file
     languages = ("en", "es", "nl", "it", "de")
-    models = ('gpt-4o', 'gpt-4') # Decide on list of models: Claude-Sonnet (Anthropic key), 
+    models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key), 
+    just_run = 1          # only run the run part of the code
+    just_postprocess = 0  # only run the postprocess part of the code
+
+
 
     def prepare(self):
         """
@@ -55,7 +60,8 @@ def post_process(self,
                      output_dir=self.output_dir,
                      langs=self.languages,
                      models=self.models)
-
+
+
         comparing = "language"
         mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(comparing,
             output_dir=self.output_dir / "multilingual" ,
@@ -65,7 +71,8 @@ def post_process(self,
 
         if print_plot:
             make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)
-
+
+
         comparing = "model"
         mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr( comparing,
             output_dir=self.output_dir / "multimodel" ,