Skip to content

Commit

Permalink
Differential category (#43)
Browse files Browse the repository at this point in the history
* Added support for multiple models, along the lines of the existing support for multiple languages. Several functions are now called twice, and their output goes into the multilingual and multimodel subdirectories within the output and raw_results dirs. Two hardcoded model names in ontogpt have to be edited in order for this to work

* del comments

* multimodel plots etc polished and finished

* started working on evaluation of results and modularization, too

* finished contingency table
  • Loading branch information
leokim-l authored Aug 5, 2024
1 parent d90eec6 commit 7c03bd2
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 6 deletions.
81 changes: 81 additions & 0 deletions src/malco/analysis/eval_diagnose_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import pandas as pd
import numpy as np

from oaklib.datamodels.vocabulary import IS_A, PART_OF
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces import OboGraphInterface
from oaklib.interfaces.obograph_interface import GraphTraversalMethod

from oaklib import get_adapter


def mondo_adapter() -> OboGraphInterface:
    """
    Build the oaklib adapter used to query the MONDO ontology.

    Returns:
        OboGraphInterface: The adapter returned by ``get_adapter`` for the
        ``sqlite:obo:mondo`` selector.
    """
    selector = "sqlite:obo:mondo"
    return get_adapter(selector)

def mondo_mapping(term: str, adapter) -> list:
    """
    Map a term to its exactly-matching MONDO identifiers.

    Args:
        term: CURIE of the term to map (the mappings are queried with
            ``source="OMIM"``).
        adapter: Object exposing ``sssom_mappings(curies, source=...)``, whose
            results carry ``predicate_id`` and ``subject_id`` attributes.

    Returns:
        list: The ``subject_id`` of every mapping whose predicate is
        ``skos:exactMatch``, in the order the adapter yields them. Empty when
        there is no exact match.
    """
    # Only exact matches are kept; broader/narrower/close matches are ignored.
    return [
        m.subject_id
        for m in adapter.sssom_mappings([term], source="OMIM")
        if m.predicate_id == "skos:exactMatch"
    ]

def find_category(omim_term, disease_categories, mondo):
    """
    Find the disease category an OMIM term belongs to.

    The OMIM term is mapped to MONDO with ``mondo_mapping``; the IS_A / PART_OF
    ancestors of the mapped term(s) are then scanned, and the first ancestor
    that is one of ``disease_categories`` is returned.

    Args:
        omim_term: CURIE of the OMIM term, e.g. "OMIM:620545".
        disease_categories: Collection of MONDO CURIEs that count as categories.
        mondo: MONDO adapter; must be a ``MappingProviderInterface`` and is also
            used for the ancestor lookup.

    Returns:
        The first matching category CURIE, e.g. something like MONDO:0045024
        (cancer or benign tumor), or ``None`` when the OMIM term has no exact
        MONDO mapping or none of its ancestors is a category.

    Raises:
        ValueError: If ``mondo`` is not a ``MappingProviderInterface``.
    """
    if not isinstance(mondo, MappingProviderInterface):
        raise ValueError("Adapter is not an MappingProviderInterface")

    mondo_terms = mondo_mapping(omim_term, mondo)
    if not mondo_terms:
        # No exact MONDO mapping: there is nothing to look up ancestors for.
        return None

    # Set membership keeps each ancestor test O(1).
    # NOTE: the graph is still traversed once per call; caching results per
    # OMIM term would avoid repeated traversals for recurring diseases.
    categories = set(disease_categories)
    for mondo_ancestor in mondo.ancestors(mondo_terms, predicates=[IS_A, PART_OF]):
        if mondo_ancestor in categories:
            return mondo_ancestor
    return None


# Find the disease categories: the terms with an IS_A relationship to
# MONDO:0700096 (expected to be 42 of them when this was written).
mondo = mondo_adapter()
disease_categories = mondo.relationships(objects=["MONDO:0700096"], predicates=[IS_A])

# Contingency table indexed by disease category, with a human-readable label
# and the correct / incorrect counters initialised to 0.
header = ["label", "correct", "incorrect"]
# The first element of each returned relationship is taken as the category CURIE.
dc_list = [i[0] for i in list(disease_categories)]
contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
# Replace the whole column at once so it holds strings, rather than writing
# strings cell by cell into an integer column.
contingency_table["label"] = [mondo.label(j) for j in dc_list]


# Example path of full results
filename = "testout_multmodel_b4run/raw_results/multimodel/gpt-4/full_df_results.tsv"

# Expected tab-separated columns, followed by an example row:
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(filename, sep="\t")

# One group per phenopacket (rows sharing the same "label").
ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]

for ppkt_name, ppkt_df in ppkts:
    # Find this phenopacket's category from the OMIM term of its first row.
    category_index = find_category(ppkt_df.iloc[0]["correct_term"], dc_list, mondo)
    if category_index is None:
        # Without a category there is no row to update; indexing the table
        # with None would raise a KeyError.
        print(f"No disease category found for {ppkt_name}, skipping")
        continue

    # The phenopacket counts as correct if any of its rows is marked correct.
    if ppkt_df["is_correct"].any():
        contingency_table.loc[category_index, "correct"] += 1
    else:
        contingency_table.loc[category_index, "incorrect"] += 1

print(contingency_table)

4 changes: 2 additions & 2 deletions src/malco/post_process/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
raw_results_dir (Path): Path to the raw results directory.
output_dir (Path): Path to the output directory.
"""

for lang in langs:
raw_results_lang = raw_results_dir / "multilingual" / lang
output_lang = output_dir / "multilingual" / lang
Expand All @@ -21,7 +21,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:

create_standardised_results(raw_results_dir=raw_results_lang,
output_dir=output_lang, output_file_name="results.tsv")

for model in models:
raw_results_model = raw_results_dir / "multimodel" / model
output_model = output_dir / "multimodel" / model
Expand Down
15 changes: 11 additions & 4 deletions src/malco/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,14 @@ class MalcoRunner(PhEvalRunner):
output_dir: Path
config_file: Path
version: str
# Declare a tuple (immutable!) of languages
# Declare a tuple of languages and models
#TODO move next 4 lines to input file
languages = ("en", "es", "nl", "it", "de")
models = ('gpt-4o', 'gpt-4') # Decide on list of models: Claude-Sonnet (Anthropic key),
models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key),
just_run = 1 # only run the run part of the code
just_postprocess = 0 # only run the postprocess part of the code



def prepare(self):
"""
Expand Down Expand Up @@ -55,7 +60,8 @@ def post_process(self,
output_dir=self.output_dir,
langs=self.languages,
models=self.models)



comparing = "language"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(comparing,
output_dir=self.output_dir / "multilingual" ,
Expand All @@ -65,7 +71,8 @@ def post_process(self,

if print_plot:
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)



comparing = "model"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr( comparing,
output_dir=self.output_dir / "multimodel" ,
Expand Down

0 comments on commit 7c03bd2

Please sign in to comment.