From 79f682517048a2d477cb4e0c12686dbc1466a0ed Mon Sep 17 00:00:00 2001 From: Leonardo macOS Date: Tue, 26 Nov 2024 18:20:38 +0100 Subject: [PATCH 1/2] started setting it up, needs some ironing out but almost good to go --- src/malco/analysis/ita_grounding_analysis.py | 79 ++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 src/malco/analysis/ita_grounding_analysis.py diff --git a/src/malco/analysis/ita_grounding_analysis.py b/src/malco/analysis/ita_grounding_analysis.py new file mode 100644 index 000000000..5bb6d02e6 --- /dev/null +++ b/src/malco/analysis/ita_grounding_analysis.py @@ -0,0 +1,79 @@ +from malco.post_process.post_process_results_format import read_raw_result_yaml +from pathlib import Path +import pandas as pd +import os +# Each row has +# c1 * c2 * c3 * c4 * c5 * c6 * c7 * c8 +# PMID (str) * label/term (str) * rank * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label * MONDO ID (if applicable) * correct? 0/1 (in excel) + +# Correct results +file = "/Users/leonardo/git/malco/in_ita_reply/correct_results.tsv" +answers = pd.read_csv( + file, sep="\t", header=None, names=["description", "term", "label"] + ) + +# Mapping each label to its correct term +cres = answers.set_index("label").to_dict() # Cleanup this fella TODO + +# Just populate df with two for loops, then sort alfabetically +data = [] + +# load ita replies +ita_file = Path("/Users/leonardo/git/malco/out_itanoeng/raw_results/multilingual/it/results.yaml") +ita_result = read_raw_result_yaml(ita_file) + +# extract input_text from yaml for ita, or extracted_object, terms +for ppkt_out in ita_result: + extracted_object = ppkt_out.get("extracted_object") + if extracted_object: + label = extracted_object.get("label").replace('_it-prompt', '_en-prompt') + terms = extracted_object.get("terms") + if terms: + num_terms = len(terms) + rank_list = [i + 1 for i in range(num_terms)] + for term, rank in zip(terms, rank_list): + data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": True, "correct_omim_id": cres[label][0], + "correct_omim_id": cres[label][1], "mondo_id": float('Nan')}) + + +# load eng replies +eng_file = Path("/Users/leonardo/git/malco/out_itanoeng/raw_results/multilingual/it_w_en/results.yaml") +eng_result = read_raw_result_yaml(eng_file) + +# extract named_entities, id and label from yaml for eng +# extract input_text from yaml for ita, or extracted_object, terms +for ppkt_out in eng_result: + extracted_object = ppkt_out.get("extracted_object") + if extracted_object: + label = extracted_object.get("label")#.str.replace('_[a-z][a-z]-prompt', '', regex=True) + terms = extracted_object.get("terms") + if terms: + num_terms = len(terms) + rank_list = [i + 1 for i in range(num_terms)] + for term, rank in zip(terms, rank_list): + if term.str.startswith("MONDO"): + breakpoint() + ne = ppkt_out.get("named_entities") + mid = ne.get("id") + mlab = ne.get("label") # TODO finish + else: + mlab = float('Nan') + + data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres[label][0], + "correct_omim_id": cres[label][1], "mondo_id": mlab}) + +# Create DataFrame +column_names = [ + "PMID", + "diagnosis", + "rank", + "ita_reply", + "correct_OMIMid", + "correct_OMIMlabel", + "MONDOid", +] + +df = pd.DataFrame(data, columns=column_names) +df = df.sort_values(by = 'Name') + +#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx") From 51d76d552a959f275c26fe61a1d3a7b5e07f9aab Mon Sep 17 00:00:00 2001 From: Leonardo macOS Date: Fri, 29 Nov 2024 18:31:34 +0100 Subject: [PATCH 2/2] finished the creation of the table, forgot I was in this branch and also fixed #63 here --- ...y => count_translated_prompts_and_copy.py} | 0 src/malco/analysis/ita_grounding_analysis.py | 36 +++++++++---------- .../post_process_results_format.py | 9 ++++- src/malco/post_process/ranking_utils.py | 23 +++++++----- 4 files changed, 41 insertions(+), 27 deletions(-) rename src/malco/analysis/{count_translated_prompts.py => count_translated_prompts_and_copy.py} (100%) diff --git a/src/malco/analysis/count_translated_prompts.py b/src/malco/analysis/count_translated_prompts_and_copy.py similarity index 100% rename from src/malco/analysis/count_translated_prompts.py rename to src/malco/analysis/count_translated_prompts_and_copy.py diff --git a/src/malco/analysis/ita_grounding_analysis.py b/src/malco/analysis/ita_grounding_analysis.py index 5bb6d02e6..c4156560c 100644 --- a/src/malco/analysis/ita_grounding_analysis.py +++ b/src/malco/analysis/ita_grounding_analysis.py @@ -3,8 +3,8 @@ import pandas as pd import os # Each row has -# c1 * c2 * c3 * c4 * c5 * c6 * c7 * c8 -# PMID (str) * label/term (str) * rank * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label * MONDO ID (if applicable) * correct? 0/1 (in excel) +# c1 * c2 * c3 * c4 * c5 * c6 * c7 * c8 +# PMID (str) * label/term (str) * * rank * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label * MONDO label (if applicable) * correct? 0/1 (in excel) # Correct results file = "/Users/leonardo/git/malco/in_ita_reply/correct_results.tsv" @@ -13,7 +13,7 @@ ) # Mapping each label to its correct term -cres = answers.set_index("label").to_dict() # Cleanup this fella TODO +cres = answers.set_index("label").to_dict() # Just populate df with two for loops, then sort alfabetically data = [] @@ -32,8 +32,8 @@ num_terms = len(terms) rank_list = [i + 1 for i in range(num_terms)] for term, rank in zip(terms, rank_list): - data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": True, "correct_omim_id": cres[label][0], - "correct_omim_id": cres[label][1], "mondo_id": float('Nan')}) + data.append({"pubmedid": label, "term": term, "mondo_label": float('Nan'), "rank": rank, "ita_reply": True, "correct_omim_id": cres['term'][label], + "correct_omim_description": cres['description'][label]}) # load eng replies @@ -45,35 +45,35 @@ for ppkt_out in eng_result: extracted_object = ppkt_out.get("extracted_object") if extracted_object: - label = extracted_object.get("label")#.str.replace('_[a-z][a-z]-prompt', '', regex=True) + label = extracted_object.get("label").replace('_it-prompt', '_en-prompt') terms = extracted_object.get("terms") if terms: num_terms = len(terms) rank_list = [i + 1 for i in range(num_terms)] for term, rank in zip(terms, rank_list): - if term.str.startswith("MONDO"): - breakpoint() + if term.startswith("MONDO"): ne = ppkt_out.get("named_entities") - mid = ne.get("id") - mlab = ne.get("label") # TODO finish + for entity in ne: + if entity.get('id')==term: + mlab = entity.get('label') else: mlab = float('Nan') - data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres[label][0], - "correct_omim_id": cres[label][1], "mondo_id": mlab}) + data.append({"pubmedid": label, "term": mlab, "mondo_label": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres["term"][label], + "correct_omim_description": cres['description'][label]}) # Create DataFrame column_names = [ "PMID", - "diagnosis", + "GPT Diagnosis", + "MONDO ID", "rank", "ita_reply", "correct_OMIMid", "correct_OMIMlabel", - "MONDOid", ] -df = pd.DataFrame(data, columns=column_names) -df = df.sort_values(by = 'Name') - -#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx") +df = pd.DataFrame(data) +df.columns = column_names +df.sort_values(by = ['PMID', 'ita_reply', 'rank'], inplace=True) +#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx") # does not work, wrong path, not important diff --git a/src/malco/post_process/post_process_results_format.py b/src/malco/post_process/post_process_results_format.py index 3ad542715..28e3d3dc8 100644 --- a/src/malco/post_process/post_process_results_format.py +++ b/src/malco/post_process/post_process_results_format.py @@ -56,7 +56,14 @@ def create_standardised_results( ) # terms will now ONLY contain MONDO IDs OR 'N/A'. # The latter should be dealt with downstream - terms = [i[1][0][0] for i in result] # MONDO_ID + new_terms = [] + for i in result: + if i[1] == [("N/A", "No grounding found")]: + new_terms.append(i[0]) + else: + new_terms.append(i[1][0][0]) + terms = new_terms + #terms = [i[1][0][0] for i in result] # MONDO_ID if terms: # Note, the if allows for rerunning ppkts that failed due to connection issues # We can have multiple identical ppkts/prompts in results.yaml diff --git a/src/malco/post_process/ranking_utils.py b/src/malco/post_process/ranking_utils.py index 24f58abce..cd548cceb 100644 --- a/src/malco/post_process/ranking_utils.py +++ b/src/malco/post_process/ranking_utils.py @@ -98,6 +98,7 @@ def compute_mrr_and_ranks( "n10p", "nf", "num_cases", + "grounding_failed", # and no correct reply elsewhere in the differential ] rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header) @@ -143,6 +144,7 @@ def compute_mrr_and_ranks( ) df.dropna(subset=["correct_term"]) + # Save full data frame full_df_path = output_dir / results_files[i].split("/")[0] full_df_filename = "full_df_results.tsv" @@ -155,14 +157,17 @@ def compute_mrr_and_ranks( # Calculate top of each rank rank_df.loc[i, comparing] = results_files[i].split("/")[0] - ppkts = df.groupby("label")[["rank", "is_correct"]] + ppkts = df.groupby("label")[["term", "rank", "is_correct"]] # for each group for ppkt in ppkts: # is there a true? ppkt is tuple ("filename", dataframe) --> ppkt[1] is a dataframe if not any(ppkt[1]["is_correct"]): - # no --> increase nf = "not found" - rank_df.loc[i, "nf"] += 1 + if all(ppkt[1]["term"].str.startswith("MONDO")): + # no --> increase nf = "not found" + rank_df.loc[i, "nf"] += 1 + else: + rank_df.loc[i, "grounding_failed"] += 1 else: # yes --> what's it rank? It's jind = ppkt[1].index[ppkt[1]["is_correct"]] @@ -204,10 +209,12 @@ def compute_mrr_and_ranks( writer.writerow(results_files) writer.writerow(mrr_scores) + # TODO this could be moved in an anaysis script with the plotting... df = pd.read_csv(topn_file, delimiter="\t") - df["top1"] = (df["n1"]) / df["num_cases"] - df["top3"] = (df["n1"] + df["n2"] + df["n3"]) / df["num_cases"] - df["top5"] = (df["n1"] + df["n2"] + df["n3"] + df["n4"] + df["n5"]) / df["num_cases"] + valid_cases = df["num_cases"] - df["grounding_failed"] + df["top1"] = (df["n1"]) / valid_cases + df["top3"] = (df["n1"] + df["n2"] + df["n3"]) / valid_cases + df["top5"] = (df["n1"] + df["n2"] + df["n3"] + df["n4"] + df["n5"]) / valid_cases df["top10"] = ( df["n1"] + df["n2"] @@ -219,8 +226,8 @@ def compute_mrr_and_ranks( + df["n8"] + df["n9"] + df["n10"] - ) / df["num_cases"] - df["not_found"] = (df["nf"]) / df["num_cases"] + ) / valid_cases + df["not_found"] = (df["nf"]) / valid_cases df_aggr = pd.DataFrame() df_aggr = pd.melt(