From 79f682517048a2d477cb4e0c12686dbc1466a0ed Mon Sep 17 00:00:00 2001
From: Leonardo macOS <leonardochimirri94@gmail.com>
Date: Tue, 26 Nov 2024 18:20:38 +0100
Subject: [PATCH 1/2] started setting it up, needs some ironing out but almost
 good to go

---
 src/malco/analysis/ita_grounding_analysis.py | 79 ++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 src/malco/analysis/ita_grounding_analysis.py

diff --git a/src/malco/analysis/ita_grounding_analysis.py b/src/malco/analysis/ita_grounding_analysis.py
new file mode 100644
index 000000000..5bb6d02e6
--- /dev/null
+++ b/src/malco/analysis/ita_grounding_analysis.py
@@ -0,0 +1,79 @@
+from malco.post_process.post_process_results_format import read_raw_result_yaml
+from pathlib import Path
+import pandas as pd
+import os
+# Each row has
+#    c1      *       c2         *  c3  *       c4         *        c5              *            c6              *  c7                       * c8
+# PMID (str) * label/term (str) * rank * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label  *  MONDO ID (if applicable) * correct? 0/1 (in excel)
+
+# Correct results
+file = "/Users/leonardo/git/malco/in_ita_reply/correct_results.tsv"
+answers = pd.read_csv(
+        file, sep="\t", header=None, names=["description", "term", "label"]
+    )
+
+# Mapping each label to its correct term
+cres = answers.set_index("label").to_dict() # Cleanup this fella TODO
+
+# Just populate df with two for loops, then sort alfabetically
+data = []
+
+# load ita replies
+ita_file = Path("/Users/leonardo/git/malco/out_itanoeng/raw_results/multilingual/it/results.yaml")
+ita_result = read_raw_result_yaml(ita_file)
+
+# extract input_text from yaml for ita, or extracted_object, terms
+for ppkt_out in ita_result:
+    extracted_object = ppkt_out.get("extracted_object")
+    if extracted_object:
+        label = extracted_object.get("label").replace('_it-prompt', '_en-prompt')
+        terms = extracted_object.get("terms")
+        if terms:
+            num_terms = len(terms)
+            rank_list = [i + 1 for i in range(num_terms)]
+            for term, rank in zip(terms, rank_list):
+                data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": True, "correct_omim_id": cres[label][0], 
+                             "correct_omim_id": cres[label][1], "mondo_id": float('Nan')})
+
+
+# load eng replies
+eng_file = Path("/Users/leonardo/git/malco/out_itanoeng/raw_results/multilingual/it_w_en/results.yaml")
+eng_result = read_raw_result_yaml(eng_file)
+
+# extract named_entities, id and label from yaml for eng
+# extract input_text from yaml for ita, or extracted_object, terms
+for ppkt_out in eng_result:
+    extracted_object = ppkt_out.get("extracted_object")
+    if extracted_object:
+        label = extracted_object.get("label")#.str.replace('_[a-z][a-z]-prompt', '', regex=True)
+        terms = extracted_object.get("terms")
+        if terms:
+            num_terms = len(terms)
+            rank_list = [i + 1 for i in range(num_terms)]
+            for term, rank in zip(terms, rank_list):
+                if term.str.startswith("MONDO"):
+                    breakpoint()
+                    ne = ppkt_out.get("named_entities")
+                    mid = ne.get("id")
+                    mlab = ne.get("label") # TODO finish
+                else:
+                    mlab = float('Nan')
+
+                data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres[label][0], 
+                             "correct_omim_id": cres[label][1], "mondo_id": mlab})
+
+# Create DataFrame
+column_names = [
+    "PMID",
+    "diagnosis",
+    "rank",
+    "ita_reply",
+    "correct_OMIMid",
+    "correct_OMIMlabel",
+    "MONDOid",
+]
+
+df = pd.DataFrame(data, columns=column_names)
+df = df.sort_values(by = 'Name') 
+
+#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx")

From 51d76d552a959f275c26fe61a1d3a7b5e07f9aab Mon Sep 17 00:00:00 2001
From: Leonardo macOS <leonardochimirri94@gmail.com>
Date: Fri, 29 Nov 2024 18:31:34 +0100
Subject: [PATCH 2/2] finished the creation of the table, forgot I was in this
 branch and also fixed #63 here

---
 ...y => count_translated_prompts_and_copy.py} |  0
 src/malco/analysis/ita_grounding_analysis.py  | 36 +++++++++----------
 .../post_process_results_format.py            |  9 ++++-
 src/malco/post_process/ranking_utils.py       | 23 +++++++-----
 4 files changed, 41 insertions(+), 27 deletions(-)
 rename src/malco/analysis/{count_translated_prompts.py => count_translated_prompts_and_copy.py} (100%)

diff --git a/src/malco/analysis/count_translated_prompts.py b/src/malco/analysis/count_translated_prompts_and_copy.py
similarity index 100%
rename from src/malco/analysis/count_translated_prompts.py
rename to src/malco/analysis/count_translated_prompts_and_copy.py
diff --git a/src/malco/analysis/ita_grounding_analysis.py b/src/malco/analysis/ita_grounding_analysis.py
index 5bb6d02e6..c4156560c 100644
--- a/src/malco/analysis/ita_grounding_analysis.py
+++ b/src/malco/analysis/ita_grounding_analysis.py
@@ -3,8 +3,8 @@
 import pandas as pd
 import os
 # Each row has
-#    c1      *       c2         *  c3  *       c4         *        c5              *            c6              *  c7                       * c8
-# PMID (str) * label/term (str) * rank * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label  *  MONDO ID (if applicable) * correct? 0/1 (in excel)
+#    c1      *       c2         *           c3             *  c4  *       c5         *        c6              *            c7             * c8
+# PMID (str) * label/term (str) * MONDO ID (if applicable) * rank * ita_reply (bool) * correct_result OMIM ID * correct_result OMIM label * correct? 0/1 (in excel)
 
 # Correct results
 file = "/Users/leonardo/git/malco/in_ita_reply/correct_results.tsv"
@@ -13,7 +13,7 @@
     )
 
 # Mapping each label to its correct term
-cres = answers.set_index("label").to_dict() # Cleanup this fella TODO
+cres = answers.set_index("label").to_dict()
 
 # Just populate df with two for loops, then sort alfabetically
 data = []
@@ -32,8 +32,8 @@
             num_terms = len(terms)
             rank_list = [i + 1 for i in range(num_terms)]
             for term, rank in zip(terms, rank_list):
-                data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": True, "correct_omim_id": cres[label][0], 
-                             "correct_omim_id": cres[label][1], "mondo_id": float('Nan')})
+                data.append({"pubmedid": label, "term": term, "mondo_label": float('Nan'), "rank": rank, "ita_reply": True, "correct_omim_id": cres['term'][label], 
+                             "correct_omim_description": cres['description'][label]})
 
 
 # load eng replies
@@ -45,35 +45,35 @@
 for ppkt_out in eng_result:
     extracted_object = ppkt_out.get("extracted_object")
     if extracted_object:
-        label = extracted_object.get("label")#.str.replace('_[a-z][a-z]-prompt', '', regex=True)
+        label = extracted_object.get("label").replace('_it-prompt', '_en-prompt')
         terms = extracted_object.get("terms")
         if terms:
             num_terms = len(terms)
             rank_list = [i + 1 for i in range(num_terms)]
             for term, rank in zip(terms, rank_list):
-                if term.str.startswith("MONDO"):
-                    breakpoint()
+                if term.startswith("MONDO"):
                     ne = ppkt_out.get("named_entities")
-                    mid = ne.get("id")
-                    mlab = ne.get("label") # TODO finish
+                    for entity in ne:
+                        if entity.get('id')==term:
+                            mlab = entity.get('label')
                 else:
                     mlab = float('Nan')
 
-                data.append({"pubmedid": label, "term": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres[label][0], 
-                             "correct_omim_id": cres[label][1], "mondo_id": mlab})
+                data.append({"pubmedid": label, "term": mlab, "mondo_label": term, "rank": rank, "ita_reply": False, "correct_omim_id": cres["term"][label], 
+                             "correct_omim_description": cres['description'][label]})
 
 # Create DataFrame
 column_names = [
     "PMID",
-    "diagnosis",
+    "GPT Diagnosis",
+    "MONDO ID",
     "rank",
     "ita_reply",
     "correct_OMIMid",
     "correct_OMIMlabel",
-    "MONDOid",
 ]
 
-df = pd.DataFrame(data, columns=column_names)
-df = df.sort_values(by = 'Name') 
-
-#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx")
+df = pd.DataFrame(data)
+df.columns = column_names
+df.sort_values(by = ['PMID', 'ita_reply', 'rank'], inplace=True) 
+#df.to_excel(os.getcwd() + "ita_replies2curate.xlsx") # does not work: os.getcwd() + filename has no path separator (wrong path); not important
diff --git a/src/malco/post_process/post_process_results_format.py b/src/malco/post_process/post_process_results_format.py
index 3ad542715..28e3d3dc8 100644
--- a/src/malco/post_process/post_process_results_format.py
+++ b/src/malco/post_process/post_process_results_format.py
@@ -56,7 +56,14 @@ def create_standardised_results(
                         )
                         # terms will now ONLY contain MONDO IDs OR 'N/A'.
                         # The latter should be dealt with downstream
-                        terms = [i[1][0][0] for i in result]  # MONDO_ID
+                        new_terms = []
+                        for i in result:
+                            if i[1] == [("N/A", "No grounding found")]:
+                                new_terms.append(i[0])
+                            else:
+                                new_terms.append(i[1][0][0])
+                        terms = new_terms
+                        #terms = [i[1][0][0] for i in result]  # MONDO_ID
                     if terms:
                         # Note, the if allows for rerunning ppkts that failed due to connection issues
                         # We can have multiple identical ppkts/prompts in results.yaml
diff --git a/src/malco/post_process/ranking_utils.py b/src/malco/post_process/ranking_utils.py
index 24f58abce..cd548cceb 100644
--- a/src/malco/post_process/ranking_utils.py
+++ b/src/malco/post_process/ranking_utils.py
@@ -98,6 +98,7 @@ def compute_mrr_and_ranks(
         "n10p",
         "nf",
         "num_cases",
+        "grounding_failed",  # no correct reply anywhere in the differential and at least one term is not a MONDO ID
     ]
     rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header)
 
@@ -143,6 +144,7 @@ def compute_mrr_and_ranks(
             )
 
             df.dropna(subset=["correct_term"])
+
             # Save full data frame
             full_df_path = output_dir / results_files[i].split("/")[0]
             full_df_filename = "full_df_results.tsv"
@@ -155,14 +157,17 @@ def compute_mrr_and_ranks(
             # Calculate top<n> of each rank
             rank_df.loc[i, comparing] = results_files[i].split("/")[0]
 
-            ppkts = df.groupby("label")[["rank", "is_correct"]]
+            ppkts = df.groupby("label")[["term", "rank", "is_correct"]]
 
             # for each group
             for ppkt in ppkts:
                 # is there a true? ppkt is tuple ("filename", dataframe) --> ppkt[1] is a dataframe
                 if not any(ppkt[1]["is_correct"]):
-                    # no  --> increase nf = "not found"
-                    rank_df.loc[i, "nf"] += 1
+                    if all(ppkt[1]["term"].str.startswith("MONDO")):
+                        # no  --> increase nf = "not found"
+                        rank_df.loc[i, "nf"] += 1
+                    else:
+                        rank_df.loc[i, "grounding_failed"] += 1
                 else:
                     # yes --> what's it rank? It's <j>
                     jind = ppkt[1].index[ppkt[1]["is_correct"]]
@@ -204,10 +209,12 @@ def compute_mrr_and_ranks(
         writer.writerow(results_files)
         writer.writerow(mrr_scores)
 
+    # TODO this could be moved into an analysis script with the plotting...
     df = pd.read_csv(topn_file, delimiter="\t")
-    df["top1"] = (df["n1"]) / df["num_cases"]
-    df["top3"] = (df["n1"] + df["n2"] + df["n3"]) / df["num_cases"]
-    df["top5"] = (df["n1"] + df["n2"] + df["n3"] + df["n4"] + df["n5"]) / df["num_cases"]
+    valid_cases = df["num_cases"] - df["grounding_failed"]
+    df["top1"] = (df["n1"]) / valid_cases
+    df["top3"] = (df["n1"] + df["n2"] + df["n3"]) / valid_cases
+    df["top5"] = (df["n1"] + df["n2"] + df["n3"] + df["n4"] + df["n5"]) / valid_cases
     df["top10"] = (
         df["n1"]
         + df["n2"]
@@ -219,8 +226,8 @@ def compute_mrr_and_ranks(
         + df["n8"]
         + df["n9"]
         + df["n10"]
-    ) / df["num_cases"]
-    df["not_found"] = (df["nf"]) / df["num_cases"]
+    ) / valid_cases
+    df["not_found"] = (df["nf"]) / valid_cases
 
     df_aggr = pd.DataFrame()
     df_aggr = pd.melt(