From f4edd5f7fce354bac99cc6d9ec0ead6f203ba81e Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Mon, 24 Jun 2024 13:10:08 +0200 Subject: [PATCH 1/3] Turkish translation --- docs/cases/PMID_28569194.txt | 11 + .../phenopacket2prompt/Main.java | 2 +- .../cmd/BatchMineCommand.java | 2 +- .../cmd/GbtTranslateBatchCommand.java | 8 + .../cmd/GptTranslateCommand.java | 4 + .../phenopacket2prompt/cmd/Utility.java | 12 +- .../output/PromptGenerator.java | 7 + .../impl/german/PpktIndividualGerman.java | 6 +- .../german/PpktPhenotypicfeatureGerman.java | 4 +- .../impl/turkish/PpktIndividualTurkish.java | 482 ++++++++++++++++++ .../turkish/PpktPhenotypicfeatureTurkish.java | 139 +++++ .../output/impl/turkish/PpktTextTurkish.java | 24 + .../impl/turkish/TurkishBuildingBlocks.java | 326 ++++++++++++ .../impl/turkish/TurkishPromptGenerator.java | 104 ++++ 14 files changed, 1120 insertions(+), 11 deletions(-) create mode 100644 docs/cases/PMID_28569194.txt create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktIndividualTurkish.java create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktPhenotypicfeatureTurkish.java create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktTextTurkish.java create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishBuildingBlocks.java create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishPromptGenerator.java diff --git a/docs/cases/PMID_28569194.txt b/docs/cases/PMID_28569194.txt new file mode 100644 index 0000000..2a4c344 --- /dev/null +++ b/docs/cases/PMID_28569194.txt @@ -0,0 +1,11 @@ +[source] +pmid = PMID:28569194 +title = Severe child form of primary hyperoxaluria type 2 - a case report revealing consequence of GRHPR deficiency on metabolism +[diagnosis] +disease_id = OMIM:260000 +disease_label = Hyperoxaluria, primary, type II 
+[text] +A 10-month-old girl with a history of tonsilopharyngitis, bilateral inborn hip dysplasia with improvement, dispenzarised by an orthopaedist, was referred to hospital because of a one-day febrilities and positive urinary finding of ketone bodies, proteins, leukocyturia and haematuria. She was born after a normal pregnancy at full term with an uncomplicated perinatal and neonatal course. There was no history of gross haematuria or colicky abdominal pain, and she had received no medication except for antipyretics. The family history was negative with respect to renal and metabolic disease, including urolithiasis and nephrocalcinosis, however, genetic counselling revealed that the parents have common ancestors in the fourth generation. +On physical examination the child appeared mildly dehydrated with increased body temperature of 37.6 °C. A laboratory examination found moderate anaemia (haemoglobin 93.0 g/l), leucocytosis (15x10^9/l) and moderately elevated CRP (42.6 mg/l). Serum electrolyte, urea and creatinine levels were normal. The calcium/creatinine ratio and 24 h calcium excretion was within the reference range. Blood gas analysis was normal with no evidence of metabolic acidosis. Ultrasonography of the abdomen raised suspicion of ureterolithiasis l.sin. Cystourethrography was indicated, showing concrement sized 2 cm x 0.8 cm in the distal part of the left ureter with mild dilatation of the renal pelvis and ureter (Fig. 1). +At the age of 11 months, an ureterolithotomy was performed at the University Children Hospital Bratislava. A chemical analysis of the extracted stone was performed, and the stone was characterised as a composite of virtually pure calcium oxalate. The following course was without any complications. 
+ diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java index 1960bbc..4c98cb2 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java @@ -20,7 +20,7 @@ public static void main(String[] args){ .addSubcommand("download", new DownloadCommand()) .addSubcommand("prompt", new PromptCommand()) .addSubcommand("mine", new TextMineCommand()) - .addSubcommand("batchmine", new TextMineCommand()) + .addSubcommand("batchmine", new BatchMineCommand()) .addSubcommand("translate", new GptTranslateCommand()) ; cline.setToggleBooleanFlags(false); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java index c6e133d..42587a2 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java @@ -14,7 +14,7 @@ import java.util.List; import java.util.concurrent.Callable; -@CommandLine.Command(name = "batchmine", aliases = {"B2"}, +@CommandLine.Command(name = "batchmine", mixinStandardHelpOptions = true, description = "Batch Text mine, Translate, and Output phenopacket and prompt") public class BatchMineCommand implements Callable { diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java index bc10f09..a53fcf3 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java @@ -75,8 +75,11 @@ public Integer call() throws Exception { List correctResultList = Utility.outputPromptsEnglish(ppktFiles, hpo); // output 
all non-English languages here // SPANISH + + /* PromptGenerator spanish = utility.spanish(); Utility.outputPromptsInternational(ppktFiles,"es", spanish); + // Dutch PromptGenerator dutch = utility.dutch(); Utility.outputPromptsInternational(ppktFiles,"nl", dutch); @@ -86,6 +89,11 @@ public Integer call() throws Exception { // ITALIAN PromptGenerator italian = utility.italian(); Utility.outputPromptsInternational(ppktFiles,"it", italian); +*/ + + PromptGenerator turkish = utility.turkish(); + Utility.outputPromptsInternational(ppktFiles,"tr", turkish); + // output original phenopackets PpktCopy pcopy = new PpktCopy(new File(outdirname)); for (var file : ppktFiles) { diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java index 478bf60..a31b049 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java @@ -78,6 +78,10 @@ public Integer call() throws Exception { PromptGenerator italian = PromptGenerator.italian(internationalMap.get("it")); prompt = italian.createPrompt(individual); } + case "tr" -> { + PromptGenerator turkish = PromptGenerator.turkish(internationalMap.get("tr")); + prompt = turkish.createPrompt(individual); + } default -> prompt = "did not recognize language code " + languageCode; } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java index 1c8c085..72be2e8 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java @@ -48,15 +48,19 @@ public PromptGenerator german() { } public PromptGenerator spanish() { - return PromptGenerator.german(internationalMap.get("es")); + return 
PromptGenerator.spanish(internationalMap.get("es")); } public PromptGenerator dutch() { - return PromptGenerator.german(internationalMap.get("nl")); + return PromptGenerator.dutch(internationalMap.get("nl")); + } + + public PromptGenerator turkish() { + return PromptGenerator.turkish(internationalMap.get("tr")); } public PromptGenerator italian() { - return PromptGenerator.german(internationalMap.get("it")); + return PromptGenerator.italian(internationalMap.get("it")); } @@ -171,8 +175,8 @@ public static void outputPromptsInternational(List ppktFiles, String langu individualList.add(individual); } outputPromptsInternationalFromIndividualList(individualList, - PROMPT_DIR, languageCode, + PROMPT_DIR, generator); } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java index 4d11b42..96f0ccd 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java @@ -11,6 +11,8 @@ import org.monarchinitiative.phenopacket2prompt.output.impl.spanish.*; import org.monarchinitiative.phenopacket2prompt.output.impl.dutch.*; import org.monarchinitiative.phenopacket2prompt.output.impl.italian.*; +import org.monarchinitiative.phenopacket2prompt.output.impl.turkish.PpktPhenotypicfeatureTurkish; +import org.monarchinitiative.phenopacket2prompt.output.impl.turkish.TurkishPromptGenerator; import java.util.List; @@ -60,6 +62,11 @@ static PromptGenerator italian(HpInternational international) { return new ItalianPromptGenerator(pfgen); } + static PromptGenerator turkish(HpInternational international) { + PpktPhenotypicFeatureGenerator pfgen = new PpktPhenotypicfeatureTurkish(international); + return new TurkishPromptGenerator(pfgen); + } + /** * The following structure should work for most other languages, but the function diff --git 
a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktIndividualGerman.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktIndividualGerman.java index 3e0c96c..743db0f 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktIndividualGerman.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktIndividualGerman.java @@ -241,7 +241,7 @@ private String iso8601individualDescription(PhenopacketSex psex, Iso8601Age iso8 case FEMALE -> String.format("%s %s %s", bbGenerator.probandWasA(), dAlter(iso8601Age, GrammatikalischesGeschlecht.NEUTRUM), // "das Mädchen" bbGenerator.girl()); - case MALE -> String.format("%s %s %s", bbGenerator.probandWasA(), + case MALE -> String.format("%s ein %s %s", bbGenerator.probandWasA(), dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), bbGenerator.boy()); default -> String.format("%s %s %s", bbGenerator.probandWasA(), @@ -256,13 +256,13 @@ private String iso8601individualDescription(PhenopacketSex psex, Iso8601Age iso8 case MALE -> String.format("%s ein %s %s", bbGenerator.probandWasA(), dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), bbGenerator.maleInfant()); - default -> String.format("%s %s %s", bbGenerator.probandWasA(), + default -> String.format("%s ein %s %s", bbGenerator.probandWasA(), dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), // "der Säugling bbGenerator.infant()); }; } else { return switch (psex) { - case FEMALE -> String.format("Die Probandin war ein %s", bbGenerator.probandWasA(), bbGenerator.newbornGirl()); + case FEMALE -> String.format("Die Probandin war ein %s", bbGenerator.probandWasA(), bbGenerator.newbornGirl()); // das case MALE -> String.format("Der Proband war ein %s", bbGenerator.probandWasA(), bbGenerator.newbornBoy()); default -> String.format("Der Proband war ein Neugeborenes ohne angegebenes Geschlecht"); }; diff --git 
a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktPhenotypicfeatureGerman.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktPhenotypicfeatureGerman.java index 8dcb4f3..2233460 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktPhenotypicfeatureGerman.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/german/PpktPhenotypicfeatureGerman.java @@ -92,11 +92,11 @@ public String featuresAtEncounter(String personString, String ageString, List1? "wurden":"wurde", + excluded.size()>1? "wurden die folgenden Symptome":"wurde das folgende Symptom", excludedStr); } else if (!observed.isEmpty()) { return String.format("%s präsentierte %s mit den folgenden Symptomen: %s.", ageString, personString, observedStr); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktIndividualTurkish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktIndividualTurkish.java new file mode 100644 index 0000000..b9f2bab --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktIndividualTurkish.java @@ -0,0 +1,482 @@ +package org.monarchinitiative.phenopacket2prompt.output.impl.turkish; + +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenopacket2prompt.model.*; +import org.monarchinitiative.phenopacket2prompt.output.BuildingBlockGenerator; +import org.monarchinitiative.phenopacket2prompt.output.PPKtIndividualInfoGenerator; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class PpktIndividualTurkish implements PPKtIndividualInfoGenerator { + + private final BuildingBlockGenerator bbGenerator; + /** grammatical sex */ + private enum GrammatikalischesGeschlecht { + MAENNLICH, WEIBLICH, NEUTRUM + } + + public PpktIndividualTurkish() { + bbGenerator = new 
TurkishBuildingBlocks(); + } + + @Override + public String getIndividualDescription(PpktIndividual individual) { + if (individual.annotationCount() == 0) { + throw new PhenolRuntimeException("No HPO annotations"); + } + Optional lastExamOpt = individual.getAgeAtLastExamination(); + Optional onsetOpt = individual.getAgeAtOnset(); + PhenopacketSex psex = individual.getSex(); + String individualDescription; + String onsetDescription; + if (lastExamOpt.isPresent()) { + var lastExamAge = lastExamOpt.get(); + if (lastExamAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) lastExamAge; + individualDescription = iso8601individualDescription(psex, isoAge); + } else if (lastExamAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) lastExamAge; + individualDescription = hpoOnsetIndividualDescription(psex, hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize last exam age type " + lastExamAge.ageType()); + } + } else { + individualDescription = switch (psex) { + case FEMALE -> bbGenerator.probandWasAFemale(); + case MALE -> bbGenerator.probandWasAMale(); + default -> bbGenerator.probandWasAnIndividual(); + }; + } + if (onsetOpt.isPresent()) { + var onsetAge = onsetOpt.get(); + if (onsetAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) onsetAge; + onsetDescription = iso8601onsetDescription(isoAge); + } else if (onsetAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) onsetAge; + onsetDescription = hpoOnsetDescription(hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize last exam age type " + onsetAge.ageType()); + } + } else { + onsetDescription = "Der Krankheitsbeginn wurde nicht angegeben"; + } + return String.format("%s. 
%s.", individualDescription, onsetDescription); + } + + private String hpoOnsetDescription(HpoOnsetAge hpoOnsetTermAge) { + return String.format("Der Krankheitsbeginn trat %s auf", + nameOfLifeStage(hpoOnsetTermAge)); + } + + private String nameOfLifeStage(HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return "während der Fetalperiode"; + } else if (hpoOnsetTermAge.isCongenital()) { + return "zum Zeitpunkt der Geburt"; + } else if (hpoOnsetTermAge.isInfant()) { + return "im Säuglingsalter"; + } else if (hpoOnsetTermAge.isChild()) { + return "im Kindesalter"; + } else if (hpoOnsetTermAge.isJuvenile()) { + return "im Jugendlichenalter"; + } else if (hpoOnsetTermAge.isNeonate()) { + return "im Neugeborenenalter"; // +bbGenerator.newborn(); + } else if (hpoOnsetTermAge.isYoungAdult()) { + return "im jungen Erwachsenenalter" ; + } else if (hpoOnsetTermAge.isMiddleAge()) { + return "im mittleren Erwachsenenalter" ; + } else if (hpoOnsetTermAge.isLateAdultAge()) { + return "im späten Erwachsenenalter" ; + } else if (hpoOnsetTermAge.isAdult()) { + // d.h. 
nicht weiter spezifiziert + return "im Erwachsenenalter" ; + } else { + throw new PhenolRuntimeException("Could not identify German life stage name for HpoOnsetAge " + hpoOnsetTermAge.toString()); + } + } + + private String iso8601onsetDescription(Iso8601Age isoAge) { + return String.format("Der Krankheitsbeginn trat im Alter von %s auf", + bbGenerator.yearsMonthsDaysOld(isoAge.getYears(), isoAge.getMonths(), isoAge.getDays())); + } + + + + + + + + public String ageAndSexAtLastExamination(PpktIndividual individual) { + PhenopacketSex psex = individual.getSex(); + Optional ageOpt = individual.getAgeAtLastExamination(); + if (ageOpt.isEmpty()) { + ageOpt = individual.getAgeAtOnset(); + } + String sex; + switch (psex) { + case FEMALE -> sex = bbGenerator.woman(); + case MALE -> sex = bbGenerator.man(); + default -> sex = bbGenerator.adult(); + } + + if (ageOpt.isEmpty()) { + return sex; + } + PhenopacketAge age = ageOpt.get(); + if (age.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoage = (Iso8601Age) age; + int y = isoage.getYears(); + int m = isoage.getMonths(); + int d = isoage.getDays(); + if (psex.equals(PhenopacketSex.FEMALE)) { + if (y > 17) { + return String.format("Eine %djährige Patientin", y); + } else if (y > 9) { + return String.format("Eine %djährige Jugendliche", y); + } else if (y > 0) { + return String.format("Ein %djähriges Mädchen", y); + } else if (m>0) { + return String.format("Ein %d Monate alter weiblicher Säugling", m); + } else { + return String.format("Ein %d Tage alter weiblicher Säugling", d); + } + } + } else { + // age is an HPO onset term, we do not have an exact date + } + if (age.isChild()) { + return switch (psex) { + case FEMALE -> bbGenerator.girl(); + case MALE -> bbGenerator.boy(); + default -> bbGenerator.child(); + }; + } else if (age.isCongenital()) { + return switch (psex) { + case FEMALE -> bbGenerator.newbornGirl(); + case MALE -> bbGenerator.newbornBoy(); + default -> bbGenerator.newborn(); + }; + 
} else if (age.isFetus()) { + return switch (psex) { + case FEMALE -> bbGenerator.femaleFetus(); + case MALE -> bbGenerator.maleFetus(); + default -> bbGenerator.fetus(); + }; + } else if (age.isInfant()) { + return switch (psex) { + case FEMALE -> bbGenerator.femaleInfant(); + case MALE -> bbGenerator.maleInfant(); + default -> bbGenerator.infant(); + }; + } else { + return switch (psex) { + // TODO -- MORE GRANULARITY + case FEMALE -> bbGenerator.woman(); + case MALE -> bbGenerator.man(); + default -> bbGenerator.adult(); + }; + } + } + + + private String imAlterVonIsoAgeExact(PhenopacketAge ppktAge) { + Iso8601Age iso8601Age = (Iso8601Age) ppktAge; + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + + if (y > 10) { + return String.format("Im Alter von %d Jahren", y); + } else if (y > 0) { + if (m > 0) { + return String.format("Im Alter von %d %s und %d %s", y, + y>1?"Jahren" : "Jahr", + m, m>1?"Monaten" : "Monat"); + } else { + return String.format("Im Alter von %d %s", y, y>1?"Jahren" : "Jahr"); + } + } + if (m>0) { + return String.format("Im Alter von %d %s und %d %s", m, m>1?"Monaten" : "Monat", + d, d>1?"Tagen" : "Tag"); + } else { + return String.format("%d Tage", d); + } + } + + + private String iso8601individualDescription(PhenopacketSex psex, Iso8601Age iso8601Age) { + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + // if older + if (y > 17) { + return switch (psex) { + case FEMALE -> String.format("Die Probandin war eine %s Frau", + dAlter(iso8601Age, GrammatikalischesGeschlecht.WEIBLICH)); + case MALE -> String.format("Der Proband war ein %s Mann", + dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH)); + default -> String.format("Der Proband war ein %s %s", + dAlter(iso8601Age, GrammatikalischesGeschlecht.NEUTRUM), + bbGenerator.individual()); + }; + } else if (y > 9) { + return switch (psex) { + case FEMALE -> String.format("%s %s %s", 
bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.WEIBLICH), + bbGenerator.adolescentGirl()); + case MALE -> String.format("%s %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), + bbGenerator.adolescentBoy()); + default -> String.format("%s %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.NEUTRUM), bbGenerator.adolescentChild()); + }; + } else if (y > 0) { + String age = String.format("Olgu %d yaş %d aylık", y, m); + return switch (psex) { + case FEMALE -> String.format("%s %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.NEUTRUM), // "das Mädchen" + bbGenerator.girl()); + case MALE -> String.format("%s bir erkek çocuktu.", age); + default -> String.format("%s %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.NEUTRUM), // Das Individuum + bbGenerator.child()); + }; + } else if (m > 0 || d > 0) { + return switch (psex) { + case FEMALE -> String.format("%s ein %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), // "der weibliche Säungling", + bbGenerator.femaleInfant()); + case MALE -> String.format("%s ein %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), + bbGenerator.maleInfant()); + default -> String.format("%s %s %s", bbGenerator.probandWasA(), + dAlter(iso8601Age, GrammatikalischesGeschlecht.MAENNLICH), // "der Säugling + bbGenerator.infant()); + }; + } else { + return switch (psex) { + case FEMALE -> String.format("Die Probandin war ein %s", bbGenerator.newbornGirl()); // y, m and d are all zero: newborn; the single %s takes the noun + case MALE -> String.format("Der Proband war ein %s", bbGenerator.newbornBoy()); // the single %s takes the noun + default -> String.format("Der Proband war ein Neugeborenes ohne angegebenes Geschlecht"); + }; + } + } + + /** + * @param iso8601Age + * @return zB. 
"4 Jahre und 2 Monate alter" "3 Monate und 1 Tag altes" + */ + private String dAlter(Iso8601Age iso8601Age, GrammatikalischesGeschlecht geschlecht) { + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + List components = new ArrayList<>(); + if (y > 0) { + components.add(String.format("%d %s", y, y > 1 ? "Jahre" : "Jahr")); + } + if (m > 0) { + components.add(String.format("%d %s", m, m > 1 ? "Monate" : "Monat")); + } + if (d > 0) { + components.add(String.format("%d %s", d, d > 1 ? "Tage" : "Tag")); + } + String ymd; + if (components.isEmpty()) { + ymd = ""; + } else if (components.size() == 1) { + ymd = components.get(0); + } else if (components.size() == 2) { + ymd = String.format("%s und %s", components.get(0), components.get(1)); + } else { + ymd = String.format("%s, %s und %s", components.get(0), components.get(1), components.get(2)); + } + return switch (geschlecht) { + case MAENNLICH -> String.format("%s alter", ymd); + case WEIBLICH -> String.format("%s alte", ymd); + case NEUTRUM -> String.format("%s altes", ymd); + }; + } + + + private String iso8601ToYearMonth(Iso8601Age iso8601Age, PhenopacketSex psex) { + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + if (psex.equals(PhenopacketSex.MALE)) { + if (iso8601Age.getMonths() == 0) { + return String.format("ein %djähriger Junge", y); + } else { + return String.format("ein %d %s, %d %s alter Junge", y, y>1?"Jahre":"Jahr", m, m>1?"Monate":"Monat"); + } + } else if (psex.equals(PhenopacketSex.FEMALE)) { + if (iso8601Age.getMonths() == 0) { + return String.format("ein %djähriges Mädchen", y); + } else { + return String.format("ein %d %s, %d %s altes Mädchen", y, y>1?"Jahre":"Jahr", m, m>1?"Monate":"Monat"); + } + } + if (iso8601Age.getMonths() == 0) { + return String.format("ein %djähriges Kind", y); + } else { + return String.format("ein %d %s, %d %s altes Kind", y, y>1?"Jahre":"Jahr", m, m>1?"Monate":"Monat"); } 
+ } + + private String monthString(int m) { + return m>1 ? "Monate": "Monat"; + } + + private String dayString(int d) { + return d>1 ? "Tage": "Tag"; + } + + private String iso8601ToMonthDay(Iso8601Age iso8601Age) { + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + if (m == 0) { + return String.format("de %d dias", d); + } else if (d>0){ + return String.format("%d %s und %d %s", m, monthString(m), d, dayString(d)); + } else { + return String.format("%d %s", m, m>1 ? "Monate": "Monat"); + } + } + + /** + * Create a phrase such as "at the age of 7 years, 4 months, and 2 days" + * Leave out the months and days if they are zero. + * @param isoAge + * @return + */ + private String iso8601AtAgeOf(Iso8601Age isoAge) { + List components = new ArrayList<>(); + + if (isoAge.getYears()>1) { + components.add(String.format("%d Jahren", isoAge.getYears())); + } else if (isoAge.getYears() == 1) { + components.add("einem Jahr"); + } + if (isoAge.getMonths() > 1) { + components.add(String.format("%d Monaten", isoAge.getMonths())); + } else if (isoAge.getMonths() == 1) { + components.add("einem Monat"); + } + if (isoAge.getDays()>1) { + components.add(String.format("%d Tagen", isoAge.getDays())); + } else if (isoAge.getDays()==1) { + components.add("einem Tag"); + } + if (components.isEmpty()) { + return "bei der Geburt"; + } else if (components.size() == 1) { + return "im Alter von " + components.getFirst(); + } else if (components.size() == 2) { + return "im Alter von " + components.get(0) + " und " + components.get(1); + } else { + return "im Alter von " + components.get(0) + ", " + components.get(1) + + " und " + components.get(2); + } + } +/* + private String onsetTermAtAgeOf(HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return "in der Fetalperiode"; + } else if (hpoOnsetTermAge.isCongenital()) { + return "bei der Geburt"; + } else if (hpoOnsetTermAge.isInfant()) { + return "im Säuglingsalter"; + } else if (hpoOnsetTermAge.isChild()) { + 
return "in der Kindheit"; + } else if (hpoOnsetTermAge.isJuvenile()) { + return "als Jugendlich adolescente"; + } else { + return "im Erwachsenenalter"; + } + } +*/ + + + private String hpoOnsetIndividualDescription(PhenopacketSex psex, HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return switch (psex) { + case FEMALE -> String.format("%s %s", bbGenerator.probandWasAFemale(), bbGenerator.femaleFetus()); + case MALE -> String.format("%s %s", bbGenerator.probandWasAMale(), bbGenerator.maleFetus()); + default -> String.format("%s %s", bbGenerator.probandWasA(), bbGenerator.fetus()); + }; + } else if (hpoOnsetTermAge.isCongenital()) { + return switch (psex) { + case FEMALE -> "Die Probandin war ein weibliches Neugeborenes"; + case MALE -> "Der Probandwar ein männliches Neugeborenes"; + default -> "Der Patient war ein Neugeborenes ohne angegebenes Geschelcht"; + }; + } else if (hpoOnsetTermAge.isInfant()) { + return switch (psex) { + case FEMALE -> "Die Probandin war ein weiblicher Säugling"; + case MALE -> "Der Proband war ein männlicher Säugling"; + default -> "Der Proband war ein Säugling ohne angegebenes Geschlecht"; + }; + } else if (hpoOnsetTermAge.isChild()) { + return switch (psex) { + case FEMALE -> "Die Probandin war ein Mädchen"; + case MALE -> "Der Proband war ein Junge"; + default -> "Der Proband war ein Kind ohne angegebenes Geschlecht"; + }; + } else if (hpoOnsetTermAge.isJuvenile()) { + return switch (psex) { + case FEMALE -> "Die Probandin war eine Jugendliche"; + case MALE -> "Der Proband war ein Jugendlicher"; + default -> "Der Proband war ein Jugendlicher ohne angegebenes Geschlecht"; + }; + } else if (hpoOnsetTermAge.isAdult()) { + return switch (psex) { + case FEMALE -> "Die Probandin war eine Frau"; + case MALE -> "Der Proband war ein Mann"; + default -> "Der Proband war eine erwachsene Person ohne angegebenes Geschlecht"; + + }; + } else { + throw new PhenolRuntimeException("Could not find HPO onset type " + 
hpoOnsetTermAge.toString()); + } + } + + + @Override + public String heSheIndividual(PhenopacketSex psex) { + return switch (psex) { + case FEMALE -> "sie"; + case MALE -> "er"; + default -> "die Person"; + }; + } + + @Override + public String atAgeForVignette(PhenopacketAge ppktAge) { + if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + return imAlterVonIsoAgeExact(ppktAge); + } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + String label = ppktAge.age(); // something like "Infantile onset" + return switch (label) { + case "Infantile onset" -> "Als Säugling"; + case "Childhood onset" -> "In der Kindheit"; + case "Neonatal onset" -> "In der neugeborenen Zeit"; + case "Congenital onset" -> "Zum Zeitpunkt der Geburt"; + case "Adult onset" -> "Im Erwachsenenalter"; + case "Juvenile onset" -> "Im Jugendlichenalter"; + default-> { + throw new PhenolRuntimeException("No German translation for " + label); + } + }; + } else { + return ""; // should never get here + } + } + + + + +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktPhenotypicfeatureTurkish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktPhenotypicfeatureTurkish.java new file mode 100644 index 0000000..31c362c --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktPhenotypicfeatureTurkish.java @@ -0,0 +1,139 @@ +package org.monarchinitiative.phenopacket2prompt.output.impl.turkish; + +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenopacket2prompt.international.HpInternational; +import org.monarchinitiative.phenopacket2prompt.model.OntologyTerm; +import org.monarchinitiative.phenopacket2prompt.output.PpktPhenotypicFeatureGenerator; + +import java.util.*; +import java.util.stream.Collectors; + +public class PpktPhenotypicfeatureTurkish implements PpktPhenotypicFeatureGenerator { + + private 
final HpInternational turkish; + private Set missingTranslations; + + + public PpktPhenotypicfeatureTurkish(HpInternational international) { + turkish = international; + missingTranslations = new HashSet<>(); + } + + + private List getTranslations(List ontologyTerms) { + List labels = new ArrayList<>(); + for (var term: ontologyTerms) { + Optional opt = turkish.getLabel(term.getTid()); + if (opt.isPresent()) { + labels.add(opt.get()); + } else { + String missing = String.format(" %s (%s)", term.getLabel(), term.getTid().getValue()); + missingTranslations.add(missing); + } + } + return labels; + } + + + + private String getCommaList(List items) { + if (items.isEmpty()) { + return ""; // this will be filtered out later + } + if (items.size() == 1) { + return items.getFirst(); + } + if (items.size() == 2) { + // no comma if we just have two items; "ve" is the Turkish conjunction "and" + // one item will work with the below code + return String.join(" ve ", items); + } + // if we have more than two, join all but the very last item with a comma and a space + String penultimate = items.stream() + .limit(items.size() - 1) + .collect(Collectors.joining(", ")); + String ultimate = items.get(items.size() - 1); + return penultimate + " ve " + ultimate; + } + + @Override + public String formatFeatures(List ontologyTerms) { + List observedTerms = getObservedFeatures(ontologyTerms); + List excludedTerms = getExcludedFeatures(ontologyTerms); + List observedLabels = getTranslations(observedTerms); + List excludedLabels = getTranslations(excludedTerms); + if (observedLabels.isEmpty() && excludedLabels.isEmpty()) { + return "keine phänotypischen Abnormalitäten"; // should never happen, actually! + } else if (excludedLabels.isEmpty()) { + return getCommaList(observedLabels) + ". 
"; + } else if (observedLabels.isEmpty()) { + if (excludedLabels.size() > 1) { + return String.format("%s wurden ausgeschlossen.", getCommaList(excludedLabels)); + } else { + return String.format("%s wurde ausgeschlossen.",excludedLabels.getFirst()); + } + } else { + String exclusion = String.format("Dagegen %s %s ausgeschlossen.", excludedLabels.size()>1? "wurden":"wurde", getCommaList(excludedLabels)); + return getCommaList(observedLabels) + ". " + exclusion; + } + } + + public Set getMissingTranslations() { + return missingTranslations; + } + + + @Override + public String featuresAtEncounter(String personString, String ageString, List ontologyTerms) { + List observed = getObservedFeatures(ontologyTerms); + List excluded = getExcludedFeatures(ontologyTerms); + List observedGerman = getTranslations(observed); + List excludedGerman = getTranslations(excluded); + var observedStr = getCommaList(observedGerman); + var excludedStr = getCommaList(excludedGerman); + if (!observed.isEmpty() && ! excluded.isEmpty()) { + return String.format("%s präsentierte %s mit den folgenden Symptomen: %s. Im Gegensatz %s ausgeschlossen: %s.", + ageString, + personString, + observedStr, + excluded.size()>1? "wurden die folgenden Symptome":"wurde das folgende Symptom", + excludedStr); + } else if (!observed.isEmpty()) { + return String.format("%s präsentierte %s mit den folgenden Symptomen: %s.", ageString, personString, observedStr); + } else if (!excluded.isEmpty()) { + return String.format("%s %s die folgenden Symptome ausgeschlossen: %s.", + ageString, + excluded.size()>1? 
"wurden":"wurde", excludedStr); + } else { + throw new PhenolRuntimeException("No features found for time point " + ageString); // should never happen + } + } + + @Override + public String featuresAtOnset(String personString, List ontologyTerms) { + List observed = getObservedFeatures(ontologyTerms); + List excluded = getExcludedFeatures(ontologyTerms); + List observedGerman = getTranslations(observed); + List excludedGerman = getTranslations(excluded); + var observedStr = getCommaList(observedGerman); + var excludedStr = getCommaList(excludedGerman); + + if (!observed.isEmpty() && ! excluded.isEmpty()) { + return String.format("%s präsentierte mit den folgenden Symptomen: %s. Im Gegensatz %s die folgenden Symptome ausgeschlossen: %s.", + personString, + observedStr, + excluded.size()>1? "wurden":"wurde", + excludedStr); + } else if (!observed.isEmpty()) { + return String.format("%s präsentierte mit den folgenden Symptomen: %s.", personString, observedStr); + } else if (!excluded.isEmpty()) { + return String.format("Beim Krankheitsbeginn %s die folgenden Symptome ausgeschlossen: %s.", + excluded.size()>1? 
"wurden":"wurde", excludedStr); + } else { + return "Keine phänotypischen Abnormalitäten wurden explizit zu Krankheitsbeginn beschrieben."; + } + } + + + +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktTextTurkish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktTextTurkish.java new file mode 100644 index 0000000..3e1457f --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/PpktTextTurkish.java @@ -0,0 +1,24 @@ +package org.monarchinitiative.phenopacket2prompt.output.impl.turkish; + +import org.monarchinitiative.phenopacket2prompt.output.PhenopacketTextGenerator; + +public class PpktTextTurkish implements PhenopacketTextGenerator { + + @Override + public String GPT_PROMPT_HEADER() { + return """ +Teşhislerinizin insan uzmanlarınkine kıyasla nasıl olduğunu görmek için klinik bir vaka raporu ile bir deney yapıyorum. Size tıbbi bir vakanın bir bölümünü sunacağım. Herhangi bir hastayı tedavi etmeye çalışmıyorsunuz. Bu durumda siz, teşhis koyan bir yapay zeka dil modeli olan "Dr GPT-4 "sünüz. İşte bazı kurallar. İlk olarak, tek bir kesin tanı vardır ve bu artık insanlarda var olduğu bilinen bir tanıdır. Teşhis neredeyse her zaman genetik testlerle doğrulanır. Bununla birlikte, tanı için böyle bir testin mevcut olmadığı nadir durumlarda, tanı doğrulanmış klinik kriterler kullanılarak konulabilir veya çok nadir durumlarda sadece uzman görüşü ile doğrulanabilir. Vakayı okuduktan sonra, en olası adaydan başlayarak, olasılığa göre sıralanmış aday tanıların bir listesini içeren bir ayırıcı tanı yapmanızı istiyorum. Her aday hastalık adıyla birlikte listelenmelidir. Örneğin, ilk aday brankiookülofasiyal sendrom ve ikincisi kistik fibrozis ise, aşağıdakileri İngilizce olarak belirtiniz: + +1. brankiookülofasiyal sendrom +2. Kistik fibrozis + +Bu liste uygun olduğunu düşündüğünüz kadar çok tanı içermelidir. 
+ +Gerekçenizi açıklamanıza gerek yok, sadece teşhisleri listeleyin.\s +Bu talimatları size Türkçe olarak verdim, ancak cevabınızı yalnızca İngilizce olarak vermenizi rica ediyorum. +İşte vaka: + +"""; + } + +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishBuildingBlocks.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishBuildingBlocks.java new file mode 100644 index 0000000..93e654c --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishBuildingBlocks.java @@ -0,0 +1,326 @@ +package org.monarchinitiative.phenopacket2prompt.output.impl.turkish; + +import org.monarchinitiative.phenopacket2prompt.model.Iso8601Age; +import org.monarchinitiative.phenopacket2prompt.output.BuildingBlockGenerator; + +import java.util.ArrayList; +import java.util.List; + +public class TurkishBuildingBlocks implements BuildingBlockGenerator { + @Override + public String days(int d) { + return ""; + } + + @Override + public String months(int m) { + return ""; + } + + @Override + public String years(int y) { + return ""; + } + + @Override + public String yearsOld(int y) { + return String.format("%djährig", y); + } + + @Override + public String monthsOld(int m) { + return String.format("%d Monate alt", m); + } + + @Override + public String daysOld(int d) { + return String.format("%d Tage alt", d); + } + + @Override + public String monthDayOld(int m, int d) { + List components = new ArrayList<>(); + if (m > 0) { + components.add(String.format("%d %s", m, m > 1 ? "Monaten" : "Monat")); + } + if (d > 0) { + components.add(String.format("%d %s", d, d > 1 ? 
"Tagen" : "Tag")); + } + if (components.isEmpty()) { + return "am ersten Lebenstag"; + } else if (components.size() == 1) { + return components.get(0); + } else { + return String.format("im Alter von %s und %s", components.get(0), components.get(1)); + } + } + + @Override + public String yearsMonthsDaysOld(int y, int m, int d) { + List components = new ArrayList<>(); + if (y > 0) { + components.add(String.format("%d %s", y, y > 1 ? "Jahren" : "Jahr")); + } + if (m > 0) { + components.add(String.format("%d %s", m, m > 1 ? "Monaten" : "Monat")); + } + if (d > 0) { + components.add(String.format("%d %s", d, d > 1 ? "Tagen" : "Tag")); + } + if (components.isEmpty()) { + return "am ersten Lebenstag"; + } else if (components.size() == 1) { + return components.get(0); + } else if (components.size() == 2) { + return String.format("im Alter von %s und %s", components.get(0), components.get(1)); + } else { + // we must have y,m,d + return String.format("im Alter von %s, %s und %s", components.get(0), components.get(1), components.get(2)); + } + } + + @Override + public String asNewborn() { + return ""; + } + + @Override + public String atTheAgeOf() { + return ""; + } + + @Override + public String she() { + return ""; + } + + @Override + public String he() { + return ""; + } + + @Override + public String theProband() { + return ""; + } + + @Override + public String woman() { + return "Frau"; + } + + @Override + public String man() { + return "Mann"; + } + + @Override + public String individual() { + return "erwachsene Person unbekannten Geschlechtes"; + } + + @Override + public String theIndividual() { + return ""; + } + + @Override + public String girl() { + return "Mädchen"; + } + + @Override + public String boy() { + return "Junge"; + } + + @Override + public String child() { + return "Kind"; + } + + @Override + public String adolescentGirl() { + return ""; + } + + @Override + public String adolescentBoy() { + return ""; + } + + @Override + public String adolescentChild() 
{ + return ""; + } + + @Override + public String maleInfant() { + return "männlicher Säugling"; + } + + @Override + public String femaleInfant() { + return "weiblicher Säugling"; + } + + @Override + public String infant() { + return "Säugling"; + } + + @Override + public String newbornBoy() { + return "männliches Neugeborenes"; + } + + @Override + public String newbornGirl() { + return "weibliches Neugeborenes"; + } + + @Override + public String newborn() { + return "Neugeborenes"; + } + + @Override + public String maleFetus() { + return "männlicher Fet"; + } + + @Override + public String femaleFetus() { + return "weiblicher Fet"; + } + + @Override + public String fetus() { + return "Fet"; + } + + @Override + public String female() { + return ""; + } + + @Override + public String male() { + return ""; + } + + @Override + public String adult() { + return ""; + } + + @Override + public String probandWasA() { + return "Der Proband war"; + } + + @Override + public String whoPresented() { + return ""; + } + + @Override + public String presented() { + return ""; + } + + @Override + public String probandNoAgePresented() { + return ""; + } + + @Override + public String probandNoAgePresentedWith() { + return ""; + } + + @Override + public String probandWasAMale() { + return "Der Proband war ein Mann"; + } + + @Override + public String probandWasAFemale() { + return "Die Probandin war eine Frau"; + } + + @Override + public String probandWasAnIndividual() { + return "Der Proband war ein Individuum ohne angegebenes Geschlecht"; + } + + @Override + public String presentedWith() { + return ""; + } + + @Override + public String with() { + return ""; + } + + @Override + public String inWhomManifestationsWereExcluded() { + return ""; + } + + @Override + public String duringFetal() { + return ""; + } + + @Override + public String asNeonate() { + return ""; + } + + @Override + public String atBirth() { + return ""; + } + + @Override + public String asInfant() { + return ""; + } + + 
@Override + public String inChildhood() { + return ""; + } + + @Override + public String asAdolescent() { + return ""; + } + + @Override + public String asAdult() { + return ""; + } + + @Override + public String asYoungAdult() { + return ""; + } + + @Override + public String asMiddleAge() { + return ""; + } + + @Override + public String asLateOnset() { + return ""; + } + + @Override + public String fromIso(Iso8601Age ppktAge) { + return ""; + } +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishPromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishPromptGenerator.java new file mode 100644 index 0000000..e93031a --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/turkish/TurkishPromptGenerator.java @@ -0,0 +1,104 @@ +package org.monarchinitiative.phenopacket2prompt.output.impl.turkish; + +import org.monarchinitiative.phenopacket2prompt.model.OntologyTerm; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAge; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; +import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import org.monarchinitiative.phenopacket2prompt.output.PPKtIndividualInfoGenerator; +import org.monarchinitiative.phenopacket2prompt.output.PhenopacketTextGenerator; +import org.monarchinitiative.phenopacket2prompt.output.PpktPhenotypicFeatureGenerator; +import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class TurkishPromptGenerator implements PromptGenerator { + + private final PPKtIndividualInfoGenerator ppktAgeSexGenerator; + + private final PhenopacketTextGenerator ppktTextGenerator; + + private final PpktPhenotypicFeatureGenerator ppktPhenotypicFeatureGenerator; + + + + public TurkishPromptGenerator(PpktPhenotypicFeatureGenerator pfgen) { + ppktAgeSexGenerator = new 
PpktIndividualTurkish(); + ppktTextGenerator = new PpktTextTurkish(); + this.ppktPhenotypicFeatureGenerator = pfgen; + } + + + + + @Override + public String queryHeader() { + return ppktTextGenerator.GPT_PROMPT_HEADER(); + } + + @Override + public String getIndividualInformation(PpktIndividual ppktIndividual) { + return this.ppktAgeSexGenerator.getIndividualDescription(ppktIndividual); + } + + @Override + public String formatFeatures(List ontologyTerms) { + return ppktPhenotypicFeatureGenerator.formatFeatures(ontologyTerms); + } + + @Override + public String getVignetteAtAge(PhenopacketAge page, PhenopacketSex psex, List terms) { + String ageString = this.ppktAgeSexGenerator.atAgeForVignette(page); + String person = switch (psex) { + case MALE -> "er"; + case FEMALE -> "sie"; + default -> "die betroffene Person"; + }; + return this.ppktPhenotypicFeatureGenerator.featuresAtEncounter(person, ageString, terms); + } + + @Override + public String getVignetteAtOnset(PpktIndividual individual){ + String person = switch (individual.getSex()) { + case MALE -> "Er"; + case FEMALE -> "Sie"; + default -> "Die betroffene Person"; + }; + return this.ppktPhenotypicFeatureGenerator.featuresAtOnset(person, individual.getPhenotypicFeaturesAtOnset()); + } + + + + @Override + public Set getMissingTranslations() { + return this.ppktPhenotypicFeatureGenerator.getMissingTranslations(); + } + + /** + * The following structure should work for most other languages, but the function + * can be overridden if necessary. 
+ * @param individual The individual for whom we are creating the prompt + * @return the prompt text + */ + @Override + public String createPrompt(PpktIndividual individual) { + String individualInfo = getIndividualInformation(individual); + // For creating the prompt, we first report the onset and the unspecified terms together, and then + String onsetDescription = getVignetteAtOnset(individual); + Map> pfMap = individual.extractSpecifiedAgePhenotypicFeatures(); + // We then report the rest, one for each specified time + //String onsetFeatures = formatFeatures(onsetTerms); + StringBuilder sb = new StringBuilder(); + sb.append(queryHeader()); + sb.append(individualInfo).append("\n").append(onsetDescription).append("\n"); + for (var entry: pfMap.entrySet()) { + String vignette = getVignetteAtAge(entry.getKey(), individual.getSex(), entry.getValue()); + sb.append(vignette).append("\n"); + } + return sb.toString(); + } + + + +} From dcc3d346f5d248a6f57ed4728c5a7a60ff7d75f4 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Mon, 24 Jun 2024 18:52:23 +0200 Subject: [PATCH 2/3] Batch text mine command --- docs/cases/PMID_19458539.txt | 4 +- docs/cases/PMID_25163805.txt | 17 ++ docs/cases/PMID_30400883.txt | 2 - docs/cases/PMID_30643655.txt | 1 - docs/cases/PMID_31213928.txt | 4 +- .../cmd/BatchMineCommand.java | 28 +-- .../cmd/GbtTranslateBatchCommand.java | 21 +- .../cmd/GptTranslateCommand.java | 6 + .../phenopacket2prompt/cmd/TestDrive.java | 184 ++++++++++++++++++ .../phenopacket2prompt/cmd/Utility.java | 23 +-- .../phenopacket2prompt/mining/CaseParser.java | 9 + .../mining/FenominalParser.java | 18 +- 12 files changed, 249 insertions(+), 68 deletions(-) create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java diff --git a/docs/cases/PMID_19458539.txt b/docs/cases/PMID_19458539.txt index 607cae1..7d1a675 100644 --- a/docs/cases/PMID_19458539.txt +++ b/docs/cases/PMID_19458539.txt @@ -2,7 +2,7 @@ pmid = PMID:19458539 title = 
Defective myotilin homodimerization caused by a novel mutation in MYOT exon 9 in the first Japanese limb girdle muscular dystrophy 1A patient [diagnosis] -disease_id = OMIM: -disease_label = +disease_id = OMIM:609200 +disease_label = Myopathy, myofibrillar, 3 [text] The 57-year-old female patient presented with gait disturbance. She started experiencing difficulty in standing up and climbing the stairs by age 41 years. Her condition gradually progressed, and by age 50 years, she could not walk long distances and could not stand up or climb stairs without support. Her deceased father and elder sister had a similar condition. Her sister was previously diagnosed as having sporadic inclusion body myositis, but further information could not be obtained. On examination, the patient had proximal dominant muscle weakness, especially in neck flexors, iliopsoas, hamstring, and quadriceps muscles (3/5 by manual muscle test), but no facial muscle weakness. She also showed a waddling gait and decreased deep tendon reflexes. Serum creatine kinase was mildly elevated (385 IU/L; normal, <200 IU/L). \ No newline at end of file diff --git a/docs/cases/PMID_25163805.txt b/docs/cases/PMID_25163805.txt index c793237..e6a25ce 100644 --- a/docs/cases/PMID_25163805.txt +++ b/docs/cases/PMID_25163805.txt @@ -5,3 +5,20 @@ title = Further delineation of Loeys-Dietz syndrome type 4 in a family with mild disease_id = OMIM:614816 disease_label = Loeys-Dietz syndrome 4 [text] +The proposita (Figure 1A) was a 57-year-old woman with family history remarkable for CTD in her two 39- and 34-year-old daughters. +She was born from unrelated Italian parents after an uneventful pregnancy and delivery. Her perinatal and psychomotor development was normal. +Two pregnancies resulted in the birth of two daughters. In the first pregnancy she reported premature membrane rupture at 34 weeks, and +in the second a threatened abortion. The spontaneous menopause was at 50 years. 
Clinical history presented widespread signs of CTD. +Since the childhood sub/luxations of the shoulders, wrists, knees and ankles were occurring. She suffered from acute articular rheumatism, +diagnosed at 9 years, and right relapsing inguinal hernia, surgically treated at 9, 25, and 40 years. Since her twenties, she referred +chronic generalized articular pain, mainly affecting her back, treated with NSAIDs. Magnetic resonance imaging (MRI), performed at 42 years, +revealed dural ectasia, lumbar discal hernias (L5-S1) and hypoplasia of the twelfth ribs (Figure 1B). Clinical history also included +crural hernia, surgically treated at 25 and 40 years, hiatal hernia with gastroesophageal reflux, chronic headache, gingival fragility, +and easy bruising. Ectopia lentis was excluded by ophthalmologic evaluation. ECG and echocardiography, performed at 49 years for tachycardia, +discovered paroxysmal supraventricular tachycardia and MVP with minimal regurgitation and normal systolic function (EF 65%). +Following this analysis she underwent cryoablation therapy. At this age the aortic root diameter was normal. On examination at 56 years, +she presented with normal stature (1,63 m), light blue sclerae, high arched palate, micrognathia, elongated philtrum, hypoplasic uvula, +doughy and hyperextensible skin over the neck, the forearm, and the elbows, old aging aspect, striae distensae over the hips, +joint hypermobility according to Beighton score (9/9), and scoliosis (Figure 1A). A brain, thoracic and abdominal MRA revealed +tortuosity and ectasia of carotid, vertebral, and cerebral arteries, and marked tortuosity of two segmental pulmonary arteries (Figure 1B). +No other vascular abnormalities were detected. 
\ No newline at end of file diff --git a/docs/cases/PMID_30400883.txt b/docs/cases/PMID_30400883.txt index b2daaf1..6dfbb18 100644 --- a/docs/cases/PMID_30400883.txt +++ b/docs/cases/PMID_30400883.txt @@ -4,8 +4,6 @@ title = A novel SLC6A8 mutation associated with intellectual disabilities in a C [diagnosis] disease_id = OMIM:300352 disease_label = Cerebral creatine deficiency syndrome 1 -[text] - [text] The proband was the second boy of healthy nonconsanguineous parents (pedigree in Fig. 1a). He was born at 39 weeks of gestation from an uneventful pregnancy and delivered by Caesarean section (weight, 3600 g; length, 50 cm; head circumference, 36 cm). diff --git a/docs/cases/PMID_30643655.txt b/docs/cases/PMID_30643655.txt index 7cb47d3..3c191b5 100644 --- a/docs/cases/PMID_30643655.txt +++ b/docs/cases/PMID_30643655.txt @@ -6,5 +6,4 @@ disease_id = OMIM:613115 disease_label = Neuropathy, hereditary sensory and autonomic, type IIB [text] Family 2: A 15-year-old boy (F2: IV: 1) presented with a history of frequent falls, unsteadiness, and pain insensitivity from an early age of 4 years [Figure 1(a)]. During the 5-year follow-up, he was hospitalized multiple times due to skin ulcers and osteomyelitis affecting his feet and toes. There was mild spasticity in the lower limbs with minimal pyramidal weakness (MRC4). Tendon reflexes were exaggerated with negative extensor response. - Touch, pinprick, temperature and vibration revealed mild impairment in the distal part of the lower extremities for all the affected’s from family 1 while it was normal for family 2. In both families, applying strong pressure to the Achilles tendon or touching the exposed bony areas was not followed by an adequate pain reflex. Neurophysiological findings were normal or mildly abnormal in family 1 in the early stages but follow-up studies revealed sensory axonal polyneuropathy predominantly in the lower limbs while it was normal for family 2. 
Sympathetic skin response and beat to beat variation were also found to be normal for family 2 but abnormal in family 1 indicating involvement of the autonomic nervous system. Cerebral MRI as well as other hematological and biochemical investigations was normal \ No newline at end of file diff --git a/docs/cases/PMID_31213928.txt b/docs/cases/PMID_31213928.txt index d70dab3..813de1c 100644 --- a/docs/cases/PMID_31213928.txt +++ b/docs/cases/PMID_31213928.txt @@ -2,8 +2,8 @@ pmid = PMID:31213928 title = A novel case report of spinal muscular atrophy with progressive myoclonic epilepsy from Iran [diagnosis] -disease_id = OMIM: -disease_label = +disease_id = OMIM:159950 +disease_label = Spinal muscular atrophy with progressive myoclonic epilepsy [text] A 15-year old female patient was attended to neurology clinic for recent onset tremor, seizure, and weakness in limbs. She was the first offspring of a family with relative parents; she was born from diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java index 42587a2..27640e0 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java @@ -6,8 +6,6 @@ import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.CorrectResult; -import org.monarchinitiative.phenopacket2prompt.output.PpktCopy; -import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; import picocli.CommandLine; import java.io.File; @@ -25,7 +23,7 @@ public class BatchMineCommand implements Callable { public String input = "docs/cases/"; // provide path for testing @CommandLine.Option(names = { "-o", "--output"}, description = "Path to output file dir(default: 
${DEFAULT-VALUE})") - private String output = "mined_out"; + private String output = Utility.TEXT_MINED_DIR; @CommandLine.Option(names = {"-e", "--exact"}, description = "Use exact matching algorithm") private boolean useExactMatching = false; @@ -49,27 +47,11 @@ public Integer call() throws Exception { if (! hpoJsonFile.isFile()) { System.out.printf("[ERROR] Could not find hp.json file at %s\nRun download command first\n", hpoJsonFile.getAbsolutePath()); } - File translationsFile = new File(translationsPath); - if (! translationsFile.isFile()) { - System.err.printf("Could not find translations file at %s. Try download command", translationsPath); - return 1; - } - Utility utility = new Utility(translationsFile); List individualList = getIndividualsFromTextMining(inDirectory,hpoJsonFile); - PromptGenerator spanish = utility.spanish(); - Utility.outputPromptsInternationalMining(individualList,"es", spanish); - // Dutch - PromptGenerator dutch = utility.dutch(); - Utility.outputPromptsInternationalMining(individualList,"nl", dutch); - // GERMAN - PromptGenerator german = utility.german(); - Utility.outputPromptsInternationalMining(individualList,"de", german); - // ITALIAN - PromptGenerator italian = utility.italian(); - Utility.outputPromptsInternationalMining(individualList,"it", italian); + Utility.createDir(output); + List correctResultList = Utility.outputPromptsEnglishFromIndividuals(individualList, output); // output file with correct diagnosis list - List correctResultList =Utility.outputPromptsEnglishFromIndividuals(individualList); Utility.outputCorrectTextmined(correctResultList); return 0; } @@ -83,7 +65,9 @@ public Integer call() throws Exception { protected List getIndividualsFromTextMining(File inDirectory, File hpoJsonFile) { FenominalParser parser = new FenominalParser(hpoJsonFile, useExactMatching); List caseBundleList = Utility.getAllCaseBundlesFromDirectory(inDirectory, parser); - return 
caseBundleList.stream().map(CaseBundle::individual).toList(); + return caseBundleList.stream(). + map(CaseBundle::individual). + toList(); } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java index a53fcf3..4a9c0bb 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java @@ -4,12 +4,7 @@ import org.monarchinitiative.phenol.base.PhenolRuntimeException; import org.monarchinitiative.phenol.io.OntologyLoader; import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.international.HpInternational; -import org.monarchinitiative.phenopacket2prompt.international.HpInternationalOboParser; -import org.monarchinitiative.phenopacket2prompt.mining.CaseBundle; import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; -import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease; -import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.CorrectResult; import org.monarchinitiative.phenopacket2prompt.output.PpktCopy; import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; @@ -17,16 +12,8 @@ import org.slf4j.LoggerFactory; import picocli.CommandLine; -import java.io.BufferedWriter; import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.Set; import java.util.concurrent.Callable; @CommandLine.Command(name = "batch", aliases = {"B"}, @@ -45,7 +32,7 @@ public class GbtTranslateBatchCommand implements Callable { @CommandLine.Option(names = {"-o", "--outdir"}, description = "path to 
outdir") - private String outdirname = "prompts"; + private String outdirname = Utility.PROMPT_DIR; @CommandLine.Option(names = {"-d", "--dir"}, description = "Path to directory with JSON phenopacket files", required = true) private String ppktDir; @@ -69,14 +56,13 @@ public Integer call() throws Exception { return 1; } Utility utility = new Utility(translationsFile); + // parse something List ppktFiles = Utility.getAllPhenopacketJsonFiles(ppktDir); Utility.createDir(outdirname); - List correctResultList = Utility.outputPromptsEnglish(ppktFiles, hpo); + List correctResultList = Utility.outputPromptsEnglish(ppktFiles); // output all non-English languages here // SPANISH - - /* PromptGenerator spanish = utility.spanish(); Utility.outputPromptsInternational(ppktFiles,"es", spanish); @@ -89,7 +75,6 @@ public Integer call() throws Exception { // ITALIAN PromptGenerator italian = utility.italian(); Utility.outputPromptsInternational(ppktFiles,"it", italian); -*/ PromptGenerator turkish = utility.turkish(); Utility.outputPromptsInternational(ppktFiles,"tr", turkish); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java index a31b049..795ea44 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java @@ -38,6 +38,12 @@ public class GptTranslateCommand implements Callable { private String languageCode; + @CommandLine.Option(names = {"--testdrive"}, + description = "Create a file with example translations in each of our languages") + private boolean testDrive = false; + + + @Override public Integer call() throws Exception { File hpJsonFile = new File(hpoJsonPath); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java new file mode 100644 index 0000000..9f5f941 --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java @@ -0,0 +1,184 @@ +package org.monarchinitiative.phenopacket2prompt.cmd; + + +import org.monarchinitiative.phenol.ontology.data.TermId; +import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import org.phenopackets.phenopackettools.builder.PhenopacketBuilder; +import org.phenopackets.phenopackettools.builder.builders.*; +import org.phenopackets.schema.v2.Phenopacket; +import org.phenopackets.schema.v2.core.*; + +import java.util.*; + +/** + * This class creates simulated phenopackets using a number of variants so that we can see the effects on + * our translations. + */ +public class TestDrive { + + + private final static Map observedMap; + private final static Map excludedMap; + + + private final static Random RANDOM = new Random(); + + static { + observedMap = new HashMap<>(); + observedMap.put(TermId.of("HP:0001272"), "Cerebellar atrophy"); + observedMap.put(TermId.of("HP:0001251"), "Ataxia"); + observedMap.put(TermId.of("HP:0100259"), "Postaxial polydactyly"); + observedMap.put(TermId.of("HP:0002240"), "Hepatomegaly"); + observedMap.put(TermId.of("HP:0001888"), "Lymphopenia"); + observedMap.put(TermId.of("HP:0002090"), "Pneumonia"); + observedMap.put(TermId.of("HP:0002720"), "Decreased circulating IgA level"); + observedMap.put(TermId.of("HP:0002850"), "Decreased circulating total IgM"); + observedMap.put(TermId.of("HP:0001609"), "Hoarse voice"); + excludedMap = new HashMap<>(); + excludedMap.put(TermId.of("HP:0031843"), "Bradyphrenia"); + excludedMap.put(TermId.of("HP:0003228"), "Hypernatremia"); + excludedMap.put(TermId.of("HP:0002900"), "Hypokalemia"); + excludedMap.put(TermId.of("HP:0001629"), "Ventricular septal defect"); + excludedMap.put(TermId.of("HP:0000083"), "Renal insufficiency"); + } + + private PhenotypicFeature 
generatePF(TermId tid, String label, TimeElement telem, boolean excluded) { + PhenotypicFeatureBuilder builder = PhenotypicFeatureBuilder.builder(tid.getValue(), label); + if (telem != null) { + builder.onset(telem); + } + if (excluded) { + builder.excluded(); + } + return builder.build(); + } + + private PhenotypicFeature generatePF(TermId tid, String label, TimeElement telem) { + return generatePF(tid, label, telem, false); + } + + private PhenotypicFeature generatePF(TermId tid, String label) { + return generatePF(tid, label, null); + } + + public static String generateRandomPassword(int len) { + String chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijk" + +"lmnopqrstuvwxyz!@#$%&"; + + StringBuilder sb = new StringBuilder(len); + for (int i = 0; i < len; i++) + sb.append(chars.charAt(RANDOM.nextInt(chars.length()))); + return sb.toString(); + } + + + + + private final static PhenotypicFeature e1 = PhenotypicFeatureBuilder.builder("", "").excluded().build(); + private final static PhenotypicFeature e2 = PhenotypicFeatureBuilder.builder("", "").excluded().build(); + private final static PhenotypicFeature e3 = PhenotypicFeatureBuilder.builder("", "").excluded().build(); + private final static PhenotypicFeature e4 = PhenotypicFeatureBuilder.builder("", "").excluded().build(); + private final static PhenotypicFeature e5 = PhenotypicFeatureBuilder.builder("HP:0001395", "Hepatic fibrosis").excluded().build(); + private final static List excludedFeatureList = List.of(e1,e2, e3, e4, e5); + + + private final static Disease d1 = DiseaseBuilder.builder("OMIM:162200", "Neurofibromatosis, type 1").build(); + private final static Disease d2 = DiseaseBuilder.builder("OMIM:613224", "Noonan syndrome 6").onset(TimeElements.antenatalOnset()).build(); + private final static Disease d3 = DiseaseBuilder.builder("OMIM:113620", "Branchiooculofacial syndrome").onset(TimeElements.congenitalOnset()).build(); + private final static Disease d4 = DiseaseBuilder.builder("OMIM:220150", 
"Hypouricemia, renal, 1").onset(TimeElements.childhoodOnset()).build(); + private final static Disease d5 = DiseaseBuilder.builder("OMIM:154700", "Marfan syndrome").onset(TimeElements.age("P12Y4M")).build(); + private final static Disease d6 = DiseaseBuilder.builder("OMIM:109150", "Machado-Joseph disease").onset(TimeElements.age("P42Y")).build(); + private final static Disease d7 = DiseaseBuilder.builder("OMIM:605275", "Noonan syndrome 2").onset(TimeElements.age("P2D")).build(); + + private final static List diseaseList = List.of(d1, d2, d3, d4, d5, d6, d7); + + private final static Individual s1 = IndividualBuilder.builder("individual.1").female().ageAtLastEncounter("P46Y").build(); + private final static Individual s2 = IndividualBuilder.builder("individual.2").male().ageAtLastEncounter("P46Y").build(); + private final static Individual s3 = IndividualBuilder.builder("individual.3").unknownSex().ageAtLastEncounter("P46Y").build(); + private final static Individual s4 = IndividualBuilder.builder("individual.4").female().ageAtLastEncounter(TimeElements.juvenileOnset()).build(); + private final static Individual s5 = IndividualBuilder.builder("individual.5").male().ageAtLastEncounter(TimeElements.middleAgeOnset()).build(); + private final static Individual s6 = IndividualBuilder.builder("individual.6").unknownSex().ageAtLastEncounter(TimeElements.adultOnset()).build(); + private final static Individual s7 = IndividualBuilder.builder("individual.7").female().build(); + private final static Individual s8 = IndividualBuilder.builder("individual.8").male().build(); + private final static Individual s9 = IndividualBuilder.builder("individual.9").unknownSex().build(); + + private final static List individualList = List.of(s1,s2,s3,s4,s5,s6,s7,s8,s9); + + private final static MetaData metadata = MetaDataBuilder.builder("curator").build(); + + + private final List ppktIndividuals; + + + private boolean randomChoice(double t) { + double randomValue = RANDOM.nextDouble(); // 
returns double between 0 and 1 + return randomValue < t; + } + + + + public TestDrive() { + ppktIndividuals = new ArrayList<>(); + for (Disease d : diseaseList) { + for (Individual i: individualList) { + String randomId = generateRandomPassword(20); + PhenopacketBuilder builder = PhenopacketBuilder.create(randomId, metadata); + builder.individual(i).addDisease(d); + // Add some terms at age of onset + TimeElement onst = d.getOnset(); + List tidList = new ArrayList<>(observedMap.keySet()); + Collections.shuffle(tidList); + int randomIndex = RANDOM.nextInt(tidList.size()); + if (randomChoice(0.8)) { + for (int ii=0; ii(excludedMap.keySet()); + Collections.shuffle(tidList); + randomIndex = RANDOM.nextInt(tidList.size()); + if (randomChoice(0.8)) { + for (int ii=0; ii internationalMap ; @@ -148,7 +147,8 @@ public static void outputPromptsInternationalFromIndividualList(List diseaseList = individual.getDiseases(); if (diseaseList.size() != 1) { - String errmsg = String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()); + String errmsg = String.format("[ERROR] Got %d diseases for \"%s\".\n", diseaseList.size(), + individual.getPhenopacketId()); throw new PhenolRuntimeException(errmsg); } PhenopacketDisease pdisease = diseaseList.getFirst(); @@ -159,7 +159,9 @@ public static void outputPromptsInternationalFromIndividualList(List individ - public static List outputPromptsEnglish(List ppktFiles, Ontology hpo) { + public static List outputPromptsEnglish(List ppktFiles) { Utility.createDir("prompts/en"); List correctResultList = new ArrayList<>(); PromptGenerator generator = PromptGenerator.english(); @@ -205,7 +207,6 @@ public static List outputPromptsEnglish(List ppktFiles, Ont } PhenopacketDisease pdisease = diseaseList.getFirst(); String promptFileName = Utility.getFileName( individual.getPhenopacketId(), "en"); - String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), 
promptFileName, f.getAbsolutePath()); try { String prompt = generator.createPrompt(individual); Utility.outputPromptFromCaseBundle(prompt, promptFileName, "prompts/en"); @@ -221,8 +222,8 @@ public static List outputPromptsEnglish(List ppktFiles, Ont } - public static List outputPromptsEnglishFromIndividuals(List individualList) { - var outd = TEXT_MINED_DIR + File.separator + "en"; + public static List outputPromptsEnglishFromIndividuals(List individualList, String outputDir) { + String outd = outputDir + File.separator + "en"; Utility.createDir(outd); List correctResultList = new ArrayList<>(); PromptGenerator generator = PromptGenerator.english(); @@ -237,7 +238,7 @@ public static List outputPromptsEnglishFromIndividuals(List getCaseBundleList(String inputFile, FenominalPars CaseParser caseParser = new CaseParser(Path.of(inputFile)); List caseList = caseParser.getCaseList(); for (Case cs : caseList) { - Phenopacket ppkt = fenominalParser.parse(cs.caseText()); + Phenopacket ppkt = fenominalParser.parse(cs); PpktIndividual individual = new PpktIndividual(ppkt); caseBundleList.add(new CaseBundle(cs, ppkt, individual)); } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/CaseParser.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/CaseParser.java index c9f60f2..a3957cb 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/CaseParser.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/CaseParser.java @@ -15,6 +15,13 @@ public class CaseParser { private final List caseList; + + private void dumpLines(List lines) { + for (var l : lines) { + System.err.println(l); + } + } + public CaseParser(Path path) { try { List lines = Files.readAllLines(path); @@ -26,12 +33,14 @@ public CaseParser(Path path) { throw new PhenolRuntimeException("Malformed first case line:" + line); } if (lines.size() < 8) { + dumpLines(lines); throw new PhenolRuntimeException("Case report too short"); } String pmid = 
getPMID(lines.get(1)); String title = getTitle(lines.get(2)); line = lines.get(3).trim(); if (! line.equals("[diagnosis]")) { + dumpLines(lines); throw new PhenolRuntimeException("Malformed [diagnosis] line:" + line); } String disease_id = getDiseaseId (lines.get(4)); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/FenominalParser.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/FenominalParser.java index a2bb9e0..9925563 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/FenominalParser.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/mining/FenominalParser.java @@ -9,6 +9,7 @@ import org.monarchinitiative.phenol.ontology.data.Ontology; import org.monarchinitiative.phenol.ontology.data.TermId; import org.phenopackets.phenopackettools.builder.PhenopacketBuilder; +import org.phenopackets.phenopackettools.builder.builders.DiseaseBuilder; import org.phenopackets.phenopackettools.builder.builders.IndividualBuilder; import org.phenopackets.phenopackettools.builder.builders.MetaDataBuilder; import org.phenopackets.phenopackettools.builder.builders.PhenotypicFeatureBuilder; @@ -95,8 +96,11 @@ private String getSex(String content) { } - private Phenopacket generatePhenopacket(List simpleTermList , String sex) { - PhenopacketBuilder builder = PhenopacketBuilder.create(this.pmid, metadata); + + public Phenopacket parse(Case cs) { + List simpleTermList = parseHpoTerms(cs.caseText()); + String sex = getSex(cs.caseText()); + PhenopacketBuilder builder = PhenopacketBuilder.create(cs.pmid(), metadata); Individual subject; if (sex != null && sex.equals("male")) { subject = IndividualBuilder.builder("individual").male().build(); @@ -113,15 +117,9 @@ private Phenopacket generatePhenopacket(List simpleTermList , String } builder.addPhenotypicFeature(pfb.build()); } + DiseaseBuilder dbuilder = DiseaseBuilder.builder(cs.disease_id(), cs.disease_label()); + builder.addDisease(dbuilder.build()); 
return builder.build(); - - } - - - public Phenopacket parse(String content) { - List simpleTermList = parseHpoTerms(content); - String optSex = getSex(content); - return generatePhenopacket(simpleTermList, optSex); } From 9e52e0a0963a03dbf9606da6992102d12a62f50a Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Mon, 24 Jun 2024 19:23:34 +0200 Subject: [PATCH 3/3] test drive --- .../phenopacket2prompt/Main.java | 1 + .../{TestDrive.java => TestDriveCommand.java} | 116 +++++++++++++++++- .../output/PromptGenerator.java | 13 +- .../impl/english/EnglishPromptGenerator.java | 3 +- .../spanish/PpktPhenotypicfeatureSpanish.java | 1 + 5 files changed, 129 insertions(+), 5 deletions(-) rename src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/{TestDrive.java => TestDriveCommand.java} (65%) diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java index 4c98cb2..0a994dd 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java @@ -21,6 +21,7 @@ public static void main(String[] args){ .addSubcommand("prompt", new PromptCommand()) .addSubcommand("mine", new TextMineCommand()) .addSubcommand("batchmine", new BatchMineCommand()) + .addSubcommand("testdrive", new TestDriveCommand()) .addSubcommand("translate", new GptTranslateCommand()) ; cline.setToggleBooleanFlags(false); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDriveCommand.java similarity index 65% rename from src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java rename to src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDriveCommand.java index 9f5f941..09d8020 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDrive.java +++ 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TestDriveCommand.java @@ -1,20 +1,52 @@ package org.monarchinitiative.phenopacket2prompt.cmd; +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenol.io.OntologyLoader; +import org.monarchinitiative.phenol.ontology.data.Ontology; import org.monarchinitiative.phenol.ontology.data.TermId; +import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; import org.phenopackets.phenopackettools.builder.PhenopacketBuilder; import org.phenopackets.phenopackettools.builder.builders.*; import org.phenopackets.schema.v2.Phenopacket; import org.phenopackets.schema.v2.core.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; import java.util.*; +import java.util.concurrent.Callable; /** * This class creates simulated phenopackets using a number of variants so that we can see the effects on * our translations. 
*/ -public class TestDrive { + +@CommandLine.Command(name = "testdrive", + mixinStandardHelpOptions = true, + description = "Create varied prompts from simulated data and create a file for manual review") +public class TestDriveCommand implements Callable { + private static final Logger LOGGER = LoggerFactory.getLogger(TestDriveCommand.class); + + @CommandLine.Option(names = {"--hp"}, + description = "path to HP json file") + private String hpoJsonPath = "data/hp.json"; + + @CommandLine.Option(names = {"--translations"}, + description = "path to translations file") + private String translationsPath = "data/hp-international.obo"; + + @CommandLine.Option(names = {"-o", "--outfile"}, + description = "outfile name (default {DEFAULT-VALUE})") + private String outfileName = "p2p_test.txt"; + + + private final static Map observedMap; @@ -117,7 +149,7 @@ private boolean randomChoice(double t) { - public TestDrive() { + public TestDriveCommand() { ppktIndividuals = new ArrayList<>(); for (Disease d : diseaseList) { for (Individual i: individualList) { @@ -174,11 +206,91 @@ public TestDrive() { } + public List getPpktIndividuals() { + return ppktIndividuals; + } + + @Override + public Integer call() throws Exception { + java.io.File hpJsonFile = new java.io.File(hpoJsonPath); + boolean useExactMatching = true; + if (! hpJsonFile.isFile()) { + throw new PhenolRuntimeException("Could not find hp.json at " + hpJsonFile.getAbsolutePath()); + } + Ontology hpo = OntologyLoader.loadOntology(hpJsonFile); + LOGGER.info("HPO version {}", hpo.version().orElse("n/a")); + FenominalParser parser = new FenominalParser(hpJsonFile, useExactMatching); + java.io.File translationsFile = new java.io.File(translationsPath); + if (! translationsFile.isFile()) { + System.err.printf("Could not find translations file at %s. 
Try download command", translationsPath); + return 1; + } + List individualList = getPpktIndividuals(); + final String HEADER_LINE = "*******************************************\n\n"; + StringBuilder sb = new StringBuilder(); + + String engText = createPrompts(individualList, PromptGenerator.english()); + sb.append("English\n"); + sb.append(HEADER_LINE); + sb.append(engText); + + Utility utility = new Utility(translationsFile); + PromptGenerator spanish = utility.spanish(); + String spText = createPrompts(individualList, spanish); + sb.append("Spanish\n"); + sb.append(HEADER_LINE); + sb.append(spText); + String nlText = createPrompts(individualList, utility.dutch()); + sb.append("Dutch\n"); + sb.append(HEADER_LINE); + sb.append(nlText); + // GERMAN + PromptGenerator german = utility.german(); + String deText = createPrompts(individualList, german); + sb.append("German\n"); + sb.append(HEADER_LINE); + sb.append(deText); + String itText = createPrompts(individualList, utility.italian()); + sb.append("Italian\n"); + sb.append(HEADER_LINE); + sb.append(itText); + // ITALIAN + String trText = createPrompts(individualList, utility.turkish()); + sb.append("Turkish\n"); + sb.append(HEADER_LINE); + sb.append(trText); + System.out.println(sb.toString()); + System.out.println("Wrote to " + outfileName); + try { + Files.write(Paths.get(outfileName), sb.toString().getBytes()); + } catch (IOException e) { + e.printStackTrace(); + } + return 0; + } + + + private String createPrompts(List individualList, PromptGenerator generator) { + StringBuilder sb = new StringBuilder(); + for (PpktIndividual individual : individualList) { + if (individual.hasExcludedPhenotypeFeatureAtOnset() ||individual.hasObservedPhenotypeFeatureAtOnset()) { + String prompt = generator.createPrompt(individual); + sb.append(prompt).append("\n\n"); + } else { + System.err.println("[WARN] No HPO terms found for " + individual.getPhenopacketId()); + } + } + + + return sb.toString(); + } + + } diff --git 
a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java index 96f0ccd..af8505b 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java @@ -75,6 +75,12 @@ static PromptGenerator turkish(HpInternational international) { * @return the prompt text */ default String createPrompt(PpktIndividual individual) { + return String.format("%s%s", + getHeader(), + createPromptWithoutHeader(individual)); + } + // TODO IMPLEMENT EVERYWHERE. WE ALSO NEED VERSIONS FOR EACH LLM, CONSIDER ADDING ENUM + default String createPromptWithoutHeader(PpktIndividual individual) { String individualInfo = getIndividualInformation(individual); // For creating the prompt, we first report the onset and the unspecified terms together, and then List onsetTerms = individual.getPhenotypicFeaturesAtOnset(); @@ -82,7 +88,7 @@ default String createPrompt(PpktIndividual individual) { // We then report the rest, one for each specified time String onsetFeatures = formatFeatures(onsetTerms); StringBuilder sb = new StringBuilder(); - sb.append(queryHeader()); + sb.append(individualInfo).append(" ").append(onsetFeatures); for (var entry: pfMap.entrySet()) { String vignette = getVignetteAtAge(entry.getKey(), individual.getSex(), entry.getValue()); @@ -91,6 +97,11 @@ default String createPrompt(PpktIndividual individual) { return sb.toString(); } + // TODO IMPLEMENT EVERYWHERE. 
WE ALSO NEED VERSIONS FOR EACH LLM, CONSIDER ADDING ENUM + default String getHeader() { + return queryHeader(); + } + default Set getMissingTranslations() { return Set.of(); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java index 01250bd..79bc4d5 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java @@ -74,7 +74,7 @@ public String getVignetteAtOnset(PpktIndividual individual) { * @return the prompt text */ @Override - public String createPrompt(PpktIndividual individual) { + public String createPromptWithoutHeader(PpktIndividual individual) { String individualInfo = getIndividualInformation(individual); // For creating the prompt, we first report the onset and the unspecified terms together, and then String onsetDescription = getVignetteAtOnset(individual); @@ -82,7 +82,6 @@ public String createPrompt(PpktIndividual individual) { // We then report the rest, one for each specified time //String onsetFeatures = formatFeatures(onsetTerms); StringBuilder sb = new StringBuilder(); - sb.append(queryHeader()); sb.append(individualInfo).append("\n").append(onsetDescription).append("\n"); for (var entry: pfMap.entrySet()) { String vignette = getVignetteAtAge(entry.getKey(), individual.getSex(), entry.getValue()); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java index 62710ce..42a37fb 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java +++ 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java @@ -58,6 +58,7 @@ String getConnector(String nextWord) { private String getCommaList(List items) { + if (items.isEmpty()) return ""; // will be filtered out if (items.size() == 1) { return items.getFirst(); }