From c79d73fdd9a57de9eab5a3ddb0126af5bf02b34f Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 5 Nov 2024 16:05:22 +0000 Subject: [PATCH] Fixes for: * https://github.com/CatalogueOfLife/data/issues/719 * https://github.com/CatalogueOfLife/data/issues/722 Work for * https://github.com/CatalogueOfLife/backend/issues/1350 Fixes for prefix mapping and metadata loading --- matching-ws/Dockerfile | 9 +- matching-ws/pom.xml | 14 +- .../matching/MatchingApplication.java | 5 +- .../matching/controller/MatchController.java | 54 +-- .../matching/index/DatasetIndex.java | 310 ++++++++++++------ .../catalogue/matching/index/NameNRank.java | 30 +- .../catalogue/matching/model/APIMetadata.java | 3 - .../catalogue/matching/model/Dataset.java | 6 + .../matching/model/NameUsageMatch.java | 89 ++++- .../matching/model/NameUsageQuery.java | 8 +- .../matching/model/StoredParsedName.java | 64 ++++ .../matching/service/IndexingService.java | 124 +++++-- .../matching/service/MatchingService.java | 32 +- .../catalogue/matching/util/IUCNUtils.java | 40 +++ .../matching/util/IndexConstants.java | 2 + matching-ws/src/main/resources/datasets.json | 29 +- .../life/catalogue/matching/IDMatchingIT.java | 2 +- .../catalogue/matching/IDMatchingTest.java | 75 +++++ .../catalogue/matching/NameUsageBuilder.java | 4 +- 19 files changed, 669 insertions(+), 231 deletions(-) create mode 100644 matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java create mode 100644 matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java create mode 100644 matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java diff --git a/matching-ws/Dockerfile b/matching-ws/Dockerfile index 4a0ebe218..224d901d8 100644 --- a/matching-ws/Dockerfile +++ b/matching-ws/Dockerfile @@ -27,14 +27,11 @@ RUN git clone https://github.com/CatalogueOfLife/backend.git WORKDIR /app/backend RUN git checkout $GIT_BRANCH -# Build all the CLB modules -RUN mvn clean install package -DskipTests - # Build the Maven project and create a exec file WORKDIR /app/backend/matching-ws -# Run tests - full backend tests require additional services (e.g. 
ES) -RUN mvn clean install package +# Build all the CLB modules +RUN mvn clean install package -DskipTests -DskipITs # Store git commit id and log RUN curl -o /app/backend/git.json -H "Accept: application/vnd.github+json" "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)" @@ -106,4 +103,4 @@ RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT USER $USER EXPOSE $SERVER_PORT -CMD java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --working.dir=/opt/gbif/$APP_ARTIFACT --mode=RUN --spring.cloud.bootstrap.location=/opt/gbif/$APP_ARTIFACT/bootstrap.yaml \ No newline at end of file +CMD java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --working.dir=/opt/gbif/$APP_ARTIFACT/ --mode=RUN --spring.cloud.bootstrap.location=/opt/gbif/$APP_ARTIFACT/bootstrap.yaml \ No newline at end of file diff --git a/matching-ws/pom.xml b/matching-ws/pom.xml index 19ca20d59..4da425756 100644 --- a/matching-ws/pom.xml +++ b/matching-ws/pom.xml @@ -15,6 +15,7 @@ 11 9.10.0 2.7.18 + 2021.0.9 1.18.22 2.43.0 1.2.13 @@ -284,11 +285,6 @@ spring-boot-starter-validation ${spring-boot.version} - - net.openhft - chronicle-map - 3.25ea6 - org.springframework.boot spring-boot-starter-web @@ -356,14 +352,6 @@ logstash-logback-encoder ${logstash-logback.version} - - - - - - - - org.springframework.boot spring-boot-configuration-processor diff --git a/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java index e964e2e02..3f7820022 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java +++ b/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java @@ -87,7 +87,7 @@ public void run(ApplicationArguments args) { } private void initialiseWebapp() { - Optional metadata = matchingService.getAPIMetadata(false); + Optional metadata = matchingService.getAPIMetadata(true); if (metadata.isEmpty()) { log.error("No main index found. 
Cannot start web services"); return; @@ -136,7 +136,8 @@ private void runIndexingIfRequired(ApplicationArguments args) throws Exception { indexingService.indexIdentifiers(id); } - log.info("Indexing completed"); + matchingService.getAPIMetadata(true); + log.info("Indexing ready"); } private ExecutionMode getMode(ApplicationArguments args) { diff --git a/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java b/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java index de51ebc67..c2c2df057 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java +++ b/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java @@ -15,6 +15,7 @@ import java.util.stream.Collectors; import life.catalogue.matching.model.*; import life.catalogue.matching.service.MatchingService; +import life.catalogue.matching.util.IUCNUtils; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -112,9 +113,13 @@ public NameUsageMatch matchOldPaths( HttpServletRequest response) { return matchV2( usageKey, - taxonID,taxonConceptID,scientificNameID, - scientificName2, scientificName, - authorship, authorship2, + taxonID, + taxonConceptID, + scientificNameID, + scientificName2, + scientificName, + authorship, + authorship2, genericName, specificEpithet, infraspecificEpithet, @@ -279,6 +284,7 @@ public NameUsageMatch matchV2( taxonID, taxonConceptID, scientificNameID, + scientificName, scientificName2, authorship, @@ -288,6 +294,7 @@ public NameUsageMatch matchV2( infraspecificEpithet, rank, rank2, + classification, exclude, strict, @@ -428,6 +435,7 @@ public Object matchFlatV1( taxonID, taxonConceptID, scientificNameID, + scientificName, scientificName2, authorship, @@ -437,6 +445,7 @@ public Object matchFlatV1( infraspecificEpithet, rank, rank2, + classification, exclude != null ? exclude.stream().map(Object::toString).collect(Collectors.toSet()) : Set.of(), strict, @@ -598,6 +607,7 @@ public Object matchV1( taxonID, taxonConceptID, scientificNameID, + scientificName, scientificName2, authorship, @@ -607,6 +617,7 @@ public Object matchV1( infraspecificEpithet, rank, rank2, + classification, exclude != null ? exclude.stream().map(Object::toString).collect(Collectors.toSet()) : Set.of(), strict, @@ -671,7 +682,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi return Map.of(); } NameUsageMatch.Status status = statusList.get(0); - String formatted = formatIucn(status.getStatus()); + String formatted = IUCNUtils.formatIucn(status.getStatus()); if (formatted == null || formatted.isEmpty()) { return Map.of(); } @@ -679,7 +690,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi String scientificName = match.getAcceptedUsage() != null ? 
match.getAcceptedUsage().getCanonicalName() : match.getUsage().getCanonicalName(); try { - IUCN iucn = IUCN.valueOf(formatted); // throws IllegalArgumentException if not found + IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formatted); // throws IllegalArgumentException if not found watch.stop(); log("v1/species/iucnRedListCategory", usageKey, watch); return Map.of( @@ -689,7 +700,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi "taxonomicStatus", NameUsageMatchV1.TaxonomicStatusV1.convert( match.getDiagnostics().getStatus()), "iucnTaxonID", status.getSourceId(), - "code", iucn.code + "code", iucn.getCode() ); } catch (IllegalArgumentException e) { log.error("IUCN category not found: {}", formatted, e); @@ -751,37 +762,6 @@ private static void addIfNotNull(StringJoiner joiner, Object value) { } } - String formatIucn(String original){ - if (original == null) { - return null; - } - // Trim the string - String trimmed = original.trim(); - // Convert to uppercase - String uppercased = trimmed.toUpperCase(); - // Replace any whitespace with a single underscore - return uppercased.replaceAll("\\s+", "_"); - } - - enum IUCN { - EXTINCT("EX"), - EXTINCT_IN_THE_WILD("EW"), - CRITICALLY_ENDANGERED ("CR"), - ENDANGERED ("EN"), - VULNERABLE ("VU"), - NEAR_THREATENED ("NT"), - CONSERVATION_DEPENDENT ("CD"), - LEAST_CONCERN ("LC"), - DATA_DEFICIENT ("DD"), - NOT_EVALUATED ("NE"); - - private final String code; - - IUCN(String code) { - this.code = code; - } - } - @Data @Builder @NoArgsConstructor diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java index b75dfe85a..23f46f585 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java +++ b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java @@ -2,8 +2,10 @@ import static life.catalogue.matching.util.IndexConstants.DATASETS_JSON; import static life.catalogue.matching.util.IndexConstants.*; +import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.DeserializationFeature; import java.io.File; import java.io.FileReader; import java.io.IOException; @@ -11,6 +13,7 @@ import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.nio.file.attribute.BasicFileAttributes; import java.time.Instant; import java.time.ZoneId; @@ -21,20 +24,23 @@ import life.catalogue.api.vocab.MatchType; import life.catalogue.api.vocab.TaxonomicStatus; import life.catalogue.matching.model.*; +import life.catalogue.matching.util.IUCNUtils; import life.catalogue.matching.util.LuceneUtils; import life.catalogue.matching.Main; import lombok.extern.slf4j.Slf4j; import org.apache.lucene.document.Document; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.*; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.util.BytesRef; + import org.gbif.nameparser.api.NomCode; import org.gbif.nameparser.api.Rank; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.springframework.beans.factory.annotation.Value; import 
org.springframework.stereotype.Service; @@ -57,10 +63,12 @@ public class DatasetIndex { protected static final ScientificNameAnalyzer scientificNameAnalyzer = new ScientificNameAnalyzer(); + protected static final ObjectMapper MAPPER = new ObjectMapper(); + @Value("${index.path:/data/matching-ws/index}") String indexPath; - @Value("${working.path:/tmp/}") + @Value("${working.dir:/tmp/}") String workingDir; private boolean isInitialised = false; @@ -77,16 +85,11 @@ public boolean getIsInitialised() { .build()) .build(); - public boolean exists(String indexPath) { - return new File(indexPath).exists() - && new File(indexPath + "/" + MAIN_INDEX_DIR).exists() - && Objects.requireNonNull(new File(indexPath + "/" + MAIN_INDEX_DIR).listFiles()).length > 0; - } - /** Attempts to read the index from disk if it exists. */ @PostConstruct void init() { + MAPPER.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final String mainIndexPath = getMainIndexPath(); final Map prefixMapping = loadPrefixMapping(); @@ -251,6 +254,31 @@ public APIMetadata getAPIMetadata(){ return metadata; } + public static List distinctValuesForField(String field, String indexPath) throws Exception { + + List distinctValues = new ArrayList<>(); + FSDirectory directory = FSDirectory.open(Paths.get(indexPath)); + + try (DirectoryReader directoryReader = DirectoryReader.open(directory)) { + + // Get the field terms + for (LeafReaderContext leafContext : directoryReader.leaves()) { + LeafReader leafReader = leafContext.reader(); + Terms terms = leafReader.terms(field); + + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + BytesRef byteRef; + while ((byteRef = termsEnum.next()) != null) { + String termValue = byteRef.utf8ToString(); + distinctValues.add(termValue); + } + } + } + } + return distinctValues; + } + /** * Returns the metadata of the index. This includes the number of taxa, the size on disk, the * dataset title and key, and the build information. 
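Note on the new DatasetIndex.distinctValuesForField helper introduced above: it enumerates the stored terms of a field across all index segments, and the reworked getIndexMetadata hunk below uses it to build the per-rank counts instead of a hard-coded rank list. A minimal usage sketch, assuming a local main index; the path is an assumed example location, not a required layout:

import java.util.List;

import life.catalogue.matching.index.DatasetIndex;

// Sketch: list the distinct values of the "rank" field (IndexConstants.FIELD_RANK)
// stored in a main index directory. The index path below is an assumed example.
public class RankTermsSketch {
  public static void main(String[] args) throws Exception {
    List<String> ranks = DatasetIndex.distinctValuesForField("rank", "/data/matching-ws/index/main");
    ranks.forEach(System.out::println);
  }
}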
@@ -290,16 +318,16 @@ private IndexMetadata getIndexMetadata(String indexPath, IndexSearcher searcher, try { Map rankCounts = new LinkedHashMap<>(); - rankCounts.put(Rank.KINGDOM.name(), getCountForRank(searcher, Rank.KINGDOM)); - rankCounts.put(Rank.PHYLUM.name(), getCountForRank(searcher, Rank.PHYLUM)); - rankCounts.put(Rank.CLASS.name(), getCountForRank(searcher, Rank.CLASS)); - rankCounts.put(Rank.ORDER.name(), getCountForRank(searcher, Rank.ORDER)); - rankCounts.put(Rank.FAMILY.name(), getCountForRank(searcher, Rank.FAMILY)); - rankCounts.put(Rank.GENUS.name(), getCountForRank(searcher, Rank.GENUS)); - rankCounts.put(Rank.SPECIES.name(), getCountForRank(searcher, Rank.SPECIES)); - rankCounts.put(Rank.SUBSPECIES.name(), getCountForRank(searcher, Rank.SUBSPECIES)); + distinctValuesForField(FIELD_RANK, indexPath).stream().sorted( (a, b) -> Rank.valueOf(a).ordinal() - Rank.valueOf(b).ordinal() + ).forEach(rank -> { + try { + rankCounts.put(rank, getCountForRank(searcher, rank)); + } catch (IOException e) { + log.error("Cannot read index information", e); + } + }); metadata.setNameUsageByRankCount(rankCounts); - } catch (IOException e) { + } catch (Exception e) { log.error("Cannot read index information", e); } return metadata; @@ -311,7 +339,7 @@ private IndexMetadata getIndexMetadata(String indexPath, IndexSearcher searcher, */ private Optional getGitInfo() { ObjectMapper mapper = new ObjectMapper(); - final String filePath = workingDir + "/" + GIT_JSON; + final String filePath = workingDir + GIT_JSON; try { if (new File(filePath).exists()) { // Read JSON file and parse to JsonNode @@ -358,7 +386,7 @@ public Map getDatasetInfo(String indexPath) { try { if (new File(filePath).exists()){ - log.info("Loading dataset info from {}", filePath); + log.debug("Loading dataset info from {}", filePath); // Read JSON file and parse to JsonNode JsonNode rootNode = mapper.readTree(new File(filePath)); // Navigate to the author node @@ -383,8 +411,8 @@ public Map getDatasetInfo(String indexPath) { return Map.of(); } - private long getCountForRank(IndexSearcher searcher, Rank rank) throws IOException { - Query query = new TermQuery(new Term(FIELD_RANK, rank.name())); + private long getCountForRank(IndexSearcher searcher, String rank) throws IOException { + Query query = new TermQuery(new Term(FIELD_RANK, rank)); return searcher.search(query, new TotalHitCountCollectorManager()); } @@ -416,22 +444,6 @@ public NameUsageMatch matchByUsageKey(String usageKey) { return matchByKey(usageKey, this::getByUsageKey); } - private static String escapeQueryChars(String s) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < s.length(); i++) { - char c = s.charAt(i); - // These are the special characters that need to be escaped - if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || - c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || - c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || - c == '/' || Character.isWhitespace(c)) { - sb.append('\\'); - } - sb.append(c); - } - return sb.toString(); - } - private Optional getByUsageKey(String usageKey) { Query query = new TermQuery(new Term(FIELD_ID, usageKey)); try { @@ -541,12 +553,36 @@ public List lookupIdentifier(@NotNull String identifier) { * @return List of ExternalID */ public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier) { + return lookupIdentifier(datasetID, identifier, identifierSearchers); + } + + /** + * Matches an external ID. 
Intended for debug purposes only, to quickly + * check if ids are present and joined to main index or not. + * + * @param datasetID the datasetKey to match + * @param identifier the identifier to match + * @return List of ExternalID + */ + public List lookupAncillary(@NotNull String datasetID, @NotNull String identifier) { + return lookupIdentifier(datasetID, identifier, ancillarySearchers); + } + + /** + * Matches an external ID. Intended for debug purposes only, to quickly + * check if ids are present and joined to main index or not. + * + * @param datasetID the datasetKey to match + * @param identifier the identifier to match + * @return List of ExternalID + */ + public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier, Map searchers) { List results = new ArrayList<>(); try { // if join indexes are present, add them to the match - if (identifierSearchers != null && !identifierSearchers.isEmpty()) { - for (Dataset dataset : identifierSearchers.keySet()) { + if (searchers != null && !searchers.isEmpty()) { + for (Dataset dataset : searchers.keySet()) { // use the prefix mapping if (dataset.getKey().toString().equals(datasetID) || (dataset.getGbifKey() != null && dataset.getGbifKey().equals(datasetID))) { @@ -557,12 +593,12 @@ public List lookupIdentifier(@NotNull String datasetID, @NotNull Str } // find the index and search it - IndexSearcher identifierSearcher = identifierSearchers.get(dataset); + IndexSearcher searcher = searchers.get(dataset); Query identifierQuery = new TermQuery(new Term(FIELD_ID, identifier)); - TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); + TopDocs identifierDocs = searcher.search(identifierQuery, 3); if (identifierDocs.totalHits.value > 0) { - Document identifierDoc = identifierSearcher.storedFields(). + Document identifierDoc = searcher.storedFields(). 
document(identifierDocs.scoreDocs[0].doc); results.add(toExternalID(identifierDoc, dataset)); @@ -598,43 +634,19 @@ private static ExternalID toExternalID(Document doc, Dataset dataset) { * @param ignoredIssue the issue to add if the identifier is ignored * @return NameUsageMatch */ - public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue ignoredIssue) { - - NameUsageMatch usageMatch = matchByUsageKey(key); - if (usageMatch.getDiagnostics().getMatchType() != MatchType.NONE) { - return usageMatch; - } + public NameUsageMatch matchByExternalKey(String suppliedKey, Issue notFoundIssue, Issue ignoredIssue) { // if join indexes are present, add them to the match if (identifierSearchers != null && !identifierSearchers.isEmpty()){ try { for (Dataset dataset: identifierSearchers.keySet()){ - // use the prefix mapping - if (dataset.getPrefixMapping() != null && !dataset.getPrefixMapping().isEmpty()) { - for (String prefix : dataset.getPrefixMapping()) { - if (key.startsWith(prefix)) { - key = key.replace(prefix, ""); - } - } - } - - if ( - (dataset.getPrefix() == null || !key.startsWith(dataset.getPrefix())) - && !dataset.getPrefix().equals("*")) { - // only search indexes with matching prefixes - continue; - } - - log.debug("Searching for identifier {} in dataset {}", key, dataset.getKey()); - - if (dataset.getRemovePrefixForMatching()){ - key = key.replace(dataset.getPrefix(), ""); - } + Optional key = extractKeyForSearch(suppliedKey, dataset); + if (key.isEmpty()) continue; // find the index and search it IndexSearcher identifierSearcher = identifierSearchers.get(dataset); - Query identifierQuery = new TermQuery(new Term(FIELD_ID, key)); + Query identifierQuery = new TermQuery(new Term(FIELD_ID, key.get())); TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); if (identifierDocs.totalHits.value > 0) { @@ -667,7 +679,7 @@ public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue } } } catch (IOException e) { - log.error("Problem querying external ID indexes with {}", key, e); + log.error("Problem querying external ID indexes with {}", suppliedKey, e); } } @@ -675,6 +687,38 @@ public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue return NO_MATCH; } + public static Optional extractKeyForSearch(String key, Dataset dataset) { + if (!hasRecognisedPrefix(key, dataset)) { + // only search indexes with matching prefixes + return Optional.empty(); + } + + // use the prefix mapping + if (dataset.getPrefixMapping() != null && !dataset.getPrefixMapping().isEmpty()) { + for (String prefix : dataset.getPrefixMapping()) { + if (key.startsWith(prefix)) { + key = key.replace(prefix, dataset.getPrefix()); + } + } + } + + // if configured, remove the prefix + if (dataset.getRemovePrefixForMatching() != null && dataset.getRemovePrefixForMatching()){ + key = key.replace(dataset.getPrefix(), ""); + } + log.debug("Searching for identifier {} in dataset {}", key, dataset.getKey()); + return Optional.of(key); + } + + private static boolean hasRecognisedPrefix(String key, Dataset dataset) { + if (dataset.getPrefix() == null){ + return false; + } + if (key.startsWith(dataset.getPrefix())) + return true; + return dataset.getPrefixMapping().stream().anyMatch(key::startsWith); + } + private static NameUsageMatch noMatch(Issue issue, String note) { return NameUsageMatch.builder() .diagnostics( @@ -719,7 +763,6 @@ private List loadHigherTaxa(String parentID) { higherTaxon.setRank(Rank.valueOf(doc.get(FIELD_RANK))); 
higherTaxon.setParentID(doc.get(FIELD_PARENT_ID)); higherTaxa.add(0, higherTaxon); -// higherTaxonomyCache.put(currentParentID, higherTaxon); // get next parent currentParentID = doc.get(FIELD_PARENT_ID); } else { @@ -743,16 +786,7 @@ private NameUsageMatch fromDoc(Document doc) { NameUsageMatch u = NameUsageMatch.builder().build(); u.setDiagnostics(NameUsageMatch.Diagnostics.builder().build()); - // set the usage - u.setUsage( - NameUsageMatch.RankedName.builder() - .key(doc.get(FIELD_ID)) - .name(doc.get(FIELD_SCIENTIFIC_NAME)) - .rank(Rank.valueOf(doc.get(FIELD_RANK))) - .canonicalName(doc.get(FIELD_CANONICAL_NAME)) - .code(getCode(doc)) - .build() - ); + u.setUsage(constructUsage(doc)); String acceptedParentID = null; @@ -761,15 +795,7 @@ private NameUsageMatch fromDoc(Document doc) { Optional accDocOpt = getByUsageKey(doc.get(FIELD_ACCEPTED_ID)); if (accDocOpt.isPresent()) { Document accDoc = accDocOpt.get(); - u.setAcceptedUsage( - NameUsageMatch.RankedName.builder() - .key(accDoc.get(FIELD_ID)) - .name(accDoc.get(FIELD_SCIENTIFIC_NAME)) - .rank(Rank.valueOf(accDoc.get(FIELD_RANK))) - .canonicalName(accDoc.get(FIELD_CANONICAL_NAME)) - .code(getCode(accDoc)) - .build() - ); + u.setAcceptedUsage(constructUsage(accDoc)); acceptedParentID = accDoc.get(FIELD_PARENT_ID); } } @@ -798,7 +824,7 @@ private NameUsageMatch fromDoc(Document doc) { classification.add( NameUsageMatch.RankedName.builder() .key(doc.get(FIELD_ID)) - .name( doc.get(FIELD_CANONICAL_NAME)) + .name(doc.get(FIELD_CANONICAL_NAME)) .rank(Rank.valueOf(doc.get(FIELD_RANK))) .canonicalName(doc.get(FIELD_CANONICAL_NAME)) .build() @@ -809,19 +835,26 @@ private NameUsageMatch fromDoc(Document doc) { // if ancillary join indexes are present, add them to the match for (Dataset dataset: ancillarySearchers.keySet()){ IndexSearcher ancillarySearcher = ancillarySearchers.get(dataset); - Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) )); + Query query = new TermQuery( + new Term(FIELD_JOIN_ID, doc.get(FIELD_ID)) + ); try { TopDocs docs = ancillarySearcher.search(query, 3); if (docs.totalHits.value > 0) { Document ancillaryDoc = ancillarySearcher.storedFields().document(docs.scoreDocs[0].doc); - String status = ancillaryDoc.get(FIELD_CATEGORY); NameUsageMatch.Status ancillaryStatus = new NameUsageMatch.Status(); - ancillaryStatus.setStatus(status); - ancillaryStatus.setDatasetKey(dataset.getKey().toString()); - ancillaryStatus.setGbifKey(dataset.getGbifKey()); - ancillaryStatus.setDatasetAlias(dataset.getAlias()); - ancillaryStatus.setSourceId(ancillaryDoc.get(FIELD_ID)); - u.addAdditionalStatus(ancillaryStatus); + ancillaryStatus.setStatus(ancillaryDoc.get(FIELD_CATEGORY)); + String formattedIUCN = IUCNUtils.formatIucn(ancillaryDoc.get(FIELD_CATEGORY)); + if (formattedIUCN != null) { + IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formattedIUCN); + ancillaryStatus.setStatus(formattedIUCN); + ancillaryStatus.setStatusCode(iucn.getCode()); + ancillaryStatus.setDatasetKey(dataset.getKey().toString()); + ancillaryStatus.setGbifKey(dataset.getGbifKey()); + ancillaryStatus.setDatasetAlias(dataset.getAlias()); + ancillaryStatus.setSourceId(ancillaryDoc.get(FIELD_ID)); + u.addAdditionalStatus(ancillaryStatus); + } } } catch (IOException e) { log.error("Cannot load usage {} from lucene index", doc.get(FIELD_ID), e); @@ -834,6 +867,78 @@ private NameUsageMatch fromDoc(Document doc) { return u; } + private static NameUsageMatch.Usage constructUsage(Document doc) { + StoredParsedName pn = null; + String parsedNameJson = 
doc.get(FIELD_PARSED_NAME_JSON); + if (parsedNameJson != null) { + try { + pn = MAPPER.readValue(parsedNameJson, StoredParsedName.class); + } catch (Exception e) { + log.error("Cannot parse parsed name json", e); + } + } + + // set the usage + NameUsageMatch.Usage.UsageBuilder b = NameUsageMatch.Usage.builder() + .key(doc.get(FIELD_ID)) + .name(doc.get(FIELD_SCIENTIFIC_NAME)) + .authorship(doc.get(FIELD_AUTHORSHIP)) + .rank(Rank.valueOf(doc.get(FIELD_RANK))) + .canonicalName(doc.get(FIELD_CANONICAL_NAME)) + .code(getCode(doc)); + + if (pn != null) { + b.genus(pn.getGenus()) + .infragenericEpithet(pn.getInfragenericEpithet()) + .specificEpithet(pn.getSpecificEpithet()) + .infraspecificEpithet(pn.getInfraspecificEpithet()) + .cultivarEpithet(pn.getCultivarEpithet()) + .phrase(pn.getPhrase()) + .voucher(pn.getVoucher()) + .nominatingParty(pn.getNominatingParty()) + .candidatus(pn.isCandidatus()) + .notho(pn.getNotho()) + .originalSpelling(pn.getOriginalSpelling()) + .epithetQualifier(pn.getEpithetQualifier()) + .type(pn.getType()) + .extinct(pn.isExtinct()) + + .sanctioningAuthor(pn.getSanctioningAuthor()) + .taxonomicNote(pn.getTaxonomicNote()) + .nomenclaturalNote(pn.getNomenclaturalNote()) + .publishedIn(pn.getPublishedIn()) + .unparsed(pn.getUnparsed()) + .doubtful(pn.isDoubtful()) + .manuscript(pn.isManuscript()) + .state(pn.getState()) + .warnings(pn.getWarnings()); + + if (pn.getCombinationAuthorship() != null + && pn.getCombinationAuthorship().getAuthors() != null + && !pn.getCombinationAuthorship().getAuthors().isEmpty() + ) { + b.combinationAuthorship( + NameUsageMatch.Authorship.builder() + .authors(pn.getCombinationAuthorship().getAuthors()) + .year(pn.getCombinationAuthorship().getYear()) + .build()); + } + + if (pn.getBasionymAuthorship() != null + && pn.getBasionymAuthorship().getAuthors() != null + && !pn.getBasionymAuthorship().getAuthors().isEmpty() + ) { + b.basionymAuthorship( + NameUsageMatch.Authorship.builder() + .authors(pn.getBasionymAuthorship().getAuthors()) + .year(pn.getBasionymAuthorship().getYear()) + .build()); + } + } + + return b.build(); + } + private static NomCode getCode(Document doc) { if (doc.get(FIELD_NOMENCLATURAL_CODE) == null) { return null; @@ -883,6 +988,7 @@ public List matchByName(String name, boolean fuzzySearch, int ma try { return search(q, name, fuzzySearch, maxMatches); } catch (RuntimeException e) { + log.error("Lucene search error", e); // for example TooComplexToDeterminizeException, see // http://dev.gbif.org/issues/browse/POR-2725 log.warn("Lucene failed to fuzzy search for name [{}]. 
Try a straight match instead", name); diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java b/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java index e4a36cd12..0b54c2a4b 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java +++ b/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java @@ -12,7 +12,6 @@ import life.catalogue.matching.util.CleanupUtils; import org.apache.commons.lang3.StringUtils; -import org.gbif.nameparser.api.Authorship; import org.gbif.nameparser.api.NamePart; import org.gbif.nameparser.api.ParsedName; import org.gbif.nameparser.api.Rank; @@ -214,28 +213,19 @@ private static boolean exists(String x) { @VisibleForTesting public static String expandAbbreviatedGenus(String scientificName, String genus) { - if (exists(scientificName) && exists(genus)) { + if (exists(scientificName) && exists(genus) && !scientificName.equalsIgnoreCase(genus)) { String[] parts = scientificName.split(" +", 2); - if (parts[0].length() <= 2) { - String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase()); + String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase()); + if (parts[0].length() <= 2 && genusCorrect.length() > 2 && ( + parts[0].equals("?") // is the genus missing alltogether? + || parts[0].length() == 2 && parts[0].charAt(1) == '.' && parts[0].charAt(0) == genusCorrect.charAt(0) + || parts[0].length() == 1 && parts[0].charAt(0) == genusCorrect.charAt(0) + )) { StringBuilder sb = new StringBuilder(); - // is the genus missing alltogether? - if (parts[0].equals("?")) { - sb.append(genusCorrect); - } else if (genusCorrect.length() > 1) { - // test if name has an abbreviated genus - if (parts[0].length() == 2 - && parts[0].charAt(1) == '.' - && parts[0].charAt(0) == genusCorrect.charAt(0) - || parts[0].length() == 1 && parts[0].charAt(0) == genusCorrect.charAt(0)) { - sb.append(genusCorrect); - } - } else { - sb.append(parts[0]); - } + sb.append(genus); if (parts.length > 1) { - sb.append(" "); - sb.append(parts[1]); + sb.append(" ") + .append(parts[1]); } return sb.toString(); } diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java b/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java index b632ecf29..63fbccd78 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java @@ -4,13 +4,10 @@ import com.fasterxml.jackson.annotation.JsonInclude; import io.swagger.v3.oas.annotations.media.Schema; -import lombok.Builder; import lombok.Data; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; /** * Metadata about this API and about the indexes behind the API. 
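The IUCN handling that MatchController previously kept as a private enum is now shared: the controller and the ancillary-status lookup in DatasetIndex both normalise the raw category through the new IUCNUtils class (added further below) before resolving the two-letter code. A minimal sketch of that path, using a hypothetical raw category value:

import life.catalogue.matching.util.IUCNUtils;

// Sketch of the normalisation used by MatchController.iucnRedListV1 and
// DatasetIndex.fromDoc: trim, uppercase, and underscore the raw category,
// then map the enum constant to its two-letter code. "Least Concern" is an
// example input, not a value taken from a real index.
public class IucnCodeSketch {
  public static void main(String[] args) {
    String formatted = IUCNUtils.formatIucn("Least Concern");   // -> "LEAST_CONCERN"
    IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formatted);    // throws IllegalArgumentException if unknown
    System.out.println(iucn.getCode());                         // -> "LC"
  }
}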
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java b/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java index 0ccb698a8..24c81c74a 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java @@ -3,7 +3,10 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonInclude; +import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.Data; +import lombok.NoArgsConstructor; import java.util.List; @@ -13,6 +16,9 @@ @Data @JsonInclude(JsonInclude.Include.NON_NULL) @JsonIgnoreProperties(ignoreUnknown = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor public class Dataset { Integer key; String gbifKey; diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java index d15b8149a..88760abbc 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java @@ -5,9 +5,8 @@ import com.fasterxml.jackson.annotation.JsonInclude; import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; +import java.util.*; + import io.swagger.v3.oas.annotations.media.Schema; import life.catalogue.api.vocab.MatchType; @@ -31,9 +30,9 @@ public class NameUsageMatch implements LinneanClassification { @Schema(description = "If the matched usage is a synonym") boolean synonym; @Schema(description = "The matched name usage") - RankedName usage; + Usage usage; @Schema(description = "The accepted name usage for the match. This will only be populated when we've matched a synonym name usage.") - RankedName acceptedUsage; + Usage acceptedUsage; @Schema(description = "The classification of the accepted name usage.") List classification; @Schema(description = "Diagnostics for a name match including the type of match and confidence level", implementation = Diagnostics.class) @@ -311,6 +310,82 @@ public static class Diagnostics { List alternatives; } + /** + * A name with an identifier and a taxonomic rank. 
+ */ + @Schema(description = "A name with an identifier and a taxonomic rank", title = "Usage", type = "object") + @JsonInclude(JsonInclude.Include.NON_NULL) + @Data + @AllArgsConstructor + @NoArgsConstructor + @ToString + @Builder + public static class Usage implements Serializable { + + private static final long serialVersionUID = 3423423423423L; + + @Schema(description = "The identifier for the name usage") + private String key; + @Schema(description = "The name usage") + private String name; + private String canonicalName; + private String authorship; + @JsonIgnore private String parentID; + @Schema(description = "The taxonomic rank for the name usage") + private Rank rank; + @Schema(description = "The nomenclatural code for the name usage") + private NomCode code; + private String uninomial; + private String genus; + private String infragenericEpithet; + private String specificEpithet; + private String infraspecificEpithet; + private String cultivarEpithet; + private String phrase; + private String voucher; + private String nominatingParty; + private boolean candidatus; + private String notho; + private Boolean originalSpelling; + private Map epithetQualifier; + private String type; + protected boolean extinct; + private Authorship combinationAuthorship; + private Authorship basionymAuthorship; + private String sanctioningAuthor; + private String taxonomicNote; + private String nomenclaturalNote; + private String publishedIn; + private String unparsed; + private boolean doubtful; + private boolean manuscript; + private String state; + private Set warnings; + + //additional flags + private boolean isAbbreviated; + private boolean isAutonym; + private boolean isBinomial; + private boolean isTrinomial; + private boolean isIncomplete; + private boolean isIndetermined; + private boolean isPhraseName; + private String terminalEpithet; + } + + @Schema(description = "An scientific name authorship for a name usage, split into components", title = "Authorship", type = "object") + @JsonInclude(JsonInclude.Include.NON_NULL) + @Data + @AllArgsConstructor + @NoArgsConstructor + @ToString + @Builder + public static class Authorship { + private List authors = new ArrayList(); + private List exAuthors = new ArrayList(); + private String year; + } + /** * A name with an identifier and a taxonomic rank. */ @@ -343,7 +418,7 @@ public static class RankedName implements Serializable { @Data @JsonIgnoreProperties(ignoreUnknown = true) @JsonInclude(JsonInclude.Include.NON_EMPTY) - @Schema(description = "A status value derived from a dataset or external source. E.g. IUCN Red List status.", + @Schema(description = "A status value derived from a dataset or external source. E.g. IUCN Red List.", title = "Status", type = "object") public static class Status { @Schema(description = "The dataset key for the dataset that the status is associated with") @@ -354,6 +429,8 @@ public static class Status { private String gbifKey; @Schema(description = "The status value") private String status; + @Schema(description = "The status code value") + private String statusCode; @Schema(description = "The ID in the source dataset for this status. e.g. 
the IUCN ID for this taxon") private String sourceId; } diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java index 7a05095a9..b4a8c55e6 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java @@ -31,15 +31,15 @@ public static NameUsageQuery create( String taxonID, String taxonConceptID, String scientificNameID, - String scientificName2, String scientificName, - String authorship2, + String scientificName2, String authorship, - String rank2, - String rank, + String authorship2, String genericName, String specificEpithet, String infraspecificEpithet, + String rank, + String rank2, Classification classification, Set exclude, Boolean strict, diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java b/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java new file mode 100644 index 000000000..ff2639fcc --- /dev/null +++ b/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java @@ -0,0 +1,64 @@ +package life.catalogue.matching.model; + +import lombok.*; + +import lombok.Data; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +@Builder +@AllArgsConstructor +@NoArgsConstructor +@Data +public class StoredParsedName { + private String rank; + private String code; + private String uninomial; + private String genus; + private String infragenericEpithet; + private String specificEpithet; + private String infraspecificEpithet; + private String cultivarEpithet; + private String phrase; + private String voucher; + private String nominatingParty; + private boolean candidatus; + private String notho; + private Boolean originalSpelling; + private Map epithetQualifier; + private String type; + protected boolean extinct; + private StoredAuthorship combinationAuthorship; + private StoredAuthorship basionymAuthorship; + private String sanctioningAuthor; + private String taxonomicNote; + private String nomenclaturalNote; + private String publishedIn; + private String unparsed; + private boolean doubtful; + private boolean manuscript; + private String state; + private Set warnings; + + //additional flags + private boolean isAbbreviated; + private boolean isAutonym; + private boolean isBinomial; + private boolean isTrinomial; + private boolean isIncomplete; + private boolean isIndetermined; + private boolean isPhraseName; + private String terminalEpithet; + + @Builder + @AllArgsConstructor + @NoArgsConstructor + @Data + public static class StoredAuthorship { + private List authors = new ArrayList(); + private List exAuthors = new ArrayList(); + private String year; + } +} diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java index 6cdc0b8c8..39581338e 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java @@ -14,10 +14,9 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.function.Consumer; -import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; +import 
com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.opencsv.CSVWriterBuilder; @@ -29,23 +28,17 @@ import life.catalogue.api.model.ReleaseAttempt; import life.catalogue.api.vocab.DatasetOrigin; import life.catalogue.api.vocab.TaxonomicStatus; - import life.catalogue.matching.db.DatasetMapper; - import life.catalogue.matching.index.ScientificNameAnalyzer; import life.catalogue.matching.model.Classification; +import life.catalogue.matching.model.StoredParsedName; import life.catalogue.matching.model.Dataset; import life.catalogue.matching.model.NameUsage; - import life.catalogue.matching.model.NameUsageMatch; - import life.catalogue.matching.util.NameParsers; - import lombok.extern.slf4j.Slf4j; - import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.ibatis.cursor.Cursor; import org.apache.ibatis.session.SqlSession; import org.apache.ibatis.session.SqlSessionFactory; import org.apache.lucene.analysis.Analyzer; @@ -111,6 +104,8 @@ public class IndexingService { protected final MatchingService matchingService; + protected static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String REL_PATTERN_STR = "(\\d+)(?:LX?RC?|R(\\d+))"; private static final Pattern REL_PATTERN = Pattern.compile("^" + REL_PATTERN_STR + "$"); @@ -208,7 +203,7 @@ private Optional lookupDataset(SqlSessionFactory factory, Integer key) } /** - * Writes an export of the name usages in a checklist bank dataset to a CSV file. + * Writes an export of the name usages in a checklist bank dataset to a CSV file. * * @param datasetKeyInput a dataset key or a release key * @throws Exception if the dataset key is invalid or the export fails @@ -664,6 +659,10 @@ public void run() { nameUsageMatch.getAcceptedUsage() != null ? nameUsageMatch.getAcceptedUsage().getKey() : nameUsageMatch.getUsage().getKey(), Field.Store.YES) ); + + // reduce the side of these indexes by removing the parsed name + doc.removeField(FIELD_PARSED_NAME_JSON); + writer.addDocument(doc); matchedCounter.incrementAndGet(); } else { @@ -680,7 +679,7 @@ public void run() { } private boolean isAccepted(String status) { - return status != null && !status.equals(TaxonomicStatus.ACCEPTED.name()); + return status != null && status.equals(TaxonomicStatus.ACCEPTED.name()); } } @@ -801,12 +800,6 @@ private void indexFile(String exportPath, String indexPath) throws Exception { mapper.writeValue(new File(indexPath + "/" + METADATA_JSON), metadata); } - class YourThreadFactory implements ThreadFactory { - public Thread newThread(Runnable r) { - return new Thread(r, "NameUsage-Indexing-taskThread"); - } - } - static class IndexingTask implements Runnable { private final IndexWriter writer; private final List nameUsages; @@ -846,6 +839,11 @@ private static void finishIndex(IndexWriter indexWriter) throws IOException { return Paths.get(indexPath); } + /** + * Generate the lucene document for a name usage + * @param nameUsage to convert to lucene document + * @return lucene document + */ protected static Document toDoc(NameUsage nameUsage) { Document doc = new Document(); @@ -853,18 +851,18 @@ protected static Document toDoc(NameUsage nameUsage) { Porting notes: The canonical name *sensu strictu* with nothing else but three name parts at most (genus, species, infraspecific). No rank or hybrid markers and no authorship, cultivar or strain information. 
Infrageneric names are represented without a - leading genus. Unicode characters are replaced by their matching ASCII characters." + leading genus. Unicode characters are replaced by their matching ASCII characters. */ - Rank rank = Rank.valueOf(nameUsage.getRank()); + Rank rank = Rank.valueOf(nameUsage.getRank()); Optional optCanonical = Optional.empty(); + ParsedName pn = null; + NomCode nomCode = null; try { - NomCode nomCode = null; if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) { nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode()); } - ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); - + pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); // canonicalMinimal will construct the name without the hybrid marker and authorship String canonical = NameFormatter.canonicalMinimal(pn); optCanonical = Optional.ofNullable(canonical); @@ -873,6 +871,25 @@ protected static Document toDoc(NameUsage nameUsage) { log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); } + if (pn != null){ + try { + // if there an authorship, reparse with it to get the component authorship parts + StoredParsedName storedParsedName = StringUtils.isBlank(nameUsage.getAuthorship()) ? + getStoredParsedName(pn) : constructParsedName(nameUsage, rank, nomCode); + // store the parsed name components in JSON + doc.add(new StoredField( + FIELD_PARSED_NAME_JSON, + MAPPER.writeValueAsString(storedParsedName)) + ); + } catch (UnparsableNameException | InterruptedException e) { + // do nothing + log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); + } catch ( JsonProcessingException e) { + // do nothing + log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); + } + } + final String canonical = optCanonical.orElse(nameUsage.getScientificName()); // use custom precision step as we do not need range queries and prefer to save memory usage @@ -895,7 +912,9 @@ protected static Document toDoc(NameUsage nameUsage) { String nameComplete = nameUsage.getScientificName(); if (StringUtils.isNotBlank(nameUsage.getAuthorship())) { nameComplete += " " + nameUsage.getAuthorship(); + doc.add(new TextField(FIELD_AUTHORSHIP, nameUsage.getAuthorship(), Field.Store.YES)); } + doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES)); // this lucene index is not persistent, so not risk in changing ordinal numbers @@ -920,11 +939,60 @@ protected static Document toDoc(NameUsage nameUsage) { return doc; } - public static void consume(Supplier> cursorSupplier, Consumer handler) { - try (Cursor cursor = cursorSupplier.get()) { - cursor.forEach(handler); - } catch (IOException e) { - throw new RuntimeException(e); - } + @NotNull + private static StoredParsedName constructParsedName(NameUsage nameUsage, Rank rank, NomCode nomCode) throws UnparsableNameException, InterruptedException { + ParsedName pn = !StringUtils.isBlank(nameUsage.getAuthorship()) ? 
+ NameParsers.INSTANCE.parse(nameUsage.getScientificName() + " " + nameUsage.getAuthorship(), rank, nomCode) + : NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); + return getStoredParsedName(pn); + } + + @NotNull + private static StoredParsedName getStoredParsedName(ParsedName pn) { + StoredParsedName storedParsedName = new StoredParsedName(); + storedParsedName.setAbbreviated(pn.isAbbreviated()); + storedParsedName.setAutonym(pn.isAutonym()); + storedParsedName.setBinomial(pn.isBinomial()); + storedParsedName.setCandidatus(pn.isCandidatus()); + storedParsedName.setCultivarEpithet(pn.getCultivarEpithet()); + storedParsedName.setDoubtful(pn.isDoubtful()); + storedParsedName.setGenus(pn.getGenus()); + storedParsedName.setUninomial(pn.getUninomial()); + storedParsedName.setUnparsed(pn.getUnparsed()); + storedParsedName.setTrinomial(pn.isTrinomial()); + storedParsedName.setIncomplete(pn.isIncomplete()); + storedParsedName.setIndetermined(pn.isIndetermined()); + storedParsedName.setTerminalEpithet(pn.getTerminalEpithet()); + storedParsedName.setInfragenericEpithet(pn.getInfragenericEpithet()); + storedParsedName.setInfraspecificEpithet(pn.getInfraspecificEpithet()); + storedParsedName.setExtinct(pn.isExtinct()); + storedParsedName.setPublishedIn(pn.getPublishedIn()); + storedParsedName.setSanctioningAuthor(pn.getSanctioningAuthor()); + storedParsedName.setSpecificEpithet(pn.getSpecificEpithet()); + storedParsedName.setPhrase(pn.getPhrase()); + storedParsedName.setPhraseName(pn.isPhraseName()); + storedParsedName.setVoucher(pn.getVoucher()); + storedParsedName.setNominatingParty(pn.getNominatingParty()); + storedParsedName.setNomenclaturalNote(pn.getNomenclaturalNote()); + storedParsedName.setWarnings(pn.getWarnings()); + if (pn.getBasionymAuthorship() != null) { + storedParsedName.setBasionymAuthorship( + StoredParsedName.StoredAuthorship.builder() + .authors(pn.getBasionymAuthorship().getAuthors()) + .exAuthors(pn.getBasionymAuthorship().getExAuthors()) + .year(pn.getBasionymAuthorship().getYear()).build() + ); + } + if (pn.getCombinationAuthorship() != null) { + storedParsedName.setCombinationAuthorship( + StoredParsedName.StoredAuthorship.builder() + .authors(pn.getCombinationAuthorship().getAuthors()) + .exAuthors(pn.getCombinationAuthorship().getExAuthors()) + .year(pn.getCombinationAuthorship().getYear()).build() + ); + } + storedParsedName.setType(pn.getType() != null ? pn.getType().name() : null); + storedParsedName.setNotho(pn.getNotho() != null ? 
pn.getNotho().name() : null); + return storedParsedName; } } diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java index 08dec6c28..515d41a3c 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java @@ -55,7 +55,7 @@ @Service public class MatchingService { - @Value("${working.path:/tmp/}") + @Value("${working.dir:/tmp/}") protected String metadataFilePath; @Value("${online.dictionary.url:'https://rs.gbif.org/dictionaries/'}") @@ -174,7 +174,7 @@ public Optional getAPIMetadata(boolean regenerate) { File metadata = new File(metadataFilePath + "/index-metadata.json"); try { - if (!metadata.exists() || regenerate) { + if (regenerate || !metadata.exists()) { APIMetadata metadata1 = datasetIndex.getAPIMetadata(); //serialise to file ObjectMapper mapper = new ObjectMapper(); @@ -199,7 +199,6 @@ private static boolean isMatch(@Nullable NameUsageMatch match) { private static NameUsageMatch higherMatch(NameUsageMatch match, NameUsageMatch firstMatch) { match.getDiagnostics().setMatchType(MatchType.HIGHERRANK); - // FIXME addAlternatives(match, firstMatch.getDiagnostics().getAlternatives()); return match; } @@ -262,7 +261,9 @@ public List lookupJoins(String identifier){ * @return the list of matches */ public List matchID(String datasetID, String identifier){ - return datasetIndex.lookupIdentifier(datasetID, identifier); + List ids = datasetIndex.lookupIdentifier(datasetID, identifier); + List ancillary = datasetIndex.lookupAncillary(datasetID, identifier); + return ImmutableList.builder().addAll(ids).addAll(ancillary).build(); } public NameUsageMatch match( @@ -507,7 +508,12 @@ && getGenusOrAbove(parsedName) != null if (rank == null) { if (parsedName.isBinomial() || parsedName.isTrinomial() - || (parsedName.getRank() != null && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal())) { + || ( + parsedName.getRank() != null + && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal() + && parsedName.getEpithet(NamePart.SPECIFIC) != null //see https://github.com/CatalogueOfLife/data/issues/719 + ) + ) { rank = Rank.valueOf(parsedName.getRank().name()); } } @@ -577,6 +583,18 @@ && nextAboveGenusDiffers(classification, match1)) { // for strict matching do not try higher ranks if (isMatch(match1) || strict) { + // https://github.com/CatalogueOfLife/data/issues/719 + // this caters for the scenario where the Taxacrum sp. + // and the only sensible match is to a higher rank (genus) + if ( + isMatch(match1) + && parsedName != null + && parsedName.getRank() != null + && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal() + && parsedName.getEpithet(NamePart.SPECIFIC) == null + ){ + match1.getDiagnostics().setMatchType(MatchType.HIGHERRANK); + } return match1; } @@ -712,7 +730,9 @@ private List queryIndex(Rank rank, String canonicalName, boolean m -> { if (m.getDiagnostics().getMatchType() == MatchType.EXACT && rank == Rank.SPECIES_AGGREGATE - && m.getUsage().getRank() != Rank.SPECIES_AGGREGATE) { + && (m.getUsage().getRank() != Rank.SPECIES_AGGREGATE + || m.getAcceptedUsage().getRank() != Rank.SPECIES_AGGREGATE) + ) { log.info( "Species aggregate match found for {} {}. 
Ignore and prefer higher matches", m.getUsage().getRank(), diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java b/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java new file mode 100644 index 000000000..27fec73fc --- /dev/null +++ b/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java @@ -0,0 +1,40 @@ +package life.catalogue.matching.util; + +public class IUCNUtils { + + public static String formatIucn(String original){ + if (original == null) { + return null; + } + // Trim the string + String trimmed = original.trim(); + // Convert to uppercase + String uppercased = trimmed.toUpperCase(); + // Replace any whitespace with a single underscore + return uppercased.replaceAll("\\s+", "_"); + } + + public enum IUCN { + EXTINCT("EX"), + EXTINCT_IN_THE_WILD("EW"), + CRITICALLY_ENDANGERED ("CR"), + ENDANGERED ("EN"), + VULNERABLE ("VU"), + NEAR_THREATENED ("NT"), + CONSERVATION_DEPENDENT ("CD"), + LEAST_CONCERN ("LC"), + DATA_DEFICIENT ("DD"), + NOT_EVALUATED ("NE"); + + private final String code; + + IUCN(String code) { + this.code = code; + } + + public String getCode() { + return code; + } + + } +} diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java index 0d59bca61..b6334e6fc 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java +++ b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java @@ -8,10 +8,12 @@ public class IndexConstants { public static final String FIELD_ACCEPTED_ID = "accid"; public static final String FIELD_CANONICAL_NAME = "canonical"; public static final String FIELD_SCIENTIFIC_NAME = "sciname"; + public static final String FIELD_AUTHORSHIP = "authorship"; public static final String FIELD_RANK = "rank"; public static final String FIELD_STATUS = "status"; public static final String FIELD_PARENT_ID = "parentId"; public static final String FIELD_NOMENCLATURAL_CODE = "nomcode"; + public static final String FIELD_PARSED_NAME_JSON = "parsedName"; public static final String FIELD_CATEGORY = "category"; public static final String FIELD_JOIN_ID = "joinId"; public static final String MAIN_INDEX_DIR = "main"; diff --git a/matching-ws/src/main/resources/datasets.json b/matching-ws/src/main/resources/datasets.json index 6c17a925c..afd996ec3 100644 --- a/matching-ws/src/main/resources/datasets.json +++ b/matching-ws/src/main/resources/datasets.json @@ -15,6 +15,7 @@ "title": "IPNI", "prefix": "urn:lsid:ipni.org:names:", "prefixMapping": [ + "ipni:", "https://www.ipni.org/n/" ], "removePrefixForMatching": true @@ -25,6 +26,7 @@ "title": "WoRMS", "prefix": "urn:lsid:marinespecies.org:taxname:", "prefixMapping": [ + "worms:", "http://marinespecies.org/data.php?id=", "https://marinespecies.org/data.php?id=", "https://www.marinespecies.org/aphia.php?p=taxdetails&id=" @@ -40,6 +42,31 @@ "key": "2041", "gbifKey": "de8934f4-a136-481c-a87a-b0b202b80a31", "title": "Dyntaxa. 
Svensk taxonomisk databas", - "prefix": "urn:lsid:dyntaxa.se:Taxon:" + "prefix": "urn:lsid:dyntaxa.se:Taxon:", + "prefixMapping": [ + "dyntaxa:" + ] + }, + { + "key": "2144", + "gbifKey": "9ca92552-f23a-41a8-a140-01abaa31c931", + "title": "ITIS", + "prefixMapping": [ + "itis:", + "tsn:", + "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=", + "https://marinespecies.org/data.php?id=", + "https://www.marinespecies.org/aphia.php?p=taxdetails&id=" + ] + }, + { + "key": "139831", + "title": "iNaturalist", + "prefix": "https://www.inaturalist.org/taxa/", + "prefixMapping": [ + "inat:", + "iNaturalist:", + "https://www.inaturalist.org/taxa/" + ] } ] diff --git a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java index 6e0112c83..cea714ce1 100644 --- a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java +++ b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java @@ -56,7 +56,7 @@ public static void buildMatcher() throws IOException { 1 ); - Dataset dataset = new Dataset(); + Dataset dataset = Dataset.builder().build(); dataset.setKey(1); dataset.setAlias("DUMMY_IDS"); dataset.setTitle("Dummy dataset for testing"); diff --git a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java new file mode 100644 index 000000000..3becebc9b --- /dev/null +++ b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java @@ -0,0 +1,75 @@ +package life.catalogue.matching; + +import life.catalogue.matching.index.DatasetIndex; +import life.catalogue.matching.model.Dataset; + +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Optional; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class IDMatchingTest { + + @Test + public void testWithoutPrefix() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:ipni.org:names:") + .prefixMapping(List.of("urn:lsid:ipni.org:names:", "ipni:")) + .removePrefixForMatching(true) + .build(); + assertEquals(Optional.of("1"), DatasetIndex.extractKeyForSearch("ipni:1", dataset)); + } + + @Test + public void testWithoutPrefix2() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("gbif:") + .prefixMapping(List.of("gbif:")) + .removePrefixForMatching(true) + .build(); + assertEquals(Optional.of("1"), DatasetIndex.extractKeyForSearch("gbif:1", dataset)); + } + + @Test + public void testWithPrefix() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:marinespecies.org:taxname:") + .prefixMapping(List.of("worms:")) + .removePrefixForMatching(false) + .build(); + assertEquals(Optional.of("urn:lsid:marinespecies.org:taxname:1"), + DatasetIndex.extractKeyForSearch("worms:1", dataset) + ); + } + + @Test + public void testWithUnrecognisedPrefix() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:marinespecies.org:taxname:") + .prefixMapping(List.of("worms:")) + .removePrefixForMatching(false) + .build(); + assertEquals(Optional.empty(), + DatasetIndex.extractKeyForSearch("nonsense:1", dataset) + ); + } + + @Test + public void testWithUnrecognisedPrefix2() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:marinespecies.org:taxname:") + .prefixMapping(List.of("worms:")) + .removePrefixForMatching(false) + .build(); + assertEquals(Optional.empty(), + DatasetIndex.extractKeyForSearch("1", dataset) + ); + } +} diff --git 
a/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java b/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java index 9f85865de..8d568c078 100644 --- a/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java +++ b/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java @@ -40,12 +40,12 @@ public static NameUsageMatch newNameUsageMatch( String speciesKey) { NameUsageMatch m = NameUsageMatch.builder().diagnostics(NameUsageMatch.Diagnostics.builder().build()).build(); - m.setUsage(NameUsageMatch.RankedName.builder() + m.setUsage(NameUsageMatch.Usage.builder() .key(usageKey) .name(scientificName) .canonicalName(canonicalName) .rank(rank).build()); - m.setAcceptedUsage(NameUsageMatch.RankedName.builder().key(acceptedUsageKey).build()); + m.setAcceptedUsage(NameUsageMatch.Usage.builder().key(acceptedUsageKey).build()); m.getDiagnostics().setStatus(status); m.getDiagnostics().setConfidence(confidence); m.getDiagnostics().setNote(note);
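The parsed-name components travel between indexing and matching as JSON in the new FIELD_PARSED_NAME_JSON stored field: IndexingService.toDoc serialises a StoredParsedName into the Lucene document, and DatasetIndex.constructUsage deserialises it to populate the richer Usage model. A minimal round-trip sketch, assuming only the Jackson setting the patch itself applies; the name parts are hypothetical example values:

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

import life.catalogue.matching.model.StoredParsedName;

// Sketch of the JSON round trip behind FIELD_PARSED_NAME_JSON: what the indexer
// writes into the document is read back when a match result is constructed.
public class ParsedNameRoundTripSketch {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    // mirrors DatasetIndex.init(): tolerate fields written by newer versions
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    StoredParsedName stored = StoredParsedName.builder()
        .genus("Taxacrum")            // hypothetical name parts
        .specificEpithet("example")
        .build();

    String json = mapper.writeValueAsString(stored);                 // value stored in FIELD_PARSED_NAME_JSON
    StoredParsedName readBack = mapper.readValue(json, StoredParsedName.class);
    System.out.println(readBack.getGenus() + " " + readBack.getSpecificEpithet());
  }
}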