From c79d73fdd9a57de9eab5a3ddb0126af5bf02b34f Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 5 Nov 2024 16:05:22 +0000 Subject: [PATCH] Fixes for: * https://github.com/CatalogueOfLife/data/issues/719 * https://github.com/CatalogueOfLife/data/issues/722 Work for * https://github.com/CatalogueOfLife/backend/issues/1350 Fixes for prefix mapping and metadata loading --- matching-ws/Dockerfile | 9 +- matching-ws/pom.xml | 14 +- .../matching/MatchingApplication.java | 5 +- .../matching/controller/MatchController.java | 54 +-- .../matching/index/DatasetIndex.java | 310 ++++++++++++------ .../catalogue/matching/index/NameNRank.java | 30 +- .../catalogue/matching/model/APIMetadata.java | 3 - .../catalogue/matching/model/Dataset.java | 6 + .../matching/model/NameUsageMatch.java | 89 ++++- .../matching/model/NameUsageQuery.java | 8 +- .../matching/model/StoredParsedName.java | 64 ++++ .../matching/service/IndexingService.java | 124 +++++-- .../matching/service/MatchingService.java | 32 +- .../catalogue/matching/util/IUCNUtils.java | 40 +++ .../matching/util/IndexConstants.java | 2 + matching-ws/src/main/resources/datasets.json | 29 +- .../life/catalogue/matching/IDMatchingIT.java | 2 +- .../catalogue/matching/IDMatchingTest.java | 75 +++++ .../catalogue/matching/NameUsageBuilder.java | 4 +- 19 files changed, 669 insertions(+), 231 deletions(-) create mode 100644 matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java create mode 100644 matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java create mode 100644 matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java diff --git a/matching-ws/Dockerfile b/matching-ws/Dockerfile index 4a0ebe218..224d901d8 100644 --- a/matching-ws/Dockerfile +++ b/matching-ws/Dockerfile @@ -27,14 +27,11 @@ RUN git clone https://github.com/CatalogueOfLife/backend.git WORKDIR /app/backend RUN git checkout $GIT_BRANCH -# Build all the CLB modules -RUN mvn clean install package -DskipTests - # Build the Maven project and create a exec file WORKDIR /app/backend/matching-ws -# Run tests - full backend tests require additional services (e.g. 
ES) -RUN mvn clean install package +# Build all the CLB modules +RUN mvn clean install package -DskipTests -DskipITs # Store git commit id and log RUN curl -o /app/backend/git.json -H "Accept: application/vnd.github+json" "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)" @@ -106,4 +103,4 @@ RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT USER $USER EXPOSE $SERVER_PORT -CMD java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --working.dir=/opt/gbif/$APP_ARTIFACT --mode=RUN --spring.cloud.bootstrap.location=/opt/gbif/$APP_ARTIFACT/bootstrap.yaml \ No newline at end of file +CMD java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --working.dir=/opt/gbif/$APP_ARTIFACT/ --mode=RUN --spring.cloud.bootstrap.location=/opt/gbif/$APP_ARTIFACT/bootstrap.yaml \ No newline at end of file diff --git a/matching-ws/pom.xml b/matching-ws/pom.xml index 19ca20d59..4da425756 100644 --- a/matching-ws/pom.xml +++ b/matching-ws/pom.xml @@ -15,6 +15,7 @@ 11 9.10.0 2.7.18 + 2021.0.9 1.18.22 2.43.0 1.2.13 @@ -284,11 +285,6 @@ spring-boot-starter-validation ${spring-boot.version} - - net.openhft - chronicle-map - 3.25ea6 - org.springframework.boot spring-boot-starter-web @@ -356,14 +352,6 @@ logstash-logback-encoder ${logstash-logback.version} - - - - - - - - org.springframework.boot spring-boot-configuration-processor diff --git a/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java index e964e2e02..3f7820022 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java +++ b/matching-ws/src/main/java/life/catalogue/matching/MatchingApplication.java @@ -87,7 +87,7 @@ public void run(ApplicationArguments args) { } private void initialiseWebapp() { - Optional metadata = matchingService.getAPIMetadata(false); + Optional metadata = matchingService.getAPIMetadata(true); if (metadata.isEmpty()) { log.error("No main index found. 
Cannot start web services"); return; @@ -136,7 +136,8 @@ private void runIndexingIfRequired(ApplicationArguments args) throws Exception { indexingService.indexIdentifiers(id); } - log.info("Indexing completed"); + matchingService.getAPIMetadata(true); + log.info("Indexing ready"); } private ExecutionMode getMode(ApplicationArguments args) { diff --git a/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java b/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java index de51ebc67..c2c2df057 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java +++ b/matching-ws/src/main/java/life/catalogue/matching/controller/MatchController.java @@ -15,6 +15,7 @@ import java.util.stream.Collectors; import life.catalogue.matching.model.*; import life.catalogue.matching.service.MatchingService; +import life.catalogue.matching.util.IUCNUtils; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; @@ -112,9 +113,13 @@ public NameUsageMatch matchOldPaths( HttpServletRequest response) { return matchV2( usageKey, - taxonID,taxonConceptID,scientificNameID, - scientificName2, scientificName, - authorship, authorship2, + taxonID, + taxonConceptID, + scientificNameID, + scientificName2, + scientificName, + authorship, + authorship2, genericName, specificEpithet, infraspecificEpithet, @@ -279,6 +284,7 @@ public NameUsageMatch matchV2( taxonID, taxonConceptID, scientificNameID, + scientificName, scientificName2, authorship, @@ -288,6 +294,7 @@ public NameUsageMatch matchV2( infraspecificEpithet, rank, rank2, + classification, exclude, strict, @@ -428,6 +435,7 @@ public Object matchFlatV1( taxonID, taxonConceptID, scientificNameID, + scientificName, scientificName2, authorship, @@ -437,6 +445,7 @@ public Object matchFlatV1( infraspecificEpithet, rank, rank2, + classification, exclude != null ? exclude.stream().map(Object::toString).collect(Collectors.toSet()) : Set.of(), strict, @@ -598,6 +607,7 @@ public Object matchV1( taxonID, taxonConceptID, scientificNameID, + scientificName, scientificName2, authorship, @@ -607,6 +617,7 @@ public Object matchV1( infraspecificEpithet, rank, rank2, + classification, exclude != null ? exclude.stream().map(Object::toString).collect(Collectors.toSet()) : Set.of(), strict, @@ -671,7 +682,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi return Map.of(); } NameUsageMatch.Status status = statusList.get(0); - String formatted = formatIucn(status.getStatus()); + String formatted = IUCNUtils.formatIucn(status.getStatus()); if (formatted == null || formatted.isEmpty()) { return Map.of(); } @@ -679,7 +690,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi String scientificName = match.getAcceptedUsage() != null ? 
match.getAcceptedUsage().getCanonicalName() : match.getUsage().getCanonicalName(); try { - IUCN iucn = IUCN.valueOf(formatted); // throws IllegalArgumentException if not found + IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formatted); // throws IllegalArgumentException if not found watch.stop(); log("v1/species/iucnRedListCategory", usageKey, watch); return Map.of( @@ -689,7 +700,7 @@ public Map iucnRedListV1(@PathVariable(value = "usageKey", requi "taxonomicStatus", NameUsageMatchV1.TaxonomicStatusV1.convert( match.getDiagnostics().getStatus()), "iucnTaxonID", status.getSourceId(), - "code", iucn.code + "code", iucn.getCode() ); } catch (IllegalArgumentException e) { log.error("IUCN category not found: {}", formatted, e); @@ -751,37 +762,6 @@ private static void addIfNotNull(StringJoiner joiner, Object value) { } } - String formatIucn(String original){ - if (original == null) { - return null; - } - // Trim the string - String trimmed = original.trim(); - // Convert to uppercase - String uppercased = trimmed.toUpperCase(); - // Replace any whitespace with a single underscore - return uppercased.replaceAll("\\s+", "_"); - } - - enum IUCN { - EXTINCT("EX"), - EXTINCT_IN_THE_WILD("EW"), - CRITICALLY_ENDANGERED ("CR"), - ENDANGERED ("EN"), - VULNERABLE ("VU"), - NEAR_THREATENED ("NT"), - CONSERVATION_DEPENDENT ("CD"), - LEAST_CONCERN ("LC"), - DATA_DEFICIENT ("DD"), - NOT_EVALUATED ("NE"); - - private final String code; - - IUCN(String code) { - this.code = code; - } - } - @Data @Builder @NoArgsConstructor diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java index b75dfe85a..23f46f585 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java +++ b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java @@ -2,8 +2,10 @@ import static life.catalogue.matching.util.IndexConstants.DATASETS_JSON; import static life.catalogue.matching.util.IndexConstants.*; +import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.DeserializationFeature; import java.io.File; import java.io.FileReader; import java.io.IOException; @@ -11,6 +13,7 @@ import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.nio.file.attribute.BasicFileAttributes; import java.time.Instant; import java.time.ZoneId; @@ -21,20 +24,23 @@ import life.catalogue.api.vocab.MatchType; import life.catalogue.api.vocab.TaxonomicStatus; import life.catalogue.matching.model.*; +import life.catalogue.matching.util.IUCNUtils; import life.catalogue.matching.util.LuceneUtils; import life.catalogue.matching.Main; import lombok.extern.slf4j.Slf4j; import org.apache.lucene.document.Document; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.*; import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.util.BytesRef; + import org.gbif.nameparser.api.NomCode; import org.gbif.nameparser.api.Rank; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.springframework.beans.factory.annotation.Value; import 
org.springframework.stereotype.Service; @@ -57,10 +63,12 @@ public class DatasetIndex { protected static final ScientificNameAnalyzer scientificNameAnalyzer = new ScientificNameAnalyzer(); + protected static final ObjectMapper MAPPER = new ObjectMapper(); + @Value("${index.path:/data/matching-ws/index}") String indexPath; - @Value("${working.path:/tmp/}") + @Value("${working.dir:/tmp/}") String workingDir; private boolean isInitialised = false; @@ -77,16 +85,11 @@ public boolean getIsInitialised() { .build()) .build(); - public boolean exists(String indexPath) { - return new File(indexPath).exists() - && new File(indexPath + "/" + MAIN_INDEX_DIR).exists() - && Objects.requireNonNull(new File(indexPath + "/" + MAIN_INDEX_DIR).listFiles()).length > 0; - } - /** Attempts to read the index from disk if it exists. */ @PostConstruct void init() { + MAPPER.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final String mainIndexPath = getMainIndexPath(); final Map prefixMapping = loadPrefixMapping(); @@ -251,6 +254,31 @@ public APIMetadata getAPIMetadata(){ return metadata; } + public static List distinctValuesForField(String field, String indexPath) throws Exception { + + List distinctValues = new ArrayList<>(); + FSDirectory directory = FSDirectory.open(Paths.get(indexPath)); + + try (DirectoryReader directoryReader = DirectoryReader.open(directory)) { + + // Get the field terms + for (LeafReaderContext leafContext : directoryReader.leaves()) { + LeafReader leafReader = leafContext.reader(); + Terms terms = leafReader.terms(field); + + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + BytesRef byteRef; + while ((byteRef = termsEnum.next()) != null) { + String termValue = byteRef.utf8ToString(); + distinctValues.add(termValue); + } + } + } + } + return distinctValues; + } + /** * Returns the metadata of the index. This includes the number of taxa, the size on disk, the * dataset title and key, and the build information. 
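Note on the new DatasetIndex.distinctValuesForField helper introduced above: it enumerates the stored terms of a field across all index segments, and the reworked getIndexMetadata hunk below uses it to build the per-rank counts instead of a hard-coded rank list. A minimal usage sketch, assuming a local main index; the path is an assumed example location, not a required layout:

import java.util.List;

import life.catalogue.matching.index.DatasetIndex;

// Sketch: list the distinct values of the "rank" field (IndexConstants.FIELD_RANK)
// stored in a main index directory. The index path below is an assumed example.
public class RankTermsSketch {
  public static void main(String[] args) throws Exception {
    List<String> ranks = DatasetIndex.distinctValuesForField("rank", "/data/matching-ws/index/main");
    ranks.forEach(System.out::println);
  }
}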
@@ -290,16 +318,16 @@ private IndexMetadata getIndexMetadata(String indexPath, IndexSearcher searcher, try { Map rankCounts = new LinkedHashMap<>(); - rankCounts.put(Rank.KINGDOM.name(), getCountForRank(searcher, Rank.KINGDOM)); - rankCounts.put(Rank.PHYLUM.name(), getCountForRank(searcher, Rank.PHYLUM)); - rankCounts.put(Rank.CLASS.name(), getCountForRank(searcher, Rank.CLASS)); - rankCounts.put(Rank.ORDER.name(), getCountForRank(searcher, Rank.ORDER)); - rankCounts.put(Rank.FAMILY.name(), getCountForRank(searcher, Rank.FAMILY)); - rankCounts.put(Rank.GENUS.name(), getCountForRank(searcher, Rank.GENUS)); - rankCounts.put(Rank.SPECIES.name(), getCountForRank(searcher, Rank.SPECIES)); - rankCounts.put(Rank.SUBSPECIES.name(), getCountForRank(searcher, Rank.SUBSPECIES)); + distinctValuesForField(FIELD_RANK, indexPath).stream().sorted( (a, b) -> Rank.valueOf(a).ordinal() - Rank.valueOf(b).ordinal() + ).forEach(rank -> { + try { + rankCounts.put(rank, getCountForRank(searcher, rank)); + } catch (IOException e) { + log.error("Cannot read index information", e); + } + }); metadata.setNameUsageByRankCount(rankCounts); - } catch (IOException e) { + } catch (Exception e) { log.error("Cannot read index information", e); } return metadata; @@ -311,7 +339,7 @@ private IndexMetadata getIndexMetadata(String indexPath, IndexSearcher searcher, */ private Optional getGitInfo() { ObjectMapper mapper = new ObjectMapper(); - final String filePath = workingDir + "/" + GIT_JSON; + final String filePath = workingDir + GIT_JSON; try { if (new File(filePath).exists()) { // Read JSON file and parse to JsonNode @@ -358,7 +386,7 @@ public Map getDatasetInfo(String indexPath) { try { if (new File(filePath).exists()){ - log.info("Loading dataset info from {}", filePath); + log.debug("Loading dataset info from {}", filePath); // Read JSON file and parse to JsonNode JsonNode rootNode = mapper.readTree(new File(filePath)); // Navigate to the author node @@ -383,8 +411,8 @@ public Map getDatasetInfo(String indexPath) { return Map.of(); } - private long getCountForRank(IndexSearcher searcher, Rank rank) throws IOException { - Query query = new TermQuery(new Term(FIELD_RANK, rank.name())); + private long getCountForRank(IndexSearcher searcher, String rank) throws IOException { + Query query = new TermQuery(new Term(FIELD_RANK, rank)); return searcher.search(query, new TotalHitCountCollectorManager()); } @@ -416,22 +444,6 @@ public NameUsageMatch matchByUsageKey(String usageKey) { return matchByKey(usageKey, this::getByUsageKey); } - private static String escapeQueryChars(String s) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < s.length(); i++) { - char c = s.charAt(i); - // These are the special characters that need to be escaped - if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || - c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || - c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || - c == '/' || Character.isWhitespace(c)) { - sb.append('\\'); - } - sb.append(c); - } - return sb.toString(); - } - private Optional getByUsageKey(String usageKey) { Query query = new TermQuery(new Term(FIELD_ID, usageKey)); try { @@ -541,12 +553,36 @@ public List lookupIdentifier(@NotNull String identifier) { * @return List of ExternalID */ public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier) { + return lookupIdentifier(datasetID, identifier, identifierSearchers); + } + + /** + * Matches an external ID. 
Intended for debug purposes only, to quickly + * check if ids are present and joined to main index or not. + * + * @param datasetID the datasetKey to match + * @param identifier the identifier to match + * @return List of ExternalID + */ + public List lookupAncillary(@NotNull String datasetID, @NotNull String identifier) { + return lookupIdentifier(datasetID, identifier, ancillarySearchers); + } + + /** + * Matches an external ID. Intended for debug purposes only, to quickly + * check if ids are present and joined to main index or not. + * + * @param datasetID the datasetKey to match + * @param identifier the identifier to match + * @return List of ExternalID + */ + public List lookupIdentifier(@NotNull String datasetID, @NotNull String identifier, Map searchers) { List results = new ArrayList<>(); try { // if join indexes are present, add them to the match - if (identifierSearchers != null && !identifierSearchers.isEmpty()) { - for (Dataset dataset : identifierSearchers.keySet()) { + if (searchers != null && !searchers.isEmpty()) { + for (Dataset dataset : searchers.keySet()) { // use the prefix mapping if (dataset.getKey().toString().equals(datasetID) || (dataset.getGbifKey() != null && dataset.getGbifKey().equals(datasetID))) { @@ -557,12 +593,12 @@ public List lookupIdentifier(@NotNull String datasetID, @NotNull Str } // find the index and search it - IndexSearcher identifierSearcher = identifierSearchers.get(dataset); + IndexSearcher searcher = searchers.get(dataset); Query identifierQuery = new TermQuery(new Term(FIELD_ID, identifier)); - TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); + TopDocs identifierDocs = searcher.search(identifierQuery, 3); if (identifierDocs.totalHits.value > 0) { - Document identifierDoc = identifierSearcher.storedFields(). + Document identifierDoc = searcher.storedFields(). 
document(identifierDocs.scoreDocs[0].doc); results.add(toExternalID(identifierDoc, dataset)); @@ -598,43 +634,19 @@ private static ExternalID toExternalID(Document doc, Dataset dataset) { * @param ignoredIssue the issue to add if the identifier is ignored * @return NameUsageMatch */ - public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue ignoredIssue) { - - NameUsageMatch usageMatch = matchByUsageKey(key); - if (usageMatch.getDiagnostics().getMatchType() != MatchType.NONE) { - return usageMatch; - } + public NameUsageMatch matchByExternalKey(String suppliedKey, Issue notFoundIssue, Issue ignoredIssue) { // if join indexes are present, add them to the match if (identifierSearchers != null && !identifierSearchers.isEmpty()){ try { for (Dataset dataset: identifierSearchers.keySet()){ - // use the prefix mapping - if (dataset.getPrefixMapping() != null && !dataset.getPrefixMapping().isEmpty()) { - for (String prefix : dataset.getPrefixMapping()) { - if (key.startsWith(prefix)) { - key = key.replace(prefix, ""); - } - } - } - - if ( - (dataset.getPrefix() == null || !key.startsWith(dataset.getPrefix())) - && !dataset.getPrefix().equals("*")) { - // only search indexes with matching prefixes - continue; - } - - log.debug("Searching for identifier {} in dataset {}", key, dataset.getKey()); - - if (dataset.getRemovePrefixForMatching()){ - key = key.replace(dataset.getPrefix(), ""); - } + Optional key = extractKeyForSearch(suppliedKey, dataset); + if (key.isEmpty()) continue; // find the index and search it IndexSearcher identifierSearcher = identifierSearchers.get(dataset); - Query identifierQuery = new TermQuery(new Term(FIELD_ID, key)); + Query identifierQuery = new TermQuery(new Term(FIELD_ID, key.get())); TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); if (identifierDocs.totalHits.value > 0) { @@ -667,7 +679,7 @@ public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue } } } catch (IOException e) { - log.error("Problem querying external ID indexes with {}", key, e); + log.error("Problem querying external ID indexes with {}", suppliedKey, e); } } @@ -675,6 +687,38 @@ public NameUsageMatch matchByExternalKey(String key, Issue notFoundIssue, Issue return NO_MATCH; } + public static Optional extractKeyForSearch(String key, Dataset dataset) { + if (!hasRecognisedPrefix(key, dataset)) { + // only search indexes with matching prefixes + return Optional.empty(); + } + + // use the prefix mapping + if (dataset.getPrefixMapping() != null && !dataset.getPrefixMapping().isEmpty()) { + for (String prefix : dataset.getPrefixMapping()) { + if (key.startsWith(prefix)) { + key = key.replace(prefix, dataset.getPrefix()); + } + } + } + + // if configured, remove the prefix + if (dataset.getRemovePrefixForMatching() != null && dataset.getRemovePrefixForMatching()){ + key = key.replace(dataset.getPrefix(), ""); + } + log.debug("Searching for identifier {} in dataset {}", key, dataset.getKey()); + return Optional.of(key); + } + + private static boolean hasRecognisedPrefix(String key, Dataset dataset) { + if (dataset.getPrefix() == null){ + return false; + } + if (key.startsWith(dataset.getPrefix())) + return true; + return dataset.getPrefixMapping().stream().anyMatch(key::startsWith); + } + private static NameUsageMatch noMatch(Issue issue, String note) { return NameUsageMatch.builder() .diagnostics( @@ -719,7 +763,6 @@ private List loadHigherTaxa(String parentID) { higherTaxon.setRank(Rank.valueOf(doc.get(FIELD_RANK))); 
higherTaxon.setParentID(doc.get(FIELD_PARENT_ID)); higherTaxa.add(0, higherTaxon); -// higherTaxonomyCache.put(currentParentID, higherTaxon); // get next parent currentParentID = doc.get(FIELD_PARENT_ID); } else { @@ -743,16 +786,7 @@ private NameUsageMatch fromDoc(Document doc) { NameUsageMatch u = NameUsageMatch.builder().build(); u.setDiagnostics(NameUsageMatch.Diagnostics.builder().build()); - // set the usage - u.setUsage( - NameUsageMatch.RankedName.builder() - .key(doc.get(FIELD_ID)) - .name(doc.get(FIELD_SCIENTIFIC_NAME)) - .rank(Rank.valueOf(doc.get(FIELD_RANK))) - .canonicalName(doc.get(FIELD_CANONICAL_NAME)) - .code(getCode(doc)) - .build() - ); + u.setUsage(constructUsage(doc)); String acceptedParentID = null; @@ -761,15 +795,7 @@ private NameUsageMatch fromDoc(Document doc) { Optional accDocOpt = getByUsageKey(doc.get(FIELD_ACCEPTED_ID)); if (accDocOpt.isPresent()) { Document accDoc = accDocOpt.get(); - u.setAcceptedUsage( - NameUsageMatch.RankedName.builder() - .key(accDoc.get(FIELD_ID)) - .name(accDoc.get(FIELD_SCIENTIFIC_NAME)) - .rank(Rank.valueOf(accDoc.get(FIELD_RANK))) - .canonicalName(accDoc.get(FIELD_CANONICAL_NAME)) - .code(getCode(accDoc)) - .build() - ); + u.setAcceptedUsage(constructUsage(accDoc)); acceptedParentID = accDoc.get(FIELD_PARENT_ID); } } @@ -798,7 +824,7 @@ private NameUsageMatch fromDoc(Document doc) { classification.add( NameUsageMatch.RankedName.builder() .key(doc.get(FIELD_ID)) - .name( doc.get(FIELD_CANONICAL_NAME)) + .name(doc.get(FIELD_CANONICAL_NAME)) .rank(Rank.valueOf(doc.get(FIELD_RANK))) .canonicalName(doc.get(FIELD_CANONICAL_NAME)) .build() @@ -809,19 +835,26 @@ private NameUsageMatch fromDoc(Document doc) { // if ancillary join indexes are present, add them to the match for (Dataset dataset: ancillarySearchers.keySet()){ IndexSearcher ancillarySearcher = ancillarySearchers.get(dataset); - Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) )); + Query query = new TermQuery( + new Term(FIELD_JOIN_ID, doc.get(FIELD_ID)) + ); try { TopDocs docs = ancillarySearcher.search(query, 3); if (docs.totalHits.value > 0) { Document ancillaryDoc = ancillarySearcher.storedFields().document(docs.scoreDocs[0].doc); - String status = ancillaryDoc.get(FIELD_CATEGORY); NameUsageMatch.Status ancillaryStatus = new NameUsageMatch.Status(); - ancillaryStatus.setStatus(status); - ancillaryStatus.setDatasetKey(dataset.getKey().toString()); - ancillaryStatus.setGbifKey(dataset.getGbifKey()); - ancillaryStatus.setDatasetAlias(dataset.getAlias()); - ancillaryStatus.setSourceId(ancillaryDoc.get(FIELD_ID)); - u.addAdditionalStatus(ancillaryStatus); + ancillaryStatus.setStatus(ancillaryDoc.get(FIELD_CATEGORY)); + String formattedIUCN = IUCNUtils.formatIucn(ancillaryDoc.get(FIELD_CATEGORY)); + if (formattedIUCN != null) { + IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formattedIUCN); + ancillaryStatus.setStatus(formattedIUCN); + ancillaryStatus.setStatusCode(iucn.getCode()); + ancillaryStatus.setDatasetKey(dataset.getKey().toString()); + ancillaryStatus.setGbifKey(dataset.getGbifKey()); + ancillaryStatus.setDatasetAlias(dataset.getAlias()); + ancillaryStatus.setSourceId(ancillaryDoc.get(FIELD_ID)); + u.addAdditionalStatus(ancillaryStatus); + } } } catch (IOException e) { log.error("Cannot load usage {} from lucene index", doc.get(FIELD_ID), e); @@ -834,6 +867,78 @@ private NameUsageMatch fromDoc(Document doc) { return u; } + private static NameUsageMatch.Usage constructUsage(Document doc) { + StoredParsedName pn = null; + String parsedNameJson = 
doc.get(FIELD_PARSED_NAME_JSON); + if (parsedNameJson != null) { + try { + pn = MAPPER.readValue(parsedNameJson, StoredParsedName.class); + } catch (Exception e) { + log.error("Cannot parse parsed name json", e); + } + } + + // set the usage + NameUsageMatch.Usage.UsageBuilder b = NameUsageMatch.Usage.builder() + .key(doc.get(FIELD_ID)) + .name(doc.get(FIELD_SCIENTIFIC_NAME)) + .authorship(doc.get(FIELD_AUTHORSHIP)) + .rank(Rank.valueOf(doc.get(FIELD_RANK))) + .canonicalName(doc.get(FIELD_CANONICAL_NAME)) + .code(getCode(doc)); + + if (pn != null) { + b.genus(pn.getGenus()) + .infragenericEpithet(pn.getInfragenericEpithet()) + .specificEpithet(pn.getSpecificEpithet()) + .infraspecificEpithet(pn.getInfraspecificEpithet()) + .cultivarEpithet(pn.getCultivarEpithet()) + .phrase(pn.getPhrase()) + .voucher(pn.getVoucher()) + .nominatingParty(pn.getNominatingParty()) + .candidatus(pn.isCandidatus()) + .notho(pn.getNotho()) + .originalSpelling(pn.getOriginalSpelling()) + .epithetQualifier(pn.getEpithetQualifier()) + .type(pn.getType()) + .extinct(pn.isExtinct()) + + .sanctioningAuthor(pn.getSanctioningAuthor()) + .taxonomicNote(pn.getTaxonomicNote()) + .nomenclaturalNote(pn.getNomenclaturalNote()) + .publishedIn(pn.getPublishedIn()) + .unparsed(pn.getUnparsed()) + .doubtful(pn.isDoubtful()) + .manuscript(pn.isManuscript()) + .state(pn.getState()) + .warnings(pn.getWarnings()); + + if (pn.getCombinationAuthorship() != null + && pn.getCombinationAuthorship().getAuthors() != null + && !pn.getCombinationAuthorship().getAuthors().isEmpty() + ) { + b.combinationAuthorship( + NameUsageMatch.Authorship.builder() + .authors(pn.getCombinationAuthorship().getAuthors()) + .year(pn.getCombinationAuthorship().getYear()) + .build()); + } + + if (pn.getBasionymAuthorship() != null + && pn.getBasionymAuthorship().getAuthors() != null + && !pn.getBasionymAuthorship().getAuthors().isEmpty() + ) { + b.basionymAuthorship( + NameUsageMatch.Authorship.builder() + .authors(pn.getBasionymAuthorship().getAuthors()) + .year(pn.getBasionymAuthorship().getYear()) + .build()); + } + } + + return b.build(); + } + private static NomCode getCode(Document doc) { if (doc.get(FIELD_NOMENCLATURAL_CODE) == null) { return null; @@ -883,6 +988,7 @@ public List matchByName(String name, boolean fuzzySearch, int ma try { return search(q, name, fuzzySearch, maxMatches); } catch (RuntimeException e) { + log.error("Lucene search error", e); // for example TooComplexToDeterminizeException, see // http://dev.gbif.org/issues/browse/POR-2725 log.warn("Lucene failed to fuzzy search for name [{}]. 
Try a straight match instead", name); diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java b/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java index e4a36cd12..0b54c2a4b 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java +++ b/matching-ws/src/main/java/life/catalogue/matching/index/NameNRank.java @@ -12,7 +12,6 @@ import life.catalogue.matching.util.CleanupUtils; import org.apache.commons.lang3.StringUtils; -import org.gbif.nameparser.api.Authorship; import org.gbif.nameparser.api.NamePart; import org.gbif.nameparser.api.ParsedName; import org.gbif.nameparser.api.Rank; @@ -214,28 +213,19 @@ private static boolean exists(String x) { @VisibleForTesting public static String expandAbbreviatedGenus(String scientificName, String genus) { - if (exists(scientificName) && exists(genus)) { + if (exists(scientificName) && exists(genus) && !scientificName.equalsIgnoreCase(genus)) { String[] parts = scientificName.split(" +", 2); - if (parts[0].length() <= 2) { - String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase()); + String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase()); + if (parts[0].length() <= 2 && genusCorrect.length() > 2 && ( + parts[0].equals("?") // is the genus missing alltogether? + || parts[0].length() == 2 && parts[0].charAt(1) == '.' && parts[0].charAt(0) == genusCorrect.charAt(0) + || parts[0].length() == 1 && parts[0].charAt(0) == genusCorrect.charAt(0) + )) { StringBuilder sb = new StringBuilder(); - // is the genus missing alltogether? - if (parts[0].equals("?")) { - sb.append(genusCorrect); - } else if (genusCorrect.length() > 1) { - // test if name has an abbreviated genus - if (parts[0].length() == 2 - && parts[0].charAt(1) == '.' - && parts[0].charAt(0) == genusCorrect.charAt(0) - || parts[0].length() == 1 && parts[0].charAt(0) == genusCorrect.charAt(0)) { - sb.append(genusCorrect); - } - } else { - sb.append(parts[0]); - } + sb.append(genus); if (parts.length > 1) { - sb.append(" "); - sb.append(parts[1]); + sb.append(" ") + .append(parts[1]); } return sb.toString(); } diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java b/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java index b632ecf29..63fbccd78 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/APIMetadata.java @@ -4,13 +4,10 @@ import com.fasterxml.jackson.annotation.JsonInclude; import io.swagger.v3.oas.annotations.media.Schema; -import lombok.Builder; import lombok.Data; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; /** * Metadata about this API and about the indexes behind the API. 
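The IUCN handling that MatchController previously kept as a private enum is now shared: the controller and the ancillary-status lookup in DatasetIndex both normalise the raw category through the new IUCNUtils class (added further below) before resolving the two-letter code. A minimal sketch of that path, using a hypothetical raw category value:

import life.catalogue.matching.util.IUCNUtils;

// Sketch of the normalisation used by MatchController.iucnRedListV1 and
// DatasetIndex.fromDoc: trim, uppercase, and underscore the raw category,
// then map the enum constant to its two-letter code. "Least Concern" is an
// example input, not a value taken from a real index.
public class IucnCodeSketch {
  public static void main(String[] args) {
    String formatted = IUCNUtils.formatIucn("Least Concern");   // -> "LEAST_CONCERN"
    IUCNUtils.IUCN iucn = IUCNUtils.IUCN.valueOf(formatted);    // throws IllegalArgumentException if unknown
    System.out.println(iucn.getCode());                         // -> "LC"
  }
}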
diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java b/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java index 0ccb698a8..24c81c74a 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/Dataset.java @@ -3,7 +3,10 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonInclude; +import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.Data; +import lombok.NoArgsConstructor; import java.util.List; @@ -13,6 +16,9 @@ @Data @JsonInclude(JsonInclude.Include.NON_NULL) @JsonIgnoreProperties(ignoreUnknown = true) +@Builder +@NoArgsConstructor +@AllArgsConstructor public class Dataset { Integer key; String gbifKey; diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java index d15b8149a..88760abbc 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java @@ -5,9 +5,8 @@ import com.fasterxml.jackson.annotation.JsonInclude; import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; +import java.util.*; + import io.swagger.v3.oas.annotations.media.Schema; import life.catalogue.api.vocab.MatchType; @@ -31,9 +30,9 @@ public class NameUsageMatch implements LinneanClassification { @Schema(description = "If the matched usage is a synonym") boolean synonym; @Schema(description = "The matched name usage") - RankedName usage; + Usage usage; @Schema(description = "The accepted name usage for the match. This will only be populated when we've matched a synonym name usage.") - RankedName acceptedUsage; + Usage acceptedUsage; @Schema(description = "The classification of the accepted name usage.") List classification; @Schema(description = "Diagnostics for a name match including the type of match and confidence level", implementation = Diagnostics.class) @@ -311,6 +310,82 @@ public static class Diagnostics { List alternatives; } + /** + * A name with an identifier and a taxonomic rank. 
+ */ + @Schema(description = "A name with an identifier and a taxonomic rank", title = "Usage", type = "object") + @JsonInclude(JsonInclude.Include.NON_NULL) + @Data + @AllArgsConstructor + @NoArgsConstructor + @ToString + @Builder + public static class Usage implements Serializable { + + private static final long serialVersionUID = 3423423423423L; + + @Schema(description = "The identifier for the name usage") + private String key; + @Schema(description = "The name usage") + private String name; + private String canonicalName; + private String authorship; + @JsonIgnore private String parentID; + @Schema(description = "The taxonomic rank for the name usage") + private Rank rank; + @Schema(description = "The nomenclatural code for the name usage") + private NomCode code; + private String uninomial; + private String genus; + private String infragenericEpithet; + private String specificEpithet; + private String infraspecificEpithet; + private String cultivarEpithet; + private String phrase; + private String voucher; + private String nominatingParty; + private boolean candidatus; + private String notho; + private Boolean originalSpelling; + private Map epithetQualifier; + private String type; + protected boolean extinct; + private Authorship combinationAuthorship; + private Authorship basionymAuthorship; + private String sanctioningAuthor; + private String taxonomicNote; + private String nomenclaturalNote; + private String publishedIn; + private String unparsed; + private boolean doubtful; + private boolean manuscript; + private String state; + private Set warnings; + + //additional flags + private boolean isAbbreviated; + private boolean isAutonym; + private boolean isBinomial; + private boolean isTrinomial; + private boolean isIncomplete; + private boolean isIndetermined; + private boolean isPhraseName; + private String terminalEpithet; + } + + @Schema(description = "An scientific name authorship for a name usage, split into components", title = "Authorship", type = "object") + @JsonInclude(JsonInclude.Include.NON_NULL) + @Data + @AllArgsConstructor + @NoArgsConstructor + @ToString + @Builder + public static class Authorship { + private List authors = new ArrayList(); + private List exAuthors = new ArrayList(); + private String year; + } + /** * A name with an identifier and a taxonomic rank. */ @@ -343,7 +418,7 @@ public static class RankedName implements Serializable { @Data @JsonIgnoreProperties(ignoreUnknown = true) @JsonInclude(JsonInclude.Include.NON_EMPTY) - @Schema(description = "A status value derived from a dataset or external source. E.g. IUCN Red List status.", + @Schema(description = "A status value derived from a dataset or external source. E.g. IUCN Red List.", title = "Status", type = "object") public static class Status { @Schema(description = "The dataset key for the dataset that the status is associated with") @@ -354,6 +429,8 @@ public static class Status { private String gbifKey; @Schema(description = "The status value") private String status; + @Schema(description = "The status code value") + private String statusCode; @Schema(description = "The ID in the source dataset for this status. e.g. 
the IUCN ID for this taxon") private String sourceId; } diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java index 7a05095a9..b4a8c55e6 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java +++ b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageQuery.java @@ -31,15 +31,15 @@ public static NameUsageQuery create( String taxonID, String taxonConceptID, String scientificNameID, - String scientificName2, String scientificName, - String authorship2, + String scientificName2, String authorship, - String rank2, - String rank, + String authorship2, String genericName, String specificEpithet, String infraspecificEpithet, + String rank, + String rank2, Classification classification, Set exclude, Boolean strict, diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java b/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java new file mode 100644 index 000000000..ff2639fcc --- /dev/null +++ b/matching-ws/src/main/java/life/catalogue/matching/model/StoredParsedName.java @@ -0,0 +1,64 @@ +package life.catalogue.matching.model; + +import lombok.*; + +import lombok.Data; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +@Builder +@AllArgsConstructor +@NoArgsConstructor +@Data +public class StoredParsedName { + private String rank; + private String code; + private String uninomial; + private String genus; + private String infragenericEpithet; + private String specificEpithet; + private String infraspecificEpithet; + private String cultivarEpithet; + private String phrase; + private String voucher; + private String nominatingParty; + private boolean candidatus; + private String notho; + private Boolean originalSpelling; + private Map epithetQualifier; + private String type; + protected boolean extinct; + private StoredAuthorship combinationAuthorship; + private StoredAuthorship basionymAuthorship; + private String sanctioningAuthor; + private String taxonomicNote; + private String nomenclaturalNote; + private String publishedIn; + private String unparsed; + private boolean doubtful; + private boolean manuscript; + private String state; + private Set warnings; + + //additional flags + private boolean isAbbreviated; + private boolean isAutonym; + private boolean isBinomial; + private boolean isTrinomial; + private boolean isIncomplete; + private boolean isIndetermined; + private boolean isPhraseName; + private String terminalEpithet; + + @Builder + @AllArgsConstructor + @NoArgsConstructor + @Data + public static class StoredAuthorship { + private List authors = new ArrayList(); + private List exAuthors = new ArrayList(); + private String year; + } +} diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java index 6cdc0b8c8..39581338e 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java @@ -14,10 +14,9 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.function.Consumer; -import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; +import 
com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.opencsv.CSVWriterBuilder; @@ -29,23 +28,17 @@ import life.catalogue.api.model.ReleaseAttempt; import life.catalogue.api.vocab.DatasetOrigin; import life.catalogue.api.vocab.TaxonomicStatus; - import life.catalogue.matching.db.DatasetMapper; - import life.catalogue.matching.index.ScientificNameAnalyzer; import life.catalogue.matching.model.Classification; +import life.catalogue.matching.model.StoredParsedName; import life.catalogue.matching.model.Dataset; import life.catalogue.matching.model.NameUsage; - import life.catalogue.matching.model.NameUsageMatch; - import life.catalogue.matching.util.NameParsers; - import lombok.extern.slf4j.Slf4j; - import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.ibatis.cursor.Cursor; import org.apache.ibatis.session.SqlSession; import org.apache.ibatis.session.SqlSessionFactory; import org.apache.lucene.analysis.Analyzer; @@ -111,6 +104,8 @@ public class IndexingService { protected final MatchingService matchingService; + protected static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String REL_PATTERN_STR = "(\\d+)(?:LX?RC?|R(\\d+))"; private static final Pattern REL_PATTERN = Pattern.compile("^" + REL_PATTERN_STR + "$"); @@ -208,7 +203,7 @@ private Optional lookupDataset(SqlSessionFactory factory, Integer key) } /** - * Writes an export of the name usages in a checklist bank dataset to a CSV file. + * Writes an export of the name usages in a checklist bank dataset to a CSV file. * * @param datasetKeyInput a dataset key or a release key * @throws Exception if the dataset key is invalid or the export fails @@ -664,6 +659,10 @@ public void run() { nameUsageMatch.getAcceptedUsage() != null ? nameUsageMatch.getAcceptedUsage().getKey() : nameUsageMatch.getUsage().getKey(), Field.Store.YES) ); + + // reduce the side of these indexes by removing the parsed name + doc.removeField(FIELD_PARSED_NAME_JSON); + writer.addDocument(doc); matchedCounter.incrementAndGet(); } else { @@ -680,7 +679,7 @@ public void run() { } private boolean isAccepted(String status) { - return status != null && !status.equals(TaxonomicStatus.ACCEPTED.name()); + return status != null && status.equals(TaxonomicStatus.ACCEPTED.name()); } } @@ -801,12 +800,6 @@ private void indexFile(String exportPath, String indexPath) throws Exception { mapper.writeValue(new File(indexPath + "/" + METADATA_JSON), metadata); } - class YourThreadFactory implements ThreadFactory { - public Thread newThread(Runnable r) { - return new Thread(r, "NameUsage-Indexing-taskThread"); - } - } - static class IndexingTask implements Runnable { private final IndexWriter writer; private final List nameUsages; @@ -846,6 +839,11 @@ private static void finishIndex(IndexWriter indexWriter) throws IOException { return Paths.get(indexPath); } + /** + * Generate the lucene document for a name usage + * @param nameUsage to convert to lucene document + * @return lucene document + */ protected static Document toDoc(NameUsage nameUsage) { Document doc = new Document(); @@ -853,18 +851,18 @@ protected static Document toDoc(NameUsage nameUsage) { Porting notes: The canonical name *sensu strictu* with nothing else but three name parts at most (genus, species, infraspecific). No rank or hybrid markers and no authorship, cultivar or strain information. 
Infrageneric names are represented without a - leading genus. Unicode characters are replaced by their matching ASCII characters." + leading genus. Unicode characters are replaced by their matching ASCII characters. */ - Rank rank = Rank.valueOf(nameUsage.getRank()); + Rank rank = Rank.valueOf(nameUsage.getRank()); Optional optCanonical = Optional.empty(); + ParsedName pn = null; + NomCode nomCode = null; try { - NomCode nomCode = null; if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) { nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode()); } - ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); - + pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); // canonicalMinimal will construct the name without the hybrid marker and authorship String canonical = NameFormatter.canonicalMinimal(pn); optCanonical = Optional.ofNullable(canonical); @@ -873,6 +871,25 @@ protected static Document toDoc(NameUsage nameUsage) { log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); } + if (pn != null){ + try { + // if there an authorship, reparse with it to get the component authorship parts + StoredParsedName storedParsedName = StringUtils.isBlank(nameUsage.getAuthorship()) ? + getStoredParsedName(pn) : constructParsedName(nameUsage, rank, nomCode); + // store the parsed name components in JSON + doc.add(new StoredField( + FIELD_PARSED_NAME_JSON, + MAPPER.writeValueAsString(storedParsedName)) + ); + } catch (UnparsableNameException | InterruptedException e) { + // do nothing + log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); + } catch ( JsonProcessingException e) { + // do nothing + log.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); + } + } + final String canonical = optCanonical.orElse(nameUsage.getScientificName()); // use custom precision step as we do not need range queries and prefer to save memory usage @@ -895,7 +912,9 @@ protected static Document toDoc(NameUsage nameUsage) { String nameComplete = nameUsage.getScientificName(); if (StringUtils.isNotBlank(nameUsage.getAuthorship())) { nameComplete += " " + nameUsage.getAuthorship(); + doc.add(new TextField(FIELD_AUTHORSHIP, nameUsage.getAuthorship(), Field.Store.YES)); } + doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES)); // this lucene index is not persistent, so not risk in changing ordinal numbers @@ -920,11 +939,60 @@ protected static Document toDoc(NameUsage nameUsage) { return doc; } - public static void consume(Supplier> cursorSupplier, Consumer handler) { - try (Cursor cursor = cursorSupplier.get()) { - cursor.forEach(handler); - } catch (IOException e) { - throw new RuntimeException(e); - } + @NotNull + private static StoredParsedName constructParsedName(NameUsage nameUsage, Rank rank, NomCode nomCode) throws UnparsableNameException, InterruptedException { + ParsedName pn = !StringUtils.isBlank(nameUsage.getAuthorship()) ? 
+ NameParsers.INSTANCE.parse(nameUsage.getScientificName() + " " + nameUsage.getAuthorship(), rank, nomCode) + : NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); + return getStoredParsedName(pn); + } + + @NotNull + private static StoredParsedName getStoredParsedName(ParsedName pn) { + StoredParsedName storedParsedName = new StoredParsedName(); + storedParsedName.setAbbreviated(pn.isAbbreviated()); + storedParsedName.setAutonym(pn.isAutonym()); + storedParsedName.setBinomial(pn.isBinomial()); + storedParsedName.setCandidatus(pn.isCandidatus()); + storedParsedName.setCultivarEpithet(pn.getCultivarEpithet()); + storedParsedName.setDoubtful(pn.isDoubtful()); + storedParsedName.setGenus(pn.getGenus()); + storedParsedName.setUninomial(pn.getUninomial()); + storedParsedName.setUnparsed(pn.getUnparsed()); + storedParsedName.setTrinomial(pn.isTrinomial()); + storedParsedName.setIncomplete(pn.isIncomplete()); + storedParsedName.setIndetermined(pn.isIndetermined()); + storedParsedName.setTerminalEpithet(pn.getTerminalEpithet()); + storedParsedName.setInfragenericEpithet(pn.getInfragenericEpithet()); + storedParsedName.setInfraspecificEpithet(pn.getInfraspecificEpithet()); + storedParsedName.setExtinct(pn.isExtinct()); + storedParsedName.setPublishedIn(pn.getPublishedIn()); + storedParsedName.setSanctioningAuthor(pn.getSanctioningAuthor()); + storedParsedName.setSpecificEpithet(pn.getSpecificEpithet()); + storedParsedName.setPhrase(pn.getPhrase()); + storedParsedName.setPhraseName(pn.isPhraseName()); + storedParsedName.setVoucher(pn.getVoucher()); + storedParsedName.setNominatingParty(pn.getNominatingParty()); + storedParsedName.setNomenclaturalNote(pn.getNomenclaturalNote()); + storedParsedName.setWarnings(pn.getWarnings()); + if (pn.getBasionymAuthorship() != null) { + storedParsedName.setBasionymAuthorship( + StoredParsedName.StoredAuthorship.builder() + .authors(pn.getBasionymAuthorship().getAuthors()) + .exAuthors(pn.getBasionymAuthorship().getExAuthors()) + .year(pn.getBasionymAuthorship().getYear()).build() + ); + } + if (pn.getCombinationAuthorship() != null) { + storedParsedName.setCombinationAuthorship( + StoredParsedName.StoredAuthorship.builder() + .authors(pn.getCombinationAuthorship().getAuthors()) + .exAuthors(pn.getCombinationAuthorship().getExAuthors()) + .year(pn.getCombinationAuthorship().getYear()).build() + ); + } + storedParsedName.setType(pn.getType() != null ? pn.getType().name() : null); + storedParsedName.setNotho(pn.getNotho() != null ? 
pn.getNotho().name() : null); + return storedParsedName; } } diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java index 08dec6c28..515d41a3c 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java @@ -55,7 +55,7 @@ @Service public class MatchingService { - @Value("${working.path:/tmp/}") + @Value("${working.dir:/tmp/}") protected String metadataFilePath; @Value("${online.dictionary.url:'https://rs.gbif.org/dictionaries/'}") @@ -174,7 +174,7 @@ public Optional getAPIMetadata(boolean regenerate) { File metadata = new File(metadataFilePath + "/index-metadata.json"); try { - if (!metadata.exists() || regenerate) { + if (regenerate || !metadata.exists()) { APIMetadata metadata1 = datasetIndex.getAPIMetadata(); //serialise to file ObjectMapper mapper = new ObjectMapper(); @@ -199,7 +199,6 @@ private static boolean isMatch(@Nullable NameUsageMatch match) { private static NameUsageMatch higherMatch(NameUsageMatch match, NameUsageMatch firstMatch) { match.getDiagnostics().setMatchType(MatchType.HIGHERRANK); - // FIXME addAlternatives(match, firstMatch.getDiagnostics().getAlternatives()); return match; } @@ -262,7 +261,9 @@ public List lookupJoins(String identifier){ * @return the list of matches */ public List matchID(String datasetID, String identifier){ - return datasetIndex.lookupIdentifier(datasetID, identifier); + List ids = datasetIndex.lookupIdentifier(datasetID, identifier); + List ancillary = datasetIndex.lookupAncillary(datasetID, identifier); + return ImmutableList.builder().addAll(ids).addAll(ancillary).build(); } public NameUsageMatch match( @@ -507,7 +508,12 @@ && getGenusOrAbove(parsedName) != null if (rank == null) { if (parsedName.isBinomial() || parsedName.isTrinomial() - || (parsedName.getRank() != null && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal())) { + || ( + parsedName.getRank() != null + && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal() + && parsedName.getEpithet(NamePart.SPECIFIC) != null //see https://github.com/CatalogueOfLife/data/issues/719 + ) + ) { rank = Rank.valueOf(parsedName.getRank().name()); } } @@ -577,6 +583,18 @@ && nextAboveGenusDiffers(classification, match1)) { // for strict matching do not try higher ranks if (isMatch(match1) || strict) { + // https://github.com/CatalogueOfLife/data/issues/719 + // this caters for the scenario where the Taxacrum sp. + // and the only sensible match is to a higher rank (genus) + if ( + isMatch(match1) + && parsedName != null + && parsedName.getRank() != null + && parsedName.getRank().ordinal() >= Rank.SPECIES.ordinal() + && parsedName.getEpithet(NamePart.SPECIFIC) == null + ){ + match1.getDiagnostics().setMatchType(MatchType.HIGHERRANK); + } return match1; } @@ -712,7 +730,9 @@ private List queryIndex(Rank rank, String canonicalName, boolean m -> { if (m.getDiagnostics().getMatchType() == MatchType.EXACT && rank == Rank.SPECIES_AGGREGATE - && m.getUsage().getRank() != Rank.SPECIES_AGGREGATE) { + && (m.getUsage().getRank() != Rank.SPECIES_AGGREGATE + || m.getAcceptedUsage().getRank() != Rank.SPECIES_AGGREGATE) + ) { log.info( "Species aggregate match found for {} {}. 
Ignore and prefer higher matches", m.getUsage().getRank(), diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java b/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java new file mode 100644 index 000000000..27fec73fc --- /dev/null +++ b/matching-ws/src/main/java/life/catalogue/matching/util/IUCNUtils.java @@ -0,0 +1,40 @@ +package life.catalogue.matching.util; + +public class IUCNUtils { + + public static String formatIucn(String original){ + if (original == null) { + return null; + } + // Trim the string + String trimmed = original.trim(); + // Convert to uppercase + String uppercased = trimmed.toUpperCase(); + // Replace any whitespace with a single underscore + return uppercased.replaceAll("\\s+", "_"); + } + + public enum IUCN { + EXTINCT("EX"), + EXTINCT_IN_THE_WILD("EW"), + CRITICALLY_ENDANGERED ("CR"), + ENDANGERED ("EN"), + VULNERABLE ("VU"), + NEAR_THREATENED ("NT"), + CONSERVATION_DEPENDENT ("CD"), + LEAST_CONCERN ("LC"), + DATA_DEFICIENT ("DD"), + NOT_EVALUATED ("NE"); + + private final String code; + + IUCN(String code) { + this.code = code; + } + + public String getCode() { + return code; + } + + } +} diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java index 0d59bca61..b6334e6fc 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java +++ b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java @@ -8,10 +8,12 @@ public class IndexConstants { public static final String FIELD_ACCEPTED_ID = "accid"; public static final String FIELD_CANONICAL_NAME = "canonical"; public static final String FIELD_SCIENTIFIC_NAME = "sciname"; + public static final String FIELD_AUTHORSHIP = "authorship"; public static final String FIELD_RANK = "rank"; public static final String FIELD_STATUS = "status"; public static final String FIELD_PARENT_ID = "parentId"; public static final String FIELD_NOMENCLATURAL_CODE = "nomcode"; + public static final String FIELD_PARSED_NAME_JSON = "parsedName"; public static final String FIELD_CATEGORY = "category"; public static final String FIELD_JOIN_ID = "joinId"; public static final String MAIN_INDEX_DIR = "main"; diff --git a/matching-ws/src/main/resources/datasets.json b/matching-ws/src/main/resources/datasets.json index 6c17a925c..afd996ec3 100644 --- a/matching-ws/src/main/resources/datasets.json +++ b/matching-ws/src/main/resources/datasets.json @@ -15,6 +15,7 @@ "title": "IPNI", "prefix": "urn:lsid:ipni.org:names:", "prefixMapping": [ + "ipni:", "https://www.ipni.org/n/" ], "removePrefixForMatching": true @@ -25,6 +26,7 @@ "title": "WoRMS", "prefix": "urn:lsid:marinespecies.org:taxname:", "prefixMapping": [ + "worms:", "http://marinespecies.org/data.php?id=", "https://marinespecies.org/data.php?id=", "https://www.marinespecies.org/aphia.php?p=taxdetails&id=" @@ -40,6 +42,31 @@ "key": "2041", "gbifKey": "de8934f4-a136-481c-a87a-b0b202b80a31", "title": "Dyntaxa. 
Svensk taxonomisk databas", - "prefix": "urn:lsid:dyntaxa.se:Taxon:" + "prefix": "urn:lsid:dyntaxa.se:Taxon:", + "prefixMapping": [ + "dyntaxa:" + ] + }, + { + "key": "2144", + "gbifKey": "9ca92552-f23a-41a8-a140-01abaa31c931", + "title": "ITIS", + "prefixMapping": [ + "itis:", + "tsn:", + "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=", + "https://marinespecies.org/data.php?id=", + "https://www.marinespecies.org/aphia.php?p=taxdetails&id=" + ] + }, + { + "key": "139831", + "title": "iNaturalist", + "prefix": "https://www.inaturalist.org/taxa/", + "prefixMapping": [ + "inat:", + "iNaturalist:", + "https://www.inaturalist.org/taxa/" + ] } ] diff --git a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java index 6e0112c83..cea714ce1 100644 --- a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java +++ b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingIT.java @@ -56,7 +56,7 @@ public static void buildMatcher() throws IOException { 1 ); - Dataset dataset = new Dataset(); + Dataset dataset = Dataset.builder().build(); dataset.setKey(1); dataset.setAlias("DUMMY_IDS"); dataset.setTitle("Dummy dataset for testing"); diff --git a/matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java new file mode 100644 index 000000000..3becebc9b --- /dev/null +++ b/matching-ws/src/test/java/life/catalogue/matching/IDMatchingTest.java @@ -0,0 +1,75 @@ +package life.catalogue.matching; + +import life.catalogue.matching.index.DatasetIndex; +import life.catalogue.matching.model.Dataset; + +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Optional; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class IDMatchingTest { + + @Test + public void testWithoutPrefix() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:ipni.org:names:") + .prefixMapping(List.of("urn:lsid:ipni.org:names:", "ipni:")) + .removePrefixForMatching(true) + .build(); + assertEquals(Optional.of("1"), DatasetIndex.extractKeyForSearch("ipni:1", dataset)); + } + + @Test + public void testWithoutPrefix2() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("gbif:") + .prefixMapping(List.of("gbif:")) + .removePrefixForMatching(true) + .build(); + assertEquals(Optional.of("1"), DatasetIndex.extractKeyForSearch("gbif:1", dataset)); + } + + @Test + public void testWithPrefix() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:marinespecies.org:taxname:") + .prefixMapping(List.of("worms:")) + .removePrefixForMatching(false) + .build(); + assertEquals(Optional.of("urn:lsid:marinespecies.org:taxname:1"), + DatasetIndex.extractKeyForSearch("worms:1", dataset) + ); + } + + @Test + public void testWithUnrecognisedPrefix() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:marinespecies.org:taxname:") + .prefixMapping(List.of("worms:")) + .removePrefixForMatching(false) + .build(); + assertEquals(Optional.empty(), + DatasetIndex.extractKeyForSearch("nonsense:1", dataset) + ); + } + + @Test + public void testWithUnrecognisedPrefix2() { + Dataset dataset = Dataset.builder() + .key(1) + .prefix("urn:lsid:marinespecies.org:taxname:") + .prefixMapping(List.of("worms:")) + .removePrefixForMatching(false) + .build(); + assertEquals(Optional.empty(), + DatasetIndex.extractKeyForSearch("1", dataset) + ); + } +} diff --git 
a/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java b/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java index 9f85865de..8d568c078 100644 --- a/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java +++ b/matching-ws/src/test/java/life/catalogue/matching/NameUsageBuilder.java @@ -40,12 +40,12 @@ public static NameUsageMatch newNameUsageMatch( String speciesKey) { NameUsageMatch m = NameUsageMatch.builder().diagnostics(NameUsageMatch.Diagnostics.builder().build()).build(); - m.setUsage(NameUsageMatch.RankedName.builder() + m.setUsage(NameUsageMatch.Usage.builder() .key(usageKey) .name(scientificName) .canonicalName(canonicalName) .rank(rank).build()); - m.setAcceptedUsage(NameUsageMatch.RankedName.builder().key(acceptedUsageKey).build()); + m.setAcceptedUsage(NameUsageMatch.Usage.builder().key(acceptedUsageKey).build()); m.getDiagnostics().setStatus(status); m.getDiagnostics().setConfidence(confidence); m.getDiagnostics().setNote(note);
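The parsed-name components travel between indexing and matching as JSON in the new FIELD_PARSED_NAME_JSON stored field: IndexingService.toDoc serialises a StoredParsedName into the Lucene document, and DatasetIndex.constructUsage deserialises it to populate the richer Usage model. A minimal round-trip sketch, assuming only the Jackson setting the patch itself applies; the name parts are hypothetical example values:

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

import life.catalogue.matching.model.StoredParsedName;

// Sketch of the JSON round trip behind FIELD_PARSED_NAME_JSON: what the indexer
// writes into the document is read back when a match result is constructed.
public class ParsedNameRoundTripSketch {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    // mirrors DatasetIndex.init(): tolerate fields written by newer versions
    mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

    StoredParsedName stored = StoredParsedName.builder()
        .genus("Taxacrum")            // hypothetical name parts
        .specificEpithet("example")
        .build();

    String json = mapper.writeValueAsString(stored);                 // value stored in FIELD_PARSED_NAME_JSON
    StoredParsedName readBack = mapper.readValue(json, StoredParsedName.class);
    System.out.println(readBack.getGenus() + " " + readBack.getSpecificEpithet());
  }
}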