From 840687a8b8414612b993f319c5b64dcc7bc87159 Mon Sep 17 00:00:00 2001
From: waterflow80 <thessalonikaathena@outlook.com>
Date: Tue, 30 May 2023 14:04:11 +0100
Subject: [PATCH 1/6] retrieve fna file function

---
 .../AssemblySequenceDataSource.java           |  12 ++
 .../NCBIAssemblySequenceDataSource.java       | 105 ++++++++++++++++++
 .../ebi/eva/contigalias/dus/NCBIBrowser.java  |  11 ++
 .../dus2/AssemblySequenceReader.java          |  39 +++++++
 .../dus2/NCBIAssemblySequenceReader.java      |  27 +++++
 .../NCBIAssemblySequenceReaderFactory.java    |  18 +++
 .../entities/AssemblySequenceEntity.java      |  23 ++++
 .../AssemblySequenceNotFoundException.java    |   7 ++
 .../DuplicateAssemblySequenceException.java   |   8 ++
 .../repo/AssemblySequenceRepository.java      |  14 +++
 .../service/AssemblySequenceService.java      |  80 +++++++++++++
 .../eva/contigalias/utils/GzipCompress.java   |  52 +++++++++
 12 files changed, 396 insertions(+)
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java

diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java
new file mode 100644
index 00000000..3a4d5b46
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java
@@ -0,0 +1,12 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.IOException;
+import java.util.Optional;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
+
+public interface AssemblySequenceDataSource {
+    /** Returns the sequence entity for {@code accession} wrapped in an Optional (presumably empty when nothing is found). */
+    Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException;
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java
new file mode 100644
index 00000000..d76741cf
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java
@@ -0,0 +1,105 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Optional;
+
+import org.apache.commons.net.ftp.FTPFile;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.retry.annotation.Backoff;
+import org.springframework.retry.annotation.Retryable;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReader;
+import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReaderFactory;
+import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
+import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
+import uk.ac.ebi.eva.contigalias.utils.GzipCompress;
+
+@Repository("NCBISequenceDataSource")
+public class NCBIAssemblySequenceDataSource implements AssemblySequenceDataSource{
+
+    private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequenceDataSource.class);
+
+    private final NCBIBrowserFactory factory;
+
+    private final NCBIAssemblySequenceReaderFactory readerFactory;
+
+    @Value("${asm.file.download.dir}")
+    private String asmFileDownloadDir;
+
+    @Autowired
+    public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory,
+                                          NCBIAssemblySequenceReaderFactory readerFactory){
+        this.factory = factory;
+        this.readerFactory = readerFactory;
+    }
+
+    /** Downloads, decompresses and parses the genomic fna file of {@code accession}; empty when any step yields nothing. */
+    @Override
+    public Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException, IllegalArgumentException {
+        NCBIBrowser ncbiBrowser = factory.build();
+        ncbiBrowser.connect();
+        // Everything after connect() runs inside try/finally so the early returns also release the FTP connection.
+        try {
+            Optional<Path> downloadFilePath = downloadAssemblySequence(accession, ncbiBrowser);
+            if (!downloadFilePath.isPresent()) {
+                return Optional.empty();
+            }
+            logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath.get());
+            // Uncompress the .gz file
+            Optional<Path> uncompressedFilePath = new GzipCompress().unzip(downloadFilePath.get().toString(), asmFileDownloadDir);
+            if (!uncompressedFilePath.isPresent()){
+                return Optional.empty();
+            }
+            try (InputStream stream = new FileInputStream(uncompressedFilePath.get().toFile())){
+                NCBIAssemblySequenceReader reader = readerFactory.build(stream);
+                AssemblySequenceEntity assemblySequenceEntity = reader.getAssemblySequenceEntity();
+                //TODO : The logger info will be changed when we add more attributes to the entity and we parse the whole file info
+                if (assemblySequenceEntity == null) { return Optional.empty(); } // the reader produced no entity
+                logger.info("NCBI: Name of the sequence in " + accession + " : " + assemblySequenceEntity.getName());
+                return Optional.of(assemblySequenceEntity);
+            }
+        } finally {
+            try {
+                ncbiBrowser.disconnect();
+            } catch (IOException e) {
+                logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")", e);
+            }
+        }
+    }
+
+    /**
+     * Downloads the assembly genomic fna (fasta) .gz file of the given accession into the
+     * directory configured by asm.file.download.dir and returns the local path of that file.
+     * NOTE(review): the in-class call from getAssemblySequenceByAccession does not go through the
+     * Spring proxy, so this @Retryable is presumably not applied on that path - confirm. */
+    @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
+    public Optional<Path> downloadAssemblySequence(String accession, NCBIBrowser ncbiBrowser) throws IOException {
+        // The same directory as the report file
+        Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession);
+
+        if (!directory.isPresent()) {
+            return Optional.empty();
+        }
+
+        logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
+        FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
+        String ftpFilePath = directory.get() + ftpFile.getName();
+        Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
+        boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
+        if (success) {
+            logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
+            return Optional.of(downloadFilePath);
+        } else {
+            logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
+            return Optional.empty();
+        }
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
index 30ea4f73..fcb1f8e7 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
@@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient {
 
     public static final String PATH_GENOMES_ALL = "/genomes/all/";
 
+
     private String ftpProxyHost;
 
     private Integer ftpProxyPort;
@@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String directoryPath) throws IOExceptio
         return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath));
     }
 
+    /** Returns the first file listed in {@code directoryPath} whose name contains "genomic.fna.gz" but not
+     * "from"; throws AssemblyNotFoundException when no such file is listed. */
+    public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException {
+        return Arrays.stream(super.listFiles(directoryPath))
+                     .filter(file -> file.getName().contains("genomic.fna.gz"))
+                     .filter(file -> !file.getName().contains("from"))
+                     .findFirst()
+                     .orElseThrow(() -> new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath));
+    }
+
 }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java
new file mode 100644
index 00000000..0b107042
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java
@@ -0,0 +1,39 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
+
+public abstract class AssemblySequenceReader {
+    // Buffered wrapper around the InputStreamReader passed to the constructor.
+    protected final BufferedReader reader;
+    // Value handed out by getAssemblySequenceEntity(); this class itself never assigns it.
+    protected AssemblySequenceEntity assemblySequenceEntity;
+    // Read by getAssemblySequenceEntity(); never set to true here - presumably parseFile() implementations do that.
+    protected boolean fileParsed = false;
+    /** Wraps {@code inputStreamReader} in a BufferedReader kept in the reader field. */
+    public AssemblySequenceReader(InputStreamReader inputStreamReader){
+        this.reader = new BufferedReader(inputStreamReader);
+    }
+
+    /** Calls parseFile() unless the file is flagged as parsed and an entity exists, then returns the entity (may be null). */
+    public AssemblySequenceEntity getAssemblySequenceEntity() throws IOException {
+        boolean alreadyParsed = fileParsed && assemblySequenceEntity != null;
+        if (!alreadyParsed) {
+            parseFile();
+        }
+        return assemblySequenceEntity;
+    }
+
+    /** Hook invoked by getAssemblySequenceEntity() to parse the underlying file. */
+    protected abstract void parseFile() throws IOException, NullPointerException;
+    /** Hook for handling a single line of the file. */
+    protected abstract void parseAssemblySequenceEntity(String line);
+
+    /** Delegates to the ready() method of the wrapped BufferedReader. */
+    public boolean ready() throws IOException {
+        return reader.ready();
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java
new file mode 100644
index 00000000..12e01689
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java
@@ -0,0 +1,27 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+public class NCBIAssemblySequenceReader extends AssemblySequenceReader{
+    /** Builds a reader over the given stream reader by passing it to the superclass constructor. */
+    public NCBIAssemblySequenceReader(InputStreamReader inputStreamReader){
+        super(inputStreamReader);
+    }
+
+    /** Currently only verifies that a reader is available; the actual fna parsing is not written yet. */
+    @Override
+    protected void parseFile() throws IOException, NullPointerException {
+        java.util.Objects.requireNonNull(
+                reader, "Cannot use AssemblySequenceReader without having a valid InputStreamReader.");
+        // TODO: extract the .gz file and parse the fna file here
+    }
+
+    /** Placeholder for parsing one line (entry) of the file; it has no implementation yet. */
+    @Override
+    protected void parseAssemblySequenceEntity(String line) {
+        // TODO: parse a single line of the file (an entry) here.
+        // NOTE: this method might turn out not to be useful, since the file only contains
+        // text plus '>' separators that delimit one sequence from the next.
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java
new file mode 100644
index 00000000..06867aba
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java
@@ -0,0 +1,18 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class NCBIAssemblySequenceReaderFactory {
+    /** Wraps the raw stream in an InputStreamReader (platform default charset) and delegates to the overload below. */
+    public NCBIAssemblySequenceReader build(InputStream inputStream){
+        return build(new InputStreamReader(inputStream));
+    }
+    /** Creates a reader over an already constructed InputStreamReader. */
+    public NCBIAssemblySequenceReader build(InputStreamReader inputStreamReader){
+        return new NCBIAssemblySequenceReader(inputStreamReader);
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java
new file mode 100644
index 00000000..c1a58894
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java
@@ -0,0 +1,23 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.Id;
+import javax.persistence.Table;
+
+import lombok.Getter;
+import lombok.Setter;
+
+@Setter
+@Getter
+@Table(name = "AssemblySequence")
+@Entity
+public class AssemblySequenceEntity {
+    // Primary key of the AssemblySequence table.
+    @Id
+    @Column(nullable = false)
+    private String accession;
+    // Mandatory (non-nullable) name column.
+    @Column(nullable = false)
+    private String name;
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java
new file mode 100644
index 00000000..03deecb9
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java
@@ -0,0 +1,7 @@
+package uk.ac.ebi.eva.contigalias.exception;
+
+public class AssemblySequenceNotFoundException extends RuntimeException{ // unchecked; the message embeds the accession
+    public AssemblySequenceNotFoundException(String accession) {
+        super(String.format("No assembly sequence corresponding to accession %s could be found", accession));
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java
new file mode 100644
index 00000000..f382e62f
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java
@@ -0,0 +1,8 @@
+package uk.ac.ebi.eva.contigalias.exception;
+
+public class DuplicateAssemblySequenceException extends RuntimeException{
+    /** Creates the exception with {@code msg} as its detail message. */
+    public DuplicateAssemblySequenceException(String msg){
+        super(msg);
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java
new file mode 100644
index 00000000..6eb6fa01
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java
@@ -0,0 +1,14 @@
+package uk.ac.ebi.eva.contigalias.repo;
+
+import java.util.Optional;
+
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
+
+@Repository
+public interface AssemblySequenceRepository extends JpaRepository<AssemblySequenceEntity, String> {
+    /** Derived query: looks up the entity whose accession equals {@code accession}. */
+    Optional<AssemblySequenceEntity> findAssemblySequenceEntityByAccession(String accession);
+    // The ID type parameter is String because the entity's @Id field (accession) is a String.
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java
new file mode 100644
index 00000000..5dfd917a
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java
@@ -0,0 +1,80 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import java.io.IOException;
+import java.util.Optional;
+
+import javax.transaction.Transactional;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequenceDataSource;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
+import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
+import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequenceRepository;
+
+@Service
+public class AssemblySequenceService {
+
+    private final AssemblySequenceRepository repository;
+
+    private final NCBIAssemblySequenceDataSource ncbiSequenceDataSource;
+
+    private final Logger logger = LoggerFactory.getLogger(AssemblySequenceService.class);
+
+    public AssemblySequenceService(
+            AssemblySequenceRepository repository, NCBIAssemblySequenceDataSource ncbiSequenceDataSource){
+        this.repository = repository;
+        this.ncbiSequenceDataSource = ncbiSequenceDataSource;
+    }
+
+    /**
+     * Fetches the assembly sequence of {@code accession} from the NCBI data source and saves it.
+     * Throws DuplicateAssemblySequenceException when the accession is already stored and
+     * AssemblySequenceNotFoundException when the data source returns nothing. */
+    public void fetchAndInsertAssemblySequence(String accession) throws IOException {
+        Optional<AssemblySequenceEntity> entity = repository.findAssemblySequenceEntityByAccession(accession);
+        if (entity.isPresent()) {
+            throw duplicateAssemblySequenceInsertionException(accession, entity.get());
+        }
+        Optional<AssemblySequenceEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequenceByAccession(accession);
+        if(!fetchAssembly.isPresent()){
+            throw new AssemblySequenceNotFoundException(accession);
+        }
+        if (fetchAssembly.get().getName() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity
+            insertAssemblySequence(fetchAssembly.get());
+            logger.info("Successfully inserted assembly for accession " + accession);
+        }else {
+            logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession);
+        }
+    }
+
+    /** Saves {@code entity} unless one with the same accession is already stored, in which case it throws. */
+    @Transactional
+    public void insertAssemblySequence(AssemblySequenceEntity entity) {
+        if (isEntityPresent(entity)) {
+            throw duplicateAssemblySequenceInsertionException(null, entity);
+        }
+        repository.save(entity);
+    }
+
+    /** Tells whether the repository already contains an entity with the accession of {@code entity}. */
+    private boolean isEntityPresent(AssemblySequenceEntity entity) {
+        // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY
+        return repository.findAssemblySequenceEntityByAccession(entity.getAccession()).isPresent();
+    }
+
+    /** Builds (does not throw) the duplicate exception; either argument may be null and is then left out. */
+    private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequenceEntity present) {
+        StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists");
+        if (accession != null){
+            exception.append("\n").append("Assembly Sequence trying to insert:").append("\t").append(accession);
+        }
+        if (present != null){
+            // The entity declares no toString(), so report its accession rather than an object identity hash.
+            exception.append("\n").append("Assembly Sequence already present").append("\t").append(present.getAccession());
+        }
+        return new DuplicateAssemblySequenceException(exception.toString());
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java
new file mode 100644
index 00000000..455c0582
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java
@@ -0,0 +1,52 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Optional;
+import java.util.zip.GZIPInputStream;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class GzipCompress {
+
+    private final Logger logger = LoggerFactory.getLogger(GzipCompress.class);
+
+    /**
+     * Decompress (unzip) the .gz file at compressedFilePath into outputDirPath, always under the
+     * fixed name "genome_sequence.fna". NOTE(review): the streams are not closed if an
+     * IOException occurs mid-copy, and the caught exception is not logged - worth fixing.
+     * @return The output (decompressed) file path, or an empty Optional on IOException */
+    public Optional<Path> unzip(String compressedFilePath, String outputDirPath) {
+        String outputFileName = "genome_sequence.fna";
+        String decompressedFilePath = outputDirPath + "/" + outputFileName;
+
+        byte[] buffer = new byte[1024];
+
+        try {
+            FileInputStream fileIn = new FileInputStream(compressedFilePath);
+            GZIPInputStream gzipInputStream = new GZIPInputStream(fileIn);
+            FileOutputStream fileOutputStream = new FileOutputStream(decompressedFilePath);
+
+            int bytes_read;
+
+            while ((bytes_read = gzipInputStream.read(buffer)) > 0) {
+                fileOutputStream.write(buffer, 0, bytes_read);
+            }
+            gzipInputStream.close();
+            fileOutputStream.close();
+            logger.info("File " + compressedFilePath + " was decompressed successfully");
+            Path outputFilePath = Paths.get(outputDirPath, outputFileName);
+            return Optional.of(outputFilePath);
+        } catch (
+                IOException e) {
+            logger.error("Could not find or read file !!");
+            return Optional.empty();
+        }
+
+    }
+}

From 724b91a9dc4e72ab6ed67470f3b218afea5bf42f Mon Sep 17 00:00:00 2001
From: waterflow80 <thessalonikaathena@outlook.com>
Date: Tue, 30 May 2023 14:12:29 +0100
Subject: [PATCH 2/6] adding pom.xml

---
 pom.xml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pom.xml b/pom.xml
index f63b0907..689a2619 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,6 +35,8 @@
         <java.version>8</java.version>
     </properties>
 
+
+
     <dependencies>
         <dependency>
             <groupId>org.springframework.boot</groupId>
@@ -147,6 +149,13 @@
             <version>1.2.5.RELEASE</version>
         </dependency>
 
+            <dependency>
+                <groupId>org.projectlombok</groupId>
+                <artifactId>lombok</artifactId>
+                <version>1.18.28</version>
+                <scope>provided</scope>
+            </dependency>
+
     </dependencies>
 
     <build>

From acfad8a3a3b82ca004a37151c81baad50063df02 Mon Sep 17 00:00:00 2001
From: waterflow80 <thessalonikaathena@outlook.com>
Date: Thu, 1 Jun 2023 12:04:41 +0100
Subject: [PATCH 3/6] assembly-sequences-fasta

---
 .../AssemblySequencesDataSource.java          |  13 +++
 .../NCBIAssemblySequencesDataSource.java      | 107 ++++++++++++++++++
 .../dus2/AssemblySequencesReader.java         |  44 +++++++
 .../dus2/NCBIAssemblySequencesReader.java     |  61 ++++++++++
 .../NCBIAssemblySequencesReaderFactory.java   |  18 +++
 .../entities/AssemblySequencesEntity.java     |  37 ++++++
 .../eva/contigalias/entities/Sequence.java    |  34 ++++++
 .../repo/AssemblySequencesRepository.java     |  14 +++
 .../contigalias/repo/SequenceRepository.java  |   7 ++
 .../service/AssemblySequencesService.java     |  81 +++++++++++++
 .../eva/contigalias/utils/GzipCompress.java   |   2 +-
 .../ebi/eva/contigalias/utils/MD5Digest.java  |  20 ++++
 .../NCBIAssemblySequencesDataSourceTest.java  |  55 +++++++++
 .../dus2/NCBIAssemblySequencesReaderTest.java |  67 +++++++++++
 .../service/AssemblySequencesServiceTest.java |  44 +++++++
 .../contigalias/utils/GzipCompressTest.java   |  18 +++
 .../eva/contigalias/utils/MD5DigestTest.java  |  18 +++
 17 files changed, 639 insertions(+), 1 deletion(-)
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java
 create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
 create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
 create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
 create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
 create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java

diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java
new file mode 100644
index 00000000..f3a12e03
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java
@@ -0,0 +1,13 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+
+public interface AssemblySequencesDataSource {
+    /** Returns the sequences entity for {@code accession} wrapped in an Optional (presumably empty when nothing is found). */
+    Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException;
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
new file mode 100644
index 00000000..211ab422
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
@@ -0,0 +1,107 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import org.apache.commons.net.ftp.FTPFile;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.retry.annotation.Backoff;
+import org.springframework.retry.annotation.Retryable;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader;
+import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory;
+import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
+import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.utils.GzipCompress;
+
+@Repository("NCBISequenceDataSource")
+public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource {
+    // NOTE(review): NCBIAssemblySequenceDataSource declares this same bean name; two beans sharing a name clash at startup - confirm the older class is removed.
+    private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class);
+
+    private final NCBIBrowserFactory factory;
+
+    private final NCBIAssemblySequencesReaderFactory readerFactory;
+
+    @Value("${asm.file.download.dir}")
+    private String asmFileDownloadDir;
+
+    @Autowired
+    public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory,
+                                           NCBIAssemblySequencesReaderFactory readerFactory){
+        this.factory = factory;
+        this.readerFactory = readerFactory;
+    }
+
+    /** Downloads, decompresses and parses the genomic fna file of {@code accession}; empty when a step yields nothing. */
+    @Override
+    public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException {
+        NCBIBrowser ncbiBrowser = factory.build();
+        ncbiBrowser.connect();
+        Optional<Path> downloadFilePath = Optional.empty();
+        Optional<Path> fastaFilePath = Optional.empty();
+        try { // the finally below also covers the early returns, so the connection and temporary files are always released
+            downloadFilePath = downloadAssemblySequences(accession, ncbiBrowser);
+            if (!downloadFilePath.isPresent()) {
+                return Optional.empty();
+            }
+            logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath.get());
+            fastaFilePath = new GzipCompress().unzip(downloadFilePath.get().toString(), asmFileDownloadDir);
+            if (!fastaFilePath.isPresent()){
+                return Optional.empty();
+            }
+            try (InputStream stream = new FileInputStream(fastaFilePath.get().toFile())){
+                NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession);
+                AssemblySequencesEntity assemblySequencesEntity = reader.getAssemblySequenceEntity();
+                logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" );
+                return Optional.of(assemblySequencesEntity);
+            }
+        } finally {
+            try {
+                ncbiBrowser.disconnect();
+                if (downloadFilePath.isPresent()) { Files.deleteIfExists(downloadFilePath.get()); }
+                if (fastaFilePath.isPresent()) { Files.deleteIfExists(fastaFilePath.get()); } // Deleting the fasta file
+            } catch (IOException e) {
+                logger.warn("Error while trying to disconnect or clean up files - ncbiBrowser (assembly: " + accession + ")", e);
+            }
+        }
+    }
+
+    /**
+     * Downloads the assembly genomic fna (fasta) .gz file of the given accession into the
+     * directory configured by asm.file.download.dir and returns the local path of that file.
+     * NOTE(review): the in-class call from getAssemblySequencesByAccession does not go through the
+     * Spring proxy, so this @Retryable is presumably not applied on that path - confirm. */
+    @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
+    public Optional<Path> downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException {
+        // The same directory as the report file
+        Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession);
+
+        if (!directory.isPresent()) {
+            return Optional.empty();
+        }
+
+        logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
+        FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
+        String ftpFilePath = directory.get() + ftpFile.getName();
+        Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
+        boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
+        if (success) {
+            logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
+            return Optional.of(downloadFilePath);
+        } else {
+            logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
+            return Optional.empty();
+        }
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java
new file mode 100644
index 00000000..c7a974bb
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java
@@ -0,0 +1,44 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.security.NoSuchAlgorithmException;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+
+public abstract class AssemblySequencesReader {
+
+    protected final BufferedReader reader;
+
+    protected final String accession;
+
+    /** Result of parsing; null until parseFile() has produced it. */
+    protected AssemblySequencesEntity assemblySequencesEntity;
+
+    /** Flag consulted by getAssemblySequenceEntity() to decide whether parseFile() still has to run. */
+    protected boolean fileParsed = false;
+
+    public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession){
+        this.accession = accession;
+        this.reader = new BufferedReader(inputStreamReader);
+    }
+
+    /** Return the entity, invoking parseFile() first unless a parsed result is already available. */
+    public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException {
+        if (!(fileParsed && assemblySequencesEntity != null)) {
+            parseFile();
+        }
+        return assemblySequencesEntity;
+    }
+
+    /** Parse the content behind the reader; implemented by subclasses. */
+    protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException;
+
+    /** Handle a single line of the input; implemented by subclasses. */
+    protected abstract void parseAssemblySequenceEntity(String line);
+
+    public boolean ready() throws IOException {
+        return reader.ready();
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
new file mode 100644
index 00000000..b979a8eb
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
@@ -0,0 +1,61 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.security.NoSuchAlgorithmException;
+import java.util.LinkedList;
+import java.util.List;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+import uk.ac.ebi.eva.contigalias.utils.MD5Digest;
+
+public class NCBIAssemblySequencesReader extends AssemblySequencesReader {
+
+    public NCBIAssemblySequencesReader(InputStreamReader inputStreamReader, String accession){
+        super(inputStreamReader, accession);
+    }
+
+    @Override
+    protected void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException {
+        if (reader == null){
+            throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader.");
+        }
+        MD5Digest md5Digest = new MD5Digest();
+        if (assemblySequencesEntity == null){
+            assemblySequencesEntity = new AssemblySequencesEntity();
+        }
+        // Setting the accession of the whole assembly file
+        assemblySequencesEntity.setInsdcAccession(accession);
+        List<Sequence> sequences = new LinkedList<>();
+        String line = reader.readLine();
+        while (line != null){
+            if (!line.startsWith(">")){
+                // Text before the first header (e.g. a blank line): skip it, or this loop would never advance
+                line = reader.readLine();
+                continue;
+            }
+            // The name ends at the first space; a header without description has none, so take the whole line
+            int nameEnd = line.indexOf(' ');
+            Sequence sequence = new Sequence();
+            sequence.setRefseq(nameEnd < 0 ? line.substring(1) : line.substring(1, nameEnd));
+            StringBuilder sequenceValue = new StringBuilder();
+            while ((line = reader.readLine()) != null && !line.startsWith(">")){
+                sequenceValue.append(line); // sequence lines are concatenated before hashing
+            }
+            sequence.setSequenceMD5(md5Digest.hash(sequenceValue.toString()));
+            sequences.add(sequence);
+        }
+        assemblySequencesEntity.setSequences(sequences);
+        fileParsed = true;
+        reader.close();
+    }
+
+    @Override
+    // Parsing a line of the file
+    protected void parseAssemblySequenceEntity(String line) {
+        // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY)
+        // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY
+        // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java
new file mode 100644
index 00000000..a727bea1
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java
@@ -0,0 +1,18 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class NCBIAssemblySequencesReaderFactory {
+    /** Wrap the raw stream in a UTF-8 reader, so decoding does not depend on the platform default charset. */
+    public NCBIAssemblySequencesReader build(InputStream inputStream, String accession){
+        return build(new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8), accession);
+    }
+
+    public NCBIAssemblySequencesReader build(InputStreamReader inputStreamReader, String accession){
+        return new NCBIAssemblySequencesReader(inputStreamReader, accession);
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
new file mode 100644
index 00000000..5875b48d
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
@@ -0,0 +1,37 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+
+import java.util.List;
+
+import javax.persistence.CascadeType;
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.Id;
+import javax.persistence.JoinColumn;
+import javax.persistence.OneToMany;
+import javax.persistence.Table;
+
+import io.swagger.annotations.ApiModelProperty;
+import lombok.Getter;
+import lombok.Setter;
+import org.hibernate.annotations.LazyCollection;
+import org.hibernate.annotations.LazyCollectionOption;
+
+@Setter
+@Getter
+@Table(name = "AssemblySequences")
+@Entity
+public class AssemblySequencesEntity {
+    // JPA entity stored in the "AssemblySequences" table: one assembly together with its sequences.
+    @Id
+    @Column(nullable = false)
+    @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.")
+    private String insdcAccession;
+
+    // Fetched eagerly (LazyCollectionOption.FALSE); all persistence operations cascade to the sequences.
+    @ApiModelProperty(value = "List of all sequences of the assembly.")
+    @LazyCollection(LazyCollectionOption.FALSE)
+    @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL)
+    @JoinColumn(name = "insdcAccession", referencedColumnName = "insdcAccession")
+    private List<Sequence> sequences;
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
new file mode 100644
index 00000000..728b5987
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
@@ -0,0 +1,34 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+import javax.persistence.CascadeType;
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.Id;
+import javax.persistence.JoinColumn;
+import javax.persistence.ManyToOne;
+import javax.persistence.Table;
+
+import io.swagger.annotations.ApiModelProperty;
+import lombok.Getter;
+import lombok.Setter;
+
+
+@Getter
+@Setter
+@Entity
+@Table(name = "Sequence")
+public class Sequence {
+    // JPA entity stored in the "Sequence" table; refseq is its primary key.
+
+    @Id
+    @Column(nullable = false)
+    @ApiModelProperty(value = "Assembly's Refseq accession.")
+    private String refseq;
+
+    @Column
+    @ApiModelProperty(value = "Sequence's MD5 checksum value.")
+    private String sequenceMD5;
+
+
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
new file mode 100644
index 00000000..0992b3c3
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
@@ -0,0 +1,14 @@
+package uk.ac.ebi.eva.contigalias.repo;
+
+import java.util.Optional;
+
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+
+@Repository
+public interface AssemblySequencesRepository extends JpaRepository<AssemblySequencesEntity, String> {
+    // Finder keyed on the insdcAccession property (presumably resolved by Spring Data from the method name).
+    Optional<AssemblySequencesEntity> findAssemblySequenceEntityByInsdcAccession(String accession);
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
new file mode 100644
index 00000000..ba9164b0
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
@@ -0,0 +1,7 @@
+package uk.ac.ebi.eva.contigalias.repo;
+
+import org.springframework.data.jpa.repository.JpaRepository;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+public interface SequenceRepository extends JpaRepository<Sequence, String> { // String = type of Sequence's @Id field (refseq)
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
new file mode 100644
index 00000000..5da37b27
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
@@ -0,0 +1,81 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import javax.transaction.Transactional;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
+import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+@Service
+public class AssemblySequencesService {
+
+    private final AssemblySequencesRepository repository;
+
+    private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource;
+
+    // Category is this class, so log lines are attributed to AssemblySequencesService
+    private final Logger logger = LoggerFactory.getLogger(AssemblySequencesService.class);
+
+    public AssemblySequencesService(
+            AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequenceDataSource){
+        this.repository = repository;
+        this.ncbiSequenceDataSource = ncbiSequenceDataSource;
+    }
+
+    /**
+     * Fetch the sequences of the given assembly through the NCBI data source and store them.
+     * @throws DuplicateAssemblySequenceException if the accession is already stored
+     * @throws AssemblySequenceNotFoundException if the data source returned nothing for the accession */
+    public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException {
+        Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByInsdcAccession(accession);
+        if(entity.isPresent())
+            throw duplicateAssemblySequenceInsertionException(accession, entity.get());
+        Optional<AssemblySequencesEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequencesByAccession(accession);
+        if(!fetchAssembly.isPresent()){
+            throw new AssemblySequenceNotFoundException(accession);
+        }
+        if (fetchAssembly.get().getInsdcAccession() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity
+            insertAssemblySequence(fetchAssembly.get());
+            logger.info("Successfully inserted assembly for accession " + accession);
+        }else {
+            logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession);
+        }
+    }
+
+    /** Save the entity, refusing to overwrite one already stored under the same INSDC accession. */
+    @Transactional
+    public void insertAssemblySequence(AssemblySequencesEntity entity) {
+        if (isEntityPresent(entity)) {
+            throw duplicateAssemblySequenceInsertionException(null, entity);
+        }
+        repository.save(entity);
+    }
+
+    private boolean isEntityPresent(AssemblySequencesEntity entity) {
+        // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY
+        return repository.findAssemblySequenceEntityByInsdcAccession(entity.getInsdcAccession()).isPresent();
+    }
+
+    /** Build (not throw) the duplicate-insertion exception; either argument may be null and is then omitted. */
+    private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequencesEntity present) {
+        StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists");
+        if (accession != null){
+            exception.append("\n").append("Assembly Sequence trying to insert:").append("\t").append(accession);
+        }
+        if (present != null){
+            // The entity defines no toString(), so report its accession instead of the object identity
+            exception.append("\n").append("Assembly Sequence already present").append("\t")
+                    .append(present.getInsdcAccession());
+        }
+        return new DuplicateAssemblySequenceException(exception.toString());
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java
index 455c0582..a8aecd49 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java
@@ -19,7 +19,7 @@ public class GzipCompress {
     /**
      * Decompress (Unzip) a .gz file and save the output file in the same
      * input file's location.
-     * The output file's name will be the same as the input's but without '.gz'
+     * The output file's name will be genome_sequence.fna
      * @return The output (decompressed) file path*/
     public Optional<Path> unzip(String compressedFilePath, String outputDirPath) {
         String outputFileName = "genome_sequence.fna";
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java
new file mode 100644
index 00000000..72a3c0ee
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java
@@ -0,0 +1,20 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+import javax.xml.bind.DatatypeConverter;
+
+public class MD5Digest {
+
+    /**
+     * Return the lowercase hex MD5 digest of the UTF-8 bytes of the text (the charset is pinned
+     * so the hash is the same on every platform). */
+    public String hash(String text) throws NoSuchAlgorithmException {
+        MessageDigest md = MessageDigest.getInstance("MD5");
+        md.update(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+        byte[] digest = md.digest();
+        // printHexBinary emits uppercase hex; lower-case it once, independent of the default locale
+        return DatatypeConverter.printHexBinary(digest).toLowerCase(java.util.Locale.ROOT);
+    }
+}
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
new file mode 100644
index 00000000..d1305371
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
@@ -0,0 +1,55 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+import static org.junit.jupiter.api.Assertions.*;
+@SpringBootTest
+class NCBIAssemblySequencesDataSourceTest {
+
+
+    @Autowired
+    NCBIAssemblySequencesDataSource dataSource;
+
+    @BeforeEach
+    void setUp() {
+    }
+
+    @AfterEach
+    void tearDown() {
+    }
+
+    @Test
+    void getAssemblySequenceByAccession() throws IOException, NoSuchAlgorithmException, InterruptedException {
+        String accession = "GCF_000001765.3";
+        //String accession2 = "GCF_000001405.31";
+        Optional<AssemblySequencesEntity> entity = dataSource.getAssemblySequencesByAccession(accession);
+        //displayAssemblySequencesEntityContent(entity.get());
+        // Fail with a readable message instead of NoSuchElementException when nothing was fetched
+        assertTrue(entity.isPresent(), "No assembly sequences fetched for " + accession);
+        assertEquals(accession, entity.get().getInsdcAccession());
+    }
+
+    void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity) throws InterruptedException {
+        System.out.println("ACCESSION: " + entity.getInsdcAccession());
+        System.out.println("TOTAL NUMBER OF SEQUENCES: " + entity.getSequences().size());
+        for (Sequence s: entity.getSequences()){
+            System.out.print("REFSEQ: " + s.getRefseq() + " | ");
+            System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5());
+            Thread.sleep(1000); // Just for lazy and fun display :)
+        }
+    }
+
+    @Test
+    void downloadAssemblySequence() {
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
new file mode 100644
index 00000000..b652ea13
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
@@ -0,0 +1,67 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+@SpringBootTest
+class NCBIAssemblySequencesReaderTest {
+
+    private static final String ACCESSION = "GCF_000001765.3";
+
+    // Uncompressed FASTA file that setUp() opens; it has to exist locally before the tests run
+    private static final String FASTA_FILE_PATH = "/tmp/genome_sequence.fna";
+    private InputStream stream;
+    private InputStreamReader streamReader;
+
+    @Autowired
+    private NCBIAssemblySequencesReaderFactory readerFactory;
+
+    private NCBIAssemblySequencesReader reader;
+
+    @BeforeEach
+    void setUp() throws FileNotFoundException {
+        stream = new FileInputStream(FASTA_FILE_PATH);
+        streamReader = new InputStreamReader(stream);
+        reader = readerFactory.build(streamReader, ACCESSION);
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        stream.close();
+        streamReader.close();
+    }
+
+    @Test
+    void getAssemblySequencesReader() throws IOException {
+        assertTrue(reader.ready());
+    }
+
+    @Test
+    void assertParsedFastaFileValid() throws IOException, NoSuchAlgorithmException {
+        reader.parseFile();
+        AssemblySequencesEntity parsed = reader.assemblySequencesEntity;
+        displayAssemblySequencesEntityContent(parsed);
+        assertEquals(ACCESSION, parsed.getInsdcAccession());
+    }
+
+    void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity){
+        System.out.println("ACCESSION: " + entity.getInsdcAccession());
+        for (Sequence sequence : entity.getSequences()){
+            System.out.println("REFSEQ: " + sequence.getRefseq() + " | " + "SEQUENCE_MD5: " + sequence.getSequenceMD5());
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
new file mode 100644
index 00000000..53ba6296
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
@@ -0,0 +1,44 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+@SpringBootTest
+class AssemblySequencesServiceTest {
+
+    @Autowired
+    private AssemblySequencesService assemblySequencesService;
+
+    @Autowired
+    private AssemblySequencesRepository assemblySequencesRepository;
+
+    @BeforeEach
+    void setUp() {
+    }
+
+    @AfterEach
+    void tearDown() {
+    }
+
+    @Test
+    void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmException {
+        String accession = "GCF_000001765.3";
+        assemblySequencesService.fetchAndInsertAssemblySequence(accession);
+        // Check the stored row itself: the Optional wrapper is never null and never equals a String
+        assertTrue(assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).isPresent());
+        assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get().getInsdcAccession());
+    }
+
+    @Test
+    void insertAssemblySequence() {
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
new file mode 100644
index 00000000..a2ea9f99
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
@@ -0,0 +1,18 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class GzipCompressTest {
+
+    @Test
+    void unzip() {
+        String archivePath = "/tmp/GCF_000001765.3_Dpse_3.0_genomic.fna.gz";
+        String targetDir = "/tmp";
+        GzipCompress gzip = new GzipCompress();
+
+        String unzippedPath = gzip.unzip(archivePath, targetDir).get().toString();
+        assertEquals("/tmp/genome_sequence.fna", unzippedPath);
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java
new file mode 100644
index 00000000..1676e77d
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java
@@ -0,0 +1,18 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class MD5DigestTest {
+
+    @Test
+    void hash() throws NoSuchAlgorithmException {
+        String input = "AAA";
+        String expectedDigest = "8880cd8c1fb402585779766f681b868b";
+        String actualDigest = new MD5Digest().hash(input);
+        assertEquals(expectedDigest, actualDigest);
+    }
+}
\ No newline at end of file

From 0ce600ef877926d03a8b17fd225c6daa08de3f2c Mon Sep 17 00:00:00 2001
From: waterflow80 <thessalonikaathena@outlook.com>
Date: Sat, 3 Jun 2023 18:42:19 +0100
Subject: [PATCH 4/6] local setup for dev

---
 .../authentication/SecurityConfiguration.java |   2 +-
 .../AssemblySequenceDataSource.java           |  12 --
 .../NCBIAssemblySequenceDataSource.java       | 105 ------------------
 .../dus2/AssemblySequenceReader.java          |  39 -------
 .../dus2/NCBIAssemblySequenceReader.java      |  27 -----
 .../NCBIAssemblySequenceReaderFactory.java    |  18 ---
 .../entities/AssemblySequenceEntity.java      |  23 ----
 .../repo/AssemblySequenceRepository.java      |  14 ---
 .../service/AssemblySequenceService.java      |  80 -------------
 src/main/resources/application.properties     |  19 ++--
 .../eva/contigalias/utils/MD5DigestTest.java  |  18 ---
 11 files changed, 11 insertions(+), 346 deletions(-)
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java
 delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java
 delete mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java

diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java
index 57f85825..073b13ff 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java
@@ -57,7 +57,7 @@ protected void configure(HttpSecurity http) throws Exception {
             .authorizeRequests()
             .antMatchers("/v1/assemblies/**").permitAll()
             .antMatchers("/v1/chromosomes/**").permitAll()
-            .antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN)
+            .antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN) // keep admin endpoints role-protected; the dev-only bypass must not be committed
             .and().httpBasic().realmName(REALM)
             .authenticationEntryPoint(customBasicAuthenticationEntryPoint)
             .and().sessionManagement().sessionCreationPolicy(SessionCreationPolicy.STATELESS);
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java
deleted file mode 100644
index 3a4d5b46..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java
+++ /dev/null
@@ -1,12 +0,0 @@
-package uk.ac.ebi.eva.contigalias.datasource;
-
-import java.io.IOException;
-import java.util.Optional;
-
-import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
-
-public interface AssemblySequenceDataSource {
-
-    Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException;
-
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java
deleted file mode 100644
index d76741cf..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java
+++ /dev/null
@@ -1,105 +0,0 @@
-package uk.ac.ebi.eva.contigalias.datasource;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Optional;
-
-import org.apache.commons.net.ftp.FTPFile;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.beans.factory.annotation.Autowired;
-import org.springframework.beans.factory.annotation.Value;
-import org.springframework.retry.annotation.Backoff;
-import org.springframework.retry.annotation.Retryable;
-import org.springframework.stereotype.Repository;
-import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReader;
-import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReaderFactory;
-import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
-import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
-import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
-import uk.ac.ebi.eva.contigalias.utils.GzipCompress;
-
-@Repository("NCBISequenceDataSource")
-public class NCBIAssemblySequenceDataSource implements AssemblySequenceDataSource{
-
-    private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequenceDataSource.class);
-
-    private final NCBIBrowserFactory factory;
-
-    private final NCBIAssemblySequenceReaderFactory readerFactory;
-
-    @Value("${asm.file.download.dir}")
-    private String asmFileDownloadDir;
-
-    @Autowired
-    public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory,
-                                          NCBIAssemblySequenceReaderFactory readerFactory){
-        this.factory = factory;
-        this.readerFactory = readerFactory;
-    }
-
-    @Override
-    public Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException, IllegalArgumentException {
-            NCBIBrowser ncbiBrowser = factory.build();
-            ncbiBrowser.connect();
-            GzipCompress gzipCompress = new GzipCompress();
-
-            Optional<Path> downloadFilePath = downloadAssemblySequence(accession, ncbiBrowser);
-            if (!downloadFilePath.isPresent()) {
-                return Optional.empty();
-            }
-            logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath);
-            // Uncompress the .gz file
-            Optional<Path> uncompressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir);
-            if (!uncompressedFilePath.isPresent()){
-                return Optional.empty();
-            }
-
-            AssemblySequenceEntity assemblySequenceEntity;
-            try (InputStream stream = new FileInputStream(uncompressedFilePath.get().toFile())){
-                NCBIAssemblySequenceReader reader = readerFactory.build(stream);
-                assemblySequenceEntity = reader.getAssemblySequenceEntity();
-                //TODO : The logger info will be canged when we add more attributes to the entity and we parse the whole file info
-                logger.info("NCBI: Name of the sequence in " + accession + " : " + assemblySequenceEntity.getName());
-            } finally {
-                try {
-                    ncbiBrowser.disconnect();
-                    //Files.deleteIfExists(downloadFilePath.get());
-                } catch (IOException e) {
-                    logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")");
-                }
-        }
-            return Optional.of(assemblySequenceEntity);
-    }
-
-
-    /**
-     * Download the assembly fna/fasta file given the accession and save it to /tmp
-     * After this method is called, the file will be downloaded, and the path to this file
-     * on your local computer will be returned*/
-    @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
-    public Optional<Path> downloadAssemblySequence(String accession, NCBIBrowser ncbiBrowser) throws IOException {
-        // The same directory as the report file
-        Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession);
-
-        if (!directory.isPresent()) {
-            return Optional.empty();
-        }
-
-        logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
-        FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
-        String ftpFilePath = directory.get() + ftpFile.getName();
-        Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
-        boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
-        if (success) {
-            logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
-            return Optional.of(downloadFilePath);
-        } else {
-            logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
-            return Optional.empty();
-        }
-    }
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java
deleted file mode 100644
index 0b107042..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java
+++ /dev/null
@@ -1,39 +0,0 @@
-package uk.ac.ebi.eva.contigalias.dus2;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-
-import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
-
-public abstract class AssemblySequenceReader {
-
-    protected final BufferedReader reader;
-
-    protected AssemblySequenceEntity assemblySequenceEntity;
-
-    protected boolean fileParsed = false;
-
-
-    public AssemblySequenceReader(InputStreamReader inputStreamReader){
-        this.reader = new BufferedReader(inputStreamReader);
-    }
-
-    public AssemblySequenceEntity getAssemblySequenceEntity() throws IOException {
-        if(!fileParsed || assemblySequenceEntity == null){
-            parseFile();
-        }
-        return assemblySequenceEntity;
-    }
-
-    protected abstract void parseFile() throws IOException, NullPointerException;
-
-
-    protected abstract void parseAssemblySequenceEntity(String line);
-
-
-
-    public boolean ready() throws IOException {
-        return reader.ready();
-    }
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java
deleted file mode 100644
index 12e01689..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package uk.ac.ebi.eva.contigalias.dus2;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-
-public class NCBIAssemblySequenceReader extends AssemblySequenceReader{
-
-    public NCBIAssemblySequenceReader(InputStreamReader inputStreamReader){
-        super(inputStreamReader);
-    }
-
-    @Override
-    protected void parseFile() throws IOException, NullPointerException {
-        if (reader == null){
-            throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader.");
-        }
-        // TODO: HERE WE'LL EXTARACT THE .gz FILE AND PARSE THE fna FILE
-    }
-
-    @Override
-    // Parsing a line of the file
-    protected void parseAssemblySequenceEntity(String line) {
-        // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY)
-        // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY
-        // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER
-    }
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java
deleted file mode 100644
index 06867aba..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java
+++ /dev/null
@@ -1,18 +0,0 @@
-package uk.ac.ebi.eva.contigalias.dus2;
-
-import java.io.InputStream;
-import java.io.InputStreamReader;
-
-import org.springframework.stereotype.Component;
-
-@Component
-public class NCBIAssemblySequenceReaderFactory {
-
-    public NCBIAssemblySequenceReader build(InputStream inputStream){
-        return new NCBIAssemblySequenceReader(new InputStreamReader(inputStream));
-    }
-
-    public NCBIAssemblySequenceReader build(InputStreamReader inputStreamReader){
-        return new NCBIAssemblySequenceReader(inputStreamReader);
-    }
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java
deleted file mode 100644
index c1a58894..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java
+++ /dev/null
@@ -1,23 +0,0 @@
-package uk.ac.ebi.eva.contigalias.entities;
-
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.Table;
-
-import lombok.Getter;
-import lombok.Setter;
-
-@Setter
-@Getter
-@Table(name = "AssemblySequence")
-@Entity
-public class AssemblySequenceEntity {
-
-    @Id
-    @Column(nullable = false)
-    private String accession;
-
-    @Column(nullable = false)
-    private String name;
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java
deleted file mode 100644
index 6eb6fa01..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package uk.ac.ebi.eva.contigalias.repo;
-
-import java.util.Optional;
-
-import org.springframework.data.jpa.repository.JpaRepository;
-import org.springframework.stereotype.Repository;
-import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
-
-@Repository
-public interface AssemblySequenceRepository extends JpaRepository<AssemblySequenceEntity, Long> {
-    Optional<AssemblySequenceEntity> findAssemblySequenceEntityByAccession(String accession);
-
-
-}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java
deleted file mode 100644
index 5dfd917a..00000000
--- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package uk.ac.ebi.eva.contigalias.service;
-
-import java.io.IOException;
-import java.util.Optional;
-
-import javax.transaction.Transactional;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.stereotype.Service;
-import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequenceDataSource;
-import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity;
-import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
-import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException;
-import uk.ac.ebi.eva.contigalias.repo.AssemblySequenceRepository;
-
-@Service
-public class AssemblySequenceService {
-
-    private final AssemblySequenceRepository repository;
-
-    private final NCBIAssemblySequenceDataSource ncbiSequenceDataSource;
-
-    private final Logger logger = LoggerFactory.getLogger(AssemblyService.class);
-
-
-    public AssemblySequenceService(
-            AssemblySequenceRepository repository, NCBIAssemblySequenceDataSource ncbiSequenceDataSource){
-        this.repository = repository;
-        this.ncbiSequenceDataSource = ncbiSequenceDataSource;
-    }
-
-    public void fetchAndInsertAssemblySequence(String accession) throws IOException {
-        Optional<AssemblySequenceEntity> entity = repository.findAssemblySequenceEntityByAccession(accession);
-        if(entity.isPresent())
-            throw duplicateAssemblySequenceInsertionException(accession, entity.get());
-        Optional<AssemblySequenceEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequenceByAccession(accession);
-        if(!fetchAssembly.isPresent()){
-            throw new AssemblySequenceNotFoundException(accession);
-        }
-        if (fetchAssembly.get().getName() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity
-            insertAssemblySequence(fetchAssembly.get());
-            logger.info("Successfully inserted assembly for accession " + accession);
-        }else {
-            logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession);
-        }
-    }
-
-    @Transactional
-    public void insertAssemblySequence(AssemblySequenceEntity entity) {
-        if (isEntityPresent(entity)) {
-            throw duplicateAssemblySequenceInsertionException(null, entity);
-        } else {
-            repository.save(entity);
-        }
-    }
-
-    private boolean isEntityPresent(AssemblySequenceEntity entity) {
-        // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY
-        Optional<AssemblySequenceEntity> existingAssembly = repository.findAssemblySequenceEntityByAccession(entity.getAccession());
-        return existingAssembly.isPresent();
-    }
-
-    private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequenceEntity present) {
-        StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists");
-        if (accession != null){
-            exception.append("\n");
-            exception.append("Assembly Sequence trying to insert:");
-            exception.append("\t");
-            exception.append(accession);
-        }
-        if (present != null){
-            exception.append("\n");
-            exception.append("Assembly Sequence already present");
-            exception.append("\t");
-            exception.append(present);
-        }
-        return new DuplicateAssemblySequenceException(exception.toString());
-    }
-}
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties
index 514ac4f3..efa59a98 100644
--- a/src/main/resources/application.properties
+++ b/src/main/resources/application.properties
@@ -14,8 +14,8 @@
 # limitations under the License.
 #
 
-controller.auth.admin.username=@contig-alias.admin-user@
-controller.auth.admin.password=@contig-alias.admin-password@
+controller.auth.admin.username=haroune
+controller.auth.admin.password=password
 
 management.endpoints.web.exposure.include=info,health
 management.endpoints.web.base-path=/
@@ -24,20 +24,21 @@ management.info.git.mode=full
 logging.level.uk.ac.ebi.eva.contigalias=DEBUG
 
 # Database configuration
-spring.datasource.url=@contig-alias.db-url@
-spring.datasource.username=@contig-alias.db-username@
-spring.datasource.password=@contig-alias.db-password@
-spring.jpa.hibernate.ddl-auto=@contig-alias.ddl-behaviour@
+spring.datasource.url=jdbc:postgresql://localhost:5432/contig_db
+spring.datasource.username=haroune
+spring.datasource.password=123
+spring.jpa.hibernate.ddl-auto=update
 spring.datasource.driver-class-name=org.postgresql.Driver
 spring.jpa.database-platform=org.hibernate.dialect.PostgreSQLDialect
 spring.jpa.generate-ddl=true
 
 server.servlet.context-path=/eva/webservices/contig-alias
+server.port=8081
 
-ftp.proxy.host=@ftp.proxy.host@
-ftp.proxy.port=@ftp.proxy.port@
+ftp.proxy.host=null
+ftp.proxy.port=0
 
-config.scaffolds.enabled = @contig-alias.scaffolds-enabled@
+config.scaffolds.enabled = true
 
 asm.file.download.dir=/tmp
 
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java
deleted file mode 100644
index 1676e77d..00000000
--- a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java
+++ /dev/null
@@ -1,18 +0,0 @@
-package uk.ac.ebi.eva.contigalias.utils;
-
-import java.security.NoSuchAlgorithmException;
-
-import org.junit.jupiter.api.Test;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-class MD5DigestTest {
-
-    @Test
-    void hash() throws NoSuchAlgorithmException {
-        MD5Digest md5Digest = new MD5Digest();
-        String toBeHashed = "AAA";
-        String MD5Digest = "8880cd8c1fb402585779766f681b868b";
-        assertEquals(MD5Digest,md5Digest.hash(toBeHashed));
-    }
-}
\ No newline at end of file

From 16a63f696dcaa87218a5670b2234d9775d30ef4f Mon Sep 17 00:00:00 2001
From: waterflow80 <thessalonikaathena@outlook.com>
Date: Mon, 5 Jun 2023 18:12:13 +0100
Subject: [PATCH 5/6] Adding the feature of retrieving the fasta file from the
 NCBI datasource and parsing the content.

Added the necessary code to retrieve the fasta file related to a specific
assembly.
The file is parsed and the related information is saved alongside the Contig-alias backend.
---
 .../eva/contigalias/repo/ChromosomeRepository.java  | 11 +++++++++++
 .../service/AssemblySequencesService.java           | 13 ++++++++++++-
 .../eva/contigalias/service/ChromosomeService.java  |  7 +++++++
 .../service/AssemblySequencesServiceTest.java       |  2 +-
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java
index 0b6f5bd7..920e488e 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java
@@ -19,11 +19,17 @@
 import org.springframework.data.domain.Page;
 import org.springframework.data.domain.Pageable;
 import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.Modifying;
+import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;
 import org.springframework.stereotype.Repository;
 
 import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
 import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
 
+import javax.transaction.Transactional;
+
+
 @Repository
 public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Long> {
 
@@ -63,6 +69,11 @@ Page<ChromosomeEntity> findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyR
 
     Page<ChromosomeEntity> findChromosomeEntitiesByUcscName(String ucscName, Pageable request);
 
+    @Transactional
+    @Modifying
+    @Query("UPDATE ChromosomeEntity c SET c.md5checksum = :md5Checksum WHERE c.refseq = :refseq")
+    int updateChromosomeEntityByRefseqSetMD5Checksum(@Param(value = "refseq") String refseq, @Param(value = "md5Checksum") String md5Checksum);
+
     long countChromosomeEntitiesByInsdcAccession(String insdcAccession);
 
     long countChromosomeEntitiesByRefseq(String refseq);
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
index 5da37b27..85b32b05 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
@@ -8,9 +8,11 @@
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Service;
 import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource;
 import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
 import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
 import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException;
 import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
@@ -18,6 +20,9 @@
 @Service
 public class AssemblySequencesService {
 
+    @Autowired
+    private ChromosomeService chromosomeService;
+
     private final AssemblySequencesRepository repository;
 
     private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource;
@@ -39,7 +44,7 @@ public void fetchAndInsertAssemblySequence(String accession) throws IOException,
         if(!fetchAssembly.isPresent()){
             throw new AssemblySequenceNotFoundException(accession);
         }
-        if (fetchAssembly.get().getInsdcAccession() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity
+        if (fetchAssembly.get().getInsdcAccession() != null){
             insertAssemblySequence(fetchAssembly.get());
             logger.info("Successfully inserted assembly for accession " + accession);
         }else {
@@ -52,6 +57,12 @@ public void insertAssemblySequence(AssemblySequencesEntity entity) {
         if (isEntityPresent(entity)) {
             throw duplicateAssemblySequenceInsertionException(null, entity);
         } else {
+            // Inserting the sequences' md5Checksum in the correct place in the chromosome table
+            for (Sequence s: entity.getSequences()){
+                chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getRefseq(), s.getSequenceMD5());
+                logger.info("Successfully updated chromosome table with md5Checksum: "+ s.getSequenceMD5() + "" +
+                                    " Where refseq = "+s.getRefseq());
+            }
             repository.save(entity);
         }
     }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java
index 93679963..365796a0 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java
@@ -199,6 +199,13 @@ public void deleteChromosome(ChromosomeEntity entity) {
         repository.delete(entity);
     }
 
+    /**
+     * Update the chromosome table; set the md5Checksum for the entry that has the given
+     * chromosome refseq*/
+    public int updateChromosomeEntityByRefseqSetMD5Checksum(String refseq, String md5Checksum){
+        return repository.updateChromosomeEntityByRefseqSetMD5Checksum(refseq, md5Checksum);
+    }
+
     public long countChromosomeEntitiesByInsdcAccession(String insdcAccession) {
         return repository.countChromosomeEntitiesByInsdcAccession(insdcAccession);
     }
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
index 53ba6296..70c8e146 100644
--- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
@@ -35,7 +35,7 @@ void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmExcepti
         String accession = "GCF_000001765.3";
         assemblySequencesService.fetchAndInsertAssemblySequence(accession);
         assertNotNull(assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession));
-        assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get());
+        assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get().getInsdcAccession());
     }
 
     @Test

From 43f50f43ecd65dd8d51255f71ca948c2bf963dae Mon Sep 17 00:00:00 2001
From: waterflow80 <thessalonikaathena@outlook.com>
Date: Wed, 7 Jun 2023 22:16:05 +0100
Subject: [PATCH 6/6] Completed the retrieval and the parsing of the fasta file

- Retrieve the assembly report and the fasta file of a given accession
- Parse the file and retrieve the assembly sequences
- Hash the sequences using the MD5 algorithm
- Construct the level 2 sequence collection object
- The fasta file retrieval function can also be used independently to save the sequences in a
separate table
---
 .../NCBIAssemblySequencesDataSource.java      |   6 +-
 .../dus2/NCBIAssemblySequencesReader.java     |  10 +-
 .../entities/AssemblySequencesEntity.java     |  22 ++-
 .../contigalias/entities/SeqColEntity.java    |  43 ++++++
 .../eva/contigalias/entities/Sequence.java    |  18 ++-
 .../repo/AssemblySequencesRepository.java     |   2 +-
 .../contigalias/repo/SequenceRepository.java  |   2 +
 .../service/AssemblySequencesService.java     |  31 +++--
 .../service/SequenceCollectionService.java    | 127 ++++++++++++++++++
 .../contigalias/utils/DigestGenerator.java    |   7 +
 .../utils/{MD5Digest.java => MD5Hash.java}    |   3 +-
 .../utils/SerializationService.java           |  21 +++
 12 files changed, 246 insertions(+), 46 deletions(-)
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java
 rename src/main/java/uk/ac/ebi/eva/contigalias/utils/{MD5Digest.java => MD5Hash.java} (90%)
 create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java

diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
index 211ab422..2c966ecd 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
@@ -44,6 +44,9 @@ public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory,
     }
 
     @Override
+    /**
+     * Return the assemblySequencesEntity which contains the list of sequences of the assembly
+     * with the given accession. The sequences are hashed using md5 algorithm*/
     public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException {
             NCBIBrowser ncbiBrowser = factory.build();
             ncbiBrowser.connect();
@@ -59,7 +62,6 @@ public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String
             if (!compressedFilePath.isPresent()){
                 return Optional.empty();
             }
-
             AssemblySequencesEntity assemblySequencesEntity;
             try (InputStream stream = new FileInputStream(compressedFilePath.get().toFile())){
                 NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession);
@@ -69,7 +71,7 @@ public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String
                 try {
                     ncbiBrowser.disconnect();
                     Files.deleteIfExists(downloadFilePath.get());
-                    Files.deleteIfExists(compressedFilePath.get()); // Deleting the fasta file
+                    Files.deleteIfExists(compressedFilePath.get()); // Deleting the fna.gz file
                 } catch (IOException e) {
                     logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")");
                 }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
index b979a8eb..a58290d2 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
@@ -8,7 +8,8 @@
 
 import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
 import uk.ac.ebi.eva.contigalias.entities.Sequence;
-import uk.ac.ebi.eva.contigalias.utils.MD5Digest;
+import uk.ac.ebi.eva.contigalias.utils.DigestGenerator;
+import uk.ac.ebi.eva.contigalias.utils.MD5Hash;
 
 public class NCBIAssemblySequencesReader extends AssemblySequencesReader {
 
@@ -21,19 +22,19 @@ protected void parseFile() throws IOException, NullPointerException, NoSuchAlgor
         if (reader == null){
             throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader.");
         }
-        MD5Digest md5Digest = new MD5Digest();
+        DigestGenerator md5Digest = new MD5Hash();
         if (assemblySequencesEntity == null){
             assemblySequencesEntity = new AssemblySequencesEntity();
         }
         // Setting the accession of the whole assembly file
-        assemblySequencesEntity.setInsdcAccession(accession);
+        assemblySequencesEntity.setAssemblyInsdcAccession(accession);
         List<Sequence> sequences = new LinkedList<>();
         String line = reader.readLine();
         while (line != null){
             if (line.startsWith(">")){
                 Sequence sequence = new Sequence();
                 String refSeq = line.substring(1, line.indexOf(' '));
-                sequence.setRefseq(refSeq);
+                sequence.setSequenceRefseq(refSeq);
                 line = reader.readLine();
                 StringBuilder sequenceValue = new StringBuilder();
                 while (line != null && !line.startsWith(">")){
@@ -47,6 +48,7 @@ protected void parseFile() throws IOException, NullPointerException, NoSuchAlgor
             }
         }
         assemblySequencesEntity.setSequences(sequences);
+        String digest0; // The level 0 digest of the object
         fileParsed = true;
         reader.close();
     }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
index 5875b48d..5966a04a 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
@@ -3,35 +3,33 @@
 
 import java.util.List;
 
-import javax.persistence.CascadeType;
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.JoinColumn;
-import javax.persistence.OneToMany;
-import javax.persistence.Table;
+import javax.persistence.*;
 
+import com.fasterxml.jackson.annotation.JsonInclude;
 import io.swagger.annotations.ApiModelProperty;
+import lombok.Data;
 import lombok.Getter;
 import lombok.Setter;
 import org.hibernate.annotations.LazyCollection;
 import org.hibernate.annotations.LazyCollectionOption;
 
-@Setter
-@Getter
-@Table(name = "AssemblySequences")
+@Data
+@Table(name = "assemblySequences")
 @Entity
 public class AssemblySequencesEntity {
 
     @Id
     @Column(nullable = false)
     @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.")
-    private String insdcAccession;
+    private String assemblyInsdcAccession;
 
 
     @ApiModelProperty(value = "List of all sequences of the assembly.")
+    @JsonInclude(JsonInclude.Include.NON_NULL)
     @LazyCollection(LazyCollectionOption.FALSE)
     @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL)
-    @JoinColumn(name = "insdcAccession", referencedColumnName = "insdcAccession")
+    //@OneToMany(mappedBy = "assemblySequences", cascade = CascadeType.ALL)
+    //@JoinColumn(name = "assembly_insdc_accession", referencedColumnName = "assemblyInsdcAccession")
+    @JoinColumn(name = "assembly_insdc_accession")
     private List<Sequence> sequences;
 }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java
new file mode 100644
index 00000000..e60e576c
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java
@@ -0,0 +1,43 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+import io.swagger.annotations.ApiModelProperty;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+import javax.persistence.EnumType;
+import javax.persistence.Enumerated;
+import java.util.List;
+
+@Data
+@NoArgsConstructor
+public class SeqColEntity {
+
+    @ApiModelProperty(value = "The level 0 digest of the object")
+    private String digest;
+
+    @ApiModelProperty(value = "The representation level of the the object")
+    @Enumerated(EnumType.ORDINAL)
+    private Level level;
+
+    @ApiModelProperty(value = "The naming convention used to construct this seqCol object")
+    @Enumerated(EnumType.STRING)
+    private NamingConvention namingConvention;
+
+    @ApiModelProperty(value = "The array of the sequences' lengths")
+    private List<Long> lengths;
+
+    @ApiModelProperty(value = "The array of the sequences' names")
+    private List<String> names;
+
+    @ApiModelProperty(value = "The array of the sequences")
+    private List<String> sequences;
+
+    public enum Level {
+        ZERO, ONE, TWO
+    }
+
+    public enum NamingConvention {
+        ENA, GENBANK, UCSC
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
index 728b5987..8ab22b48 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
@@ -1,13 +1,8 @@
 package uk.ac.ebi.eva.contigalias.entities;
 
-import javax.persistence.CascadeType;
-import javax.persistence.Column;
-import javax.persistence.Entity;
-import javax.persistence.Id;
-import javax.persistence.JoinColumn;
-import javax.persistence.ManyToOne;
-import javax.persistence.Table;
+import javax.persistence.*;
 
+import com.fasterxml.jackson.annotation.JsonInclude;
 import io.swagger.annotations.ApiModelProperty;
 import lombok.Getter;
 import lombok.Setter;
@@ -23,12 +18,15 @@ public class Sequence {
     @Id
     @Column(nullable = false)
     @ApiModelProperty(value = "Assembly's Refseq accession.")
-    private String refseq;
+    private String sequenceRefseq;
 
-    @Column
+    @Column(nullable = false)
     @ApiModelProperty(value = "Sequence's MD5 checksum value.")
     private String sequenceMD5;
 
-
+    /*@JsonInclude(JsonInclude.Include.NON_NULL)
+    @ManyToOne
+    @JoinColumn(name = "assembly_insdc_accession", nullable = false)
+    private AssemblySequencesEntity assemblySequences;*/
 
 }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
index 0992b3c3..ce1b9321 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
@@ -8,7 +8,7 @@
 
 @Repository
 public interface AssemblySequencesRepository extends JpaRepository<AssemblySequencesEntity, String> {
-    Optional<AssemblySequencesEntity> findAssemblySequenceEntityByInsdcAccession(String accession);
+    Optional<AssemblySequencesEntity> findAssemblySequenceEntityByAssemblyInsdcAccession(String accession);
 
 
 }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
index ba9164b0..c9415632 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
@@ -1,7 +1,9 @@
 package uk.ac.ebi.eva.contigalias.repo;
 
 import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.stereotype.Repository;
 import uk.ac.ebi.eva.contigalias.entities.Sequence;
 
+@Repository
 public interface SequenceRepository extends JpaRepository<Sequence, String> {
 }
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
index 85b32b05..0886e345 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
@@ -25,51 +25,50 @@ public class AssemblySequencesService {
 
     private final AssemblySequencesRepository repository;
 
-    private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource;
+    private final NCBIAssemblySequencesDataSource ncbiSequencesDataSource;
 
-    private final Logger logger = LoggerFactory.getLogger(AssemblyService.class);
+    private final Logger logger = LoggerFactory.getLogger(AssemblySequencesService.class);
 
 
     public AssemblySequencesService(
-            AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequenceDataSource){
+            AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequencesDataSource){
         this.repository = repository;
-        this.ncbiSequenceDataSource = ncbiSequenceDataSource;
+        this.ncbiSequencesDataSource = ncbiSequencesDataSource;
     }
 
     public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException {
-        Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByInsdcAccession(accession);
+        Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(accession);
         if(entity.isPresent())
             throw duplicateAssemblySequenceInsertionException(accession, entity.get());
-        Optional<AssemblySequencesEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequencesByAccession(accession);
-        if(!fetchAssembly.isPresent()){
+        Optional<AssemblySequencesEntity> fetchAssemblySequences = ncbiSequencesDataSource.getAssemblySequencesByAccession(accession);
+        if(!fetchAssemblySequences.isPresent()){
             throw new AssemblySequenceNotFoundException(accession);
         }
-        if (fetchAssembly.get().getInsdcAccession() != null){
-            insertAssemblySequence(fetchAssembly.get());
-            logger.info("Successfully inserted assembly for accession " + accession);
+        if (fetchAssemblySequences.get().getAssemblyInsdcAccession() != null){
+            insertAssemblySequences(fetchAssemblySequences.get());
+            logger.info("Successfully inserted assembly sequences for accession: " + accession);
         }else {
-            logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession);
+            logger.error("Skipping inserting assembly sequences : No name in assembly: " + accession);
         }
     }
 
     @Transactional
-    public void insertAssemblySequence(AssemblySequencesEntity entity) {
+    public void insertAssemblySequences(AssemblySequencesEntity entity) {
         if (isEntityPresent(entity)) {
             throw duplicateAssemblySequenceInsertionException(null, entity);
         } else {
             // Inserting the sequences' md5Checksum in the correct place in the chromosome table
             for (Sequence s: entity.getSequences()){
-                chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getRefseq(), s.getSequenceMD5());
-                logger.info("Successfully updated chromosome table with md5Checksum: "+ s.getSequenceMD5() + "" +
-                                    " Where refseq = "+s.getRefseq());
+                chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getSequenceRefseq(), s.getSequenceMD5());
             }
+            System.out.println("Assembly_insdc_accession: " + entity.getAssemblyInsdcAccession());
             repository.save(entity);
         }
     }
 
     private boolean isEntityPresent(AssemblySequencesEntity entity) {
         // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY
-        Optional<AssemblySequencesEntity> existingAssembly = repository.findAssemblySequenceEntityByInsdcAccession(entity.getInsdcAccession());
+        Optional<AssemblySequencesEntity> existingAssembly = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(entity.getAssemblyInsdcAccession());
         return existingAssembly.isPresent();
     }
 
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java
new file mode 100644
index 00000000..55f44c57
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java
@@ -0,0 +1,127 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource;
+import uk.ac.ebi.eva.contigalias.entities.*;
+import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException;
+import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
+import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.*;
+
+@Service
+public class SequenceCollectionService {
+
+    @Autowired
+    private AssemblyService assemblyService;
+
+    @Autowired
+    private AssemblySequencesService assemblySequencesService;
+
+    private final AssemblyRepository assemblyRepository;
+
+    private final AssemblySequencesRepository assemblySequencesRepository;
+
+    private final NCBIAssemblyDataSource assemblyDataSource;
+
+    private final NCBIAssemblySequencesDataSource assemblySequencesDataSource;
+
+    private final Logger logger = LoggerFactory.getLogger(SequenceCollectionService.class);
+
+    public SequenceCollectionService(AssemblyRepository assemblyRepository, AssemblySequencesRepository assemblySequencesRepository,
+                                     NCBIAssemblyDataSource assemblyDataSource, NCBIAssemblySequencesDataSource assemblySequencesDataSource) {
+        this.assemblyRepository = assemblyRepository;
+        this.assemblySequencesRepository = assemblySequencesRepository;
+        this.assemblyDataSource = assemblyDataSource;
+        this.assemblySequencesDataSource = assemblySequencesDataSource;
+    }
+
+    /**
+     * Fetch the assembly report and the assembly's actual sequences for the given accession
+     * and insert both into the database.
+     * The given naming convention is used when constructing the SeqCol object. */
+    public void fetchAndInsertSequenceCollection(String accession, SeqColEntity.NamingConvention namingConvention)
+            throws IOException, NoSuchAlgorithmException {
+        // TODO: Check if the needed seqCol data does not exist in the database
+        // TODO: If not, call the appropriate service(s) to fetch it
+
+        Optional<AssemblyEntity> fetchAssembly = assemblyDataSource.getAssemblyByAccession(accession);
+        if (!fetchAssembly.isPresent()){
+            throw new AssemblyNotFoundException(accession);
+        }
+        assemblyService.insertAssembly(fetchAssembly.get());
+        Optional<AssemblySequencesEntity> fetchAssemblySequences = assemblySequencesDataSource
+                .getAssemblySequencesByAccession(accession);
+        if (!fetchAssemblySequences.isPresent()){
+            throw new AssemblySequenceNotFoundException(accession);
+        }
+        assemblySequencesService.insertAssemblySequences(fetchAssemblySequences.get());
+
+        SeqColEntity seqColLevel2 = constructSequenceCollectionObjectL2(fetchAssembly.get(), fetchAssemblySequences.get(),
+                namingConvention);
+
+    }
+
+    /**
+     * Return a level 2 entity of the sequence collection following the given naming convention.
+     * */
+    public SeqColEntity constructSequenceCollectionObjectL2(AssemblyEntity assemblyEntity,
+                                                             AssemblySequencesEntity assemblySequencesEntity,
+                                                             SeqColEntity.NamingConvention namingConvention) {
+
+        List<ChromosomeEntity> chromosomeList = assemblyEntity.getChromosomes();
+        List<Sequence> sequenceList = assemblySequencesEntity.getSequences();
+        assert chromosomeList.size() == sequenceList.size();
+
+        Comparator<ChromosomeEntity> chromosomeComparator = (chromosomeEntity, t1) ->
+                chromosomeEntity.getRefseq().compareTo(t1.getRefseq());
+        Comparator<Sequence> sequenceComparator = (sequence, t1) -> sequence.getSequenceRefseq().compareTo(t1.getSequenceRefseq());
+
+        Collections.sort(chromosomeList, chromosomeComparator);
+        Collections.sort(sequenceList, sequenceComparator);
+
+        SeqColEntity seqColL2 = new SeqColEntity();
+
+
+        List<String> sequences = new LinkedList<>();
+        List<String> names = new LinkedList<>();
+        List<Long> lengths = new LinkedList<>();
+
+        switch (namingConvention) {
+            case ENA:
+                for (int i=0; i<sequenceList.size(); i++){
+                    sequences.add(sequenceList.get(i).getSequenceMD5());
+                    names.add(chromosomeList.get(i).getEnaSequenceName());
+                    lengths.add(chromosomeList.get(i).getSeqLength());
+                }
+            break;
+            case GENBANK:
+                for (int i=0; i<sequenceList.size(); i++){
+                    sequences.add(sequenceList.get(i).getSequenceMD5());
+                    names.add(chromosomeList.get(i).getGenbankSequenceName());
+                    lengths.add(chromosomeList.get(i).getSeqLength());
+                }
+            break;
+            case UCSC:
+                for (int i=0; i<sequenceList.size(); i++){
+                    sequences.add(sequenceList.get(i).getSequenceMD5());
+                    names.add(chromosomeList.get(i).getUcscName());
+                    lengths.add(chromosomeList.get(i).getSeqLength());
+                }
+        }
+        seqColL2.setSequences(sequences);
+        seqColL2.setLengths(lengths);
+        seqColL2.setNames(names);
+        seqColL2.setLevel(SeqColEntity.Level.TWO);
+        seqColL2.setNamingConvention(namingConvention);
+
+        return seqColL2;
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java
new file mode 100644
index 00000000..9279becb
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java
@@ -0,0 +1,7 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import java.security.NoSuchAlgorithmException;
+
+public abstract class DigestGenerator {
+    public abstract String hash(String text) throws NoSuchAlgorithmException;
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java
similarity index 90%
rename from src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java
rename to src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java
index 72a3c0ee..633e215f 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java
@@ -5,10 +5,11 @@
 
 import javax.xml.bind.DatatypeConverter;
 
-public class MD5Digest {
+public class MD5Hash extends DigestGenerator{
 
     /**
      * Return the digest of the text using the MD5 algorithm*/
+    @Override
     public String hash(String text) throws NoSuchAlgorithmException {
         MessageDigest md = MessageDigest.getInstance("MD5");
         md.update(text.getBytes());
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java
new file mode 100644
index 00000000..97464b01
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java
@@ -0,0 +1,21 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import org.webpki.jcs.JsonCanonicalizer;
+
+import java.io.IOException;
+import java.util.Optional;
+
+public class SerializationService {
+
+    /**
+     * Return the canonical serialization of the input jsonString as defined by
+     * RFC 8785 (JSON Canonicalization Scheme), using the implementation provided by
+     * cyberphone/json-canonicalization (see GitHub).
+     * The jsonString must respect some strict format rules; for example,
+     * it should be delimited with '{ }', etc. */
+    public Optional<String> serialize(String jsonString) throws IOException {
+        JsonCanonicalizer jsonCanonicalizer = new JsonCanonicalizer(jsonString);
+        String result = jsonCanonicalizer.getEncodedString();
+        return Optional.of(result);
+    }
+}