From 840687a8b8414612b993f319c5b64dcc7bc87159 Mon Sep 17 00:00:00 2001 From: waterflow80 <thessalonikaathena@outlook.com> Date: Tue, 30 May 2023 14:04:11 +0100 Subject: [PATCH 1/6] retrieve fna file function --- .../AssemblySequenceDataSource.java | 12 ++ .../NCBIAssemblySequenceDataSource.java | 105 ++++++++++++++++++ .../ebi/eva/contigalias/dus/NCBIBrowser.java | 11 ++ .../dus2/AssemblySequenceReader.java | 39 +++++++ .../dus2/NCBIAssemblySequenceReader.java | 27 +++++ .../NCBIAssemblySequenceReaderFactory.java | 18 +++ .../entities/AssemblySequenceEntity.java | 23 ++++ .../AssemblySequenceNotFoundException.java | 7 ++ .../DuplicateAssemblySequenceException.java | 8 ++ .../repo/AssemblySequenceRepository.java | 14 +++ .../service/AssemblySequenceService.java | 80 +++++++++++++ .../eva/contigalias/utils/GzipCompress.java | 52 +++++++++ 12 files changed, 396 insertions(+) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java diff --git 
a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java new file mode 100644 index 00000000..3a4d5b46 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java @@ -0,0 +1,12 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.util.Optional; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; + +public interface AssemblySequenceDataSource { + + Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException; + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java new file mode 100644 index 00000000..d76741cf --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java @@ -0,0 +1,105 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Optional; + +import org.apache.commons.net.ftp.FTPFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReader; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReaderFactory; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; +import 
uk.ac.ebi.eva.contigalias.utils.GzipCompress; + +@Repository("NCBISequenceDataSource") +public class NCBIAssemblySequenceDataSource implements AssemblySequenceDataSource{ + + private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequenceDataSource.class); + + private final NCBIBrowserFactory factory; + + private final NCBIAssemblySequenceReaderFactory readerFactory; + + @Value("${asm.file.download.dir}") + private String asmFileDownloadDir; + + @Autowired + public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory, + NCBIAssemblySequenceReaderFactory readerFactory){ + this.factory = factory; + this.readerFactory = readerFactory; + } + + @Override + public Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException, IllegalArgumentException { + NCBIBrowser ncbiBrowser = factory.build(); + ncbiBrowser.connect(); + GzipCompress gzipCompress = new GzipCompress(); + + Optional<Path> downloadFilePath = downloadAssemblySequence(accession, ncbiBrowser); + if (!downloadFilePath.isPresent()) { + return Optional.empty(); + } + logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath); + // Uncompress the .gz file + Optional<Path> uncompressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir); + if (!uncompressedFilePath.isPresent()){ + return Optional.empty(); + } + + AssemblySequenceEntity assemblySequenceEntity; + try (InputStream stream = new FileInputStream(uncompressedFilePath.get().toFile())){ + NCBIAssemblySequenceReader reader = readerFactory.build(stream); + assemblySequenceEntity = reader.getAssemblySequenceEntity(); + //TODO : The logger info will be canged when we add more attributes to the entity and we parse the whole file info + logger.info("NCBI: Name of the sequence in " + accession + " : " + assemblySequenceEntity.getName()); + } finally { + try { + ncbiBrowser.disconnect(); + //Files.deleteIfExists(downloadFilePath.get()); + } 
catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")"); + } + } + return Optional.of(assemblySequenceEntity); + } + + + /** + * Download the assembly fna/fasta file given the accession and save it to /tmp + * After this method is called, the file will be downloaded, and the path to this file + * on your local computer will be returned*/ + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) + public Optional<Path> downloadAssemblySequence(String accession, NCBIBrowser ncbiBrowser) throws IOException { + // The same directory as the report file + Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession); + + if (!directory.isPresent()) { + return Optional.empty(); + } + + logger.info("NCBI directory for assembly genomic.fna download: " + directory.get()); + FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get()); + String ftpFilePath = directory.get() + ftpFile.getName(); + Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); + boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); + if (success) { + logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")"); + return Optional.of(downloadFilePath); + } else { + logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")"); + return Optional.empty(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java index 30ea4f73..fcb1f8e7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java @@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient { public static final String PATH_GENOMES_ALL = "/genomes/all/"; + private String ftpProxyHost; 
private Integer ftpProxyPort; @@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String directoryPath) throws IOExceptio return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath)); } + /** + * Return the fna/fasta file that will be downloaded (a pointer to that FtpFile)*/ + public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException { + Stream<FTPFile> ftpFileStream = Arrays.stream(super.listFiles(directoryPath)); + Stream<FTPFile> assemblyReportFilteredStream = ftpFileStream.filter(f -> f.getName().contains("genomic.fna.gz") && !f.getName().contains("from")); + Optional<FTPFile> assemblyReport = assemblyReportFilteredStream.findFirst(); + + return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath)); + } + } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java new file mode 100644 index 00000000..0b107042 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java @@ -0,0 +1,39 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; + +public abstract class AssemblySequenceReader { + + protected final BufferedReader reader; + + protected AssemblySequenceEntity assemblySequenceEntity; + + protected boolean fileParsed = false; + + + public AssemblySequenceReader(InputStreamReader inputStreamReader){ + this.reader = new BufferedReader(inputStreamReader); + } + + public AssemblySequenceEntity getAssemblySequenceEntity() throws IOException { + if(!fileParsed || assemblySequenceEntity == null){ + parseFile(); + } + return assemblySequenceEntity; + } + + protected 
abstract void parseFile() throws IOException, NullPointerException; + + + protected abstract void parseAssemblySequenceEntity(String line); + + + + public boolean ready() throws IOException { + return reader.ready(); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java new file mode 100644 index 00000000..12e01689 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java @@ -0,0 +1,27 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.IOException; +import java.io.InputStreamReader; + +public class NCBIAssemblySequenceReader extends AssemblySequenceReader{ + + public NCBIAssemblySequenceReader(InputStreamReader inputStreamReader){ + super(inputStreamReader); + } + + @Override + protected void parseFile() throws IOException, NullPointerException { + if (reader == null){ + throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); + } + // TODO: HERE WE'LL EXTARACT THE .gz FILE AND PARSE THE fna FILE + } + + @Override + // Parsing a line of the file + protected void parseAssemblySequenceEntity(String line) { + // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY) + // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY + // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java new file mode 100644 index 00000000..06867aba --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.springframework.stereotype.Component; + +@Component 
+public class NCBIAssemblySequenceReaderFactory { + + public NCBIAssemblySequenceReader build(InputStream inputStream){ + return new NCBIAssemblySequenceReader(new InputStreamReader(inputStream)); + } + + public NCBIAssemblySequenceReader build(InputStreamReader inputStreamReader){ + return new NCBIAssemblySequenceReader(inputStreamReader); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java new file mode 100644 index 00000000..c1a58894 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java @@ -0,0 +1,23 @@ +package uk.ac.ebi.eva.contigalias.entities; + +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.Id; +import javax.persistence.Table; + +import lombok.Getter; +import lombok.Setter; + +@Setter +@Getter +@Table(name = "AssemblySequence") +@Entity +public class AssemblySequenceEntity { + + @Id + @Column(nullable = false) + private String accession; + + @Column(nullable = false) + private String name; +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java new file mode 100644 index 00000000..03deecb9 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class AssemblySequenceNotFoundException extends RuntimeException{ + public AssemblySequenceNotFoundException(String accession) { + super("No assembly sequence corresponding to accession " + accession + " could be found"); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java new file mode 100644 index 
00000000..f382e62f --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java @@ -0,0 +1,8 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class DuplicateAssemblySequenceException extends RuntimeException{ + + public DuplicateAssemblySequenceException(String msg){ + super(msg); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java new file mode 100644 index 00000000..6eb6fa01 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java @@ -0,0 +1,14 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import java.util.Optional; + +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; + +@Repository +public interface AssemblySequenceRepository extends JpaRepository<AssemblySequenceEntity, Long> { + Optional<AssemblySequenceEntity> findAssemblySequenceEntityByAccession(String accession); + + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java new file mode 100644 index 00000000..5dfd917a --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java @@ -0,0 +1,80 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.util.Optional; + +import javax.transaction.Transactional; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequenceDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import 
uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequenceRepository; + +@Service +public class AssemblySequenceService { + + private final AssemblySequenceRepository repository; + + private final NCBIAssemblySequenceDataSource ncbiSequenceDataSource; + + private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); + + + public AssemblySequenceService( + AssemblySequenceRepository repository, NCBIAssemblySequenceDataSource ncbiSequenceDataSource){ + this.repository = repository; + this.ncbiSequenceDataSource = ncbiSequenceDataSource; + } + + public void fetchAndInsertAssemblySequence(String accession) throws IOException { + Optional<AssemblySequenceEntity> entity = repository.findAssemblySequenceEntityByAccession(accession); + if(entity.isPresent()) + throw duplicateAssemblySequenceInsertionException(accession, entity.get()); + Optional<AssemblySequenceEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequenceByAccession(accession); + if(!fetchAssembly.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + if (fetchAssembly.get().getName() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity + insertAssemblySequence(fetchAssembly.get()); + logger.info("Successfully inserted assembly for accession " + accession); + }else { + logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession); + } + } + + @Transactional + public void insertAssemblySequence(AssemblySequenceEntity entity) { + if (isEntityPresent(entity)) { + throw duplicateAssemblySequenceInsertionException(null, entity); + } else { + repository.save(entity); + } + } + + private boolean isEntityPresent(AssemblySequenceEntity entity) { + // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY + Optional<AssemblySequenceEntity> existingAssembly = 
repository.findAssemblySequenceEntityByAccession(entity.getAccession()); + return existingAssembly.isPresent(); + } + + private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequenceEntity present) { + StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists"); + if (accession != null){ + exception.append("\n"); + exception.append("Assembly Sequence trying to insert:"); + exception.append("\t"); + exception.append(accession); + } + if (present != null){ + exception.append("\n"); + exception.append("Assembly Sequence already present"); + exception.append("\t"); + exception.append(present); + } + return new DuplicateAssemblySequenceException(exception.toString()); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java new file mode 100644 index 00000000..455c0582 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java @@ -0,0 +1,52 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Optional; +import java.util.zip.GZIPInputStream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class GzipCompress { + + private final Logger logger = LoggerFactory.getLogger(GzipCompress.class); + + /** + * Decompress (Unzip) a .gz file and save the output file in the same + * input file's location. 
+ * The output file's name will be the same as the input's but without '.gz' + * @return The output (decompressed) file path*/ + public Optional<Path> unzip(String compressedFilePath, String outputDirPath) { + String outputFileName = "genome_sequence.fna"; + String decompressedFilePath = outputDirPath + "/" + outputFileName; + + byte[] buffer = new byte[1024]; + + try { + FileInputStream fileIn = new FileInputStream(compressedFilePath); + GZIPInputStream gzipInputStream = new GZIPInputStream(fileIn); + FileOutputStream fileOutputStream = new FileOutputStream(decompressedFilePath); + + int bytes_read; + + while ((bytes_read = gzipInputStream.read(buffer)) > 0) { + fileOutputStream.write(buffer, 0, bytes_read); + } + gzipInputStream.close(); + fileOutputStream.close(); + logger.info("File " + compressedFilePath + " was decompressed successfully"); + Path outputFilePath = Paths.get(outputDirPath, outputFileName); + return Optional.of(outputFilePath); + } catch ( + IOException e) { + logger.error("Could not find or read file !!"); + return Optional.empty(); + } + + } +} From 724b91a9dc4e72ab6ed67470f3b218afea5bf42f Mon Sep 17 00:00:00 2001 From: waterflow80 <thessalonikaathena@outlook.com> Date: Tue, 30 May 2023 14:12:29 +0100 Subject: [PATCH 2/6] adding pom.xml --- pom.xml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pom.xml b/pom.xml index f63b0907..689a2619 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,8 @@ <java.version>8</java.version> </properties> + + <dependencies> <dependency> <groupId>org.springframework.boot</groupId> @@ -147,6 +149,13 @@ <version>1.2.5.RELEASE</version> </dependency> + <dependency> + <groupId>org.projectlombok</groupId> + <artifactId>lombok</artifactId> + <version>1.18.28</version> + <scope>provided</scope> + </dependency> + </dependencies> <build> From acfad8a3a3b82ca004a37151c81baad50063df02 Mon Sep 17 00:00:00 2001 From: waterflow80 <thessalonikaathena@outlook.com> Date: Thu, 1 Jun 2023 12:04:41 +0100 Subject: [PATCH 3/6] 
assembly-sequences-fasta --- .../AssemblySequencesDataSource.java | 13 +++ .../NCBIAssemblySequencesDataSource.java | 107 ++++++++++++++++++ .../dus2/AssemblySequencesReader.java | 44 +++++++ .../dus2/NCBIAssemblySequencesReader.java | 61 ++++++++++ .../NCBIAssemblySequencesReaderFactory.java | 18 +++ .../entities/AssemblySequencesEntity.java | 37 ++++++ .../eva/contigalias/entities/Sequence.java | 34 ++++++ .../repo/AssemblySequencesRepository.java | 14 +++ .../contigalias/repo/SequenceRepository.java | 7 ++ .../service/AssemblySequencesService.java | 81 +++++++++++++ .../eva/contigalias/utils/GzipCompress.java | 2 +- .../ebi/eva/contigalias/utils/MD5Digest.java | 20 ++++ .../NCBIAssemblySequencesDataSourceTest.java | 55 +++++++++ .../dus2/NCBIAssemblySequencesReaderTest.java | 67 +++++++++++ .../service/AssemblySequencesServiceTest.java | 44 +++++++ .../contigalias/utils/GzipCompressTest.java | 18 +++ .../eva/contigalias/utils/MD5DigestTest.java | 18 +++ 17 files changed, 639 insertions(+), 1 deletion(-) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java create mode 
100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java create mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java new file mode 100644 index 00000000..f3a12e03 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java @@ -0,0 +1,13 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +public interface AssemblySequencesDataSource { + + Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException; + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java new file mode 100644 index 00000000..211ab422 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java @@ -0,0 +1,107 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import 
org.apache.commons.net.ftp.FTPFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.utils.GzipCompress; + +@Repository("NCBISequenceDataSource") +public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource { + + private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class); + + private final NCBIBrowserFactory factory; + + private final NCBIAssemblySequencesReaderFactory readerFactory; + + @Value("${asm.file.download.dir}") + private String asmFileDownloadDir; + + @Autowired + public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory, + NCBIAssemblySequencesReaderFactory readerFactory){ + this.factory = factory; + this.readerFactory = readerFactory; + } + + @Override + public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException { + NCBIBrowser ncbiBrowser = factory.build(); + ncbiBrowser.connect(); + GzipCompress gzipCompress = new GzipCompress(); + + Optional<Path> downloadFilePath = downloadAssemblySequences(accession, ncbiBrowser); + if (!downloadFilePath.isPresent()) { + return Optional.empty(); + } + logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath); + // Uncompress the .gz file + Optional<Path> compressedFilePath = 
gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir); + if (!compressedFilePath.isPresent()){ + return Optional.empty(); + } + + AssemblySequencesEntity assemblySequencesEntity; + try (InputStream stream = new FileInputStream(compressedFilePath.get().toFile())){ + NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession); + assemblySequencesEntity = reader.getAssemblySequenceEntity(); + logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" ); + } finally { + try { + ncbiBrowser.disconnect(); + Files.deleteIfExists(downloadFilePath.get()); + Files.deleteIfExists(compressedFilePath.get()); // Deleting the fasta file + } catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")"); + } + } + return Optional.of(assemblySequencesEntity); + } + + + /** + * Download the assembly fna/fasta file given the accession and save it to /tmp + * After this method is called, the file will be downloaded, and the path to this file + * on your local computer will be returned*/ + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) + public Optional<Path> downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException { + // The same directory as the report file + Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession); + + if (!directory.isPresent()) { + return Optional.empty(); + } + + logger.info("NCBI directory for assembly genomic.fna download: " + directory.get()); + FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get()); + String ftpFilePath = directory.get() + ftpFile.getName(); + Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); + boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); + if (success) { + logger.info("NCBI assembly genomic.fna 
downloaded successfully (" + ftpFile.getName() + ")"); + return Optional.of(downloadFilePath); + } else { + logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")"); + return Optional.empty(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java new file mode 100644 index 00000000..c7a974bb --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java @@ -0,0 +1,44 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +public abstract class AssemblySequencesReader { + + protected final BufferedReader reader; + + protected final String accession; + + protected AssemblySequencesEntity assemblySequencesEntity; + + + protected boolean fileParsed = false; + + + public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession){ + this.reader = new BufferedReader(inputStreamReader); + this.accession = accession; + } + + public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException { + if(!fileParsed || assemblySequencesEntity == null){ + parseFile(); + } + return assemblySequencesEntity; + } + + protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException; + + + protected abstract void parseAssemblySequenceEntity(String line); + + + + public boolean ready() throws IOException { + return reader.ready(); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java new file mode 100644 index 00000000..b979a8eb --- /dev/null +++ 
b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java @@ -0,0 +1,61 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; +import java.util.LinkedList; +import java.util.List; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; +import uk.ac.ebi.eva.contigalias.utils.MD5Digest; + +public class NCBIAssemblySequencesReader extends AssemblySequencesReader { + + public NCBIAssemblySequencesReader(InputStreamReader inputStreamReader, String accession){ + super(inputStreamReader, accession); + } + + @Override + protected void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException { + if (reader == null){ + throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); + } + MD5Digest md5Digest = new MD5Digest(); + if (assemblySequencesEntity == null){ + assemblySequencesEntity = new AssemblySequencesEntity(); + } + // Setting the accession of the whole assembly file + assemblySequencesEntity.setInsdcAccession(accession); + List<Sequence> sequences = new LinkedList<>(); + String line = reader.readLine(); + while (line != null){ + if (line.startsWith(">")){ + Sequence sequence = new Sequence(); + String refSeq = line.substring(1, line.indexOf(' ')); + sequence.setRefseq(refSeq); + line = reader.readLine(); + StringBuilder sequenceValue = new StringBuilder(); + while (line != null && !line.startsWith(">")){ + // Looking for the sequence lines for this refseq + sequenceValue.append(line); + line = reader.readLine(); + } + String md5checksum = md5Digest.hash(sequenceValue.toString()); + sequence.setSequenceMD5(md5checksum); + sequences.add(sequence); + } + } + assemblySequencesEntity.setSequences(sequences); + fileParsed = true; + reader.close(); + } + + @Override + // Parsing a line of the file + protected 
void parseAssemblySequenceEntity(String line) { + // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY) + // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY + // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java new file mode 100644 index 00000000..a727bea1 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.springframework.stereotype.Component; + +@Component +public class NCBIAssemblySequencesReaderFactory { + + public NCBIAssemblySequencesReader build(InputStream inputStream, String accession){ + return new NCBIAssemblySequencesReader(new InputStreamReader(inputStream), accession); + } + + public NCBIAssemblySequencesReader build(InputStreamReader inputStreamReader, String accession){ + return new NCBIAssemblySequencesReader(inputStreamReader, accession); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java new file mode 100644 index 00000000..5875b48d --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java @@ -0,0 +1,37 @@ +package uk.ac.ebi.eva.contigalias.entities; + + +import java.util.List; + +import javax.persistence.CascadeType; +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.Id; +import javax.persistence.JoinColumn; +import javax.persistence.OneToMany; +import javax.persistence.Table; + +import io.swagger.annotations.ApiModelProperty; +import lombok.Getter; +import lombok.Setter; +import 
org.hibernate.annotations.LazyCollection; +import org.hibernate.annotations.LazyCollectionOption; + +@Setter +@Getter +@Table(name = "AssemblySequences") +@Entity +public class AssemblySequencesEntity { + + @Id + @Column(nullable = false) + @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.") + private String insdcAccession; + + + @ApiModelProperty(value = "List of all sequences of the assembly.") + @LazyCollection(LazyCollectionOption.FALSE) + @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL) + @JoinColumn(name = "insdcAccession", referencedColumnName = "insdcAccession") + private List<Sequence> sequences; +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java new file mode 100644 index 00000000..728b5987 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java @@ -0,0 +1,34 @@ +package uk.ac.ebi.eva.contigalias.entities; + +import javax.persistence.CascadeType; +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.Id; +import javax.persistence.JoinColumn; +import javax.persistence.ManyToOne; +import javax.persistence.Table; + +import io.swagger.annotations.ApiModelProperty; +import lombok.Getter; +import lombok.Setter; + + +@Getter +@Setter +@Entity +@Table(name = "Sequence") +public class Sequence { + + + @Id + @Column(nullable = false) + @ApiModelProperty(value = "Assembly's Refseq accession.") + private String refseq; + + @Column + @ApiModelProperty(value = "Sequence's MD5 checksum value.") + private String sequenceMD5; + + + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java new file mode 100644 index 00000000..0992b3c3 --- /dev/null +++ 
b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java @@ -0,0 +1,14 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import java.util.Optional; + +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +@Repository +public interface AssemblySequencesRepository extends JpaRepository<AssemblySequencesEntity, String> { + Optional<AssemblySequencesEntity> findAssemblySequenceEntityByInsdcAccession(String accession); + + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java new file mode 100644 index 00000000..ba9164b0 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import org.springframework.data.jpa.repository.JpaRepository; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +public interface SequenceRepository extends JpaRepository<Sequence, String> { +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java new file mode 100644 index 00000000..5da37b27 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java @@ -0,0 +1,81 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import javax.transaction.Transactional; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import 
uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +@Service +public class AssemblySequencesService { + + private final AssemblySequencesRepository repository; + + private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource; + + private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); + + + public AssemblySequencesService( + AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequenceDataSource){ + this.repository = repository; + this.ncbiSequenceDataSource = ncbiSequenceDataSource; + } + + public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException { + Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByInsdcAccession(accession); + if(entity.isPresent()) + throw duplicateAssemblySequenceInsertionException(accession, entity.get()); + Optional<AssemblySequencesEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequencesByAccession(accession); + if(!fetchAssembly.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + if (fetchAssembly.get().getInsdcAccession() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity + insertAssemblySequence(fetchAssembly.get()); + logger.info("Successfully inserted assembly for accession " + accession); + }else { + logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession); + } + } + + @Transactional + public void insertAssemblySequence(AssemblySequencesEntity entity) { + if (isEntityPresent(entity)) { + throw duplicateAssemblySequenceInsertionException(null, entity); + } else { + repository.save(entity); + } + } + + private boolean isEntityPresent(AssemblySequencesEntity entity) { + // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY + 
Optional<AssemblySequencesEntity> existingAssembly = repository.findAssemblySequenceEntityByInsdcAccession(entity.getInsdcAccession()); + return existingAssembly.isPresent(); + } + + private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequencesEntity present) { + StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists"); + if (accession != null){ + exception.append("\n"); + exception.append("Assembly Sequence trying to insert:"); + exception.append("\t"); + exception.append(accession); + } + if (present != null){ + exception.append("\n"); + exception.append("Assembly Sequence already present"); + exception.append("\t"); + exception.append(present); + } + return new DuplicateAssemblySequenceException(exception.toString()); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java index 455c0582..a8aecd49 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/GzipCompress.java @@ -19,7 +19,7 @@ public class GzipCompress { /** * Decompress (Unzip) a .gz file and save the output file in the same * input file's location. 
- * The output file's name will be the same as the input's but without '.gz' + * The output file's name will be genome_sequence.fna * @return The output (decompressed) file path*/ public Optional<Path> unzip(String compressedFilePath, String outputDirPath) { String outputFileName = "genome_sequence.fna"; diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java new file mode 100644 index 00000000..72a3c0ee --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java @@ -0,0 +1,20 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +import javax.xml.bind.DatatypeConverter; + +public class MD5Digest { + + /** + * Return the digest of the text using the MD5 algorithm*/ + public String hash(String text) throws NoSuchAlgorithmException { + MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(text.getBytes()); + byte[] digest = md.digest(); + String textHash = DatatypeConverter + .printHexBinary(digest).toUpperCase(); + return textHash.toLowerCase(); + } +} diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java new file mode 100644 index 00000000..d1305371 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java @@ -0,0 +1,55 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import 
uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +import static org.junit.jupiter.api.Assertions.*; +@SpringBootTest +class NCBIAssemblySequencesDataSourceTest { + + + @Autowired + NCBIAssemblySequencesDataSource dataSource; + + @BeforeEach + void setUp() { + } + + @AfterEach + void tearDown() { + } + + @Test + void getAssemblySequenceByAccession() throws IOException, NoSuchAlgorithmException, InterruptedException { + + + String accession = "GCF_000001765.3"; + //String accession2 = "GCF_000001405.31"; + Optional<AssemblySequencesEntity> entity = dataSource.getAssemblySequencesByAccession(accession); + //displayAssemblySequencesEntityContent(entity.get()); + assertEquals(accession, entity.get().getInsdcAccession()); + } + + void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity) throws InterruptedException { + System.out.println("ACCESSION: " + entity.getInsdcAccession()); + System.out.println("TOTAL NUMBER OF SEQUENCES: " + entity.getSequences().size()); + for (Sequence s: entity.getSequences()){ + System.out.print("REFSEQ: " + s.getRefseq() + " | "); + System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5()); + Thread.sleep(1000); // Just for lazy and fun display :) + } + } + + @Test + void downloadAssemblySequence() { + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java new file mode 100644 index 00000000..b652ea13 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java @@ -0,0 +1,67 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; + +import 
org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +import static org.junit.jupiter.api.Assertions.*; + +@SpringBootTest +class NCBIAssemblySequencesReaderTest { + + private static final String ACCESSION = "GCF_000001765.3"; + + private static final String FASTA_FILE_PATH = "/tmp/genome_sequence.fna"; + private InputStreamReader streamReader; + + private InputStream stream; + + @Autowired + private NCBIAssemblySequencesReaderFactory readerFactory; + + private NCBIAssemblySequencesReader reader; + + @BeforeEach + void setUp() throws FileNotFoundException { + stream = new FileInputStream(FASTA_FILE_PATH); + streamReader = new InputStreamReader(stream); + reader = readerFactory.build(streamReader, ACCESSION); + } + + @AfterEach + void tearDown() throws IOException { + stream.close(); + streamReader.close(); + } + + @Test + void getAssemblySequencesReader() throws IOException { + assertTrue(reader.ready()); + } + + @Test + void assertParsedFastaFileValid() throws IOException, NoSuchAlgorithmException { + reader.parseFile(); + displayAssemblySequencesEntityContent(reader.assemblySequencesEntity); + assertEquals(ACCESSION, reader.assemblySequencesEntity.getInsdcAccession()); + } + + void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity){ + System.out.println("ACCESSION: " + entity.getInsdcAccession()); + for (Sequence s: entity.getSequences()){ + System.out.print("REFSEQ: " + s.getRefseq() + " | "); + System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5()); + } + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java 
b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java new file mode 100644 index 00000000..53ba6296 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java @@ -0,0 +1,44 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +import static org.junit.jupiter.api.Assertions.*; + +@SpringBootTest +class AssemblySequencesServiceTest { + + + @Autowired + private AssemblySequencesService assemblySequencesService; + + @Autowired + private AssemblySequencesRepository assemblySequencesRepository; + + @BeforeEach + void setUp() { + } + + @AfterEach + void tearDown() { + } + + @Test + void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmException { + String accession = "GCF_000001765.3"; + assemblySequencesService.fetchAndInsertAssemblySequence(accession); + assertNotNull(assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession)); + assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get()); + } + + @Test + void insertAssemblySequence() { + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java new file mode 100644 index 00000000..a2ea9f99 --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class GzipCompressTest 
{ + + @Test + void unzip() { + String compressedFilePath = "/tmp/GCF_000001765.3_Dpse_3.0_genomic.fna.gz"; + String outputDirPath = "/tmp"; + GzipCompress gzipCompress = new GzipCompress(); + + + assertEquals("/tmp/genome_sequence.fna", gzipCompress.unzip(compressedFilePath, outputDirPath).get().toString()); + } +} \ No newline at end of file diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java new file mode 100644 index 00000000..1676e77d --- /dev/null +++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.security.NoSuchAlgorithmException; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class MD5DigestTest { + + @Test + void hash() throws NoSuchAlgorithmException { + MD5Digest md5Digest = new MD5Digest(); + String toBeHashed = "AAA"; + String MD5Digest = "8880cd8c1fb402585779766f681b868b"; + assertEquals(MD5Digest,md5Digest.hash(toBeHashed)); + } +} \ No newline at end of file From 0ce600ef877926d03a8b17fd225c6daa08de3f2c Mon Sep 17 00:00:00 2001 From: waterflow80 <thessalonikaathena@outlook.com> Date: Sat, 3 Jun 2023 18:42:19 +0100 Subject: [PATCH 4/6] local setup for dev --- .../authentication/SecurityConfiguration.java | 2 +- .../AssemblySequenceDataSource.java | 12 -- .../NCBIAssemblySequenceDataSource.java | 105 ------------------ .../dus2/AssemblySequenceReader.java | 39 ------- .../dus2/NCBIAssemblySequenceReader.java | 27 ----- .../NCBIAssemblySequenceReaderFactory.java | 18 --- .../entities/AssemblySequenceEntity.java | 23 ---- .../repo/AssemblySequenceRepository.java | 14 --- .../service/AssemblySequenceService.java | 80 ------------- src/main/resources/application.properties | 19 ++-- .../eva/contigalias/utils/MD5DigestTest.java | 18 --- 11 files changed, 11 insertions(+), 346 deletions(-) delete mode 100644 
src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java delete mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java delete mode 100644 src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java index 57f85825..073b13ff 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java @@ -57,7 +57,7 @@ protected void configure(HttpSecurity http) throws Exception { .authorizeRequests() .antMatchers("/v1/assemblies/**").permitAll() .antMatchers("/v1/chromosomes/**").permitAll() - .antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN) + //.antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN) .and().httpBasic().realmName(REALM) .authenticationEntryPoint(customBasicAuthenticationEntryPoint) .and().sessionManagement().sessionCreationPolicy(SessionCreationPolicy.STATELESS); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java deleted file mode 100644 index 3a4d5b46..00000000 --- 
a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequenceDataSource.java +++ /dev/null @@ -1,12 +0,0 @@ -package uk.ac.ebi.eva.contigalias.datasource; - -import java.io.IOException; -import java.util.Optional; - -import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; - -public interface AssemblySequenceDataSource { - - Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException; - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java deleted file mode 100644 index d76741cf..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequenceDataSource.java +++ /dev/null @@ -1,105 +0,0 @@ -package uk.ac.ebi.eva.contigalias.datasource; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Optional; - -import org.apache.commons.net.ftp.FTPFile; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.retry.annotation.Backoff; -import org.springframework.retry.annotation.Retryable; -import org.springframework.stereotype.Repository; -import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReader; -import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequenceReaderFactory; -import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; -import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; -import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; -import uk.ac.ebi.eva.contigalias.utils.GzipCompress; - -@Repository("NCBISequenceDataSource") -public class NCBIAssemblySequenceDataSource implements AssemblySequenceDataSource{ - - private final Logger logger = 
LoggerFactory.getLogger(NCBIAssemblySequenceDataSource.class); - - private final NCBIBrowserFactory factory; - - private final NCBIAssemblySequenceReaderFactory readerFactory; - - @Value("${asm.file.download.dir}") - private String asmFileDownloadDir; - - @Autowired - public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory, - NCBIAssemblySequenceReaderFactory readerFactory){ - this.factory = factory; - this.readerFactory = readerFactory; - } - - @Override - public Optional<AssemblySequenceEntity> getAssemblySequenceByAccession(String accession) throws IOException, IllegalArgumentException { - NCBIBrowser ncbiBrowser = factory.build(); - ncbiBrowser.connect(); - GzipCompress gzipCompress = new GzipCompress(); - - Optional<Path> downloadFilePath = downloadAssemblySequence(accession, ncbiBrowser); - if (!downloadFilePath.isPresent()) { - return Optional.empty(); - } - logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath); - // Uncompress the .gz file - Optional<Path> uncompressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir); - if (!uncompressedFilePath.isPresent()){ - return Optional.empty(); - } - - AssemblySequenceEntity assemblySequenceEntity; - try (InputStream stream = new FileInputStream(uncompressedFilePath.get().toFile())){ - NCBIAssemblySequenceReader reader = readerFactory.build(stream); - assemblySequenceEntity = reader.getAssemblySequenceEntity(); - //TODO : The logger info will be canged when we add more attributes to the entity and we parse the whole file info - logger.info("NCBI: Name of the sequence in " + accession + " : " + assemblySequenceEntity.getName()); - } finally { - try { - ncbiBrowser.disconnect(); - //Files.deleteIfExists(downloadFilePath.get()); - } catch (IOException e) { - logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")"); - } - } - return Optional.of(assemblySequenceEntity); - } - - - /** - * Download the 
assembly fna/fasta file given the accession and save it to /tmp - * After this method is called, the file will be downloaded, and the path to this file - * on your local computer will be returned*/ - @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) - public Optional<Path> downloadAssemblySequence(String accession, NCBIBrowser ncbiBrowser) throws IOException { - // The same directory as the report file - Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession); - - if (!directory.isPresent()) { - return Optional.empty(); - } - - logger.info("NCBI directory for assembly genomic.fna download: " + directory.get()); - FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get()); - String ftpFilePath = directory.get() + ftpFile.getName(); - Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); - boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); - if (success) { - logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")"); - return Optional.of(downloadFilePath); - } else { - logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")"); - return Optional.empty(); - } - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java deleted file mode 100644 index 0b107042..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequenceReader.java +++ /dev/null @@ -1,39 +0,0 @@ -package uk.ac.ebi.eva.contigalias.dus2; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; - -import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; - -public abstract class AssemblySequenceReader { - - protected final BufferedReader reader; - - protected AssemblySequenceEntity assemblySequenceEntity; - 
- protected boolean fileParsed = false; - - - public AssemblySequenceReader(InputStreamReader inputStreamReader){ - this.reader = new BufferedReader(inputStreamReader); - } - - public AssemblySequenceEntity getAssemblySequenceEntity() throws IOException { - if(!fileParsed || assemblySequenceEntity == null){ - parseFile(); - } - return assemblySequenceEntity; - } - - protected abstract void parseFile() throws IOException, NullPointerException; - - - protected abstract void parseAssemblySequenceEntity(String line); - - - - public boolean ready() throws IOException { - return reader.ready(); - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java deleted file mode 100644 index 12e01689..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReader.java +++ /dev/null @@ -1,27 +0,0 @@ -package uk.ac.ebi.eva.contigalias.dus2; - -import java.io.IOException; -import java.io.InputStreamReader; - -public class NCBIAssemblySequenceReader extends AssemblySequenceReader{ - - public NCBIAssemblySequenceReader(InputStreamReader inputStreamReader){ - super(inputStreamReader); - } - - @Override - protected void parseFile() throws IOException, NullPointerException { - if (reader == null){ - throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); - } - // TODO: HERE WE'LL EXTARACT THE .gz FILE AND PARSE THE fna FILE - } - - @Override - // Parsing a line of the file - protected void parseAssemblySequenceEntity(String line) { - // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY) - // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY - // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java 
b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java deleted file mode 100644 index 06867aba..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequenceReaderFactory.java +++ /dev/null @@ -1,18 +0,0 @@ -package uk.ac.ebi.eva.contigalias.dus2; - -import java.io.InputStream; -import java.io.InputStreamReader; - -import org.springframework.stereotype.Component; - -@Component -public class NCBIAssemblySequenceReaderFactory { - - public NCBIAssemblySequenceReader build(InputStream inputStream){ - return new NCBIAssemblySequenceReader(new InputStreamReader(inputStream)); - } - - public NCBIAssemblySequenceReader build(InputStreamReader inputStreamReader){ - return new NCBIAssemblySequenceReader(inputStreamReader); - } -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java deleted file mode 100644 index c1a58894..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequenceEntity.java +++ /dev/null @@ -1,23 +0,0 @@ -package uk.ac.ebi.eva.contigalias.entities; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; - -import lombok.Getter; -import lombok.Setter; - -@Setter -@Getter -@Table(name = "AssemblySequence") -@Entity -public class AssemblySequenceEntity { - - @Id - @Column(nullable = false) - private String accession; - - @Column(nullable = false) - private String name; -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java deleted file mode 100644 index 6eb6fa01..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequenceRepository.java +++ /dev/null @@ -1,14 +0,0 @@ -package uk.ac.ebi.eva.contigalias.repo; - -import java.util.Optional; - -import 
org.springframework.data.jpa.repository.JpaRepository; -import org.springframework.stereotype.Repository; -import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; - -@Repository -public interface AssemblySequenceRepository extends JpaRepository<AssemblySequenceEntity, Long> { - Optional<AssemblySequenceEntity> findAssemblySequenceEntityByAccession(String accession); - - -} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java deleted file mode 100644 index 5dfd917a..00000000 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequenceService.java +++ /dev/null @@ -1,80 +0,0 @@ -package uk.ac.ebi.eva.contigalias.service; - -import java.io.IOException; -import java.util.Optional; - -import javax.transaction.Transactional; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.stereotype.Service; -import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequenceDataSource; -import uk.ac.ebi.eva.contigalias.entities.AssemblySequenceEntity; -import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; -import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; -import uk.ac.ebi.eva.contigalias.repo.AssemblySequenceRepository; - -@Service -public class AssemblySequenceService { - - private final AssemblySequenceRepository repository; - - private final NCBIAssemblySequenceDataSource ncbiSequenceDataSource; - - private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); - - - public AssemblySequenceService( - AssemblySequenceRepository repository, NCBIAssemblySequenceDataSource ncbiSequenceDataSource){ - this.repository = repository; - this.ncbiSequenceDataSource = ncbiSequenceDataSource; - } - - public void fetchAndInsertAssemblySequence(String accession) throws IOException { - Optional<AssemblySequenceEntity> entity = 
repository.findAssemblySequenceEntityByAccession(accession); - if(entity.isPresent()) - throw duplicateAssemblySequenceInsertionException(accession, entity.get()); - Optional<AssemblySequenceEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequenceByAccession(accession); - if(!fetchAssembly.isPresent()){ - throw new AssemblySequenceNotFoundException(accession); - } - if (fetchAssembly.get().getName() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity - insertAssemblySequence(fetchAssembly.get()); - logger.info("Successfully inserted assembly for accession " + accession); - }else { - logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession); - } - } - - @Transactional - public void insertAssemblySequence(AssemblySequenceEntity entity) { - if (isEntityPresent(entity)) { - throw duplicateAssemblySequenceInsertionException(null, entity); - } else { - repository.save(entity); - } - } - - private boolean isEntityPresent(AssemblySequenceEntity entity) { - // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY - Optional<AssemblySequenceEntity> existingAssembly = repository.findAssemblySequenceEntityByAccession(entity.getAccession()); - return existingAssembly.isPresent(); - } - - private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequenceEntity present) { - StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists"); - if (accession != null){ - exception.append("\n"); - exception.append("Assembly Sequence trying to insert:"); - exception.append("\t"); - exception.append(accession); - } - if (present != null){ - exception.append("\n"); - exception.append("Assembly Sequence already present"); - exception.append("\t"); - exception.append(present); - } - return new DuplicateAssemblySequenceException(exception.toString()); - } -} diff --git 
a/src/main/resources/application.properties b/src/main/resources/application.properties index 514ac4f3..efa59a98 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -14,8 +14,8 @@ # limitations under the License. # -controller.auth.admin.username=@contig-alias.admin-user@ -controller.auth.admin.password=@contig-alias.admin-password@ +controller.auth.admin.username=haroune +controller.auth.admin.password=password management.endpoints.web.exposure.include=info,health management.endpoints.web.base-path=/ @@ -24,20 +24,21 @@ management.info.git.mode=full logging.level.uk.ac.ebi.eva.contigalias=DEBUG # Database configuration -spring.datasource.url=@contig-alias.db-url@ -spring.datasource.username=@contig-alias.db-username@ -spring.datasource.password=@contig-alias.db-password@ -spring.jpa.hibernate.ddl-auto=@contig-alias.ddl-behaviour@ +spring.datasource.url=jdbc:postgresql://localhost:5432/contig_db +spring.datasource.username=haroune +spring.datasource.password=123 +spring.jpa.hibernate.ddl-auto=update spring.datasource.driver-class-name=org.postgresql.Driver spring.jpa.database-platform=org.hibernate.dialect.PostgreSQLDialect spring.jpa.generate-ddl=true server.servlet.context-path=/eva/webservices/contig-alias +server.port=8081 -ftp.proxy.host=@ftp.proxy.host@ -ftp.proxy.port=@ftp.proxy.port@ +ftp.proxy.host=null +ftp.proxy.port=0 -config.scaffolds.enabled = @contig-alias.scaffolds-enabled@ +config.scaffolds.enabled = true asm.file.download.dir=/tmp diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java deleted file mode 100644 index 1676e77d..00000000 --- a/src/test/java/uk/ac/ebi/eva/contigalias/utils/MD5DigestTest.java +++ /dev/null @@ -1,18 +0,0 @@ -package uk.ac.ebi.eva.contigalias.utils; - -import java.security.NoSuchAlgorithmException; - -import org.junit.jupiter.api.Test; - -import static 
org.junit.jupiter.api.Assertions.*; - -class MD5DigestTest { - - @Test - void hash() throws NoSuchAlgorithmException { - MD5Digest md5Digest = new MD5Digest(); - String toBeHashed = "AAA"; - String MD5Digest = "8880cd8c1fb402585779766f681b868b"; - assertEquals(MD5Digest,md5Digest.hash(toBeHashed)); - } -} \ No newline at end of file From 16a63f696dcaa87218a5670b2234d9775d30ef4f Mon Sep 17 00:00:00 2001 From: waterflow80 <thessalonikaathena@outlook.com> Date: Mon, 5 Jun 2023 18:12:13 +0100 Subject: [PATCH 5/6] Adding the feature of retrieving the fasta file from the NCBI datasource and parsing the content. Added the necessary code to retrieve the fasta file related to a specific assembly. Parsing the file and saving related information alongside the Contig-alias Backend --- .../eva/contigalias/repo/ChromosomeRepository.java | 11 +++++++++++ .../service/AssemblySequencesService.java | 13 ++++++++++++- .../eva/contigalias/service/ChromosomeService.java | 7 +++++++ .../service/AssemblySequencesServiceTest.java | 2 +- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index 0b6f5bd7..920e488e 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -19,11 +19,17 @@ import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.data.jpa.repository.Modifying; +import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import javax.transaction.Transactional; + + @Repository 
public interface ChromosomeRepository extends JpaRepository<ChromosomeEntity, Long> { @@ -63,6 +69,11 @@ Page<ChromosomeEntity> findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyR Page<ChromosomeEntity> findChromosomeEntitiesByUcscName(String ucscName, Pageable request); + @Transactional + @Modifying + @Query("UPDATE ChromosomeEntity c SET c.md5checksum = :md5Checksum WHERE c.refseq = :refseq") + int updateChromosomeEntityByRefseqSetMD5Checksum(@Param(value = "refseq") String refseq, @Param(value = "md5Checksum") String md5Checksum); + long countChromosomeEntitiesByInsdcAccession(String insdcAccession); long countChromosomeEntitiesByRefseq(String refseq); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java index 5da37b27..85b32b05 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java @@ -8,9 +8,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource; import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; @@ -18,6 +20,9 @@ @Service public class AssemblySequencesService { + @Autowired + private ChromosomeService chromosomeService; + private final AssemblySequencesRepository repository; private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource; @@ -39,7 +44,7 @@ public void fetchAndInsertAssemblySequence(String accession) throws IOException, 
if(!fetchAssembly.isPresent()){ throw new AssemblySequenceNotFoundException(accession); } - if (fetchAssembly.get().getInsdcAccession() != null){ // This condition is only for testing, it'll change as soon as we add more attributes to the entity + if (fetchAssembly.get().getInsdcAccession() != null){ insertAssemblySequence(fetchAssembly.get()); logger.info("Successfully inserted assembly for accession " + accession); }else { @@ -52,6 +57,12 @@ public void insertAssemblySequence(AssemblySequencesEntity entity) { if (isEntityPresent(entity)) { throw duplicateAssemblySequenceInsertionException(null, entity); } else { + // Inserting the sequences' md5Checksum in the correct place in the chromosome table + for (Sequence s: entity.getSequences()){ + chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getRefseq(), s.getSequenceMD5()); + logger.info("Successfully updated chromosome table with md5Checksum: "+ s.getSequenceMD5() + "" + + " Where refseq = "+s.getRefseq()); + } repository.save(entity); } } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 93679963..365796a0 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -199,6 +199,13 @@ public void deleteChromosome(ChromosomeEntity entity) { repository.delete(entity); } + /** + * Update the chromosome table; set the md5Checksum for the entry that has the given + * chromosome refseq*/ + public int updateChromosomeEntityByRefseqSetMD5Checksum(String refseq, String md5Checksum){ + return repository.updateChromosomeEntityByRefseqSetMD5Checksum(refseq, md5Checksum); + } + public long countChromosomeEntitiesByInsdcAccession(String insdcAccession) { return repository.countChromosomeEntitiesByInsdcAccession(insdcAccession); } diff --git 
a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java index 53ba6296..70c8e146 100644 --- a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java +++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java @@ -35,7 +35,7 @@ void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmExcepti String accession = "GCF_000001765.3"; assemblySequencesService.fetchAndInsertAssemblySequence(accession); assertNotNull(assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession)); - assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get()); + assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get().getInsdcAccession()); } @Test From 43f50f43ecd65dd8d51255f71ca948c2bf963dae Mon Sep 17 00:00:00 2001 From: waterflow80 <thessalonikaathena@outlook.com> Date: Wed, 7 Jun 2023 22:16:05 +0100 Subject: [PATCH 6/6] Completed the retrieval and the parsing of the fasta file - Retrieve the assembly report and the fasta file of a given accession - Parse the file and retrieve the assembly sequences - Hash the sequences using the md5 algorithm - Construct the level 2 sequence collection object - We can also use the function of the retrieval of the fasta file independently and save the sequences in a separate table --- .../NCBIAssemblySequencesDataSource.java | 6 +- .../dus2/NCBIAssemblySequencesReader.java | 10 +- .../entities/AssemblySequencesEntity.java | 22 ++- .../contigalias/entities/SeqColEntity.java | 43 ++++++ .../eva/contigalias/entities/Sequence.java | 18 ++- .../repo/AssemblySequencesRepository.java | 2 +- .../contigalias/repo/SequenceRepository.java | 2 + .../service/AssemblySequencesService.java | 31 +++-- .../service/SequenceCollectionService.java | 127 
++++++++++++++++++ .../contigalias/utils/DigestGenerator.java | 7 + .../utils/{MD5Digest.java => MD5Hash.java} | 3 +- .../utils/SerializationService.java | 21 +++ 12 files changed, 246 insertions(+), 46 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java rename src/main/java/uk/ac/ebi/eva/contigalias/utils/{MD5Digest.java => MD5Hash.java} (90%) create mode 100644 src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java index 211ab422..2c966ecd 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java @@ -44,6 +44,9 @@ public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory, } @Override + /** + * Return the assemblySequencesEntity which contains the list of sequences of the assembly + * with the given accession. 
The sequences are hashed using md5 algorithm*/ public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException { NCBIBrowser ncbiBrowser = factory.build(); ncbiBrowser.connect(); @@ -59,7 +62,6 @@ public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String if (!compressedFilePath.isPresent()){ return Optional.empty(); } - AssemblySequencesEntity assemblySequencesEntity; try (InputStream stream = new FileInputStream(compressedFilePath.get().toFile())){ NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession); @@ -69,7 +71,7 @@ public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String try { ncbiBrowser.disconnect(); Files.deleteIfExists(downloadFilePath.get()); - Files.deleteIfExists(compressedFilePath.get()); // Deleting the fasta file + Files.deleteIfExists(compressedFilePath.get()); // Deleting the fna.gz file } catch (IOException e) { logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")"); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java index b979a8eb..a58290d2 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java @@ -8,7 +8,8 @@ import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; import uk.ac.ebi.eva.contigalias.entities.Sequence; -import uk.ac.ebi.eva.contigalias.utils.MD5Digest; +import uk.ac.ebi.eva.contigalias.utils.DigestGenerator; +import uk.ac.ebi.eva.contigalias.utils.MD5Hash; public class NCBIAssemblySequencesReader extends AssemblySequencesReader { @@ -21,19 +22,19 @@ protected void parseFile() throws IOException, NullPointerException, NoSuchAlgor if (reader == null){ throw new 
NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); } - MD5Digest md5Digest = new MD5Digest(); + DigestGenerator md5Digest = new MD5Hash(); if (assemblySequencesEntity == null){ assemblySequencesEntity = new AssemblySequencesEntity(); } // Setting the accession of the whole assembly file - assemblySequencesEntity.setInsdcAccession(accession); + assemblySequencesEntity.setAssemblyInsdcAccession(accession); List<Sequence> sequences = new LinkedList<>(); String line = reader.readLine(); while (line != null){ if (line.startsWith(">")){ Sequence sequence = new Sequence(); String refSeq = line.substring(1, line.indexOf(' ')); - sequence.setRefseq(refSeq); + sequence.setSequenceRefseq(refSeq); line = reader.readLine(); StringBuilder sequenceValue = new StringBuilder(); while (line != null && !line.startsWith(">")){ @@ -47,6 +48,7 @@ protected void parseFile() throws IOException, NullPointerException, NoSuchAlgor } } assemblySequencesEntity.setSequences(sequences); + String digest0; // The level 0 digest of the object fileParsed = true; reader.close(); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java index 5875b48d..5966a04a 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java @@ -3,35 +3,33 @@ import java.util.List; -import javax.persistence.CascadeType; -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.JoinColumn; -import javax.persistence.OneToMany; -import javax.persistence.Table; +import javax.persistence.*; +import com.fasterxml.jackson.annotation.JsonInclude; import io.swagger.annotations.ApiModelProperty; +import lombok.Data; import lombok.Getter; import lombok.Setter; import 
org.hibernate.annotations.LazyCollection; import org.hibernate.annotations.LazyCollectionOption; -@Setter -@Getter -@Table(name = "AssemblySequences") +@Data +@Table(name = "assemblySequences") @Entity public class AssemblySequencesEntity { @Id @Column(nullable = false) @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.") - private String insdcAccession; + private String assemblyInsdcAccession; @ApiModelProperty(value = "List of all sequences of the assembly.") + @JsonInclude(JsonInclude.Include.NON_NULL) @LazyCollection(LazyCollectionOption.FALSE) @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL) - @JoinColumn(name = "insdcAccession", referencedColumnName = "insdcAccession") + //@OneToMany(mappedBy = "assemblySequences", cascade = CascadeType.ALL) + //@JoinColumn(name = "assembly_insdc_accession", referencedColumnName = "assemblyInsdcAccession") + @JoinColumn(name = "assembly_insdc_accession") private List<Sequence> sequences; } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java new file mode 100644 index 00000000..e60e576c --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java @@ -0,0 +1,43 @@ +package uk.ac.ebi.eva.contigalias.entities; + +import io.swagger.annotations.ApiModelProperty; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +import javax.persistence.EnumType; +import javax.persistence.Enumerated; +import java.util.List; + +@Data +@NoArgsConstructor +public class SeqColEntity { + + @ApiModelProperty(value = "The level 0 digest of the object") + private String digest; + + @ApiModelProperty(value = "The representation level of the the object") + @Enumerated(EnumType.ORDINAL) + private Level level; + + @ApiModelProperty(value = "The naming convention used to construct this seqCol object") + 
@Enumerated(EnumType.STRING) + private NamingConvention namingConvention; + + @ApiModelProperty(value = "The array of the sequences' lengths") + private List<Long> lengths; + + @ApiModelProperty(value = "The array of the sequences' names") + private List<String> names; + + @ApiModelProperty(value = "The array of the sequences") + private List<String> sequences; + + public enum Level { + ZERO, ONE, TWO + } + + public enum NamingConvention { + ENA, GENBANK, UCSC + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java index 728b5987..8ab22b48 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java @@ -1,13 +1,8 @@ package uk.ac.ebi.eva.contigalias.entities; -import javax.persistence.CascadeType; -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.JoinColumn; -import javax.persistence.ManyToOne; -import javax.persistence.Table; +import javax.persistence.*; +import com.fasterxml.jackson.annotation.JsonInclude; import io.swagger.annotations.ApiModelProperty; import lombok.Getter; import lombok.Setter; @@ -23,12 +18,15 @@ public class Sequence { @Id @Column(nullable = false) @ApiModelProperty(value = "Assembly's Refseq accession.") - private String refseq; + private String sequenceRefseq; - @Column + @Column(nullable = false) @ApiModelProperty(value = "Sequence's MD5 checksum value.") private String sequenceMD5; - + /*@JsonInclude(JsonInclude.Include.NON_NULL) + @ManyToOne + @JoinColumn(name = "assembly_insdc_accession", nullable = false) + private AssemblySequencesEntity assemblySequences;*/ } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java index 0992b3c3..ce1b9321 100644 --- 
a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java @@ -8,7 +8,7 @@ @Repository public interface AssemblySequencesRepository extends JpaRepository<AssemblySequencesEntity, String> { - Optional<AssemblySequencesEntity> findAssemblySequenceEntityByInsdcAccession(String accession); + Optional<AssemblySequencesEntity> findAssemblySequenceEntityByAssemblyInsdcAccession(String accession); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java index ba9164b0..c9415632 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java @@ -1,7 +1,9 @@ package uk.ac.ebi.eva.contigalias.repo; import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.entities.Sequence; +@Repository public interface SequenceRepository extends JpaRepository<Sequence, String> { } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java index 85b32b05..0886e345 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java @@ -25,51 +25,50 @@ public class AssemblySequencesService { private final AssemblySequencesRepository repository; - private final NCBIAssemblySequencesDataSource ncbiSequenceDataSource; + private final NCBIAssemblySequencesDataSource ncbiSequencesDataSource; - private final Logger logger = LoggerFactory.getLogger(AssemblyService.class); + private final Logger logger = LoggerFactory.getLogger(AssemblySequencesService.class); public AssemblySequencesService( - 
AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequenceDataSource){ + AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequencesDataSource){ this.repository = repository; - this.ncbiSequenceDataSource = ncbiSequenceDataSource; + this.ncbiSequencesDataSource = ncbiSequencesDataSource; } public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException { - Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByInsdcAccession(accession); + Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(accession); if(entity.isPresent()) throw duplicateAssemblySequenceInsertionException(accession, entity.get()); - Optional<AssemblySequencesEntity> fetchAssembly = ncbiSequenceDataSource.getAssemblySequencesByAccession(accession); - if(!fetchAssembly.isPresent()){ + Optional<AssemblySequencesEntity> fetchAssemblySequences = ncbiSequencesDataSource.getAssemblySequencesByAccession(accession); + if(!fetchAssemblySequences.isPresent()){ throw new AssemblySequenceNotFoundException(accession); } - if (fetchAssembly.get().getInsdcAccession() != null){ - insertAssemblySequence(fetchAssembly.get()); - logger.info("Successfully inserted assembly for accession " + accession); + if (fetchAssemblySequences.get().getAssemblyInsdcAccession() != null){ + insertAssemblySequences(fetchAssemblySequences.get()); + logger.info("Successfully inserted assembly sequences for accession: " + accession); }else { - logger.error("Skipping inserting assembly sequence : No name in assembly : " + accession); + logger.error("Skipping inserting assembly sequences : No name in assembly: " + accession); } } @Transactional - public void insertAssemblySequence(AssemblySequencesEntity entity) { + public void insertAssemblySequences(AssemblySequencesEntity entity) { if (isEntityPresent(entity)) { throw 
duplicateAssemblySequenceInsertionException(null, entity); } else { // Inserting the sequences' md5Checksum in the correct place in the chromosome table for (Sequence s: entity.getSequences()){ - chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getRefseq(), s.getSequenceMD5()); - logger.info("Successfully updated chromosome table with md5Checksum: "+ s.getSequenceMD5() + "" + - " Where refseq = "+s.getRefseq()); + chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getSequenceRefseq(), s.getSequenceMD5()); } + System.out.println("Assembly_insdc_accession: " + entity.getAssemblyInsdcAccession()); repository.save(entity); } } private boolean isEntityPresent(AssemblySequencesEntity entity) { // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY - Optional<AssemblySequencesEntity> existingAssembly = repository.findAssemblySequenceEntityByInsdcAccession(entity.getInsdcAccession()); + Optional<AssemblySequencesEntity> existingAssembly = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(entity.getAssemblyInsdcAccession()); return existingAssembly.isPresent(); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java new file mode 100644 index 00000000..55f44c57 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java @@ -0,0 +1,127 @@ +package uk.ac.ebi.eva.contigalias.service; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource; +import uk.ac.ebi.eva.contigalias.entities.*; +import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; +import 
uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.*; + +@Service +public class SequenceCollectionService { + + @Autowired + private AssemblyService assemblyService; + + @Autowired + private AssemblySequencesService assemblySequencesService; + + private final AssemblyRepository assemblyRepository; + + private final AssemblySequencesRepository assemblySequencesRepository; + + private final NCBIAssemblyDataSource assemblyDataSource; + + private final NCBIAssemblySequencesDataSource assemblySequencesDataSource; + + private final Logger logger = LoggerFactory.getLogger(SequenceCollectionService.class); + + public SequenceCollectionService(AssemblyRepository assemblyRepository, AssemblySequencesRepository assemblySequencesRepository, + NCBIAssemblyDataSource assemblyDataSource, NCBIAssemblySequencesDataSource assemblySequencesDataSource) { + this.assemblyRepository = assemblyRepository; + this.assemblySequencesRepository = assemblySequencesRepository; + this.assemblyDataSource = assemblyDataSource; + this.assemblySequencesDataSource = assemblySequencesDataSource; + } + + /** + * Search for the assembly report as well as the assembly real sequences and insert them + * in the database. 
+ * Use the given naming convention while constructing the SeqCol Object*/ + public void fetchAndInsertSequenceCollection(String accession, SeqColEntity.NamingConvention namingConvention) + throws IOException, NoSuchAlgorithmException { + // TODO: Check if the needed seqCol data does not exist in the database + // TODO: If not, call the appropriate service(s) to fetch it + + Optional<AssemblyEntity> fetchAssembly = assemblyDataSource.getAssemblyByAccession(accession); + if (!fetchAssembly.isPresent()){ + throw new AssemblyNotFoundException(accession); + } + assemblyService.insertAssembly(fetchAssembly.get()); + Optional<AssemblySequencesEntity> fetchAssemblySequences = assemblySequencesDataSource + .getAssemblySequencesByAccession(accession); + if (!fetchAssemblySequences.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + assemblySequencesService.insertAssemblySequences(fetchAssemblySequences.get()); + + SeqColEntity seqColLevel2 = constructSequenceCollectionObjectL2(fetchAssembly.get(), fetchAssemblySequences.get(), + namingConvention); + + } + + /** + * Return a level 2 entity of the sequence collection following the given naming convention. 
+ * */ + public SeqColEntity constructSequenceCollectionObjectL2(AssemblyEntity assemblyEntity, + AssemblySequencesEntity assemblySequencesEntity, + SeqColEntity.NamingConvention namingConvention) { + + List<ChromosomeEntity> chromosomeList = assemblyEntity.getChromosomes(); + List<Sequence> sequenceList = assemblySequencesEntity.getSequences(); + assert chromosomeList.size() == sequenceList.size(); + + Comparator<ChromosomeEntity> chromosomeComparator = (chromosomeEntity, t1) -> + chromosomeEntity.getRefseq().compareTo(t1.getRefseq()); + Comparator<Sequence> sequenceComparator = (sequence, t1) -> sequence.getSequenceRefseq().compareTo(t1.getSequenceRefseq()); + + Collections.sort(chromosomeList, chromosomeComparator); + Collections.sort(sequenceList, sequenceComparator); + + SeqColEntity seqColL2 = new SeqColEntity(); + + + List<String> sequences = new LinkedList<>(); + List<String> names = new LinkedList<>(); + List<Long> lengths = new LinkedList<>(); + + switch (namingConvention) { + case ENA: + for (int i=0; i<sequenceList.size(); i++){ + sequences.add(sequenceList.get(i).getSequenceMD5()); + names.add(chromosomeList.get(i).getEnaSequenceName()); + lengths.add(chromosomeList.get(i).getSeqLength()); + } + break; + case GENBANK: + for (int i=0; i<sequenceList.size(); i++){ + sequences.add(sequenceList.get(i).getSequenceMD5()); + names.add(chromosomeList.get(i).getGenbankSequenceName()); + lengths.add(chromosomeList.get(i).getSeqLength()); + } + break; + case UCSC: + for (int i=0; i<sequenceList.size(); i++){ + sequences.add(sequenceList.get(i).getSequenceMD5()); + names.add(chromosomeList.get(i).getUcscName()); + lengths.add(chromosomeList.get(i).getSeqLength()); + } + } + seqColL2.setSequences(sequences); + seqColL2.setLengths(lengths); + seqColL2.setNames(names); + seqColL2.setLevel(SeqColEntity.Level.TWO); + seqColL2.setNamingConvention(namingConvention); + + return seqColL2; + } +} diff --git 
a/src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java new file mode 100644 index 00000000..9279becb --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/DigestGenerator.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.security.NoSuchAlgorithmException; + +public abstract class DigestGenerator { + public abstract String hash(String text) throws NoSuchAlgorithmException; +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java similarity index 90% rename from src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java rename to src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java index 72a3c0ee..633e215f 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Digest.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java @@ -5,10 +5,11 @@ import javax.xml.bind.DatatypeConverter; -public class MD5Digest { +public class MD5Hash extends DigestGenerator{ /** * Return the digest of the text using the MD5 algorithm*/ + @Override public String hash(String text) throws NoSuchAlgorithmException { MessageDigest md = MessageDigest.getInstance("MD5"); md.update(text.getBytes()); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java new file mode 100644 index 00000000..97464b01 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java @@ -0,0 +1,21 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import org.webpki.jcs.JsonCanonicalizer; + +import java.io.IOException; +import java.util.Optional; + +public class SerializationService { + + /** + * Return a serialized version of the input jsonString using the + * RFC-8785, using the implementation provided by cyberphone/json-canonicalization + * (see on GitHub). 
+ * The jsonString should respect some strict format rules, for example: + * should be delimited with '{ }', etc*/ + public Optional<String> serialize(String jsonString) throws IOException { + JsonCanonicalizer jsonCanonicalizer = new JsonCanonicalizer(jsonString); + String result = jsonCanonicalizer.getEncodedString(); + return Optional.of(result); + } +}