diff --git a/pom.xml b/pom.xml
index f63b0907..689a2619 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,6 +35,8 @@
8
+
+
org.springframework.boot
@@ -147,6 +149,13 @@
1.2.5.RELEASE
+
+ org.projectlombok
+ lombok
+ 1.18.28
+ provided
+
+
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java
index 57f85825..073b13ff 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java
@@ -57,7 +57,7 @@ protected void configure(HttpSecurity http) throws Exception {
.authorizeRequests()
.antMatchers("/v1/assemblies/**").permitAll()
.antMatchers("/v1/chromosomes/**").permitAll()
- .antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN)
+                .antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN)
.and().httpBasic().realmName(REALM)
.authenticationEntryPoint(customBasicAuthenticationEntryPoint)
.and().sessionManagement().sessionCreationPolicy(SessionCreationPolicy.STATELESS);
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java
new file mode 100644
index 00000000..f3a12e03
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java
@@ -0,0 +1,13 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+
+public interface AssemblySequencesDataSource {
+    /** Returns the sequences entity of the assembly with the given accession; the NCBI implementation returns an empty Optional when nothing could be obtained. */
+ Optional getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException;
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
new file mode 100644
index 00000000..2c966ecd
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java
@@ -0,0 +1,109 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import org.apache.commons.net.ftp.FTPFile;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.retry.annotation.Backoff;
+import org.springframework.retry.annotation.Retryable;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader;
+import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory;
+import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
+import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.utils.GzipCompress;
+
+@Repository("NCBISequenceDataSource")
+public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource {
+    private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class);
+    private final NCBIBrowserFactory factory;
+    private final NCBIAssemblySequencesReaderFactory readerFactory;
+    @Value("${asm.file.download.dir}")
+    private String asmFileDownloadDir;
+
+    @Autowired
+    public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory, NCBIAssemblySequencesReaderFactory readerFactory){
+        this.factory = factory;
+        this.readerFactory = readerFactory;
+    }
+
+    /**
+     * Download, decompress and parse the genomic fasta of the assembly with the given accession (the reader hashes
+     * each sequence with MD5). Empty when the file cannot be located, downloaded or decompressed. */
+    @Override
+    public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException {
+        NCBIBrowser ncbiBrowser = factory.build();
+        ncbiBrowser.connect();
+        Path downloadedFile = null;
+        Path decompressedFile = null;
+        try {
+            downloadedFile = downloadAssemblySequences(accession, ncbiBrowser).orElse(null);
+            if (downloadedFile == null) {
+                return Optional.empty();
+            }
+            logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadedFile);
+            decompressedFile = new GzipCompress().unzip(downloadedFile.toString(), asmFileDownloadDir).orElse(null);
+            if (decompressedFile == null) {
+                return Optional.empty();
+            }
+            try (InputStream stream = new FileInputStream(decompressedFile.toFile())) {
+                NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession);
+                AssemblySequencesEntity assemblySequencesEntity = reader.getAssemblySequenceEntity();
+                logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" );
+                return Optional.of(assemblySequencesEntity);
+            }
+        } finally {
+            release(ncbiBrowser, accession, downloadedFile, decompressedFile);
+        }
+    }
+
+    /** Best-effort cleanup run on every exit path: disconnects the browser, then deletes each non-null temporary file. */
+    private void release(NCBIBrowser ncbiBrowser, String accession, Path... temporaryFiles) {
+        try {
+            ncbiBrowser.disconnect();
+        } catch (IOException e) {
+            logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")", e);
+        }
+        for (Path temporaryFile : temporaryFiles) {
+            try {
+                if (temporaryFile != null) {
+                    Files.deleteIfExists(temporaryFile);
+                }
+            } catch (IOException e) {
+                logger.warn("Could not delete temporary file " + temporaryFile + " (assembly: " + accession + ")", e);
+            }
+        }
+    }
+
+    /** Download the assembly genomic fna/fasta archive into asmFileDownloadDir and return its local path (empty on failure).
+     * NOTE(review): it is called through this above, so the @Retryable proxy is presumably bypassed there - confirm. */
+    @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
+    public Optional<Path> downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException {
+        Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession); // same directory as the report file
+        if (!directory.isPresent()) {
+            return Optional.empty();
+        }
+        logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
+        FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
+        String ftpFilePath = directory.get() + ftpFile.getName();
+        Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
+        boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
+        if (success) {
+            logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
+            return Optional.of(downloadFilePath);
+        }
+        logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
+        return Optional.empty();
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
index 30ea4f73..fcb1f8e7 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
@@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient {
public static final String PATH_GENOMES_ALL = "/genomes/all/";
+
private String ftpProxyHost;
private Integer ftpProxyPort;
@@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String directoryPath) throws IOExceptio
return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath));
}
+    /**
+     * Locate the genomic fna/fasta archive in the given FTP directory: the first listed entry whose name
+     * contains "genomic.fna.gz" but not "from". Throws AssemblyNotFoundException when no entry matches. */
+    public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException {
+        return Arrays.stream(super.listFiles(directoryPath))
+                     .filter(file -> !file.getName().contains("from") && file.getName().contains("genomic.fna.gz"))
+                     .findFirst()
+                     .orElseThrow(() -> new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath));
+    }
+
}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java
new file mode 100644
index 00000000..c7a974bb
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java
@@ -0,0 +1,44 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.security.NoSuchAlgorithmException;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+
+public abstract class AssemblySequencesReader {
+    /** Buffered wrapper around the reader handed to the constructor. */
+    protected final BufferedReader reader;
+    /** Accession of the assembly whose sequence file is read. */
+    protected final String accession;
+    /** Parse result; this base class never assigns it, parseFile() implementations are expected to. */
+    protected AssemblySequencesEntity assemblySequencesEntity;
+    /** Expected to be switched on by parseFile() implementations once the file has been consumed. */
+    protected boolean fileParsed = false;
+
+    public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession){
+        this.accession = accession;
+        this.reader = new BufferedReader(inputStreamReader);
+    }
+
+    /** Return the parsed entity, running parseFile() first unless a result is already cached. */
+    public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException {
+        boolean resultCached = fileParsed && assemblySequencesEntity != null;
+        if (!resultCached) {
+            parseFile();
+        }
+        return assemblySequencesEntity;
+    }
+
+    /** Read the file behind {@link #reader}; implementations populate assemblySequencesEntity. */
+    protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException;
+
+    /** Hook for handling a single line of the file. */
+    protected abstract void parseAssemblySequenceEntity(String line);
+
+    /** Report whether the underlying buffered reader is ready to be read. */
+    public boolean ready() throws IOException {
+        return reader.ready();
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
new file mode 100644
index 00000000..a58290d2
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java
@@ -0,0 +1,63 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.security.NoSuchAlgorithmException;
+import java.util.LinkedList;
+import java.util.List;
+
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+import uk.ac.ebi.eva.contigalias.utils.DigestGenerator;
+import uk.ac.ebi.eva.contigalias.utils.MD5Hash;
+
+public class NCBIAssemblySequencesReader extends AssemblySequencesReader {
+    public NCBIAssemblySequencesReader(InputStreamReader inputStreamReader, String accession){
+        super(inputStreamReader, accession);
+    }
+
+    /**
+     * Parse the fasta content: each '>' header opens a record whose refseq is the header text up to the
+     * first space (the whole header when it has none); the lines up to the next header are concatenated
+     * and MD5-hashed. Lines before the first header are skipped. The reader is always closed at the end. */
+    @Override
+    protected void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException {
+        DigestGenerator md5Digest = new MD5Hash();
+        if (assemblySequencesEntity == null){
+            assemblySequencesEntity = new AssemblySequencesEntity();
+        }
+        assemblySequencesEntity.setAssemblyInsdcAccession(accession); // accession of the whole assembly file
+        List<Sequence> sequences = new LinkedList<>();
+        try {
+            String line = reader.readLine();
+            while (line != null && !line.startsWith(">")) { // skip a preamble; it used to spin forever here
+                line = reader.readLine();
+            }
+            while (line != null) { // invariant: line is a '>' header
+                int firstSpace = line.indexOf(' ');
+                Sequence sequence = new Sequence();
+                sequence.setSequenceRefseq(firstSpace < 0 ? line.substring(1) : line.substring(1, firstSpace));
+                StringBuilder sequenceValue = new StringBuilder();
+                line = reader.readLine();
+                while (line != null && !line.startsWith(">")) {
+                    sequenceValue.append(line);
+                    line = reader.readLine();
+                }
+                sequence.setSequenceMD5(md5Digest.hash(sequenceValue.toString()));
+                sequences.add(sequence);
+            }
+        } finally {
+            reader.close();
+        }
+        assemblySequencesEntity.setSequences(sequences);
+        fileParsed = true;
+    }
+
+    @Override
+    // Parsing a line of the file
+    protected void parseAssemblySequenceEntity(String line) {
+        // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY)
+        // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY
+        // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java
new file mode 100644
index 00000000..a727bea1
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java
@@ -0,0 +1,18 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.springframework.stereotype.Component;
+
+@Component
+public class NCBIAssemblySequencesReaderFactory {
+    /** Wrap the raw stream in an InputStreamReader (platform default charset) and build a reader on it. */
+    public NCBIAssemblySequencesReader build(InputStream inputStream, String accession){
+        return build(new InputStreamReader(inputStream), accession);
+    }
+    /** Build a reader for the given assembly accession on top of an existing InputStreamReader. */
+    public NCBIAssemblySequencesReader build(InputStreamReader inputStreamReader, String accession){
+        return new NCBIAssemblySequencesReader(inputStreamReader, accession);
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
new file mode 100644
index 00000000..5966a04a
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java
@@ -0,0 +1,35 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+
+import java.util.List;
+
+import javax.persistence.*;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import io.swagger.annotations.ApiModelProperty;
+import lombok.Data;
+import lombok.Getter;
+import lombok.Setter;
+import org.hibernate.annotations.LazyCollection;
+import org.hibernate.annotations.LazyCollectionOption;
+
+/** JPA entity (table "assemblySequences") keyed by the assembly INSDC accession and holding the assembly's sequences. */
+@Data
+@Table(name = "assemblySequences")
+@Entity
+public class AssemblySequencesEntity {
+ @Id
+ @Column(nullable = false)
+ @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.")
+ private String assemblyInsdcAccession;
+
+ // One-to-many without mappedBy, cascading all operations; LazyCollectionOption.FALSE disables lazy loading.
+ @ApiModelProperty(value = "List of all sequences of the assembly.")
+ @JsonInclude(JsonInclude.Include.NON_NULL)
+ @LazyCollection(LazyCollectionOption.FALSE)
+ @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL)
+ //@OneToMany(mappedBy = "assemblySequences", cascade = CascadeType.ALL)
+ //@JoinColumn(name = "assembly_insdc_accession", referencedColumnName = "assemblyInsdcAccession")
+ @JoinColumn(name = "assembly_insdc_accession")
+ private List sequences;
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java
new file mode 100644
index 00000000..e60e576c
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java
@@ -0,0 +1,43 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+import io.swagger.annotations.ApiModelProperty;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+import javax.persistence.EnumType;
+import javax.persistence.Enumerated;
+import java.util.List;
+
+/** Data holder for a sequence-collection (seqCol) object; Lombok generates the accessors and the no-args constructor. */
+@Data
+@NoArgsConstructor
+public class SeqColEntity {
+ @ApiModelProperty(value = "The level 0 digest of the object")
+ private String digest;
+
+ @ApiModelProperty(value = "The representation level of the the object")
+ @Enumerated(EnumType.ORDINAL) // NOTE(review): the class carries no @Entity, so this JPA mapping is presumably inert - confirm
+ private Level level;
+
+ @ApiModelProperty(value = "The naming convention used to construct this seqCol object")
+ @Enumerated(EnumType.STRING)
+ private NamingConvention namingConvention;
+
+ @ApiModelProperty(value = "The array of the sequences' lengths")
+ private List lengths;
+
+ @ApiModelProperty(value = "The array of the sequences' names")
+ private List names;
+
+ @ApiModelProperty(value = "The array of the sequences")
+ private List sequences;
+
+ public enum Level {
+ ZERO, ONE, TWO
+ }
+
+ public enum NamingConvention {
+ ENA, GENBANK, UCSC
+ }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
new file mode 100644
index 00000000..8ab22b48
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java
@@ -0,0 +1,32 @@
+package uk.ac.ebi.eva.contigalias.entities;
+
+import javax.persistence.*;
+
+import com.fasterxml.jackson.annotation.JsonInclude;
+import io.swagger.annotations.ApiModelProperty;
+import lombok.Getter;
+import lombok.Setter;
+
+
+/** JPA entity (table "Sequence") pairing a sequence's refseq accession with its MD5 checksum. */
+@Getter
+@Setter
+@Entity
+@Table(name = "Sequence")
+public class Sequence {
+ // Primary key of the entity
+ @Id
+ @Column(nullable = false)
+ @ApiModelProperty(value = "Assembly's Refseq accession.")
+ private String sequenceRefseq;
+
+ @Column(nullable = false)
+ @ApiModelProperty(value = "Sequence's MD5 checksum value.")
+ private String sequenceMD5;
+
+ /*@JsonInclude(JsonInclude.Include.NON_NULL)
+ @ManyToOne
+ @JoinColumn(name = "assembly_insdc_accession", nullable = false)
+ private AssemblySequencesEntity assemblySequences;*/
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java
new file mode 100644
index 00000000..03deecb9
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java
@@ -0,0 +1,7 @@
+package uk.ac.ebi.eva.contigalias.exception;
+
+public class AssemblySequenceNotFoundException extends RuntimeException{ // unchecked: no assembly sequence for an accession
+ public AssemblySequenceNotFoundException(String accession) { // the accession is embedded in the exception message
+ super("No assembly sequence corresponding to accession " + accession + " could be found");
+ }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java
new file mode 100644
index 00000000..f382e62f
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java
@@ -0,0 +1,8 @@
+package uk.ac.ebi.eva.contigalias.exception;
+
+/** Unchecked exception carrying a caller-supplied message; the service in this change throws it when an assembly sequence is already stored. */
+public class DuplicateAssemblySequenceException extends RuntimeException{
+ public DuplicateAssemblySequenceException(String msg){
+ super(msg);
+ }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
new file mode 100644
index 00000000..ce1b9321
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java
@@ -0,0 +1,14 @@
+package uk.ac.ebi.eva.contigalias.repo;
+
+import java.util.Optional;
+
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+
+@Repository
+public interface AssemblySequencesRepository extends JpaRepository {
+    /** Finder by assembly INSDC accession; presumably a Spring Data query derived from the method name - confirm. */
+ Optional findAssemblySequenceEntityByAssemblyInsdcAccession(String accession);
+
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java
index 0b6f5bd7..920e488e 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java
@@ -19,11 +19,17 @@
import org.springframework.data.domain.Page;
import org.springframework.data.domain.Pageable;
import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.Modifying;
+import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;
import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity;
import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity;
+import javax.transaction.Transactional;
+
+
@Repository
public interface ChromosomeRepository extends JpaRepository {
@@ -63,6 +69,11 @@ Page findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyR
Page findChromosomeEntitiesByUcscName(String ucscName, Pageable request);
+    /** Bulk JPQL update: sets md5checksum on every ChromosomeEntity whose refseq matches and returns the update count. */
+    @Modifying
+    @Transactional
+    @Query("UPDATE ChromosomeEntity c SET c.md5checksum = :md5Checksum WHERE c.refseq = :refseq")
+    int updateChromosomeEntityByRefseqSetMD5Checksum(@Param("refseq") String refseq, @Param("md5Checksum") String md5Checksum);
long countChromosomeEntitiesByInsdcAccession(String insdcAccession);
long countChromosomeEntitiesByRefseq(String refseq);
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
new file mode 100644
index 00000000..c9415632
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java
@@ -0,0 +1,9 @@
+package uk.ac.ebi.eva.contigalias.repo;
+
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.stereotype.Repository;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+@Repository
+public interface SequenceRepository extends JpaRepository {
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
new file mode 100644
index 00000000..0886e345
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java
@@ -0,0 +1,91 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import javax.transaction.Transactional;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
+import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+@Service
+public class AssemblySequencesService {
+
+    @Autowired
+    private ChromosomeService chromosomeService;
+
+    private final AssemblySequencesRepository repository;
+
+    private final NCBIAssemblySequencesDataSource ncbiSequencesDataSource;
+
+    private final Logger logger = LoggerFactory.getLogger(AssemblySequencesService.class);
+
+    public AssemblySequencesService(
+            AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequencesDataSource){
+        this.repository = repository;
+        this.ncbiSequencesDataSource = ncbiSequencesDataSource;
+    }
+
+    /**
+     * Fetch the sequences of the given assembly from the NCBI data source and store them.
+     * Throws DuplicateAssemblySequenceException when the accession is already stored and
+     * AssemblySequenceNotFoundException when the data source returns nothing.
+     * NOTE(review): insertAssemblySequences is called on this, so its @Transactional proxy is presumably bypassed - confirm. */
+    public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException {
+        Optional<AssemblySequencesEntity> entity = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(accession);
+        if (entity.isPresent()) {
+            throw duplicateAssemblySequenceInsertionException(accession, entity.get());
+        }
+        Optional<AssemblySequencesEntity> fetchAssemblySequences = ncbiSequencesDataSource.getAssemblySequencesByAccession(accession);
+        if (!fetchAssemblySequences.isPresent()) {
+            throw new AssemblySequenceNotFoundException(accession);
+        }
+        if (fetchAssemblySequences.get().getAssemblyInsdcAccession() != null) {
+            insertAssemblySequences(fetchAssemblySequences.get());
+            logger.info("Successfully inserted assembly sequences for accession: " + accession);
+        } else {
+            logger.error("Skipping inserting assembly sequences : No name in assembly: " + accession);
+        }
+    }
+
+    /** Copy each sequence's MD5 checksum onto the matching chromosome rows, then save the entity; rejects an already stored accession. */
+    @Transactional
+    public void insertAssemblySequences(AssemblySequencesEntity entity) {
+        if (isEntityPresent(entity)) {
+            throw duplicateAssemblySequenceInsertionException(null, entity);
+        }
+        // Inserting the sequences' md5Checksum in the correct place in the chromosome table
+        for (Sequence s : entity.getSequences()) {
+            chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getSequenceRefseq(), s.getSequenceMD5());
+        }
+        logger.debug("Assembly_insdc_accession: " + entity.getAssemblyInsdcAccession());
+        repository.save(entity);
+    }
+
+    /** True when the repository already holds an entity with the same assembly INSDC accession. */
+    private boolean isEntityPresent(AssemblySequencesEntity entity) {
+        // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY
+        return repository.findAssemblySequenceEntityByAssemblyInsdcAccession(entity.getAssemblyInsdcAccession()).isPresent();
+    }
+
+    /** Build (without throwing) the duplicate exception; each non-null argument adds one line to the message. */
+    private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequencesEntity present) {
+        StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists");
+        if (accession != null) {
+            exception.append("\n").append("Assembly Sequence trying to insert:").append("\t").append(accession);
+        }
+        if (present != null) {
+            exception.append("\n").append("Assembly Sequence already present").append("\t").append(present);
+        }
+        return new DuplicateAssemblySequenceException(exception.toString());
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java
index 93679963..365796a0 100644
--- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java
@@ -199,6 +199,13 @@ public void deleteChromosome(ChromosomeEntity entity) {
repository.delete(entity);
}
+    /** Delegate to the repository to set {@code md5Checksum} on the chromosome entries whose refseq
+     * equals {@code refseq}; the repository's int result is returned unchanged. */
+    public int updateChromosomeEntityByRefseqSetMD5Checksum(String refseq, String md5Checksum){
+        int repositoryResult = repository.updateChromosomeEntityByRefseqSetMD5Checksum(refseq, md5Checksum);
+        return repositoryResult;
+    }
+
public long countChromosomeEntitiesByInsdcAccession(String insdcAccession) {
return repository.countChromosomeEntitiesByInsdcAccession(insdcAccession);
}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java
new file mode 100644
index 00000000..55f44c57
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java
@@ -0,0 +1,127 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource;
+import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource;
+import uk.ac.ebi.eva.contigalias.entities.*;
+import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException;
+import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException;
+import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.*;
+
+@Service
+public class SequenceCollectionService {
+
+ @Autowired
+ private AssemblyService assemblyService;
+
+ @Autowired
+ private AssemblySequencesService assemblySequencesService;
+
+ private final AssemblyRepository assemblyRepository;
+
+ private final AssemblySequencesRepository assemblySequencesRepository;
+
+ private final NCBIAssemblyDataSource assemblyDataSource;
+
+ private final NCBIAssemblySequencesDataSource assemblySequencesDataSource;
+
+ private final Logger logger = LoggerFactory.getLogger(SequenceCollectionService.class);
+
+ public SequenceCollectionService(AssemblyRepository assemblyRepository, AssemblySequencesRepository assemblySequencesRepository, // repositories
+ NCBIAssemblyDataSource assemblyDataSource, NCBIAssemblySequencesDataSource assemblySequencesDataSource) { // NCBI data sources
+ this.assemblyRepository = assemblyRepository;
+ this.assemblySequencesRepository = assemblySequencesRepository;
+ this.assemblyDataSource = assemblyDataSource;
+ this.assemblySequencesDataSource = assemblySequencesDataSource; // the two services are field-injected via @Autowired instead
+ }
+
+    /**
+     * Fetch the assembly report and the assembly sequences for the accession from the NCBI data
+     * sources and insert both through their services, then build the level 2 SeqCol object with
+     * the given naming convention. Throws AssemblyNotFoundException or
+     * AssemblySequenceNotFoundException when the respective data source returns nothing.
+     * NOTE(review): the constructed SeqCol object is not used afterwards - confirm the intent. */
+    public void fetchAndInsertSequenceCollection(String accession, SeqColEntity.NamingConvention namingConvention)
+            throws IOException, NoSuchAlgorithmException {
+        // TODO: Check if the needed seqCol data does not exist in the database
+        // TODO: If not, call the appropriate service(s) to fetch it
+
+        AssemblyEntity assembly = assemblyDataSource.getAssemblyByAccession(accession)
+                .orElseThrow(() -> new AssemblyNotFoundException(accession));
+        assemblyService.insertAssembly(assembly);
+
+        AssemblySequencesEntity assemblySequences = assemblySequencesDataSource
+                .getAssemblySequencesByAccession(accession)
+                .orElseThrow(() -> new AssemblySequenceNotFoundException(accession));
+        assemblySequencesService.insertAssemblySequences(assemblySequences);
+
+        // Both inputs are the freshly fetched objects, exactly as they were handed to the insert calls
+        SeqColEntity seqColLevel2 = constructSequenceCollectionObjectL2(assembly, assemblySequences,
+                                                                        namingConvention);
+
+    }
+
+ /**
+ * Return a level 1 entity of the sequence collection following the given naming convention.
+ * */
+ public SeqColEntity constructSequenceCollectionObjectL2(AssemblyEntity assemblyEntity,
+ AssemblySequencesEntity assemblySequencesEntity,
+ SeqColEntity.NamingConvention namingConvention) {
+
+ List chromosomeList = assemblyEntity.getChromosomes();
+ List sequenceList = assemblySequencesEntity.getSequences();
+ assert chromosomeList.size() == sequenceList.size();
+
+ Comparator chromosomeComparator = (chromosomeEntity, t1) ->
+ chromosomeEntity.getRefseq().compareTo(t1.getRefseq());
+ Comparator sequenceComparator = (sequence, t1) -> sequence.getSequenceRefseq().compareTo(t1.getSequenceRefseq());
+
+ Collections.sort(chromosomeList, chromosomeComparator);
+ Collections.sort(sequenceList, sequenceComparator);
+
+ SeqColEntity seqColL2 = new SeqColEntity();
+
+
+ List sequences = new LinkedList<>();
+ List names = new LinkedList<>();
+ List lengths = new LinkedList<>();
+
+ switch (namingConvention) {
+ case ENA:
+ for (int i=0; i unzip(String compressedFilePath, String outputDirPath) {
+ String outputFileName = "genome_sequence.fna";
+ String decompressedFilePath = outputDirPath + "/" + outputFileName;
+
+ byte[] buffer = new byte[1024];
+
+ try {
+ FileInputStream fileIn = new FileInputStream(compressedFilePath);
+ GZIPInputStream gzipInputStream = new GZIPInputStream(fileIn);
+ FileOutputStream fileOutputStream = new FileOutputStream(decompressedFilePath);
+
+ int bytes_read;
+
+ while ((bytes_read = gzipInputStream.read(buffer)) > 0) {
+ fileOutputStream.write(buffer, 0, bytes_read);
+ }
+ gzipInputStream.close();
+ fileOutputStream.close();
+ logger.info("File " + compressedFilePath + " was decompressed successfully");
+ Path outputFilePath = Paths.get(outputDirPath, outputFileName);
+ return Optional.of(outputFilePath);
+ } catch (
+ IOException e) {
+ logger.error("Could not find or read file !!");
+ return Optional.empty();
+ }
+
+ }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java
new file mode 100644
index 00000000..633e215f
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java
@@ -0,0 +1,21 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+import javax.xml.bind.DatatypeConverter;
+
+public class MD5Hash extends DigestGenerator{
+
+    /**
+     * Return the lowercase hexadecimal MD5 digest of the UTF-8 bytes of the text. */
+    @Override
+    public String hash(String text) throws NoSuchAlgorithmException {
+        MessageDigest md = MessageDigest.getInstance("MD5");
+        // Fixed charset: the platform default would make the digest machine-dependent.
+        md.update(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+        byte[] digest = md.digest();
+        // Lowercase hex is the returned form regardless of the converter's letter case.
+        return DatatypeConverter.printHexBinary(digest).toLowerCase();
+    }
+}
diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java
new file mode 100644
index 00000000..97464b01
--- /dev/null
+++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java
@@ -0,0 +1,21 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import org.webpki.jcs.JsonCanonicalizer;
+
+import java.io.IOException;
+import java.util.Optional;
+
+public class SerializationService {
+
+    /**
+     * Return the canonical serialization of the input jsonString as described by
+     * RFC-8785, using the JsonCanonicalizer implementation provided by
+     * cyberphone/json-canonicalization (see on GitHub).
+     * The jsonString should respect some strict format rules, for example it should
+     * be delimited with '{ }'. The returned Optional always holds the encoded string. */
+    public Optional<String> serialize(String jsonString) throws IOException {
+        JsonCanonicalizer jsonCanonicalizer = new JsonCanonicalizer(jsonString);
+        String result = jsonCanonicalizer.getEncodedString();
+        return Optional.of(result);
+    }
+}
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties
index 514ac4f3..efa59a98 100644
--- a/src/main/resources/application.properties
+++ b/src/main/resources/application.properties
@@ -14,8 +14,8 @@
# limitations under the License.
#
-controller.auth.admin.username=@contig-alias.admin-user@
-controller.auth.admin.password=@contig-alias.admin-password@
+controller.auth.admin.username=haroune
+controller.auth.admin.password=password
management.endpoints.web.exposure.include=info,health
management.endpoints.web.base-path=/
@@ -24,20 +24,21 @@ management.info.git.mode=full
logging.level.uk.ac.ebi.eva.contigalias=DEBUG
# Database configuration
-spring.datasource.url=@contig-alias.db-url@
-spring.datasource.username=@contig-alias.db-username@
-spring.datasource.password=@contig-alias.db-password@
-spring.jpa.hibernate.ddl-auto=@contig-alias.ddl-behaviour@
+spring.datasource.url=jdbc:postgresql://localhost:5432/contig_db
+spring.datasource.username=haroune
+spring.datasource.password=123
+spring.jpa.hibernate.ddl-auto=update
spring.datasource.driver-class-name=org.postgresql.Driver
spring.jpa.database-platform=org.hibernate.dialect.PostgreSQLDialect
spring.jpa.generate-ddl=true
server.servlet.context-path=/eva/webservices/contig-alias
+server.port=8081
-ftp.proxy.host=@ftp.proxy.host@
-ftp.proxy.port=@ftp.proxy.port@
+ftp.proxy.host=null
+ftp.proxy.port=0
-config.scaffolds.enabled = @contig-alias.scaffolds-enabled@
+config.scaffolds.enabled = true
asm.file.download.dir=/tmp
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
new file mode 100644
index 00000000..d1305371
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
@@ -0,0 +1,55 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+import static org.junit.jupiter.api.Assertions.*;
+@SpringBootTest
+class NCBIAssemblySequencesDataSourceTest {
+
+    @Autowired
+    NCBIAssemblySequencesDataSource dataSource;
+
+    @BeforeEach
+    void setUp() {
+    }
+
+    @AfterEach
+    void tearDown() {
+    }
+
+    /** The entity fetched for an accession must be present and report that same accession. */
+    @Test
+    void getAssemblySequenceByAccession() throws IOException, NoSuchAlgorithmException, InterruptedException {
+        String accession = "GCF_000001765.3";
+        Optional<AssemblySequencesEntity> entity = dataSource.getAssemblySequencesByAccession(accession);
+        // Assert presence first so an empty result fails as an assertion, not NoSuchElementException.
+        assertTrue(entity.isPresent(), "No assembly sequences returned for " + accession);
+        assertEquals(accession, entity.get().getInsdcAccession());
+    }
+
+    /** Debug helper: prints the accession, the sequence count, then one line per sequence. */
+    void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity) throws InterruptedException {
+        System.out.println("ACCESSION: " + entity.getInsdcAccession());
+        System.out.println("TOTAL NUMBER OF SEQUENCES: " + entity.getSequences().size());
+        for (Sequence s: entity.getSequences()){
+            System.out.print("REFSEQ: " + s.getRefseq() + " | ");
+            System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5());
+            Thread.sleep(1000); // Just for lazy and fun display :)
+        }
+    }
+
+    // TODO: no assertions yet, so this test passes vacuously.
+    @Test
+    void downloadAssemblySequence() {
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
new file mode 100644
index 00000000..b652ea13
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
@@ -0,0 +1,67 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+@SpringBootTest
+class NCBIAssemblySequencesReaderTest {
+
+    private static final String ACCESSION = "GCF_000001765.3";
+    private static final String FASTA_FILE_PATH = "/tmp/genome_sequence.fna";
+    private InputStreamReader streamReader;
+    private InputStream stream;
+
+    @Autowired
+    private NCBIAssemblySequencesReaderFactory readerFactory;
+
+    private NCBIAssemblySequencesReader reader;
+
+    /** Opens the FASTA file (decoded as UTF-8, not the platform default) and builds the reader. */
+    @BeforeEach
+    void setUp() throws FileNotFoundException {
+        stream = new FileInputStream(FASTA_FILE_PATH);
+        streamReader = new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8);
+        reader = readerFactory.build(streamReader, ACCESSION);
+    }
+
+    /** Closes the reader first (the outermost wrapper), then the underlying stream. */
+    @AfterEach
+    void tearDown() throws IOException {
+        streamReader.close();
+        stream.close();
+    }
+
+    @Test
+    void getAssemblySequencesReader() throws IOException {
+        assertTrue(reader.ready());
+    }
+
+    @Test
+    void assertParsedFastaFileValid() throws IOException, NoSuchAlgorithmException {
+        reader.parseFile();
+        displayAssemblySequencesEntityContent(reader.assemblySequencesEntity);
+        assertEquals(ACCESSION, reader.assemblySequencesEntity.getInsdcAccession());
+    }
+
+    void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity){
+        System.out.println("ACCESSION: " + entity.getInsdcAccession());
+        for (Sequence s: entity.getSequences()){
+            System.out.print("REFSEQ: " + s.getRefseq() + " | ");
+            System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5());
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
new file mode 100644
index 00000000..70c8e146
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
@@ -0,0 +1,44 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+@SpringBootTest
+class AssemblySequencesServiceTest {
+
+    @Autowired
+    private AssemblySequencesService assemblySequencesService;
+
+    @Autowired
+    private AssemblySequencesRepository assemblySequencesRepository;
+
+    @BeforeEach
+    void setUp() {
+    }
+
+    @AfterEach
+    void tearDown() {
+    }
+
+    @Test
+    void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmException {
+        String accession = "GCF_000001765.3";
+        assemblySequencesService.fetchAndInsertAssemblySequence(accession);
+        // An Optional is never null, so compare its content; an empty result yields null and fails here.
+        assertEquals(accession, assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).map(e -> e.getInsdcAccession()).orElse(null));
+    }
+
+    // TODO: no assertions yet, so this test passes vacuously.
+    @Test
+    void insertAssemblySequence() {
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
new file mode 100644
index 00000000..a2ea9f99
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
@@ -0,0 +1,18 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class GzipCompressTest {
+
+    /** Unzipping into outputDirPath must return the path of genome_sequence.fna in that directory. */
+    @Test
+    void unzip() {
+        String compressedFilePath = "/tmp/GCF_000001765.3_Dpse_3.0_genomic.fna.gz";
+        String outputDirPath = "/tmp";
+        GzipCompress gzipCompress = new GzipCompress();
+        // orElse(null) turns an empty result into an assertion failure instead of NoSuchElementException.
+        assertEquals("/tmp/genome_sequence.fna", gzipCompress.unzip(compressedFilePath, outputDirPath).map(Object::toString).orElse(null));
+    }
+}
\ No newline at end of file