diff --git a/pom.xml b/pom.xml index f63b0907..689a2619 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,8 @@ 8 + + org.springframework.boot @@ -147,6 +149,13 @@ 1.2.5.RELEASE + + org.projectlombok + lombok + 1.18.28 + provided + + diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java index 57f85825..073b13ff 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/controller/authentication/SecurityConfiguration.java @@ -57,7 +57,7 @@ protected void configure(HttpSecurity http) throws Exception { .authorizeRequests() .antMatchers("/v1/assemblies/**").permitAll() .antMatchers("/v1/chromosomes/**").permitAll() - .antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN) + //.antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN) .and().httpBasic().realmName(REALM) .authenticationEntryPoint(customBasicAuthenticationEntryPoint) .and().sessionManagement().sessionCreationPolicy(SessionCreationPolicy.STATELESS); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java new file mode 100644 index 00000000..f3a12e03 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/AssemblySequencesDataSource.java @@ -0,0 +1,13 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +public interface AssemblySequencesDataSource { + + Optional getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException; + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java 
b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java new file mode 100644 index 00000000..2c966ecd --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSource.java @@ -0,0 +1,109 @@ +package uk.ac.ebi.eva.contigalias.datasource; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.security.NoSuchAlgorithmException; +import java.util.Optional; + +import org.apache.commons.net.ftp.FTPFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.retry.annotation.Backoff; +import org.springframework.retry.annotation.Retryable; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader; +import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser; +import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.utils.GzipCompress; + +@Repository("NCBISequenceDataSource") +public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource { + + private final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class); + + private final NCBIBrowserFactory factory; + + private final NCBIAssemblySequencesReaderFactory readerFactory; + + @Value("${asm.file.download.dir}") + private String asmFileDownloadDir; + + @Autowired + public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory, + NCBIAssemblySequencesReaderFactory readerFactory){ + this.factory = factory; + this.readerFactory = readerFactory; + } + + @Override + /** + * Return the 
assemblySequencesEntity which contains the list of sequences of the assembly + * with the given accession. The sequences are hashed using md5 algorithm*/ + public Optional getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException { + NCBIBrowser ncbiBrowser = factory.build(); + ncbiBrowser.connect(); + GzipCompress gzipCompress = new GzipCompress(); + + Optional downloadFilePath = downloadAssemblySequences(accession, ncbiBrowser); + if (!downloadFilePath.isPresent()) { + return Optional.empty(); + } + logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadFilePath); + // Uncompress the .gz file + Optional compressedFilePath = gzipCompress.unzip(downloadFilePath.get().toString(), asmFileDownloadDir); + if (!compressedFilePath.isPresent()){ + return Optional.empty(); + } + AssemblySequencesEntity assemblySequencesEntity; + try (InputStream stream = new FileInputStream(compressedFilePath.get().toFile())){ + NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession); + assemblySequencesEntity = reader.getAssemblySequenceEntity(); + logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" ); + } finally { + try { + ncbiBrowser.disconnect(); + Files.deleteIfExists(downloadFilePath.get()); + Files.deleteIfExists(compressedFilePath.get()); // Deleting the fna.gz file + } catch (IOException e) { + logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")"); + } + } + return Optional.of(assemblySequencesEntity); + } + + + /** + * Download the assembly fna/fasta file given the accession and save it to /tmp + * After this method is called, the file will be downloaded, and the path to this file + * on your local computer will be returned*/ + @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2)) + public Optional 
downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException { + // The same directory as the report file + Optional directory = ncbiBrowser.getGenomeReportDirectory(accession); + + if (!directory.isPresent()) { + return Optional.empty(); + } + + logger.info("NCBI directory for assembly genomic.fna download: " + directory.get()); + FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get()); + String ftpFilePath = directory.get() + ftpFile.getName(); + Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName()); + boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize()); + if (success) { + logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")"); + return Optional.of(downloadFilePath); + } else { + logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")"); + return Optional.empty(); + } + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java index 30ea4f73..fcb1f8e7 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java @@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient { public static final String PATH_GENOMES_ALL = "/genomes/all/"; + private String ftpProxyHost; private Integer ftpProxyPort; @@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String directoryPath) throws IOExceptio return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath)); } + /** + * Return the fna/fasta file that will be downloaded (a pointer to that FtpFile)*/ + public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException { + Stream ftpFileStream = Arrays.stream(super.listFiles(directoryPath)); + Stream 
assemblyReportFilteredStream = ftpFileStream.filter(f -> f.getName().contains("genomic.fna.gz") && !f.getName().contains("from")); + Optional assemblyReport = assemblyReportFilteredStream.findFirst(); + + return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath)); + } + } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java new file mode 100644 index 00000000..c7a974bb --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/AssemblySequencesReader.java @@ -0,0 +1,44 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +public abstract class AssemblySequencesReader { + + protected final BufferedReader reader; + + protected final String accession; + + protected AssemblySequencesEntity assemblySequencesEntity; + + + protected boolean fileParsed = false; + + + public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession){ + this.reader = new BufferedReader(inputStreamReader); + this.accession = accession; + } + + public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException { + if(!fileParsed || assemblySequencesEntity == null){ + parseFile(); + } + return assemblySequencesEntity; + } + + protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException; + + + protected abstract void parseAssemblySequenceEntity(String line); + + + + public boolean ready() throws IOException { + return reader.ready(); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java 
b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java new file mode 100644 index 00000000..a58290d2 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReader.java @@ -0,0 +1,63 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.security.NoSuchAlgorithmException; +import java.util.LinkedList; +import java.util.List; + +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; +import uk.ac.ebi.eva.contigalias.utils.DigestGenerator; +import uk.ac.ebi.eva.contigalias.utils.MD5Hash; + +public class NCBIAssemblySequencesReader extends AssemblySequencesReader { + + public NCBIAssemblySequencesReader(InputStreamReader inputStreamReader, String accession){ + super(inputStreamReader, accession); + } + + @Override + protected void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException { + if (reader == null){ + throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader."); + } + DigestGenerator md5Digest = new MD5Hash(); + if (assemblySequencesEntity == null){ + assemblySequencesEntity = new AssemblySequencesEntity(); + } + // Setting the accession of the whole assembly file + assemblySequencesEntity.setAssemblyInsdcAccession(accession); + List sequences = new LinkedList<>(); + String line = reader.readLine(); + while (line != null){ + if (line.startsWith(">")){ + Sequence sequence = new Sequence(); + String refSeq = line.substring(1, line.indexOf(' ')); + sequence.setSequenceRefseq(refSeq); + line = reader.readLine(); + StringBuilder sequenceValue = new StringBuilder(); + while (line != null && !line.startsWith(">")){ + // Looking for the sequence lines for this refseq + sequenceValue.append(line); + line = reader.readLine(); + } + String md5checksum = md5Digest.hash(sequenceValue.toString()); + 
sequence.setSequenceMD5(md5checksum); + sequences.add(sequence); + } + } + assemblySequencesEntity.setSequences(sequences); + String digest0; // The level 0 digest of the object + fileParsed = true; + reader.close(); + } + + @Override + // Parsing a line of the file + protected void parseAssemblySequenceEntity(String line) { + // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY) + // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY + // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java new file mode 100644 index 00000000..a727bea1 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderFactory.java @@ -0,0 +1,18 @@ +package uk.ac.ebi.eva.contigalias.dus2; + +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.springframework.stereotype.Component; + +@Component +public class NCBIAssemblySequencesReaderFactory { + + public NCBIAssemblySequencesReader build(InputStream inputStream, String accession){ + return new NCBIAssemblySequencesReader(new InputStreamReader(inputStream), accession); + } + + public NCBIAssemblySequencesReader build(InputStreamReader inputStreamReader, String accession){ + return new NCBIAssemblySequencesReader(inputStreamReader, accession); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java new file mode 100644 index 00000000..5966a04a --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/AssemblySequencesEntity.java @@ -0,0 +1,35 @@ +package uk.ac.ebi.eva.contigalias.entities; + + +import java.util.List; + +import javax.persistence.*; + +import com.fasterxml.jackson.annotation.JsonInclude; +import 
io.swagger.annotations.ApiModelProperty; +import lombok.Data; +import lombok.Getter; +import lombok.Setter; +import org.hibernate.annotations.LazyCollection; +import org.hibernate.annotations.LazyCollectionOption; + +@Data +@Table(name = "assemblySequences") +@Entity +public class AssemblySequencesEntity { + + @Id + @Column(nullable = false) + @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.") + private String assemblyInsdcAccession; + + + @ApiModelProperty(value = "List of all sequences of the assembly.") + @JsonInclude(JsonInclude.Include.NON_NULL) + @LazyCollection(LazyCollectionOption.FALSE) + @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL) + //@OneToMany(mappedBy = "assemblySequences", cascade = CascadeType.ALL) + //@JoinColumn(name = "assembly_insdc_accession", referencedColumnName = "assemblyInsdcAccession") + @JoinColumn(name = "assembly_insdc_accession") + private List sequences; +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java new file mode 100644 index 00000000..e60e576c --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java @@ -0,0 +1,43 @@ +package uk.ac.ebi.eva.contigalias.entities; + +import io.swagger.annotations.ApiModelProperty; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +import javax.persistence.EnumType; +import javax.persistence.Enumerated; +import java.util.List; + +@Data +@NoArgsConstructor +public class SeqColEntity { + + @ApiModelProperty(value = "The level 0 digest of the object") + private String digest; + + @ApiModelProperty(value = "The representation level of the the object") + @Enumerated(EnumType.ORDINAL) + private Level level; + + @ApiModelProperty(value = "The naming convention used to construct this seqCol object") + @Enumerated(EnumType.STRING) + private NamingConvention 
namingConvention; + + @ApiModelProperty(value = "The array of the sequences' lengths") + private List lengths; + + @ApiModelProperty(value = "The array of the sequences' names") + private List names; + + @ApiModelProperty(value = "The array of the sequences") + private List sequences; + + public enum Level { + ZERO, ONE, TWO + } + + public enum NamingConvention { + ENA, GENBANK, UCSC + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java new file mode 100644 index 00000000..8ab22b48 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/entities/Sequence.java @@ -0,0 +1,32 @@ +package uk.ac.ebi.eva.contigalias.entities; + +import javax.persistence.*; + +import com.fasterxml.jackson.annotation.JsonInclude; +import io.swagger.annotations.ApiModelProperty; +import lombok.Getter; +import lombok.Setter; + + +@Getter +@Setter +@Entity +@Table(name = "Sequence") +public class Sequence { + + + @Id + @Column(nullable = false) + @ApiModelProperty(value = "Assembly's Refseq accession.") + private String sequenceRefseq; + + @Column(nullable = false) + @ApiModelProperty(value = "Sequence's MD5 checksum value.") + private String sequenceMD5; + + /*@JsonInclude(JsonInclude.Include.NON_NULL) + @ManyToOne + @JoinColumn(name = "assembly_insdc_accession", nullable = false) + private AssemblySequencesEntity assemblySequences;*/ + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java new file mode 100644 index 00000000..03deecb9 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/AssemblySequenceNotFoundException.java @@ -0,0 +1,7 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class AssemblySequenceNotFoundException extends RuntimeException{ + public AssemblySequenceNotFoundException(String accession) { + super("No 
assembly sequence corresponding to accession " + accession + " could be found"); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java new file mode 100644 index 00000000..f382e62f --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/exception/DuplicateAssemblySequenceException.java @@ -0,0 +1,8 @@ +package uk.ac.ebi.eva.contigalias.exception; + +public class DuplicateAssemblySequenceException extends RuntimeException{ + + public DuplicateAssemblySequenceException(String msg){ + super(msg); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java new file mode 100644 index 00000000..ce1b9321 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/AssemblySequencesRepository.java @@ -0,0 +1,14 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import java.util.Optional; + +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; + +@Repository +public interface AssemblySequencesRepository extends JpaRepository { + Optional findAssemblySequenceEntityByAssemblyInsdcAccession(String accession); + + +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java index 0b6f5bd7..920e488e 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/ChromosomeRepository.java @@ -19,11 +19,17 @@ import org.springframework.data.domain.Page; import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.data.jpa.repository.Modifying; 
+import org.springframework.data.jpa.repository.Query; +import org.springframework.data.repository.query.Param; import org.springframework.stereotype.Repository; import uk.ac.ebi.eva.contigalias.entities.AssemblyEntity; import uk.ac.ebi.eva.contigalias.entities.ChromosomeEntity; +import javax.transaction.Transactional; + + @Repository public interface ChromosomeRepository extends JpaRepository { @@ -63,6 +69,11 @@ Page findChromosomeEntitiesByAssemblyInsdcAccessionOrAssemblyR Page findChromosomeEntitiesByUcscName(String ucscName, Pageable request); + @Transactional + @Modifying + @Query("UPDATE ChromosomeEntity c SET c.md5checksum = :md5Checksum WHERE c.refseq = :refseq") + int updateChromosomeEntityByRefseqSetMD5Checksum(@Param(value = "refseq") String refseq, @Param(value = "md5Checksum") String md5Checksum); + long countChromosomeEntitiesByInsdcAccession(String insdcAccession); long countChromosomeEntitiesByRefseq(String refseq); diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java new file mode 100644 index 00000000..c9415632 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/repo/SequenceRepository.java @@ -0,0 +1,9 @@ +package uk.ac.ebi.eva.contigalias.repo; + +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; +import uk.ac.ebi.eva.contigalias.entities.Sequence; + +@Repository +public interface SequenceRepository extends JpaRepository { +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java new file mode 100644 index 00000000..0886e345 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesService.java @@ -0,0 +1,91 @@ +package uk.ac.ebi.eva.contigalias.service; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; 
+import java.util.Optional; + +import javax.transaction.Transactional; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource; +import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity; +import uk.ac.ebi.eva.contigalias.entities.Sequence; +import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import uk.ac.ebi.eva.contigalias.exception.DuplicateAssemblySequenceException; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +@Service +public class AssemblySequencesService { + + @Autowired + private ChromosomeService chromosomeService; + + private final AssemblySequencesRepository repository; + + private final NCBIAssemblySequencesDataSource ncbiSequencesDataSource; + + private final Logger logger = LoggerFactory.getLogger(AssemblySequencesService.class); + + + public AssemblySequencesService( + AssemblySequencesRepository repository, NCBIAssemblySequencesDataSource ncbiSequencesDataSource){ + this.repository = repository; + this.ncbiSequencesDataSource = ncbiSequencesDataSource; + } + + public void fetchAndInsertAssemblySequence(String accession) throws IOException, NoSuchAlgorithmException { + Optional entity = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(accession); + if(entity.isPresent()) + throw duplicateAssemblySequenceInsertionException(accession, entity.get()); + Optional fetchAssemblySequences = ncbiSequencesDataSource.getAssemblySequencesByAccession(accession); + if(!fetchAssemblySequences.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + if (fetchAssemblySequences.get().getAssemblyInsdcAccession() != null){ + insertAssemblySequences(fetchAssemblySequences.get()); + logger.info("Successfully inserted assembly sequences for accession: " + accession); + }else { + 
logger.error("Skipping inserting assembly sequences : No name in assembly: " + accession); + } + } + + @Transactional + public void insertAssemblySequences(AssemblySequencesEntity entity) { + if (isEntityPresent(entity)) { + throw duplicateAssemblySequenceInsertionException(null, entity); + } else { + // Inserting the sequences' md5Checksum in the correct place in the chromosome table + for (Sequence s: entity.getSequences()){ + chromosomeService.updateChromosomeEntityByRefseqSetMD5Checksum(s.getSequenceRefseq(), s.getSequenceMD5()); + } + System.out.println("Assembly_insdc_accession: " + entity.getAssemblyInsdcAccession()); + repository.save(entity); + } + } + + private boolean isEntityPresent(AssemblySequencesEntity entity) { + // TODO: THE CONDITIONS IN THIS METHOD WILL BE CHANGED WHEN WE ADD MORE ATTRIBUTES TO THE ENTITY + Optional existingAssembly = repository.findAssemblySequenceEntityByAssemblyInsdcAccession(entity.getAssemblyInsdcAccession()); + return existingAssembly.isPresent(); + } + + private DuplicateAssemblySequenceException duplicateAssemblySequenceInsertionException(String accession, AssemblySequencesEntity present) { + StringBuilder exception = new StringBuilder("A similar assembly Sequence already exists"); + if (accession != null){ + exception.append("\n"); + exception.append("Assembly Sequence trying to insert:"); + exception.append("\t"); + exception.append(accession); + } + if (present != null){ + exception.append("\n"); + exception.append("Assembly Sequence already present"); + exception.append("\t"); + exception.append(present); + } + return new DuplicateAssemblySequenceException(exception.toString()); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java index 93679963..365796a0 100644 --- a/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java +++ 
b/src/main/java/uk/ac/ebi/eva/contigalias/service/ChromosomeService.java @@ -199,6 +199,13 @@ public void deleteChromosome(ChromosomeEntity entity) { repository.delete(entity); } + /** + * Update the chromosome table; set the md5Checksum for the entry that has the given + * chromosome refseq*/ + public int updateChromosomeEntityByRefseqSetMD5Checksum(String refseq, String md5Checksum){ + return repository.updateChromosomeEntityByRefseqSetMD5Checksum(refseq, md5Checksum); + } + public long countChromosomeEntitiesByInsdcAccession(String insdcAccession) { return repository.countChromosomeEntitiesByInsdcAccession(insdcAccession); } diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java new file mode 100644 index 00000000..55f44c57 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/service/SequenceCollectionService.java @@ -0,0 +1,127 @@ +package uk.ac.ebi.eva.contigalias.service; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblyDataSource; +import uk.ac.ebi.eva.contigalias.datasource.NCBIAssemblySequencesDataSource; +import uk.ac.ebi.eva.contigalias.entities.*; +import uk.ac.ebi.eva.contigalias.exception.AssemblyNotFoundException; +import uk.ac.ebi.eva.contigalias.exception.AssemblySequenceNotFoundException; +import uk.ac.ebi.eva.contigalias.repo.AssemblyRepository; +import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository; + +import java.io.IOException; +import java.security.NoSuchAlgorithmException; +import java.util.*; + +@Service +public class SequenceCollectionService { + + @Autowired + private AssemblyService assemblyService; + + @Autowired + private AssemblySequencesService assemblySequencesService; + + private final AssemblyRepository 
assemblyRepository; + + private final AssemblySequencesRepository assemblySequencesRepository; + + private final NCBIAssemblyDataSource assemblyDataSource; + + private final NCBIAssemblySequencesDataSource assemblySequencesDataSource; + + private final Logger logger = LoggerFactory.getLogger(SequenceCollectionService.class); + + public SequenceCollectionService(AssemblyRepository assemblyRepository, AssemblySequencesRepository assemblySequencesRepository, + NCBIAssemblyDataSource assemblyDataSource, NCBIAssemblySequencesDataSource assemblySequencesDataSource) { + this.assemblyRepository = assemblyRepository; + this.assemblySequencesRepository = assemblySequencesRepository; + this.assemblyDataSource = assemblyDataSource; + this.assemblySequencesDataSource = assemblySequencesDataSource; + } + + /** + * Search for the assembly report as well as the assembly real sequences and insert them + * in the database. + * Use the given naming convention while constructing the SeqCol Object*/ + public void fetchAndInsertSequenceCollection(String accession, SeqColEntity.NamingConvention namingConvention) + throws IOException, NoSuchAlgorithmException { + // TODO: Check if the needed seqCol data does not exist in the database + // TODO: If not, call the appropriate service(s) to fetch it + + Optional fetchAssembly = assemblyDataSource.getAssemblyByAccession(accession); + if (!fetchAssembly.isPresent()){ + throw new AssemblyNotFoundException(accession); + } + assemblyService.insertAssembly(fetchAssembly.get()); + Optional fetchAssemblySequences = assemblySequencesDataSource + .getAssemblySequencesByAccession(accession); + if (!fetchAssemblySequences.isPresent()){ + throw new AssemblySequenceNotFoundException(accession); + } + assemblySequencesService.insertAssemblySequences(fetchAssemblySequences.get()); + + SeqColEntity seqColLevel2 = constructSequenceCollectionObjectL2(fetchAssembly.get(), fetchAssemblySequences.get(), + namingConvention); + + } + + /** + * Return a level 1 
entity of the sequence collection following the given naming convention. + * */ + public SeqColEntity constructSequenceCollectionObjectL2(AssemblyEntity assemblyEntity, + AssemblySequencesEntity assemblySequencesEntity, + SeqColEntity.NamingConvention namingConvention) { + + List chromosomeList = assemblyEntity.getChromosomes(); + List sequenceList = assemblySequencesEntity.getSequences(); + assert chromosomeList.size() == sequenceList.size(); + + Comparator chromosomeComparator = (chromosomeEntity, t1) -> + chromosomeEntity.getRefseq().compareTo(t1.getRefseq()); + Comparator sequenceComparator = (sequence, t1) -> sequence.getSequenceRefseq().compareTo(t1.getSequenceRefseq()); + + Collections.sort(chromosomeList, chromosomeComparator); + Collections.sort(sequenceList, sequenceComparator); + + SeqColEntity seqColL2 = new SeqColEntity(); + + + List sequences = new LinkedList<>(); + List names = new LinkedList<>(); + List lengths = new LinkedList<>(); + + switch (namingConvention) { + case ENA: + for (int i=0; i unzip(String compressedFilePath, String outputDirPath) { + String outputFileName = "genome_sequence.fna"; + String decompressedFilePath = outputDirPath + "/" + outputFileName; + + byte[] buffer = new byte[1024]; + + try { + FileInputStream fileIn = new FileInputStream(compressedFilePath); + GZIPInputStream gzipInputStream = new GZIPInputStream(fileIn); + FileOutputStream fileOutputStream = new FileOutputStream(decompressedFilePath); + + int bytes_read; + + while ((bytes_read = gzipInputStream.read(buffer)) > 0) { + fileOutputStream.write(buffer, 0, bytes_read); + } + gzipInputStream.close(); + fileOutputStream.close(); + logger.info("File " + compressedFilePath + " was decompressed successfully"); + Path outputFilePath = Paths.get(outputDirPath, outputFileName); + return Optional.of(outputFilePath); + } catch ( + IOException e) { + logger.error("Could not find or read file !!"); + return Optional.empty(); + } + + } +} diff --git 
a/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java new file mode 100644 index 00000000..633e215f --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/MD5Hash.java @@ -0,0 +1,21 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +import javax.xml.bind.DatatypeConverter; + +public class MD5Hash extends DigestGenerator{ + + /** + * Return the digest of the text using the MD5 algorithm*/ + @Override + public String hash(String text) throws NoSuchAlgorithmException { + MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(text.getBytes()); + byte[] digest = md.digest(); + String textHash = DatatypeConverter + .printHexBinary(digest).toUpperCase(); + return textHash.toLowerCase(); + } +} diff --git a/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java new file mode 100644 index 00000000..97464b01 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/contigalias/utils/SerializationService.java @@ -0,0 +1,21 @@ +package uk.ac.ebi.eva.contigalias.utils; + +import org.webpki.jcs.JsonCanonicalizer; + +import java.io.IOException; +import java.util.Optional; + +public class SerializationService { + + /** + * Return a serialized version of the input jsonString using the + * RFC-8785, using the implementation provided by cyberphone/json-canonicalization + * (see on GitHub). 
+     * The jsonString should respect some strict format rules, for example
+     * it should be delimited with '{ }'.
+     * @return the canonical JSON text, always present (wrapped with Optional.of) */
+    public Optional<String> serialize(String jsonString) throws IOException {
+        // Typed Optional<String> rather than the raw Optional, so callers need no cast.
+        return Optional.of(new JsonCanonicalizer(jsonString).getEncodedString());
+    }
+}
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 514ac4f3..efa59a98 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -14,8 +14,8 @@ # limitations under the License. # -controller.auth.admin.username=@contig-alias.admin-user@ -controller.auth.admin.password=@contig-alias.admin-password@ +controller.auth.admin.username=haroune +controller.auth.admin.password=password management.endpoints.web.exposure.include=info,health management.endpoints.web.base-path=/ @@ -24,20 +24,21 @@ management.info.git.mode=full logging.level.uk.ac.ebi.eva.contigalias=DEBUG # Database configuration -spring.datasource.url=@contig-alias.db-url@ -spring.datasource.username=@contig-alias.db-username@ -spring.datasource.password=@contig-alias.db-password@ -spring.jpa.hibernate.ddl-auto=@contig-alias.ddl-behaviour@ +spring.datasource.url=jdbc:postgresql://localhost:5432/contig_db +spring.datasource.username=haroune +spring.datasource.password=123 +spring.jpa.hibernate.ddl-auto=update spring.datasource.driver-class-name=org.postgresql.Driver spring.jpa.database-platform=org.hibernate.dialect.PostgreSQLDialect spring.jpa.generate-ddl=true server.servlet.context-path=/eva/webservices/contig-alias +server.port=8081 -ftp.proxy.host=@ftp.proxy.host@ -ftp.proxy.port=@ftp.proxy.port@ +ftp.proxy.host=null +ftp.proxy.port=0 -config.scaffolds.enabled = @contig-alias.scaffolds-enabled@ +config.scaffolds.enabled = true asm.file.download.dir=/tmp diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
new file mode 100644
index 00000000..d1305371
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/datasource/NCBIAssemblySequencesDataSourceTest.java
@@ -0,0 +1,59 @@
+package uk.ac.ebi.eva.contigalias.datasource;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+import java.util.Optional;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Spring Boot test for {@link NCBIAssemblySequencesDataSource}: requests the sequences of an
+ * assembly by accession and checks the accession recorded on the returned entity.
+ */
+@SpringBootTest
+class NCBIAssemblySequencesDataSourceTest {
+
+    @Autowired
+    NCBIAssemblySequencesDataSource dataSource;
+
+    @BeforeEach
+    void setUp() {
+    }
+
+    @AfterEach
+    void tearDown() {
+    }
+
+    @Test
+    void getAssemblySequenceByAccession() throws IOException, NoSuchAlgorithmException, InterruptedException {
+        String accession = "GCF_000001765.3";
+        //String accession2 = "GCF_000001405.31";
+        Optional<AssemblySequencesEntity> entity = dataSource.getAssemblySequencesByAccession(accession);
+        // Fail with an assertion message instead of a NoSuchElementException from get().
+        assertTrue(entity.isPresent(), "No sequences returned for accession " + accession);
+        //displayAssemblySequencesEntityContent(entity.get());
+        assertEquals(accession, entity.get().getInsdcAccession());
+    }
+
+    /** Debug helper: prints the accession, the sequence count and the refseq/MD5 pair of every sequence. */
+    void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity) throws InterruptedException {
+        System.out.println("ACCESSION: " + entity.getInsdcAccession());
+        System.out.println("TOTAL NUMBER OF SEQUENCES: " + entity.getSequences().size());
+        for (Sequence s: entity.getSequences()){
+            System.out.print("REFSEQ: " + s.getRefseq() + " | ");
+            System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5());
+        }
+    }
+
+    @Test
+    void downloadAssemblySequence() {
+    }
+}
\ No newline
at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
new file mode 100644
index 00000000..b652ea13
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/dus2/NCBIAssemblySequencesReaderTest.java
@@ -0,0 +1,78 @@
+package uk.ac.ebi.eva.contigalias.dus2;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
+import uk.ac.ebi.eva.contigalias.entities.Sequence;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Spring Boot test for the reader built by {@link NCBIAssemblySequencesReaderFactory},
+ * run against the FASTA file expected at {@code /tmp/genome_sequence.fna}.
+ */
+@SpringBootTest
+class NCBIAssemblySequencesReaderTest {
+
+    private static final String ACCESSION = "GCF_000001765.3";
+
+    private static final String FASTA_FILE_PATH = "/tmp/genome_sequence.fna";
+    private InputStreamReader streamReader;
+
+    private InputStream stream;
+
+    @Autowired
+    private NCBIAssemblySequencesReaderFactory readerFactory;
+
+    private NCBIAssemblySequencesReader reader;
+
+    @BeforeEach
+    void setUp() throws FileNotFoundException {
+        stream = new FileInputStream(FASTA_FILE_PATH);
+        // Explicit charset: decoding must not vary with the platform default.
+        streamReader = new InputStreamReader(stream, StandardCharsets.UTF_8);
+        reader = readerFactory.build(streamReader, ACCESSION);
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        // The fields stay null when setUp fails before assigning them (e.g. missing FASTA file).
+        if (streamReader != null) {
+            // Closing the reader also closes the stream it wraps.
+            streamReader.close();
+        } else if (stream != null) {
+            stream.close();
+        }
+    }
+
+    @Test
+    void getAssemblySequencesReader() throws IOException {
+        assertTrue(reader.ready());
+    }
+
+    @Test
+    void assertParsedFastaFileValid() throws IOException, NoSuchAlgorithmException {
+        reader.parseFile();
+
displayAssemblySequencesEntityContent(reader.assemblySequencesEntity);
+        assertEquals(ACCESSION, reader.assemblySequencesEntity.getInsdcAccession());
+    }
+
+    void displayAssemblySequencesEntityContent(AssemblySequencesEntity entity){
+        System.out.println("ACCESSION: " + entity.getInsdcAccession());
+        for (Sequence s: entity.getSequences()){
+            System.out.print("REFSEQ: " + s.getRefseq() + " | ");
+            System.out.println("SEQUENCE_MD5: " + s.getSequenceMD5());
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
new file mode 100644
index 00000000..70c8e146
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/service/AssemblySequencesServiceTest.java
@@ -0,0 +1,44 @@
+package uk.ac.ebi.eva.contigalias.service;
+
+import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import uk.ac.ebi.eva.contigalias.repo.AssemblySequencesRepository;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+@SpringBootTest
+class AssemblySequencesServiceTest {
+
+    @Autowired
+    private AssemblySequencesService assemblySequencesService;
+
+    @Autowired
+    private AssemblySequencesRepository assemblySequencesRepository;
+
+    @BeforeEach
+    void setUp() {
+    }
+
+    @AfterEach
+    void tearDown() {
+    }
+
+    @Test
+    void fetchAndInsertAssemblySequence() throws IOException, NoSuchAlgorithmException {
+        String accession = "GCF_000001765.3";
+        assemblySequencesService.fetchAndInsertAssemblySequence(accession);
+        // The repository returns an Optional (see the get() below), so test presence, not non-nullness.
+        assertTrue(assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).isPresent());
+        assertEquals(accession,
+
assemblySequencesRepository.findAssemblySequenceEntityByInsdcAccession(accession).get().getInsdcAccession());
+    }
+
+    @Test
+    void insertAssemblySequence() {
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
new file mode 100644
index 00000000..a2ea9f99
--- /dev/null
+++ b/src/test/java/uk/ac/ebi/eva/contigalias/utils/GzipCompressTest.java
@@ -0,0 +1,44 @@
+package uk.ac.ebi.eva.contigalias.utils;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+import java.util.zip.GZIPOutputStream;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests {@link GzipCompress#unzip(String, String)} against a gzip file that the test
+ * creates itself, so it does not depend on files left in /tmp by other tests.
+ */
+class GzipCompressTest {
+
+    @Test
+    void unzip() throws IOException {
+        Path workDir = Files.createTempDirectory("gzip-compress-test");
+        Path compressedFile = workDir.resolve("sample.fna.gz");
+        Path expectedOutput = workDir.resolve("genome_sequence.fna");
+        byte[] content = ">seq1\nACGT\n".getBytes(StandardCharsets.UTF_8);
+        try {
+            try (OutputStream out = new GZIPOutputStream(Files.newOutputStream(compressedFile))) {
+                out.write(content);
+            }
+
+            Optional<Path> result = new GzipCompress().unzip(compressedFile.toString(), workDir.toString());
+
+            // Check presence first: unzip() reports a failed decompression as an empty Optional.
+            assertTrue(result.isPresent());
+            assertEquals(expectedOutput, result.get());
+            assertArrayEquals(content, Files.readAllBytes(expectedOutput));
+        } finally {
+            Files.deleteIfExists(expectedOutput);
+            Files.deleteIfExists(compressedFile);
+            Files.deleteIfExists(workDir);
+        }
+    }
+}
\ No newline at end of file