Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

With contig alias #112

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
<java.version>8</java.version>
</properties>



<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
Expand Down Expand Up @@ -147,6 +149,13 @@
<version>1.2.5.RELEASE</version>
</dependency>

<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.28</version>
<scope>provided</scope>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ protected void configure(HttpSecurity http) throws Exception {
.authorizeRequests()
.antMatchers("/v1/assemblies/**").permitAll()
.antMatchers("/v1/chromosomes/**").permitAll()
.antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN)
//.antMatchers("/v1/admin/**").hasRole(ROLE_ADMIN)
.and().httpBasic().realmName(REALM)
.authenticationEntryPoint(customBasicAuthenticationEntryPoint)
.and().sessionManagement().sessionCreationPolicy(SessionCreationPolicy.STATELESS);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package uk.ac.ebi.eva.contigalias.datasource;

import java.io.IOException;
import java.security.NoSuchAlgorithmException;
import java.util.Optional;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;

/**
 * Source of assembly sequence data, looked up by assembly accession.
 */
public interface AssemblySequencesDataSource {

/**
 * Looks up the {@link AssemblySequencesEntity} for the assembly with the given accession.
 *
 * @param accession accession of the assembly to look up
 * @return the entity, or an empty {@link Optional} when the implementation could not obtain it
 * @throws IOException declared for implementations that perform I/O while fetching the data
 * @throws NoSuchAlgorithmException declared for implementations that digest the sequences
 *         (presumably raised when the digest algorithm is unavailable; confirm against the implementation)
 */
Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, NoSuchAlgorithmException;

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package uk.ac.ebi.eva.contigalias.datasource;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.NoSuchAlgorithmException;
import java.util.Optional;

import org.apache.commons.net.ftp.FTPFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Repository;
import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReader;
import uk.ac.ebi.eva.contigalias.dus2.NCBIAssemblySequencesReaderFactory;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowser;
import uk.ac.ebi.eva.contigalias.dus.NCBIBrowserFactory;
import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
import uk.ac.ebi.eva.contigalias.utils.GzipCompress;

/**
 * {@link AssemblySequencesDataSource} that downloads an assembly's genomic FASTA archive
 * through an {@link NCBIBrowser}, uncompresses it into the configured download directory
 * and parses it with an {@link NCBIAssemblySequencesReader}.
 */
@Repository("NCBISequenceDataSource")
public class NCBIAssemblySequencesDataSource implements AssemblySequencesDataSource {

    private static final Logger logger = LoggerFactory.getLogger(NCBIAssemblySequencesDataSource.class);

    private final NCBIBrowserFactory factory;

    private final NCBIAssemblySequencesReaderFactory readerFactory;

    /** Local directory that receives both the downloaded archive and the uncompressed file. */
    @Value("${asm.file.download.dir}")
    private String asmFileDownloadDir;

    @Autowired
    public NCBIAssemblySequencesDataSource(NCBIBrowserFactory factory,
                                           NCBIAssemblySequencesReaderFactory readerFactory) {
        this.factory = factory;
        this.readerFactory = readerFactory;
    }

    /**
     * Returns the {@link AssemblySequencesEntity} built from the genomic FASTA file of the
     * assembly with the given accession.
     *
     * <p>The browser connection is always closed and any temporary file created along the way
     * is always removed, whether the method returns a value, returns empty or throws.
     *
     * @param accession accession of the assembly to fetch
     * @return the parsed entity, or empty when the file could not be downloaded or uncompressed
     * @throws IOException if downloading, uncompressing or reading the file fails
     * @throws NoSuchAlgorithmException propagated from the reader while parsing
     */
    @Override
    public Optional<AssemblySequencesEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException, NoSuchAlgorithmException {
        NCBIBrowser ncbiBrowser = factory.build();
        ncbiBrowser.connect();

        // Tracked outside the try block so that the finally clause can clean up whatever exists.
        Path downloadedFile = null;
        Path uncompressedFile = null;
        try {
            Optional<Path> downloadFilePath = downloadAssemblySequences(accession, ncbiBrowser);
            if (!downloadFilePath.isPresent()) {
                return Optional.empty();
            }
            downloadedFile = downloadFilePath.get();
            // Log the path itself rather than the Optional wrapper.
            logger.info("Assembly sequence _fna.gz file downloaded successfully in: " + downloadedFile);

            // Uncompress the .gz file
            GzipCompress gzipCompress = new GzipCompress();
            Optional<Path> unzippedFilePath = gzipCompress.unzip(downloadedFile.toString(), asmFileDownloadDir);
            if (!unzippedFilePath.isPresent()) {
                return Optional.empty();
            }
            uncompressedFile = unzippedFilePath.get();

            try (InputStream stream = new FileInputStream(uncompressedFile.toFile())) {
                NCBIAssemblySequencesReader reader = readerFactory.build(stream, accession);
                AssemblySequencesEntity assemblySequencesEntity = reader.getAssemblySequenceEntity();
                logger.info("NCBI: Assembly sequences' fasta file with accession " + accession + " has been parsed successfully" );
                return Optional.of(assemblySequencesEntity);
            }
        } finally {
            // Each cleanup step is independent: a failure in one must not skip the others.
            disconnectQuietly(ncbiBrowser, accession);
            deleteQuietly(downloadedFile, accession);   // the fna.gz archive
            deleteQuietly(uncompressedFile, accession); // the uncompressed fasta file
        }
    }

    /** Disconnects the browser, logging (not propagating) any failure. */
    private void disconnectQuietly(NCBIBrowser ncbiBrowser, String accession) {
        try {
            ncbiBrowser.disconnect();
        } catch (IOException e) {
            logger.warn("Error while trying to disconnect - ncbiBrowser (assembly: " + accession + ")", e);
        }
    }

    /** Deletes the given temporary file if it was created, logging (not propagating) any failure. */
    private void deleteQuietly(Path file, String accession) {
        if (file == null) {
            return;
        }
        try {
            Files.deleteIfExists(file);
        } catch (IOException e) {
            logger.warn("Could not delete temporary file " + file + " (assembly: " + accession + ")", e);
        }
    }

    /**
     * Downloads the assembly's genomic fna/fasta archive into the configured download
     * directory ({@code asm.file.download.dir}) and returns the local path of the file.
     *
     * <p>NOTE(review): {@code @Retryable} is normally applied through a Spring proxy, so the
     * direct call from {@link #getAssemblySequencesByAccession(String)} presumably bypasses
     * the retry policy; confirm and, if so, move this method to a separate bean.
     *
     * @param accession   accession of the assembly to download
     * @param ncbiBrowser an already connected browser
     * @return the local path of the downloaded archive, or empty when the assembly directory
     *         was not found or the download did not succeed
     * @throws IOException if the browser fails while listing or downloading
     */
    @Retryable(value = Exception.class, maxAttempts = 5, backoff = @Backoff(delay = 2000, multiplier = 2))
    public Optional<Path> downloadAssemblySequences(String accession, NCBIBrowser ncbiBrowser) throws IOException {
        // The same directory as the report file
        Optional<String> directory = ncbiBrowser.getGenomeReportDirectory(accession);

        if (!directory.isPresent()) {
            return Optional.empty();
        }

        logger.info("NCBI directory for assembly genomic.fna download: " + directory.get());
        FTPFile ftpFile = ncbiBrowser.getAssemblyGenomicFnaFile(directory.get());
        String ftpFilePath = directory.get() + ftpFile.getName();
        Path downloadFilePath = Paths.get(asmFileDownloadDir, ftpFile.getName());
        boolean success = ncbiBrowser.downloadFTPFile(ftpFilePath, downloadFilePath, ftpFile.getSize());
        if (success) {
            logger.info("NCBI assembly genomic.fna downloaded successfully (" + ftpFile.getName() + ")");
            return Optional.of(downloadFilePath);
        } else {
            logger.error("NCBI assembly genomic.fna could not be downloaded successfully(" + ftpFile.getName() + ")");
            return Optional.empty();
        }
    }
}
11 changes: 11 additions & 0 deletions src/main/java/uk/ac/ebi/eva/contigalias/dus/NCBIBrowser.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public class NCBIBrowser extends PassiveAnonymousFTPClient {

public static final String PATH_GENOMES_ALL = "/genomes/all/";


private String ftpProxyHost;

private Integer ftpProxyPort;
Expand Down Expand Up @@ -148,4 +149,14 @@ public FTPFile getNCBIAssemblyReportFile(String directoryPath) throws IOExceptio
return assemblyReport.orElseThrow(() -> new AssemblyNotFoundException("Assembly Report File not present in given directory: " + directoryPath));
}

/**
 * Locates the genomic fna/fasta archive among the files listed in the given directory.
 * The first listed file whose name contains "genomic.fna.gz" but not "from" is returned.
 *
 * @param directoryPath directory whose files are listed
 * @return the first matching file entry
 * @throws IOException declared by the directory listing
 * @throws AssemblyNotFoundException when no listed file matches
 */
public FTPFile getAssemblyGenomicFnaFile(String directoryPath) throws IOException {
    for (FTPFile candidate : super.listFiles(directoryPath)) {
        String fileName = candidate.getName();
        boolean isGenomicFna = fileName.contains("genomic.fna.gz");
        if (isGenomicFna && !fileName.contains("from")) {
            return candidate;
        }
    }
    throw new AssemblyNotFoundException("Assembly Genomic Fna (Fasta) File not present in given directory: " + directoryPath);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package uk.ac.ebi.eva.contigalias.dus2;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.security.NoSuchAlgorithmException;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;

/**
 * Base class for readers that build an {@link AssemblySequencesEntity} from a character
 * stream. Subclasses supply the actual parsing in {@link #parseFile()}.
 */
public abstract class AssemblySequencesReader {

    /** Buffered view over the stream handed to the constructor. */
    protected final BufferedReader reader;

    /** Accession this reader was created for. */
    protected final String accession;

    /** Result of parsing; stays {@code null} until {@link #parseFile()} fills it in. */
    protected AssemblySequencesEntity assemblySequencesEntity;

    /** Flag that subclasses set once the whole file has been parsed. */
    protected boolean fileParsed = false;

    /**
     * @param inputStreamReader source of the file content, wrapped in a {@link BufferedReader}
     * @param accession         accession of the assembly the content belongs to
     */
    public AssemblySequencesReader(InputStreamReader inputStreamReader, String accession) {
        this.accession = accession;
        this.reader = new BufferedReader(inputStreamReader);
    }

    /**
     * Returns the parsed entity, triggering {@link #parseFile()} first when the file has not
     * been parsed yet or no entity is available.
     */
    public AssemblySequencesEntity getAssemblySequenceEntity() throws IOException, NoSuchAlgorithmException {
        boolean alreadyParsed = fileParsed && assemblySequencesEntity != null;
        if (alreadyParsed) {
            return assemblySequencesEntity;
        }
        parseFile();
        return assemblySequencesEntity;
    }

    /** Parses the whole input into {@link #assemblySequencesEntity}. */
    protected abstract void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException;

    /** Parses a single line of the input. */
    protected abstract void parseAssemblySequenceEntity(String line);

    /** Delegates to {@link BufferedReader#ready()} on the underlying reader. */
    public boolean ready() throws IOException {
        return reader.ready();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package uk.ac.ebi.eva.contigalias.dus2;

import java.io.IOException;
import java.io.InputStreamReader;
import java.security.NoSuchAlgorithmException;
import java.util.LinkedList;
import java.util.List;

import uk.ac.ebi.eva.contigalias.entities.AssemblySequencesEntity;
import uk.ac.ebi.eva.contigalias.entities.Sequence;
import uk.ac.ebi.eva.contigalias.utils.DigestGenerator;
import uk.ac.ebi.eva.contigalias.utils.MD5Hash;

/**
 * Reads a FASTA file and builds an {@link AssemblySequencesEntity} holding, for every
 * record, the sequence identifier taken from the header line and the MD5 digest of the
 * concatenated sequence lines.
 */
public class NCBIAssemblySequencesReader extends AssemblySequencesReader {

    public NCBIAssemblySequencesReader(InputStreamReader inputStreamReader, String accession) {
        super(inputStreamReader, accession);
    }

    /**
     * Parses the whole input. Lines that precede the first header (or any stray line that is
     * not part of a record) are skipped. The reader is closed when parsing ends, whether it
     * succeeds or fails.
     *
     * @throws IOException              if reading from or closing the underlying reader fails
     * @throws NullPointerException     if no reader is available
     * @throws NoSuchAlgorithmException propagated from the digest generator
     */
    @Override
    protected void parseFile() throws IOException, NullPointerException, NoSuchAlgorithmException {
        if (reader == null) {
            throw new NullPointerException("Cannot use AssemblySequenceReader without having a valid InputStreamReader.");
        }
        DigestGenerator md5Digest = new MD5Hash();
        if (assemblySequencesEntity == null) {
            assemblySequencesEntity = new AssemblySequencesEntity();
        }
        // Setting the accession of the whole assembly file
        assemblySequencesEntity.setAssemblyInsdcAccession(accession);
        List<Sequence> sequences = new LinkedList<>();
        try {
            String line = reader.readLine();
            while (line != null) {
                if (!line.startsWith(">")) {
                    // Not a header: advance, otherwise the loop would spin on the same line forever.
                    line = reader.readLine();
                    continue;
                }
                Sequence sequence = new Sequence();
                sequence.setSequenceRefseq(extractRefSeq(line));
                line = reader.readLine();
                StringBuilder sequenceValue = new StringBuilder();
                while (line != null && !line.startsWith(">")) {
                    // Looking for the sequence lines for this refseq
                    sequenceValue.append(line);
                    line = reader.readLine();
                }
                String md5checksum = md5Digest.hash(sequenceValue.toString());
                sequence.setSequenceMD5(md5checksum);
                sequences.add(sequence);
            }
            assemblySequencesEntity.setSequences(sequences);
            fileParsed = true;
        } finally {
            reader.close();
        }
    }

    /**
     * Extracts the identifier from a header line: the text between the leading '>' and the
     * first space, or the rest of the line when the header contains no space.
     */
    private static String extractRefSeq(String headerLine) {
        int firstSpace = headerLine.indexOf(' ');
        if (firstSpace < 0) {
            return headerLine.substring(1);
        }
        return headerLine.substring(1, firstSpace);
    }

    @Override
    // Parsing a line of the file
    protected void parseAssemblySequenceEntity(String line) {
        // TODO: HERE WE'LL PARSE A LINE OF THE FILE (AN ENTRY)
        // TODO: NOTE: THIS METHOD MIGHT NOT BE COMPLETELY USEFUL SINCE THE FILE CONTAINS ONLY
        // TODO: TEXT AND A '>' SEPARATORS TO SEPARATE SEQUENCES FROM ONE ANOTHER
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package uk.ac.ebi.eva.contigalias.dus2;

import java.io.InputStream;
import java.io.InputStreamReader;

import org.springframework.stereotype.Component;

/**
 * Spring component that creates {@link NCBIAssemblySequencesReader} instances.
 */
@Component
public class NCBIAssemblySequencesReaderFactory {

    /**
     * Builds a reader over a raw byte stream. The bytes are decoded as UTF-8 explicitly so
     * that parsing does not depend on the platform default charset.
     *
     * @param inputStream stream providing the file content
     * @param accession   accession of the assembly the content belongs to
     * @return a reader over the decoded stream
     */
    public NCBIAssemblySequencesReader build(InputStream inputStream, String accession) {
        InputStreamReader decoded =
                new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8);
        return build(decoded, accession);
    }

    /**
     * Builds a reader over an already decoded character stream.
     *
     * @param inputStreamReader character stream providing the file content
     * @param accession         accession of the assembly the content belongs to
     * @return a reader over the given stream
     */
    public NCBIAssemblySequencesReader build(InputStreamReader inputStreamReader, String accession) {
        return new NCBIAssemblySequencesReader(inputStreamReader, accession);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package uk.ac.ebi.eva.contigalias.entities;


import java.util.List;

import javax.persistence.*;

import com.fasterxml.jackson.annotation.JsonInclude;
import io.swagger.annotations.ApiModelProperty;
import lombok.Data;
import lombok.Getter;
import lombok.Setter;
import org.hibernate.annotations.LazyCollection;
import org.hibernate.annotations.LazyCollectionOption;

/**
 * Entity mapped to the "assemblySequences" table: one assembly, identified by its INSDC
 * accession, together with the list of its sequences. Accessors are generated by Lombok.
 */
@Entity
@Table(name = "assemblySequences")
@Data
public class AssemblySequencesEntity {

    /** Identifier of the entity; the column is declared non-nullable. */
    @ApiModelProperty(value = "Assembly's INSDC accession. It can be either a GenBank, ENA or a DDBJ accession.")
    @Column(nullable = false)
    @Id
    private String assemblyInsdcAccession;

    /**
     * Sequences of this assembly, joined through the "assembly_insdc_accession" column with
     * all operations cascaded; omitted from JSON output when {@code null}.
     */
    @OneToMany(targetEntity = Sequence.class, cascade = CascadeType.ALL)
    @JoinColumn(name = "assembly_insdc_accession")
    @LazyCollection(LazyCollectionOption.FALSE)
    @JsonInclude(JsonInclude.Include.NON_NULL)
    @ApiModelProperty(value = "List of all sequences of the assembly.")
    private List<Sequence> sequences;
}
43 changes: 43 additions & 0 deletions src/main/java/uk/ac/ebi/eva/contigalias/entities/SeqColEntity.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package uk.ac.ebi.eva.contigalias.entities;

import io.swagger.annotations.ApiModelProperty;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import java.util.List;

/**
 * Plain data holder describing a seqCol object: its digest, representation level, naming
 * convention and the parallel lists of sequence lengths, names and sequences. Accessors and
 * the no-argument constructor are generated by Lombok.
 */
@Data
@NoArgsConstructor
public class SeqColEntity {

    @ApiModelProperty(value = "The level 0 digest of the object")
    private String digest;

    // Description fixed: the published text read "the the object".
    @ApiModelProperty(value = "The representation level of the object")
    @Enumerated(EnumType.ORDINAL)
    private Level level;

    @ApiModelProperty(value = "The naming convention used to construct this seqCol object")
    @Enumerated(EnumType.STRING)
    private NamingConvention namingConvention;

    @ApiModelProperty(value = "The array of the sequences' lengths")
    private List<Long> lengths;

    @ApiModelProperty(value = "The array of the sequences' names")
    private List<String> names;

    @ApiModelProperty(value = "The array of the sequences")
    private List<String> sequences;

    /** Representation levels of a seqCol object. */
    public enum Level {
        ZERO, ONE, TWO
    }

    /** Naming conventions a seqCol object can be constructed with. */
    public enum NamingConvention {
        ENA, GENBANK, UCSC
    }
}
Loading