Skip to content

Commit

Permalink
2.0.1: Bug fixes and support for compressed inputs for index building.
Browse files Browse the repository at this point in the history
  • Loading branch information
bolosky committed Jan 15, 2022
2 parents 1397876 + 34c50ee commit 17bba1c
Show file tree
Hide file tree
Showing 22 changed files with 492 additions and 376 deletions.
388 changes: 177 additions & 211 deletions SNAPLib/AffineGapVectorized.cpp

Large diffs are not rendered by default.

20 changes: 19 additions & 1 deletion SNAPLib/AffineGapVectorized.h
Original file line number Diff line number Diff line change
Expand Up @@ -1378,10 +1378,12 @@ template<int TEXT_DIRECTION = 1> class AffineGapVectorized {

class AffineGapVectorizedWithCigar {
public:
AffineGapVectorizedWithCigar(); // FIXME: Pass scoring parameters
AffineGapVectorizedWithCigar();

AffineGapVectorizedWithCigar(int i_matchReward, int i_subPenalty, int i_gapOpenPenalty, int i_gapExtendPenalty);

void init(int i_matchReward, int i_subPenalty, int i_gapOpenPenalty, int i_gapExtendPenalty);

// Compute the affine gap score between two strings and write the CIGAR string in cigarBuf.
// Returns -1 if the edit distance exceeds k or -2 if we run out of space in cigarBuf.
int computeGlobalScore(const char* text, int textLen, const char* pattern, const char* quality, int patternLen, int w,
Expand Down Expand Up @@ -1453,4 +1455,20 @@ class AffineGapVectorizedWithCigar {

LocalCigarResult res;

int computeFinalCigarString(
const char* text,
int textLen,
const char* pattern,
int patternLen,
int n_res,
int min_i,
const char *cigarBufStart,
char** cigarBuf,
int cigarBufLen,
bool useM,
CigarFormat format,
int* o_cigarBufUsed,
int* o_netDel
);

};
15 changes: 8 additions & 7 deletions SNAPLib/AlignerOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,13 +231,14 @@ AlignerOptions::usage()
" are the same as the NM values, except that they contain penalties for soft clipping reads that hang over the end of contigs (but not for\n"
" soft clipping that's due to # quality scores or that was present in the input SAM/BAM file and retained due to -pc)\n"
" -G- disable affine gap scoring (default: true)\n"
" Scoring parameters (works only when -G- is not used)\n"
" cost for match -gm (default: %u)\n"
" cost for substitution -gs (default: %u)\n"
" cost for opening a gap -go (default: %u)\n"
" cost for extending a gap -ge (default: %u)\n"
" bonus for alignment reaching 5' end of read -g5 (default: %u)\n"
" bonus for alignment reaching 3' end of read -g3 (default: %u)\n"
" Affine gap scoring parameters (works only when -G- is not used):\n"
" -gm cost for match (default: %u)\n"
" -gs cost for substitution (default: %u)\n"
" -go cost for opening a gap (default: %u)\n"
" -ge cost for extending a gap (default: %u)\n"
" -g5 bonus for alignment reaching 5' end of read (default: %u)\n"
" -g3 bonus for alignment reaching 3' end of read (default: %u)\n"
"\n"
" -A- Disable ALT awareness. The default is to try to map reads to the primary assembly and only to choose ALT alignments when they're much better,\n"
" and to compute MAPQ for non-ALT alignments using only non-ALT hits. This flag disables that behavior and results in ALT-oblivious behavior.\n"
" -ea Emit ALT alignments. When the aligner is ALT aware (i.e., -A- isn't specified) if it finds an ALT alignment that would have been\n"
Expand Down
2 changes: 1 addition & 1 deletion SNAPLib/Bam.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ BAMFormat::getWriterSupplier(
dataSupplier = DataWriterSupplier::create(options->outputFile.fileName, options->writeBufferSize, options->emitInternalScore, options->internalScoreTag, gzipSupplier);
}

return ReadWriterSupplier::create(this, dataSupplier, genome, options->killIfTooSlow, options->emitInternalScore, options->internalScoreTag, options->ignoreAlignmentAdjustmentsForOm);
return ReadWriterSupplier::create(this, dataSupplier, genome, options->killIfTooSlow, options->emitInternalScore, options->internalScoreTag, options->ignoreAlignmentAdjustmentsForOm, options->matchReward, options->subPenalty, options->gapOpenPenalty, options->gapExtendPenalty);
}

bool
Expand Down
2 changes: 1 addition & 1 deletion SNAPLib/CommandProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Revision History:
#include "Error.h"
#include "Compat.h"

const char *SNAP_VERSION = "2.0.0";
const char *SNAP_VERSION = "2.0.1";

static void usage()
{
Expand Down
6 changes: 3 additions & 3 deletions SNAPLib/DataReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2006,7 +2006,7 @@ DecompressDataReader::decompress(
if (status < 0 && status != Z_BUF_ERROR) {
WriteErrorMessage("GzipDataReader: inflate failed with %d\n", status);
if (Z_DATA_ERROR == status) {
fprintf(stderr, "%d is Z_DATA_ERROR, indicating a corrupt input. zstream->availIn is 0x%x:", status, zstream->avail_in);
fprintf(stderr, "%d is Z_DATA_ERROR, indicating a corrupt input. Are you *sure* this is a gzipped file and it's not just plain text or from some other compressor like bzip? zstream->availIn is 0x%x:", status, zstream->avail_in);
for (_int64 offset = 0; offset < oldAvailIn; offset++) {
if (offset % 32 == 0) {
fprintf(stderr, "\n0x%llx\t", offset);
Expand All @@ -2019,7 +2019,7 @@ DecompressDataReader::decompress(
return false;
}
if (status < 0 && zstream->avail_out == 0 && zstream->avail_in > 0) {
WriteErrorMessage("insufficient decompression buffer space - increase expansion factor, currently -xf %.1f\n", DataSupplier::ExpansionFactor);
WriteErrorMessage("insufficient decompression buffer space - increase expansion factor, currently -xf %.1f (if you're indexing, please decompress the FASTA/ALT files externally and use the uncompressed version here)\n", DataSupplier::ExpansionFactor);
return false;
}

Expand Down Expand Up @@ -2295,7 +2295,7 @@ DecompressDataReader::decompressThreadContinuous(
entry->decompressed + reader->overflowBytes, entry->decompressedSize - reader->overflowBytes, &decompressedWritten,
first ? StartMultiBlock : ContinueMultiBlock);
if (!ok) {
WriteErrorMessage("Failed to decompress BAM file at offset %lld\n", reader->inner->getFileOffset());
WriteErrorMessage("Failed to decompress gzip/BAM file at offset %lld\n", reader->inner->getFileOffset());
soft_exit(1);
}
_ASSERT(compressedRead == entry->compressedValid && decompressedWritten <= reader->extraBytes - reader->overflowBytes);
Expand Down
2 changes: 1 addition & 1 deletion SNAPLib/DataReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class DataReader
// read bytes from the beginning of the file for the header
virtual char* readHeader(_int64* io_headerSize) = 0;

// seek to a particular range in the file
// seek to a particular range in the file. Set amountOfFileToProcess to go to EOF.
virtual void reinit(_int64 startingOffset, _int64 amountOfFileToProcess) = 0;

// get all remaining data in current batch
Expand Down
12 changes: 7 additions & 5 deletions SNAPLib/FASTA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Revision History:
#include "Error.h"
#include "exit.h"
#include "Util.h"
#include "DataReader.h"

using namespace std;

Expand Down Expand Up @@ -220,9 +221,10 @@ ReadFASTAGenome(
isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true;
isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true;

FILE *fastaFile = fopen(fileName, "r");
if (fastaFile == NULL) {
WriteErrorMessage("Unable to open FASTA file '%s' (does it exist and do you have permission to open it?)\n",fileName);
DataReader* fastaDataReader = getDefaultOrGzipDataReader(fileName);

if (fastaDataReader == NULL) {
WriteErrorMessage("Unable to open FASTA file '%s' (does it exist and do you have permission to open it?)\n", fileName);
return NULL;
}

Expand All @@ -244,7 +246,7 @@ ReadFASTAGenome(

int nextContigNumber = 0;

while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, fastaFile)) {
while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, fastaDataReader)) {
fileSize += strlen(lineBuffer);
if (lineBuffer[0] == '>') {
nContigs++;
Expand Down Expand Up @@ -382,7 +384,7 @@ ReadFASTAGenome(
genome->sortContigsByName();
genome->setUpContigNumbersByOriginalOrder();

fclose(fastaFile);
delete fastaDataReader;
delete [] paddingBuffer;
delete [] lineBuffer;
return genome;
Expand Down
38 changes: 18 additions & 20 deletions SNAPLib/FASTQ.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,19 +391,6 @@ PairedInterleavedFASTQReader::getNextReadPair(Read *read0, Read *read1)
}
bytesConsumed += FASTQReader::getReadFromBuffer(buffer + bytesConsumed, validBytes - bytesConsumed, read1, fileName, data, context);

//
// Validate the Read IDs.
//
if (read0->getIdLength() < 2 || memcmp(read0->getId() + read0->getIdLength() - 2, "/1", 2)) {
WriteErrorMessage("PairedInterleavedFASTQReader: first read of batch doesn't have ID ending with /1: '%.*s'\n", read0->getIdLength(), read0->getId());
soft_exit(1);
}

if (read1->getIdLength() < 2 || memcmp(read1->getId() + read1->getIdLength() - 2, "/2", 2)) {
WriteErrorMessage("PairedInterleavedFASTQReader: second read of batch doesn't have ID ending with /2: '%.*s'\n", read1->getIdLength(), read1->getId());
soft_exit(1);
}

data->advance(bytesConsumed);
return true;
}
Expand All @@ -421,10 +408,16 @@ PairedInterleavedFASTQReader::reinit(_int64 startingOffset, _int64 amountOfFileT
// If we're not at the start of the file, we might have the tail end of a read that someone
// who got the previous range will process; advance past that. This is fairly tricky because
// there can be '@' signs in the quality string (and maybe even in read names?).
if (startingOffset != 0) {
if (!FASTQReader::skipPartialRecord(data)) {
return;
}

if (startingOffset == 0) {
return;
}

WriteErrorMessage("This should be dead code. If you get here there's a bug in SNAP, please report it.\n");
soft_exit(1);

if (!FASTQReader::skipPartialRecord(data)) {
return;
}

//
Expand Down Expand Up @@ -642,17 +635,21 @@ PairedInterleavedFASTQReader::createPairedReadSupplierGenerator(
{
bool isStdin = !strcmp(fileName,"-");

if (gzip || isStdin) {
//WriteStatusMessage("PairedInterleavedFASTQ using supplier queue\n");
if (gzip || isStdin || true /* always use queue for PairedInterleavedFASTQ because otherwise we need to know the read ID format to seek into the middle of the file, but that's nonstandard. */) {
DataSupplier *dataSupplier;

if (isStdin) {
if (gzip) {
dataSupplier = DataSupplier::GzipStdio;
} else {
dataSupplier = DataSupplier::Stdio;
}
} else {
dataSupplier = DataSupplier::GzipDefault;
if (gzip) {
dataSupplier = DataSupplier::GzipDefault;
} else {
dataSupplier = DataSupplier::Default;
}
}

PairedReadReader *reader = PairedInterleavedFASTQReader::create(dataSupplier, fileName,
Expand All @@ -662,6 +659,7 @@ PairedInterleavedFASTQReader::createPairedReadSupplierGenerator(
delete reader;
return NULL;
}

ReadSupplierQueue *queue = new ReadSupplierQueue(reader);
queue->startReaders();
return queue;
Expand Down
2 changes: 1 addition & 1 deletion SNAPLib/Genome.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ Genome::getLocationOfContig(const char *contigName, GenomeLocation *location, In
*location = contigsByName[mid].beginningLocation;
}
if (index != NULL) {
*index = InternalContigNumToInt(mid);
*index = InternalContigNumToInt(contigsByName[mid].internalContigNumber);
}
return true;
} else if (c < 0) {
Expand Down
16 changes: 10 additions & 6 deletions SNAPLib/GenomeIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Revision History:
#include "exit.h"
#include "Error.h"
#include "directions.h"
#include "DataReader.h"

using namespace std;

Expand Down Expand Up @@ -158,6 +159,8 @@ GenomeIndex::runIndexer(
unsigned *altLiftoverProjContigOffsets = NULL;
char **altLiftoverProjCigar = NULL;

DataSupplier::ExpansionFactor = 50; // This is for the decompression of a gzipped FASTA/ALT file. FASTAs compress really well so we need a lot.

for (int n = 2; n < argc; n++) {
if (strcmp(argv[n], "-s") == 0) {
if (n + 1 < argc) {
Expand Down Expand Up @@ -251,7 +254,7 @@ GenomeIndex::runIndexer(
n++;
} else if (!strcmp(argv[n], "-altContigFile")) {
if (n + 1 < argc) {
FILE *inputFile = fopen(argv[n + 1], "r");
DataReader* inputFile = getDefaultOrGzipDataReader(argv[n + 1]);
if (NULL == inputFile) {
WriteErrorMessage("Unable to open alt contig list file %s\n", argv[n + 1]);
soft_exit(1);
Expand All @@ -269,15 +272,15 @@ GenomeIndex::runIndexer(
addToCountedListOfStrings(contigNameBuffer, &nAltOptIn, &altOptInList);
} // while we have an input string.

fclose(inputFile);
delete inputFile;
delete[] contigNameBuffer;
} else {
usage();
}
n++;
} else if (!strcmp(argv[n], "-nonAltContigFile")) {
if (n + 1 < argc) {
FILE *inputFile = fopen(argv[n + 1], "r");
DataReader *inputFile = getDefaultOrGzipDataReader(argv[n + 1]);
if (NULL == inputFile) {
WriteErrorMessage("Unable to open non-alt contig list file %s\n", argv[n + 1]);
soft_exit(1);
Expand All @@ -295,7 +298,7 @@ GenomeIndex::runIndexer(
addToCountedListOfStrings(contigNameBuffer, &nAltOptOut, &altOptOutList);
} // while we have an input string.

fclose(inputFile);
delete inputFile;
delete[] contigNameBuffer;
}
else {
Expand All @@ -304,7 +307,8 @@ GenomeIndex::runIndexer(
n++;
} else if (!strcmp(argv[n], "-altLiftoverFile")) {
if (n + 1 < argc) {
FILE* inputFile = fopen(argv[n + 1], "r");
DataReader* inputFile = getDefaultOrGzipDataReader(argv[n + 1]);

if (NULL == inputFile) {
WriteErrorMessage("Unable to open ALT liftover file %s\n", argv[n + 1]);
soft_exit(1);
Expand Down Expand Up @@ -395,7 +399,7 @@ GenomeIndex::runIndexer(
altLiftoverProjCigar[i][projCigarLength] = '\0';
}

fclose(inputFile);
delete inputFile;
delete[] altLiftoverBuffer;
}
else {
Expand Down
5 changes: 5 additions & 0 deletions SNAPLib/LandauVishkin.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
#include "exit.h"
#include "Genome.h"

#ifdef LONG_READS
const int MAX_K = 1000;
#else // LONG_READS
const int MAX_K = 63;
#endif // LONG_READS

const int ScoreAboveLimit = -1; // A score value that means "we didn't compute the score because it was above the score limit."
const int TooBigScoreValue = 65536; // This is much bigger than any score we'll ever see

Expand Down
4 changes: 2 additions & 2 deletions SNAPLib/Read.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ struct PairedAlignmentResult;

//#define LONG_READS
#ifdef LONG_READS
#define MAX_READ_LENGTH 400000
#define MAX_READ_LENGTH 20000
#else
#define MAX_READ_LENGTH 400
#endif
Expand Down Expand Up @@ -219,7 +219,7 @@ class ReadWriterSupplier

static ReadWriterSupplier* create(const FileFormat* format, DataWriterSupplier* dataSupplier,
const Genome* genome, bool killIfTooSlowbool, bool emitInternalScore, char *internalScoreTag,
bool ignoreAlignmentAdjustmentsForOm);
bool ignoreAlignmentAdjustmentsForOm, int matchReward, int subPenalty, int gapOpenPenalty, int gapExtendPenalty);

virtual ~ReadWriterSupplier() {}
};
Expand Down
Loading

0 comments on commit 17bba1c

Please sign in to comment.