Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add args for adjusting mismatch number #311

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 59 additions & 55 deletions README.md

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions src/adaptertrimmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr,
return false;
}

bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<string>& adapterList, bool isR2, bool incTrimmedCounter) {
bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<string>& adapterList, bool isR2, int allowOneMismatchForEach, bool incTrimmedCounter) {
int matchReq = 4;
if(adapterList.size() > 16)
matchReq = 5;
Expand All @@ -69,8 +69,7 @@ bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<stri
return trimmed;
}

bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int matchReq) {
const int allowOneMismatchForEach = 8;
bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int allowOneMismatchForEach, int matchReq) {

int rlen = r->length();
int alen = adapterseq.length();
Expand Down Expand Up @@ -141,6 +140,7 @@ bool AdapterTrimmer::test() {
"+",
"///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");
string adapter = "TTTTCCACGGGGATACTACTG";

bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter);
if (r.mSeq.mStr != "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA")
return false;
Expand All @@ -161,4 +161,4 @@ bool AdapterTrimmer::test() {
}

return true;
}
}
7 changes: 3 additions & 4 deletions src/adaptertrimmer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,11 @@ class AdapterTrimmer{

static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit);
static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0);
static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4);
static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector<string>& adapterList, bool isR2 = false, bool incTrimmedCounter = true);
static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int allowOneMismatchForEach = 8, int matchReq = 4);
static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector<string>& adapterList, bool isR2 = false, int allowOneMismatchForEach = 8, bool incTrimmedCounter = true);
static bool test();


};


#endif
#endif
10 changes: 9 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ int main(int argc, char* argv[]){
cmd.add<string>("adapter_sequence_r2", 0, "the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence>", false, "auto");
cmd.add<string>("adapter_fasta", 0, "specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file", false, "");
cmd.add("detect_adapter_for_pe", 0, "by default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data.");
cmd.add<int>("adapter_mm_freq", 0, "allowed mismatched within every n bases to detect adapter. 8 by defaults.", false, 8);

// trimming
cmd.add<int>("trim_front1", 'f', "trimming how many bases in front for read1, default is 0", false, 0);
Expand All @@ -71,10 +72,14 @@ int main(int argc, char* argv[]){
cmd.add("trim_poly_g", 'g', "force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data");
cmd.add<int>("poly_g_min_len", 0, "the minimum length to detect polyG in the read tail. 10 by default.", false, 10);
cmd.add("disable_trim_poly_g", 'G', "disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data");

cmd.add<int>("poly_g_mm_freq", 0, "allowed mismatched within every n bases to detect polyG. 8 by defaults.", false, 8);
cmd.add<int>("poly_g_mm_max", 0, "the maximum number of mismatched allowed in detecting polyG. 5 by defaults.", false, 5);

// polyX tail trimming
cmd.add("trim_poly_x", 'x', "enable polyX trimming in 3' ends.");
cmd.add<int>("poly_x_min_len", 0, "the minimum length to detect polyX in the read tail. 10 by default.", false, 10);
cmd.add<int>("poly_x_mm_freq", 0, "allowed mismatched within every n bases to detect polyX. 8 by defaults.", false, 8);
cmd.add<int>("poly_x_mm_max", 0, "the maximum number of mismatched allowed in detecting polyX. 5 by defaults.", false, 5);

// cutting by quality
cmd.add("cut_front", '5', "move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise.");
Expand Down Expand Up @@ -192,6 +197,7 @@ int main(int argc, char* argv[]){
opt.adapter.detectAdapterForPE = cmd.exist("detect_adapter_for_pe");
opt.adapter.sequence = cmd.get<string>("adapter_sequence");
opt.adapter.sequenceR2 = cmd.get<string>("adapter_sequence_r2");
opt.adapter.allowOneMismatchForEach = cmd.get<int>("adapter_mm_freq");
opt.adapter.fastaFile = cmd.get<string>("adapter_fasta");
if(opt.adapter.sequenceR2=="auto" && !opt.adapter.detectAdapterForPE && opt.adapter.sequence != "auto") {
opt.adapter.sequenceR2 = opt.adapter.sequence;
Expand Down Expand Up @@ -233,6 +239,8 @@ int main(int argc, char* argv[]){
opt.polyXTrim.enabled = true;
}
opt.polyXTrim.minLen = cmd.get<int>("poly_x_min_len");
opt.polyXTrim.allowOneMismatchForEach = cmd.get<int>("poly_x_mm_freq");
opt.polyXTrim.maxMismatch = cmd.get<int>("poly_x_mm_max");


// sliding window cutting by quality
Expand Down
12 changes: 11 additions & 1 deletion src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,29 @@ class PolyGTrimmerOptions {
PolyGTrimmerOptions() {
enabled = false;
minLen = 10;
allowOneMismatchForEach = 8;
maxMismatch = 5;
}
public:
bool enabled;
int minLen;
int allowOneMismatchForEach;
int maxMismatch;
};

class PolyXTrimmerOptions {
public:
PolyXTrimmerOptions() {
enabled = false;
minLen = 10;
allowOneMismatchForEach = 8;
maxMismatch = 5;
}
public:
bool enabled;
int minLen;
int allowOneMismatchForEach;
int maxMismatch;
};

class UMIOptions {
Expand Down Expand Up @@ -194,6 +202,7 @@ class AdapterOptions {
public:
AdapterOptions() {
enabled = true;
allowOneMismatchForEach = 8;
hasSeqR1 = false;
hasSeqR2 = false;
detectAdapterForPE = false;
Expand All @@ -204,6 +213,7 @@ class AdapterOptions {
string sequenceR2;
string detectedAdapter1;
string detectedAdapter2;
int allowOneMismatchForEach;
vector<string> seqsInFasta;
string fastaFile;
bool hasSeqR1;
Expand Down Expand Up @@ -376,4 +386,4 @@ class Options{

};

#endif
#endif
9 changes: 3 additions & 6 deletions src/peprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,9 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){

if(r1 != NULL && r2!=NULL) {
if(mOptions->polyGTrim.enabled)
PolyX::trimPolyG(r1, r2, config->getFilterResult(), mOptions->polyGTrim.minLen);
PolyX::trimPolyG(r1, r2, config->getFilterResult(), mOptions->polyGTrim.minLen, mOptions->polyGTrim.allowOneMismatchForEach, mOptions->polyGTrim.maxMismatch);
if(mOptions->polyXTrim.enabled)
PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen, mOptions->polyXTrim.allowOneMismatchForEach, mOptions->polyXTrim.maxMismatch);
}
bool isizeEvaluated = false;
if(r1 != NULL && r2!=NULL && (mOptions->adapter.enabled || mOptions->correction.enabled)){
Expand Down Expand Up @@ -416,11 +418,6 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){
isizeEvaluated = true;
}

if(r1 != NULL && r2!=NULL) {
if(mOptions->polyXTrim.enabled)
PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen);
}

if(r1 != NULL && r2!=NULL) {
if( mOptions->trim.maxLen1 > 0 && mOptions->trim.maxLen1 < r1->length())
r1->resize(mOptions->trim.maxLen1);
Expand Down
26 changes: 10 additions & 16 deletions src/polyx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@ PolyX::PolyX(){
PolyX::~PolyX(){
}

void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
trimPolyG(r1, fr, compareReq);
trimPolyG(r2, fr, compareReq);
void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
trimPolyG(r1, fr, compareReq, allowOneMismatchForEach, maxMismatch);
trimPolyG(r2, fr, compareReq, allowOneMismatchForEach, maxMismatch);
}

void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) {
const int allowOneMismatchForEach = 8;
const int maxMismatch = 5;

void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
const char* data = r->mSeq.mStr.c_str();

int rlen = r->length();
Expand All @@ -41,15 +38,12 @@ void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) {
}
}

void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
trimPolyX(r1, fr, compareReq);
trimPolyX(r2, fr, compareReq);
void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
trimPolyX(r1, fr, compareReq, allowOneMismatchForEach, maxMismatch);
trimPolyX(r2, fr, compareReq, allowOneMismatchForEach, maxMismatch);
}

void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq) {
const int allowOneMismatchForEach = 8;
const int maxMismatch = 5;

void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
const char* data = r->mSeq.mStr.c_str();

int rlen = r->length();
Expand Down Expand Up @@ -123,8 +117,8 @@ bool PolyX::test() {
"///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");

FilterResult fr(NULL, false);
PolyX::trimPolyX(&r, &fr, 10);
PolyX::trimPolyX(&r, NULL, 10, 8, 5);
r.print();

return r.mSeq.mStr == "ATTTT" && fr.getTotalPolyXTrimmedReads() == 1 && fr.getTotalPolyXTrimmedBases() == 51;
}
}
10 changes: 5 additions & 5 deletions src/polyx.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ class PolyX{
PolyX();
~PolyX();

static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq);
static void trimPolyG(Read* r1, FilterResult* fr, int compareReq);
static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq);
static void trimPolyX(Read* r1, FilterResult* fr, int compareReq);
static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
static void trimPolyG(Read* r1, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
static void trimPolyX(Read* r1, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
static bool test();


};


#endif
#endif
6 changes: 4 additions & 2 deletions src/seprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
continue;
}


// fix MGI
if(mOptions->fixMGI) {
or1->fixMGI();
Expand All @@ -227,7 +228,7 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){

if(r1 != NULL) {
if(mOptions->polyGTrim.enabled)
PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen);
PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen, mOptions->polyGTrim.allowOneMismatchForEach, mOptions->polyGTrim.maxMismatch);
}

if(r1 != NULL && mOptions->adapter.enabled){
Expand All @@ -242,9 +243,10 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){

if(r1 != NULL) {
if(mOptions->polyXTrim.enabled)
PolyX::trimPolyX(r1, config->getFilterResult(), mOptions->polyXTrim.minLen);
PolyX::trimPolyX(r1, config->getFilterResult(), mOptions->polyXTrim.minLen, mOptions->polyXTrim.allowOneMismatchForEach, mOptions->polyXTrim.maxMismatch);
}


if(r1 != NULL) {
if( mOptions->trim.maxLen1 > 0 && mOptions->trim.maxLen1 < r1->length())
r1->resize(mOptions->trim.maxLen1);
Expand Down