From c6200b460a6a830cf86ed4a266126acecb3a1a82 Mon Sep 17 00:00:00 2001
From: Ye Chang <yech1990@gmail.com>
Date: Mon, 4 Jan 2021 17:52:37 +0800
Subject: [PATCH 1/3] add args for mismatch

---
 README.md              | 45 +++++++++++++++++++++++-------------------
 src/adaptertrimmer.cpp |  7 +++----
 src/adaptertrimmer.h   |  4 ++--
 src/main.cpp           | 12 +++++++++--
 src/options.h          | 12 ++++++++++-
 src/peprocessor.cpp    |  4 ++--
 src/polyx.cpp          | 26 ++++++++++--------------
 src/polyx.h            | 10 +++++-----
 src/seprocessor.cpp    |  8 ++++----
 9 files changed, 72 insertions(+), 56 deletions(-)
diff --git a/README.md b/README.md
index e63b15d..c9c7b7f 100644
--- a/README.md
+++ b/README.md
@@ -237,12 +237,13 @@ options:
       --interleaved_in                 indicate that <in1> is an interleaved FASTQ which contains both read1 and read2. Disabled by defaut.
       --reads_to_process             specify how many reads/pairs to be processed. Default 0 means process all reads. (int [=0])
       --dont_overwrite               don't overwrite existing files. Overwritting is allowed by default.
-  
+
   # adapter trimming options
   -A, --disable_adapter_trimming     adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled
-  -a, --adapter_sequence               the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto])
-      --adapter_sequence_r2            the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence> (string [=])
-    
+  -a, --adapter_sequence             the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto])
+      --adapter_sequence_r2          the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence> (string [=])
+      --adapter_mm_freq              one mismatch is allowed for every n bases when detecting the adapter. 8 by default. (int [=8])
+
   # global trimming options
   -f, --trim_front1                  trimming how many bases in front for read1, default is 0 (int [=0])
   -t, --trim_tail1                   trimming how many bases in tail for read1, default is 0 (int [=0])
@@ -251,50 +252,54 @@ options:
 
   # polyG tail trimming, useful for NextSeq/NovaSeq data
   -g, --trim_poly_g                  force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
-      --poly_g_min_len                 the minimum length to detect polyG in the read tail. 10 by default. (int [=10])
+      --poly_g_min_len               the minimum length to detect polyG in the read tail. 10 by default. (int [=10])
+      --poly_g_mm_freq               one mismatch is allowed for every n bases when detecting polyG. 8 by default. (int [=8])
+      --poly_g_mm_max                the maximum number of mismatches allowed when detecting polyG. 5 by default. (int [=5])
   -G, --disable_trim_poly_g          disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
 
   # polyX tail trimming
-  -x, --trim_poly_x                    enable polyX trimming in 3' ends.
-      --poly_x_min_len                 the minimum length to detect polyX in the read tail. 10 by default. (int [=10])
-  
+  -x, --trim_poly_x                  enable polyX trimming in 3' ends.
+      --poly_x_min_len               the minimum length to detect polyX in the read tail. 10 by default. (int [=10])
+      --poly_x_mm_freq               one mismatch is allowed for every n bases when detecting polyX. 8 by default. (int [=8])
+      --poly_x_mm_max                the maximum number of mismatched allowed in detecting polyX. 5 by defaults. (int [=5])
+
   # per read cutting by quality options
   -5, --cut_by_quality5              enable per read cutting by quality in front (5'), default is disabled (WARNING: this will interfere deduplication for both PE/SE data)
   -3, --cut_by_quality3              enable per read cutting by quality in tail (3'), default is disabled (WARNING: this will interfere deduplication for SE data)
   -W, --cut_window_size              the size of the sliding window for sliding window trimming, default is 4 (int [=4])
   -M, --cut_mean_quality             the bases in the sliding window with mean quality below cutting_quality will be cut, default is Q20 (int [=20])
-  
+
   # quality filtering options
   -Q, --disable_quality_filtering    quality filtering is enabled by default. If this option is specified, quality filtering is disabled
   -q, --qualified_quality_phred      the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15])
   -u, --unqualified_percent_limit    how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40])
   -n, --n_base_limit                 if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5 (int [=5])
-  
+
   # length filtering options
   -L, --disable_length_filtering     length filtering is enabled by default. If this option is specified, length filtering is disabled
   -l, --length_required              reads shorter than length_required will be discarded, default is 15. (int [=15])
       --length_limit                 reads longer than length_limit will be discarded, default 0 means no limitation. (int [=0])
 
   # low complexity filtering
-  -y, --low_complexity_filter          enable low complexity filter. The complexity is defined as the percentage of base that is different from its next base (base[i] != base[i+1]).
-  -Y, --complexity_threshold           the threshold for low complexity filter (0~100). Default is 30, which means 30% complexity is required. (int [=30])
+  -y, --low_complexity_filter        enable low complexity filter. The complexity is defined as the percentage of base that is different from its next base (base[i] != base[i+1]).
+  -Y, --complexity_threshold         the threshold for low complexity filter (0~100). Default is 30, which means 30% complexity is required. (int [=30])
 
   # filter reads with unwanted indexes (to remove possible contamination)
-      --filter_by_index1               specify a file contains a list of barcodes of index1 to be filtered out, one barcode per line (string [=])
-      --filter_by_index2               specify a file contains a list of barcodes of index2 to be filtered out, one barcode per line (string [=])
-      --filter_by_index_threshold      the allowed difference of index barcode for index filtering, default 0 means completely identical. (int [=0])
+      --filter_by_index1             specify a file contains a list of barcodes of index1 to be filtered out, one barcode per line (string [=])
+      --filter_by_index2             specify a file contains a list of barcodes of index2 to be filtered out, one barcode per line (string [=])
+      --filter_by_index_threshold    the allowed difference of index barcode for index filtering, default 0 means completely identical. (int [=0])
 
   # base correction by overlap analysis options
   -c, --correction                   enable base correction in overlapped regions (only for PE data), default is disabled
       --overlap_len_require          the minimum length of the overlapped region for overlap analysis based adapter trimming and correction. 30 by default. (int [=30])
       --overlap_diff_limit           the maximum difference of the overlapped region for overlap analysis based adapter trimming and correction. 5 by default. (int [=5])
-  
+
   # UMI processing
   -U, --umi                          enable unique molecular identifer (UMI) preprocessing
       --umi_loc                      specify the location of UMI, can be (index1/index2/read1/read2/per_index/per_read, default is none (string [=])
       --umi_len                      if the UMI is in read1/read2, its length should be provided (int [=0])
       --umi_prefix                   if specified, an underline will be used to connect prefix and UMI (i.e. prefix=UMI, UMI=AATTCG, final=UMI_AATTCG). No prefix by default (string [=])
-      --umi_skip                       if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0 (int [=0])
+      --umi_skip                     if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0 (int [=0])
 
   # overrepresented sequence analysis
   -p, --overrepresentation_analysis    enable overrepresented sequence analysis.
@@ -304,15 +309,15 @@ options:
   -j, --json                         the json format report file name (string [=fastp.json])
   -h, --html                         the html format report file name (string [=fastp.html])
   -R, --report_title                 should be quoted with ' or ", default is "fastp report" (string [=fastp report])
-  
+
   # threading options
   -w, --thread                       worker thread number, default is 2 (int [=2])
-  
+
   # output splitting options
   -s, --split                        split output by limiting total split file number with this option (2~999), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default (int [=0])
   -S, --split_by_lines               split output by limiting lines of each file with this option(>=1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default (long [=0])
   -d, --split_prefix_digits          the digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding (int [=4])
-  
+
   # help
   -?, --help                         print this message
 ```
diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp
index 4c103f5..1bcc8f1 100644
--- a/src/adaptertrimmer.cpp
+++ b/src/adaptertrimmer.cpp
@@ -38,9 +38,8 @@ bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr,
     return false;
 }
 
-bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2) {
+bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, int allowOneMismatchForEach, bool isR2) {
     const int matchReq = 4;
-    const int allowOneMismatchForEach = 8;
 
     int rlen = r->length();
     int alen = adapterseq.length();
@@ -93,6 +92,6 @@ bool AdapterTrimmer::test() {
         "+",
         "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");
     string adapter = "TTTTCCACGGGGATACTACTG";
-    bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter);
+    bool trimmed = AdapterTrimmer::trimBySequence(&r, NULL, adapter, 8);
     return r.mSeq.mStr == "TTTTAACCCCCCCCCCCCCCCCCCCCCCCCCCCCAATTTTAAAA";
-}
\ No newline at end of file
+}
diff --git a/src/adaptertrimmer.h b/src/adaptertrimmer.h
index 974918e..e014181 100644
--- a/src/adaptertrimmer.h
+++ b/src/adaptertrimmer.h
@@ -17,11 +17,11 @@ class AdapterTrimmer{
 
     static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr);
     static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov);
-    static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false);
+    static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, int allowOneMismatchForEach, bool isR2 = false);
     static bool test();
 
 
 };
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/main.cpp b/src/main.cpp
index a5da675..f899507 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -48,6 +48,7 @@ int main(int argc, char* argv[]){
     cmd.add("disable_adapter_trimming", 'A', "adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled");
     cmd.add<string>("adapter_sequence", 'a', "the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped.", false, "auto");
     cmd.add<string>("adapter_sequence_r2", 0, "the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence>", false, "");
+    cmd.add<int>("adapter_mm_freq", 0, "allowed mismatched within every n bases to detect adapter. 8 by defaults.", false, 8);
 
     // trimming
     cmd.add<int>("trim_front1", 'f', "trimming how many bases in front for read1, default is 0", false, 0);
@@ -59,10 +60,14 @@ int main(int argc, char* argv[]){
     cmd.add("trim_poly_g", 'g', "force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data");
     cmd.add<int>("poly_g_min_len", 0, "the minimum length to detect polyG in the read tail. 10 by default.", false, 10);
     cmd.add("disable_trim_poly_g", 'G', "disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data");
-    
+    cmd.add<int>("poly_g_mm_freq", 0, "allowed mismatched within every n bases to detect polyG. 8 by defaults.", false, 8);
+    cmd.add<int>("poly_g_mm_max", 0, "the maximum number of mismatched allowed in detecting polyG. 5 by defaults.", false, 5);
+
     // polyX tail trimming
     cmd.add("trim_poly_x", 'x', "enable polyX trimming in 3' ends.");
     cmd.add<int>("poly_x_min_len", 0, "the minimum length to detect polyX in the read tail. 10 by default.", false, 10);
+    cmd.add<int>("poly_x_mm_freq", 0, "allowed mismatched within every n bases to detect polyX. 8 by defaults.", false, 8);
+    cmd.add<int>("poly_x_mm_max", 0, "the maximum number of mismatched allowed in detecting polyX. 5 by defaults.", false, 5);
 
     // sliding window cutting for each reads
     cmd.add("cut_by_quality5", '5', "enable per read cutting by quality in front (5'), default is disabled (WARNING: this will interfere deduplication for both PE/SE data)");
@@ -146,6 +151,7 @@ int main(int argc, char* argv[]){
     opt.adapter.enabled = !cmd.exist("disable_adapter_trimming");
     opt.adapter.sequence = cmd.get<string>("adapter_sequence");
     opt.adapter.sequenceR2 = cmd.get<string>("adapter_sequence_r2");
+    opt.adapter.allowOneMismatchForEach = cmd.get<int>("adapter_mm_freq");
     if(opt.adapter.sequenceR2.empty() && opt.adapter.sequence != "auto") {
         opt.adapter.sequenceR2 = opt.adapter.sequence;
     }
@@ -178,6 +184,8 @@ int main(int argc, char* argv[]){
         opt.polyXTrim.enabled = true;
     }
     opt.polyXTrim.minLen = cmd.get<int>("poly_x_min_len");
+    opt.polyXTrim.allowOneMismatchForEach = cmd.get<int>("poly_x_mm_freq");
+    opt.polyXTrim.maxMismatch = cmd.get<int>("poly_x_mm_max");
 
     // sliding window cutting by quality
     opt.qualityCut.enabled5 = cmd.exist("cut_by_quality5");
@@ -363,4 +371,4 @@ int main(int argc, char* argv[]){
     cerr << "fastp v" << FASTP_VER << ", time used: " << (t2)-t1 << " seconds" << endl;
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/src/options.h b/src/options.h
index 1633fef..ac62bcf 100644
--- a/src/options.h
+++ b/src/options.h
@@ -70,10 +70,14 @@ class PolyGTrimmerOptions {
     PolyGTrimmerOptions() {
         enabled = false;
         minLen = 10;
+        allowOneMismatchForEach = 8;
+        maxMismatch = 5;
     }
 public:
     bool enabled;
     int minLen;
+    int allowOneMismatchForEach;
+    int maxMismatch;
 };
 
 class PolyXTrimmerOptions {
@@ -81,10 +85,14 @@ class PolyXTrimmerOptions {
     PolyXTrimmerOptions() {
         enabled = false;
         minLen = 10;
+        allowOneMismatchForEach = 8;
+        maxMismatch = 5;
     }
 public:
     bool enabled;
     int minLen;
+    int allowOneMismatchForEach;
+    int maxMismatch;
 };
 
 class UMIOptions {
@@ -161,6 +169,7 @@ class AdapterOptions {
 public:
     AdapterOptions() {
         enabled = true;
+        allowOneMismatchForEach = 8;
         hasSeqR1 = false;
         hasSeqR2 = false;
     }
@@ -170,6 +179,7 @@ class AdapterOptions {
     string sequenceR2;
     string detectedAdapter1;
     string detectedAdapter2;
+    int allowOneMismatchForEach;
     bool hasSeqR1;
     bool hasSeqR2;
 };
@@ -314,4 +324,4 @@ class Options{
 
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp
index 0fbb964..bc5bfb1 100644
--- a/src/peprocessor.cpp
+++ b/src/peprocessor.cpp
@@ -263,9 +263,9 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){
 
         if(r1 != NULL && r2!=NULL) {
             if(mOptions->polyGTrim.enabled)
-                PolyX::trimPolyG(r1, r2, config->getFilterResult(), mOptions->polyGTrim.minLen);
+                PolyX::trimPolyG(r1, r2, config->getFilterResult(), mOptions->polyGTrim.minLen, mOptions->polyGTrim.allowOneMismatchForEach, mOptions->polyGTrim.maxMismatch);
             if(mOptions->polyXTrim.enabled)
-                PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen);
+                PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen, mOptions->polyXTrim.allowOneMismatchForEach, mOptions->polyXTrim.maxMismatch);
         }
         bool isizeEvaluated = false;
         if(r1 != NULL && r2!=NULL && (mOptions->adapter.enabled || mOptions->correction.enabled)){
diff --git a/src/polyx.cpp b/src/polyx.cpp
index 20426d8..9a5698f 100644
--- a/src/polyx.cpp
+++ b/src/polyx.cpp
@@ -7,15 +7,12 @@ PolyX::PolyX(){
 PolyX::~PolyX(){
 }
 
-void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
-    trimPolyG(r1, fr, compareReq);
-    trimPolyG(r2, fr, compareReq);
+void PolyX::trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
+    trimPolyG(r1, fr, compareReq, allowOneMismatchForEach, maxMismatch);
+    trimPolyG(r2, fr, compareReq, allowOneMismatchForEach, maxMismatch);
 }
 
-void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) {
-    const int allowOneMismatchForEach = 8;
-    const int maxMismatch = 5;
-
+void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
     const char* data = r->mSeq.mStr.c_str();
 
     int rlen = r->length();
@@ -40,15 +37,12 @@ void PolyX::trimPolyG(Read* r, FilterResult* fr, int compareReq) {
     }
 }
 
-void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq) {
-    trimPolyX(r1, fr, compareReq);
-    trimPolyX(r2, fr, compareReq);
+void PolyX::trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
+    trimPolyX(r1, fr, compareReq, allowOneMismatchForEach, maxMismatch);
+    trimPolyX(r2, fr, compareReq, allowOneMismatchForEach, maxMismatch);
 }
 
-void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq) {
-    const int allowOneMismatchForEach = 8;
-    const int maxMismatch = 5;
-
+void PolyX::trimPolyX(Read* r, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch) {
     const char* data = r->mSeq.mStr.c_str();
 
     int rlen = r->length();
@@ -117,7 +111,7 @@ bool PolyX::test() {
         "ATTTTAAAAAAAAAATAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAT",
         "+",
         "///EEEEEEEEEEEEEEEEEEEEEEEEEE////EEEEEEEEEEEEE////E////E");
-    PolyX::trimPolyX(&r, NULL, 10);
+    PolyX::trimPolyX(&r, NULL, 10, 8, 5);
     r.print();
     return r.mSeq.mStr == "ATTTT";
-}
\ No newline at end of file
+}
diff --git a/src/polyx.h b/src/polyx.h
index a1dbac9..51ad649 100644
--- a/src/polyx.h
+++ b/src/polyx.h
@@ -15,14 +15,14 @@ class PolyX{
     PolyX();
     ~PolyX();
 
-    static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq);
-    static void trimPolyG(Read* r1, FilterResult* fr, int compareReq);
-    static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq);
-    static void trimPolyX(Read* r1, FilterResult* fr, int compareReq);
+    static void trimPolyG(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
+    static void trimPolyG(Read* r1, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
+    static void trimPolyX(Read* r1, Read* r2, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
+    static void trimPolyX(Read* r1, FilterResult* fr, int compareReq, int allowOneMismatchForEach, int maxMismatch);
     static bool test();
 
 
 };
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/seprocessor.cpp b/src/seprocessor.cpp
index 4d10794..742435c 100644
--- a/src/seprocessor.cpp
+++ b/src/seprocessor.cpp
@@ -196,7 +196,7 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
             delete or1;
             continue;
         }
-        
+
         // umi processing
         if(mOptions->umi.enabled)
             mUmiProcessor->process(or1);
@@ -206,13 +206,13 @@ bool SingleEndProcessor::processSingleEnd(ReadPack* pack, ThreadConfig* config){
 
         if(r1 != NULL) {
             if(mOptions->polyGTrim.enabled)
-                PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen);
+                PolyX::trimPolyG(r1, config->getFilterResult(), mOptions->polyGTrim.minLen, mOptions->polyGTrim.allowOneMismatchForEach, mOptions->polyGTrim.maxMismatch);
             if(mOptions->polyXTrim.enabled)
-                PolyX::trimPolyX(r1, config->getFilterResult(), mOptions->polyXTrim.minLen);
+                PolyX::trimPolyX(r1, config->getFilterResult(), mOptions->polyXTrim.minLen, mOptions->polyXTrim.allowOneMismatchForEach, mOptions->polyXTrim.maxMismatch);
         }
 
         if(r1 != NULL && mOptions->adapter.enabled && mOptions->adapter.hasSeqR1){
-            AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence);
+            AdapterTrimmer::trimBySequence(r1, config->getFilterResult(), mOptions->adapter.sequence, mOptions->adapter.allowOneMismatchForEach);
         }
 
         int result = mFilter->passFilter(r1);

From 8b2c72f81c6bddf0cf6e16c8f61fc5db14ac00d5 Mon Sep 17 00:00:00 2001
From: Ye Chang <yech1990@gmail.com>
Date: Tue, 5 Jan 2021 14:57:15 +0800
Subject: [PATCH 2/3] fix conflict

---
 src/peprocessor.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/peprocessor.cpp b/src/peprocessor.cpp
index 1e741be..849a832 100644
--- a/src/peprocessor.cpp
+++ b/src/peprocessor.cpp
@@ -418,11 +418,6 @@ bool PairEndProcessor::processPairEnd(ReadPairPack* pack, ThreadConfig* config){
             isizeEvaluated = true;
         }
 
-        if(r1 != NULL && r2!=NULL) {
-            if(mOptions->polyXTrim.enabled)
-                PolyX::trimPolyX(r1, r2, config->getFilterResult(), mOptions->polyXTrim.minLen);
-        }
-
         if(r1 != NULL && r2!=NULL) {
             if( mOptions->trim.maxLen1 > 0 && mOptions->trim.maxLen1 < r1->length())
                 r1->resize(mOptions->trim.maxLen1);

From 0a47db9a9453ffe113b8a8f8bc7dbb651e9b3a15 Mon Sep 17 00:00:00 2001
From: Ye Chang <yech1990@gmail.com>
Date: Tue, 5 Jan 2021 15:19:45 +0800
Subject: [PATCH 3/3] fix conflict

---
 README.md              | 81 +++++++++++++++++++++---------------------
 src/adaptertrimmer.cpp |  8 ++---
 src/adaptertrimmer.h   |  4 +--
 3 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index b001831..421d3bf 100644
--- a/README.md
+++ b/README.md
@@ -335,37 +335,38 @@ options:
   -i, --in1                          read1 input file name (string)
   -o, --out1                         read1 output file name (string [=])
   -I, --in2                          read2 input file name (string [=])
-  -O, --out2                           read2 output file name (string [=])
-      --unpaired1                      for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=])
-      --unpaired2                      for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file. (string [=])
-      --failed_out                     specify the file to store reads that cannot pass the filters. (string [=])
-      --overlapped_out                 for each read pair, output the overlapped region if it has no any mismatched base. (string [=])
-  -m, --merge                          for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default.
-      --merged_out                     in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=])
-      --include_unmerged               in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default.
+  -O, --out2                         read2 output file name (string [=])
+      --unpaired1                    for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=])
+      --unpaired2                    for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file. (string [=])
+      --failed_out                   specify the file to store reads that cannot pass the filters. (string [=])
+      --overlapped_out               for each read pair, output the overlapped region if it has no any mismatched base. (string [=])
+  -m, --merge                        for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default.
+      --merged_out                   in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=])
+      --include_unmerged             in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default.
   -6, --phred64                      indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33)
   -z, --compression                  compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. (int [=4])
-      --stdin                          input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in.
-      --stdout                         output passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end input. Disabled by default.
-      --interleaved_in                 indicate that <in1> is an interleaved FASTQ which contains both read1 and read2. Disabled by default.
+      --stdin                        input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in.
+      --stdout                       output passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end input. Disabled by default.
+      --interleaved_in               indicate that <in1> is an interleaved FASTQ which contains both read1 and read2. Disabled by default.
       --reads_to_process             specify how many reads/pairs to be processed. Default 0 means process all reads. (int [=0])
       --dont_overwrite               don't overwrite existing files. Overwritting is allowed by default.
-      --fix_mgi_id                     the MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it.
-  
+      --fix_mgi_id                   the MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it.
+
   # adapter trimming options
   -A, --disable_adapter_trimming     adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled
-  -a, --adapter_sequence               the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto])
-      --adapter_sequence_r2            the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence> (string [=])
-      --adapter_fasta                  specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file (string [=])
-      --detect_adapter_for_pe          by default, the adapter sequence auto-detection is enabled for SE data only, turn on this option to enable it for PE data.
+  -a, --adapter_sequence             the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto])
+      --adapter_sequence_r2          the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as <adapter_sequence> (string [=])
+      --adapter_fasta                specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file (string [=])
+      --detect_adapter_for_pe        by default, the adapter sequence auto-detection is enabled for SE data only, turn on this option to enable it for PE data.
+      --adapter_mm_freq              one mismatch is allowed for every n bases when detecting the adapter. 8 by default. (int [=8])
 
   # global trimming options
-  -f, --trim_front1                    trimming how many bases in front for read1, default is 0 (int [=0])
-  -t, --trim_tail1                     trimming how many bases in tail for read1, default is 0 (int [=0])
-  -b, --max_len1                       if read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation (int [=0])
-  -F, --trim_front2                    trimming how many bases in front for read2. If it's not specified, it will follow read1's settings (int [=0])
-  -T, --trim_tail2                     trimming how many bases in tail for read2. If it's not specified, it will follow read1's settings (int [=0])
-  -B, --max_len2                       if read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. If it's not specified, it will follow read1's settings (int [=0])
+  -f, --trim_front1                  trimming how many bases in front for read1, default is 0 (int [=0])
+  -t, --trim_tail1                   trimming how many bases in tail for read1, default is 0 (int [=0])
+  -b, --max_len1                     if read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation (int [=0])
+  -F, --trim_front2                  trimming how many bases in front for read2. If it's not specified, it will follow read1's settings (int [=0])
+  -T, --trim_tail2                   trimming how many bases in tail for read2. If it's not specified, it will follow read1's settings (int [=0])
+  -B, --max_len2                     if read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. If it's not specified, it will follow read1's settings (int [=0])
 
   # polyG tail trimming, useful for NextSeq/NovaSeq data
   -g, --trim_poly_g                  force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data
@@ -381,18 +382,18 @@ options:
       --poly_x_mm_max                the maximum number of mismatched allowed in detecting polyX. 5 by defaults. (int [=5])
 
   # per read cutting by quality options
-  -5, --cut_front                      move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise.
-  -3, --cut_tail                       move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise.
-  -r, --cut_right                      move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop.
-  -W, --cut_window_size                the window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4 (int [=4])
-  -M, --cut_mean_quality               the mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) (int [=20])
-      --cut_front_window_size          the window size option of cut_front, default to cut_window_size if not specified (int [=4])
-      --cut_front_mean_quality         the mean quality requirement option for cut_front, default to cut_mean_quality if not specified (int [=20])
-      --cut_tail_window_size           the window size option of cut_tail, default to cut_window_size if not specified (int [=4])
-      --cut_tail_mean_quality          the mean quality requirement option for cut_tail, default to cut_mean_quality if not specified (int [=20])
-      --cut_right_window_size          the window size option of cut_right, default to cut_window_size if not specified (int [=4])
-      --cut_right_mean_quality         the mean quality requirement option for cut_right, default to cut_mean_quality if not specified (int [=20])
-  
+  -5, --cut_front                    move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise.
+  -3, --cut_tail                     move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise.
+  -r, --cut_right                    move a sliding window from front to tail; if a window with mean quality < threshold is met, drop the bases in that window and everything to its right, then stop.
+  -W, --cut_window_size              the window size option shared by cut_front, cut_tail or cut_right. Range: 1~1000, default: 4 (int [=4])
+  -M, --cut_mean_quality             the mean quality requirement option shared by cut_front, cut_tail or cut_right. Range: 1~36, default: 20 (Q20) (int [=20])
+      --cut_front_window_size        the window size option of cut_front, default to cut_window_size if not specified (int [=4])
+      --cut_front_mean_quality       the mean quality requirement option for cut_front, default to cut_mean_quality if not specified (int [=20])
+      --cut_tail_window_size         the window size option of cut_tail, default to cut_window_size if not specified (int [=4])
+      --cut_tail_mean_quality        the mean quality requirement option for cut_tail, default to cut_mean_quality if not specified (int [=20])
+      --cut_right_window_size        the window size option of cut_right, default to cut_window_size if not specified (int [=4])
+      --cut_right_mean_quality       the mean quality requirement option for cut_right, default to cut_mean_quality if not specified (int [=20])
+
   # quality filtering options
   -Q, --disable_quality_filtering    quality filtering is enabled by default. If this option is specified, quality filtering is disabled
   -q, --qualified_quality_phred      the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15])
@@ -416,9 +417,9 @@ options:
 
   # base correction by overlap analysis options
   -c, --correction                   enable base correction in overlapped regions (only for PE data), default is disabled
-      --overlap_len_require            the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default. (int [=30])
-      --overlap_diff_limit             the maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default. (int [=5])
-      --overlap_diff_percent_limit     the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%. (int [=20])
+      --overlap_len_require          the minimum length to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 30 by default. (int [=30])
+      --overlap_diff_limit           the maximum number of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. 5 by default. (int [=5])
+      --overlap_diff_percent_limit   the maximum percentage of mismatched bases to detect overlapped region of PE reads. This will affect overlap analysis based PE merge, adapter trimming and correction. Default 20 means 20%. (int [=20])
 
   # UMI processing
   -U, --umi                          enable unique molecular identifier (UMI) preprocessing
@@ -428,8 +429,8 @@ options:
       --umi_skip                     if the UMI is in read1/read2, fastp can skip several bases following UMI, default is 0 (int [=0])
 
   # overrepresented sequence analysis
-  -p, --overrepresentation_analysis    enable overrepresented sequence analysis.
-  -P, --overrepresentation_sampling    One in (--overrepresentation_sampling) reads will be computed for overrepresentation analysis (1~10000), smaller is slower, default is 20. (int [=20])
+  -p, --overrepresentation_analysis  enable overrepresented sequence analysis.
+  -P, --overrepresentation_sampling  one in every (--overrepresentation_sampling) reads will be sampled for overrepresentation analysis (1~10000); a smaller value is slower. Default is 20. (int [=20])
 
   # reporting options
   -j, --json                         the json format report file name (string [=fastp.json])
diff --git a/src/adaptertrimmer.cpp b/src/adaptertrimmer.cpp
index 1800e2e..6764b59 100644
--- a/src/adaptertrimmer.cpp
+++ b/src/adaptertrimmer.cpp
@@ -45,8 +45,7 @@ bool AdapterTrimmer::trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr,
     return false;
 }
 
-
-bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<string>& adapterList, bool isR2, bool incTrimmedCounter) {
+bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<string>& adapterList, bool isR2, int allowOneMismatchForEach, bool incTrimmedCounter) {
     int matchReq = 4;
     if(adapterList.size() > 16)
         matchReq = 5;
@@ -70,8 +69,7 @@ bool AdapterTrimmer::trimByMultiSequences(Read* r, FilterResult* fr, vector<stri
     return trimmed;
 }
 
-bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int matchReq) {
-    const int allowOneMismatchForEach = 8;
+bool AdapterTrimmer::trimBySequence(Read* r, FilterResult* fr, string& adapterseq, bool isR2, int allowOneMismatchForEach, int matchReq) {
 
     int rlen = r->length();
     int alen = adapterseq.length();
@@ -163,4 +161,4 @@ bool AdapterTrimmer::test() {
     }
 
     return true;
-}
\ No newline at end of file
+}
diff --git a/src/adaptertrimmer.h b/src/adaptertrimmer.h
index 3a6d1fe..ff372a0 100644
--- a/src/adaptertrimmer.h
+++ b/src/adaptertrimmer.h
@@ -17,8 +17,8 @@ class AdapterTrimmer{
 
     static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, int diffLimit, int overlapRequire, double diffPercentLimit);
     static bool trimByOverlapAnalysis(Read* r1, Read* r2, FilterResult* fr, OverlapResult ov, int frontTrimmed1 = 0, int frontTrimmed2 = 0);
-    static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int matchReq = 4);
-    static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector<string>& adapterList, bool isR2 = false, bool incTrimmedCounter = true);
+    static bool trimBySequence(Read* r1, FilterResult* fr, string& adapter, bool isR2 = false, int allowOneMismatchForEach = 8, int matchReq = 4);
+    static bool trimByMultiSequences(Read* r1, FilterResult* fr, vector<string>& adapterList, bool isR2 = false, int allowOneMismatchForEach = 8, bool incTrimmedCounter = true);
     static bool test();
 
 };