Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

trim, bwa, and deduplication rules updated to work with single-end data #22

Merged
merged 10 commits into from
Oct 23, 2023
6 changes: 3 additions & 3 deletions config/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"threads": "4",
"time": "1-18:00:00"
},
"trim_pe": {
"trim": {
"mem": "64g",
"threads": "32"
},
Expand All @@ -15,12 +15,12 @@
"gres": "lscratch:256",
"mem": "110g"
},
"BWA_PE": {
"BWA": {
"threads": "32",
"mem": "220g",
"gres": "lscratch:64"
},
"picard_dedup": {
"dedup": {
"gres": "lscratch:256",
"mem": "96g",
"threads": "2"
Expand Down
49 changes: 23 additions & 26 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ from scripts.common import (
# Timestamp in YYYYMMDD format
today = str(datetime.datetime.today()).split()[0].replace('-', '')

# Check for SE or PE FastQ files:
convert = {1: False, 2: True} # 1 = SE, 2 = PE, -1 = Unknown
try:
paired_end = convert[config['project']['nends']] # True if PE else false
except KeyError:
# Catching case when value is -1 or unknown
sys.exit("Fatal: Raw data could not be classified as single-end or paired-end data!")

# Global workflow variables
configfile: "config.json"
samples = config['samples']
Expand Down Expand Up @@ -82,28 +90,8 @@ def zip_contrasts(contrast, PeakTools):
contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool )
return(zipGroup1, zipGroup2, zipTool, contrasts)

# Creating naming conventions for bam files
# and tagAlign files if SE, extension objects
# defined here are especially relevant for the
# deeptools rules and all four are used in at
# least one rule
se=""
pe=""
if config['project']['nends'] == 2 :
pe="yes"
elif config['project']['nends'] == 1 :
se="yes"

extensions = [ "sorted.RPGC", "Q5DD.RPGC" ]
extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions))

if pe == "yes":
extensions3 = { extensions2[i] + "." : "bam" for i in range(len(extensions2)) }
extensions4 = [ extensions2[i] + ".bam" for i in range(len(extensions2)) ]
else:
types = [ "bam", "tagAlign.gz" ]
extensions3 = { extensions2[i] + "." : types[i] for i in range(len(extensions2)) }
extensions4 = [ extensions2[i] + "." + types[i] for i in range(len(extensions2)) ]


# Getting sample relationships from config
Expand Down Expand Up @@ -188,14 +176,21 @@ cfTool_dir="cfChIPtool"
cfTool_subdir2="cfChIPtool/BED/H3K4me3"

zipGroup1, zipGroup2, zipToolC, contrasts = zip_contrasts(contrast, PeakTools)
# Final targets of the pipeline
# Final targets of the pipeline

if paired_end:
extensionsDict = {"sorted": "bam", "Q5DD":"bam"}
extensionsFull = ['sorted.bam', 'Q5DD.bam']
else:
extensionsDict= {"sorted": "bam", "Q5DD_tagAlign": "gz"}
extensionsFull = ['sorted.bam', 'Q5DD_tagAlign.gz']

if assay == "cfchip":
rule all:
input:
join(workpath,"multiqc_report.html"),
expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=extensions2),
expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4[1]),
expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=list(extensionsDict.keys())),
expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensionsFull),
expand(join(workpath,qc_dir,"{name}.preseq.dat"), name=samples),
expand(join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),name=chips),
expand(join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples),
Expand All @@ -217,8 +212,8 @@ elif assay in ["atac", "chip"]:
rule all:
input:
join(workpath,"multiqc_report.html"),
expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=extensions2),
expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4[1]),
provided(expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=list(extensionsDict.keys())), paired_end==True),
expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensionsFull),
expand(join(workpath,qc_dir,"{name}.preseq.dat"), name=samples),
expand(join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),name=chips),
provided(expand(join(workpath,"macsBroad","{name}","{name}_peaks.broadPeak"),name=chips), assay=="chip"),
Expand All @@ -236,7 +231,9 @@ elif assay in ["atac", "chip"]:
provided(expand(join(workpath,uropa_dir,diffbind_dir,'{name}_{PeakTool}_uropa_{type}_allhits.txt'),
PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],name=contrasts,type=["protTSS", "prot", "protSEC", "genes"]), reps == "yes"),

provided(expand(join(workpath,"Genrich","{name}","{name}.narrowPeak"),name=chips), assay=="atac")
provided(expand(join(workpath,"Genrich","{name}","{name}.narrowPeak"),name=chips), assay=="atac"),


#############################

# QC/alignment rules: trim_pe,
Expand Down
16 changes: 8 additions & 8 deletions workflow/rules/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ rule rawfastqc:
"""
input:
expand(join(workpath,"{name}.R1.fastq.gz"), name=samples) if \
se == "yes" else \
not paired_end else \
expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2])
output:
expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples),
Expand Down Expand Up @@ -119,7 +119,7 @@ rule fastqc:
"""
input:
expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \
se == "yes" else \
not paired_end else \
expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2])
output:
expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples),
Expand Down Expand Up @@ -149,16 +149,16 @@ rule fastq_screen:
FastQ Screen report and logfiles
"""
input:
join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") if se == "yes" else \
join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") if not paired_end else \
expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"),name=samples,rn=[1,2])
output:
join(workpath,"FQscreen","{name}.R1.trim_screen.txt") if se == "yes" else \
join(workpath,"FQscreen","{name}.R1.trim_screen.txt") if not paired_end else \
expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
join(workpath,"FQscreen","{name}.R1.trim_screen.png") if se == "yes" else \
join(workpath,"FQscreen","{name}.R1.trim_screen.png") if not paired_end else \
expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
join(workpath,"FQscreen2","{name}.R1.trim_screen.txt") if se == "yes" else \
join(workpath,"FQscreen2","{name}.R1.trim_screen.txt") if not paired_end else \
expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
join(workpath,"FQscreen2","{name}.R1.trim_screen.png") if se == "yes" else \
join(workpath,"FQscreen2","{name}.R1.trim_screen.png") if not paired_end else \
expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
params:
rname = 'fqscreen',
Expand Down Expand Up @@ -298,7 +298,7 @@ rule insert_size:
Number of reads per insert size and their histogram
"""
input:
bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensions3[w.ext + "."])
bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensionsDict[w.ext])
output:
txt= join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),
pdf= join(workpath,qc_dir,"{name}.{ext}.insert_size_histogram.pdf"),
Expand Down
Loading