From 33253483e32944d93a36ebd9caddc15ee7308021 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 27 Aug 2024 14:42:23 -0500 Subject: [PATCH] Expand dba rules (#44) * feat: add cftoolkit v0.6.0 w rmarkdown * fix: inherit from docker v0.5.0 * feat: move to static images for diff bind and uropa rules * fix: rework diff bind and uropa scripts for modularity and new rules * fix: fix hooks, split diff bind and uropa into multiple rule hierarchical rules * chore: remove planned new docker version * fix: full path for config * fix: set cwd at rule instead of script * fix: cd to output dir in diff bind rule * fix: make sif cache aware of env * fix: review changes * fix: syntax error with rmarkdown::render * fix: images not image * fix: remove references to container sif files * fix: pathing in rmarkdown and syntax in prep_diffbind --- bin/DiffBind_v2_ChIPseq.Rmd | 107 +++++--- bin/DiffBind_v2_ChIPseq_block.Rmd | 155 +++++++----- bin/prep_diffbind.py | 101 ++++---- chrom-seek | 5 +- config/containers.json | 1 + workflow/Snakefile | 13 +- workflow/rules/dba.smk | 395 +++++++++++++++++++----------- workflow/rules/hooks.smk | 6 +- 8 files changed, 493 insertions(+), 290 deletions(-) diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd index 031799d..400aa95 100755 --- a/bin/DiffBind_v2_ChIPseq.Rmd +++ b/bin/DiffBind_v2_ChIPseq.Rmd @@ -16,34 +16,44 @@ params: --- ```{r, include=FALSE, warning=FALSE, message=FALSE} -## grab args -dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") - +# inputs +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") csvfile <- params$csvfile +outbase <- dirname(csvfile) contrasts <- params$contrasts peakcaller <- params$peakcaller -``` -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* +# file output suffixes +cp_bed <- "_Diffbind_consensusPeaks.bed" +edger_txt <- "_Diffbind_EdgeR.txt" +deseq2_txt <- "_Diffbind_Deseq2.txt" +edger_bed <- "_Diffbind_EdgeR.bed" +deseq2_bed <- "_Diffbind_Deseq2.bed" +deseq2_bed_fullist <- "_Diffbind_Deseq2_fullList.txt" +edger_bed_fullist <- "_Diffbind_EdgeR_fullList.txt" -```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +# knitr options knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) + +# libraries suppressMessages(library(DT)) suppressMessages(library(DiffBind)) suppressMessages(library(parallel)) ``` +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + # Peak Data Read in sample sheet information and peak information ```{r samples} @@ -88,13 +98,13 @@ if ( grepl("narrow",samples$samples$Peaks[1]) ) { print ("Narrow peak calling tool.") print ("Differential peaks are 250bp upstream and downstream of the summits.") } else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") } else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") } ``` @@ -105,9 +115,9 @@ if (summits == TRUE) { DBdataCounts <- dba.count(samples) } print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed") -consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T) -consensus2$name <- paste0("Peak",1:length(consensus2)) +outfile2 <- paste0(contrasts, "-", peakcaller, cp_bed) +consensus2 <- dba.peakset(DBdataCounts, bRetrieve=T) +consensus2$name <- paste0("Peak", 1:length(consensus2)) #rtracklayer::export(consensus2,outfile2) ``` @@ -210,11 +220,24 @@ try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=F ## Top 500 differentially bound peaks {.tabset .tabset-fade} ### DeSeq2 {-} ```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") -outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.bed") +outfile <- paste0(contrasts, "-", peakcaller, deseq2_txt) +outfile2 <- paste0(contrasts, "-", peakcaller, deseq2_bed) DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) -try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE) -write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F) + +tryDeseqExport <- function(DBReportDeseq2, outfile2) { + tryCatch( + { + rtracklayer::export(DBReportDeseq2, outfile2) + }, + error = function(cond) { + print("ERROR: Failed to export DeSeq bed file `rtracklayer::export(DBReportDeseq2, outfile2)`, output blank file") + write.table(outfile2, file='empty', col.names=FALSE) + } + ) +} + +tryDeseqExport(DBReportDeseq2, file.path(outbase, outfile2)) +write.table(DBReportDeseq2, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) D2i <- length(DBReportDeseq2) if (D2i == 0) { i=1 @@ -226,17 +249,31 @@ if (D2i == 0) { try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +outfile3 <- paste0(contrasts, "-", peakcaller, deseq2_bed_fullist) +write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) ``` ### EdgeR {-} ```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.txt") -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.bed") +outfile <- paste0(contrasts, "-", peakcaller, edger_txt) +outfile2 <- paste0(contrasts, "-", peakcaller, edger_bed) DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) -try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE) -write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F) + +tryEdgeRExport <- function(edger_report, fout) { + tryCatch( + { + rtracklayer::export(edger_report, fout) + }, + error = function(cond) { + print("ERROR: Failed to export EdgeR bed file `rtracklayer::export(edger_report, fout))`, output blank file") + write.table(fout, file='empty', col.names=FALSE) + } + ) +} + +tryEdgeRExport(DBReportEdgeR, file.path(outbase, outfile2)) +write.table(DBReportEdgeR, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) + Ei <- length(DBReportEdgeR) if (Ei == 0) { i=1 @@ -248,8 +285,8 @@ if (Ei == 0) { try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +outfile3 <- paste0(contrasts, "-", peakcaller, edger_bed_fullist) +write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) ``` ## R tool version information diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd index 2a508b5..68b9f6a 100755 --- a/bin/DiffBind_v2_ChIPseq_block.Rmd +++ b/bin/DiffBind_v2_ChIPseq_block.Rmd @@ -13,57 +13,65 @@ params: csvfile: samplesheet.csv contrasts: "group1_vs_group2" peakcaller: "macs" - dir: "/path/to/DiffBindBlock/directory" --- ```{r, include=FALSE, warning=FALSE, message=FALSE} -## grab args -dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") - +# global variables +dateandtime <- format(Sys.time(), "%a %b %d %Y - %X") csvfile <- params$csvfile +outbase <- dirname(csvfile) contrasts <- params$contrasts peakcaller <- params$peakcaller -``` -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* +# file output suffixes +cp_bed <- "_Diffbind_consensusPeaks_block.bed" +edger_txt <- "_Diffbind_EdgeR_block.txt" +deseq2_txt <- "_Diffbind_Deseq2_block.txt" +edger_bed <- "_Diffbind_EdgeR_block.bed" +deseq2_bed <- "_Diffbind_Deseq2_block.bed" +deseq2_bed_fullist <- "_Diffbind_Deseq2_fullList_block.txt" +edger_bed_fullist <- "_Diffbind_EdgeR_fullList_block.txt" -```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +# knittr configuration knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) -knitr::opts_knit$set(root.dir=params$dir) + +# load libs suppressMessages(library(DT)) suppressMessages(library(DiffBind)) suppressMessages(library(parallel)) ``` +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + # Peak Data Read in sample sheet information and peak information ```{r samples} samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples,consensus=DBA_CONDITION) +consensus <- dba.peakset(samples, consensus=DBA_CONDITION) print(samples) ``` ## Correlation heatmap: Only peaks Pearson correlation of peak positions: all samples versus all samples ```{r heatmap1} -try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) +try(dba.plotHeatmap(samples, main="", cexRow=1, cexCol=1), silent=TRUE) ``` ## PCA: Only peaks Variance of peak positions ```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) +try(dba.plotPCA(samples,DBA_CONDITION), silent=TRUE) ``` ## Overlapping peak counts @@ -73,11 +81,11 @@ the consensus peak set are the peaks identified in at least 2 samples for that c from the consensus peak set used for differential analyses. ```{r Venn, fig_height=4} if (nrow(samples$samples) < 5) { - dba.plotVenn(samples,1:nrow(samples$samples)) + dba.plotVenn(samples, 1:nrow(samples$samples)) } else { - dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) + dba.plotVenn(consensus, consensus$masks$Consensus, main="Binding Site Overlaps: 'consensus', comparing between groups") + try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"), silent=TRUE) + try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"), silent=TRUE) } ``` @@ -85,18 +93,18 @@ if (nrow(samples$samples) < 5) { Consensus peaks are peaks found in at least two samples, independent of condition. FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. ```{r peaksORsummits} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { +if ( grepl("narrow", samples$samples$Peaks[1]) ) { summits <- TRUE print ("Narrow peak calling tool.") print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") +} else if ( grepl("broad", samples$samples$Peaks[1]) ) { + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") } else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") } ``` @@ -107,28 +115,28 @@ if (summits == TRUE) { DBdataCounts <- dba.count(samples) } print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed") -consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T) -consensus2$name <- paste0("Peak",1:length(consensus2)) -#rtracklayer::export(consensus2,outfile2) +outfile2 <- paste0(contrasts, "-", peakcaller, cp_bed) +consensus2 <- dba.peakset(DBdataCounts, bRetrieve=T) +consensus2$name <- paste0("Peak", 1:length(consensus2)) +#rtracklayer::export(consensus2, outfile2) ``` ## Correlation heatmap: Peaks and reads Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples ```{r heatmap2} -try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) +try(dba.plotHeatmap(DBdataCounts, main="", cexRow=1, cexCol=1),silent=TRUE) ``` ## Heatmap: Average signal across each peak 1000 most variable consensus peaks (library-size normalized counts) ```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) +try(dba.plotHeatmap(DBdataCounts, correlations=FALSE, cexRow=1, cexCol=1),silent=TRUE) ``` ## PCA: Peaks and reads Variation of library-size normalized counts of consensus peaks ```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) +try(dba.plotPCA(DBdataCounts, DBA_CONDITION), silent=TRUE) ``` # Set Up Contrast @@ -189,12 +197,12 @@ Each dot is a consensus peak. ### DeSeq2 {-} ```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) +try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK), silent=TRUE) ``` ### EdgeR {-} ```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) +try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK), silent=TRUE) ``` ## Heatmap: Differential {.tabset .tabset-fade} @@ -202,24 +210,38 @@ try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) ### DeSeq2 {-} ```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2_BLOCK, - correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) +try(dba.plotHeatmap(DBAnalysisDeseq2, contrast=1, method = DBA_DESEQ2_BLOCK, + correlations=FALSE, margin=20, cexRow=1, cexCol=1), silent=TRUE) ``` ### EdgeR {-} ```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER_BLOCK, - correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) +try(dba.plotHeatmap(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK, + correlations=FALSE, margin=20, cexRow=1, cexCol=1), silent=TRUE) ``` ## Top 500 differentially bound peaks {.tabset .tabset-fade} ### DeSeq2 {-} ```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.txt") -outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.bed") -DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) -try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE) -write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F) +outfile <- paste0(contrasts, "-", peakcaller, deseq2_txt) +outfile2 <- paste0(contrasts, "-", peakcaller, deseq2_bed) +DBReportDeseq2$name <- paste0("Peak", 1:length(DBReportDeseq2)) + +tryDeseqExport <- function(DBReportDeseq2, outfile2) { + tryCatch( + { + rtracklayer::export(DBReportDeseq2, outfile2) + }, + error = function(cond) { + print("ERROR: Failed to export DeSeq bed file `rtracklayer::export(DBReportDeseq2, outfile2)`, output blank file") + write.table(outfile2, file='empty', col.names=FALSE) + } + ) +} + +tryDeseqExport(DBReportDeseq2, file.path(outbase, outfile2)) + +write.table(DBReportDeseq2, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) D2i <- length(DBReportDeseq2) if (D2i == 0) { i=1 @@ -232,17 +254,32 @@ try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK, th=100,bNormalized=T,bFlip=FALSE,precision=0) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) + +outfile3 <- paste0(contrasts, "-", peakcaller, deseq2_bed_fullist) +write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) ``` ### EdgeR {-} ```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.txt") -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.bed") -DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) -try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE) -write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F) +outfile <- paste0(contrasts, "-", peakcaller, edger_txt) +outfile2 <- paste0(contrasts, "-", peakcaller, edger_bed) +DBReportEdgeR$name <- paste0("Peak", 1:length(DBReportEdgeR)) + +tryEdgeRExport <- function(edger_report, fout) { + tryCatch( + { + rtracklayer::export(edger_report, fout) + }, + error = function(cond) { + print("ERROR: Failed to export EdgeR bed file `rtracklayer::export(edger_report, fout))`, output blank file") + write.table(fout, file='empty', col.names=FALSE) + } + ) +} + +tryEdgeRExport(DBReportEdgeR, file.path(outbase, outfile2)) + +write.table(DBReportEdgeR, file.path(outbase, outfile), quote=F, sep="\t", row.names=F) Ei <- length(DBReportEdgeR) if (Ei == 0) { i=1 @@ -251,12 +288,12 @@ if (Ei == 0) { } else { i=Ei } -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) +try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F), silent=TRUE) report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK, th=100,bNormalized=T,bFlip=FALSE,precision=0) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_block_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +outfile3 <- paste0(contrasts, "-", peakcaller, edger_bed_fullist) +write.table(report2, file.path(outbase, outfile3), quote=F, sep="\t", row.names=F) ``` ## R tool version information diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py index 3711c68..235c3e4 100755 --- a/bin/prep_diffbind.py +++ b/bin/prep_diffbind.py @@ -1,55 +1,64 @@ #!/usr/bin/env python3 - import json import argparse +from csv import DictWriter +from os.path import join -parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') -parser.add_argument('--g1',dest='group1',required=True,help='Name of the first group') -parser.add_argument('--g2',dest='group2',required=True,help='Name of the second group') -parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') -parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') -parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') -parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') -parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') -parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') -args = parser.parse_args() +def main(group1, group2, peaktool, peakext, peakcaller, csvfile, wp, bam_dir): + config = json.load(open(join(wp, "config.json"))) + chip2input = config['project']['peaks']['inputs'] + groupdata = config['project']['groups'] + blocks = config['project']['blocks'] + blocking = False if set(blocks.values()) in ({None}, {''}) else True + + if blocking: + cols = ["SampleID", "Condition", "Treatment", "Replicate", "bamReads", + "ControlID", "bamControl", "Peaks", "PeakCaller"] + else: + cols = ["SampleID", "Condition", "Replicate", "bamReads", "ControlID", + "bamControl", "Peaks", "PeakCaller"] + + samplesheet = [] + for condition in group1, group2: + for chip in groupdata[condition]: + replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) + bamReads = join(bam_dir, chip + ".Q5DD.bam") + controlID = chip2input[chip] + if controlID != "": + bamControl = join(bam_dir, controlID + ".Q5DD.bam") + else: + bamControl = "" + peaks = join(wp, peaktool, chip, chip + peakext) + if blocking: + block = blocks[chip] + this_row = dict(zip(cols, [chip, condition, block, replicate, bamReads, + controlID, bamControl, peaks, peakcaller])) + else: + this_row = dict(zip(cols, [chip, condition, replicate, bamReads, + controlID, bamControl, peaks, peakcaller])) + samplesheet.append(this_row) + -with open("config.json","r") as read_file: - config=json.load(read_file) - -chip2input = config['project']['peaks']['inputs'] -groupdata = config['project']['groups'] -blocks = config['project']['blocks'] + with open(csvfile, 'w') as f: + wrt = DictWriter(f, dialect='excel', fieldnames=cols) + wrt.writeheader() + wrt.writerows(samplesheet) -if None in list(blocks.values()): - samplesheet = [",".join(["SampleID", "Condition", "Replicate", "bamReads", - "ControlID", "bamControl", "Peaks", "PeakCaller"])] -else: - samplesheet = [",".join(["SampleID", "Condition", "Treatment", "Replicate", "bamReads", - "ControlID", "bamControl", "Peaks", "PeakCaller"])] - -for condition in args.group1, args.group2: - for chip in groupdata[condition]: - replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) - bamReads = args.bamdir + "/" + chip + ".Q5DD.bam" - controlID = chip2input[chip] - if controlID != "": - bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam" - else: - bamControl = "" - peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension - if None in list(blocks.values()): - samplesheet.append(",".join([chip, condition, replicate, bamReads, - controlID, bamControl, peaks, args.peakcaller])) - else: - block = blocks[chip] - samplesheet.append(",".join([chip, condition, block, replicate, bamReads, - controlID, bamControl, peaks, args.peakcaller])) - -f = open(args.csvfile, 'w') -f.write ("\n".join(samplesheet)) -f.write ("\n") -f.close() +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') + parser.add_argument('--g1', dest='group1', required=True, help='Name of the first group') + parser.add_argument('--g2', dest='group2', required=True, help='Name of the second group') + parser.add_argument('--wp', dest='wp', required=True, help='Full path of the working directory') + parser.add_argument('--pt', dest='peaktool', required=True, + help='Name of the the peak calling tool, also the directory where the peak file will be located') + parser.add_argument('--pe', dest='peakext', required=True, help='The file extension of the peakcall output') + parser.add_argument('--pc', dest='peakcaller', required=True, + help='Value for the PeakCaller column of the DiffBind csv') + parser.add_argument('--bd', dest='bam_dir', required=True, + help='Name of the directory where the bam files are located') + parser.add_argument('--csv', dest='csvfile', required=True, help='Name of the output csv file') + args = parser.parse_args() + main(args.group1, args.group2, args.peaktool, args.peakext, args.peakcaller, args.csvfile, args.wp, args.bam_dir) diff --git a/chrom-seek b/chrom-seek index 36c6e13..42986fd 100755 --- a/chrom-seek +++ b/chrom-seek @@ -300,10 +300,11 @@ def cache(sub_args): """ # Check for dependencies require(['singularity'], ['singularity']) + sif_cache = sub_args.sif_cache # Get absolute PATH to templates in exome-seek git repo repo_path = os.path.dirname(os.path.abspath(__file__)) - images = os.path.join(repo_path, 'config','containers.json') + images = os.path.join(repo_path, 'config', 'containers.json') # Create image cache if not exists(sif_cache): @@ -725,6 +726,7 @@ def parsed_arguments(name, description): subparser_run.add_argument( '--singularity-cache', type = lambda option: check_cache(parser, os.path.abspath(os.path.expanduser(option))), + default = None, required = False, help = argparse.SUPPRESS ) @@ -941,7 +943,6 @@ def parsed_arguments(name, description): def main(): - # Sanity check for usage if len(sys.argv) == 1: # Nothing was provided diff --git a/config/containers.json b/config/containers.json index 5329fa7..280e934 100644 --- a/config/containers.json +++ b/config/containers.json @@ -1,6 +1,7 @@ { "images": { "cfchip": "docker://skchronicles/cfchip_toolkit:v0.5.0", + "uropa": "docker://quay.io/biocontainers/uropa:4.0.2--pyhdfd78af_0", "python": "docker://asyakhleborodova/chrom_seek_python:v0.1.0", "ppqt": "docker://asyakhleborodova/ppqt:v0.2.0" } diff --git a/workflow/Snakefile b/workflow/Snakefile index 0025ec5..8faf4d1 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -17,6 +17,8 @@ samples = config['samples'] bin_path = config['project']['binpath'] workpath = config['project']['workpath'] assay = config['options']['assay'] +blocks = config['project']['blocks'] +blocking = False if set(blocks.values()) in ({None}, {''}) else True paired_end = False if config['project']['nends'] == 1 else True chips = config['project']['peaks']['chips'] contrast = config['project']['contrast'] @@ -99,7 +101,11 @@ if assay == "cfchip": join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) - + if blocking: + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + )) if contrast: rule_all_ins.extend(expand( join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), @@ -161,6 +167,11 @@ elif assay in ["atac", "chip"]: join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) + if blocking: + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + )) if contrast: rule_all_ins.extend(expand( join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index c6f0b5e..eef1245 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -4,22 +4,23 @@ import os import json from os.path import join from scripts.common import allocated, mk_dir_if_not_exist -from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction, get_manorm_sizes +from scripts.peakcall import outputIDR, zip_peak_files, \ + calc_effective_genome_fraction, get_manorm_sizes from scripts.grouping import test_for_block # ~~ workflow configuration -workpath = config['project']['workpath'] -bin_path = config['project']['binpath'] -genome = config['options']['genome'] -blocks = config['project']['blocks'] -groupdata = config['project']['groups'] -contrast = config['project']['contrast'] -uropaver = config['tools']['UROPAVER'] -gtf = config['references'][genome]['GTFFILE'] +workpath = config["project"]["workpath"] +bin_path = config["project"]["binpath"] +genome = config["options"]["genome"] +blocks = config["project"]["blocks"] +groupdata = config["project"]["groups"] +contrast = config["project"]["contrast"] +uropaver = config["tools"]["UROPAVER"] +gtf = config["references"][genome]["GTFFILE"] -# ~~ directoriesxw -diffbind_dir_block = join(workpath, "DiffBindBlock") + +# ~~ directories diffbind_dir2 = join(workpath, "DiffBind_block") diffbind_dir = join(workpath, "DiffBind") uropa_dir = join(workpath, "UROPA_annotations") @@ -36,139 +37,258 @@ otherDirs = [qc_dir, homer_dir, uropa_dir] cfTool_dir = join(workpath, "cfChIPtool") cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") + # ~~ workflow switches -blocking = False if None in list(blocks.values()) else True +blocking = False if set(blocks.values()) in ({None}, {""}) else True if reps == "yes": otherDirs.append(diffbind_dir) mk_dir_if_not_exist(PeakTools + otherDirs) - # ~~ peak calling configuration and outputs -PeakToolsNG = [ tool for tool in PeakTools if tool != "gem" ] +PeakToolsNG = [tool for tool in PeakTools if tool != "gem"] PeakExtensions = { - 'macsNarrow': '_peaks.narrowPeak', - 'macsBroad': '_peaks.broadPeak', - 'sicer': '_broadpeaks.bed', - 'gem': '.GEM_events.narrowPeak' , - 'MANorm': '_all_MA.bed', - 'DiffbindEdgeR': '_Diffbind_EdgeR.bed', - 'DiffbindDeseq2': '_Diffbind_Deseq2.bed', - 'DiffbindEdgeRBlock': '_Diffbind_EdgeR_block.bed', - 'DiffbindDeseq2Block': '_Diffbind_Deseq2_block.bed', - 'Genrich': '.narrowPeak', - 'DiffBindQC': '_DiffBindQC_TMMcounts.bed' + "macsNarrow": "_peaks.narrowPeak", + "macsBroad": "_peaks.broadPeak", + "sicer": "_broadpeaks.bed", + "gem": ".GEM_events.narrowPeak", + "MANorm": "_all_MA.bed", + "DiffbindEdgeR": "_Diffbind_EdgeR.bed", + "DiffbindDeseq2": "_Diffbind_Deseq2.bed", + "DiffbindEdgeRBlock": "_Diffbind_EdgeR_block.bed", + "DiffbindDeseq2Block": "_Diffbind_Deseq2_block.bed", + "Genrich": ".narrowPeak", + "DiffBindQC": "_DiffBindQC_TMMcounts.bed", } -FileTypesDiffBind = { - 'macsNarrow': 'narrowPeak', - 'macsBroad': 'narrowPeak', - 'sicer': 'bed', - 'gem': 'narrowPeak', - 'Genrich': 'narrowPeak' +FileTypesDiffBind = { + "macsNarrow": "narrowPeak", + "macsBroad": "narrowPeak", + "sicer": "bed", + "gem": "narrowPeak", + "Genrich": "narrowPeak", } -PeakExtensionsIDR = { - 'macsNarrow': '_peaks.narrowPeak', - 'macsBroad': '_peaks.broadPeak', - 'sicer': '_sicer.broadPeak' +PeakExtensionsIDR = { + "macsNarrow": "_peaks.narrowPeak", + "macsBroad": "_peaks.broadPeak", + "sicer": "_sicer.broadPeak", } -FileTypesIDR = { - 'macsNarrow': 'narrowPeak', - 'macsBroad': 'broadPeak', - 'sicer': 'broadPeak' +FileTypesIDR = { + "macsNarrow": "narrowPeak", + "macsBroad": "broadPeak", + "sicer": "broadPeak", } -RankColIDR = { - 'macsNarrow': 'q.value', - 'macsBroad': 'q.value', - 'sicer': 'q.value' -} -IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = outputIDR(groupswreps, groupdata, chip2input, PeakToolsNG) +RankColIDR = {"macsNarrow": "q.value", "macsBroad": "q.value", "sicer": "q.value"} +IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = outputIDR( + groupswreps, groupdata, chip2input, PeakToolsNG +) zipSample, zipTool, zipExt = zip_peak_files(chips, PeakTools, PeakExtensions) contrastBlock = test_for_block(groupdata, contrast, blocks) zipGroup1B, zipGroup2B, zipToolCB, contrastsB = zip_contrasts(contrastBlock, PeakTools) -# ~~ rules -rule diffbind: + +# ~~ rules +rule init_diffbind: input: - lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] + bed = lambda w: [ + join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) + for chip in chips + ], output: - html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), - EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), - EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), - Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), - EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), - Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), - html_block = provided(join(uropa_diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) + csvfile = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv", + ), params: - # variables and wildcards used in the shell directive - rname = "diffbind", + rname = "initialize_diffbind", group1 = "{group1}", group2 = "{group2}", this_peaktool = "{PeakTool}", + peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], + this_peakextension = lambda w: PeakExtensions[w.PeakTool], + pythonscript = join(bin_path, "prep_diffbind.py"), + bam_dir = bam_dir, + workpath = workpath, + container: + config["images"]["cfchip"] + shell: + """ + python {params.pythonscript} \ + --g1 {params.group1} \ + --g2 {params.group2} \ + --wp {params.workpath} \ + --pt {params.this_peaktool} \ + --pe {params.this_peakextension} \ + --bd {params.bam_dir} \ + --pc {params.peakcaller} \ + --csv {output.csvfile} + """ + + +rule diffbind: + input: + csvfile = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv", + ), + bed = lambda w: [ + join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) + for chip in chips + ], + output: + diffbind_report = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind.html", + ), + Deseq2 = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed", + ), + EdgeR = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed", + ), + EdgeR_txt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt", + ), + Deseq2_txt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt", + ), + EdgeR_ftxt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt", + ), + Deseq2_ftxt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt", + ), + # consensus_pks = join( + # diffbind_dir, + # "{group1}_vs_{group2}-{PeakTool}", + # "{group1}_vs_{group2}-{PeakTool}_Diffbind_consensusPeaks.bed" + # ), + params: + rname = "diffbind", + this_peaktool = "{PeakTool}", this_contrast = "{group1}_vs_{group2}", this_peakextension = lambda w: PeakExtensions[w.PeakTool], peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], - # scripts in the bin directory used in the shell directive rscript = join(bin_path, "DiffBind_v2_ChIPseq.Rmd"), - pythonscript = join(bin_path, "prep_diffbind.py"), - blocking_rscript = join(bin_path, "DiffBind_v2_ChIPseq_block.Rmd"), - # output base directories or full file locations outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), - csvfile = join( - diffbind_dir, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv" - ), - Deseq2_block = provided(join( - diffbind_dir_block, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed" - ), blocking), - EdgeR_block = provided(join( - diffbind_dir_block, - "{group1}_vs_{group2}-{PeakTool}", - "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed" - ), blocking), - outdir_block = join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}"), container: - config['images']['cfchip'] - shell: + config["images"]["cfchip"] + shell: """ - python {params.pythonscript} --g1 {params.group1} --g2 {params.group2} --wp {workpath} \ - --pt {params.this_peaktool} --pe {params.this_peakextension} --bd {bam_dir} \ - --pc {params.peakcaller} --csv {params.csvfile} - cp {params.rscript} {params.outdir} + mkdir -p {params.outdir} cd {params.outdir} - Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", \ - params=list(csvfile="{params.csvfile}", contrasts="{params.this_contrast}", peakcaller="{params.this_peaktool}"))' - if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi - if [ ! -f {output.EdgeR} ]; then touch {output.EdgeR}; fi + Rscript -e 'rmarkdown::render("{params.rscript}", \ + output_file="{output.diffbind_report}", \ + params=list( \ + csvfile="{input.csvfile}", \ + contrasts="{params.this_contrast}", \ + peakcaller="{params.this_peaktool}" \ + ) \ + )' + """ - if [ '"""+str(blocking)+"""' == True ]; then - echo "DiffBind with Blocking" - Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", \ - params=list(csvfile= "{params.csvfile}", contrasts="{params.this_contrast}", peakcaller="{params.this_peaktool}", dir="{params.outdir_block}"))' - if [ ! -f {params.Deseq2_block} ]; then touch {params.Deseq2_block}; fi - if [ ! -f {params.EdgeR_block} ]; then touch {params.EdgeR_block}; fi - fi + +rule diffbind_blocking: + input: + csvfile = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv", + ), + bed = lambda w: [ + join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) + for chip in chips + ], + output: + diffbind_block_report = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html", + ), + Deseq2 = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed", + ), + EdgeR = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed", + ), + EdgeR_txt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.txt", + ), + Deseq2_txt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.txt", + ), + EdgeR_ftxt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList_block.txt", + ), + Deseq2_ftxt = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList_block.txt", + ), + # consensus_pks = join( + # diffbind_dir, + # "{group1}_vs_{group2}-{PeakTool}", + # "{group1}_vs_{group2}-{PeakTool}_Diffbind_consensusPeaks_block.bed" + # ), + params: + rname = "diffbind_block", + blocking_rscript = join(bin_path, "DiffBind_v2_ChIPseq_block.Rmd"), + outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), + this_peaktool = "{PeakTool}", + this_contrast = "{group1}_vs_{group2}", + container: + config["images"]["cfchip"] + shell: + """ + mkdir -p {params.outdir} + cd {params.outdir} + Rscript -e 'rmarkdown::render("{params.blocking_rscript}", \ + output_file="{output.diffbind_block_report}", \ + params=list( \ + csvfile="{input.csvfile}", \ + contrasts="{params.this_contrast}", \ + peakcaller="{params.this_peaktool}" \ + ) \ + )' """ -rule UROPA: +localrules: UROPA_prep_in + + +rule UROPA_prep_in: input: lambda w: join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]), - output: - txt = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), - bed1 = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), - bed2 = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), - json = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'), params: - rname = "uropa", - fldr = join(uropa_dir, '{PeakTool1}'), - outroot = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}'), - threads: 4, + fldr = join(uropa_dir, "{PeakTool1}"), + output: + json = join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.{type}.json"), run: # Dynamically creates UROPA config file if not os.path.exists(params.fldr): @@ -238,51 +358,36 @@ rule UROPA: if not os.path.exists(output.json): raise FileNotFoundError(output.json + " does not exist!") - shell.prefix(f"module load {uropaver};") - shell("uropa -i " + output.json + " -p " + params.outroot + " -t " + str(threads) + " -s") -rule manorm: + +rule UROPA: input: - bam1 = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), - bam2 = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"), - # ppqt = join(ppqt_dir, "Q5DD.ppqt.txt"), # ppqt input into manorm TODO - peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), - peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), + json = join(uropa_dir, "{PeakTool1}", "{name}.{PeakTool2}.{type}.json"), output: - xls = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - bed = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MA.bed")), - wigA = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_A_values.wig.gz"), - wigM = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_M_values.wig.gz"), - wigP = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_P_values.wig.gz"), + txt = join( + uropa_dir, + "{PeakTool1}", + "{name}_{PeakTool2}_uropa_{type}_allhits.txt" + ), + bed1 = temp(join( + uropa_dir, + "{PeakTool1}", + "{name}_{PeakTool2}_uropa_{type}_allhits.bed" + )), + bed2 = temp(join( + uropa_dir, + "{PeakTool1}", + "{name}_{PeakTool2}_uropa_{type}_finalhits.bed", + )), + log = join(uropa_dir, "{PeakTool1}", "{name}_{PeakTool2}_uropa_{type}.log") params: - rname = 'manorm', - fldr = join(manorm_dir, "{group1}_vs_{group2}-{tool}"), - bedtoolsver = config['tools']['BEDTOOLSVER'], - manormver = "manorm/1.1.4", - extsizes = lambda w, input: get_manorm_sizes(w.group1, w.group2, groupdata, "") + rname="uropa", + outroot=join(uropa_dir, "{PeakTool1}", "{name}_{PeakTool2}_uropa_{type}"), + threads: 4 + container: + config["images"]["uropa"] shell: """ - if [ ! -e /lscratch/$SLURM_JOBID ]; then - mkdir /lscratch/$SLURM_JOBID - fi - cd /lscratch/$SLURM_JOBID - module load {params.manormver} - module load {params.bedtoolsver} - bamToBed -i {input.bam1} > bam1.bed - bamToBed -i {input.bam2} > bam2.bed - cut -f 1,2,3 {input.peak1} > peak1.bed - cut -f 1,2,3 {input.peak2} > peak2.bed - manorm \ - --p1 peak1.bed \ - --p2 peak2.bed \ - --r1 bam1.bed \ - --r2 bam2.bed \ - {params.extsizes} \ - -o {params.fldr} \ - --name1 {wildcards.group1} \ - --name2 {wildcards.group2} - gzip {params.fldr}/output_tracks/*wig - mv {params.fldr}/{wildcards.group1}_vs_{wildcards.group2}_all_MAvalues.xls {output.xls} - tail -n +2 {output.xls} | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,$9$1,$6}}' > {output.bed} + uropa -i {input.json} -p {params.outroot} -l {output.log} -t {threads} -s """ \ No newline at end of file diff --git a/workflow/rules/hooks.smk b/workflow/rules/hooks.smk index 1c8666f..162a0b3 100644 --- a/workflow/rules/hooks.smk +++ b/workflow/rules/hooks.smk @@ -31,8 +31,9 @@ if config['options']['mode'] == 'slurm': # previously submitted jobs sleep 15; rm -f COMPLETED FAILED RUNNING; timestamp=$(date +"%Y-%m-%d_%H-%M-%S"); + snk_log=$(ls -Art logfiles/snakemake_*.log | tail -n 1); ./bin/jobby \\ - $(grep --color=never "^Submitted .* external jobid" logfiles/snakemake.log \\ + $(grep --color=never "^Submitted .* external jobid" ${{snk_log}} \\ | awk '{{print $NF}}' \\ | sed "s/['.]//g" \\ | sort \\ @@ -68,8 +69,9 @@ if config['options']['mode'] == 'slurm': # previously submitted jobs sleep 15; rm -f COMPLETED FAILED RUNNING; timestamp=$(date +"%Y-%m-%d_%H-%M-%S"); + snk_log=$(ls -Art logfiles/snakemake_*.log | tail -n 1); ./bin/jobby \\ - $(grep --color=never "^Submitted .* external jobid" logfiles/snakemake.log \\ + $(grep --color=never "^Submitted .* external jobid" ${{snk_log}} \\ | awk '{{print $NF}}' \\ | sed "s/['.]//g" \\ | sort \\